# Version changes:
* use the more powerful eca-nfnet-l1
* use 3 models for text [RoBERTa, XLM-RoBERTa, DistilBERT-Indonesian]
* use 4 models for images [eca-nfnet-l0, eca-nfnet-l1, efficientnet_b3, efficientnet_b5]

In [1]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [2]:
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2
import transformers
from transformers import (BertTokenizer, BertModel,DistilBertTokenizer, DistilBertModel)

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [3]:
class CFG:
    # Central configuration for the image branch of the notebook.
    # Side length (pixels) images are resized to in get_test_transforms().
    img_size = 512
    # Batch size of the image DataLoader in get_image_embeddings().
    batch_size = 12
    # Seed handed to seed_torch().
    seed = 2021

    # Not referenced by the visible cells.
    bert_hidden_size = 768
    
    # Device the image models are moved to.
    device = 'cuda'
    # Default number of output classes of the margin head in ShopeeModel.
    classes = 11014
    
    # Image model 1.
    # NOTE(review): the backbone name says "l1" while the checkpoint file name
    # says "nfnet_l0" -- confirm the checkpoint really matches eca_nfnet_l1.
    model_name1 = 'eca_nfnet_l1'
    model_path1 = '../input/effb-shopee/arcface_512x512_nfnet_l0(mish)_ep15.pt'
    
    # Image model 2.
    model_name2 = 'efficientnet_b3'
    model_path2 = '../input/shopee-pytorch-models/arcface_512x512_eff_b3.pt'
    
    # Previously tried third model, kept for reference (disabled).
    #model_name3 = 'dm_nfnet_f0'
    #model_path3 = '../input/shopeepytorchselftrained/arcface_512x512_dm_nfnet_f0(mish).pt'

    # Image model 3.
    model_name3 = 'tf_efficientnet_b5_ns'
    model_path3 = '../input/shopee-pytorch-models/arcface_512x512_eff_b5_.pt'
    
    # Image model 4.
    model_name4 = 'eca_nfnet_l0'
    model_path4 = '../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'
    # Not referenced by the visible cells (the text dataset hard-codes its own max_length).
    max_length = 30
    # Default scale / margin of the margin head built in ShopeeModel.
    scale = 30 
    margin = 0.5
    # Not referenced by the visible cells (the image DataLoader hard-codes num_workers=4).
    num_workers = 4

In [4]:
# Settings of the text DataLoader built in get_text_embeddings().
NUM_WORKERS = 4
BATCH_SIZE = 16
# Only referenced inside the disabled Word2Vec snippet in the visible cells.
SEED = 2021

# Device used by the text branch.
device = torch.device('cuda')

# CHECK_SUB: when computing CV, read_dataset() duplicates the train frame.
# GET_CV: switches read_dataset() / get_neighbours_cos_sim() to the train set.
CHECK_SUB = False
GET_CV = False

test = pd.read_csv('../input/shopee-product-matching/test.csv')
# NOTE(review): GET_CV is already False above, so neither branch can enable CV;
# the message printed for a <=3-row test file does not describe what actually runs.
if len(test)>3: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


# How to load a pre-trained Hugging Face model from disk:
# https://stackoverflow.com/questions/64001128/load-a-pre-trained-model-from-disk-with-huggingface-transformers
# See get_text_embeddings() below for how the embeddings are extracted.

################################################# № 1 MODEL & MODEL PATH ####################################################################

# Directory of the base transformer (passed to AutoModel/AutoTokenizer.from_pretrained).
transformer_model_1 = '../input/sentence-transformer-models/stsb-roberta-base/0_Transformer'

# Fine-tuned checkpoint loaded on top of the base transformer.
TEXT_MODEL_PATH_1 = '../input/best-selftrained-lang-models/roberta-base_best_loss_num_epochs_15_arcface.bin'


################################################# № 2 MODEL & MODEL PATH ####################################################################

transformer_model_2 = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'

TEXT_MODEL_PATH_2 = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

################################################# № 3 MODEL & MODEL PATH ####################################################################

transformer_model_3 = '../input/distilbert-base-indonesian'

TEXT_MODEL_PATH_3 = '../input/best-selftrained-lang-models/distilbert-base-indonesian_best_loss_num_epochs_15_arcface.bin'


################################################ Metric Loss and its params #######################################################
# NOTE(review): these five module-level values are not read by any visible cell
# (ShopeeModel takes its scale/margin from CFG) -- presumably left over from training.
loss_module = 'arcface'#'softmax'
scale = 30.0
m = 0.5 
ls_eps = 0.0
easy_margin = False


# Keyword arguments for ShopeeNet(**model_params); 'model_name' is overwritten
# before each text model is loaded in the text-inference cell.
model_params = {
    'n_classes':11014,
    'model_name': transformer_model_1,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.0
}


this submission notebook will compute CV score, but commit notebook will not


In [5]:
def read_dataset():
    """Load the competition data selected by the GET_CV flag.

    With GET_CV off the test CSV is read. With GET_CV on the train CSV is read,
    a space-separated 'matches' column (all posting_ids sharing the row's
    label_group) is added, and the frame is duplicated when CHECK_SUB is set.

    Returns:
        (pandas frame, cudf copy of that frame, Series of image file paths).
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        frame_gpu = cudf.DataFrame(frame)
        paths = '../input/shopee-product-matching/test_images/' + frame['image']
        return frame, frame_gpu, paths

    frame = pd.read_csv('../input/shopee-product-matching/train.csv')
    # label_group -> array of every posting_id in that group
    group_members = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
    frame['matches'] = frame['label_group'].map(group_members)
    frame['matches'] = frame['matches'].apply(lambda ids: ' '.join(ids))
    if CHECK_SUB:
        frame = pd.concat([frame, frame], axis = 0)
        frame = frame.reset_index(drop = True)
    frame_gpu = cudf.DataFrame(frame)
    paths = '../input/shopee-product-matching/train_images/' + frame['image']
    return frame, frame_gpu, paths

In [6]:
# Load the frame once; df (pandas), df_cu (cudf) and image_paths are used as
# module-level state by the cells below.
df,df_cu,image_paths = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


# [Word2Vec](https://www.kaggle.com/medvedew/shopee-only-w2v-cpu)

In [7]:
import re
import gc
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import random
from collections import Counter
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)


TOKEN_RE = re.compile(r'[\w]+')

def tokenize_text_simple_regex(txt, min_token_size=2):
    """Lower-case `txt`, split it into word tokens and lemmatize them as verbs.

    Tokens shorter than `min_token_size` characters are dropped.
    """
    lemmas = []
    for token in TOKEN_RE.findall(str(txt).lower()):
        if len(token) < min_token_size:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(token, pos="v"))
    return lemmas

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Apply `tokenizer` to every text and return the list of token lists."""
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized
'''
corpus = tokenize_corpus(list(df['title']))

model = Word2Vec(
        sentences=corpus,
        vector_size=200, 
        window=20, 
        min_count=1, 
        sg=1, #skip-gram
        negative=10, 
        epochs=1000, 
        seed=SEED,
        workers=10
        )

def plot_vectors(vectors, labels, how='tsne', ax=None):
    if how == 'tsne':
        projections = TSNE().fit_transform(vectors)
    elif how == 'svd':
        projections = TruncatedSVD().fit_transform(vectors)
    x = projections[:, 0]
    y = projections[:, 1]
    ax.scatter(x, y)
    for cur_x, cur_y, cur_label in zip(x, y, labels):
        ax.annotate(cur_label, (cur_x, cur_y))
        
def n_grams(ngram, data):
    freq_dict = defaultdict(int)
    for text in data:
        tokens = [w for w in text.lower().split() if w != " " if w not in stopwords]
        ngrams = zip(*[tokens[i:] for i in range(ngram)])
        list_grams = [" ".join(ngram) for ngram in ngrams]
        for word in list_grams:
            freq_dict[word] += 1
    df_ngram =  pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])   
    df_ngram.columns = ["word", "wordcount"]
    return df_ngram 

df_3_grams = n_grams(3, df['title']) 
print(df_3_grams.head(20))

test_words = ['jam', 'tangan', 'wanita', 'xiaomi','redmi','note', 'somebymi', 'yuja', 'niacin', '100', 'ml']
gensim_words = [w for w in test_words if w in model.wv.index_to_key]
gensim_vectors = np.stack([model.wv[w] for w in gensim_words])

fig, ax = plt.subplots()
fig.set_size_inches((10, 10))
plot_vectors(gensim_vectors, test_words, how='svd', ax=ax)


del train
del test 
del corpus
gc.collect()
'''

'\ncorpus = tokenize_corpus(list(df[\'title\']))\n\nmodel = Word2Vec(\n        sentences=corpus,\n        vector_size=200, \n        window=20, \n        min_count=1, \n        sg=1, #skip-gram\n        negative=10, \n        epochs=1000, \n        seed=SEED,\n        workers=10\n        )\n\ndef plot_vectors(vectors, labels, how=\'tsne\', ax=None):\n    if how == \'tsne\':\n        projections = TSNE().fit_transform(vectors)\n    elif how == \'svd\':\n        projections = TruncatedSVD().fit_transform(vectors)\n    x = projections[:, 0]\n    y = projections[:, 1]\n    ax.scatter(x, y)\n    for cur_x, cur_y, cur_label in zip(x, y, labels):\n        ax.annotate(cur_label, (cur_x, cur_y))\n        \ndef n_grams(ngram, data):\n    freq_dict = defaultdict(int)\n    for text in data:\n        tokens = [w for w in text.lower().split() if w != " " if w not in stopwords]\n        ngrams = zip(*[tokens[i:] for i in range(ngram)])\n        list_grams = [" ".join(ngram) for ngram in ngrams]\n    

In [8]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split into a set of ids; the score of a row is
    2 * |true & pred| / (|true| + |pred|). Returns a NumPy array with one
    value per row (a row with two empty sets evaluates 0/0).
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(lambda s: len(s)).values
    true_sizes = true_sets.apply(lambda s: len(s)).values
    return 2 * overlap / (pred_sizes + true_sizes)

In [9]:
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG once, before any model or DataLoader is created.
seed_torch(CFG.seed)

In [10]:
def combine_predictions(row):
    """Merge the four per-row prediction arrays into one de-duplicated string.

    The ids from the image, TF-IDF, SBERT and Word2Vec predictions are
    concatenated, reduced to their sorted unique values and space-joined.
    """
    prediction_columns = (
        'image_predictions',
        'text_predictions',
        'text_predictions_sbert',
        'text_predictions_w2v',
    )
    merged = np.concatenate([row[column] for column in prediction_columns])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [11]:
def get_image_predictions(df, embeddings,threshold = 0.0, max_neighbors = 50):
    """Find, for every row, the posting_ids whose embedding is close enough.

    A cosine nearest-neighbour index is fitted on `embeddings` and queried
    with the same embeddings; neighbours whose distance is strictly below
    `threshold` are kept.

    Args:
        df: frame with a 'posting_id' column, aligned row-by-row with `embeddings`.
        embeddings: 2-D array, one row per item.
        threshold: maximum (exclusive) cosine distance for a match.
        max_neighbors: upper bound on the neighbours inspected per row.

    Returns:
        list with one array of posting_ids per row of `embeddings`.

    Note:
        `NearestNeighbors` is whatever the name is bound to when this runs; in
        this notebook the later `from sklearn.neighbors import NearestNeighbors`
        rebinds the name imported from cuml.
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        return []

    # The previous rule (50 if len(df) > 3 else 3) asked for more neighbours
    # than samples for 1-2 or 4-49 rows, which a k-NN query cannot serve.
    KNN = min(max_neighbors, n_samples)

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    posting_id_column = df['posting_id']  # hoisted: invariant across rows
    predictions = []
    for k in tqdm(range(n_samples)):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        predictions.append(posting_id_column.iloc[ids].values)

    del model, distances, indices
    gc.collect()
    return predictions

In [12]:
def get_test_transforms():
    """Build the inference pipeline: square resize, normalize, to tensor."""
    side = CFG.img_size
    steps = [
        A.Resize(side, side, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

In [13]:
class AverageMeter(object):
    """Running average tracker: keeps last value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def fetch_loss():
    """Return a fresh cross-entropy criterion."""
    return nn.CrossEntropyLoss()

In [14]:
class ShopeeTextDataset(Dataset):
    """Dataset yielding (masked input_ids, labels) pairs for masked-language modelling.

    Relies on the module-level names TOKENIZER, mask_tok and CONFIG, which are
    assigned in the text-inference cell; they must exist before items are read.
    """

    def __init__(self, csv):
        # reset_index() keeps the old index as an extra 'index' column.
        self.csv = csv.reset_index()

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]
        
        text = row.title
        
        # Pad / truncate every title to exactly 64 token ids.
        text = TOKENIZER(text,
                         return_attention_mask=False,
                         return_token_type_ids=False,
                         padding='max_length',
                         truncation=True,
                         max_length=64)
        
        input_ids = text['input_ids']
        
        input_ids,labels = self.prepare_mlm_input_and_labels(np.array(input_ids))

        input_ids = torch.tensor(input_ids,dtype=torch.long)
        labels = torch.tensor(labels,dtype=torch.long)
    
        return input_ids,labels
    
    def prepare_mlm_input_and_labels(self,X):
        """Apply BERT-style random masking to the id array X.

        Returns (X_mlm, labels): the corrupted ids, and an array holding the
        original id at every selected position and -100 elsewhere.
        """
        # 15% BERT masking
        inp_mask = np.random.rand(*X.shape)<0.15 
        # do not mask special tokens (every id <= 2 is treated as special)
        inp_mask[X<=2] = False
        # set targets to -100 by default; presumably this is meant to be ignored
        # by the loss (-100 is CrossEntropyLoss's default ignore_index) -- confirm
        labels = -100 * np.ones(X.shape, dtype=int)
        # set labels for masked tokens
        labels[inp_mask] = X[inp_mask]
        
        # prepare input
        X_mlm = np.copy(X)
        # replace 90% of the selected positions with mask_tok;
        # this leaves 10% of the selected positions unchanged
        inp_mask_2mask = inp_mask  & (np.random.rand(*X.shape)<0.90)
        X_mlm[inp_mask_2mask] = mask_tok

        # of those replaced positions, 1/9 (i.e. ~10% of all selected positions)
        # get a random token id in [3, CONFIG.vocab_size) instead of mask_tok
        inp_mask_2random = inp_mask_2mask  & (np.random.rand(*X.shape) < 1/9)
        X_mlm[inp_mask_2random] = np.random.randint(3, CONFIG.vocab_size, inp_mask_2random.sum())

        return X_mlm, labels

In [15]:
class ShopeeDatasetText(Dataset):
    """Dataset turning each title into (input_ids, attention_mask) tensors.

    Tokenization uses the module-level TOKENIZER and pads / truncates every
    title to 128 tokens.
    """

    def __init__(self, csv):
        # The previous index is kept as an extra 'index' column.
        self.csv = csv.reset_index()

    def __len__(self):
        n_rows, _ = self.csv.shape
        return n_rows

    def __getitem__(self, index):
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(
            title,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a batch dimension of 1; strip it.
        return encoded['input_ids'][0], encoded['attention_mask'][0]


In [16]:
class ShopeeDataset(Dataset):
    """Image dataset that loads a file with OpenCV and applies optional transforms.

    Args:
        image_paths: indexable collection of image file paths (array, Series
            with a default integer index, or list).
        transforms: optional callable invoked as `transforms(image=img)` and
            returning a mapping with an 'image' entry.
    """

    def __init__(self, image_paths, transforms=None):

        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() instead of .shape[0] so plain lists work as well as arrays.
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return (image, dummy label); the label is a constant tensor(1)."""
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable / missing file by returning None;
            # fail here with the path rather than inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        return image,torch.tensor(1)

In [17]:
class ShopeeNet(nn.Module):
    """Transformer text encoder returning L2-normalized first-token embeddings."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for interface compatibility; not used here
        :param model_name: name or path given to AutoModel.from_pretrained
        :param use_fc: add a dropout -> linear -> batch-norm projection
        :param fc_dim: output size of that projection
        :param dropout: dropout probability in front of the projection
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.use_fc = use_fc

        if not use_fc:
            return

        hidden_size = self.transformer.config.hidden_size
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()

    def _init_params(self):
        """Xavier-init the projection, identity-init the batch norm."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        """Return the L2-normalized feature of every sequence in the batch."""
        return F.normalize(self.extract_feat(input_ids,attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Take the first-token vector of the transformer's first output."""
        outputs = self.transformer(input_ids=input_ids,attention_mask=attention_mask)
        first_token = outputs[0][:,0,:]

        if not self.use_fc:
            return first_token

        return self.bn(self.fc(self.dropout(first_token)))

In [18]:
def _cos_sim_matches(df, embeddings, threshold, chunk=1024*4, log_chunks=False):
    """Return, per row, the posting_ids whose dot product with that row exceeds `threshold`.

    The similarity matrix is computed `chunk` rows at a time to bound GPU memory.
    """
    matches = []
    n_rows = len(embeddings)
    for a in range(0, n_rows, chunk):
        b = min(a + chunk, n_rows)
        if log_chunks:
            print('chunk',a,'to',b)

        # (b - a, n_rows) block of similarities for rows a..b
        cts = cupy.matmul(embeddings,embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            matches.append(df.iloc[cupy.asnumpy(IDX)].posting_id.values)
    return matches


def get_neighbours_cos_sim(df,embeddings):
    '''
    When using cos_sim use normalized features else use normal features

    With GET_CV set, thresholds 0.65..0.90 (step 0.05) are scored with
    f1_score against df['matches'] and the best one is printed; df gains the
    columns 'pred_matches' and 'f1'. Otherwise a fixed threshold of 0.85 is used.

    Returns (df, preds). In the GET_CV branch preds are space-joined strings
    from the LAST threshold tried (not the best one); otherwise preds are
    arrays of posting_ids.
    '''
    embeddings = cupy.array(embeddings)

    if GET_CV:
        thresholds = list(np.arange(0.65,0.95,0.05))

        scores = []
        for threshold in thresholds:
            print('Finding similar titles...for threshold :',threshold)
            preds = [' '.join(ids) for ids in _cos_sim_matches(df, embeddings, threshold)]

            df['pred_matches'] = preds
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    else:
        threshold = 0.85

        print('Finding similar texts...for threshold :',threshold)
        preds = _cos_sim_matches(df, embeddings, threshold, log_chunks=True)

    return df, preds

In [19]:
'''
credit : https://github.com/HuangYG123/CurricularFace/blob/8b2f47318117995aa05490c05b455b113489917e/head/metrics.py#L70
'''

def l2_norm(input, axis = 1):
    """Divide `input` by its L2 norm taken along `axis` (dimension kept for broadcasting)."""
    return torch.div(input, torch.norm(input, 2, axis, True))

class CurricularFace(nn.Module):
    """CurricularFace margin head (adapted from the repository credited above).

    Produces scaled cosine logits with an additive angular margin on the
    target class and a re-weighting of the non-target logits driven by the
    running statistic `t`.
    """

    def __init__(self, in_features, out_features, s = 30, m = 0.50):
        """
        :param in_features: embedding dimension
        :param out_features: number of classes
        :param s: scale multiplied onto the final logits
        :param m: angular margin (radians) added to the target angle
        """
        super(CurricularFace, self).__init__()

        print('Using Curricular Face')

        self.in_features = in_features
        self.out_features = out_features
        self.m = m
        self.s = s
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Below this target cosine the fallback `target_logit - mm` is used in forward().
        self.threshold = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        # Class weights stored as (in_features, out_features): one column per class.
        self.kernel = nn.Parameter(torch.Tensor(in_features, out_features))
        # Running mean of the target cosines, updated in forward().
        self.register_buffer('t', torch.zeros(1))
        nn.init.normal_(self.kernel, std=0.01)

    def forward(self, embbedings, label):
        """Return (scaled logits, cross-entropy loss of those logits against `label`)."""
        # Unit-normalize embeddings per row and class weights per column.
        embbedings = l2_norm(embbedings, axis = 1)
        kernel_norm = l2_norm(self.kernel, axis = 0)
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
        with torch.no_grad():
            # NOTE(review): origin_cos is not read again in this method.
            origin_cos = cos_theta.clone()
        # Cosine of each sample to its own class, shape (batch, 1).
        target_logit = cos_theta[torch.arange(0, embbedings.size(0)), label].view(-1, 1)

        sin_theta = torch.sqrt(1.0 - torch.pow(target_logit, 2))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m #cos(target+margin)
        # Entries whose cosine exceeds the margin-penalized target cosine of their row.
        mask = cos_theta > cos_theta_m
        final_target_logit = torch.where(target_logit > self.threshold, cos_theta_m, target_logit - self.mm)

        hard_example = cos_theta[mask]
        with torch.no_grad():
            # Exponential moving average of the mean target cosine (momentum 0.01).
            self.t = target_logit.mean() * 0.01 + (1 - 0.01) * self.t
        # Re-weight the masked entries in place by (t + cos).
        cos_theta[mask] = hard_example * (self.t + hard_example)
        # Write the margin-adjusted value into each sample's target column.
        cos_theta.scatter_(1, label.view(-1, 1).long(), final_target_logit)
        output = cos_theta * self.s
        return output, nn.CrossEntropyLoss()(output,label)

class ArcMarginProduct(nn.Module):
    """ArcFace head: cosine logits with an additive angular margin on the target class.

    Args:
        in_features: embedding dimension.
        out_features: number of classes.
        scale: multiplier applied to the final logits.
        margin: angular margin (radians) added to the target angle.
        easy_margin: apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape (batch, out_features) for `input` and integer `label`."""
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine^2 marginally above 1; clamp so sqrt never yields NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits instead of a hard-coded 'cuda',
        # so the head also works on CPU or a non-default GPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Target columns take phi, every other column keeps the plain cosine.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ShopeeModel(nn.Module):
    """timm backbone + average pooling + (dropout, linear, batch-norm) neck + margin head.

    forward() returns the neck features only; the margin head (`self.final`)
    is still built so that checkpoints containing its weights load.

    Args:
        model_name: timm model name; "curricular_face_eca_nfnet_l1" selects an
            eca_nfnet_l1 backbone with a CurricularFace head.
        n_classes: number of classes of the margin head.
        fc_dim: output size of the neck.
        margin, scale: margin-head hyper-parameters.
        use_fc: whether extract_feat() applies the neck.
        pretrained: forwarded to timm.create_model.

    Raises:
        ValueError: if `model_name` is not one of the supported backbone families.
    """

    def __init__(
        self, model_name,
        n_classes = CFG.classes,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        if model_name == "curricular_face_eca_nfnet_l1":
            self.backbone = timm.create_model("eca_nfnet_l1", pretrained=pretrained)
        else:
            self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the classifier and built-in pooling so the backbone emits a feature map.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif "nfnet" in model_name:
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Previously this fell through and failed later on an unbound
            # `final_in_features`; fail early with an actionable message.
            raise ValueError(
                'Unsupported model_name {!r}: expected resnext50_32x4d, efficientnet_b3, '
                'tf_efficientnet_b5_ns or an nfnet variant'.format(model_name))

        self.pooling = nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        # The neck is created regardless of use_fc, so its parameters are
        # always part of the state dict.
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        if model_name == "curricular_face_eca_nfnet_l1":
            self.final = CurricularFace(final_in_features,
                                   n_classes,
                                   s=scale,
                                   m=margin)
        else:
            self.final = ArcMarginProduct(
                final_in_features,
                n_classes,
                scale = scale,
                margin = margin,
                easy_margin = False,
                ls_eps = 0.0
            )

    def _init_params(self):
        """Xavier-init the neck's linear layer, identity-init its batch norm."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding of `image`; `label` is accepted but unused at inference."""
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Backbone -> global average pool -> optional neck; returns (batch, features)."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

In [20]:
class Mish_func(torch.autograd.Function):

    """Memory-efficient Mish: f(x) = x * tanh(softplus(x)).

    Only the input is saved; the gradient is recomputed in backward().
    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor (`saved_variables` is deprecated).
        (i,) = ctx.saved_tensors

        # softplus(i) = log(1 + exp(i)), evaluated without overflowing exp for large i.
        tanh_sp = torch.tanh(F.softplus(i))

        # d/dx tanh(softplus(x)) = sech^2(softplus(x)) * sigmoid(x),
        # with sech^2 = 1 - tanh^2.
        grad_gx = (1. - tanh_sp * tanh_sp) * i.sigmoid()

        grad_f = tanh_sp + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around Mish_func; keyword arguments are accepted and ignored."""

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of exact type `existing_layer` for `new_layer`.

    The model is modified in place and also returned; the same `new_layer`
    object is installed at every replaced position.
    """

    for child_name, child in reversed(model._modules.items()):
        has_children = len(list(child.children())) > 0
        if has_children:
            # Descend first; the recursive call edits `child` in place.
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

In [21]:
def get_image_embeddings(image_paths, model_name, model_path, is_mish = False):
    """Embed every image with one ShopeeModel checkpoint.

    Args:
        image_paths: indexable collection of image file paths.
        model_name: backbone name handed to ShopeeModel.
        model_path: path of the state dict to load.
        is_mish: replace every nn.SiLU by Mish before loading the weights.

    Returns:
        NumPy array with one embedding row per image.
    """
    embeds = []

    model = ShopeeModel(model_name=model_name)

    if is_mish:
        model = replace_activations(model, torch.nn.SiLU, Mish())

    # Deserialize on CPU so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to CFG.device afterwards.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model = model.to(CFG.device)
    # Switch to eval mode after the activation swap so the new modules are covered too.
    model.eval()

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=CFG.num_workers
    )

    with torch.no_grad():
        for img,label in tqdm(image_loader):
            # Same device as the model (previously a hard-coded .cuda()).
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img,label)
            embeds.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [22]:
class ShopeeNetExtend(nn.Module):
    """Transformer text encoder with an optional neck and a classification head."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 scale=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785):
        """
        :param n_classes: number of output classes of the head
        :param model_name: name or path given to AutoModel.from_pretrained
        :param pooling: stored on the instance; not consulted by extract_feat
        :param use_fc: add a dropout -> linear -> batch-norm -> ReLU neck
        :param fc_dim: output size of the neck
        :param dropout: dropout probability in front of the neck
        :param loss_module: 'arcface' builds an ArcMarginProduct head, anything else a Linear head
        :param scale: ArcFace scale
        :param margin: ArcFace margin
        :param ls_eps: ArcFace label-smoothing epsilon
        :param theta_zero: accepted but unused
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        head_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(head_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self.relu = nn.ReLU()
            self._init_params()
            head_in_features = fc_dim

        self.loss_module = loss_module
        if loss_module != 'arcface':
            self.final = nn.Linear(head_in_features, n_classes)
        else:
            self.final = ArcMarginProduct(head_in_features, n_classes,
                                          scale=scale, margin=margin, easy_margin=False, ls_eps=ls_eps)

    def _init_params(self):
        """Xavier-init the neck's linear layer, identity-init its batch norm."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask, label):
        """Return the head's logits; `label` is only consumed by the ArcFace head."""
        feature = self.extract_feat(input_ids,attention_mask)
        if self.loss_module == 'arcface':
            return self.final(feature, label)
        return self.final(feature)

    def extract_feat(self, input_ids,attention_mask):
        """Take the first-token vector and, if enabled, pass it through the neck."""
        outputs = self.transformer(input_ids=input_ids,attention_mask=attention_mask)
        first_token = outputs[0][:,0,:]

        if not self.use_fc:
            return first_token

        projected = self.bn(self.fc(self.dropout(first_token)))
        return self.relu(projected)

In [23]:
def get_text_embeddings(df,model_params, transformer_model_path, ext = False):
    """Embed every title of `df` with one fine-tuned text checkpoint.

    Args:
        df: frame with a 'title' column.
        model_params: keyword arguments for the model constructor.
        transformer_model_path: path of the fine-tuned state dict.
        ext: build ShopeeNetExtend instead of ShopeeNet.
            NOTE(review): ShopeeNetExtend.forward also requires a `label`
            argument, so the two-argument call below cannot work with ext=True.

    Returns:
        NumPy array with one embedding row per title.

    The module-level TOKENIZER must already match the model being loaded.
    """
    embeds = []
    if ext :
        model = ShopeeNetExtend(**model_params)
    else:
        model = ShopeeNet(**model_params)

    # Deserialize on CPU so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to `device` afterwards.
    checkpoint = torch.load(transformer_model_path, map_location='cpu')
    # The last checkpoint entry is dropped by position -- presumably the
    # training head's weight, which the inference model lacks (TODO confirm).
    model.load_state_dict(dict(list(checkpoint.items())[:-1]))
    model = model.to(device)
    model.eval()

    # change datasets for different Models
    text_dataset = ShopeeDatasetText(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            # Same device as the model (previously a hard-coded .cuda()).
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            feat = model(input_ids, attention_mask)
            embeds.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [24]:
def get_text_predictions(df, max_features = 25_000, threshold = 0.75):
    """Match titles by similarity of their binary TF-IDF vectors.

    Args:
        df: frame with 'title' and 'posting_id' columns.
        max_features: vocabulary size limit of the TF-IDF vectorizer.
        threshold: minimum (exclusive) similarity for two titles to match.

    Returns:
        list with one array of matching posting_ids per row of `df`.
    """
    # Vectorize the titles of the frame that was passed in; this previously
    # read the global `df_cu`, silently ignoring the `df` argument.
    if isinstance(df, cudf.DataFrame):
        titles = df['title']
    else:
        titles = cudf.from_pandas(df['title'])

    model = TfidfVectorizer( binary = True, max_features = max_features)
    text_embeddings = model.fit_transform(titles).toarray()
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    n_rows = len(df)
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]> threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    del model,text_embeddings
    gc.collect()
    return preds

# IMAGE INFERENCE

In [25]:
# One embedding matrix per image model; the last argument enables the
# SiLU -> Mish swap (True for models 1 and 4).
image_embeddings1 = get_image_embeddings(image_paths.values, CFG.model_name1, CFG.model_path1, True)
image_embeddings2 = get_image_embeddings(image_paths.values, CFG.model_name2, CFG.model_path2, False)
image_embeddings3 = get_image_embeddings(image_paths.values, CFG.model_name3, CFG.model_path3, False)
image_embeddings4 = get_image_embeddings(image_paths.values, CFG.model_name4, CFG.model_path4, True)
# Disabled: CFG defines no model_name5 / model_path5.
#image_embeddings5 = get_image_embeddings(image_paths.values, CFG.model_name5, CFG.model_path5, True)

# Ensemble by element-wise mean of the four embedding matrices, then match
# images whose cosine distance is below 0.36.
image_embeddings = (image_embeddings1 + image_embeddings2 + image_embeddings3 + image_embeddings4 ) / 4
image_predictions = get_image_predictions(df, image_embeddings, threshold = 0.36)

Building Model Backbone for eca_nfnet_l1 model


100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


Our image embeddings shape is (3, 512)
Building Model Backbone for efficientnet_b3 model


100%|██████████| 1/1 [00:00<00:00,  3.34it/s]


Our image embeddings shape is (3, 512)
Building Model Backbone for tf_efficientnet_b5_ns model


100%|██████████| 1/1 [00:00<00:00,  2.98it/s]


Our image embeddings shape is (3, 512)
Building Model Backbone for eca_nfnet_l0 model


100%|██████████| 1/1 [00:00<00:00,  3.19it/s]
100%|██████████| 3/3 [00:00<00:00, 3864.53it/s]

Our image embeddings shape is (3, 512)





# TEXT INFERENCE

In [26]:

# One entry per text encoder:
#   (pretrained model name, tokenizer class, mask-token id, fine-tuned checkpoint path)
# The second model is sentence-transformers/paraphrase-xlm-r-multilingual-v1.
text_model_specs = [
    (transformer_model_1, transformers.AutoTokenizer, 250001, TEXT_MODEL_PATH_1),
    (transformer_model_2, transformers.AutoTokenizer, 250001, TEXT_MODEL_PATH_2),
    (transformer_model_3, DistilBertTokenizer, 31999, TEXT_MODEL_PATH_3),
]

per_model_text_embeddings = []
for spec_name, spec_tokenizer_cls, spec_mask_tok, spec_checkpoint in text_model_specs:
    # TOKENIZER, CONFIG and mask_tok are module-level names that this cell
    # rebinds before every call. NOTE(review): presumably the dataset/model
    # code behind get_text_embeddings reads them -- confirm against that code.
    model_params['model_name'] = spec_name
    TOKENIZER = spec_tokenizer_cls.from_pretrained(spec_name)
    CONFIG = transformers.AutoConfig.from_pretrained(spec_name)
    mask_tok = spec_mask_tok
    # The earlier bare `DistilBertModel.from_pretrained(...)` call for the third
    # model was dropped: its return value was discarded, so it only cost load
    # time and memory.
    per_model_text_embeddings.append(get_text_embeddings(df, model_params, spec_checkpoint))

# Keep the per-model results available under their original names.
(text_embeddings_roberta_base,
 text_embeddings_param,
 text_embeddings_distil_bert_indonesian) = per_model_text_embeddings

# Ensemble by taking the element-wise mean of the three embedding matrices.
text_embeddings = (text_embeddings_roberta_base + text_embeddings_param + text_embeddings_distil_bert_indonesian) / 3

# Neighbour search on the averaged transformer embeddings.
df,text_predictions_sbert = get_neighbours_cos_sim(df,text_embeddings)

# Independent TF-IDF based title matching.
text_predictions_tfidf = get_text_predictions(df, max_features = 25_000)

100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


Our text embeddings shape is (3, 768)


100%|██████████| 1/1 [00:00<00:00,  2.37it/s]


Our text embeddings shape is (3, 768)


100%|██████████| 1/1 [00:00<00:00,  3.72it/s]


Our text embeddings shape is (3, 768)
Finding similar texts...for threshold : 0.85
chunk 0 to 3
Finding similar titles...
chunk 0 to 3


# W2V Inference

In [27]:
# Build one sentence embedding per title from pre-trained Word2Vec vectors:
# the element-wise median of the vectors of all in-vocabulary tokens.
corpus = tokenize_corpus(list(df['title']))
w2v_model = Word2Vec.load("../input/word2vec-100-indonesian/idwiki_word2vec_100.model")
word_vectors = w2v_model.wv

# Fallback for titles with no in-vocabulary token. Its length is taken from the
# loaded model instead of a hard-coded 200, so every row of `embeds` has the
# same dimensionality whatever model file is loaded.
zero_embed = np.zeros(word_vectors.vector_size, dtype='float32').tolist()

embeds = []
for sentence in corpus:
    # `key_to_index` is a dict, so membership is a hash lookup; the previous
    # check against the `index_to_key` list scanned the whole vocabulary for
    # every token.
    words = [w for w in sentence if w in word_vectors.key_to_index]
    if len(words)==0:
        embeds.append(list(zero_embed))
    else:
        words_vector = np.array([word_vectors[w] for w in words])
        embeds.append(np.median(words_vector, axis=0).tolist())

# Neighbour search on the Word2Vec sentence embeddings.
df,text_predictions_w2v = get_neighbours_cos_sim(df,embeds)


Finding similar texts...for threshold : 0.85
chunk 0 to 3


In [28]:
# Attach every prediction source to the frame, merge them per row and write the
# submission file. The former `if not GET_CV: ... else: ...` wrapper was removed
# because both branches executed exactly the same statements.
df['text_predictions_w2v'] = text_predictions_w2v
df['text_predictions_sbert'] = text_predictions_sbert
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions_tfidf
# combine_predictions is applied row-wise to produce the `matches` column.
df['matches'] = df.apply(combine_predictions, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

Thank you so much for reading this notebook. If you have any suggestions or ideas on ensembling these models together, please let me know. 😁