In [1]:
import numpy as np
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
try:
    from sentence_transformers import SentenceTransformer, InputExample, losses, util
except:
    !pip install sentence_transformers
    from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, fbeta_score, hamming_loss, jaccard_score, average_precision_score

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=23db5284bc6ea862b99d6b8533082f898f01edfc0e9322d3f4fb862de4da86de
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [2]:
from sklearn.metrics import roc_auc_score

In [3]:
BASE_MODEL = '../input/name-similarity-transformer/trained_MiniLM-L12-v2'
NEW_MODEL = '../input/nst-part-3/1.0_db-multilingual-cased-v2'

In [4]:
df = pd.read_csv('../input/new-preprocessing-nst/data.csv')

In [5]:
train, test = train_test_split(df,test_size=0.4, 
                                random_state=128,
                                stratify=df['is_dup'])

In [6]:
test.is_dup.value_counts()

0    93224
1    93224
Name: is_dup, dtype: int64

In [7]:
# Two lists of sentences
name_1 = [x[0] for x in test[['name_1']].values.tolist()[160000:]]
name_2 = [x[0] for x in test[['name_2']].values.tolist()[160000:]]

In [8]:
y_test = test.is_dup.values.tolist()[160000:]

In [9]:
import gc
def get_pred(model, name1, name2):
    torch.cuda.empty_cache()
    gc.collect()
    with torch.no_grad():
        model1 = SentenceTransformer(model, device = 'cuda')
        embeddings1 = model1.encode(name_1, convert_to_tensor=True)
        embeddings2 = model1.encode(name_2, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embeddings1, embeddings2)
        y_pred = []
        for i in range(len(cosine_scores)):
            y_pred.append(float(cosine_scores[i][i]))
        return y_pred

def evaluate(y_test, y_pred, metrics):
    y_test = np.asarray(y_test, dtype='float32')
    y_predf = [1 if x > 0.7 else 0 for x in y_pred]
    y_predf = np.asarray(y_predf, dtype='float32')
    metrics['F1-score'].append(f1_score(y_test, y_predf))
    metrics['Recall'].append(recall_score(y_test, y_predf))
    metrics['Precision'].append(precision_score(y_test, y_predf))
    metrics['Accuracy'].append(accuracy_score(y_test, y_predf))
    metrics['F-beta'].append(fbeta_score(y_test, y_predf, beta=0.5))
    metrics['Hamming loss'].append(hamming_loss(y_test, y_predf))
    metrics['Jaccard score'].append(jaccard_score(y_test, y_predf))
    metrics['AP'].append(average_precision_score(y_test, y_predf))
    metrics['ROC-AUC'].append(roc_auc_score(y_test, y_predf))
    return metrics

In [10]:
modelsp = [BASE_MODEL, NEW_MODEL]

In [11]:
metrics = {'Models':[],'F1-score':[],'Recall':[],'Precision':[],'Accuracy':[],'F-beta':[],'Hamming loss':[],'Jaccard score':[],'AP':[],'ROC-AUC':[]}
for i in range(len(modelsp)):
    metrics['Models'].append(modelsp[i].split('/')[-1])
    y = get_pred(modelsp[i], name_1, name_2)
    metrics = evaluate(y_test, y, metrics)

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

In [12]:
dat = pd.DataFrame(data = metrics)
dat

Unnamed: 0,Models,F1-score,Recall,Precision,Accuracy,F-beta,Hamming loss,Jaccard score,AP,ROC-AUC
0,trained_MiniLM-L12-v2,0.384427,0.243046,0.919042,0.609611,0.590541,0.390389,0.237951,0.60302,0.610751
1,1.0_db-multilingual-cased-v2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


In [13]:
dat.to_csv('comp_model.csv',index = False)

In [14]:
#hard test
metrics = {'Models':[],'F1-score':[],'Recall':[],'Precision':[],'Accuracy':[],'F-beta':[],'Hamming loss':[],'Jaccard score':[],'AP':[],'ROC-AUC':[]}
name_1 = ['bridge', 'Газпром', 'Россети','Queen Elizabeth', 'Зенит Санкт-Петербург', 'Росбанк','Улыбка Радуги', 'Kamaz']
name_2 = ['Brigitte Bardot', 'Газпромнефтегаз', 'Роскачество','queen of Great Britain', 'Zenit SPb', 'Сельхозбанк','Радужный леприкон', 'Kamaz Auto of Russia']
y_test = [0,1,0,1,1,0,0,1]
for i in range(len(modelsp)):
    metrics['Models'].append(modelsp[i].split('/')[-1])
    y = get_pred(modelsp[i], name_1, name_2)
    metrics = evaluate(y_test, y, metrics)
dat = pd.DataFrame(data = metrics)
dat

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Models,F1-score,Recall,Precision,Accuracy,F-beta,Hamming loss,Jaccard score,AP,ROC-AUC
0,trained_MiniLM-L12-v2,0.4,0.5,0.333333,0.25,0.357143,0.75,0.25,0.416667,0.25
1,1.0_db-multilingual-cased-v2,0.888889,1.0,0.8,0.875,0.833333,0.125,0.8,0.8,0.875


In [15]:
dat.to_csv('hard_comp_model.csv',index = False)

In [16]:
import pickle
model = SentenceTransformer(NEW_MODEL, device = 'cuda')





In [17]:
name_1 = [x[0] for x in test[['name_1']].values.tolist()[160000:]]
name_2 = [x[0] for x in test[['name_2']].values.tolist()[160000:]]
embeddings1 = model.encode(name_1, convert_to_numpy=True)
embeddings2 = model.encode(name_2, convert_to_numpy=True)
y_test = test.is_dup.values.tolist()[160000:]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

In [18]:
from sklearn.decomposition import PCA

In [19]:
len(name_1)

26448

In [20]:
new_dimension = 128
pca = PCA(n_components=new_dimension)
pca.fit(embeddings1[:128])
pca.fit(embeddings2[:128])
pca_comp = np.asarray(pca.components_)

In [21]:
from sentence_transformers import models

In [22]:
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=new_dimension, bias=False, activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

In [23]:
model.save('./128_size_bert_v2')

In [24]:
import shutil 
shutil.make_archive('./128_size_bert_v2', 'zip', './')

'/kaggle/working/128_size_bert_v2.zip'

In [25]:
modelsp.append('./128_size_bert_v2')

In [26]:
metrics = {'Models':[],'F1-score':[],'Recall':[],'Precision':[],'Accuracy':[],'F-beta':[],'Hamming loss':[],'Jaccard score':[],'AP':[],'ROC-AUC':[]}
for i in range(len(modelsp)):
    metrics['Models'].append(modelsp[i].split('/')[-1])
    y = get_pred(modelsp[i], name_1, name_2)
    metrics = evaluate(y_test, y, metrics)

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

Batches:   0%|          | 0/827 [00:00<?, ?it/s]

In [27]:
dat = pd.DataFrame(data = metrics)
dat

Unnamed: 0,Models,F1-score,Recall,Precision,Accuracy,F-beta,Hamming loss,Jaccard score,AP,ROC-AUC
0,trained_MiniLM-L12-v2,0.384427,0.243046,0.919042,0.609611,0.590541,0.390389,0.237951,0.60302,0.610751
1,1.0_db-multilingual-cased-v2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
2,128_size_bert_v2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


In [28]:
dat.to_csv('comp_with_128.csv', index = False)

In [29]:
#hard test
metrics = {'Models':[],'F1-score':[],'Recall':[],'Precision':[],'Accuracy':[],'F-beta':[],'Hamming loss':[],'Jaccard score':[],'AP':[],'ROC-AUC':[]}
name_1 = ['bridge', 'Газпром', 'Россети','Queen Elizabeth', 'Зенит Санкт-Петербург', 'Росбанк','Улыбка Радуги', 'Kamaz']
name_2 = ['Brigitte Bardot', 'Газпромнефтегаз', 'Роскачество','queen of Great Britain', 'Zenit SPb', 'Сельхозбанк','Радужный леприкон', 'Kamaz Auto of Russia']
y_test = [0,1,0,1,1,0,0,1]
for i in range(len(modelsp)):
    metrics['Models'].append(modelsp[i].split('/')[-1])
    y = get_pred(modelsp[i], name_1, name_2)
    metrics = evaluate(y_test, y, metrics)
dat = pd.DataFrame(data = metrics)
dat

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Models,F1-score,Recall,Precision,Accuracy,F-beta,Hamming loss,Jaccard score,AP,ROC-AUC
0,trained_MiniLM-L12-v2,0.4,0.5,0.333333,0.25,0.357143,0.75,0.25,0.416667,0.25
1,1.0_db-multilingual-cased-v2,0.888889,1.0,0.8,0.875,0.833333,0.125,0.8,0.8,0.875
2,128_size_bert_v2,0.888889,1.0,0.8,0.875,0.833333,0.125,0.8,0.8,0.875


In [30]:
dat.to_csv('hard_comp_with_128.csv', index = False)