In [1]:
import numpy as np 
import pandas as pd 
import os
import torch
from sklearn.model_selection import train_test_split
try:
    from sentence_transformers import SentenceTransformer, InputExample, losses, util
except:
    !pip install sentence_transformers
    from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, fbeta_score, hamming_loss, jaccard_score, average_precision_score

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=60a34f90273e219a989349fd356195168bd214e3fe599d3c7883d73427e44d86
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [2]:
# Directory that holds the pretrained checkpoints used for fine-tuning.
BASE = '../input/pretrained-transformers-for-fine-tuning/models'
# Names of the entries found directly under BASE.
path_models = os.listdir(BASE)

In [3]:
# Load the preprocessed pair dataset. Later cells read the columns
# `name_1`, `name_2` and `is_dup` from frames derived from it.
df = pd.read_csv('../input/new-preprocessing-nst/data.csv')

In [4]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# `is_dup` label and uses a fixed random_state so it is repeatable.
train, test = train_test_split(df,test_size=0.2, 
                                random_state=128,
                                stratify=df['is_dup'])

In [5]:
# Show how many test rows fall into each `is_dup` class.
test.is_dup.value_counts()

1    46612
0    46612
Name: is_dup, dtype: int64

In [6]:
# Path of the checkpoint (under BASE) that is loaded and fine-tuned below.
mod_path = os.path.join(BASE, 'sentence-transformers_distiluse-base-multilingual-cased-v2')

In [7]:
# Turn every training row into an InputExample: the first two columns form the
# text pair and the third column is the target, cast to float.
ls = train.values.tolist()
train_examples = [
    InputExample(texts=[row[0], row[1]], label=float(row[2]))
    for row in tqdm(ls)
]

100%|██████████| 372895/372895 [00:01<00:00, 197780.06it/s]


In [8]:
# Serve the training examples in shuffled mini-batches of 16.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [9]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(mod_path, device=device)
# Training objective built around the model; the float labels of the
# InputExamples are the targets it is trained against.
train_loss = losses.CosineSimilarityLoss(model)

In [10]:
# Fine-tune for a single epoch with 100 warm-up steps.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23306 [00:00<?, ?it/s]

In [11]:
# Evaluate on the first 10000 held-out rows only.
eval_rows = test.iloc[:10000]
name_1 = eval_rows['name_1'].tolist()
name_2 = eval_rows['name_2'].tolist()
y_test = eval_rows['is_dup'].tolist()

# Embed both sides of every pair as tensors.
embeddings1 = model.encode(name_1, convert_to_tensor=True)
embeddings2 = model.encode(name_2, convert_to_tensor=True)

# NOTE(review): downstream code only reads the entries [i][i] of this result,
# so the off-diagonal similarities computed here are never used.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [12]:
# The score of pair i is the entry [i][i] of the similarity matrix, so the
# main diagonal, converted to plain Python floats, is the prediction list.
y_pred = cosine_scores.diagonal().tolist()

In [13]:
def evaluate(y_test, y_pred, threshold=0.88):
    """Score similarity predictions against binary ground-truth labels.

    Parameters
    ----------
    y_test : sequence of 0/1 labels.
    y_pred : sequence of continuous similarity scores, one per label.
    threshold : score strictly above which a pair is predicted positive.
        Defaults to 0.88, the cut-off that was previously hard-coded.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1, f0.5, hamming loss, jaccard,
        average precision), in that order.
    """
    y_true = np.asarray(y_test, dtype='float32')
    # Keep the raw scores in float64 so the comparison with the threshold
    # matches a plain Python float comparison.
    scores = np.asarray(y_pred, dtype='float64')
    y_predf = (scores > threshold).astype('float32')

    score = f1_score(y_true, y_predf)
    rec = recall_score(y_true, y_predf)
    pre = precision_score(y_true, y_predf)
    ac = accuracy_score(y_true, y_predf)
    fb = fbeta_score(y_true, y_predf, beta=0.5)
    hm = hamming_loss(y_true, y_predf)
    js = jaccard_score(y_true, y_predf)
    # Average precision summarises the ranking quality, so it must be given
    # the continuous scores rather than the thresholded 0/1 predictions.
    ap = average_precision_score(y_true, scores)
    return ac, pre, rec, score, fb, hm, js, ap

In [14]:
# Metrics on the held-out subset.
# NOTE(review): the recorded output is a perfect score on every metric;
# confirm that train and test do not share (near-)identical pairs.
evaluate(y_test, y_pred)

(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0)

In [15]:
# Three hand-written pairs as a quick sanity check: (left, right, is_dup).
sanity_pairs = [
    ('Gazprom', 'Rosneft', 0),
    ('Зенит Санкт-Петербург', 'Спартак', 0),
    ('Tesla Texas', 'Tesla', 1),
]
name_1 = [left for left, _right, _label in sanity_pairs]
name_2 = [right for _left, right, _label in sanity_pairs]
y_test = [label for _left, _right, label in sanity_pairs]

# Embed both sides and compare them; the result is indexed [i][i] downstream.
embeddings1 = model.encode(name_1, convert_to_tensor=True)
embeddings2 = model.encode(name_2, convert_to_tensor=True)

cosine_scores = util.cos_sim(embeddings1, embeddings2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# Pair i is scored by entry [i][i]; take the main diagonal as Python floats.
y_pred = cosine_scores.diagonal().tolist()

In [17]:
# Metrics on the three hand-written pairs (far too few for a real estimate).
evaluate(y_test, y_pred)

(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0)

In [18]:
# Show the raw cosine scores behind the sanity-check metrics.
y_pred

[0.0029274853877723217, 0.038226231932640076, 0.9981639385223389]

In [19]:
# Write the fine-tuned model to a directory in the current working directory.
model.save('1.0_db-multilingual-cased-v2')

In [20]:
import shutil

# Zip only the saved model directory. Without root_dir/base_dir,
# make_archive packs the whole current working directory instead.
# base_dir keeps the model folder as the top-level entry inside the archive.
shutil.make_archive('./1.0_db-multilingual-cased-v2', 'zip',
                    root_dir='.', base_dir='1.0_db-multilingual-cased-v2')

'./1.0_db-multilingual-cased-v2.zip'