In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("data/siamese_synonyms.csv")
df

Unnamed: 0,Word_1,Word_2,Label
0,машина,тачка,1
1,машина,колёса,1
2,тачка,автомашина,1
3,тачка,драндулет,1
4,тачка,колёса,1
...,...,...,...
554555,товарищ,наговаривание,0
554556,блоха,пролегать,0
554557,жадность,давать,0
554558,непросвещённый,постигнуть,0


## Get word vectors (RU fasttext embeddings)

In [5]:
import fasttext.util

fasttext.util.download_model("ru", if_exists="ignore")
fasttext_model = fasttext.load_model("cc.ru.300.bin")



In [6]:
import torch
from torch.utils.data import DataLoader
from dataset import SynonymsDataset

from sklearn.model_selection import train_test_split

train, test = train_test_split(df)
train, val = train_test_split(train)
BATCH_SIZE = 256

train_dataloader = DataLoader(
    SynonymsDataset(train, fasttext_model), batch_size=BATCH_SIZE, shuffle=False
)
val_dataloader = DataLoader(
    SynonymsDataset(val, fasttext_model), batch_size=BATCH_SIZE, shuffle=False
)
test_dataloader = DataLoader(
    SynonymsDataset(test, fasttext_model), batch_size=BATCH_SIZE, shuffle=False
)

## Train base Siamese model

In [7]:
from model import BaseSiamese

config = {}

DEVICE = torch.device("cuda")
EMBEDDING_SIZE = fasttext_model.get_dimension()
OUTPUT_DIR = "trained_models"
MODEL_NAME = "siamese_ft"
config = (OUTPUT_DIR, MODEL_NAME)

model = BaseSiamese(EMBEDDING_SIZE)
model.to(DEVICE)

loss_fn = torch.nn.CosineEmbeddingLoss()
optimizer = torch.optim.Adam(params=model.parameters(), weight_decay=1e-9)

In [8]:
from model import train
train(DEVICE, model, train_dataloader, val_dataloader, loss_fn, optimizer, config, num_epochs=5)

100%|██████████| 1219/1219 [00:27<00:00, 44.70it/s]
100%|██████████| 407/407 [00:09<00:00, 44.33it/s]


epoch 1, loss: 0.20077415099145937
valid loss: 0.18861222029171645, valid F1 0.8369712817174858
Best score, save model


100%|██████████| 1219/1219 [00:27<00:00, 44.56it/s]
100%|██████████| 407/407 [00:09<00:00, 43.05it/s]


epoch 2, loss: 0.18790905056574778
valid loss: 0.18759974324732506, valid F1 0.8375980282321309
Best score, save model


100%|██████████| 1219/1219 [00:28<00:00, 43.10it/s]
100%|██████████| 407/407 [00:09<00:00, 43.24it/s]


epoch 3, loss: 0.18730605846550544
valid loss: 0.18732822669958307, valid F1 0.837749123376987
Best score, save model


100%|██████████| 1219/1219 [00:27<00:00, 43.60it/s]
100%|██████████| 407/407 [00:09<00:00, 44.13it/s]


epoch 4, loss: 0.187013447358043
valid loss: 0.18712008720680304, valid F1 0.8379032258064516
Best score, save model


100%|██████████| 1219/1219 [00:26<00:00, 45.21it/s]
100%|██████████| 407/407 [00:08<00:00, 45.59it/s]


epoch 5, loss: 0.1868202631301426
valid loss: 0.1870305415526655, valid F1 0.8379614065756741
Best score, save model


In [9]:
from model import evaluate
from sklearn.metrics import classification_report

best_model = BaseSiamese(EMBEDDING_SIZE)
best_model.load_state_dict(torch.load(f"{OUTPUT_DIR}/{MODEL_NAME}.pth"))
best_model.to(DEVICE)

_, predicted_labels, correct_labels = evaluate(DEVICE, best_model, test_dataloader, loss_fn)
print(classification_report(predicted_labels, correct_labels))

100%|██████████| 542/542 [00:12<00:00, 45.13it/s]


              precision    recall  f1-score   support

       False       0.85      0.91      0.88     76161
        True       0.88      0.80      0.84     62479

    accuracy                           0.86    138640
   macro avg       0.86      0.85      0.86    138640
weighted avg       0.86      0.86      0.86    138640

