In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from root_extractor.baseline import get_heuristic_cognate, get_simple_heuristic_cognate
from root_extractor.neural_morph_segm import load_cls


# Load the word-pair dataset and drop every column whose name starts with
# "Unnamed" (presumably index columns left over from an earlier CSV export).
data = pd.read_csv("data/dataset.csv").loc[
    :, lambda frame: ~frame.columns.str.contains("^Unnamed")
]
data.head()

Unnamed: 0,Lemma_1,Lemma_2,Type,isCognate
0,вода,сыр,Неоднокоренные (разные корни),0
1,коса,зябь,Неоднокоренные (разные корни),0
2,любой,водитель,Неоднокоренные (разные корни),0
3,нож,красить,Неоднокоренные (разные корни),0
4,красивый,переносица,Неоднокоренные (разные корни),0


# Сравнение корней без чередования

In [2]:
# Model object consumed by the root-comparison heuristics.
model = load_cls('root_extractor/models/morphemes-3-5-3-memo.json')

# One prediction per dataset row, in row order, from the simple heuristic
# (root comparison without alternation, per the section heading).
simple_heuristic_predicts = [
    get_simple_heuristic_cognate(model, lemma_1, lemma_2)
    for lemma_1, lemma_2 in zip(data['Lemma_1'], data['Lemma_2'])
]

['вод'] ['сыр']
['кос'] ['зябь']
['люб'] ['вод']
['нож'] ['крас']
['крас'] ['нос']
['бур'] ['люб']
['лж'] ['сыр']
['крас'] ['мак']
['крас'] ['мак']
['краш'] ['мак']
['краш'] ['мак']
['крас'] ['мед']
['крас'] ['мед']
['полн'] ['кос']
['тир'] ['кос']
['люб'] ['кос']
['люб'] ['кос']
['люб'] ['речк']
['люб'] ['речь']
['люб'] ['реч']
['бур'] ['речь']
['бур'] ['лет']
['тер'] ['лет']
['бур'] ['лет']
['нос'] ['мор']
['нос'] ['мор']
['люб'] ['мор']
['нос'] ['мор']
['нос'] ['морь']
['сыр'] ['пар']
['нос'] ['пар']
['нос'] ['пар']
['нос'] ['пар']
['гор'] ['пар']
['гор'] ['пар']
['гор'] ['пар']
['гор'] ['пах']
['гор'] ['пах']
['гор'] ['паш']
['пла'] ['пах']
['мир'] ['пах']
['гар'] ['полн']
['мир'] ['полн']
['гор'] ['полн']
['мер'] ['полн']
['дорож'] ['полн']
['дорож'] ['полн']
['дорог'] ['красн']
['мир'] ['крас']
['нож'] ['красн']
['нож'] ['красн']
['лож'] ['крас']
['лож'] ['красн']
['ложь'] ['красн']
['лг'] ['пах']
['тер'] ['пах']
['тер'] ['пах']
['мир'] ['пла']
['дорог'] ['пла']
['мир'] ['пло']
[

  curr_costs[state] = costs[-1][prev_state] - np.log(


['дорог'] ['бур']
['дорож'] ['бур']
['тер'] ['нос']
['тер'] ['нос']
['ног'] ['нос']
['ног'] ['нос']
['ложь'] ['нос']
['ложь'] ['нос']
['лж'] ['нос']
['как'] ['нос']
['скак'] ['нос']
['скак'] ['гор']
['скак'] ['гор']
['скак'] ['гор']
['скакал'] ['гор']
['скак'] ['гор']
['скак'] ['гор']
['скак'] ['гор']
['клон'] ['мир']
['клон'] ['мир']
['клон'] ['мир']
['клон'] ['мир']
['пах'] ['мер']
['пах'] ['дорож']
['бег'] ['дорож']
['бег'] ['дорог']
['бег'] ['нож']
['бег'] ['нож']
['бег'] ['нож']
['бег'] ['нож']
['бег'] ['лож']
['гор'] ['лож']
['гор'] ['лож']
['гор'] ['лг']
['гар'] ['тер']
['гар'] ['тер']
['гар'] ['дорог']
['гор'] ['нож']
['гор'] ['ног']
['гор'] ['нож']
['гар'] ['ног']
['гор'] ['нож']
['лаг'] ['нож']
['предлаг'] ['зорь']
['предполож'] ['зор']
['излож'] ['зорь']
['лож'] ['чет']
['лож'] ['чет']
['лаг'] ['чт']
['крас'] ['чес']
['крас'] ['счёт']
['крас'] ['чет']
['тир'] ['чет']
['тир'] ['чт']
['тер'] ['чес']
['лет'] ['счёт']
['лет'] ['пир']
['дир'] ['жиг']
['дравш'] ['жига']
['дир'] ['

In [3]:
# Attach the simple-heuristic predictions as a column and show the class balance.
data = data.assign(Simple_heuristic=simple_heuristic_predicts)
data['Simple_heuristic'].value_counts()

False    470
True     230
Name: Simple_heuristic, dtype: int64

In [4]:
# Cast the predictions to integer labels so they can be scored against isCognate.
data = data.astype({'Simple_heuristic': int})
data['Simple_heuristic'].value_counts()

0    470
1    230
Name: Simple_heuristic, dtype: int64

In [7]:
# Per-class precision / recall / F1 of the simple heuristic against the gold labels.
print(
    classification_report(
        y_true=data['isCognate'],
        y_pred=data['Simple_heuristic'],
    )
)

              precision    recall  f1-score   support

           0       0.68      0.76      0.71       420
           1       0.56      0.46      0.50       280

    accuracy                           0.64       700
   macro avg       0.62      0.61      0.61       700
weighted avg       0.63      0.64      0.63       700



# Сравнение корней с чередованием

In [8]:
# Reload the model from the same JSON file used for the simple heuristic.
model = load_cls('root_extractor/models/morphemes-3-5-3-memo.json')

# One prediction per dataset row, in row order, from the heuristic that
# compares roots with alternation (per the section heading).
heuristic_predicts = [
    get_heuristic_cognate(model, lemma_1, lemma_2)
    for lemma_1, lemma_2 in zip(data['Lemma_1'], data['Lemma_2'])
]



  curr_costs[state] = costs[-1][prev_state] - np.log(




In [9]:
# Attach the alternation-aware heuristic predictions and show the class balance.
data = data.assign(Heuristic=heuristic_predicts)
data['Heuristic'].value_counts()

False    382
True     318
Name: Heuristic, dtype: int64

In [10]:
# Cast the predictions to integer labels so they can be scored against isCognate.
data = data.astype({'Heuristic': int})
data['Heuristic'].value_counts()

0    382
1    318
Name: Heuristic, dtype: int64

In [11]:
# Per-class precision / recall / F1 of the alternation-aware heuristic.
print(
    classification_report(
        y_true=data['isCognate'],
        y_pred=data['Heuristic'],
    )
)

              precision    recall  f1-score   support

           0       0.76      0.69      0.72       420
           1       0.59      0.67      0.63       280

    accuracy                           0.68       700
   macro avg       0.68      0.68      0.68       700
weighted avg       0.69      0.68      0.69       700



# Косинусная близость эмбеддингов

In [12]:
# Word vectors used for the embedding-similarity baseline.
geowac_model_path = 'vectors/model.model'
geowac_model = gensim.models.KeyedVectors.load(geowac_model_path)

# Switch to the POS-tagged CSV (lemmas look like "вода_NOUN") and drop
# every column whose name starts with "Unnamed".
data = pd.read_csv('data/pos_dataset.csv').loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
data.head()

Unnamed: 0,Lemma_1,Lemma_2,Type,isCognate
0,вода_NOUN,сыр_NOUN,Неоднокоренные (разные корни),0
1,коса_NOUN,зябь_NOUN,Неоднокоренные (разные корни),0
2,любой_ADJ,водитель_NOUN,Неоднокоренные (разные корни),0
3,нож_NOUN,красить_VERB,Неоднокоренные (разные корни),0
4,красивый_ADJ,переносица_NOUN,Неоднокоренные (разные корни),0


In [13]:
# Threshold grid from 0.01 up to (but excluding) 1 in steps of 0.01, and the
# accumulator for (f1, threshold) pairs; both are used by the threshold sweep
# further down.
thresholds = np.arange(0.01, 1, 0.01).tolist()
f1_results = []

# Similarity reported by the embedding model for every pair, in row order.
geowac_cos_sim = [
    geowac_model.similarity(lemma_1, lemma_2)
    for lemma_1, lemma_2 in zip(data['Lemma_1'], data['Lemma_2'])
]

In [14]:
# Store the similarities next to the pairs they were computed for.
data = data.assign(geowac_cos_sim=geowac_cos_sim)
data

Unnamed: 0,Lemma_1,Lemma_2,Type,isCognate,geowac_cos_sim
0,вода_NOUN,сыр_NOUN,Неоднокоренные (разные корни),0,0.490754
1,коса_NOUN,зябь_NOUN,Неоднокоренные (разные корни),0,0.271747
2,любой_ADJ,водитель_NOUN,Неоднокоренные (разные корни),0,0.398547
3,нож_NOUN,красить_VERB,Неоднокоренные (разные корни),0,-0.046823
4,красивый_ADJ,переносица_NOUN,Неоднокоренные (разные корни),0,0.156545
...,...,...,...,...,...
695,восторг_NOUN,ликование_NOUN,Синонимы,0,0.369388
696,циничный_ADJ,безнравственный_ADJ,Синонимы,0,0.831184
697,состоятельность_NOUN,зажиточность_NOUN,Синонимы,0,0.673900
698,оригинальный_ADJ,самобытный_ADJ,Синонимы,0,0.840551


# Перебираем пороги и считаем F1 для каждого

In [15]:
# Sweep the decision threshold: a pair is predicted cognate (1) when its
# similarity is strictly greater than the threshold, otherwise 0. The F1
# against isCognate is stored as an (f1, threshold) tuple.
#
# The accumulator is reset here so the cell is idempotent: re-running it no
# longer appends a second copy of every result to f1_results.
#
# NOTE(review): the threshold is scored on the same rows that are later used
# for the final classification report, so the reported metrics are optimistic.
f1_results = []

for threshold in thresholds:
    geowac_predict = [1 if cos_sim > threshold else 0 for cos_sim in geowac_cos_sim]

    f1_res = f1_score(data['isCognate'], geowac_predict)
    f1_results.append((f1_res, threshold))

In [16]:
# Display every (f1, threshold) pair collected by the threshold sweep.
f1_results

[(0.5707964601769911, 0.01),
 (0.568868980963046, 0.02),
 (0.5685164212910533, 0.03),
 (0.5730593607305936, 0.04),
 (0.5734104046242774, 0.05),
 (0.5694282380396732, 0.060000000000000005),
 (0.5694117647058823, 0.06999999999999999),
 (0.5659928656361475, 0.08),
 (0.5583634175691937, 0.09),
 (0.5514705882352942, 0.09999999999999999),
 (0.5488257107540173, 0.11),
 (0.5475, 0.12),
 (0.5392405063291139, 0.13),
 (0.5335051546391752, 0.14),
 (0.5261780104712042, 0.15000000000000002),
 (0.5139813581890813, 0.16),
 (0.5101214574898786, 0.17),
 (0.5068119891008175, 0.18000000000000002),
 (0.5034387895460798, 0.19),
 (0.49720670391061444, 0.2),
 (0.49929478138222855, 0.21000000000000002),
 (0.4949928469241775, 0.22),
 (0.4883720930232558, 0.23),
 (0.48901903367496347, 0.24000000000000002),
 (0.4940476190476191, 0.25),
 (0.4939759036144578, 0.26),
 (0.4823348694316436, 0.27),
 (0.4829721362229102, 0.28),
 (0.4804992199687988, 0.29000000000000004),
 (0.4772370486656201, 0.3),
 (0.4709677419354839,

# Берём пару (F1, порог) с наибольшим F1

In [22]:
from operator import itemgetter

# Select the (f1, threshold) tuple with the highest F1. max() returns the
# first maximal element, so ties resolve to the earliest entry in f1_results.
best_pair = max(f1_results, key=itemgetter(0))
best_f1, best_th = best_pair
best_f1, best_th

(0.5734104046242774, 0.05)

In [27]:
# Binarise the similarities at the selected threshold (strictly greater -> 1).
best_geowac_predict = [1 if cos_sim > best_th else 0 for cos_sim in geowac_cos_sim]

# Sanity check: one prediction per pair.
print(len(best_geowac_predict))

700


In [28]:
# Per-class precision / recall / F1 of the thresholded embedding similarity.
print(
    classification_report(
        y_true=data['isCognate'],
        y_pred=best_geowac_predict,
    )
)

              precision    recall  f1-score   support

           0       0.72      0.20      0.31       420
           1       0.42      0.89      0.57       280

    accuracy                           0.47       700
   macro avg       0.57      0.54      0.44       700
weighted avg       0.60      0.47      0.42       700



# Сиамская сеть

In [29]:
from model import BaseSiamese, inference
from inference import pos_tag_input
from pymystem3 import Mystem
import torch
import gensim

In [30]:
# Mystem analyser instance (not used within this cell).
m = Mystem()
# Word vectors whose dimensionality defines the Siamese network's input size.
fasttext_model = gensim.models.KeyedVectors.load('vectors/model.model')

DEVICE = torch.device('cpu')
EMBEDDING_SIZE = fasttext_model.vector_size
MODEL_PATH = 'trained_models/siamese/cognates_siamese_ft_balanced.pth'
ROOTS_MODEL_PATH = 'trained_models/roots/morphemes-3-5-3-memo.json'

siamese_model = BaseSiamese(EMBEDDING_SIZE)
# map_location remaps the checkpoint's tensors onto DEVICE while loading.
# Without it, a checkpoint saved from a CUDA device fails to deserialize on a
# CPU-only machine even though the model is moved to the CPU right after.
siamese_model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
siamese_model.to(DEVICE)

root_extractor_model = load_cls(ROOTS_MODEL_PATH)

# Switch to evaluation mode; as the last expression this also displays the
# module's repr.
siamese_model.eval()

BaseSiamese(
  (fc): Linear(in_features=300, out_features=150, bias=True)
)

In [31]:
# Load the held-out, POS-tagged test split. As with the other CSV loads in
# this notebook, drop any column whose name starts with "Unnamed" so a stray
# exported index column cannot leak into the frame (a no-op when none exists).
data = pd.read_csv('data/preprocessed/test_pos_dataset.csv')
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data.head()

Unnamed: 0,Lemma_1,Lemma_2,isCognate
0,выкрасить_VERB,вычитать_VERB,0
1,краска_NOUN,подсчет_NOUN,0
2,красить_VERB,вычет_NOUN,0
3,вытирать_VERB,зачет_NOUN,0
4,растирать_VERB,зачитывать_VERB,0


In [39]:
# Siamese-network output for each pair, in row order: look up both lemma
# vectors in the embedding model and pass them to `inference`.
siamese_predicts = [
    inference(siamese_model, fasttext_model[lemma_1], fasttext_model[lemma_2])
    for lemma_1, lemma_2 in zip(data['Lemma_1'], data['Lemma_2'])
]

In [42]:
# Sweep the decision threshold over the Siamese outputs: a pair is predicted
# cognate (1) when its score is strictly greater than the threshold, otherwise
# 0. The F1 against isCognate is stored as an (f1, threshold) tuple.
#
# The former `f1_res = []` initialisation was removed: the name only ever holds
# the float returned by f1_score, so binding it to a list first was dead code.
f1_results = []
thresholds = np.arange(0.01, 1, 0.01).tolist()

for threshold in thresholds:
    siam_predict = [1 if siam_pr > threshold else 0 for siam_pr in siamese_predicts]

    f1_res = f1_score(data['isCognate'], siam_predict)
    f1_results.append((f1_res, threshold))

# Берём пару (F1, порог) с наибольшим F1

In [43]:
from operator import itemgetter

# Select the (f1, threshold) tuple with the highest F1. max() returns the
# first maximal element, so ties resolve to the earliest entry in f1_results.
best_pair = max(f1_results, key=itemgetter(0))
best_f1, best_th = best_pair
best_f1, best_th

(0.5165562913907285, 0.08)

In [45]:
# Binarise the Siamese outputs at the selected threshold (strictly greater -> 1).
best_siam_predict = [1 if sim_pred > best_th else 0 for sim_pred in siamese_predicts]

# Sanity check: one prediction per pair.
print(len(best_siam_predict))

140


In [46]:
# Per-class precision / recall / F1 of the thresholded Siamese predictions.
print(
    classification_report(
        y_true=data['isCognate'],
        y_pred=best_siam_predict,
    )
)

              precision    recall  f1-score   support

           0       0.62      0.33      0.43        84
           1       0.41      0.70      0.52        56

    accuracy                           0.48       140
   macro avg       0.52      0.51      0.48       140
weighted avg       0.54      0.48      0.47       140

