Loan word classification experiments

In [1]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd 
import numpy as np 
import io
import requests
import csv
import json
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import pprint
import matplotlib.pyplot as plt
import random

In [2]:
import torch
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split

In [3]:
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [4]:
# Language-pair metadata; each entry carries a 'target' and a 'source' language
# record whose 'name' fields are used below to build the dataset paths.
with open('../language-pairs.json', 'r') as f:
    pairs = json.load(f)
print(pairs)

# One dict per dataset variant and split, each mapping "<L1>-<L2>" to the
# DataFrame read from the corresponding CSV file.
train_alldata, test_alldata = {}, {}
train_realdist, test_realdist = {}, {}
train_balanced, test_balanced = {}, {}

# Directory/file-name tag of each variant -> (train dict, test dict) it fills.
_stores_by_variant = {
    'alldata': (train_alldata, test_alldata),
    'realdist': (train_realdist, test_realdist),
    'balanced': (train_balanced, test_balanced),
}

for pair in pairs:
    print(pair)
    L1 = pairs[pair]['target']['name']
    L2 = pairs[pair]['source']['name']

    # load datasets
    prefix = f'../Datasets/production_train_test/{L1}-{L2}'
    for variant, (train_store, test_store) in _stores_by_variant.items():
        stem = f'{prefix}/{variant}/{L1}-{L2}'
        # NOTE(review): later cells index these dicts with the keys of `pairs`,
        # which presumes every key equals "<target name>-<source name>".
        train_store[f'{L1}-{L2}'] = pd.read_csv(f'{stem}-train_production_{variant}.csv')
        test_store[f'{L1}-{L2}'] = pd.read_csv(f'{stem}-test_production_{variant}.csv')
    
    
# Columns of the loaded CSVs that are fed to the classifiers as inputs.
features = [
    'Fast Levenshtein Distance Div Maxlen',
    'Dolgo Prime Distance Div Maxlen',
    'Feature Edit Distance Div Maxlen',
    'Hamming Feature Distance Div Maxlen',
    'Weighted Feature Distance Div Maxlen',
    'Partial Hamming Feature Distance Div Maxlen',
    'plain Levenshtein',
    'DNN_logits',
    'MBERT_cos_sim',
    'XLM_cos_sim',
]

# Alternative feature subsets that were kept commented out in this cell
# (swap one in for `features` above to re-run with it):
#   1. the first seven columns only ('Fast Levenshtein Distance Div Maxlen'
#      through 'plain Levenshtein')
#   2. 'plain Levenshtein', 'DNN_logits', 'MBERT_cos_sim', 'XLM_cos_sim'
#   3. 'plain Levenshtein', 'MBERT_cos_sim', 'XLM_cos_sim'
#   4. 'DNN_logits', 'MBERT_cos_sim', 'XLM_cos_sim'
#   5. 'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein',
#      'DNN_logits', 'MBERT_cos_sim', 'XLM_cos_sim'

# Target column; kept as a one-element list, so frame[labels] is 2-D and is
# flattened with .ravel() wherever it is used.
labels = ['label_bin']

def _pool_pairs(frames_by_pair):
    """Concatenate the per-pair frames (in the order of `pairs`) into one frame.

    Returns a tuple (pooled frame, 2-D array of the `features` columns,
    1-D array of the `labels` column).
    """
    pooled = pd.concat([frames_by_pair[pair] for pair in pairs])
    return pooled, pooled[features].values, pooled[labels].values.ravel()


# NOTE(review): every *_means / *_stds array below is taken along axis=1, i.e.
# one value per ROW with shape (n_rows, 1), whereas lr_evaluate standardizes
# per feature column (axis=0). Confirm which is intended before using them.

# --- alldata ---
train_allpairs_alldata, x_train_allpairs_alldata, y_train_allpairs_alldata = _pool_pairs(train_alldata)
x_train_allpairs_alldata_means = x_train_allpairs_alldata.mean(axis=1, keepdims=True)
x_train_allpairs_alldata_stds = x_train_allpairs_alldata.std(axis=1, keepdims=True)

test_allpairs_alldata, x_test_allpairs_alldata, y_test_allpairs_alldata = _pool_pairs(test_alldata)

# --- realdist ---
train_allpairs_realdist, x_train_allpairs_realdist, y_train_allpairs_realdist = _pool_pairs(train_realdist)
x_train_allpairs_realdist_means = x_train_allpairs_realdist.mean(axis=1, keepdims=True)
x_train_allpairs_realdist_stds = x_train_allpairs_realdist.std(axis=1, keepdims=True)

test_allpairs_realdist, x_test_allpairs_realdist, y_test_allpairs_realdist = _pool_pairs(test_realdist)

# --- balanced ---
train_allpairs_balanced, x_train_allpairs_balanced, y_train_allpairs_balanced = _pool_pairs(train_balanced)
x_train_allpairs_balanced_means = x_train_allpairs_balanced.mean(axis=1, keepdims=True)
x_train_allpairs_balanced_stds = x_train_allpairs_balanced.std(axis=1, keepdims=True)

test_allpairs_balanced, x_test_allpairs_balanced, y_test_allpairs_balanced = _pool_pairs(test_balanced)

{'Hindi-Persian': {'target': {'name': 'Hindi', 'epi': 'hin-Deva', 'gtrans': 'hi', 'unicode': 'Devanagari'}, 'source': {'name': 'Persian', 'epi': 'fas-Arab', 'gtrans': 'fa', 'unicode': 'Arabic'}, 'wiki': 'https://en.m.wiktionary.org/wiki/Category:Hindi_terms_borrowed_from_Persian'}, 'English-French': {'target': {'name': 'English', 'epi': 'eng-Latn', 'gtrans': 'en', 'unicode': 'Latin'}, 'source': {'name': 'French', 'epi': 'fra-Latn', 'gtrans': 'fr', 'unicode': 'Latin'}, 'wiki': 'https://en.m.wiktionary.org/wiki/Category:English_terms_borrowed_from_French'}}
Hindi-Persian
English-French


# Logistic Regression classifier

In [5]:
def _lr_feature_stats(x):
    """Return the per-column (axis=0) mean and standard deviation of ``x``.

    Standard deviations equal to zero are replaced by 1 so that a constant
    feature is only centred instead of producing NaN/inf on division.
    """
    means = np.mean(x, axis=0)
    stds = np.std(x, axis=0)
    return means, np.where(stds == 0, 1.0, stds)


def _lr_fit(train_set):
    """Fit a logistic regression on the standardized ``features`` of ``train_set``.

    Returns ``(model, x_means, x_stds)``; the statistics come from the training
    data only and must be reused to transform any test data.
    """
    x_train = train_set[features].values
    y_train = train_set[labels].values.ravel()
    x_means, x_stds = _lr_feature_stats(x_train)

    # `multi_class='ovr'` is intentionally not passed: it is deprecated in
    # recent scikit-learn releases and makes no difference for a binary target.
    model = LogisticRegression(random_state=1, solver='lbfgs', max_iter=500)
    model.fit((x_train - x_means) / x_stds, y_train)

    # display regressor weights (`display` is supplied by the notebook kernel)
    display(pd.DataFrame(model.coef_, columns=features).style.set_caption('Weights'))
    return model, x_means, x_stds


def _lr_report(model, x_means, x_stds, test_set):
    """Print the metrics of ``model`` on ``test_set`` and show its misclassified rows."""
    x_test = (test_set[features].values - x_means) / x_stds
    y_test = test_set[labels].values.ravel()
    y_pred = model.predict(x_test)

    print("f1-score : ", f1_score(y_test, y_pred))
    print("precision : ", precision_score(y_test, y_pred))
    print("recall : ", recall_score(y_test, y_pred))
    print("accuracy : ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print()

    # Positional row numbers of the two error types (labels are 0/1).
    fp = np.flatnonzero((y_pred == 1) & (y_test == 0))
    fn = np.flatnonzero((y_pred == 0) & (y_test == 1))

    display(test_set.iloc[fp, :].style.set_caption('False positives'))
    display(test_set.iloc[fn, :].style.set_caption('False negatives'))


def lr_evaluate(full_train_set, full_test_set, pairs_list):
    """Train and evaluate logistic-regression classifiers on the given splits.

    Three experiments are run, each printing metrics and displaying the learned
    weights plus the false-positive / false-negative rows:

    1. train on the pooled data of all pairs, evaluate on the pooled test data;
    2. evaluate that pooled model on each pair's test data separately;
    3. train and evaluate a separate model for every pair.

    Parameters
    ----------
    full_train_set, full_test_set : mapping
        Map each pair key to its training / test DataFrame. The frames must
        contain the module-level ``features`` and ``labels`` columns.
    pairs_list : iterable
        Pair keys to use (iterating a dict yields its keys). Unlike before,
        this argument, not the global ``pairs``, also selects the pooled data.

    Raises
    ------
    ValueError
        If ``pairs_list`` is empty.
    """
    pairs_list = list(pairs_list)
    if not pairs_list:
        raise ValueError("pairs_list must contain at least one language pair")

    # 1) pooled model over all requested pairs
    print("Training on all langs\n")
    pooled_train = pd.concat([full_train_set[pair] for pair in pairs_list])
    model, x_means, x_stds = _lr_fit(pooled_train)
    print()

    print("Evaluating on all langs")
    # The concatenated index repeats per pair; renumber it so the displayed
    # error tables carry unique row labels.
    pooled_test = pd.concat([full_test_set[pair] for pair in pairs_list]).reset_index(drop=True)
    _lr_report(model, x_means, x_stds, pooled_test)

    # 2) pooled model, evaluated pair by pair
    for pair in pairs_list:
        print(f'Evaluating on {pair}')
        _lr_report(model, x_means, x_stds, full_test_set[pair])

    # 3) one model per pair
    for pair in pairs_list:
        print(f'Training and evaluating on {pair}')
        pair_model, pair_means, pair_stds = _lr_fit(full_train_set[pair])
        _lr_report(pair_model, pair_means, pair_stds, full_test_set[pair])

In [6]:
# Run the logistic-regression experiments once per dataset variant.
_lr_runs = (
    ("alldata", train_alldata, test_alldata),
    ("realdist", train_realdist, test_realdist),
    ("balanced", train_balanced, test_balanced),
)

for _variant, _train_sets, _test_sets in _lr_runs:
    print(f"Evaluating on {_variant} splits\n")
    lr_evaluate(_train_sets, _test_sets, pairs)

Evaluating on alldata splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-0.436158,-0.15206,0.330502,0.234419,0.202898,-0.394573,-0.535563,5.280771,0.648894,0.220875



Evaluating on all langs
f1-score :  0.7360970677451971
precision :  0.7711864406779662
recall :  0.7040618955512572
accuracy :  0.9122689075630253
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2458
           1       0.77      0.70      0.74       517

    accuracy                           0.91      2975
   macro avg       0.86      0.83      0.84      2975
weighted avg       0.91      0.91      0.91      2975

[[2350  108]
 [ 153  364]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
61,61,2431,फ़ायदा,فایده,faːjdaː,fɒjdh,advantage,Faydeh,0.571429,0.2,0.075,0.083333,1.425,0.075,6,hard_negative,0,1.957343,0.607233,0.632149,
75,75,705,ख़त,خفت,xət,xft,letter,faint,0.333333,0.333333,0.104167,0.125,2.583333,0.104167,3,hard_negative,0,0.627898,0.568654,0.723995,
215,215,3379,रिश्वत,رشوت,riʃvət,rʃvt,bribe,Bribery,0.333333,0.333333,0.298611,0.333333,2.416667,0.333333,6,hard_negative,0,0.926623,0.686041,0.544634,
282,282,5678,मौजूदा,موجود,mɔːd͡ʒuːdaː,mvd͡ʒvd,existing,Available,0.545455,0.5,0.267361,0.305556,4.041667,0.284722,6,synonym,0,2.950336,0.512315,0.531326,
348,348,1035,गुज़ारा,گذار,ɡuzaːraː,ɡzɒr,survive,Transition,0.625,0.333333,0.319444,0.354167,2.666667,0.354167,7,hard_negative,0,0.720412,0.54236,0.724166,
436,436,542,क़ानून,کانون,qaːnuːn,kɒnvn,law,Canon,0.714286,0.2,0.104167,0.116667,2.1,0.104167,6,hard_negative,0,6.771222,0.565388,0.775422,
444,444,1257,ज़कात,ذکاوت,zəkaːt,zkɒvt,Zakat,ذکاوت,0.5,0.4,0.195833,0.225,3.2,0.195833,5,hard_negative,0,0.976864,0.462078,0.67249,
615,615,3448,रोज़ाना,رضوان,rozaːnaː,rzvɒn,everyday,Radwan,0.625,0.5,0.243056,0.277778,3.0,0.260417,7,hard_negative,0,0.657934,0.581792,0.712921,
848,848,1432,बर्बर,بربر,bərbər,brbr,barbarian,barbarian,0.333333,0.333333,0.298611,0.333333,2.416667,0.333333,5,synonym,0,4.265605,0.62418,0.600969,
959,959,863,ख़ुशामद,خوش‌آمد,xuʃaːməd,xvʃ‌ɒmd,happy,Welcome,0.5,0.285714,0.190476,0.214286,2.392857,0.205357,7,hard_negative,0,0.766222,0.496819,0.719226,


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
64,64,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237,
80,80,834,बाक़ायदा,باقاعده,baːqaːjdaː,bɒɣɒʔdh,quite often,Regularly,0.8,0.142857,0.113095,0.119048,1.767857,0.113095,8,loan,1,-0.342959,0.408781,0.50733,
88,88,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651,
98,98,456,जान,جان,d͡ʒaːn,d͡ʒɒn,Life,John,0.333333,0.0,0.041667,0.041667,0.5,0.041667,3,loan,1,-1.289959,0.44532,0.774739,
148,148,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052,
159,159,1158,शागिर्द,شاگرد,ʃaːɡirdə,ʃɒɡrd,disciple,Student,0.5,0.285714,0.27381,0.303571,2.285714,0.303571,7,loan,1,0.438255,0.49706,0.583752,
194,194,82,आसान,آسان,aːsaːn,ɒsɒn,easy,Easy,0.666667,0.0,0.0625,0.0625,0.75,0.0625,4,loan,1,-1.954477,0.620488,0.595963,
203,203,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639,
256,256,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758,
273,273,533,ताक़त,طاقت,taːqət,tɒɣt,vigor,طاقت,0.666667,0.2,0.229167,0.25,2.1,0.25,5,loan,1,0.991477,0.493937,0.387655,


Evaluating on Hindi-Persian
f1-score :  0.7520661157024793
precision :  0.8504672897196262
recall :  0.674074074074074
accuracy :  0.9545454545454546
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1185
           1       0.85      0.67      0.75       135

    accuracy                           0.95      1320
   macro avg       0.91      0.83      0.86      1320
weighted avg       0.95      0.95      0.95      1320

[[1169   16]
 [  44   91]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
61,61,2431,फ़ायदा,فایده,faːjdaː,fɒjdh,advantage,Faydeh,0.571429,0.2,0.075,0.083333,1.425,0.075,6,hard_negative,0,1.957343,0.607233,0.632149
75,75,705,ख़त,خفت,xət,xft,letter,faint,0.333333,0.333333,0.104167,0.125,2.583333,0.104167,3,hard_negative,0,0.627898,0.568654,0.723995
215,215,3379,रिश्वत,رشوت,riʃvət,rʃvt,bribe,Bribery,0.333333,0.333333,0.298611,0.333333,2.416667,0.333333,6,hard_negative,0,0.926623,0.686041,0.544634
282,282,5678,मौजूदा,موجود,mɔːd͡ʒuːdaː,mvd͡ʒvd,existing,Available,0.545455,0.5,0.267361,0.305556,4.041667,0.284722,6,synonym,0,2.950336,0.512315,0.531326
348,348,1035,गुज़ारा,گذار,ɡuzaːraː,ɡzɒr,survive,Transition,0.625,0.333333,0.319444,0.354167,2.666667,0.354167,7,hard_negative,0,0.720412,0.54236,0.724166
436,436,542,क़ानून,کانون,qaːnuːn,kɒnvn,law,Canon,0.714286,0.2,0.104167,0.116667,2.1,0.104167,6,hard_negative,0,6.771222,0.565388,0.775422
444,444,1257,ज़कात,ذکاوت,zəkaːt,zkɒvt,Zakat,ذکاوت,0.5,0.4,0.195833,0.225,3.2,0.195833,5,hard_negative,0,0.976864,0.462078,0.67249
615,615,3448,रोज़ाना,رضوان,rozaːnaː,rzvɒn,everyday,Radwan,0.625,0.5,0.243056,0.277778,3.0,0.260417,7,hard_negative,0,0.657934,0.581792,0.712921
848,848,1432,बर्बर,بربر,bərbər,brbr,barbarian,barbarian,0.333333,0.333333,0.298611,0.333333,2.416667,0.333333,5,synonym,0,4.265605,0.62418,0.600969
959,959,863,ख़ुशामद,خوش‌آمد,xuʃaːməd,xvʃ‌ɒmd,happy,Welcome,0.5,0.285714,0.190476,0.214286,2.392857,0.205357,7,hard_negative,0,0.766222,0.496819,0.719226


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
64,64,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237
80,80,834,बाक़ायदा,باقاعده,baːqaːjdaː,bɒɣɒʔdh,quite often,Regularly,0.8,0.142857,0.113095,0.119048,1.767857,0.113095,8,loan,1,-0.342959,0.408781,0.50733
88,88,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651
98,98,456,जान,جان,d͡ʒaːn,d͡ʒɒn,Life,John,0.333333,0.0,0.041667,0.041667,0.5,0.041667,3,loan,1,-1.289959,0.44532,0.774739
148,148,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052
159,159,1158,शागिर्द,شاگرد,ʃaːɡirdə,ʃɒɡrd,disciple,Student,0.5,0.285714,0.27381,0.303571,2.285714,0.303571,7,loan,1,0.438255,0.49706,0.583752
194,194,82,आसान,آسان,aːsaːn,ɒsɒn,easy,Easy,0.666667,0.0,0.0625,0.0625,0.75,0.0625,4,loan,1,-1.954477,0.620488,0.595963
203,203,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639
256,256,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
273,273,533,ताक़त,طاقت,taːqət,tɒɣt,vigor,طاقت,0.666667,0.2,0.229167,0.25,2.1,0.25,5,loan,1,0.991477,0.493937,0.387655


Evaluating on English-French
f1-score :  0.7309236947791165
precision :  0.7479452054794521
recall :  0.7146596858638743
accuracy :  0.8785498489425981
              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1273
           1       0.75      0.71      0.73       382

    accuracy                           0.88      1655
   macro avg       0.83      0.82      0.83      1655
weighted avg       0.88      0.88      0.88      1655

[[1181   92]
 [ 109  273]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
1,1,829,residue,résidu,ˈrɛzəˌdu,ʀezidy,residue,residue,0.75,0.0,0.0625,0.076389,0.645833,0.0625,2,synonym,0,,0.84655,0.64152,0.420214
11,11,3767,brusque,brisque,brəsk,bʀisk,brusque,brisque,0.4,0.0,0.058333,0.075,0.625,0.058333,1,hard_negative,0,,0.761474,0.872441,0.838017
34,34,1854,hectare,instant,ˈhɛkˌtɑr,ɛ̃stɑ̃,hectare,instant,0.625,0.5,0.340278,0.388889,3.104167,0.385417,5,hard_negative,0,,4.17516,0.476067,0.536381
66,66,304,perpetual,perpétuel,pərˈpɛʧuəl,pɛʀpetyl,perpetual,perpetual,0.7,0.125,0.117188,0.135417,1.890625,0.117188,2,synonym,0,,0.410291,0.582457,0.618388
82,82,705,cryptic,cryptique,ˈkrɪptɪk,kʀiptik,cryptic,cryptique,0.5,0.0,0.035714,0.047619,0.339286,0.035714,3,synonym,0,,1.01671,0.868188,0.900735
107,107,276,babul,babouche,babul*,babuʃ,Babylon,babouche,0.333333,0.2,0.041667,0.041667,0.7,0.041667,4,hard_negative,0,,-1.444635,0.773397,0.7391
111,111,447,privilege,privilège,ˈprɪvɪlɪʤ,pʀivilə̀ʒ,privilege,privilege,0.777778,0.125,0.15625,0.177083,1.328125,0.166667,1,synonym,0,,-0.177626,0.785339,0.925608
136,136,9933,chansonnier,garçonnière,chansonnier*,ɡaʀsɔnjə̀ʀ,singer,bachelor pad,0.75,0.454545,0.229167,0.261364,2.170455,0.246212,6,hard_negative,0,,-0.054721,0.789534,0.802875
154,154,9967,chloroform,chlorophylle,ˈklɔrəˌfɔrm,ʃlɔʀɔfij,chloroform,chlorophylle,0.727273,0.222222,0.18287,0.212963,2.013889,0.199074,6,hard_negative,0,,4.23461,0.942351,0.814433
169,169,5501,paludism,paroxysme,paludism*,paʀɔɡzism,paludism,climax,0.555556,0.111111,0.141204,0.157407,1.319444,0.152778,5,hard_negative,0,,0.267962,0.618922,0.809365


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
2,2,1984,galvanize,galvaniser,ˈgælvəˌnaɪz,ɡalvanizəʀ,galvanize,galvanize,0.818182,0.3,0.23125,0.254167,2.1625,0.25,2,loan,1,,-2.92541,0.822644,0.890509
15,15,3033,ponce,alphonse,ˈpɑnseɪ,alfɔ̃s,ponce,alphonse,1.0,0.666667,0.28125,0.319444,3.458333,0.298611,4,loan,1,,-10.249369,0.343229,0.756543
31,31,986,cinema,cinéma,ˈsɪnəmə,sinema,cinema,movie theater,0.571429,0.0,0.041667,0.041667,0.375,0.041667,1,loan,1,,-7.539831,0.86588,0.836437
38,38,1330,decalcomanie,décalcomanie,decalcomanie*,dekalkɔmani,decals,decal,0.384615,0.25,0.085069,0.09375,0.708333,0.09375,1,loan,1,,-3.734234,0.848105,0.900687
40,40,1041,coincident,coïncident,koʊˈɪnsədənt,koinsidɑ̃,coincident,coincide,0.583333,0.272727,0.276515,0.30303,2.25,0.301136,1,loan,1,,-6.739983,0.588436,0.705931
62,62,559,bouillabaisse,bouillabaisse,bouillabaisse*,bujlabɛs,bouillabaisse,bouillabaisse,0.571429,0.384615,0.360577,0.400641,3.028846,0.397436,0,loan,1,,-4.260263,1.0,1.0
63,63,16,abaissé,abaissé,abaissé*é,abɛse,lowered,lowered,0.666667,0.333333,0.348958,0.385417,2.8125,0.385417,0,loan,1,,-4.694026,1.0,1.0
84,84,3558,telecommunication,télécommunication,ˌtɛləkəmˌjunɪˈkeɪʃən,telekɔmynikasjɔ̃,telecommunication,telecommunication,0.7,0.235294,0.155637,0.17402,1.669118,0.167892,2,loan,1,,-11.287354,0.831114,0.633883
103,103,2219,impersonal,impersonnel,ˌɪmˈpərsənəl,ɛ̃pɛʀsɔnəl,impersonal,impersonal,0.583333,0.1,0.122917,0.141667,1.1125,0.133333,2,loan,1,,-5.260839,0.939827,0.869831
156,156,1882,flageolet,flageolet,ˌflæʤəˈlɛt,flaʒəɔlɛ,flageolet,flageolet,0.5,0.375,0.210938,0.239583,2.71875,0.223958,0,loan,1,,-7.85252,1.0,1.0


Training and evaluating on Hindi-Persian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-0.796071,0.077745,0.258065,0.125116,-0.546942,-0.179132,0.548149,6.892038,0.099948,-0.055798


f1-score :  0.7407407407407407
precision :  0.7407407407407407
recall :  0.7407407407407407
accuracy :  0.946969696969697
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1185
           1       0.74      0.74      0.74       135

    accuracy                           0.95      1320
   macro avg       0.86      0.86      0.86      1320
weighted avg       0.95      0.95      0.95      1320

[[1150   35]
 [  35  100]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
24,24,1725,तुग़लक़,تقلا,tuɣləq,tɣlɒ,Tughlaq,Scramble,0.5,0.333333,0.319444,0.354167,2.625,0.354167,7,hard_negative,0,-0.491203,0.518141,0.455296
61,61,2431,फ़ायदा,فایده,faːjdaː,fɒjdh,advantage,Faydeh,0.571429,0.2,0.075,0.083333,1.425,0.075,6,hard_negative,0,1.957343,0.607233,0.632149
75,75,705,ख़त,خفت,xət,xft,letter,faint,0.333333,0.333333,0.104167,0.125,2.583333,0.104167,3,hard_negative,0,0.627898,0.568654,0.723995
99,99,3846,सनसनीख़ेज़,دندان‌گرد,sənsəniːxez,dndɒn‌ɡrd,sensational,Toothpaste,0.818182,0.3,0.247917,0.283333,2.65,0.26875,10,hard_negative,0,2.48204,0.34163,0.616893
102,102,292,इत्तिफ़ाक़,ازدواج,ittifaːq,ɒzdvɒd͡ʒ,coincidence,Marriage,1.0,0.142857,0.220238,0.238095,2.053571,0.235119,10,hard_negative,0,3.319556,0.267431,0.66736
213,213,2630,बाकिरा,بی‌کاره,baːkiraː,bj‌kɒrh,bakira,Unemployed,0.625,0.333333,0.104167,0.118056,1.916667,0.104167,7,hard_negative,0,-0.11514,0.656177,0.798148
215,215,3379,रिश्वत,رشوت,riʃvət,rʃvt,bribe,Bribery,0.333333,0.333333,0.298611,0.333333,2.416667,0.333333,6,hard_negative,0,0.926623,0.686041,0.544634
241,241,174,आज़ादी,آزاده,aːzaːdiː,ɒzɒdh,freedom,Azadeh,0.75,0.2,0.1,0.108333,1.725,0.1,6,hard_negative,0,1.220047,0.458264,0.446374
278,278,2467,फ़ैसला,حاصله,fæːslaː,hɒslh,decision,The result,0.714286,0.4,0.104167,0.116667,2.05,0.104167,6,hard_negative,0,0.886158,0.297383,0.636984
282,282,5678,मौजूदा,موجود,mɔːd͡ʒuːdaː,mvd͡ʒvd,existing,Available,0.545455,0.5,0.267361,0.305556,4.041667,0.284722,6,synonym,0,2.950336,0.512315,0.531326


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
64,64,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237
80,80,834,बाक़ायदा,باقاعده,baːqaːjdaː,bɒɣɒʔdh,quite often,Regularly,0.8,0.142857,0.113095,0.119048,1.767857,0.113095,8,loan,1,-0.342959,0.408781,0.50733
88,88,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651
148,148,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052
194,194,82,आसान,آسان,aːsaːn,ɒsɒn,easy,Easy,0.666667,0.0,0.0625,0.0625,0.75,0.0625,4,loan,1,-1.954477,0.620488,0.595963
203,203,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639
256,256,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
358,358,34,अरमान,ارمان,armaːn,ɒrmɒn,desires,Goals,0.5,0.0,0.041667,0.041667,0.5,0.041667,5,loan,1,-1.328181,0.609449,0.619888
393,393,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013
394,394,371,चीख़,جیغ,t͡ʃiːx,d͡ʒjɣ,squeak,جیغ,0.833333,0.333333,0.069444,0.083333,1.125,0.069444,4,loan,1,-0.422133,0.450465,0.788355


Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,0.004958,0.032679,0.161249,1.131312,-0.153215,-0.86419,-1.608508,3.582837,0.669039,0.356977


f1-score :  0.7721691678035472
precision :  0.8062678062678063
recall :  0.7408376963350786
accuracy :  0.8990936555891239
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      1273
           1       0.81      0.74      0.77       382

    accuracy                           0.90      1655
   macro avg       0.87      0.84      0.85      1655
weighted avg       0.90      0.90      0.90      1655

[[1205   68]
 [  99  283]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
11,11,3767,brusque,brisque,brəsk,bʀisk,brusque,brisque,0.4,0.0,0.058333,0.075,0.625,0.058333,1,hard_negative,0,,0.761474,0.872441,0.838017
34,34,1854,hectare,instant,ˈhɛkˌtɑr,ɛ̃stɑ̃,hectare,instant,0.625,0.5,0.340278,0.388889,3.104167,0.385417,5,hard_negative,0,,4.17516,0.476067,0.536381
82,82,705,cryptic,cryptique,ˈkrɪptɪk,kʀiptik,cryptic,cryptique,0.5,0.0,0.035714,0.047619,0.339286,0.035714,3,synonym,0,,1.01671,0.868188,0.900735
111,111,447,privilege,privilège,ˈprɪvɪlɪʤ,pʀivilə̀ʒ,privilege,privilege,0.777778,0.125,0.15625,0.177083,1.328125,0.166667,1,synonym,0,,-0.177626,0.785339,0.925608
154,154,9967,chloroform,chlorophylle,ˈklɔrəˌfɔrm,ʃlɔʀɔfij,chloroform,chlorophylle,0.727273,0.222222,0.18287,0.212963,2.013889,0.199074,6,hard_negative,0,,4.23461,0.942351,0.814433
168,168,689,ambiguous,ambiguë,æmˈbɪgjuəs,ɑ̃biɡ̈,ambiguous,ambiguous,0.9,0.625,0.489583,0.541667,4.234375,0.539062,3,synonym,0,,-2.187952,0.875147,0.812622
228,228,327,barbet,barbette,barbet*,baʀbɛt,barbet,barbette,0.428571,0.0,0.034722,0.048611,0.354167,0.034722,2,hard_negative,0,,2.166789,0.769538,0.594019
248,248,105,aligoté,aligot,aligoté*é,aliɡo,aligoté,aligot,0.555556,0.4,0.300595,0.333333,2.553571,0.330357,1,hard_negative,0,,4.398977,0.786604,0.726988
253,253,419,pompous,pompeux,ˈpɑmpəs,pɔ̃pœ,pompous,pompous,0.714286,0.333333,0.347222,0.381944,2.916667,0.378472,2,synonym,0,,4.45172,0.852376,0.798138
277,277,825,recover,récupérer,rɪˈkəvər,ʀekypeʀəʀ,recover,retrieve,0.888889,0.222222,0.277778,0.324074,2.513889,0.300926,5,synonym,0,,2.14363,0.519501,0.715382


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
2,2,1984,galvanize,galvaniser,ˈgælvəˌnaɪz,ɡalvanizəʀ,galvanize,galvanize,0.818182,0.3,0.23125,0.254167,2.1625,0.25,2,loan,1,,-2.92541,0.822644,0.890509
12,12,1610,émeute,plural,éémeute*,plyʀal,riot,plural,1.0,0.666667,0.324405,0.369048,5.160714,0.339286,6,loan,1,,3.502692,0.281304,0.584873
15,15,3033,ponce,alphonse,ˈpɑnseɪ,alfɔ̃s,ponce,alphonse,1.0,0.666667,0.28125,0.319444,3.458333,0.298611,4,loan,1,,-10.249369,0.343229,0.756543
31,31,986,cinema,cinéma,ˈsɪnəmə,sinema,cinema,movie theater,0.571429,0.0,0.041667,0.041667,0.375,0.041667,1,loan,1,,-7.539831,0.86588,0.836437
38,38,1330,decalcomanie,décalcomanie,decalcomanie*,dekalkɔmani,decals,decal,0.384615,0.25,0.085069,0.09375,0.708333,0.09375,1,loan,1,,-3.734234,0.848105,0.900687
40,40,1041,coincident,coïncident,koʊˈɪnsədənt,koinsidɑ̃,coincident,coincide,0.583333,0.272727,0.276515,0.30303,2.25,0.301136,1,loan,1,,-6.739983,0.588436,0.705931
84,84,3558,telecommunication,télécommunication,ˌtɛləkəmˌjunɪˈkeɪʃən,telekɔmynikasjɔ̃,telecommunication,telecommunication,0.7,0.235294,0.155637,0.17402,1.669118,0.167892,2,loan,1,,-11.287354,0.831114,0.633883
103,103,2219,impersonal,impersonnel,ˌɪmˈpərsənəl,ɛ̃pɛʀsɔnəl,impersonal,impersonal,0.583333,0.1,0.122917,0.141667,1.1125,0.133333,2,loan,1,,-5.260839,0.939827,0.869831
156,156,1882,flageolet,flageolet,ˌflæʤəˈlɛt,flaʒəɔlɛ,flageolet,flageolet,0.5,0.375,0.210938,0.239583,2.71875,0.223958,0,loan,1,,-7.85252,1.0,1.0
201,201,2217,impermeability,imperméabilité,impermeability*,ɛ̃pɛʀmeabilite,impermeability,impermeability,0.4,0.071429,0.096726,0.110119,0.866071,0.104167,2,loan,1,,-14.115271,0.802988,0.477161


Evaluating on realdist splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-0.964314,0.068899,-0.085584,0.369617,-0.130293,-0.261355,-0.20481,4.930519,0.659521,0.229935



Evaluating on all langs
f1-score :  0.8891235480464624
precision :  0.9790697674418605
recall :  0.8143133462282398
accuracy :  0.8905109489051095
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       442
           1       0.98      0.81      0.89       517

    accuracy                           0.89       959
   macro avg       0.90      0.90      0.89       959
weighted avg       0.91      0.89      0.89       959

[[433   9]
 [ 96 421]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
106,106,3238,मौलवी,مخلوع,mɔːlviː,mxlvʔ,cleric,Makhloo,0.571429,0.4,0.125,0.141667,2.6,0.125,5,hard_negative,0,-2.220913,0.609177,0.713042,
220,220,4282,शैतान,شیطان,ʃæːtaːn,ʃjtɒn,Satan,Satan,0.571429,0.2,0.066667,0.075,1.075,0.066667,5,synonym,0,0.996812,0.568882,0.774358,
357,357,2537,बदनाम,بدنام,bədnaːm,bdnɒm,Infamous,Infamous,0.428571,0.166667,0.170139,0.1875,1.458333,0.1875,5,hard_negative,0,3.461268,0.412911,0.808907,
694,244,11069,valet-de-place,valet de chambre,valet-de-place*,valət də ʃɑ̃bʀ,valet-de-place,valet,0.666667,0.333333,0.144097,0.159722,1.625,0.151042,7,hard_negative,0,-2.079422,0.806185,0.724407,
727,277,560,cadre,cadis,ˈkædri,kadi,frame,cadiz,0.5,0.2,0.166667,0.2,1.45,0.2,2,hard_negative,0,1.179581,0.517538,0.470978,
778,328,292,baguette,banquette,ˌbæˈgɛt,bɑ̃kɛt,baguette,bench,0.571429,0.2,0.2,0.225,1.7,0.220833,2,hard_negative,0,-2.767203,0.576753,0.865124,
799,349,2538,plafond,plateau,plafond*,plato,ceiling,plateau,0.5,0.428571,0.285714,0.315476,2.392857,0.309524,4,hard_negative,0,-3.621152,0.634463,0.781309,
813,363,1381,en plein air,en plein,ɛn plin ɛr,ən plɛ̃,outside,and square,0.5,0.375,0.341146,0.385417,2.84375,0.385417,4,hard_negative,0,-2.798578,0.841374,0.860655,
937,487,10058,corsair,corsage,ˈkɔrsɛr,kɔʀsaʒ,corsair,corsage,0.571429,0.166667,0.076389,0.104167,1.0625,0.076389,2,hard_negative,0,3.4544,0.850421,0.820582,


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
11,11,82,आसान,آسان,aːsaːn,ɒsɒn,easy,Easy,0.666667,0.0,0.0625,0.0625,0.75,0.0625,4,loan,1,-1.954477,0.620488,0.595963,
34,34,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639,
70,70,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651,
94,94,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052,
144,144,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977,
171,171,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013,
192,192,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621,
216,216,980,मुक़र्रर,مقرر,muqrrər,mɣrr,Appointment,Prescribed,0.571429,0.428571,0.392857,0.446429,3.357143,0.446429,8,loan,1,-1.197678,0.374187,0.669189,
217,217,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758,
244,244,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885,


Evaluating on Hindi-Persian
f1-score :  0.9047619047619048
precision :  0.9743589743589743
recall :  0.8444444444444444
accuracy :  0.9466666666666667
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       315
           1       0.97      0.84      0.90       135

    accuracy                           0.95       450
   macro avg       0.96      0.92      0.93       450
weighted avg       0.95      0.95      0.95       450

[[312   3]
 [ 21 114]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
106,106,3238,मौलवी,مخلوع,mɔːlviː,mxlvʔ,cleric,Makhloo,0.571429,0.4,0.125,0.141667,2.6,0.125,5,hard_negative,0,-2.220913,0.609177,0.713042
220,220,4282,शैतान,شیطان,ʃæːtaːn,ʃjtɒn,Satan,Satan,0.571429,0.2,0.066667,0.075,1.075,0.066667,5,synonym,0,0.996812,0.568882,0.774358
357,357,2537,बदनाम,بدنام,bədnaːm,bdnɒm,Infamous,Infamous,0.428571,0.166667,0.170139,0.1875,1.458333,0.1875,5,hard_negative,0,3.461268,0.412911,0.808907


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
11,11,82,आसान,آسان,aːsaːn,ɒsɒn,easy,Easy,0.666667,0.0,0.0625,0.0625,0.75,0.0625,4,loan,1,-1.954477,0.620488,0.595963
34,34,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639
70,70,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651
94,94,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052
144,144,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977
171,171,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013
192,192,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621
216,216,980,मुक़र्रर,مقرر,muqrrər,mɣrr,Appointment,Prescribed,0.571429,0.428571,0.392857,0.446429,3.357143,0.446429,8,loan,1,-1.197678,0.374187,0.669189
217,217,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
244,244,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885


Evaluating on English-French
f1-score :  0.883453237410072
precision :  0.9808306709265175
recall :  0.8036649214659686
accuracy :  0.8408644400785854
              precision    recall  f1-score   support

           0       0.62      0.95      0.75       127
           1       0.98      0.80      0.88       382

    accuracy                           0.84       509
   macro avg       0.80      0.88      0.82       509
weighted avg       0.89      0.84      0.85       509

[[121   6]
 [ 75 307]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
244,244,11069,valet-de-place,valet de chambre,valet-de-place*,valət də ʃɑ̃bʀ,valet-de-place,valet,0.666667,0.333333,0.144097,0.159722,1.625,0.151042,7,hard_negative,0,,-2.079422,0.806185,0.724407
277,277,560,cadre,cadis,ˈkædri,kadi,frame,cadiz,0.5,0.2,0.166667,0.2,1.45,0.2,2,hard_negative,0,,1.179581,0.517538,0.470978
328,328,292,baguette,banquette,ˌbæˈgɛt,bɑ̃kɛt,baguette,bench,0.571429,0.2,0.2,0.225,1.7,0.220833,2,hard_negative,0,,-2.767203,0.576753,0.865124
349,349,2538,plafond,plateau,plafond*,plato,ceiling,plateau,0.5,0.428571,0.285714,0.315476,2.392857,0.309524,4,hard_negative,0,,-3.621152,0.634463,0.781309
363,363,1381,en plein air,en plein,ɛn plin ɛr,ən plɛ̃,outside,and square,0.5,0.375,0.341146,0.385417,2.84375,0.385417,4,hard_negative,0,,-2.798578,0.841374,0.860655
487,487,10058,corsair,corsage,ˈkɔrsɛr,kɔʀsaʒ,corsair,corsage,0.571429,0.166667,0.076389,0.104167,1.0625,0.076389,2,hard_negative,0,,3.4544,0.850421,0.820582


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
3,3,16,abaissé,abaissé,abaissé*é,abɛse,lowered,lowered,0.666667,0.333333,0.348958,0.385417,2.8125,0.385417,0,loan,1,,-4.694026,1.0,1.0
5,5,2314,kilogram,kilogramme,ˈkɪləˌgræm,kilɔɡʀɑm,kilogram,kilogram,0.7,0.125,0.151042,0.177083,1.328125,0.164062,2,loan,1,,-4.398973,0.922724,0.739337
10,10,885,chapé,échapper,chapé*é,eʃapəʀ,escaped,escape,0.714286,0.666667,0.111111,0.131944,2.770833,0.111111,4,loan,1,,-12.475492,0.28053,0.724244
17,17,1760,état major,état,éétat* ˈmeɪʤər,eta,Staff,state,0.857143,0.625,0.622917,0.7,5.075,0.7,6,loan,1,,-5.155156,0.582395,0.92188
22,22,917,chaussé,chaussé,chaussé*é,ʃose,shod,shod,0.888889,0.666667,0.473958,0.526042,4.015625,0.523438,0,loan,1,,-3.274237,1.0,1.0
29,29,140,ambassadry,embassadrie,ambassadry*,ɑ̃basadʀi,embassy,embassadrie,0.545455,0.2,0.216667,0.245833,1.8625,0.235417,3,loan,1,,-6.937426,0.517142,0.79236
37,37,3154,quintilliard,quintilliard,quintilliard*,kɛ̃tijjaʀ,quintilliard,quintilliard,0.769231,0.333333,0.357639,0.399306,3.260417,0.387153,0,loan,1,,-10.514448,1.0,1.0
39,39,3145,quatre quart,quatre-quarts,quatre* kwɔrt,katʀə-kaʀ,four quarters,four-quarters,0.769231,0.272727,0.30303,0.344697,2.590909,0.329545,2,loan,1,,-7.247688,0.775167,0.537955
41,41,2035,geste antagoniste,geste antagoniste,geste* antagoniste*,ʒɛstə ɑ̃taɡɔnist,antagonistic gesture,antagonistic gesture,0.526316,0.285714,0.206845,0.229167,2.25,0.209821,0,loan,1,,-11.636612,1.0,1.0
49,49,3581,thermometer,thermomètre,θərˈmɑmətər,tɛʀmɔmə̀tʀ,thermometer,thermometer,0.727273,0.1,0.141667,0.175,1.425,0.158333,3,loan,1,,-3.57576,0.902252,0.790529


Training and evaluating on Hindi-Persian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-1.205211,-0.195307,0.330821,-0.105014,-0.559217,0.004228,0.736844,6.750008,0.193646,0.029961


f1-score :  0.9266409266409267
precision :  0.967741935483871
recall :  0.8888888888888888
accuracy :  0.9577777777777777
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       315
           1       0.97      0.89      0.93       135

    accuracy                           0.96       450
   macro avg       0.96      0.94      0.95       450
weighted avg       0.96      0.96      0.96       450

[[311   4]
 [ 15 120]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
106,106,3238,मौलवी,مخلوع,mɔːlviː,mxlvʔ,cleric,Makhloo,0.571429,0.4,0.125,0.141667,2.6,0.125,5,hard_negative,0,-2.220913,0.609177,0.713042
220,220,4282,शैतान,شیطان,ʃæːtaːn,ʃjtɒn,Satan,Satan,0.571429,0.2,0.066667,0.075,1.075,0.066667,5,synonym,0,0.996812,0.568882,0.774358
357,357,2537,बदनाम,بدنام,bədnaːm,bdnɒm,Infamous,Infamous,0.428571,0.166667,0.170139,0.1875,1.458333,0.1875,5,hard_negative,0,3.461268,0.412911,0.808907
385,385,266,इंतक़ाल,امتثال,intqaːl,ɒmtsɒl,timeout,compliance,0.714286,0.333333,0.097222,0.104167,1.208333,0.097222,7,hard_negative,0,-1.75611,0.334941,0.623539


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
34,34,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639
70,70,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651
94,94,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052
144,144,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977
171,171,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013
192,192,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621
217,217,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
244,244,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885
273,273,910,मद्रसा,مَدْرَسَة,mədrəsaː,mædْræsæة,Madrasa,School,0.555556,0.0,0.041667,0.041667,0.428571,0.041667,9,loan,1,-10.278879,0.535047,0.643058
341,341,824,बलूचिस्तान,بلوچستان,bəluːt͡ʃistaːn,blvt͡ʃstɒn,Balochistan,Balochistan,0.428571,0.3,0.227083,0.254167,2.45,0.247917,10,loan,1,-7.575051,0.443984,0.625031


Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-0.504236,0.370791,-0.517518,1.274296,-0.440389,-0.531624,-1.040692,3.43975,0.56339,0.254884


f1-score :  0.8939828080229226
precision :  0.9873417721518988
recall :  0.8167539267015707
accuracy :  0.8546168958742633
              precision    recall  f1-score   support

           0       0.64      0.97      0.77       127
           1       0.99      0.82      0.89       382

    accuracy                           0.85       509
   macro avg       0.81      0.89      0.83       509
weighted avg       0.90      0.85      0.86       509

[[123   4]
 [ 70 312]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
277,277,560,cadre,cadis,ˈkædri,kadi,frame,cadiz,0.5,0.2,0.166667,0.2,1.45,0.2,2,hard_negative,0,,1.179581,0.517538,0.470978
328,328,292,baguette,banquette,ˌbæˈgɛt,bɑ̃kɛt,baguette,bench,0.571429,0.2,0.2,0.225,1.7,0.220833,2,hard_negative,0,,-2.767203,0.576753,0.865124
363,363,1381,en plein air,en plein,ɛn plin ɛr,ən plɛ̃,outside,and square,0.5,0.375,0.341146,0.385417,2.84375,0.385417,4,hard_negative,0,,-2.798578,0.841374,0.860655
487,487,10058,corsair,corsage,ˈkɔrsɛr,kɔʀsaʒ,corsair,corsage,0.571429,0.166667,0.076389,0.104167,1.0625,0.076389,2,hard_negative,0,,3.4544,0.850421,0.820582


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
5,5,2314,kilogram,kilogramme,ˈkɪləˌgræm,kilɔɡʀɑm,kilogram,kilogram,0.7,0.125,0.151042,0.177083,1.328125,0.164062,2,loan,1,,-4.398973,0.922724,0.739337
10,10,885,chapé,échapper,chapé*é,eʃapəʀ,escaped,escape,0.714286,0.666667,0.111111,0.131944,2.770833,0.111111,4,loan,1,,-12.475492,0.28053,0.724244
17,17,1760,état major,état,éétat* ˈmeɪʤər,eta,Staff,state,0.857143,0.625,0.622917,0.7,5.075,0.7,6,loan,1,,-5.155156,0.582395,0.92188
29,29,140,ambassadry,embassadrie,ambassadry*,ɑ̃basadʀi,embassy,embassadrie,0.545455,0.2,0.216667,0.245833,1.8625,0.235417,3,loan,1,,-6.937426,0.517142,0.79236
37,37,3154,quintilliard,quintilliard,quintilliard*,kɛ̃tijjaʀ,quintilliard,quintilliard,0.769231,0.333333,0.357639,0.399306,3.260417,0.387153,0,loan,1,,-10.514448,1.0,1.0
39,39,3145,quatre quart,quatre-quarts,quatre* kwɔrt,katʀə-kaʀ,four quarters,four-quarters,0.769231,0.272727,0.30303,0.344697,2.590909,0.329545,2,loan,1,,-7.247688,0.775167,0.537955
41,41,2035,geste antagoniste,geste antagoniste,geste* antagoniste*,ʒɛstə ɑ̃taɡɔnist,antagonistic gesture,antagonistic gesture,0.526316,0.285714,0.206845,0.229167,2.25,0.209821,0,loan,1,,-11.636612,1.0,1.0
49,49,3581,thermometer,thermomètre,θərˈmɑmətər,tɛʀmɔmə̀tʀ,thermometer,thermometer,0.727273,0.1,0.141667,0.175,1.425,0.158333,3,loan,1,,-3.57576,0.902252,0.790529
69,69,1296,cyme,cime,cyme*,sim,cyme,tops,0.8,0.5,0.291667,0.322917,2.71875,0.317708,1,loan,1,,-3.919738,0.660974,0.82649
75,75,103,aioli,aïoli,aioli*,aiɔli,aioli,aïoli,0.333333,0.0,0.008333,0.008333,0.05,0.008333,1,loan,1,,-7.425697,0.549787,0.869716


Evaluating on balanced splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-1.116035,0.249119,-0.355424,0.64595,-0.266503,-0.318055,-0.121683,4.310355,0.558623,0.244093



Evaluating on all langs
f1-score :  0.911550468262227
precision :  0.9864864864864865
recall :  0.8471953578336557
accuracy :  0.8797736916548797
              precision    recall  f1-score   support

           0       0.70      0.97      0.81       190
           1       0.99      0.85      0.91       517

    accuracy                           0.88       707
   macro avg       0.84      0.91      0.86       707
weighted avg       0.91      0.88      0.88       707

[[184   6]
 [ 79 438]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
124,124,4034,सुक़रात,ثروت,suqraːt,srvt,Socrates,ثروت,0.571429,0.5,0.350694,0.395833,3.75,0.385417,7,hard_negative,0,-2.138875,0.462361,0.707515,
212,212,1493,तंग,ترگ,təŋɡə,trɡ,Narrow,ترگ,0.6,0.6,0.4125,0.475,3.625,0.454167,3,hard_negative,0,-2.665791,0.646124,0.810342,
365,94,7958,hymen,carbone,ˈhaɪmən,kaʀbɔn,hymen,carbone,0.714286,0.5,0.086806,0.097222,2.104167,0.086806,6,hard_negative,0,-0.837086,0.320901,0.623296,
595,324,1299,dupion,dauphine,dupion*,dofin,dupion,dauphine,0.571429,0.166667,0.173611,0.194444,1.5625,0.190972,4,hard_negative,0,-2.970491,0.616064,0.732397,
635,364,5570,pertinent,pertinence,ˈpərtɪnɪnt,pɛʀtinɑ̃s,pertinent,relevance,0.7,0.111111,0.159722,0.180556,1.486111,0.168981,2,hard_negative,0,-3.08316,0.716512,0.880625,
640,369,1473,espalier,espalière,ɛˈspæljər,ɛspaljə̀ʀ,espalier,espalier,0.444444,0.0,0.020833,0.03125,0.234375,0.020833,2,hard_negative,0,-0.153409,0.844079,0.692612,


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,Unnamed: 0.1.1
3,3,688,निगाह,نگه,niɡaːɦ,ŋh,gaze,Hold,1.0,0.6,0.566667,0.625,4.825,0.625,5,loan,1,-3.592351,0.604057,0.479274,
24,24,1194,संगतराश,سنگتراش,səŋɡtəraːʃ,sŋtrɒʃ,freebies,سنگتراش,0.5,0.333333,0.3125,0.347222,2.583333,0.347222,7,loan,1,-4.075851,0.296396,0.78492,
44,44,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621,
82,82,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977,
99,99,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758,
102,102,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885,
111,111,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237,
135,135,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013,
148,148,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639,
164,164,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651,


Evaluating on Hindi-Persian
f1-score :  0.9254901960784313
precision :  0.9833333333333333
recall :  0.8740740740740741
accuracy :  0.9298892988929889
              precision    recall  f1-score   support

           0       0.89      0.99      0.93       136
           1       0.98      0.87      0.93       135

    accuracy                           0.93       271
   macro avg       0.94      0.93      0.93       271
weighted avg       0.94      0.93      0.93       271

[[134   2]
 [ 17 118]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
124,124,4034,सुक़रात,ثروت,suqraːt,srvt,Socrates,ثروت,0.571429,0.5,0.350694,0.395833,3.75,0.385417,7,hard_negative,0,-2.138875,0.462361,0.707515
212,212,1493,तंग,ترگ,təŋɡə,trɡ,Narrow,ترگ,0.6,0.6,0.4125,0.475,3.625,0.454167,3,hard_negative,0,-2.665791,0.646124,0.810342


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
3,3,688,निगाह,نگه,niɡaːɦ,ŋh,gaze,Hold,1.0,0.6,0.566667,0.625,4.825,0.625,5,loan,1,-3.592351,0.604057,0.479274
24,24,1194,संगतराश,سنگتراش,səŋɡtəraːʃ,sŋtrɒʃ,freebies,سنگتراش,0.5,0.333333,0.3125,0.347222,2.583333,0.347222,7,loan,1,-4.075851,0.296396,0.78492
44,44,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621
82,82,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977
99,99,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
102,102,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885
111,111,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237
135,135,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013
148,148,1125,वाक़ई,واقعی,vaːqəi,vɒɣʔj,really,Real,0.833333,0.4,0.116667,0.133333,2.0,0.116667,5,loan,1,-0.559847,0.387109,0.69639
164,164,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651


Evaluating on English-French
f1-score :  0.9065155807365438
precision :  0.9876543209876543
recall :  0.837696335078534
accuracy :  0.8486238532110092
              precision    recall  f1-score   support

           0       0.45      0.93      0.60        54
           1       0.99      0.84      0.91       382

    accuracy                           0.85       436
   macro avg       0.72      0.88      0.75       436
weighted avg       0.92      0.85      0.87       436

[[ 50   4]
 [ 62 320]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
94,94,7958,hymen,carbone,ˈhaɪmən,kaʀbɔn,hymen,carbone,0.714286,0.5,0.086806,0.097222,2.104167,0.086806,6,hard_negative,0,,-0.837086,0.320901,0.623296
324,324,1299,dupion,dauphine,dupion*,dofin,dupion,dauphine,0.571429,0.166667,0.173611,0.194444,1.5625,0.190972,4,hard_negative,0,,-2.970491,0.616064,0.732397
364,364,5570,pertinent,pertinence,ˈpərtɪnɪnt,pɛʀtinɑ̃s,pertinent,relevance,0.7,0.111111,0.159722,0.180556,1.486111,0.168981,2,hard_negative,0,,-3.08316,0.716512,0.880625
369,369,1473,espalier,espalière,ɛˈspæljər,ɛspaljə̀ʀ,espalier,espalier,0.444444,0.0,0.020833,0.03125,0.234375,0.020833,2,hard_negative,0,,-0.153409,0.844079,0.692612


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
3,3,2217,impermeability,imperméabilité,impermeability*,ɛ̃pɛʀmeabilite,impermeability,impermeability,0.4,0.071429,0.096726,0.110119,0.866071,0.104167,2,loan,1,,-14.115271,0.802988,0.477161
9,9,986,cinema,cinéma,ˈsɪnəmə,sinema,cinema,movie theater,0.571429,0.0,0.041667,0.041667,0.375,0.041667,1,loan,1,,-7.539831,0.86588,0.836437
13,13,1429,detergent,détergent,dɪˈtərʤənt,detɛʀʒɑ̃,detergent,detergent,0.8,0.375,0.221354,0.260417,3.078125,0.236979,1,loan,1,,-0.962433,0.569331,0.520957
15,15,1296,cyme,cime,cyme*,sim,cyme,tops,0.8,0.5,0.291667,0.322917,2.71875,0.317708,1,loan,1,,-3.919738,0.660974,0.82649
19,19,46,accolé,not comparable,accolé*é,nɔt kɔ̃paʀabl,attached,not comparable,1.0,0.727273,0.399621,0.439394,3.590909,0.435606,11,loan,1,,-4.059823,0.466399,0.495811
21,21,2680,musique concrète,musique concrète,musique* concrète*,myzikə kɔ̃kʀə̀t,concrete music,concrete music,0.833333,0.428571,0.222222,0.247222,1.875,0.241667,0,loan,1,,-9.241386,1.0,1.0
22,22,1609,emetic,émétique,ɪˈmɛtɪk,emetik,emetic,emetic,0.571429,0.0,0.027778,0.027778,0.208333,0.027778,5,loan,1,,-3.677054,0.523215,0.745417
24,24,1360,degenerescence,dégénérescence,degenerescence*,deʒeneʀəsɑ̃s,degeneration,degeneration,0.6,0.384615,0.259615,0.291667,2.769231,0.274038,3,loan,1,,-7.066214,0.740021,0.785161
25,25,3145,quatre quart,quatre-quarts,quatre* kwɔrt,katʀə-kaʀ,four quarters,four-quarters,0.769231,0.272727,0.30303,0.344697,2.590909,0.329545,2,loan,1,,-7.247688,0.775167,0.537955
37,37,3457,somnambulist,somnambuliste,somnambulist*,sɔnɑ̃bylist,somnambulist,somnambulist,0.461538,0.166667,0.164931,0.184028,1.375,0.182292,1,loan,1,,-7.880839,0.957998,0.694461


Training and evaluating on Hindi-Persian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-1.366577,-0.328989,0.187031,0.031336,-0.522714,0.088124,0.492637,5.956102,0.139581,0.195086


f1-score :  0.9425287356321839
precision :  0.9761904761904762
recall :  0.9111111111111111
accuracy :  0.9446494464944649
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       136
           1       0.98      0.91      0.94       135

    accuracy                           0.94       271
   macro avg       0.95      0.94      0.94       271
weighted avg       0.95      0.94      0.94       271

[[133   3]
 [ 12 123]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
71,71,3322,रमज़ान,حمدان,rəmzaːn,hmdɒn,Ramadan,Hamdan,0.714286,0.166667,0.201389,0.236111,2.020833,0.229167,6,hard_negative,0,-2.678188,0.593762,0.705876
124,124,4034,सुक़रात,ثروت,suqraːt,srvt,Socrates,ثروت,0.571429,0.5,0.350694,0.395833,3.75,0.385417,7,hard_negative,0,-2.138875,0.462361,0.707515
212,212,1493,तंग,ترگ,təŋɡə,trɡ,Narrow,ترگ,0.6,0.6,0.4125,0.475,3.625,0.454167,3,hard_negative,0,-2.665791,0.646124,0.810342


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim
3,3,688,निगाह,نگه,niɡaːɦ,ŋh,gaze,Hold,1.0,0.6,0.566667,0.625,4.825,0.625,5,loan,1,-3.592351,0.604057,0.479274
44,44,1083,रेशमी,ابریشمی,reʃmiː,ɒbrjʃmj,silky,Silk,0.714286,0.571429,0.291667,0.333333,2.892857,0.321429,7,loan,1,-12.706986,0.500842,0.648621
82,82,1152,शहज़ादा,شاهزاده,ʃəɦzaːdaː,ʃɒhzɒdh,prince,Prince,0.666667,0.142857,0.077381,0.083333,1.232143,0.077381,7,loan,1,-8.861265,0.444924,0.738977
99,99,6,अंदेशा,اندیشه,ndeʃaː,ɒndjʃh,suspect,Thought,0.666667,0.5,0.211806,0.243056,2.625,0.229167,6,loan,1,-3.063483,0.391255,0.710758
102,102,39,अलादीन,علاءالدین,alaːdiːn,ʔlɒʔɒldjn,aladdin,Alaeddin,0.777778,0.444444,0.356481,0.398148,3.388889,0.388889,9,loan,1,-14.802485,0.527945,0.658885
111,111,79,आशिक़ी,عاشقی,aːʃiqiː,ʔɒʃɣj,Aashiqui,being in love,0.857143,0.6,0.25,0.291667,4.125,0.25,6,loan,1,-1.864414,0.469438,0.733237
135,135,787,फ़ौरन,فوراً,fɔːrən,fvrɒً,immediately,immediately,0.666667,0.4,0.279167,0.308333,3.4,0.295833,5,loan,1,-2.609516,0.465668,0.608013
164,164,392,जलालाबाद,جلال‌آباد,d͡ʒəlaːlaːbaːd,d͡ʒlɒl‌ɒbɒd,Jalalabad,Jalalabad,0.5,0.111111,0.141204,0.152778,1.305556,0.152778,9,loan,1,-7.722263,0.589794,0.724651
194,194,966,मिरजई,مرزائی,mird͡ʒəi,mrzɒjʔj,mirzai,Mirzai,0.75,0.714286,0.270833,0.333333,3.946429,0.285714,6,loan,1,-6.764426,0.668898,0.697918
202,202,729,पाजामा,پايجامه / پاجامه,paːd͡ʒaːmaː,pɒjd͡ʒɒmh / pɒd͡ʒɒmh,pajamas,Pajamas,0.8,0.538462,0.512821,0.567308,4.25,0.567308,16,loan,1,-20.556976,0.564887,0.668052


Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim
0,-0.674053,0.560808,-0.707315,1.442646,-0.60889,-0.565429,-0.837315,3.293219,0.50363,0.244202


f1-score :  0.9119318181818181
precision :  0.9968944099378882
recall :  0.8403141361256544
accuracy :  0.8577981651376146
              precision    recall  f1-score   support

           0       0.46      0.98      0.63        54
           1       1.00      0.84      0.91       382

    accuracy                           0.86       436
   macro avg       0.73      0.91      0.77       436
weighted avg       0.93      0.86      0.88       436

[[ 53   1]
 [ 61 321]]



Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
369,369,1473,espalier,espalière,ɛˈspæljər,ɛspaljə̀ʀ,espalier,espalier,0.444444,0.0,0.020833,0.03125,0.234375,0.020833,2,hard_negative,0,,-0.153409,0.844079,0.692612


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1,DNN_logits,MBERT_cos_sim,XLM_cos_sim
3,3,2217,impermeability,imperméabilité,impermeability*,ɛ̃pɛʀmeabilite,impermeability,impermeability,0.4,0.071429,0.096726,0.110119,0.866071,0.104167,2,loan,1,,-14.115271,0.802988,0.477161
9,9,986,cinema,cinéma,ˈsɪnəmə,sinema,cinema,movie theater,0.571429,0.0,0.041667,0.041667,0.375,0.041667,1,loan,1,,-7.539831,0.86588,0.836437
15,15,1296,cyme,cime,cyme*,sim,cyme,tops,0.8,0.5,0.291667,0.322917,2.71875,0.317708,1,loan,1,,-3.919738,0.660974,0.82649
19,19,46,accolé,not comparable,accolé*é,nɔt kɔ̃paʀabl,attached,not comparable,1.0,0.727273,0.399621,0.439394,3.590909,0.435606,11,loan,1,,-4.059823,0.466399,0.495811
21,21,2680,musique concrète,musique concrète,musique* concrète*,myzikə kɔ̃kʀə̀t,concrete music,concrete music,0.833333,0.428571,0.222222,0.247222,1.875,0.241667,0,loan,1,,-9.241386,1.0,1.0
22,22,1609,emetic,émétique,ɪˈmɛtɪk,emetik,emetic,emetic,0.571429,0.0,0.027778,0.027778,0.208333,0.027778,5,loan,1,,-3.677054,0.523215,0.745417
24,24,1360,degenerescence,dégénérescence,degenerescence*,deʒeneʀəsɑ̃s,degeneration,degeneration,0.6,0.384615,0.259615,0.291667,2.769231,0.274038,3,loan,1,,-7.066214,0.740021,0.785161
25,25,3145,quatre quart,quatre-quarts,quatre* kwɔrt,katʀə-kaʀ,four quarters,four-quarters,0.769231,0.272727,0.30303,0.344697,2.590909,0.329545,2,loan,1,,-7.247688,0.775167,0.537955
27,27,2219,impersonal,impersonnel,ˌɪmˈpərsənəl,ɛ̃pɛʀsɔnəl,impersonal,impersonal,0.583333,0.1,0.122917,0.141667,1.1125,0.133333,2,loan,1,,-5.260839,0.939827,0.869831
37,37,3457,somnambulist,somnambuliste,somnambulist*,sɔnɑ̃bylist,somnambulist,somnambulist,0.461538,0.166667,0.164931,0.184028,1.375,0.182292,1,loan,1,,-7.880839,0.957998,0.694461


# Neural Network classifier

## Definition

In [None]:
class NeuralNetwork(nn.Module):
    """Feed-forward binary classifier.

    Architecture: Linear(n_features, 256) -> ReLU -> Dropout(0.1)
    -> Linear(256, 128) -> ReLU -> Dropout(0.1) -> Linear(128, 1).
    """

    def __init__(self, n_features):
        """Build the layer stack for inputs with ``n_features`` columns."""
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Return ``(probabilities, logits)`` for the batch ``x``.

        ``probabilities`` is the sigmoid of ``logits``; both have one column.
        """
        logits = self.linear_relu_stack(x)
        return torch.sigmoid(logits), logits

    def fit(self, X_train, Y_train, X_val, Y_val, criterion, optimizer, n_epochs=5000):
        """Full-batch training loop with periodic validation.

        Every ``max(1, n_epochs // 50)`` epochs the training loss/accuracy of
        the current step and the validation loss/accuracy are printed and
        recorded.

        Args:
            X_train, X_val: feature tensors (cast to float before use).
            Y_train, Y_val: target tensors, compared against thresholded
                predictions for accuracy and cast to float for the loss.
            criterion: loss called as ``criterion(probabilities, targets)``.
            optimizer: optimizer over this model's parameters.
            n_epochs: number of optimisation steps.

        Returns:
            ``(train_losses, val_losses, train_accur, val_accur)`` - four
            lists of numpy scalars, one entry per logged checkpoint.
        """
        train_losses = []
        val_losses = []
        train_accur = []
        val_accur = []

        # n_epochs // 50 is 0 when n_epochs < 50, which would make the modulo
        # below raise ZeroDivisionError; clamp to logging every epoch instead.
        log_every = max(1, n_epochs // 50)

        # The casts are loop-invariant, so do them once.
        x_train = X_train.float()
        y_train = Y_train.float()
        x_val = X_val.float()
        y_val = Y_val.float()

        self.train()
        for epoch in range(n_epochs):
            y_pred, _ = self(x_train)
            train_loss = criterion(y_pred, y_train)

            if epoch % log_every == 0:
                train_acc, _ = self.calculate_accuracy(Y_train, y_pred)

                # Validate with dropout disabled and without building an
                # autograd graph, then switch back to training mode.
                self.eval()
                with torch.no_grad():
                    y_val_pred = self(x_val)[0]
                    val_loss = criterion(y_val_pred, y_val)
                self.train()

                val_acc, _ = self.calculate_accuracy(Y_val, y_val_pred)

                print(f'''epoch {epoch}
                    Train set - loss: {self.round_tensor(train_loss)}, accuracy: {self.round_tensor(train_acc)} 
                    Val set - loss: {self.round_tensor(val_loss)}, accuracy: {self.round_tensor(val_acc)}''')

                train_losses.append(train_loss.detach().cpu().numpy())
                val_losses.append(val_loss.detach().cpu().numpy())

                val_accur.append(val_acc.detach().cpu().numpy())
                train_accur.append(train_acc.detach().cpu().numpy())

            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

        return train_losses, val_losses, train_accur, val_accur

    def calculate_accuracy(self, y_true, y_pred):
        """Threshold ``y_pred`` at 0.5 and compare with ``y_true``.

        Returns:
            ``(accuracy, n_correct)`` as tensors, where accuracy is
            ``n_correct / len(y_true)``.
        """
        predicted = y_pred.ge(.5)
        n_correct = (y_true == predicted).sum()
        return (n_correct.float() / len(y_true), n_correct)

    def round_tensor(self, t, decimal_places=3):
        """Return the single value held by tensor ``t`` as a rounded Python float."""
        return round(t.item(), decimal_places)

    def plot_losses(self, train_losses, val_losses, train_accur, val_accur):
        """Plot the histories returned by ``fit`` as two figures (accuracy, loss).

        The x axis counts logged checkpoints (1, 2, ...), not raw epochs.
        """
        epochs = range(1, len(train_accur) + 1)

        plt.plot(epochs, train_accur, 'bo', label='Training acc')
        plt.plot(epochs, val_accur, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()

        plt.plot(epochs, train_losses, 'bo', label='Training loss')
        plt.plot(epochs, val_losses, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()

In [None]:
def nn_evaluate(full_train_set, full_test_set, pairs_list):
    """Train and evaluate ``NeuralNetwork`` classifiers on one family of splits.

    Three passes are run, in this order:
      1. one model trained on all pairs concatenated and scored on the
         concatenated test sets;
      2. that same model scored on each pair's test set separately;
      3. a fresh model trained and scored for each pair on its own.

    Each evaluation prints F1, precision, recall, accuracy, the
    classification report and the confusion matrix, then displays the
    false-positive and false-negative rows.

    Args:
        full_train_set: mapping from pair key to training DataFrame.
        full_test_set: mapping from pair key to test DataFrame.
        pairs_list: iterable of pair keys to use (iterating a dict yields
            its keys, so a dict keyed by pair works too).

    Relies on the notebook-level names ``features``, ``labels`` and ``device``.
    NOTE(review): ``features``/``labels`` are defined outside this cell and are
    assumed to be column selectors for the input features and the binary
    (0/1) target - confirm.
    """

    def fit_scaler(x):
        # Per-column mean/std; a constant column has std 0 and would turn the
        # standardised data into NaN/inf, so scale such columns by 1 instead.
        means = np.mean(x, axis=0)
        stds = np.std(x, axis=0)
        return means, np.where(stds == 0, 1.0, stds)

    def train(train_df, describe=False):
        # Fit scaler + model on train_df; returns (model in eval mode, means, stds).
        x = train_df[features].values
        y = train_df[labels].values.ravel()
        means, stds = fit_scaler(x)
        x = (x - means) / stds

        model = NeuralNetwork(x.shape[1]).to(device)
        if describe:
            print(model)

        criterion = nn.BCELoss().to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.00001)

        # Hold out 20% of the training rows for validation curves.
        x_fit, x_val, y_fit, y_val = train_test_split(
            x, y, test_size=0.2, random_state=1, stratify=y)

        x_fit = torch.tensor(x_fit).to(device)
        x_val = torch.tensor(x_val).to(device)
        y_fit = torch.tensor(y_fit).reshape(-1, 1).to(device)
        y_val = torch.tensor(y_val).reshape(-1, 1).to(device)

        if describe:
            print("Training on all langs\n")
        train_losses, val_losses, train_accur, val_accur = \
            model.fit(x_fit, y_fit, x_val, y_val, criterion, optimizer, n_epochs=5000)
        model.plot_losses(train_losses, val_losses, train_accur, val_accur)

        model.eval()
        return model, means, stds

    def report(model, means, stds, test_df):
        # Score model on test_df, print metrics and show misclassified rows.
        x = (test_df[features].values - means) / stds
        y_true = test_df[labels].values.ravel()

        # Build the tensor once; re-wrapping an existing tensor in
        # torch.tensor() copies it and emits a UserWarning.
        x = torch.tensor(x).float().to(device)
        with torch.no_grad():
            probs = model(x)[0]
        y_pred = (probs > .5).cpu().numpy().ravel().astype(int)

        print("f1-score : ", f1_score(y_true, y_pred))
        print("precision : ", precision_score(y_true, y_pred))
        print("recall : ", recall_score(y_true, y_pred))
        print("accuracy : ", accuracy_score(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred))
        print()

        # Positional indices of the two kinds of error.
        fp = np.flatnonzero((y_pred == 1) & (y_true == 0))
        fn = np.flatnonzero((y_pred == 0) & (y_true == 1))

        display(test_df.iloc[fp, :].style.set_caption('False positives'))
        display(test_df.iloc[fn, :].style.set_caption('False negatives'))

    # Materialise once so a one-shot iterable can be walked several times.
    pair_keys = list(pairs_list)

    train_all = pd.concat([full_train_set[pair] for pair in pair_keys])
    test_all = pd.concat([full_test_set[pair] for pair in pair_keys])

    torch.manual_seed(7)
    random.seed(7)
    np.random.seed(7)

    model, x_means, x_stds = train(train_all, describe=True)

    # The concatenated frame repeats index labels; renumber it before display.
    report(model, x_means, x_stds, test_all.reset_index(drop=True))

    for pair in pair_keys:
        print(f'Evaluating on {pair}')
        report(model, x_means, x_stds, full_test_set[pair])

    for pair in pair_keys:
        print(f'Training and evaluating on {pair}')
        model, x_means, x_stds = train(full_train_set[pair])
        report(model, x_means, x_stds, full_test_set[pair])

In [None]:
# Run the neural-network evaluation once per dataset variant.
for banner, train_split, test_split in (
    ("Evaluating on alldata splits\n", train_alldata, test_alldata),
    ("Evaluating on realdist splits\n", train_realdist, test_realdist),
    ("Evaluating on balanced splits\n", train_balanced, test_balanced),
):
    print(banner)
    nn_evaluate(train_split, test_split, pairs)

# SVM Classifier

In [None]:
def svm_evaluate(full_train_set, full_test_set, pairs_list):
    """Train and evaluate linear-kernel SVMs on one family of splits.

    Three passes are run, in this order:
      1. one SVM trained on all pairs concatenated and scored on the
         concatenated test sets;
      2. that same SVM scored on each pair's test set separately;
      3. a fresh SVM trained and scored for each pair on its own.

    After each fit the coefficient vector is displayed; each evaluation
    prints F1, precision, recall, accuracy, the classification report and
    the confusion matrix, then displays the false-positive and
    false-negative rows.

    Args:
        full_train_set: mapping from pair key to training DataFrame.
        full_test_set: mapping from pair key to test DataFrame.
        pairs_list: iterable of pair keys to use (iterating a dict yields
            its keys, so a dict keyed by pair works too).

    Relies on the notebook-level names ``features`` and ``labels``.
    NOTE(review): both are defined outside this cell and are assumed to be
    column selectors for the input features and the binary (0/1) target -
    confirm.
    """

    def fit_svm(train_df):
        # Standardise train_df and fit an SVM; returns (classifier, means, stds).
        x = train_df[features].values
        y = train_df[labels].values.ravel()
        means = np.mean(x, axis=0)
        stds = np.std(x, axis=0)
        # A constant column has std 0; scale it by 1 to avoid NaN/inf.
        stds = np.where(stds == 0, 1.0, stds)

        clf = SVC(kernel='linear')
        clf.fit((x - means) / stds, y)

        # display the SVM coefficient assigned to each feature
        display(pd.DataFrame(clf.coef_, columns=features).style.set_caption('Weights'))
        return clf, means, stds

    def report(clf, means, stds, test_df):
        # Score clf on test_df, print metrics and show misclassified rows.
        x = (test_df[features].values - means) / stds
        y_true = test_df[labels].values.ravel()
        y_pred = clf.predict(x)

        print("f1-score : ", f1_score(y_true, y_pred))
        print("precision : ", precision_score(y_true, y_pred))
        print("recall : ", recall_score(y_true, y_pred))
        print("accuracy : ", accuracy_score(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred))
        print()

        # Positional indices of the two kinds of error.
        fp = np.flatnonzero((y_pred == 1) & (y_true == 0))
        fn = np.flatnonzero((y_pred == 0) & (y_true == 1))

        display(test_df.iloc[fp, :].style.set_caption('False positives'))
        display(test_df.iloc[fn, :].style.set_caption('False negatives'))

    # Materialise once so a one-shot iterable can be walked several times.
    pair_keys = list(pairs_list)

    train_all = pd.concat([full_train_set[pair] for pair in pair_keys])
    test_all = pd.concat([full_test_set[pair] for pair in pair_keys])

    print("Training on all langs\n")
    clf, x_means, x_stds = fit_svm(train_all)
    print()

    print("Evaluating on all langs")
    # The concatenated frame repeats index labels; renumber it before display.
    report(clf, x_means, x_stds, test_all.reset_index(drop=True))

    for pair in pair_keys:
        print(f'Evaluating on {pair}')
        report(clf, x_means, x_stds, full_test_set[pair])

    for pair in pair_keys:
        print(f'Training and evaluating on {pair}')
        clf, x_means, x_stds = fit_svm(full_train_set[pair])
        report(clf, x_means, x_stds, full_test_set[pair])

In [None]:
# Run the SVM evaluation once per dataset variant.
for banner, train_split, test_split in (
    ("Evaluating on alldata splits\n", train_alldata, test_alldata),
    ("Evaluating on realdist splits\n", train_realdist, test_realdist),
    ("Evaluating on balanced splits\n", train_balanced, test_balanced),
):
    print(banner)
    svm_evaluate(train_split, test_split, pairs)

# Random Forest Classifier 

In [None]:
def rf_evaluate(full_train_set, full_test_set, pairs_list):
    """Train and evaluate random-forest classifiers on one family of splits.

    Three passes are run, in this order:
      1. one forest trained on all pairs concatenated and scored on the
         concatenated test sets;
      2. that same forest scored on each pair's test set separately;
      3. a fresh forest trained and scored for each pair on its own.

    Each evaluation prints F1, precision, recall, accuracy, the
    classification report and the confusion matrix, then displays the
    false-positive and false-negative rows.

    Args:
        full_train_set: mapping from pair key to training DataFrame.
        full_test_set: mapping from pair key to test DataFrame.
        pairs_list: iterable of pair keys to use (iterating a dict yields
            its keys, so a dict keyed by pair works too).

    Relies on the notebook-level names ``features`` and ``labels``.
    NOTE(review): both are defined outside this cell and are assumed to be
    column selectors for the input features and the binary (0/1) target -
    confirm.
    """

    def fit_forest(train_df):
        # Standardise train_df and fit a forest; returns (classifier, means, stds).
        x = train_df[features].values
        y = train_df[labels].values.ravel()
        means = np.mean(x, axis=0)
        stds = np.std(x, axis=0)
        # A constant column has std 0; scale it by 1 to avoid NaN/inf.
        stds = np.where(stds == 0, 1.0, stds)

        forest = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)
        forest.fit((x - means) / stds, y)
        return forest, means, stds

    def report(forest, means, stds, test_df):
        # Score forest on test_df, print metrics and show misclassified rows.
        x = (test_df[features].values - means) / stds
        y_true = test_df[labels].values.ravel()
        y_pred = forest.predict(x)

        print("f1-score : ", f1_score(y_true, y_pred))
        print("precision : ", precision_score(y_true, y_pred))
        print("recall : ", recall_score(y_true, y_pred))
        print("accuracy : ", accuracy_score(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred))
        print()

        # Positional indices of the two kinds of error.
        fp = np.flatnonzero((y_pred == 1) & (y_true == 0))
        fn = np.flatnonzero((y_pred == 0) & (y_true == 1))

        display(test_df.iloc[fp, :].style.set_caption('False positives'))
        display(test_df.iloc[fn, :].style.set_caption('False negatives'))

    # Materialise once so a one-shot iterable can be walked several times.
    pair_keys = list(pairs_list)

    train_all = pd.concat([full_train_set[pair] for pair in pair_keys])
    test_all = pd.concat([full_test_set[pair] for pair in pair_keys])

    print("Training on all langs\n")
    forest, x_means, x_stds = fit_forest(train_all)
    print()

    print("Evaluating on all langs")
    # The concatenated frame repeats index labels; renumber it before display.
    report(forest, x_means, x_stds, test_all.reset_index(drop=True))

    for pair in pair_keys:
        print(f'Evaluating on {pair}')
        report(forest, x_means, x_stds, full_test_set[pair])

    for pair in pair_keys:
        print(f'Training and evaluating on {pair}')
        forest, x_means, x_stds = fit_forest(full_train_set[pair])
        report(forest, x_means, x_stds, full_test_set[pair])

In [None]:
# Run the random-forest evaluation once per dataset variant.
for banner, train_split, test_split in (
    ("Evaluating on alldata splits\n", train_alldata, test_alldata),
    ("Evaluating on realdist splits\n", train_realdist, test_realdist),
    ("Evaluating on balanced splits\n", train_balanced, test_balanced),
):
    print(banner)
    rf_evaluate(train_split, test_split, pairs)