Loan word classification experiments

GPU compatible

In [1]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd 
import numpy as np 
import os
import io
import requests
import csv
import json
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import pprint
import matplotlib.pyplot as plt
import random

In [2]:
import torch
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
# device = 'cpu'
# print(device)

Using cuda device


In [19]:
pairs = None

with open('../language-pairs.json', 'r') as f:
    pairs = json.loads(f.read())
    print(pairs)
    
train_alldata = {}
test_alldata = {}

train_realdist = {}
test_realdist = {}

train_balanced = {}
test_balanced = {}

for pair in pairs:
    print(pair)
    L1 = pairs[pair]['target']['name']
    L2 = pairs[pair]['source']['name']

    # load datasets
    prefix = f'../Datasets/production_train_test/{L1}-{L2}'

    train_alldata[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_alldata[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    train_realdist[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_realdist[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_balanced[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_balanced[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')
    
    
features = ['Fast Levenshtein Distance Div Maxlen',
            'Dolgo Prime Distance Div Maxlen',
            'Feature Edit Distance Div Maxlen',
            'Hamming Feature Distance Div Maxlen',
            'Weighted Feature Distance Div Maxlen',
            'Partial Hamming Feature Distance Div Maxlen',
            'plain Levenshtein',
            'DNN_logits',
            'MBERT_cos_sim',
            'XLM_cos_sim'
           ]


# features = ['Fast Levenshtein Distance Div Maxlen',
#             'Dolgo Prime Distance Div Maxlen',
#             'Feature Edit Distance Div Maxlen',
#             'Hamming Feature Distance Div Maxlen',
#             'Weighted Feature Distance Div Maxlen',
#             'Partial Hamming Feature Distance Div Maxlen',
#             'plain Levenshtein',
             
#            ]




# features = [ 
#             'plain Levenshtein',
#             'DNN_logits',
#             'MBERT_cos_sim',
#             'XLM_cos_sim'
#            ]


# features = [ 
#             'plain Levenshtein',
             
#             'MBERT_cos_sim',
#             'XLM_cos_sim'
#            ]

# features = [ 
            
#                 'DNN_logits',
             
#             'MBERT_cos_sim',
#             'XLM_cos_sim'
#            ]
# features = [ 
#             'Partial Hamming Feature Distance Div Maxlen',
#             'plain Levenshtein',
#                 'DNN_logits',
             
#             'MBERT_cos_sim',
#             'XLM_cos_sim'
#            ]


labels = ['label_bin']

{'Hindi-Persian': {'target': {'name': 'Hindi', 'epi': 'hin-Deva', 'gtrans': 'hi', 'unicode': 'Devanagari'}, 'source': {'name': 'Persian', 'epi': 'fas-Arab', 'gtrans': 'fa', 'unicode': 'Arabic'}, 'wiki': 'https://en.m.wiktionary.org/wiki/Category:Hindi_terms_borrowed_from_Persian', 'loan-frac': '.3'}, 'English-French': {'target': {'name': 'English', 'epi': 'eng-Latn', 'gtrans': 'en', 'unicode': 'Latin'}, 'source': {'name': 'French', 'epi': 'fra-Latn', 'gtrans': 'fr', 'unicode': 'Latin'}, 'wiki': 'https://en.m.wiktionary.org/wiki/Category:English_terms_borrowed_from_French', 'loan-frac': '.3'}, 'Finnish-Swedish': {'target': {'name': 'Finnish', 'epi': 'fin-Latn', 'gtrans': 'fi', 'unicode': 'Latin'}, 'source': {'name': 'Swedish', 'epi': 'swe-Latn', 'gtrans': 'sv', 'unicode': 'Latin'}, 'wiki': 'https://en.m.wiktionary.org/wiki/Category:Finnish_terms_borrowed_from_Swedish', 'loan-frac': '.1'}, 'Kazakh-Russian': {'target': {'name': 'Kazakh', 'epi': 'kaz-Cyrl', 'gtrans': 'kk', 'unicode': 'Cyri

In [20]:
unicode_list = ["Latin", "Greek", "Cyrillic", "Arabic", "Devanagari", "Bengali", "Gurmukhi", "Tamil", "Telugu",\
                "Malayalam", "Myanmar", "Chinese"]

unicode_list = np.array(unicode_list)
if unicode_list.ndim == 1:
    unicode_list = unicode_list.reshape((-1, 1)) 
unicode_map = (unicode_list == np.unique(unicode_list)).astype(int).tolist()
unicode_map = dict(zip(unicode_list.reshape(-1,).tolist(), unicode_map))

In [21]:
for pair in pairs:
    # all data
    loan_unicode_onehots = pd.DataFrame(train_alldata[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(train_alldata[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    train_alldata[pair] = pd.concat([train_alldata[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    
    loan_unicode_onehots = pd.DataFrame(test_alldata[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_alldata[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_alldata[pair] = pd.concat([test_alldata[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    
    # real dist
    loan_unicode_onehots = pd.DataFrame(train_realdist[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(train_realdist[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    train_realdist[pair] = pd.concat([train_realdist[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
                
    loan_unicode_onehots = pd.DataFrame(test_realdist[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_realdist[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_realdist[pair] = pd.concat([test_realdist[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    
    # balanced
    loan_unicode_onehots = pd.DataFrame(train_balanced[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(train_balanced[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    train_balanced[pair] = pd.concat([train_balanced[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)

    loan_unicode_onehots = pd.DataFrame(test_balanced[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_balanced[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_balanced[pair] = pd.concat([test_balanced[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
        
features.extend([f'loan_unicode_{l[0]}' for l in unicode_list])
features.extend([f'original_unicode_{l[0]}' for l in unicode_list])

# Logistic Regression classifier

In [22]:
def classification_report_csv(report, language_pair, all_lang=False, train_evaluate_pair =True):
    #if language_pair ==True:
        
        #language_pair = pair

    report_data = []
    lines = report.split('\n')
    #print("lines",len(lines))
    #print(lines)
    for i, line in enumerate(lines[2:-3]):

        if i ==0 or i==1:
            #print("i", i)


           #print("actual lines",line)
            row = {}
            #row_data = line.split('      ')
            row_data = line.split(' ') 
            row_data = list(filter(None, row_data))
            #print(row_data)
            #print("rowdata",row_data[4] )
        row['all_language_evaluate'] = all_lang
        row['train_evaluate_pair'] = train_evaluate_pair
        row['language_pair'] = language_pair
        row['class'] = row_data[0]
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])

        if i==3:
            #print("actual lines", line)
            #print("i", i)
            row_data = line.split(' ') 
            row_data = list(filter(None, row_data))
            row['acc'] = float(row_data[1])

        report_data.append(row)
    print("final report", report_data[0:2])
    dataframe = pd.DataFrame.from_dict(report_data[0:2])
    return dataframe
    #dataframe.to_csv((f'{pair}-class_report.csv'), index=False)
    #class_report.to_csv((f'{pair}-class_report.csv'), index=False)

 

In [8]:
def lr_evaluate(full_train_set, full_test_set, pairs_list):
    all_result = []
    all_lang_fp_list= []
    all_lang_fn_list=[]
    evaluate_lang_fp_list = []
    evaluate_lang_fn_list = []
    train_evaluate_lang_fp_list = []
    train_evaluate_lang_fn_list = []
    weights_list = []
    
    pair ='All languages' #pair None for all languages evaluation
    
    
    
    
    display_fields = ['loan_word',
              'original_word',
              'loan_word_epitran',
              'original_word_epitran',
              'loan_english',
              'original_english'] + features
        
    train_set = pd.concat([full_train_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_train = train_set[features].values
    x_means = np.mean(x_train, axis=0)
    x_stds = np.std(x_train, axis=0)
    x_stds[x_stds == 0] = 1
    y_train = train_set[labels].values.ravel()
    
    test_set = pd.concat([full_test_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_test = test_set[features].values
    y_test = test_set[labels].values.ravel()
    
    # standardize input features
    x_train = (x_train - x_means)/x_stds
    
    print("Training on all langs\n")
    LR = LogisticRegression(random_state=1, solver='lbfgs', multi_class='ovr', max_iter=500).fit(x_train, y_train)

    # display regressor weights
    df = pd.DataFrame(LR.coef_, columns=features).style.set_caption('Weights')
    df_weights = pd.DataFrame(LR.coef_, columns=features) 
     
    df_weights.insert(0, 'language_pair', pair)
    
    weights_list.append(df_weights)
     
    display(df)
    print()
    
    print("Evaluating on all langs")
    x_test = (x_test - x_means)/x_stds
    y_pred = LR.predict(x_test)

    print("f1-score : ", f1_score(y_test, y_pred ))
    print("precision : ",precision_score(y_test, y_pred))
    print("recall : ",recall_score(y_test, y_pred )) 
    print("accuracy : ",accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, output_dict=True))
    #class_report = classification_report(y_test, y_pred)
    #a = classification_report_csv(class_report,pair , all_lang=True)
    print(confusion_matrix(y_test, y_pred))
    class_report = classification_report(y_test, y_pred)
    a = classification_report_csv(class_report,pair,all_lang=True, train_evaluate_pair =False)
    all_result.append(a)
    
    #all_result.append(a)
    #class_report = classification_report(y_test, y_pred, output_dict=True)
     
    print()
    
    # display false positives and false negatives
    unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
    tp = np.array(np.where(unq == 3)).tolist()[0]
    fp = np.array(np.where(unq == 1)).tolist()[0]
    tn = np.array(np.where(unq == 0)).tolist()[0]
    fn = np.array(np.where(unq == 2)).tolist()[0]

    #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
    #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
    all_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
    all_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
    all_lang_fp.insert(0, 'language_pair', pair)
    all_lang_fn.insert(0, 'language_pair', pair)
    all_lang_fp_list.append(all_lang_fp)
    all_lang_fn_list.append(all_lang_fn)
    for pair in pairs_list:
        print(f'Evaluating on {pair}')
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_test = (x_test - x_means)/x_stds
        y_pred = LR.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False, train_evaluate_pair =False)
        all_result.append(a)
        
        #class_report = pd.DataFrame(class_report).transpose()
        #class_report.to_csv('lr_alllang_report.csv', index=False)
        
        print(confusion_matrix(y_test, y_pred))
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        evaluate_lang_fp.insert(0, 'language_pair', pair)
        evaluate_lang_fn.insert(0, 'language_pair', pair)
        evaluate_lang_fp_list.append(evaluate_lang_fp)
        evaluate_lang_fn_list.append(evaluate_lang_fn)
        
    for pair in pairs_list:
        print(f'Training and evaluating on {pair}')
        
        train_set = full_train_set[pair][display_fields + ["label"] + labels]
        x_train = train_set[features].values
        x_means = np.mean(x_train, axis=0)
        x_stds = np.std(x_train, axis=0)
        x_stds[x_stds == 0] = 1
        y_train = train_set[labels].values.ravel()
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_train = (x_train - x_means)/x_stds
        
        LR = LogisticRegression(random_state=1, solver='lbfgs', multi_class='ovr', max_iter=500).fit(x_train, y_train)
        
        # display regressor weights
        df = pd.DataFrame(LR.coef_, columns=features).style.set_caption('Weights')
        df_weights = pd.DataFrame(LR.coef_, columns=features)
        df_weights.insert(0, 'language_pair', pair)
        weights_list.append(df_weights)
        display(df)

        x_test = (x_test - x_means)/x_stds
        y_pred = LR.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        #print(classification_report(y_test, y_pred))
        print(classification_report(y_test, y_pred, output_dict=True))
        #class_report = classification_report(y_test, y_pred, output_dict=True)
        #class_report = pd.DataFrame(class_report).transpose()
        #class_report.to_csv((f'{pair}-class_report.csv'), index=False)
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False,train_evaluate_pair =True)
        all_result.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        train_evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        train_evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        train_evaluate_lang_fp.insert(0, 'language_pair', pair)
        train_evaluate_lang_fn.insert(0, 'language_pair', pair)
        train_evaluate_lang_fp_list.append(train_evaluate_lang_fp)
        train_evaluate_lang_fn_list.append(train_evaluate_lang_fn)
    final_result = pd.concat(all_result, ignore_index=True)
    all_lang_fp_list = pd.concat(all_lang_fp_list, ignore_index=True)
    all_lang_fn_list = pd.concat(all_lang_fn_list, ignore_index=True)
    evaluate_lang_fp_list = pd.concat(evaluate_lang_fp_list, ignore_index=True)
    evaluate_lang_fn_list = pd.concat(evaluate_lang_fn_list, ignore_index=True)
    train_evaluate_lang_fp_list = pd.concat(train_evaluate_lang_fp_list, ignore_index=True)
    train_evaluate_lang_fn_list = pd.concat(train_evaluate_lang_fn_list, ignore_index=True)
    weights = pd.concat(weights_list, ignore_index=True)
    
    return final_result, all_lang_fp_list, all_lang_fn_list, evaluate_lang_fp_list, evaluate_lang_fn_list, train_evaluate_lang_fp_list, train_evaluate_lang_fn_list, weights

# Saving final results for alldata

In [None]:
print("Evaluating and Saving final results as csv files on alldata splits\n")
output_alldata,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = lr_evaluate(train_alldata, test_alldata, pairs)
#prefix = f'../Final_results/all_data/LR_results'
prefix = f'../Final_results/all_data/features_wo_dnnlogits_cosims/LR_results'

if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_alldata.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')
weights.to_csv(f'{prefix}/LR_weights.csv')

# print("Evaluating on realdist splits\n")
# lr_evaluate(train_realdist, test_realdist, pairs)

# print("Evaluating on balanced splits\n")
# lr_evaluate(train_balanced, test_balanced, pairs)

# Saving final results for realdist

In [None]:
print("Evaluating and Saving final results as csv files on realdist splits\n")
output_realdist,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = lr_evaluate(train_realdist, test_realdist, pairs)
#prefix = f'../Final_results/real_dist/LR_results'
prefix = f'../Final_results/real_dist/features_wo_dnnlogits_cosims/LR_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_realdist.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')
weights.to_csv(f'{prefix}/LR_weights.csv')

# Saving final results for balanced

In [None]:
print("Evaluating and Saving final results as csv files on balanced splits\n")
output_balanced,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = lr_evaluate(train_balanced, test_balanced, pairs)
#prefix = f'../Final_results/balanced/LR_results'
prefix = f'../Final_results/balanced/features_wo_dnnlogits_cosims/LR_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_balanced.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')
weights.to_csv(f'{prefix}/LR_weights.csv')

# Neural Network classifier

## Definition

In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_features):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 1),
            
        )

    def forward(self, x):
        logits_new = self.linear_relu_stack(x)
        logits  = logits_new
        
        return torch.sigmoid(logits), logits_new
    
    def fit(self, X_train, Y_train, X_val, Y_val, criterion, optimizer, n_epochs=5000):
        train_losses = []
        val_losses = []
        train_accur = []
        val_accur = []

        for epoch in range(n_epochs):
            y_pred, logits = self(X_train.float())

            train_loss = criterion(y_pred, Y_train.float())

            if epoch % (n_epochs // 50) == 0:
                train_acc,_ = self.calculate_accuracy(Y_train, y_pred)

                y_val_pred = self(X_val.float())[0]

                val_loss = criterion(y_val_pred, Y_val.float())

                val_acc, total_corr = self.calculate_accuracy(Y_val, y_val_pred)

                print(f'''epoch {epoch}
                    Train set - loss: {self.round_tensor(train_loss)}, accuracy: {self.round_tensor(train_acc)} 
                    Val set - loss: {self.round_tensor(val_loss)}, accuracy: {self.round_tensor(val_acc)}''')
                
                train_losses.append(train_loss.detach().cpu().numpy())
                val_losses.append(val_loss.detach().cpu().numpy())

                val_accur.append(val_acc.detach().cpu().numpy())
                train_accur.append(train_acc.detach().cpu().numpy())

            optimizer.zero_grad()

            train_loss.backward()

            optimizer.step()
            
        return train_losses,val_losses,train_accur,val_accur
    
    def calculate_accuracy(self, y_true, y_pred):
        predicted = y_pred.ge(.5) 
        return ((y_true == predicted).sum().float() / len(y_true), (y_true == predicted).sum())
    
    def round_tensor(self, t, decimal_places=3):
        return round(t.item(), decimal_places)
    
    def plot_losses(self, train_losses, val_losses, train_accur, val_accur):
        epochs = range(1, len(train_accur) + 1)

        plt.plot(epochs, train_accur, 'bo', label='Training acc')
        plt.plot(epochs, val_accur, 'b', label='Vaidation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()

        plt.plot(epochs, train_losses, 'bo', label='Training loss')
        plt.plot(epochs, val_losses, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()

In [None]:
def nn_evaluate(full_train_set, full_test_set, pairs_list):   
    all_result_nn = []
    all_result = []
    all_lang_fp_list= []
    all_lang_fn_list=[]
    evaluate_lang_fp_list = []
    evaluate_lang_fn_list = []
    train_evaluate_lang_fp_list = []
    train_evaluate_lang_fn_list = []
    
    pair =None #pair None for all languages evaluation
    display_fields = ['loan_word',
              'original_word',
              'loan_word_epitran',
              'original_word_epitran',
              'loan_english',
              'original_english'] + features
        
    train_set = pd.concat([full_train_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_train = train_set[features].values
    x_means = np.mean(x_train, axis=0)
    x_stds = np.std(x_train, axis=0)
    x_stds[x_stds == 0] = 1
    y_train = train_set[labels].values.ravel()
    
    test_set = pd.concat([full_test_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_test = test_set[features].values
    y_test = test_set[labels].values.ravel()
    
    # standardize input features
    x_train = (x_train - x_means)/x_stds
    
    torch.manual_seed(7)
    random.seed(7)
    np.random.seed(7)
    
    model = NeuralNetwork(x_train.shape[1]).to(device)
    print(model)
    
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)
    #optimizer = optim.SGD(model.parameters(),lr=0.00001, momentum=0.0,  weight_decay=0.0, nesterov=False)
    
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

    x_train = torch.tensor(x_train).to(device)
    x_val = torch.tensor(x_val).to(device)
    y_train = torch.tensor(y_train).reshape(-1,1).to(device)
    y_val = torch.tensor(y_val).reshape(-1,1).to(device)
        
    print("Training on all langs\n")
    train_losses, val_losses, train_accur, val_accur = \
        model.fit(x_train, y_train, x_val, y_val, criterion, optimizer, n_epochs=5000)
    model.plot_losses(train_losses,val_losses,train_accur,val_accur)
    
    model.eval()
    
    x_test = (x_test - x_means)/x_stds
    x_test = torch.tensor(x_test).to(device)

    with torch.no_grad():
        y_pred = model(torch.tensor(x_test).float())[0] > .5
        y_pred = y_pred.detach().cpu().numpy()

        print("f1-score : ", f1_score(y_test, y_pred))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred)) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=True, train_evaluate_pair =False)
        all_result_nn.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        all_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        all_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        
        all_lang_fp.insert(0, 'language_pair', pair)
        all_lang_fn.insert(0, 'language_pair', pair)
        all_lang_fp_list.append(all_lang_fp)
        all_lang_fn_list.append(all_lang_fn)
        for pair in pairs_list:
            print(f'Evaluating on {pair}')

            test_set = full_test_set[pair][display_fields + ["label"] + labels]
            x_test = test_set[features].values
            y_test = test_set[labels].values.ravel()

            x_test = (x_test - x_means)/x_stds
            x_test = torch.tensor(x_test).to(device)
            y_pred = model(torch.tensor(x_test).float())[0] > .5
            y_pred = y_pred.detach().cpu().numpy()
             
            print("f1-score : ", f1_score(y_test, y_pred ))
            print("precision : ",precision_score(y_test, y_pred))
            print("recall : ",recall_score(y_test, y_pred )) 
            print("accuracy : ",accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            class_report = classification_report(y_test, y_pred)
            a = classification_report_csv(class_report,pair,all_lang=False, train_evaluate_pair =False)
            all_result_nn.append(a)
            print()
            
            # display false positives and false negatives
            unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
            tp = np.array(np.where(unq == 3)).tolist()[0]
            fp = np.array(np.where(unq == 1)).tolist()[0]
            tn = np.array(np.where(unq == 0)).tolist()[0]
            fn = np.array(np.where(unq == 2)).tolist()[0]

            #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
            #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
            evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
            evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
            evaluate_lang_fp.insert(0, 'language_pair', pair)
            evaluate_lang_fn.insert(0, 'language_pair', pair)
            evaluate_lang_fp_list.append(evaluate_lang_fp)
            evaluate_lang_fn_list.append(evaluate_lang_fn)
    for pair in pairs_list:
        print(f'Training and evaluating on {pair}')
        
        train_set = full_train_set[pair][display_fields + ["label"] + labels]
        x_train = train_set[features].values
        x_means = np.mean(x_train, axis=0)
        x_stds = np.std(x_train, axis=0)
        x_stds[x_stds == 0] = 1
        y_train = train_set[labels].values.ravel()
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        # standardize input features
        x_train = (x_train - x_means)/x_stds
        
        model = NeuralNetwork(x_train.shape[1]).to(device)
    
        criterion = nn.BCELoss().to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.00001)

        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

        x_train = torch.tensor(x_train).to(device)
        x_val = torch.tensor(x_val).to(device)
        y_train = torch.tensor(y_train).reshape(-1,1).to(device)
        y_val = torch.tensor(y_val).reshape(-1,1).to(device)

        train_losses, val_losses, train_accur, val_accur = \
            model.fit(x_train, y_train, x_val, y_val, criterion, optimizer, n_epochs=5000)
        model.plot_losses(train_losses,val_losses,train_accur,val_accur)

        model.eval()

        x_test = (x_test - x_means)/x_stds
        x_test = torch.tensor(x_test).to(device)
        
        with torch.no_grad():
            y_pred = model(torch.tensor(x_test).float())[0] > .5
            y_pred = y_pred.detach().cpu().numpy()

            print("f1-score : ", f1_score(y_test, y_pred ))
            print("precision : ",precision_score(y_test, y_pred))
            print("recall : ",recall_score(y_test, y_pred )) 
            print("accuracy : ",accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            class_report = classification_report(y_test, y_pred)
            a = classification_report_csv(class_report,pair,all_lang=False,train_evaluate_pair =True)
            all_result_nn.append(a)
            print()
            
            # display false positives and false negatives
            unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
            tp = np.array(np.where(unq == 3)).tolist()[0]
            fp = np.array(np.where(unq == 1)).tolist()[0]
            tn = np.array(np.where(unq == 0)).tolist()[0]
            fn = np.array(np.where(unq == 2)).tolist()[0]

            #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
            #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
            train_evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
            train_evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
            train_evaluate_lang_fp.insert(0, 'language_pair', pair)
            train_evaluate_lang_fn.insert(0, 'language_pair', pair)
            train_evaluate_lang_fp_list.append(train_evaluate_lang_fp)
            train_evaluate_lang_fn_list.append(train_evaluate_lang_fn)
            
    final_result = pd.concat(all_result_nn, ignore_index=True)
    all_lang_fp_list = pd.concat(all_lang_fp_list, ignore_index=True)
    all_lang_fn_list = pd.concat(all_lang_fn_list, ignore_index=True)
    evaluate_lang_fp_list = pd.concat(evaluate_lang_fp_list, ignore_index=True)
    evaluate_lang_fn_list = pd.concat(evaluate_lang_fn_list, ignore_index=True)
    train_evaluate_lang_fp_list = pd.concat(train_evaluate_lang_fp_list, ignore_index=True)
    train_evaluate_lang_fn_list = pd.concat(train_evaluate_lang_fn_list, ignore_index=True)
    
    return final_result, all_lang_fp_list, all_lang_fn_list, evaluate_lang_fp_list, evaluate_lang_fn_list, train_evaluate_lang_fp_list, train_evaluate_lang_fn_list

# NN alldata

In [None]:
print("Evaluating on alldata splits\n")
#nn_evaluate(train_alldata, test_alldata, pairs)


print("Evaluating and Saving final results as csv files on alldata splits\n")
output_alldata,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = nn_evaluate(train_alldata, test_alldata, pairs)
#prefix = f'../Final_results/all_data/NN_results'

prefix = f'../Final_results/all_data/features_wo_dnnlogits_cosims/NN_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_alldata.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')

# print("Evaluating on realdist splits\n")
# nn_evaluate(train_realdist, test_realdist, pairs)

# print("Evaluating on balanced splits\n")
# nn_evaluate(train_balanced, test_balanced, pairs)

# NN real dist

In [None]:
print("Evaluating on realdist splits\n")
 
print("Evaluating and Saving final results as csv files on realdist splits\n")
output_realdist,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = nn_evaluate(train_realdist, test_realdist, pairs)
#prefix = f'../Final_results/real_dist/NN_results'
prefix = f'../Final_results/real_dist/features_wo_dnnlogits_cosims/NN_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_realdist.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')

# NN balanced

In [None]:
print("Evaluating on balanced splits\n")
 
print("Evaluating and Saving final results as csv files on balanced splits\n")
output_balanced,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = nn_evaluate(train_balanced, test_balanced, pairs)
#prefix = f'../Final_results/balanced/NN_results'
prefix = f'../Final_results/balanced/features_wo_dnnlogits_cosims/NN_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_balanced.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')

# final results Pruned list

In [None]:
train_realdist_pruned = {}

# remove any pairs in alldata testing set from realdist training set
for lang in train_realdist:
    train_realdist_pruned[lang] = train_realdist[lang][~train_realdist[lang].set_index(['loan_word','original_word']).index.\
                          isin(test_alldata[lang].set_index(['loan_word','original_word']).index)]

print("Evaluating on realdist (train) and alldata (test)\n")
output_pruned,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = nn_evaluate(train_realdist_pruned, test_alldata, pairs)
#prefix = f'../Final_results/pruned_nn'

prefix = f'../Final_results/pruned_nn/features_wo_dnnlogits_cosims/NN_results'

if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_pruned.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')





In [None]:
def nn_evaluate_holdout(full_train_set, full_holdout_test_set, holdout_pairs_list):  
    all_result_nn_holdout = []
    hold_out_fp=[]
    hold_out_fn =[]
    pair =None #pair None for all languages evaluation
    display_fields = ['loan_word',
              'original_word',
              'loan_word_epitran',
              'original_word_epitran',
              'loan_english',
              'original_english'] + features
        
    train_set = pd.concat([full_train_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_train = train_set[features].values
    x_means = np.mean(x_train, axis=0)
    x_stds = np.std(x_train, axis=0)
    x_stds[x_stds == 0] = 1
    y_train = train_set[labels].values.ravel()
    
    # standardize input features
    x_train = (x_train - x_means)/x_stds
    
    torch.manual_seed(7)
    random.seed(7)
    np.random.seed(7)
    
    model = NeuralNetwork(x_train.shape[1]).to(device)
    print(model)
    
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)
    #optimizer = optim.SGD(model.parameters(),lr=0.00001, momentum=0.0,  weight_decay=0.0, nesterov=False)
    
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

    x_train = torch.tensor(x_train).to(device)
    x_val = torch.tensor(x_val).to(device)
    y_train = torch.tensor(y_train).reshape(-1,1).to(device)
    y_val = torch.tensor(y_val).reshape(-1,1).to(device)
        
    print("Training on all langs\n")
    train_losses, val_losses, train_accur, val_accur = \
        model.fit(x_train, y_train, x_val, y_val, criterion, optimizer, n_epochs=5000)
    model.plot_losses(train_losses,val_losses,train_accur,val_accur)
    
    model.eval()

    with torch.no_grad():
        for pair in holdout_pairs_list:
            print(f'Evaluating on {pair}')

            test_set = full_holdout_test_set[pair][display_fields + ["label"] + labels]
            x_test = test_set[features].values
            y_test = test_set[labels].values.ravel()

            x_test = (x_test - x_means)/x_stds
            x_test = torch.tensor(x_test).to(device)
            y_pred = model(torch.tensor(x_test).float())[0] > .5
            y_pred = y_pred.detach().cpu().numpy()
             
            print("f1-score : ", f1_score(y_test, y_pred ))
            print("precision : ",precision_score(y_test, y_pred))
            print("recall : ",recall_score(y_test, y_pred )) 
            print("accuracy : ",accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            class_report = classification_report(y_test, y_pred)
            a = classification_report_csv(class_report,pair,all_lang=False, train_evaluate_pair =False)
            all_result_nn_holdout.append(a)
            print()
            
            # display false positives and false negatives
            unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
            tp = np.array(np.where(unq == 3)).tolist()[0]
            fp = np.array(np.where(unq == 1)).tolist()[0]
            tn = np.array(np.where(unq == 0)).tolist()[0]
            fn = np.array(np.where(unq == 2)).tolist()[0]

            display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
            display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
            holdout_evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
            holdout_evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
            #holdout_evaluate_lang_fp['language_pair'] = pair
            #holdout_evaluate_lang_fn['language_pair'] = pair
            holdout_evaluate_lang_fp.insert(0, 'language_pair', pair)
            holdout_evaluate_lang_fn.insert(0, 'language_pair', pair)
            hold_out_fp.append(holdout_evaluate_lang_fp)
            hold_out_fn.append(holdout_evaluate_lang_fn)
    holdout_evaluate_lang_fp = pd.concat(hold_out_fp, ignore_index=True) 
    holdout_evaluate_lang_fn = pd.concat(hold_out_fn, ignore_index=True) 
    final_result = pd.concat(all_result_nn_holdout, ignore_index=True)
    return final_result, holdout_evaluate_lang_fp,holdout_evaluate_lang_fn
            

In [None]:
with open('../language-pairs-holdout.json', 'r') as f:
    pairs_holdout = json.loads(f.read())
    print(pairs_holdout)
    
test_holdout_alldata = {}

test_holdout_realdist = {}

test_holdout_balanced = {}

for pair in pairs_holdout:
    print(pair)
    L1 = pairs_holdout[pair]['target']['name']
    L2 = pairs_holdout[pair]['source']['name']

    # load datasets
    prefix = f'../Datasets/production_train_test/{L1}-{L2}'

    test_holdout_alldata[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    test_holdout_realdist[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    test_holdout_balanced[f'{L1}-{L2}'] = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')
    
for pair in pairs_holdout:
    # all data
    loan_unicode_onehots = pd.DataFrame(test_holdout_alldata[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_holdout_alldata[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_holdout_alldata[pair] = pd.concat([test_holdout_alldata[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    
    # real dist            
    loan_unicode_onehots = pd.DataFrame(test_holdout_realdist[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_holdout_realdist[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_holdout_realdist[pair] = pd.concat([test_holdout_realdist[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    
    # balanced
    loan_unicode_onehots = pd.DataFrame(test_holdout_balanced[pair]['loan_unicode'].map(unicode_map).tolist(), \
                     columns = [f'loan_unicode_{l[0]}' for l in unicode_list])
    orig_unicode_onehots = pd.DataFrame(test_holdout_balanced[pair]['original_unicode'].map(unicode_map).tolist(), \
                     columns = [f'original_unicode_{l[0]}' for l in unicode_list])
    test_holdout_balanced[pair] = pd.concat([test_holdout_balanced[pair], loan_unicode_onehots, orig_unicode_onehots], axis=1)
    


In [None]:
print("Evaluating on alldata splits\n")
output_holdout, holdout_evaluate_lang_fp,holdout_evaluate_lang_fn = nn_evaluate_holdout(train_alldata, test_holdout_alldata, pairs_holdout)

#prefix = f'../Final_results/holdout/all_data'
prefix = f'../Final_results/holdout/all_data/features_wo_dnnlogits_cosims'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_holdout.to_csv(f'{prefix}/holdout_final_results.csv')
holdout_evaluate_lang_fp.to_csv(f'{prefix}/holdout_falsepos.csv')
holdout_evaluate_lang_fn.to_csv(f'{prefix}/holdout_falseneg.csv')
 
# print("Evaluating on realdist splits\n")
# nn_evaluate_holdout(train_realdist, test_holdout_realdist, pairs_holdout)

# print("Evaluating on balanced splits\n")
# nn_evaluate_holdout(train_balanced, test_holdout_balanced, pairs_holdout)

In [None]:
print("Evaluating on real dist splits\n")
output_holdout, holdout_evaluate_lang_fp,holdout_evaluate_lang_fn = nn_evaluate_holdout(train_realdist, test_holdout_realdist, pairs_holdout)

#prefix = f'../Final_results/holdout/real_dist'
prefix = f'../Final_results/holdout/real_dist/features_wo_dnnlogits_cosims'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_holdout.to_csv(f'{prefix}/holdout_final_results.csv')
holdout_evaluate_lang_fp.to_csv(f'{prefix}/holdout_falsepos.csv')
holdout_evaluate_lang_fn.to_csv(f'{prefix}/holdout_falseneg.csv')

In [None]:
print("Evaluating on balanced splits\n")
output_holdout, holdout_evaluate_lang_fp,holdout_evaluate_lang_fn = nn_evaluate_holdout(train_balanced, test_holdout_balanced, pairs_holdout)

#prefix = f'../Final_results/holdout/balanced'
prefix = f'../Final_results/holdout/balanced/features_wo_dnnlogits_cosims'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_holdout.to_csv(f'{prefix}/holdout_final_results.csv')
holdout_evaluate_lang_fp.to_csv(f'{prefix}/holdout_falsepos.csv')
holdout_evaluate_lang_fn.to_csv(f'{prefix}/holdout_falseneg.csv')

# SVM Classifier

In [23]:
def svm_evaluate(full_train_set, full_test_set, pairs_list):
    pair ='All_languages'
    
    all_result = []
    all_lang_fp_list= []
    all_lang_fn_list=[]
    evaluate_lang_fp_list = []
    evaluate_lang_fn_list = []
    train_evaluate_lang_fp_list = []
    train_evaluate_lang_fn_list = []
    weights_list = []
    display_fields = ['loan_word',
              'original_word',
              'loan_word_epitran',
              'original_word_epitran',
              'loan_english',
              'original_english'] + features
        
    train_set = pd.concat([full_train_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_train = train_set[features].values
    x_means = np.mean(x_train, axis=0)
    x_stds = np.std(x_train, axis=0)
    x_stds[x_stds == 0] = 1
    y_train = train_set[labels].values.ravel()
    
    test_set = pd.concat([full_test_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_test = test_set[features].values
    y_test = test_set[labels].values.ravel()
    
    # standardize input features
    x_train = (x_train - x_means)/x_stds

    print("Training on all langs\n")
     
    SVM = SVC(kernel='linear')
    SVM.fit(x_train, y_train)

    # display regressor weights
    df = pd.DataFrame(SVM.coef_, columns=features).style.set_caption('Weights')
    df_weights = pd.DataFrame(SVM.coef_, columns=features) 
     
    df_weights.insert(0, 'language_pair', pair)
    
    weights_list.append(df_weights)
    display(df)
    print()
    
    print("Evaluating on all langs")
    x_test = (x_test - x_means)/x_stds
    y_pred = SVM.predict(x_test)

    print("f1-score : ", f1_score(y_test, y_pred ))
    print("precision : ",precision_score(y_test, y_pred))
    print("recall : ",recall_score(y_test, y_pred )) 
    print("accuracy : ",accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    class_report = classification_report(y_test, y_pred)
    a = classification_report_csv(class_report,pair,all_lang=True, train_evaluate_pair =False)
    all_result.append(a)
    print()
    
    # display false positives and false negatives
    unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
    tp = np.array(np.where(unq == 3)).tolist()[0]
    fp = np.array(np.where(unq == 1)).tolist()[0]
    tn = np.array(np.where(unq == 0)).tolist()[0]
    fn = np.array(np.where(unq == 2)).tolist()[0]

    #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
    #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
    all_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
    all_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
    
    all_lang_fp.insert(0, 'language_pair', pair)
    all_lang_fn.insert(0, 'language_pair', pair)
    all_lang_fp_list.append(all_lang_fp)
    all_lang_fn_list.append(all_lang_fn)
    
    for pair in pairs_list:
        print(f'Evaluating on {pair}')
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_test = (x_test - x_means)/x_stds
        y_pred = SVM.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False, train_evaluate_pair =False)
        all_result.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]] .style.set_caption('False negatives'))
        evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        
        evaluate_lang_fp.insert(0, 'language_pair', pair)
        evaluate_lang_fn.insert(0, 'language_pair', pair)
        evaluate_lang_fp_list.append(evaluate_lang_fp)
        evaluate_lang_fn_list.append(evaluate_lang_fn)
    for pair in pairs_list:
        print(f'Training and evaluating on {pair}')
        
        train_set = full_train_set[pair][display_fields + ["label"] + labels]
        x_train = train_set[features].values
        x_means = np.mean(x_train, axis=0)
        x_stds = np.std(x_train, axis=0)
        x_stds[x_stds == 0] = 1
        y_train = train_set[labels].values.ravel()
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_train = (x_train - x_means)/x_stds
        
        SVM = SVC(kernel='linear')
        SVM.fit(x_train, y_train)
        
        # display regressor weights
        df = pd.DataFrame(SVM.coef_, columns=features).style.set_caption('Weights')
        df_weights = pd.DataFrame(SVM.coef_, columns=features) 
     
        df_weights.insert(0, 'language_pair', pair)
    
        weights_list.append(df_weights)
        display(df)

        x_test = (x_test - x_means)/x_stds
        y_pred = SVM.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False,train_evaluate_pair =True)
        all_result.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        #display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        #display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        train_evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        train_evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        
        train_evaluate_lang_fp.insert(0, 'language_pair', pair)
        train_evaluate_lang_fn.insert(0, 'language_pair', pair)
        train_evaluate_lang_fp_list.append(train_evaluate_lang_fp)
        train_evaluate_lang_fn_list.append(train_evaluate_lang_fn)
    
    final_result = pd.concat(all_result, ignore_index=True)
    all_lang_fp_list = pd.concat(all_lang_fp_list, ignore_index=True)
    all_lang_fn_list = pd.concat(all_lang_fn_list, ignore_index=True)
    evaluate_lang_fp_list = pd.concat(evaluate_lang_fp_list, ignore_index=True)
    evaluate_lang_fn_list = pd.concat(evaluate_lang_fn_list, ignore_index=True)
    train_evaluate_lang_fp_list = pd.concat(train_evaluate_lang_fp_list, ignore_index=True)
    train_evaluate_lang_fn_list = pd.concat(train_evaluate_lang_fn_list, ignore_index=True)
    weights = pd.concat(weights_list, ignore_index=True)
    
    return final_result, all_lang_fp_list, all_lang_fn_list, evaluate_lang_fp_list, evaluate_lang_fn_list, train_evaluate_lang_fp_list, train_evaluate_lang_fn_list, weights
        
        

In [25]:
print("Evaluating on alldata splits\n")
final_result,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = svm_evaluate(train_alldata, test_alldata, pairs)

prefix = f'../Final_results/all_data/SVM_results'
#prefix = f'../Final_results/all_data/features_wo_dnnlogits_cosims/SVM_results'


if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
final_result.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')   
weights.to_csv(f'{prefix}/SVM_weights.csv')
# print("Evaluating on realdist splits\n")
# svm_evaluate(train_realdist, test_realdist, pairs)

# print("Evaluating on balanced splits\n")
# svm_evaluate(train_balanced, test_balanced, pairs)

Evaluating on alldata splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.110236,-0.089952,1.016421,0.697109,-0.184853,-1.368514,-0.547781,3.509347,0.09045,0.199903,0.0,0.0,0.18891,-0.057056,-0.080115,0.0,0.0,-0.012989,0.0,0.0,0.0,0.0,0.16296,0.0,0.0,-0.057056,0.0,0.0,0.0,-0.09557,0.0,0.0,0.0,0.0



Evaluating on all langs
f1-score :  0.7466965285554311
precision :  0.8018278018278018
recall :  0.6986588432523051
accuracy :  0.9425509219281759
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     17301
           1       0.80      0.70      0.75      2386

    accuracy                           0.94     19687
   macro avg       0.88      0.84      0.86     19687
weighted avg       0.94      0.94      0.94     19687

[[16889   412]
 [  719  1667]]
final report [{'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '0', 'precision': 0.96, 'recall': 0.98, 'f1_score': 0.97, 'support': 17301.0}, {'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '1', 'precision': 0.8, 'recall': 0.7, 'f1_score': 0.75, 'support': 2386.0, 'acc': 0.94}]

Evaluating on Hindi-Persian
f1-score :  0.7256637168141592
precision :  0.803921568627451
recall :  0

f1-score :  0.6900269541778977
precision :  0.7710843373493976
recall :  0.624390243902439
accuracy :  0.9313432835820895
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1470
           1       0.77      0.62      0.69       205

    accuracy                           0.93      1675
   macro avg       0.86      0.80      0.83      1675
weighted avg       0.93      0.93      0.93      1675

[[1432   38]
 [  77  128]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Polish-French', 'class': '0', 'precision': 0.95, 'recall': 0.97, 'f1_score': 0.96, 'support': 1470.0}, {'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Polish-French', 'class': '1', 'precision': 0.77, 'recall': 0.62, 'f1_score': 0.69, 'support': 205.0, 'acc': 0.93}]

Evaluating on Indonesian-Dutch
f1-score :  0.8301886792452831
precision :  0.8333333333333334
recall :  0.8270676691729323
accura

Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.40921,-0.060178,-0.234523,0.149537,-0.146449,-0.071039,0.363607,4.249317,-0.001123,0.02215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7327586206896551
precision :  0.7870370370370371
recall :  0.6854838709677419
accuracy :  0.9496344435418359
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1107
           1       0.79      0.69      0.73       124

    accuracy                           0.95      1231
   macro avg       0.88      0.83      0.85      1231
weighted avg       0.95      0.95      0.95      1231

[[1084   23]
 [  39   85]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '0', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.97, 'support': 1107.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '1', 'precision': 0.79, 'recall': 0.69, 'f1_score': 0.73, 'support': 124.0, 'acc': 0.95}]

Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,0.045965,-0.016677,0.398586,1.086153,-0.180177,-1.078298,-1.18118,2.55405,0.220915,0.499634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8841893252769387
precision :  0.9032921810699589
recall :  0.8658777120315582
accuracy :  0.9663447468539654
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2910
           1       0.90      0.87      0.88       507

    accuracy                           0.97      3417
   macro avg       0.94      0.92      0.93      3417
weighted avg       0.97      0.97      0.97      3417

[[2863   47]
 [  68  439]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '0', 'precision': 0.98, 'recall': 0.98, 'f1_score': 0.98, 'support': 2910.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '1', 'precision': 0.9, 'recall': 0.87, 'f1_score': 0.88, 'support': 507.0, 'acc': 0.97}]

Training and evaluating on Finnish-Swedish


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,0.085518,-0.068862,0.020771,1.279636,-0.042749,-1.065472,-0.725579,3.58041,0.06478,0.082881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.644859813084112
precision :  0.7666666666666667
recall :  0.5564516129032258
accuracy :  0.9402045633359559
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1147
           1       0.77      0.56      0.64       124

    accuracy                           0.94      1271
   macro avg       0.86      0.77      0.81      1271
weighted avg       0.94      0.94      0.94      1271

[[1126   21]
 [  55   69]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '0', 'precision': 0.95, 'recall': 0.98, 'f1_score': 0.97, 'support': 1147.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '1', 'precision': 0.77, 'recall': 0.56, 'f1_score': 0.64, 'support': 124.0, 'acc': 0.94}]

Training and evaluating on Kazakh-Russian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.098707,0.727799,0.239185,-0.35763,-0.851599,0.539664,-1.895327,4.686711,-0.067451,0.113032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8690807799442897
precision :  0.8715083798882681
recall :  0.8666666666666667
accuracy :  0.969281045751634
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1350
           1       0.87      0.87      0.87       180

    accuracy                           0.97      1530
   macro avg       0.93      0.92      0.93      1530
weighted avg       0.97      0.97      0.97      1530

[[1327   23]
 [  24  156]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '0', 'precision': 0.98, 'recall': 0.98, 'f1_score': 0.98, 'support': 1350.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '1', 'precision': 0.87, 'recall': 0.87, 'f1_score': 0.87, 'support': 180.0, 'acc': 0.97}]

Training and evaluating on Azerbaijani-Arabic


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.073263,-0.689606,-0.308118,-0.103612,0.090102,0.574886,1.020989,4.17669,-0.292705,-0.040673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.6919431279620853
precision :  0.73
recall :  0.6576576576576577
accuracy :  0.9455611390284757
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1083
           1       0.73      0.66      0.69       111

    accuracy                           0.95      1194
   macro avg       0.85      0.82      0.83      1194
weighted avg       0.94      0.95      0.94      1194

[[1056   27]
 [  38   73]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '0', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.97, 'support': 1083.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '1', 'precision': 0.73, 'recall': 0.66, 'f1_score': 0.69, 'support': 111.0, 'acc': 0.95}]

Training and evaluating on Chinese-English


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.045018,-0.104104,-0.561003,0.403828,0.002453,0.228546,-0.019203,1.959674,0.035292,0.068714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.5316455696202531
precision :  0.6666666666666666
recall :  0.4421052631578947
accuracy :  0.9187705817782656
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       816
           1       0.67      0.44      0.53        95

    accuracy                           0.92       911
   macro avg       0.80      0.71      0.74       911
weighted avg       0.91      0.92      0.91       911

[[795  21]
 [ 53  42]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '0', 'precision': 0.94, 'recall': 0.97, 'f1_score': 0.96, 'support': 816.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '1', 'precision': 0.67, 'recall': 0.44, 'f1_score': 0.53, 'support': 95.0, 'acc': 0.92}]

Training and evaluating on German-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.080879,-0.063524,-0.271311,1.16668,-0.15185,-0.504299,-1.093886,2.573201,0.081722,0.05779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7717842323651452
precision :  0.8857142857142857
recall :  0.6838235294117647
accuracy :  0.9588014981273408
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1199
           1       0.89      0.68      0.77       136

    accuracy                           0.96      1335
   macro avg       0.93      0.84      0.87      1335
weighted avg       0.96      0.96      0.96      1335

[[1187   12]
 [  43   93]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '0', 'precision': 0.97, 'recall': 0.99, 'f1_score': 0.98, 'support': 1199.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '1', 'precision': 0.89, 'recall': 0.68, 'f1_score': 0.77, 'support': 136.0, 'acc': 0.96}]

Training and evaluating on English-German


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,0.114629,-0.225492,0.546524,0.16062,0.110168,-0.214888,-1.651542,2.587664,0.266025,0.278059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8389380530973451
precision :  0.8745387453874539
recall :  0.8061224489795918
accuracy :  0.9587675577707295
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1913
           1       0.87      0.81      0.84       294

    accuracy                           0.96      2207
   macro avg       0.92      0.89      0.91      2207
weighted avg       0.96      0.96      0.96      2207

[[1879   34]
 [  57  237]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '0', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.98, 'support': 1913.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '1', 'precision': 0.87, 'recall': 0.81, 'f1_score': 0.84, 'support': 294.0, 'acc': 0.96}]

Training and evaluating on Romanian-Hungarian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.15638,-0.247806,0.256275,-0.078598,0.176745,-0.014283,-0.216274,3.442421,0.172574,0.161937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.6039215686274509
precision :  0.7
recall :  0.5310344827586206
accuracy :  0.9251851851851852
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1205
           1       0.70      0.53      0.60       145

    accuracy                           0.93      1350
   macro avg       0.82      0.75      0.78      1350
weighted avg       0.92      0.93      0.92      1350

[[1172   33]
 [  68   77]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '0', 'precision': 0.95, 'recall': 0.97, 'f1_score': 0.96, 'support': 1205.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '1', 'precision': 0.7, 'recall': 0.53, 'f1_score': 0.6, 'support': 145.0, 'acc': 0.93}]

Training and evaluating on Polish-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.16735,-0.106319,0.604676,0.420079,-0.423343,-0.428969,-0.482172,3.047766,-0.004327,0.031726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7333333333333334
precision :  0.772972972972973
recall :  0.697560975609756
accuracy :  0.937910447761194
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1470
           1       0.77      0.70      0.73       205

    accuracy                           0.94      1675
   macro avg       0.87      0.83      0.85      1675
weighted avg       0.94      0.94      0.94      1675

[[1428   42]
 [  62  143]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '0', 'precision': 0.96, 'recall': 0.97, 'f1_score': 0.96, 'support': 1470.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '1', 'precision': 0.77, 'recall': 0.7, 'f1_score': 0.73, 'support': 205.0, 'acc': 0.94}]

Training and evaluating on Indonesian-Dutch


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.028903,0.156673,0.029122,0.698036,-0.703871,0.352172,-1.206297,4.17404,0.067455,0.218024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8352490421455939
precision :  0.8515625
recall :  0.8195488721804511
accuracy :  0.9547844374342797
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1636
           1       0.85      0.82      0.84       266

    accuracy                           0.95      1902
   macro avg       0.91      0.90      0.90      1902
weighted avg       0.95      0.95      0.95      1902

[[1598   38]
 [  48  218]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '0', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.97, 'support': 1636.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '1', 'precision': 0.85, 'recall': 0.82, 'f1_score': 0.84, 'support': 266.0, 'acc': 0.95}]

Training and evaluating on Romanian-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.129567,-0.1853,-0.360038,0.253768,0.116074,0.126768,-0.919823,2.848118,0.081097,0.031072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7584415584415584
precision :  0.7849462365591398
recall :  0.7336683417085427
accuracy :  0.9441105769230769
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1465
           1       0.78      0.73      0.76       199

    accuracy                           0.94      1664
   macro avg       0.87      0.85      0.86      1664
weighted avg       0.94      0.94      0.94      1664

[[1425   40]
 [  53  146]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '0', 'precision': 0.96, 'recall': 0.97, 'f1_score': 0.97, 'support': 1465.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '1', 'precision': 0.78, 'recall': 0.73, 'f1_score': 0.76, 'support': 199.0, 'acc': 0.94}]

Exists


In [26]:
print("Evaluating on realdist splits\n")
final_result,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = svm_evaluate(train_realdist, test_realdist, pairs)

prefix = f'../Final_results/real_dist/SVM_results'
#prefix = f'../Final_results/real_dist/features_wo_dnnlogits_cosims/SVM_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
final_result.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')   
weights.to_csv(f'{prefix}/SVM_weights.csv') 

Evaluating on realdist splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.207335,-0.14127,0.220731,1.28953,-0.26654,-1.095382,-0.777955,3.309823,0.008973,0.025218,0.0,0.0,0.161097,-0.087167,-0.0351,0.0,0.0,-0.014551,0.0,0.0,0.0,0.0,0.199193,0.0,0.0,-0.087167,0.0,0.0,0.0,-0.088366,0.0,0.0,0.0,0.0



Evaluating on all langs
f1-score :  0.8398366606170599
precision :  0.9154302670623146
recall :  0.7757753562447611
accuracy :  0.9558446431921946
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     13603
           1       0.92      0.78      0.84      2386

    accuracy                           0.96     15989
   macro avg       0.94      0.88      0.91     15989
weighted avg       0.95      0.96      0.95     15989

[[13432   171]
 [  535  1851]]
final report [{'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '0', 'precision': 0.96, 'recall': 0.99, 'f1_score': 0.97, 'support': 13603.0}, {'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '1', 'precision': 0.92, 'recall': 0.78, 'f1_score': 0.84, 'support': 2386.0, 'acc': 0.96}]

Evaluating on Hindi-Persian
f1-score :  0.8608695652173912
precision :  0.9339622641509434
recall :

f1-score :  0.8128342245989306
precision :  0.8994082840236687
recall :  0.7414634146341463
accuracy :  0.9658869395711501
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1847
           1       0.90      0.74      0.81       205

    accuracy                           0.97      2052
   macro avg       0.94      0.87      0.90      2052
weighted avg       0.96      0.97      0.96      2052

[[1830   17]
 [  53  152]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Polish-French', 'class': '0', 'precision': 0.97, 'recall': 0.99, 'f1_score': 0.98, 'support': 1847.0}, {'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Polish-French', 'class': '1', 'precision': 0.9, 'recall': 0.74, 'f1_score': 0.81, 'support': 205.0, 'acc': 0.97}]

Evaluating on Indonesian-Dutch
f1-score :  0.9227722772277228
precision :  0.9748953974895398
recall :  0.8759398496240601
accura

Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.623771,-0.040326,-0.158854,-0.411522,-0.187851,0.393302,0.312532,3.631307,0.099157,-0.060298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9113924050632912
precision :  0.9557522123893806
recall :  0.8709677419354839
accuracy :  0.9492753623188406
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       290
           1       0.96      0.87      0.91       124

    accuracy                           0.95       414
   macro avg       0.95      0.93      0.94       414
weighted avg       0.95      0.95      0.95       414

[[285   5]
 [ 16 108]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '0', 'precision': 0.95, 'recall': 0.98, 'f1_score': 0.96, 'support': 290.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '1', 'precision': 0.96, 'recall': 0.87, 'f1_score': 0.91, 'support': 124.0, 'acc': 0.95}]

Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.150458,-0.009536,-0.172125,1.95833,-0.384097,-1.238169,-1.545652,2.203011,-0.029768,0.038855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9341438703140832
precision :  0.9604166666666667
recall :  0.9092702169625246
accuracy :  0.9615384615384616
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1183
           1       0.96      0.91      0.93       507

    accuracy                           0.96      1690
   macro avg       0.96      0.95      0.95      1690
weighted avg       0.96      0.96      0.96      1690

[[1164   19]
 [  46  461]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '0', 'precision': 0.96, 'recall': 0.98, 'f1_score': 0.97, 'support': 1183.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '1', 'precision': 0.96, 'recall': 0.91, 'f1_score': 0.93, 'support': 507.0, 'acc': 0.96}]

Training and evaluating on Finnish-Swedish


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,0.053135,-0.11587,0.319818,1.104627,-0.071999,-1.192753,-0.910896,3.515169,-0.001407,-0.057454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.6698564593301435
precision :  0.8235294117647058
recall :  0.5645161290322581
accuracy :  0.9443996776792909
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1117
           1       0.82      0.56      0.67       124

    accuracy                           0.94      1241
   macro avg       0.89      0.78      0.82      1241
weighted avg       0.94      0.94      0.94      1241

[[1102   15]
 [  54   70]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '0', 'precision': 0.95, 'recall': 0.99, 'f1_score': 0.97, 'support': 1117.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '1', 'precision': 0.82, 'recall': 0.56, 'f1_score': 0.67, 'support': 124.0, 'acc': 0.94}]

Training and evaluating on Kazakh-Russian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.267962,0.686718,0.057056,-0.177856,-0.863139,0.520357,-1.633131,4.846153,-0.011962,0.03869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9126760563380283
precision :  0.9257142857142857
recall :  0.9
accuracy :  0.9828349944629015
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1626
           1       0.93      0.90      0.91       180

    accuracy                           0.98      1806
   macro avg       0.96      0.95      0.95      1806
weighted avg       0.98      0.98      0.98      1806

[[1613   13]
 [  18  162]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '0', 'precision': 0.99, 'recall': 0.99, 'f1_score': 0.99, 'support': 1626.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '1', 'precision': 0.93, 'recall': 0.9, 'f1_score': 0.91, 'support': 180.0, 'acc': 0.98}]

Training and evaluating on Azerbaijani-Arabic


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.2207,-0.593799,-0.156837,-0.05719,0.00527,0.333641,0.964983,4.240785,-0.100589,-0.037597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7788461538461539
precision :  0.8350515463917526
recall :  0.7297297297297297
accuracy :  0.9379217273954116
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       630
           1       0.84      0.73      0.78       111

    accuracy                           0.94       741
   macro avg       0.89      0.85      0.87       741
weighted avg       0.94      0.94      0.94       741

[[614  16]
 [ 30  81]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '0', 'precision': 0.95, 'recall': 0.97, 'f1_score': 0.96, 'support': 630.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '1', 'precision': 0.84, 'recall': 0.73, 'f1_score': 0.78, 'support': 111.0, 'acc': 0.94}]

Training and evaluating on Chinese-English


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.109104,-0.095586,-0.478267,0.3151,-0.105516,0.304533,-0.080057,2.00567,0.007342,-0.011526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.6025641025641025
precision :  0.7704918032786885
recall :  0.49473684210526314
accuracy :  0.9351464435146444
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       861
           1       0.77      0.49      0.60        95

    accuracy                           0.94       956
   macro avg       0.86      0.74      0.78       956
weighted avg       0.93      0.94      0.93       956

[[847  14]
 [ 48  47]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '0', 'precision': 0.95, 'recall': 0.98, 'f1_score': 0.96, 'support': 861.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '1', 'precision': 0.77, 'recall': 0.49, 'f1_score': 0.6, 'support': 95.0, 'acc': 0.94}]

Training and evaluating on German-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.126968,-0.264413,-0.288764,1.285243,-0.264738,-0.362185,-1.289564,2.462148,0.091376,-0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8467741935483872
precision :  0.9375
recall :  0.7720588235294118
accuracy :  0.9720998531571219
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1226
           1       0.94      0.77      0.85       136

    accuracy                           0.97      1362
   macro avg       0.96      0.88      0.92      1362
weighted avg       0.97      0.97      0.97      1362

[[1219    7]
 [  31  105]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '0', 'precision': 0.98, 'recall': 0.99, 'f1_score': 0.98, 'support': 1226.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '1', 'precision': 0.94, 'recall': 0.77, 'f1_score': 0.85, 'support': 136.0, 'acc': 0.97}]

Training and evaluating on English-German


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,0.053002,-0.3012,0.139028,0.54622,0.001744,0.090424,-2.408314,2.440491,0.035342,-0.039057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8872987477638641
precision :  0.9358490566037736
recall :  0.8435374149659864
accuracy :  0.9785787147228834
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2647
           1       0.94      0.84      0.89       294

    accuracy                           0.98      2941
   macro avg       0.96      0.92      0.94      2941
weighted avg       0.98      0.98      0.98      2941

[[2630   17]
 [  46  248]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '0', 'precision': 0.98, 'recall': 0.99, 'f1_score': 0.99, 'support': 2647.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '1', 'precision': 0.94, 'recall': 0.84, 'f1_score': 0.89, 'support': 294.0, 'acc': 0.98}]

Training and evaluating on Romanian-Hungarian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.248292,-0.357517,-0.093461,0.445546,0.326553,-0.206497,-0.421159,3.211525,0.055134,-0.03996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.7165354330708662
precision :  0.8348623853211009
recall :  0.6275862068965518
accuracy :  0.9505833905284832
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1312
           1       0.83      0.63      0.72       145

    accuracy                           0.95      1457
   macro avg       0.90      0.81      0.84      1457
weighted avg       0.95      0.95      0.95      1457

[[1294   18]
 [  54   91]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '0', 'precision': 0.96, 'recall': 0.99, 'f1_score': 0.97, 'support': 1312.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '1', 'precision': 0.83, 'recall': 0.63, 'f1_score': 0.72, 'support': 145.0, 'acc': 0.95}]

Training and evaluating on Polish-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.315978,-0.170506,0.716979,0.157876,-0.316536,-0.267447,-0.54053,2.969186,0.005045,0.008595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.8201058201058201
precision :  0.8959537572254336
recall :  0.7560975609756098
accuracy :  0.9668615984405458
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1847
           1       0.90      0.76      0.82       205

    accuracy                           0.97      2052
   macro avg       0.93      0.87      0.90      2052
weighted avg       0.97      0.97      0.97      2052

[[1829   18]
 [  50  155]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '0', 'precision': 0.97, 'recall': 0.99, 'f1_score': 0.98, 'support': 1847.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '1', 'precision': 0.9, 'recall': 0.76, 'f1_score': 0.82, 'support': 205.0, 'acc': 0.97}]

Training and evaluating on Indonesian-Dutch


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.073134,0.042959,-0.057954,1.139004,-1.203439,0.483872,-1.67578,3.331362,0.112762,0.290611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.946360153256705
precision :  0.96484375
recall :  0.9285714285714286
accuracy :  0.9578947368421052
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       399
           1       0.96      0.93      0.95       266

    accuracy                           0.96       665
   macro avg       0.96      0.95      0.96       665
weighted avg       0.96      0.96      0.96       665

[[390   9]
 [ 19 247]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '0', 'precision': 0.95, 'recall': 0.98, 'f1_score': 0.97, 'support': 399.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '1', 'precision': 0.96, 'recall': 0.93, 'f1_score': 0.95, 'support': 266.0, 'acc': 0.96}]

Training and evaluating on Romanian-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.406783,-0.128574,0.072739,-0.109208,-0.385473,0.380011,-1.075477,2.636973,0.062783,0.023087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9028871391076114
precision :  0.945054945054945
recall :  0.864321608040201
accuracy :  0.9442771084337349
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       465
           1       0.95      0.86      0.90       199

    accuracy                           0.94       664
   macro avg       0.94      0.92      0.93       664
weighted avg       0.94      0.94      0.94       664

[[455  10]
 [ 27 172]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '0', 'precision': 0.94, 'recall': 0.98, 'f1_score': 0.96, 'support': 465.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '1', 'precision': 0.95, 'recall': 0.86, 'f1_score': 0.9, 'support': 199.0, 'acc': 0.94}]

Exists


In [27]:
print("Evaluating on balanced splits\n")
final_result,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn, weights = svm_evaluate(train_balanced, test_balanced, pairs)

prefix = f'../Final_results/balanced/SVM_results'
#prefix = f'../Final_results/balanced/features_wo_dnnlogits_cosims/SVM_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
final_result.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')   
weights.to_csv(f'{prefix}/SVM_weights.csv') 


Evaluating on balanced splits

Training on all langs



Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.557592,-0.396341,0.235268,0.657303,-0.109161,-0.487456,-0.799094,2.296604,-0.006159,0.023581,0.0,0.0,0.175009,-0.06795,-0.070042,0.0,0.0,-0.00217,0.0,0.0,0.0,0.0,0.203352,0.0,0.0,-0.06795,0.0,0.0,0.0,-0.112584,0.0,0.0,0.0,0.0



Evaluating on all langs
f1-score :  0.9750519750519752
precision :  0.9722222222222222
recall :  0.9778982485404504
accuracy :  0.9749321077919365
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2389
           1       0.97      0.98      0.98      2398

    accuracy                           0.97      4787
   macro avg       0.97      0.97      0.97      4787
weighted avg       0.97      0.97      0.97      4787

[[2322   67]
 [  53 2345]]
final report [{'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '0', 'precision': 0.98, 'recall': 0.97, 'f1_score': 0.97, 'support': 2389.0}, {'all_language_evaluate': True, 'train_evaluate_pair': False, 'language_pair': 'All_languages', 'class': '1', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.98, 'support': 2398.0, 'acc': 0.97}]

Evaluating on Hindi-Persian
f1-score :  0.9802371541501976
precision :  0.96875
recall :  0.992
accuracy

f1-score :  0.9754901960784313
precision :  0.9567307692307693
recall :  0.995
accuracy :  0.974937343358396
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       199
           1       0.96      0.99      0.98       200

    accuracy                           0.97       399
   macro avg       0.98      0.97      0.97       399
weighted avg       0.98      0.97      0.97       399

[[190   9]
 [  1 199]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Romanian-French', 'class': '0', 'precision': 0.99, 'recall': 0.95, 'f1_score': 0.97, 'support': 199.0}, {'all_language_evaluate': False, 'train_evaluate_pair': False, 'language_pair': 'Romanian-French', 'class': '1', 'precision': 0.96, 'recall': 0.99, 'f1_score': 0.98, 'support': 200.0, 'acc': 0.97}]

Training and evaluating on Hindi-Persian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-1.003404,-0.347152,0.243486,-0.746437,-0.004762,0.295978,0.423882,2.587965,-0.055848,0.112921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9763779527559056
precision :  0.9612403100775194
recall :  0.992
accuracy :  0.9759036144578314
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       124
           1       0.96      0.99      0.98       125

    accuracy                           0.98       249
   macro avg       0.98      0.98      0.98       249
weighted avg       0.98      0.98      0.98       249

[[119   5]
 [  1 124]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '0', 'precision': 0.99, 'recall': 0.96, 'f1_score': 0.98, 'support': 124.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Hindi-Persian', 'class': '1', 'precision': 0.96, 'recall': 0.99, 'f1_score': 0.98, 'support': 125.0, 'acc': 0.98}]

Training and evaluating on English-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.306275,-0.157521,0.060872,1.022176,-0.1637,-0.587921,-1.558047,1.730358,0.089668,-0.059447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9755142017629774
precision :  0.9707602339181286
recall :  0.9803149606299213
accuracy :  0.9753694581280788
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       507
           1       0.97      0.98      0.98       508

    accuracy                           0.98      1015
   macro avg       0.98      0.98      0.98      1015
weighted avg       0.98      0.98      0.98      1015

[[492  15]
 [ 10 498]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '0', 'precision': 0.98, 'recall': 0.97, 'f1_score': 0.98, 'support': 507.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-French', 'class': '1', 'precision': 0.97, 'recall': 0.98, 'f1_score': 0.98, 'support': 508.0, 'acc': 0.98}]

Training and evaluating on Finnish-Swedish


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.327182,-0.500669,0.214053,0.779048,0.243493,-0.611752,-1.266359,2.008298,0.040025,-0.004662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.96875
precision :  0.9465648854961832
recall :  0.992
accuracy :  0.968
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       125
           1       0.95      0.99      0.97       125

    accuracy                           0.97       250
   macro avg       0.97      0.97      0.97       250
weighted avg       0.97      0.97      0.97       250

[[118   7]
 [  1 124]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '0', 'precision': 0.99, 'recall': 0.94, 'f1_score': 0.97, 'support': 125.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Finnish-Swedish', 'class': '1', 'precision': 0.95, 'recall': 0.99, 'f1_score': 0.97, 'support': 125.0, 'acc': 0.97}]

Training and evaluating on Kazakh-Russian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.216299,0.343163,0.085442,-0.22164,-0.651512,0.267297,-1.397291,2.526615,-0.1066,0.038883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.989010989010989
precision :  0.9836065573770492
recall :  0.994475138121547
accuracy :  0.989010989010989
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       183
           1       0.98      0.99      0.99       181

    accuracy                           0.99       364
   macro avg       0.99      0.99      0.99       364
weighted avg       0.99      0.99      0.99       364

[[180   3]
 [  1 180]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '0', 'precision': 0.99, 'recall': 0.98, 'f1_score': 0.99, 'support': 183.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Kazakh-Russian', 'class': '1', 'precision': 0.98, 'recall': 0.99, 'f1_score': 0.99, 'support': 181.0, 'acc': 0.99}]

Training and evaluating on Azerbaijani-Arabic


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.696821,-1.38978,-0.239526,-0.14959,0.672506,0.335135,0.812044,3.584368,-0.076177,0.05619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9955555555555555
precision :  0.9911504424778761
recall :  1.0
accuracy :  0.9955357142857143
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       112
           1       0.99      1.00      1.00       112

    accuracy                           1.00       224
   macro avg       1.00      1.00      1.00       224
weighted avg       1.00      1.00      1.00       224

[[111   1]
 [  0 112]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '0', 'precision': 1.0, 'recall': 0.99, 'f1_score': 1.0, 'support': 112.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Azerbaijani-Arabic', 'class': '1', 'precision': 0.99, 'recall': 1.0, 'f1_score': 1.0, 'support': 112.0, 'acc': 1.0}]

Training and evaluating on Chinese-English


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.395275,-0.569707,-0.617362,0.937335,0.025136,-0.115999,0.084891,1.978838,-0.014431,0.061328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9468085106382979
precision :  0.967391304347826
recall :  0.9270833333333334
accuracy :  0.9473684210526315
              precision    recall  f1-score   support

           0       0.93      0.97      0.95        94
           1       0.97      0.93      0.95        96

    accuracy                           0.95       190
   macro avg       0.95      0.95      0.95       190
weighted avg       0.95      0.95      0.95       190

[[91  3]
 [ 7 89]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '0', 'precision': 0.93, 'recall': 0.97, 'f1_score': 0.95, 'support': 94.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Chinese-English', 'class': '1', 'precision': 0.97, 'recall': 0.93, 'f1_score': 0.95, 'support': 96.0, 'acc': 0.95}]

Training and evaluating on German-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.414821,-0.559979,-0.147051,0.994578,-0.066752,-0.249523,-1.673875,1.701198,-0.067155,0.061073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9818181818181817
precision :  0.9782608695652174
recall :  0.9854014598540146
accuracy :  0.9816176470588235
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       135
           1       0.98      0.99      0.98       137

    accuracy                           0.98       272
   macro avg       0.98      0.98      0.98       272
weighted avg       0.98      0.98      0.98       272

[[132   3]
 [  2 135]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '0', 'precision': 0.99, 'recall': 0.98, 'f1_score': 0.98, 'support': 135.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'German-French', 'class': '1', 'precision': 0.98, 'recall': 0.99, 'f1_score': 0.98, 'support': 137.0, 'acc': 0.98}]

Training and evaluating on English-German


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.17189,-0.122464,0.473483,0.454941,-0.338854,-0.14436,-2.198706,1.609478,-0.137456,0.14503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9830508474576272
precision :  0.9830508474576272
recall :  0.9830508474576272
accuracy :  0.9829931972789115
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       293
           1       0.98      0.98      0.98       295

    accuracy                           0.98       588
   macro avg       0.98      0.98      0.98       588
weighted avg       0.98      0.98      0.98       588

[[288   5]
 [  5 290]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '0', 'precision': 0.98, 'recall': 0.98, 'f1_score': 0.98, 'support': 293.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'English-German', 'class': '1', 'precision': 0.98, 'recall': 0.98, 'f1_score': 0.98, 'support': 295.0, 'acc': 0.98}]

Training and evaluating on Romanian-Hungarian


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.716195,-0.524604,0.468868,0.203201,0.002634,-0.18708,-0.269836,2.156916,0.095942,0.117277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9553264604810997
precision :  0.9586206896551724
recall :  0.952054794520548
accuracy :  0.9554794520547946
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       146
           1       0.96      0.95      0.96       146

    accuracy                           0.96       292
   macro avg       0.96      0.96      0.96       292
weighted avg       0.96      0.96      0.96       292

[[140   6]
 [  7 139]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '0', 'precision': 0.95, 'recall': 0.96, 'f1_score': 0.96, 'support': 146.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-Hungarian', 'class': '1', 'precision': 0.96, 'recall': 0.95, 'f1_score': 0.96, 'support': 146.0, 'acc': 0.96}]

Training and evaluating on Polish-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.574937,-0.40104,0.272436,0.50851,-0.397469,-0.14628,-0.473685,1.928066,0.057776,-0.0632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9711538461538461
precision :  0.9619047619047619
recall :  0.9805825242718447
accuracy :  0.9707317073170731
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       204
           1       0.96      0.98      0.97       206

    accuracy                           0.97       410
   macro avg       0.97      0.97      0.97       410
weighted avg       0.97      0.97      0.97       410

[[196   8]
 [  4 202]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '0', 'precision': 0.98, 'recall': 0.96, 'f1_score': 0.97, 'support': 204.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Polish-French', 'class': '1', 'precision': 0.96, 'recall': 0.98, 'f1_score': 0.97, 'support': 206.0, 'acc': 0.97}]

Training and evaluating on Indonesian-Dutch


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.037558,-0.097602,-0.070104,0.983875,-0.828235,0.332257,-1.568361,2.736221,0.07526,0.393812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.9868173258003766
precision :  0.9924242424242424
recall :  0.9812734082397003
accuracy :  0.9868913857677902
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       267
           1       0.99      0.98      0.99       267

    accuracy                           0.99       534
   macro avg       0.99      0.99      0.99       534
weighted avg       0.99      0.99      0.99       534

[[265   2]
 [  5 262]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '0', 'precision': 0.98, 'recall': 0.99, 'f1_score': 0.99, 'support': 267.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Indonesian-Dutch', 'class': '1', 'precision': 0.99, 'recall': 0.98, 'f1_score': 0.99, 'support': 267.0, 'acc': 0.99}]

Training and evaluating on Romanian-French


Unnamed: 0,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,DNN_logits,MBERT_cos_sim,XLM_cos_sim,loan_unicode_Latin,loan_unicode_Greek,loan_unicode_Cyrillic,loan_unicode_Arabic,loan_unicode_Devanagari,loan_unicode_Bengali,loan_unicode_Gurmukhi,loan_unicode_Tamil,loan_unicode_Telugu,loan_unicode_Malayalam,loan_unicode_Myanmar,loan_unicode_Chinese,original_unicode_Latin,original_unicode_Greek,original_unicode_Cyrillic,original_unicode_Arabic,original_unicode_Devanagari,original_unicode_Bengali,original_unicode_Gurmukhi,original_unicode_Tamil,original_unicode_Telugu,original_unicode_Malayalam,original_unicode_Myanmar,original_unicode_Chinese
0,-0.503193,-0.155482,-0.185494,0.212373,-0.333702,0.298957,-1.215673,1.694677,0.079763,-0.088232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


f1-score :  0.977886977886978
precision :  0.961352657004831
recall :  0.995
accuracy :  0.9774436090225563
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       199
           1       0.96      0.99      0.98       200

    accuracy                           0.98       399
   macro avg       0.98      0.98      0.98       399
weighted avg       0.98      0.98      0.98       399

[[191   8]
 [  1 199]]
final report [{'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '0', 'precision': 0.99, 'recall': 0.96, 'f1_score': 0.98, 'support': 199.0}, {'all_language_evaluate': False, 'train_evaluate_pair': True, 'language_pair': 'Romanian-French', 'class': '1', 'precision': 0.96, 'recall': 0.99, 'f1_score': 0.98, 'support': 200.0, 'acc': 0.98}]

Exists


# Random Forest Classifier 

In [None]:
def rf_evaluate(full_train_set, full_test_set, pairs_list): 
    all_result = []
    all_lang_fp_list= []
    all_lang_fn_list=[]
    evaluate_lang_fp_list = []
    evaluate_lang_fn_list = []
    train_evaluate_lang_fp_list = []
    train_evaluate_lang_fn_list = []
    pair =None #pair None for all languages evaluation
    display_fields = ['loan_word',
              'original_word',
              'loan_word_epitran',
              'original_word_epitran',
              'loan_english',
              'original_english'] + features
        
    train_set = pd.concat([full_train_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_train = train_set[features].values
    x_means = np.mean(x_train, axis=0)
    x_stds = np.std(x_train, axis=0)
    x_stds[x_stds == 0] = 1
    y_train = train_set[labels].values.ravel()
    
    test_set = pd.concat([full_test_set[pair][display_fields + ["label"] + labels] for pair in pairs])
    x_test = test_set[features].values
    y_test = test_set[labels].values.ravel()
    
    # standardize input features
    x_train = (x_train - x_means)/x_stds

    print("Training on all langs\n")
     
    RF = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)
    RF.fit(x_train, y_train)
#     num_leafs = [1, 5, 10, 20, 50, 100]
#     parameters3 = [{'n_estimators' : range(100,800,20),
#              'max_depth': range(1,20,5),
#              'min_samples_leaf':num_leafs
#              }]
    
#     gs3 = GridSearchCV(estimator=RF,
#                   param_grid=parameters3,
#                   cv = 10,
#                   n_jobs = -1)

#     gs3.fit(x_train, y_train)
    
    # display regressor weights
    #df = pd.DataFrame(RF.best_params_, columns=features).style.set_caption('Weights')
    #display(df)
    print()
    
    print("Evaluating on all langs")
    x_test = (x_test - x_means)/x_stds
    y_pred = RF.predict(x_test)

    print("f1-score : ", f1_score(y_test, y_pred ))
    print("precision : ",precision_score(y_test, y_pred))
    print("recall : ",recall_score(y_test, y_pred )) 
    print("accuracy : ",accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    class_report = classification_report(y_test, y_pred)
    a = classification_report_csv(class_report,pair,all_lang=True, train_evaluate_pair =False)
    all_result.append(a)
    print()
    
    # display false positives and false negatives
    unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
    tp = np.array(np.where(unq == 3)).tolist()[0]
    fp = np.array(np.where(unq == 1)).tolist()[0]
    tn = np.array(np.where(unq == 0)).tolist()[0]
    fn = np.array(np.where(unq == 2)).tolist()[0]

    display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
    display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
    all_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
    all_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
    all_lang_fp.insert(0, 'language_pair', pair)
    all_lang_fn.insert(0, 'language_pair', pair)
    all_lang_fp_list.append(all_lang_fp)
    all_lang_fn_list.append(all_lang_fn)
    
    for pair in pairs_list:
        print(f'Evaluating on {pair}')
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_test = (x_test - x_means)/x_stds
        y_pred = RF.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False, train_evaluate_pair =False)
        all_result.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        
        evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        evaluate_lang_fp.insert(0, 'language_pair', pair)
        evaluate_lang_fn.insert(0, 'language_pair', pair)
        evaluate_lang_fp_list.append(evaluate_lang_fp)
        evaluate_lang_fn_list.append(evaluate_lang_fn)
        
    for pair in pairs_list:
        print(f'Training and evaluating on {pair}')
        
        train_set = full_train_set[pair][display_fields + ["label"] + labels]
        x_train = train_set[features].values
        x_means = np.mean(x_train, axis=0)
        x_stds = np.std(x_train, axis=0)
        x_stds[x_stds == 0] = 1
        y_train = train_set[labels].values.ravel()
        
        test_set = full_test_set[pair][display_fields + ["label"] + labels]
        x_test = test_set[features].values
        y_test = test_set[labels].values.ravel()
        
        x_train = (x_train - x_means)/x_stds
        
        RF = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)
        RF.fit(x_train, y_train)
#         num_leafs = [1, 5, 10, 20, 50, 100]
#         parameters3 = [{'n_estimators' : range(100,800,20),
#              'max_depth': range(1,20,5),
#              'min_samples_leaf':num_leafs
#              }]
    
#         gs3 = GridSearchCV(estimator=RF,
#                   param_grid=parameters3,
#                   cv = 10,
#                   n_jobs = -1)

#         gs3.fit(x_train, y_train)
        
        # display regressor weights
        #df = pd.DataFrame(RF.best_params_, columns=features).style.set_caption('Weights')
        #display(df)

        x_test = (x_test - x_means)/x_stds
        y_pred = RF.predict(x_test)
        
        print("f1-score : ", f1_score(y_test, y_pred ))
        print("precision : ",precision_score(y_test, y_pred))
        print("recall : ",recall_score(y_test, y_pred )) 
        print("accuracy : ",accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        class_report = classification_report(y_test, y_pred)
        a = classification_report_csv(class_report,pair,all_lang=False,train_evaluate_pair =True)
        all_result.append(a)
        print()
        
        # display false positives and false negatives
        unq = np.array([x + 2*y for x, y in zip(y_pred, y_test)])
        tp = np.array(np.where(unq == 3)).tolist()[0]
        fp = np.array(np.where(unq == 1)).tolist()[0]
        tn = np.array(np.where(unq == 0)).tolist()[0]
        fn = np.array(np.where(unq == 2)).tolist()[0]

        display(test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]].style.set_caption('False positives'))
        display(test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]].style.set_caption('False negatives'))
        train_evaluate_lang_fp = test_set.reset_index(drop=True).iloc[fp,:][display_fields + ["label"]]
        train_evaluate_lang_fn = test_set.reset_index(drop=True).iloc[fn,:][display_fields + ["label"]]
        train_evaluate_lang_fp.insert(0, 'language_pair', pair)
        train_evaluate_lang_fn.insert(0, 'language_pair', pair)
        train_evaluate_lang_fp_list.append(train_evaluate_lang_fp)
        train_evaluate_lang_fn_list.append(train_evaluate_lang_fn)
    final_result = pd.concat(all_result, ignore_index=True)
    all_lang_fp_list = pd.concat(all_lang_fp_list, ignore_index=True)
    all_lang_fn_list = pd.concat(all_lang_fn_list, ignore_index=True)
    evaluate_lang_fp_list = pd.concat(evaluate_lang_fp_list, ignore_index=True)
    evaluate_lang_fn_list = pd.concat(evaluate_lang_fn_list, ignore_index=True)
    train_evaluate_lang_fp_list = pd.concat(train_evaluate_lang_fp_list, ignore_index=True)
    train_evaluate_lang_fn_list = pd.concat(train_evaluate_lang_fn_list, ignore_index=True)
        
    return final_result, all_lang_fp_list, all_lang_fn_list, evaluate_lang_fp_list, evaluate_lang_fn_list, train_evaluate_lang_fp_list, train_evaluate_lang_fn_list

In [None]:
print("Evaluating and saving csvs on alldata splits\n")

output_alldata,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = rf_evaluate(train_alldata, test_alldata, pairs)
prefix = f'../Final_results/all_data/RF_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_alldata.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')



# print("Evaluating on realdist splits\n")
# rf_evaluate(train_realdist, test_realdist, pairs)

# print("Evaluating on balanced splits\n")
# rf_evaluate(train_balanced, test_balanced, pairs)

In [None]:
print("Evaluating on realdist splits\n")
output_alldata,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = rf_evaluate(train_realdist, test_realdist, pairs)
prefix = f'../Final_results/real_dist/RF_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_alldata.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')

In [None]:
print("Evaluating on balanced splits\n")
output_alldata,all_lang_fp, all_lang_fn, evaluate_lang_fp, evaluate_lang_fn, train_evaluate_lang_fp, train_evaluate_lang_fn = rf_evaluate(train_balanced, test_balanced, pairs)
prefix = f'../Final_results/balanced/RF_results'
if os.path.isdir(prefix):
    print("Exists")
else:
    print("Doesn't exist")
    os.mkdir(prefix)
output_alldata.to_csv(f'{prefix}/final_results.csv')
all_lang_fp.to_csv(f'{prefix}/all_lang_falsepos.csv')
all_lang_fn.to_csv(f'{prefix}/all_lang_falseneg.csv')
evaluate_lang_fp.to_csv(f'{prefix}/all_lang_pair_eval_falsepos.csv')
evaluate_lang_fn.to_csv(f'{prefix}/all_lang_pair_eval_falseneg.csv')
train_evaluate_lang_fp.to_csv(f'{prefix}/indiv_pair_eval_falsepos.csv')
train_evaluate_lang_fn.to_csv(f'{prefix}/indiv_pair_eval_falseneg.csv')