Assumes you have run `Train_Testset.ipynb` first to make the `alldata`, `realdist`, and `balanced` train/test splits for the chosen language pair.

# Imports and setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import panphon
import panphon.distance
import editdistance # levenshtein
import epitran
import eng_to_ipa as eng
from epitran.backoff import Backoff
from googletrans import Translator
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
epitran.download.cedict()


In [2]:
import torch
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import os
import torch
from torch import nn
from torch import nn, optim

import torch.nn.functional as F

%matplotlib inline

In [3]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import numpy as np 
import io
import requests
import csv

In [4]:
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    
#device = torch.device("cuda:0:3" if torch.cuda.is_available() else "cpu") ## specify the GPU id's, GPU id's start from 0.

Using device: cuda

NVIDIA GeForce RTX 3090
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())

NVIDIA GeForce RTX 3090


## Load datasets

In [None]:
train_alldata = pd.read_csv('../Datasets/train_final_production_alldata.csv')
test_alldata = pd.read_csv('../Datasets/test_final_production_alldata.csv')

In [None]:
train_realdist = pd.read_csv('../Datasets/train_final_production_realdist.csv')
test_realdist = pd.read_csv('../Datasets/test_final_production_realdist.csv')

In [None]:
train_balanced = pd.read_csv('../Datasets/train_final_production_balanced.csv')
test_balanced = pd.read_csv('../Datasets/test_final_production_balanced.csv')

## Get Panphon phonetic features

In [None]:
#get phonetic features using panPhon
ft = panphon.FeatureTable()

In [None]:
train_alldata['features_loan'] = train_alldata.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
train_alldata['features_orig'] = train_alldata.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)
test_alldata['features_loan'] = test_alldata.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
test_alldata['features_orig'] = test_alldata.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)

train_alldata['features_loan'] = train_alldata['features_loan'].apply(lambda x:sum(x, []))
train_alldata['features_orig'] = train_alldata['features_orig'].apply(lambda x:sum(x, []))
test_alldata['features_orig'] = test_alldata['features_orig'].apply(lambda x:sum(x, []))
test_alldata['features_loan'] = test_alldata['features_loan'].apply(lambda x:sum(x, []))


Pad the phonetic features of the loan word and original word out to the maxlen of the features appearing in the training set (format: `<loan><pad 0s><orig><pad 0s>`).

In [None]:
train_alldata_maxlen = (np.max(train_alldata['features_loan'].str.len()),\
                               np.max(train_alldata['features_orig'].str.len()))

train_alldata['features_loan'] = train_alldata['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
train_alldata['features_orig'] = train_alldata['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

train_alldata['features_loan'][np.random.randint(len(train_alldata['features_loan']))],\
train_alldata['features_orig'][np.random.randint(len(train_alldata['features_loan']))]

In [None]:
test_alldata['features_loan'] = test_alldata['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_alldata['features_orig'] = test_alldata['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

test_alldata['features_loan'][np.random.randint(len(test_alldata['features_loan']))],\
test_alldata['features_orig'][np.random.randint(len(test_alldata['features_orig']))]

## Add target labels and make train and test sets

In [None]:
Y_train = np.array([y for y in train_alldata['label_bin']])
Y_test = np.array([y for y in test_alldata['label_bin']])
Y_train.shape,Y_test.shape

In [None]:
X_train = np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                     np.array([x for x in train_alldata['features_orig']])])
X_test = np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                    np.array([x for x in test_alldata['features_orig']])])
X_train.shape,X_test.shape

Make a validation split for training the DNN model

In [None]:
#create train and validation splits keeping the composition of labels balanced between them using a random state '1 '

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape


In [None]:
X_train = torch.tensor(X_train).to(device)
Y_train = torch.tensor(Y_train).to(device).reshape((-1,1))

X_test = torch.tensor(X_test).to(device)
Y_test = torch.tensor(Y_test).to(device).reshape((-1,1))

X_val = torch.tensor(X_val).to(device)
Y_val = torch.tensor(Y_val).to(device).reshape((-1,1))


In [None]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, X_val.shape, Y_val.shape
 

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

## DNN Definition

In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_features):
        super(NeuralNetwork, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
            
        )
        self.dropout = nn.Dropout(0.10) 

    def forward(self, x):
        #x = self.flatten(x)
        #logits = self.linear_relu_stack(x)
        logits_new = self.linear_relu_stack(x)
        logits  = self.dropout(logits_new)
        
        return torch.sigmoid(logits), logits_new
        #return logits

In [None]:
model = NeuralNetwork(X_train.shape[1]).to(device)
#model = NeuralNetwork(X_test.shape[1]).to(device)
print(model)

In [None]:
criterion = nn.BCELoss().to(device)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [None]:
def calculate_accuracy(y_true, y_pred):
    predicted = y_pred.ge(.5) 
    return ((y_true == predicted).sum().float() / len(y_true), (y_true == predicted).sum())

In [None]:
def round_tensor(t, decimal_places=3):
    return round(t.item(), decimal_places)

In [None]:
CUDA_LAUNCH_BLOCKING=1

Train!

In [None]:
# train for 5000 epochs and get the logits 
val_losses = []
train_losses = []
val_accur = []
train_accur = []
logits = []
for epoch in range(5000):

    y_pred = model(X_train.float())[0]
    logits = model(X_train.float())[1]
    #getting logits for test set 
#     y_pred = model(X_test.float())[0]
#     logits = model(X_test.float())[1]
    #y_pred = model(X_train) 
    #print(y_pred)

    #y_pred = torch.squeeze(y_pred)
    train_loss = criterion(y_pred, Y_train.float())
    
    #test_loss = criterion(y_pred, Y_test.float())
    #train_loss = criterion(y_pred, Y_train)

    if epoch % 100 == 0:
        train_acc,_ = calculate_accuracy(Y_train, y_pred)

        y_val_pred = model(X_val.float())[0]
        #y_test_pred = torch.squeeze(y_test_pred)
         

        val_loss = criterion(y_val_pred, Y_val.float())

        val_acc, total_corr = calculate_accuracy(Y_val, y_val_pred)
        #print(total_corr)
        
        print(f'''epoch {epoch} Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)} Val set - loss: {round_tensor(val_loss)}, Val accuracy: {round_tensor(val_acc)}
''')
        #print(f'''epoch {epoch}Train set - loss: {round_tensor(train_loss)} ''')
        #print(f'''epoch {epoch}Test set - loss: {round_tensor(test_loss)} ''')
        train_losses.append(train_loss.detach().numpy())
        val_losses.append(val_loss.detach().numpy())
        
        val_accur.append(val_acc.detach().numpy())
        train_accur.append(train_acc.detach().numpy())

    optimizer.zero_grad()

    train_loss.backward()
    #test_loss.backward()

    optimizer.step()

Plot!

In [None]:
import matplotlib.pyplot as plt

epochs = range(1, len(train_accur) + 1)

plt.plot(epochs, train_accur, 'bo', label='Training acc')
plt.plot(epochs, val_accur, 'b', label='vaidation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, train_losses, 'bo', label='Training loss')
plt.plot(epochs, val_losses, 'b', label='validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


## Setup evaluation datasets

Get Panphon features and pad (use `train_alldata` as already defined above)

In [None]:
train_realdist['features_loan'] = train_realdist.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
train_realdist['features_orig'] = train_realdist.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)
test_realdist['features_loan'] = test_realdist.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
test_realdist['features_orig'] = test_realdist.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)

train_realdist['features_loan'] = train_realdist['features_loan'].apply(lambda x:sum(x, []))
train_realdist['features_orig'] = train_realdist['features_orig'].apply(lambda x:sum(x, []))
test_realdist['features_orig'] = test_realdist['features_orig'].apply(lambda x:sum(x, []))
test_realdist['features_loan'] = test_realdist['features_loan'].apply(lambda x:sum(x, []))

train_balanced['features_loan'] = train_balanced.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
train_balanced['features_orig'] = train_balanced.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)
test_balanced['features_loan'] = test_balanced.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
test_balanced['features_orig'] = test_balanced.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)

train_balanced['features_loan'] = train_balanced['features_loan'].apply(lambda x:sum(x, []))
train_balanced['features_orig'] = train_balanced['features_orig'].apply(lambda x:sum(x, []))
test_balanced['features_orig'] = test_balanced['features_orig'].apply(lambda x:sum(x, []))
test_balanced['features_loan'] = test_balanced['features_loan'].apply(lambda x:sum(x, []))


In [None]:
test_alldata['features_loan'] = test_alldata['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_alldata['features_orig'] = test_alldata['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_alldata = torch.tensor(np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                     np.array([x for x in train_alldata['features_orig']])])).to(device)
X_test_alldata = torch.tensor(np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                    np.array([x for x in test_alldata['features_orig']])])).to(device)

In [None]:
train_realdist['features_loan'] = train_realdist['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
train_realdist['features_orig'] = train_realdist['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

test_realdist['features_loan'] = test_realdist['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_realdist['features_orig'] = test_realdist['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_realdist = torch.tensor(np.hstack([np.array([x for x in train_realdist['features_loan']]),\
                     np.array([x for x in train_realdist['features_orig']])])).to(device)
X_test_realdist = torch.tensor(np.hstack([np.array([x for x in test_realdist['features_loan']]),\
                    np.array([x for x in test_realdist['features_orig']])])).to(device)

train_balanced['features_loan'] = train_balanced['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
train_balanced['features_orig'] = train_balanced['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

test_balanced['features_loan'] = test_balanced['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_balanced['features_orig'] = test_balanced['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_balanced = torch.tensor(np.hstack([np.array([x for x in train_balanced['features_loan']]),\
                     np.array([x for x in train_balanced['features_orig']])])).to(device)
X_test_balanced = torch.tensor(np.hstack([np.array([x for x in test_balanced['features_loan']]),\
                    np.array([x for x in test_balanced['features_orig']])])).to(device)

Get logits from DNN for all datasets/splits

In [None]:
model.eval()

with torch.no_grad():
    train_logits_dnn_alldata = model(X_train_alldata.float())[1].detach().cpu().numpy()
    test_logits_dnn_alldata = model(X_test_alldata.float())[1].detach().cpu().numpy()
    train_logits_dnn_realdist = model(X_train_realdist.float())[1].detach().cpu().numpy()
    test_logits_dnn_realdist = model(X_test_realdist.float())[1].detach().cpu().numpy()
    train_logits_dnn_balanced = model(X_train_balanced.float())[1].detach().cpu().numpy()
    test_logits_dnn_balanced = model(X_test_balanced.float())[1].detach().cpu().numpy()

In [None]:
train_logits_dnn_alldata, test_logits_dnn_alldata,\
train_logits_dnn_realdist, test_logits_dnn_realdist,\
train_logits_dnn_balanced, test_logits_dnn_balanced

Add DNN logit column to production datasets

In [None]:
train_alldata_dnnlogits = pd.read_csv('../Datasets/train_final_production_alldata.csv')
test_alldata_dnnlogits = pd.read_csv('../Datasets/test_final_production_alldata.csv')
train_realdist_dnnlogits = pd.read_csv('../Datasets/train_final_production_realdist.csv')
test_realdist_dnnlogits = pd.read_csv('../Datasets/test_final_production_realdist.csv')
train_balanced_dnnlogits = pd.read_csv('../Datasets/train_final_production_balanced.csv')
test_balanced_dnnlogits = pd.read_csv('../Datasets/test_final_production_balanced.csv')

In [None]:
train_alldata_dnnlogits['DNNlogits_modelpredicted'] = train_logits_dnn_alldata
test_alldata_dnnlogits['DNNlogits_modelpredicted'] = test_logits_dnn_alldata

train_realdist_dnnlogits['DNNlogits_modelpredicted'] = train_logits_dnn_realdist
test_realdist_dnnlogits['DNNlogits_modelpredicted'] = test_logits_dnn_realdist

train_balanced_dnnlogits['DNNlogits_modelpredicted'] = train_logits_dnn_balanced
test_balanced_dnnlogits['DNNlogits_modelpredicted'] = test_logits_dnn_balanced

train_alldata_dnnlogits.to_csv('../Datasets/modelpredictedlogits_trainDNN_alldata.csv')
test_alldata_dnnlogits.to_csv('../Datasets/modelpredictedlogits_testDNN_alldata.csv')

train_realdist_dnnlogits.to_csv('../Datasets/modelpredictedlogits_trainDNN_realdist.csv')
test_realdist_dnnlogits.to_csv('../Datasets/modelpredictedlogits_testDNN_realdist.csv')

train_balanced_dnnlogits.to_csv('../Datasets/modelpredictedlogits_trainDNN_balanced.csv')
test_balanced_dnnlogits.to_csv('../Datasets/modelpredictedlogits_testDNN_balanced.csv')


## Setup data for CNN training

Current CNN approach doesn't really work.  Because the inputs are PanPhon features with padding dependent on the maxlen of the input data, and then reshaped into a square to fit into a 2D CNN, the filter is not able to capture the necessary dependency: that is, the relationship between an L1 feature and an L2 feature at the same approximate position in their respective words (cf., Persian /ɣ/ usually becomes Hindi /q/).  If we reshape the data this way for the network, because the ordering of PanPhon features are effectively conventionlized into a fixed order, a /q/ in an (e.g.) Hindi word is not guaranteed to fall in the same window as the equivalent /ɣ/ in the (e.g.) Persian source word, thus losing the dependency.  Therefore the CNN usually falls into a local minimum of predicting everything to be a non-loan word.  Loss remains relatively compared to the DNN and accuracy plateaus at about 90%.

Use `train_alldata` again

In [None]:
X_train = np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                     np.array([x for x in train_alldata['features_orig']])])
Y_train = np.array([y for y in train_alldata['label_bin']])

In [None]:
# create train and validation splits for proper model training while keeping the composition of labels balanced between them using a random state '1 '

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

In [None]:
# convert them to torch tensors for padding
X_train = torch.tensor(X_train).to(device)
Y_train = torch.tensor(Y_train).to(device).reshape((-1,1))

X_test = torch.tensor(X_test).to(device)
Y_test = torch.tensor(Y_test).to(device).reshape((-1,1))

X_val = torch.tensor(X_val).to(device)
Y_val = torch.tensor(Y_val).to(device).reshape((-1,1))


In [None]:
closest_perfect_square = X_train.shape[1]
while (True):
    if np.sqrt(closest_perfect_square) - np.floor(np.sqrt(closest_perfect_square)) != 0:
        closest_perfect_square += 1
    else:
        break
view_shape = int(np.sqrt(closest_perfect_square))
closest_perfect_square,view_shape

In [None]:
X_train = F.pad(X_train, pad=(0, closest_perfect_square-X_train.shape[1]), value=0)
X_val = F.pad(X_val, pad=(0, closest_perfect_square-X_val.shape[1]), value=0)
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

## CNN Definition

In [None]:
class CCN_Net(nn.Module):
    def __init__(self):
        super().__init__() # just run the init of parent class (nn.Module)
        self.conv1 = nn.Conv2d(1, 128, 8) # input is 1 image, 32 output channels, 2X2 kernel / window
        self.conv2 = nn.Conv2d(128, 64, 2) # input is 32, bc the first layer output 32. Then we say the output will be 64 channels, 5x5 kernel / window
        self.conv3 = nn.Conv2d(64, 32, 2)
        

        #x = torch.randn(23,23).view(-1,1,23,23)
        #x = torch.randn(33,33).view(-1,1,33,33) #33 because its the square root of 1089
        #x = torch.randn(30,30).view(-1,1,30,30) #30 because its the square root of 900 for real dist train set
        #x = torch.randn(29,29).view(-1,1,29,29) #29 because its the square root of 841 for balanced train set
        x = torch.randn(view_shape,view_shape).view(-1,1,view_shape,view_shape) # for trained model logit prediction, for all data, 1089 is sq of 33
        self._to_linear = None
        self.convs(x)

        self.fc1 = nn.Linear(self._to_linear, 512) #flattening.
        self.fc2 = nn.Linear(512, 1) # 512 in, 2 out bc we're doing 2 classes (dog vs cat).
        self.dropout = nn.Dropout(0.1)

    def convs(self, x):
        # max pooling over 2x2
        x = F.max_pool2d(torch.tanh(self.conv1(x)), (2, 2))
        #x = F.max_pool2d(torch.tanh(self.conv2(x)), (1, 1))
        #x = F.max_pool2d(torch.tanh(self.conv3(x)), (1, 1))
#         x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
#         x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
#         x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        

        if self._to_linear is None:
            self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)  # .view is reshape ... this flattens X before 
        x = self.dropout(x)
        x = torch.tanh(self.fc1(x))
        #x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x) # bc this is our output layer. No activation here.
        return F.sigmoid(x), x, #comment it out to get the logits in the return statement 
        #return x
                         


CNN_Net = CCN_Net() 
CNN_Net = nn.DataParallel(CNN_Net)
CNN_Net.to(device)
print(CNN_Net)

In [None]:
X_train = torch.tensor(X_train).view(-1,view_shape,view_shape).to(device)
X_val = torch.tensor(X_val).view(-1,view_shape,view_shape).to(device)
Y_train = torch.tensor(Y_train).to(device)
Y_val = torch.tensor(Y_val).to(device)
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape


In [None]:
#optimizer = optim.Adam(CNN_Net.parameters(), lr=0.01)
optimizer = optim.SGD(CNN_Net.parameters(),lr=0.001, momentum=0.0,  weight_decay=0.0, nesterov=False)
#optimizer = torch.optim.RMSprop(CNN_Net.parameters(), lr=0.00001, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
#loss_function = nn.MSELoss()
scheduler1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
loss_function = nn.BCELoss().to(device)

In [None]:
from tqdm import tqdm

In [None]:
torch.cuda.empty_cache()

In [None]:
X_val = X_val.unsqueeze(1) #just do it once 

In [None]:
# sets the seed manually to 42
torch.manual_seed(42)
# a = X_val_CNN[torch.randint(len(X_val_CNN), (120,))]  
# # b = Y_val_CNN[torch.randint(len(X_val_CNN), (120,))]  
# # a.shape
# b

# 

In [None]:
torch.initial_seed()

In [None]:
#train for 10000 epochs and get the logits 
val_losses = []
train_losses = []
val_accur = []
train_accur = []
train_losses_batch = []
logits = []
BATCH_SIZE = 512
n_epochs = 50
for epoch in range(n_epochs):
    for i in tqdm(range(0, len(X_train), BATCH_SIZE)):
        batch_X = X_train[i:i+BATCH_SIZE].view(-1, 1, view_shape,view_shape)  
        #batch_X = X_train_CNN.view(-1, 1, 29,29)  
        batch_y = Y_train[i:i+BATCH_SIZE]
        
        

        #X_train_CNN = X_train_CNN.view(-1, 1, 33,33) # for balanced train set
        CNN_Net.zero_grad()
        
        y_pred = CNN_Net(batch_X .float())[0]
        #print(y_pred)
        logits = CNN_Net(batch_X.float())[1]
        #getting logits for test set 
    #     y_pred = model(X_test.float())[0]
    #     logits = model(X_test.float())[1]
        #y_pred = model(X_train) 
        #print(y_pred)

        #y_pred = torch.squeeze(y_pred)
        train_loss = loss_function(y_pred, batch_y.float())

        #test_loss = criterion(y_pred, Y_test.float())
        #train_loss = criterion(y_pred, Y_train)
        train_losses.append(train_loss)
        if epoch % (n_epochs // 20) == 0:
            with torch.no_grad():
                CNN_Net.eval()

                val_batch_X = X_val[torch.randint(len(X_val), (BATCH_SIZE,))] 
                val_batch_Y = Y_val[torch.randint(len(X_val), (BATCH_SIZE,))] 

                train_acc,_ = calculate_accuracy(batch_y, y_pred)
                #X_val_CNN= X_val_CNN.unsqueeze(1) don't do it here, it will keep adding a channel dimension every time the for loop operates
                y_val_pred = CNN_Net(val_batch_X.float())[0]
                #print(y_val_pred)
                #y_test_pred = torch.squeeze(y_test_pred)


                val_loss = loss_function(y_val_pred, val_batch_Y.float())

                val_acc, total_corr = calculate_accuracy(val_batch_Y, y_val_pred)
                #print(total_corr)

                print(f'''epoch {epoch} Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)} Val  set - loss: {round_tensor(val_loss)}, Val accuracy: {round_tensor(val_acc)}
        ''')
                #print(f'''epoch {epoch}Train set - loss: {round_tensor(train_loss)} ''')
                #print(f'''epoch {epoch}Test set - loss: {round_tensor(test_loss)} ''')
                train_losses_batch.append(train_loss.detach().numpy())
                val_losses.append(val_loss.detach().numpy())

                train_accur.append(train_acc.detach().numpy())
                val_accur.append(val_acc.detach().numpy())

        optimizer.zero_grad()

        train_loss.backward()
        #test_loss.backward()

        optimizer.step()
    scheduler1.step()

In [None]:
import matplotlib.pyplot as plt

epochs = range(1, len(train_accur) + 1)

plt.plot(epochs, train_accur, 'bo', label='Training acc')
plt.plot(epochs, val_accur, 'b', label='vaidation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, train_losses_batch, 'bo', label='Training loss')
plt.plot(epochs, val_losses, 'b', label='validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
test_alldata['features_loan'] = test_alldata['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_alldata['features_orig'] = test_alldata['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_alldata = torch.tensor(np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                     np.array([x for x in train_alldata['features_orig']])])).to(device)
X_test_alldata = torch.tensor(np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                    np.array([x for x in test_alldata['features_orig']])])).to(device)

train_realdist['features_loan'] = train_realdist['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
train_realdist['features_orig'] = train_realdist['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

test_realdist['features_loan'] = test_realdist['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_realdist['features_orig'] = test_realdist['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_realdist = torch.tensor(np.hstack([np.array([x for x in train_realdist['features_loan']]),\
                     np.array([x for x in train_realdist['features_orig']])])).to(device)
X_test_realdist = torch.tensor(np.hstack([np.array([x for x in test_realdist['features_loan']]),\
                    np.array([x for x in test_realdist['features_orig']])])).to(device)

train_balanced['features_loan'] = train_balanced['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
train_balanced['features_orig'] = train_balanced['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

test_balanced['features_loan'] = test_balanced['features_loan'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[0]-len(x)), 'constant'))
test_balanced['features_orig'] = test_balanced['features_orig'].apply(lambda x: \
                                np.pad(x,\
                                (0,train_alldata_maxlen[1]-len(x)), 'constant'))

X_train_balanced = torch.tensor(np.hstack([np.array([x for x in train_balanced['features_loan']]),\
                     np.array([x for x in train_balanced['features_orig']])])).to(device)
X_test_balanced = torch.tensor(np.hstack([np.array([x for x in test_balanced['features_loan']]),\
                    np.array([x for x in test_balanced['features_orig']])])).to(device)

In [None]:
X_train_alldata = F.pad(X_train_alldata, pad=(0, closest_perfect_square-X_train_alldata.shape[1]), value=0)
X_test_alldata = F.pad(X_test_alldata, pad=(0, closest_perfect_square-X_test_alldata.shape[1]), value=0)
X_train_realdist = F.pad(X_train_realdist, pad=(0, closest_perfect_square-X_train_realdist.shape[1]), value=0)
X_test_realdist = F.pad(X_test_realdist, pad=(0, closest_perfect_square-X_test_realdist.shape[1]), value=0)
X_train_balanced = F.pad(X_train_balanced, pad=(0, closest_perfect_square-X_train_balanced.shape[1]), value=0)
X_test_balanced = F.pad(X_test_balanced, pad=(0, closest_perfect_square-X_test_balanced.shape[1]), value=0)
X_train_alldata.shape,X_test_alldata.shape,X_train_realdist.shape,X_test_realdist.shape,X_train_balanced.shape,X_test_balanced.shape


In [None]:
X_train_alldata = torch.tensor(X_train_alldata).view(-1,1,view_shape,view_shape).to(device)
X_test_alldata = torch.tensor(X_test_alldata).view(-1,1,view_shape,view_shape).to(device)
X_train_realdist = torch.tensor(X_train_realdist).view(-1,1,view_shape,view_shape).to(device)
X_test_realdist = torch.tensor(X_test_realdist).view(-1,1,view_shape,view_shape).to(device)
X_train_balanced = torch.tensor(X_train_balanced).view(-1,1,view_shape,view_shape).to(device)
X_test_balanced = torch.tensor(X_test_balanced).view(-1,1,view_shape,view_shape).to(device)
X_train_alldata.shape,X_test_alldata.shape,X_train_realdist.shape,X_test_realdist.shape,X_train_balanced.shape,X_test_balanced.shape


In [None]:
# use the trained CNN model to get logits for the three different splits

CNN_Net.eval()

with torch.no_grad():
    train_logits_cnn_alldata = CNN_Net(X_train_alldata.float())[1]
    test_logits_cnn_alldata = CNN_Net(X_test_alldata.float())[1]
    
    train_logits_cnn_realdist = CNN_Net(X_train_realdist.float())[1]
    test_logits_cnn_realdist = CNN_Net(X_test_realdist.float())[1]
    
    train_logits_cnn_balanced = CNN_Net(X_train_balanced.float())[1]
    test_logits_cnn_balanced = CNN_Net(X_test_balanced.float())[1]

In [None]:
print(CNN_Net(X_train_alldata.float()))

In [None]:
train_logits_cnn_alldata,\
test_logits_cnn_alldata,\
train_logits_cnn_realdist,\
test_logits_cnn_realdist,\
train_logits_cnn_balanced,\
test_logits_cnn_balanced

In [6]:
# for Hindi-Persian

# train_alldata_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_alldata.csv')
 
# test_alldata_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_alldata.csv')
# train_realdist_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_realdist.csv')
# test_realdist_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_realdist.csv')
# train_balanced_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_balanced.csv')
# test_balanced_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_balanced.csv')


# for English-French 

train_alldata_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_alldata.csv')
 
test_alldata_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_alldata.csv')
train_realdist_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_realdist.csv')
test_realdist_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_realdist.csv')
train_balanced_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_balanced.csv')
test_balanced_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_balanced.csv')



In [7]:
train_alldata_dnnlogits.shape, test_alldata_dnnlogits.shape, train_realdist_dnnlogits.shape, test_realdist_dnnlogits.shape,train_balanced_dnnlogits.shape,test_balanced_dnnlogits.shape  

((14876, 19), (1655, 19), (6156, 19), (509, 19), (5502, 19), (436, 19))

# this part includes steps to get the cosine similarities from the two multi-lingual transformer model we are using : M-bert multilingual cased and XLM

In [8]:
#transformer specific imports 
import sys
import json
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification\
    , BertForPreTraining, AutoModel
from transformers import XLMTokenizer, XLMWithLMHeadModel


from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, mean_squared_error
import time

In [9]:
from transformers import XLMTokenizer, XLMWithLMHeadModel
xlm_tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
xlm_model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-100-1280")

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
#set the seeds for reproducibility even though we are not fine-tuning or training and the weights 
#for both these models are effectively frozen for our purpose 

torch.manual_seed(7)
random.seed(7)
np.random.seed(7)
# Setting PyTorch's required configuration variables for reproducibility.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = False
torch.use_deterministic_algorithms(False)

In [11]:
PRE_TRAINED_bert_MODEL = 'bert-base-multilingual-cased'
PRE_TRAINED_xlm_MODEL = 'xlm-mlm-100-1280'


In [12]:
MAXTOKENS = 5
NUM_EPOCHS = 2000  # default maximum number of epochs
BERT_EMB = 768  # set to either 768 or 1024 for BERT-Base and BERT-Large models respectively
BS = 8  # batch size
INITIAL_LR = 1e-5  # initial learning rate
save_epochs = [1, 2, 3, 4, 5, 6, 7]  # these are the epoch numbers (starting from 1) to test the model on the test set
# and save the model checkpoint.
EARLY_STOP_PATIENCE = 30  # If model does not improve for this number of epochs, training stops.

# Setting GPU cards to use for training the model. Make sure you read our paper to figure out if you have enough GPU
# memory. If not, you can change all of them to 'cpu' to use CPU instead of GPU. By the way, two 24 GB GPU cards are
# enough for current configuration, but in case of developing based on this you may need more (that's why there are
# three cards declared here)
# CUDA_0 = 'cuda:1'
# CUDA_1 = 'cuda:1'
# CUDA_2 = 'cuda:1'
args = sys.argv
epochs = NUM_EPOCHS

In [13]:
MAXTOKENS = 512
BERT_EMB = 768  # set to either 768 or 1024 for BERT-Base and BERT-Large models respectively
#CUDA_0 = 'cuda:1'
#CUDA_1 = 'cuda:1'
#CUDA_2 = 'cuda:1'
CUDA_0 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#CUDA_1 = 'cuda:0'
#CUDA_2 = 'cuda:0'

# The function for printing in both console and a given log file.
def myprint(mystr, logfile):
    print(mystr)
    print(mystr, file=logfile)


# The function for loading datasets from parallel tsv files and returning texts in lists.
def load_data(file_name):
    try:
        # f = open(file_name)
        f = pd.read_csv(file_name, sep='\t', names=['l1_text', 'l2_text'])#, 'extra'])
    except:
        print('my log: could not read file')
        exit()
    print("This many number of rows were removed from " + file_name.split("/")[-1] + " due to having missing values: ",
          f.shape[0] - f.dropna().shape[0])
    f.dropna(inplace=True)
    l1_texts = f['l1_text'].values.tolist()
    l2_texts = f['l2_text'].values.tolist()
    print(len(l1_texts), len(l2_texts))
    print(l1_texts[500])
    print("\n")
    print(l2_texts[500])
    return l1_texts, l2_texts


# Overriding the Dataset class required for the use of PyTorch's data loader classes.
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, l1_encodings, l2_encodings):
        self.l1_encodings = l1_encodings
        self.l2_encodings = l2_encodings

    def __getitem__(self, idx):
        item = {('l1_' + key): torch.tensor(val[idx]) for key, val in self.l1_encodings.items()}
        item2 = {('l2_' + key): torch.tensor(val[idx]) for key, val in self.l2_encodings.items()}
        item.update(item2)
        # item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.l1_encodings['attention_mask'])


class MyDataset1(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.l1_encodings['attention_mask'])


class MyModel(nn.Module):
    # Each component other than the Transformer, are in a sequential layer (it is not required obviously, but it is
    # possible to stack them with other layers if desired)
    def __init__(self, base_model, n_classes, dropout=0.05):
        super().__init__()
        # self.base_model = base_model.to(CUDA_0)
        self.transformation_learner = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(BERT_EMB, BERT_EMB),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(BERT_EMB, BERT_EMB),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(BERT_EMB, BERT_EMB),
            nn.LeakyReLU()
        ).to(CUDA_0)

    def forward(self, input, **kwargs):
        l1_pooler_output = input
        # l2 = input2
        # if 'l1_attention_mask' in kwargs:
        #     l1_attention_mask = kwargs['l1_attention_mask']
            # l2_attention_mask = kwargs['l2_attention_mask']
        # else:
        #     print("my err: attention mask is not set, error maybe")
        # here we use only the CLS token
        # l1_pooler_output = self.base_model(l1.to(CUDA_0), attention_mask=l1_attention_mask.to(CUDA_0)).pooler_output
        myoutput = self.transformation_learner(l1_pooler_output)
        return myoutput


# The function to compute and print the performance measure scores using sklearn implementations.
def evaluate_model(labels, predictions, titlestr, logfile):
    myprint(titlestr, logfile)
    conf_matrix = confusion_matrix(labels, predictions)
    myprint("Confusion matrix- \n" + str(conf_matrix), logfile)
    acc_score = accuracy_score(labels, predictions)
    myprint('  Accuracy Score: {0:.2f}'.format(acc_score), logfile)
    myprint('Report', logfile)
    cls_rep = classification_report(labels, predictions)
    myprint(cls_rep, logfile)
    return f1_score(labels, predictions)  # return f-1 for positive class (sarcasm) as the early stopping measure.

# get the cosine similarties for all three types of data from M-BERT

In [14]:
#list of loan word and original word pairs that we are feeding inside the transformer models with tokenizers 
#to get their vector embedding of the CLS or classification token and 
#then calculating their cosine similarities between those embedding pairs 

In [15]:
#list of loan-original words for train sets

l1_train_alldata = list(train_alldata_dnnlogits["loan_word"])
l2_train_alldata = list(train_alldata_dnnlogits["original_word"])

l1_train_realdist = list(train_realdist_dnnlogits["loan_word"])
l2_train_realdist = list(train_realdist_dnnlogits["original_word"])

l1_train_balanced = list(train_balanced_dnnlogits["loan_word"])
l2_train_balanced = list(train_balanced_dnnlogits["original_word"])

#list of loan-original words for test sets


l1_test_alldata = list(test_alldata_dnnlogits["loan_word"])
l2_test_alldata = list(test_alldata_dnnlogits["original_word"])

l1_test_realdist = list(test_realdist_dnnlogits["loan_word"])
l2_test_realdist = list(test_realdist_dnnlogits["original_word"])

l1_test_balanced = list(test_balanced_dnnlogits["loan_word"])
l2_test_balanced = list(test_balanced_dnnlogits["original_word"])




len(l1_train_alldata), len(l1_train_realdist), len(l1_train_balanced), len(l1_test_alldata), len(l1_test_realdist), len(l1_test_balanced)

(14876, 6156, 5502, 1655, 509, 436)

In [16]:
PRE_TRAINED_bert_MODEL

'bert-base-multilingual-cased'

# get cosine similarities for train set for all three data sets

In [None]:
with torch.no_grad():
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_bert_MODEL)
    tokenizer.model_max_length = MAXTOKENS
    l1_encodings_alldata = tokenizer(l1_train_alldata, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_alldata = tokenizer(l2_train_alldata, truncation=False, padding=True, max_length=MAXTOKENS)
    l1_encodings_realdist = tokenizer(l1_train_realdist, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_realdist = tokenizer(l2_train_realdist, truncation=False, padding=True, max_length=MAXTOKENS)
    l1_encodings_balanced = tokenizer(l1_train_balanced, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_balanced = tokenizer(l2_train_balanced, truncation=False, padding=True, max_length=MAXTOKENS)
    
    dataset_alldata = MyDataset(l1_encodings_alldata, l2_encodings_alldata)
    dataset_realdist = MyDataset(l1_encodings_realdist, l2_encodings_realdist)
    dataset_balanced  = MyDataset(l1_encodings_balanced, l2_encodings_balanced)
    
    
    data_loader_alldata = DataLoader(dataset_alldata, batch_size=BS, shuffle=False)  # shuffle False for reproducibility
    data_loader_realdist = DataLoader(dataset_realdist, batch_size=BS, shuffle=False)
    data_loader_balanced = DataLoader(dataset_balanced, batch_size=BS, shuffle=False)
    
    base_model = BertModel.from_pretrained(PRE_TRAINED_bert_MODEL).to(CUDA_0)
    base_model.eval()
    cos_s = torch.nn.CosineSimilarity()
    print("\n\n\n\n")
    sim_lst_alldata = []
    sim_lst_realdist = []
    sim_lst_balanced = []
     
    
    #loop for all data 
    for step, batch in enumerate(data_loader_alldata):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_alldata = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_alldata.extend(list(sims_alldata))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_alldata))
    
    #loop for real dist

    for step, batch in enumerate(data_loader_realdist):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_realdist = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_realdist.extend(list(sims_realdist))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_realdist))

# loop for balanced dataset
    for step, batch in enumerate(data_loader_balanced):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_balanced = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_balanced.extend(list(sims_balanced))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_balanced))







      # print("Similarities: ")
      # for i in range(len(sims)):
      #   print(l1[i], ' and ', l2[i], ' : ', sims[i])

In [None]:
train_alldata_dnnlogits['m-bert_cosim'] = sim_lst_alldata
train_realdist_dnnlogits['m-bert_cosim'] = sim_lst_realdist
train_balanced_dnnlogits['m-bert_cosim'] = sim_lst_balanced

# Get cosim for test set for three types 

In [None]:
with torch.no_grad():
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_bert_MODEL)
    tokenizer.model_max_length = MAXTOKENS
    l1_encodings_alldata = tokenizer(l1_test_alldata, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_alldata = tokenizer(l2_test_alldata, truncation=False, padding=True, max_length=MAXTOKENS)
    l1_encodings_realdist = tokenizer(l1_test_realdist, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_realdist = tokenizer(l2_test_realdist, truncation=False, padding=True, max_length=MAXTOKENS)
    l1_encodings_balanced = tokenizer(l1_test_balanced, truncation=False, padding=True, max_length=MAXTOKENS)
    l2_encodings_balanced = tokenizer(l2_test_balanced, truncation=False, padding=True, max_length=MAXTOKENS)
    
    dataset_alldata = MyDataset(l1_encodings_alldata, l2_encodings_alldata)
    dataset_realdist = MyDataset(l1_encodings_realdist, l2_encodings_realdist)
    dataset_balanced  = MyDataset(l1_encodings_balanced, l2_encodings_balanced)
    
    
    data_loader_alldata = DataLoader(dataset_alldata, batch_size=BS, shuffle=False)  # shuffle False for reproducibility
    data_loader_realdist = DataLoader(dataset_realdist, batch_size=BS, shuffle=False)
    data_loader_balanced = DataLoader(dataset_balanced, batch_size=BS, shuffle=False)
    
    base_model = BertModel.from_pretrained(PRE_TRAINED_bert_MODEL).to(CUDA_0)
    base_model.eval()
    cos_s = torch.nn.CosineSimilarity()
    print("\n\n\n\n")
    sim_lst_alldata = []
    sim_lst_realdist = []
    sim_lst_balanced = []
     
    
    #loop for all data 
    for step, batch in enumerate(data_loader_alldata):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_alldata = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_alldata.extend(list(sims_alldata))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_alldata))
    
    #loop for real dist

    for step, batch in enumerate(data_loader_realdist):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_realdist = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_realdist.extend(list(sims_realdist))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_realdist))

# loop for balanced dataset
    for step, batch in enumerate(data_loader_balanced):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l1_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0),
                                      attention_mask=batch['l2_attention_mask'].to(CUDA_0),
                                      return_dict=True).last_hidden_state[:, 1, :]
        sims_balanced = cos_s(l1_vector, l2_vector).data.cpu().numpy()
        sim_lst_balanced.extend(list(sims_balanced))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_balanced))


In [None]:
test_alldata_dnnlogits['m-bert_cosim'] = sim_lst_alldata
test_realdist_dnnlogits['m-bert_cosim'] = sim_lst_realdist
test_balanced_dnnlogits['m-bert_cosim'] = sim_lst_balanced

In [None]:
test_alldata_dnnlogits

# get cosine sims from the XLM-100 model now 

In [None]:
with torch.no_grad():
   # tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_xlm_MODEL)
    tokenizer = XLMTokenizer.from_pretrained(PRE_TRAINED_xlm_MODEL)
    tokenizer.model_max_length = MAXTOKENS
    
    l1_encodings_alldata = tokenizer(l1_train_alldata, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_alldata = tokenizer(l2_train_alldata, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    l1_encodings_realdist = tokenizer(l1_train_realdist, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_realdist = tokenizer(l2_train_realdist, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    l1_encodings_balanced = tokenizer(l1_train_balanced, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_balanced = tokenizer(l2_train_balanced, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    

    dataset_alldata = MyDataset(l1_encodings_alldata, l2_encodings_alldata)
    dataset_realdist = MyDataset(l1_encodings_realdist, l2_encodings_realdist)
    dataset_balanced  = MyDataset(l1_encodings_balanced, l2_encodings_balanced)
    
    
    data_loader_alldata = DataLoader(dataset_alldata, batch_size=BS, shuffle=False)  # shuffle False for reproducibility
    data_loader_realdist = DataLoader(dataset_realdist, batch_size=BS, shuffle=False)
    data_loader_balanced = DataLoader(dataset_balanced, batch_size=BS, shuffle=False)
    
    base_model = XLMWithLMHeadModel.from_pretrained(PRE_TRAINED_xlm_MODEL).to(CUDA_0)
    base_model.eval()
    cos_s = torch.nn.CosineSimilarity()
    print("\n\n\n\n")
    sim_lst_alldata = []
    sim_lst_realdist = []
    sim_lst_balanced = []
     
    
    #loop for all data 
    for step, batch in enumerate(data_loader_alldata):
        
        
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_alldata = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_alldata.extend(list(sims_alldata))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_alldata))
    
    #loop for real dist

    for step, batch in enumerate(data_loader_realdist):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_realdist = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_realdist.extend(list(sims_realdist))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_realdist))

# loop for balanced dataset
    for step, batch in enumerate(data_loader_balanced):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_balanced = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_balanced.extend(list(sims_balanced))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_balanced))



In [None]:
train_alldata_dnnlogits['xlm_cosim'] = sim_lst_alldata
train_realdist_dnnlogits['xlm_cosim'] = sim_lst_realdist
train_balanced_dnnlogits['xlm_cosim'] = sim_lst_balanced

In [None]:
train_alldata_dnnlogits.loc[train_alldata_dnnlogits['m-bert_cosim']==1]

In [None]:
#save all the new datafames as the notebook is running out of cuda memory, please proceed without running this cell if no 
#cuda memory issues 

train_alldata_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_alldata.csv')
train_realdist_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_realdist.csv')
train_balanced_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_balanced.csv')

test_alldata_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_alldata.csv')
test_realdist_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_realdist.csv')
test_balanced_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_balanced.csv')



In [None]:
train_alldata_dnnlogits

# get XLM cosine sims for test splits now 

In [17]:
with torch.no_grad():
   # tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_xlm_MODEL)
    tokenizer = XLMTokenizer.from_pretrained(PRE_TRAINED_xlm_MODEL)
    tokenizer.model_max_length = MAXTOKENS
    
    l1_encodings_alldata = tokenizer(l1_test_alldata, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_alldata = tokenizer(l2_test_alldata, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    l1_encodings_realdist = tokenizer(l1_test_realdist, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_realdist = tokenizer(l2_test_realdist, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    l1_encodings_balanced = tokenizer(l1_test_balanced, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    l2_encodings_balanced = tokenizer(l2_test_balanced, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask =True)
    
    

    dataset_alldata = MyDataset(l1_encodings_alldata, l2_encodings_alldata)
    dataset_realdist = MyDataset(l1_encodings_realdist, l2_encodings_realdist)
    dataset_balanced  = MyDataset(l1_encodings_balanced, l2_encodings_balanced)
    
    
    data_loader_alldata = DataLoader(dataset_alldata, batch_size=BS, shuffle=False)  # shuffle False for reproducibility
    data_loader_realdist = DataLoader(dataset_realdist, batch_size=BS, shuffle=False)
    data_loader_balanced = DataLoader(dataset_balanced, batch_size=BS, shuffle=False)
    
    base_model = XLMWithLMHeadModel.from_pretrained(PRE_TRAINED_xlm_MODEL).to(CUDA_0)
    base_model.eval()
    cos_s = torch.nn.CosineSimilarity()
    print("\n\n\n\n")
    sim_lst_alldata = []
    sim_lst_realdist = []
    sim_lst_balanced = []
     
    
    #loop for all data 
    for step, batch in enumerate(data_loader_alldata):
        
        
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_alldata = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_alldata.extend(list(sims_alldata))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_alldata))
    
    #loop for real dist

    for step, batch in enumerate(data_loader_realdist):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_realdist = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_realdist.extend(list(sims_realdist))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_realdist))

# loop for balanced dataset
    for step, batch in enumerate(data_loader_balanced):
        l1_vector = base_model(batch['l1_input_ids'].to(CUDA_0),output_hidden_states =True )[0] 
        l2_vector = base_model(batch['l2_input_ids'].to(CUDA_0), output_hidden_states =True) [0]
        sims_balanced = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
        sim_lst_balanced.extend(list(sims_balanced))
        #sim_lst_test.extend(list(sims))
    print(len(sim_lst_balanced))



  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.











1655
509
436


In [18]:
#train_alldata_dnnlogits = pd.read_csv('../TorchClassifier_Panphon_features/modelpredictedlogits_trainDNN_alldata.csv')
 
test_alldata_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_alldata.csv')
#train_realdist_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_realdist.csv')
test_realdist_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_realdist.csv')
#train_balanced_dnnlogits = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_balanced.csv')
test_balanced_dnnlogits = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_balanced.csv')


 

In [19]:
test_alldata_dnnlogits['xlm_cosim'] = sim_lst_alldata
test_realdist_dnnlogits['xlm_cosim'] = sim_lst_realdist
test_balanced_dnnlogits['xlm_cosim'] = sim_lst_balanced

In [20]:
test_alldata_dnnlogits

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,...,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1.1,DNNlogits_modelpredicted,m-bert_cosim,xlm_cosim
0,0,0,4202,cougar,cafard,ˈkugər,kafaʀ,cougar,cockroach,0.833333,...,0.316667,2.575000,0.300000,4,hard_negative,0,,-5.154968,0.554451,0.631995
1,1,1,829,residue,résidu,ˈrɛzəˌdu,ʀezidy,residue,residue,0.750000,...,0.076389,0.645833,0.062500,2,synonym,0,,2.845915,0.641440,0.420390
2,2,2,1984,galvanize,galvaniser,ˈgælvəˌnaɪz,ɡalvanizəʀ,galvanize,galvanize,0.818182,...,0.254167,2.162500,0.250000,2,loan,1,,-1.857455,0.822515,0.890437
3,3,3,9659,angélique,amunition,angélique*,amynisjɔ̃,angelic,ammunition,0.800000,...,0.125000,1.609375,0.114583,8,hard_negative,0,,-9.178938,0.456421,0.733871
4,4,4,1181,deportment,déporter,dəˈpɔrtmənt,depɔʀtəʀ,deportment,deport,0.545455,...,0.262500,2.050000,0.250000,4,hard_negative,0,,-3.505590,0.701201,0.707137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1650,1650,1650,10769,pendentive,chardon béni,pendentive*,ʃaʀdɔn beni,pendentive,blessed thistle,0.818182,...,0.120833,1.487500,0.108333,10,hard_negative,0,,-18.449093,0.400846,0.670738
1651,1651,1651,6492,baba au rhum,demi-relief,ˈbəbə oʊ rhum*,dəmi-ʀɛljəf,rum baba,demi-relief,0.928571,...,0.141667,2.137500,0.122917,12,hard_negative,0,,-8.230941,0.419612,0.219390
1652,1652,1652,7753,forcené,pasquinade,forcené*é,paskinad,mad,pasquinade,0.888889,...,0.114583,1.921875,0.093750,9,hard_negative,0,,-10.169317,0.597328,0.699115
1653,1653,1653,8429,parfait,bastille,ˌpɑrˈfeɪ,bastij,perfect,bastille,1.000000,...,0.125000,1.562500,0.093750,7,hard_negative,0,,-13.261622,0.403259,0.461173


In [21]:
# save the dataframes as csv files 

test_alldata_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_alldata.csv')
test_realdist_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_realdist.csv')
test_balanced_dnnlogits.to_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_balanced.csv')


# Hindi -Persian Evaluation of the phonetic, edit distance and logits as features for our classifiers for the three datasets 

In [23]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import svm
from sklearn.svm import SVC

## all data set

In [233]:
 #for Hindi Persian 
# test_alldata = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_cosims_alldata.csv', index_col=[0])
# train_alldata = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_cosims_alldata.csv', index_col=[0])
 
# test_realdist = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_cosims_realdist.csv', index_col=[0])
# train_realdist = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_cosims_realdist.csv', index_col=[0])

# train_balanced = pd.read_csv('../Datasets/modelpredictedlogits_trainDNN_cosims_balanced.csv', index_col=[0])
# test_balanced = pd.read_csv('../Datasets/modelpredictedlogits_testDNN_cosims_balanced.csv', index_col=[0])

# for English French 
test_alldata = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_alldata.csv', index_col=[0])
train_alldata = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_alldata.csv', index_col=[0])
 
test_realdist = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_realdist.csv', index_col=[0])
train_realdist = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_realdist.csv', index_col=[0])

train_balanced = pd.read_csv('../Datasets/English-French-modelpredictedlogits_trainDNN_cosims_balanced.csv', index_col=[0])
test_balanced = pd.read_csv('../Datasets/English-French-modelpredictedlogits_testDNN_cosims_balanced.csv', index_col=[0])




In [234]:
#the extra column in the shape is due to an extra column index, 
test_alldata.shape,train_alldata.shape, test_realdist.shape, train_realdist.shape, train_balanced.shape, test_balanced.shape

((1655, 22), (14876, 21), (509, 22), (6156, 21), (5502, 21), (436, 22))

In [40]:
test_alldata

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,...,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,DNNlogits_modelpredicted,m-bert_cosim,xlm_cosim
0,0,0,2180,नौकर,ناهار,nɔːkər,nɒhɒr,Servant,Lunch,0.666667,...,0.083333,0.083333,1.300000,0.083333,5,hard_negative,0,-6.632645,0.651157,0.698117
1,1,1,2185,प्रभाव,نتیجه,prəb̤aːv,ntjd͡ʒh,Effect,Result,1.000000,...,0.319444,0.368056,4.083333,0.336806,6,synonym,0,-12.517828,0.388871,0.722407
2,2,2,3050,माही,ماب,maːɦiː,mɒb,lover,map,0.833333,...,0.296875,0.322917,3.187500,0.322917,4,hard_negative,0,-4.893986,0.676401,0.534651
3,3,3,818,बरामद,رهائی,bəraːməd,rhɒjʔj,found,رهائی,0.875000,...,0.315476,0.369048,4.607143,0.339286,5,random,0,-4.171056,0.478386,0.650366
4,4,4,2571,बराबर,بارآور,bəraːbər,bɒrɒvr,Equal,Fertile,0.625000,...,0.172619,0.190476,1.589286,0.187500,6,hard_negative,0,-8.876501,0.472573,0.447921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,1315,1315,3170,वाहवाही,تشویق و تمجید,vaːɦvaːɦiː,tʃvjɣ v tmd͡ʒjd,Praise,Applause,0.866667,...,0.422348,0.465909,4.568182,0.452652,13,synonym,0,-16.205833,0.523541,0.581920
1316,1316,1316,4560,नुकसान पहुचने वाला,خطرناک,nuksaːnə pəɦut͡ʃne vaːlaː,xtrnɒk,Harmful,Dangerous,0.920000,...,0.636574,0.708333,5.333333,0.702546,18,synonym,0,-39.144264,0.398689,0.202958
1317,1317,1317,140,कबाब,کباب,kəbaːb,kbɒb,Kebab,Kebab,0.500000,...,0.204167,0.225000,1.750000,0.225000,4,loan,1,4.328829,0.677797,0.805250
1318,1318,1318,4493,अटल,یک دنده,aʈəl,jk dndh,Firm,stubborn,1.000000,...,0.413194,0.465278,4.583333,0.447917,7,synonym,0,-5.486457,0.437662,0.476721


In [41]:
#normalization of logits is optional, have to dig into this more. 

# train_alldata['DNNlogits_modelpredicted'] = (train_alldata['DNNlogits_modelpredicted']-train_alldata['DNNlogits_modelpredicted'].min()) / (train_alldata['DNN_logits'].max()-train_alldata['DNNlogits_modelpredicted'].min())
# test_alldata['DNNlogits_modelpredicted'] = (test_alldata['DNNlogits_modelpredicted']-test_alldata['DNNlogits_modelpredicted'].min()) / (test_alldata['DNNlogits_modelpredicted'].max()-test_alldata['DNNlogits_modelpredicted'].min())

In [324]:

features_all =  ['Fast Levenshtein Distance Div Maxlen',
       'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen',
       'Hamming Feature Distance Div Maxlen',
       'Weighted Feature Distance Div Maxlen',
       'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein' ,'DNNlogits_modelpredicted','m-bert_cosim', 'xlm_cosim',
           
        ]

features_nologits =  ['Fast Levenshtein Distance Div Maxlen',
       'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen',
       'Hamming Feature Distance Div Maxlen',
       'Weighted Feature Distance Div Maxlen',
       'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein' ,'m-bert_cosim', 'xlm_cosim',
           
        ]

features_logits_nocosine =['Fast Levenshtein Distance Div Maxlen',
       'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen',
       'Hamming Feature Distance Div Maxlen',
       'Weighted Feature Distance Div Maxlen',
       'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein' ,'DNNlogits_modelpredicted', 
           
        ]

edit_features = ['Fast Levenshtein Distance Div Maxlen',
       'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen',
       'Hamming Feature Distance Div Maxlen',
       'Weighted Feature Distance Div Maxlen',
       'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein' , 
           
        ]

features_plainLV_cosim = [  'plain Levenshtein' ,'m-bert_cosim', 'xlm_cosim', 
           
        ]

features_logits_plainLV_cosim = [  'plain Levenshtein' ,'m-bert_cosim', 'xlm_cosim', 'DNNlogits_modelpredicted'
           
        ]


features_cosim_logits = ['m-bert_cosim', 'xlm_cosim', 'DNNlogits_modelpredicted']

## all features includes cosim and logits for all three data splits

In [44]:
labels = ['label_bin']

## alldata evaluation 

In [45]:
x_train = train_alldata[features_all].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_all].values
y_test = test_alldata[labels].values.ravel()

In [46]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11857, 10), (11857,), (1320, 10), (1320,))

In [47]:
LR = LogisticRegression(random_state=1, solver='lbfgs', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [48]:
y_pred = LR.predict(x_test)

In [49]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.7547169811320754
precision :  0.7692307692307693
recall :  0.7407407407407407
accuracy :  0.9507575757575758
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1185
           1       0.77      0.74      0.75       135

    accuracy                           0.95      1320
   macro avg       0.87      0.86      0.86      1320
weighted avg       0.95      0.95      0.95      1320

[[1155   30]
 [  35  100]]


In [50]:
precision, recall, fscore, support = score(y_pred, y_test, pos_label=1)

In [51]:
print('fscore: {}'.format(fscore))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print("accuracy : ",accuracy_score(y_test, y_pred))

print('support: {}'.format(support))

fscore: [0.97263158 0.75471698]
precision: [0.97468354 0.74074074]
recall: [0.97058824 0.76923077]
accuracy :  0.9507575757575758
support: [1190  130]


## real distribution , all features

In [52]:
x_train = train_realdist[features_all].values 


y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[features_all].values
y_test = test_realdist[labels].values.ravel()

In [53]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4030, 10), (4030,), (450, 10), (450,))

In [54]:
LR = LogisticRegression(random_state=1, solver='lbfgs', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [55]:
y_pred = LR.predict(x_test)

In [56]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9307692307692307
precision :  0.968
recall :  0.8962962962962963
accuracy :  0.96
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       315
           1       0.97      0.90      0.93       135

    accuracy                           0.96       450
   macro avg       0.96      0.94      0.95       450
weighted avg       0.96      0.96      0.96       450

[[311   4]
 [ 14 121]]


## balanced , all features

In [202]:
x_train = train_balanced[features_all].values 


y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[features_all].values
y_test = test_balanced[labels].values.ravel()

In [203]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((2417, 10), (2417,), (271, 10), (271,))

In [204]:
LR = LogisticRegression(random_state=1, solver='lbfgs', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [205]:
y_pred = LR.predict(x_test)

In [206]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9461538461538462
precision :  0.984
recall :  0.9111111111111111
accuracy :  0.948339483394834
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       136
           1       0.98      0.91      0.95       135

    accuracy                           0.95       271
   macro avg       0.95      0.95      0.95       271
weighted avg       0.95      0.95      0.95       271

[[134   2]
 [ 12 123]]


## try an SVM classifier for all features

In [223]:
svclassifier = SVC(kernel='linear')
svclassifier.fit(x_train, y_train)

SVC(kernel='linear')

In [224]:
y_pred = svclassifier.predict(x_test)

In [225]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9498069498069498
precision :  0.9919354838709677
recall :  0.9111111111111111
accuracy :  0.9520295202952029
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       136
           1       0.99      0.91      0.95       135

    accuracy                           0.95       271
   macro avg       0.96      0.95      0.95       271
weighted avg       0.96      0.95      0.95       271

[[135   1]
 [ 12 123]]


## features_logits_nocosine

In [62]:
x_train = train_alldata[features_logits_nocosine].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_logits_nocosine].values
y_test = test_alldata[labels].values.ravel()

In [63]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11857, 8), (11857,), (1320, 8), (1320,))

In [91]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [92]:
y_pred = LR.predict(x_test)

In [93]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.7547169811320754
precision :  0.7692307692307693
recall :  0.7407407407407407
accuracy :  0.9507575757575758
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1185
           1       0.77      0.74      0.75       135

    accuracy                           0.95      1320
   macro avg       0.87      0.86      0.86      1320
weighted avg       0.95      0.95      0.95      1320

[[1155   30]
 [  35  100]]


## real dist

In [95]:
x_train = train_realdist[features_logits_nocosine].values 


y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[features_logits_nocosine].values
y_test = test_realdist[labels].values.ravel()

In [96]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [97]:
y_pred = LR.predict(x_test)

In [98]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9302325581395349
precision :  0.975609756097561
recall :  0.8888888888888888
accuracy :  0.96
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       315
           1       0.98      0.89      0.93       135

    accuracy                           0.96       450
   macro avg       0.96      0.94      0.95       450
weighted avg       0.96      0.96      0.96       450

[[312   3]
 [ 15 120]]


## balanced split

In [122]:
x_train = train_balanced[features_logits_nocosine].values 


y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[features_logits_nocosine].values
y_test = test_balanced[labels].values.ravel()

In [123]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((2417, 8), (2417,), (271, 8), (271,))

In [124]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [125]:
y_pred = LR.predict(x_test)

In [126]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9461538461538462
precision :  0.984
recall :  0.9111111111111111
accuracy :  0.948339483394834
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       136
           1       0.98      0.91      0.95       135

    accuracy                           0.95       271
   macro avg       0.95      0.95      0.95       271
weighted avg       0.95      0.95      0.95       271

[[134   2]
 [ 12 123]]


## Features with no logits but with cos sims, all data

In [159]:
x_train = train_alldata[features_nologits].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_nologits].values
y_test = test_alldata[labels].values.ravel()

In [160]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11857, 9), (11857,), (1320, 9), (1320,))

In [161]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [162]:
y_pred = LR.predict(x_test)

In [163]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.6
precision :  0.6857142857142857
recall :  0.5333333333333333
accuracy :  0.9272727272727272
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1185
           1       0.69      0.53      0.60       135

    accuracy                           0.93      1320
   macro avg       0.82      0.75      0.78      1320
weighted avg       0.92      0.93      0.92      1320

[[1152   33]
 [  63   72]]


## real dist

In [175]:
x_train = train_realdist[features_nologits].values 
y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[features_nologits].values
y_test = test_realdist[labels].values.ravel()

In [176]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4030, 9), (4030,), (450, 9), (450,))

In [177]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [178]:
LR2 = LogisticRegression(random_state=1).fit(x_train, y_train)

In [179]:
y_pred = LR.predict(x_test)

In [180]:
y_pred2 = LR2.predict(x_test)

In [181]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8498168498168497
precision :  0.8405797101449275
recall :  0.8592592592592593
accuracy :  0.9088888888888889
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       315
           1       0.84      0.86      0.85       135

    accuracy                           0.91       450
   macro avg       0.89      0.89      0.89       450
weighted avg       0.91      0.91      0.91       450

[[293  22]
 [ 19 116]]


In [182]:
print("f1-score : ", f1_score(y_test, y_pred2 ))
print("precision : ",precision_score(y_test, y_pred2))
print("recall : ",recall_score(y_test, y_pred2 )) 
print("accuracy : ",accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

f1-score :  0.8498168498168497
precision :  0.8405797101449275
recall :  0.8592592592592593
accuracy :  0.9088888888888889
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       315
           1       0.84      0.86      0.85       135

    accuracy                           0.91       450
   macro avg       0.89      0.89      0.89       450
weighted avg       0.91      0.91      0.91       450

[[293  22]
 [ 19 116]]


## balanced

In [183]:
x_train = train_balanced[features_nologits].values 
y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[features_nologits].values
y_test = test_balanced[labels].values.ravel()

In [184]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((2417, 9), (2417,), (271, 9), (271,))

In [185]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [186]:
y_pred = LR.predict(x_test)

In [187]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9259259259259259
precision :  0.9259259259259259
recall :  0.9259259259259259
accuracy :  0.9261992619926199
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       136
           1       0.93      0.93      0.93       135

    accuracy                           0.93       271
   macro avg       0.93      0.93      0.93       271
weighted avg       0.93      0.93      0.93       271

[[126  10]
 [ 10 125]]


## only edit_features

In [188]:
## alldata

In [189]:
x_train = train_alldata[edit_features].values 
y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[edit_features].values
y_test = test_alldata[labels].values.ravel()

In [192]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11857, 7), (11857,), (1320, 7), (1320,))

In [194]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [195]:
y_pred = LR.predict(x_test)

In [196]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.5975103734439834
precision :  0.6792452830188679
recall :  0.5333333333333333
accuracy :  0.9265151515151515
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1185
           1       0.68      0.53      0.60       135

    accuracy                           0.93      1320
   macro avg       0.81      0.75      0.78      1320
weighted avg       0.92      0.93      0.92      1320

[[1151   34]
 [  63   72]]


In [197]:
# real dist

x_train = train_realdist[edit_features].values 
y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[edit_features].values
y_test = test_realdist[labels].values.ravel()

In [198]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4030, 7), (4030,), (450, 7), (450,))

In [199]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [200]:
y_pred = LR.predict(x_test)

In [201]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8581818181818183
precision :  0.8428571428571429
recall :  0.8740740740740741
accuracy :  0.9133333333333333
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       315
           1       0.84      0.87      0.86       135

    accuracy                           0.91       450
   macro avg       0.89      0.90      0.90       450
weighted avg       0.91      0.91      0.91       450

[[293  22]
 [ 17 118]]


In [226]:

x_train = train_balanced[edit_features].values 
y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[edit_features].values
y_test = test_balanced[labels].values.ravel()

In [227]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((2417, 7), (2417,), (271, 7), (271,))

In [230]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [231]:
y_pred = LR.predict(x_test)

In [232]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.929368029739777
precision :  0.9328358208955224
recall :  0.9259259259259259
accuracy :  0.9298892988929889
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       136
           1       0.93      0.93      0.93       135

    accuracy                           0.93       271
   macro avg       0.93      0.93      0.93       271
weighted avg       0.93      0.93      0.93       271

[[127   9]
 [ 10 125]]


# For English-French pairs 

In [235]:
train_alldata

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,...,Hamming Feature Distance Div Maxlen,Weighted Feature Distance Div Maxlen,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,label,label_bin,Unnamed: 0.1.1.1,DNNlogits_modelpredicted,m-bert_cosim,xlm_cosim
0,0,2485,maquis,maquis,maquis*,maki,maquis,maquis,0.571429,0.333333,...,0.340278,2.500000,0.340278,0,loan,1,,4.269493,1.000000,1.000000
1,1,7720,fin de siècle,dépréciation,fɪn də siècle*,depʀesjasjɔ̃,end of century,depreciation,0.928571,0.454545,...,0.159091,2.193182,0.142045,13,hard_negative,0,,-13.580923,0.365554,0.348418
2,2,3292,roman à thèse,roman à thèse,ˈroʊmən à* thèse*,ʀɔman a tə̀s,thesis novel,thesis novel,0.647059,0.400000,...,0.291667,2.156250,0.284722,0,loan,1,,4.820937,1.000000,0.999999
3,3,2151,hausse,hausse,hausse*,os,rise,rise,0.857143,0.666667,...,0.680556,5.000000,0.680556,0,loan,1,,4.353779,1.000000,1.000000
4,4,4628,enfile,aspirant,enfile*,aspiʀɑ̃,put on,aspirant,0.857143,0.166667,...,0.118056,1.583333,0.104167,7,hard_negative,0,,-5.874206,0.418582,0.607021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14871,14871,795,chichi,effigie,ˈʧiʧi,ɛfiʒi,father,effigy,0.600000,0.600000,...,0.600000,4.350000,0.600000,6,hard_negative,0,,-4.745490,0.487566,0.679311
14872,14872,9423,lorgnon,jornel,lorgnon*,ʒɔʀnəl,lorgnon,jornel,0.875000,0.333333,...,0.111111,1.354167,0.097222,4,hard_negative,0,,-4.068455,0.500577,0.711260
14873,14873,5524,pas de chat,pastillage,pɑz də ʧæt,pastijaʒ,no cat,pastillage,0.900000,0.125000,...,0.182292,1.468750,0.179688,8,hard_negative,0,,-4.408354,0.628722,0.582361
14874,14874,9510,publicist,conclaviste,ˈpəblɪsɪst,kɔ̃klavist,publicist,conclaved,0.700000,0.333333,...,0.092593,0.916667,0.087963,7,hard_negative,0,,-5.373606,0.392996,0.804365


## alldata English-French, all features :

In [242]:
x_train = train_alldata[features_all].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_all].values
y_test = test_alldata[labels].values.ravel()

In [243]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((14876, 10), (14876,), (1655, 10), (1655,))

In [244]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [245]:
y_pred = LR.predict(x_test)

In [246]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.7777777777777778
precision :  0.8061797752808989
recall :  0.7513089005235603
accuracy :  0.9009063444108761
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1273
           1       0.81      0.75      0.78       382

    accuracy                           0.90      1655
   macro avg       0.87      0.85      0.86      1655
weighted avg       0.90      0.90      0.90      1655

[[1204   69]
 [  95  287]]


In [282]:
x_train = train_realdist[features_all].values 


y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[features_all].values
y_test = test_realdist[labels].values.ravel()

In [283]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [284]:
y_pred = LR.predict(x_test)

In [285]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8863309352517985
precision :  0.9840255591054313
recall :  0.806282722513089
accuracy :  0.8447937131630648
              precision    recall  f1-score   support

           0       0.62      0.96      0.76       127
           1       0.98      0.81      0.89       382

    accuracy                           0.84       509
   macro avg       0.80      0.88      0.82       509
weighted avg       0.89      0.84      0.85       509

[[122   5]
 [ 74 308]]


In [286]:
x_train = train_balanced[features_all].values 


y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[features_all].values
y_test = test_balanced[labels].values.ravel()

In [287]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [288]:
y_pred = LR.predict(x_test)

In [289]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9028571428571429
precision :  0.9937106918238994
recall :  0.8272251308900523
accuracy :  0.8440366972477065
              precision    recall  f1-score   support

           0       0.44      0.96      0.60        54
           1       0.99      0.83      0.90       382

    accuracy                           0.84       436
   macro avg       0.72      0.90      0.75       436
weighted avg       0.93      0.84      0.87       436

[[ 52   2]
 [ 66 316]]


## cosims and plain lev as the only features, give high F scores which signifies the importance of plain lev and cosine sim for eng-french pairs

## all data 

In [268]:
x_train = train_alldata[features_plainLV_cosim].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_plainLV_cosim].values
y_test = test_alldata[labels].values.ravel()

In [269]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((14876, 3), (14876,), (1655, 3), (1655,))

In [270]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [271]:
y_pred = LR.predict(x_test)

In [272]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8659517426273459
precision :  0.8873626373626373
recall :  0.8455497382198953
accuracy :  0.9395770392749244
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1273
           1       0.89      0.85      0.87       382

    accuracy                           0.94      1655
   macro avg       0.92      0.91      0.91      1655
weighted avg       0.94      0.94      0.94      1655

[[1232   41]
 [  59  323]]


## real dist

In [273]:
## real dist
x_train = train_realdist[features_plainLV_cosim].values 


y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[features_plainLV_cosim].values
y_test = test_realdist[labels].values.ravel()

In [274]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [275]:
y_pred = LR.predict(x_test)

In [276]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9414965986394558
precision :  0.9801699716713881
recall :  0.9057591623036649
accuracy :  0.9155206286836935
              precision    recall  f1-score   support

           0       0.77      0.94      0.85       127
           1       0.98      0.91      0.94       382

    accuracy                           0.92       509
   macro avg       0.87      0.93      0.89       509
weighted avg       0.93      0.92      0.92       509

[[120   7]
 [ 36 346]]


## Balanced

In [277]:
x_train = train_balanced[features_plainLV_cosim].values 


y_train = train_balanced[labels].values.ravel()
x_test = test_balanced[features_plainLV_cosim].values
y_test = test_balanced[labels].values.ravel()

In [278]:
LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [279]:
y_pred = LR.predict(x_test)

In [280]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.9525101763907734
precision :  0.9887323943661972
recall :  0.918848167539267
accuracy :  0.9197247706422018
              precision    recall  f1-score   support

           0       0.62      0.93      0.74        54
           1       0.99      0.92      0.95       382

    accuracy                           0.92       436
   macro avg       0.80      0.92      0.85       436
weighted avg       0.94      0.92      0.93       436

[[ 50   4]
 [ 31 351]]


## features_nologits for english french 

In [310]:
x_train = train_alldata[features_nologits].values 


y_train = train_alldata[labels].values.ravel()
x_test = test_alldata[features_nologits].values
y_test = test_alldata[labels].values.ravel()

In [311]:
 LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [312]:
y_pred = LR.predict(x_test)

In [313]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8633288227334236
precision :  0.8935574229691877
recall :  0.8350785340314136
accuracy :  0.9389728096676737
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1273
           1       0.89      0.84      0.86       382

    accuracy                           0.94      1655
   macro avg       0.92      0.90      0.91      1655
weighted avg       0.94      0.94      0.94      1655

[[1235   38]
 [  63  319]]


In [325]:
x_train = train_realdist[
features_cosim_logits].values 


y_train = train_realdist[labels].values.ravel()
x_test = test_realdist[
features_cosim_logits].values
y_test = test_realdist[labels].values.ravel()

In [326]:
 LR = LogisticRegression(random_state=1, solver='lbfgs',penalty = 'l2', multi_class='ovr', max_iter=500 ).fit(x_train, y_train)

In [327]:
y_pred = LR.predict(x_test)

In [328]:
print("f1-score : ", f1_score(y_test, y_pred ))
print("precision : ",precision_score(y_test, y_pred))
print("recall : ",recall_score(y_test, y_pred )) 
print("accuracy : ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

f1-score :  0.8843930635838151
precision :  0.9870967741935484
recall :  0.8010471204188482
accuracy :  0.8428290766208252
              precision    recall  f1-score   support

           0       0.62      0.97      0.75       127
           1       0.99      0.80      0.88       382

    accuracy                           0.84       509
   macro avg       0.80      0.88      0.82       509
weighted avg       0.90      0.84      0.85       509

[[123   4]
 [ 76 306]]
