Assumes you have run `Train_Testset.ipynb` first to make the `alldata`, `realdist`, and `balanced` train/test splits for the chosen language pair.

# Imports and setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import panphon
import panphon.distance
import editdistance # levenshtein
import epitran
import eng_to_ipa as eng
from epitran.backoff import Backoff
from googletrans import Translator
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
epitran.download.cedict()

In [1]:
import torch
import os
import numpy as np

import argparse
from tqdm import tqdm
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import os
import torch
from torch import nn
from torch import nn, optim
import torch.nn.functional as F
# sys.argv = ['']
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
#from evaluations.eval import *
import os
import copy
import configparser
config_model = configparser.ConfigParser()
config_model.read('model_parameters.ini')
from collections import defaultdict
%matplotlib inline

In [2]:
import sklearn as sk
import sklearn.linear_model
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import numpy as np 
import io
import requests
import csv

In [3]:
# transformer specific imports, and cross encoders for our customized transformer models 
import sys
import json
import torch
from torch import nn
from transformers import *
from bert_stuff import *
from cognate_encoders.cross_encoder_assamese import FullCrossEncoder, FullCrossEncoderSingle, FullCrossEncoderSingle_muril

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,\
    BertForSequenceClassification, BertForPreTraining, AutoModel
from transformers import XLMTokenizer, XLMWithLMHeadModel, XLMModel

from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, mean_squared_error
import time
from transformers import XLMTokenizer, XLMWithLMHeadModel

In [4]:
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    
#device = torch.device("cuda:0:3" if torch.cuda.is_available() else "cpu") ## specify the GPU id's, GPU id's start from 0.

Using device: cuda

NVIDIA GeForce RTX 3090
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())

NVIDIA GeForce RTX 3090


# DNN Definition

In [6]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_features):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
            
        )
        self.dropout = nn.Dropout(0.1) 

    def forward(self, x):
        logits_new = self.linear_relu_stack(x)
        logits  = self.dropout(logits_new)
        
        return torch.sigmoid(logits), logits_new
    
    def fit(self, X_train, Y_train, X_val, Y_val, criterion, optimizer, n_epochs=5000):
        train_losses = []
        val_losses = []
        train_accur = []
        val_accur = []

        for epoch in range(n_epochs):
            y_pred, logits = self(X_train.float())

            train_loss = criterion(y_pred, Y_train.float())

            if epoch % (n_epochs // 50) == 0:
                train_acc,_ = self.calculate_accuracy(Y_train, y_pred)

                y_val_pred = self(X_val.float())[0]

                val_loss = criterion(y_val_pred, Y_val.float())

                val_acc, total_corr = self.calculate_accuracy(Y_val, y_val_pred)

                print(f'''epoch {epoch}
                    Train set - loss: {self.round_tensor(train_loss)}, accuracy: {self.round_tensor(train_acc)} 
                    Val set - loss: {self.round_tensor(val_loss)}, accuracy: {self.round_tensor(val_acc)}''')
                
                train_losses.append(train_loss.detach().cpu().numpy())
                val_losses.append(val_loss.detach().cpu().numpy())

                val_accur.append(val_acc.detach().cpu().numpy())
                train_accur.append(train_acc.detach().cpu().numpy())

            optimizer.zero_grad()

            train_loss.backward()

            optimizer.step()
            
        return train_losses,val_losses,train_accur,val_accur
    
    def calculate_accuracy(self, y_true, y_pred):
        predicted = y_pred.ge(.5) 
        return ((y_true == predicted).sum().float() / len(y_true), (y_true == predicted).sum())
    
    def round_tensor(self, t, decimal_places=3):
        return round(t.item(), decimal_places)
    
    def plot_losses(self, train_losses, val_losses, train_accur, val_accur):
        epochs = range(1, len(train_accur) + 1)

        plt.plot(epochs, train_accur, 'bo', label='Training acc')
        plt.plot(epochs, val_accur, 'b', label='Vaidation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()

        plt.plot(epochs, train_losses, 'bo', label='Training loss')
        plt.plot(epochs, val_losses, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()

# MyDataset definition

In [7]:
# Overriding the Dataset class required for the use of PyTorch's data loader classes.
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, l1_encodings, l2_encodings):
        self.l1_encodings = l1_encodings
        self.l2_encodings = l2_encodings

    def __getitem__(self, idx):
        item = {('l1_' + key): torch.tensor(val[idx]) for key, val in self.l1_encodings.items()}
        item2 = {('l2_' + key): torch.tensor(val[idx]) for key, val in self.l2_encodings.items()}
        item.update(item2)
        # item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.l1_encodings['attention_mask'])

# Download LMs

In [8]:
# xlm_tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
# xlm_model = XLMModel.from_pretrained("xlm-mlm-100-1280")

In [9]:
# xlm_model

# Pipeline function definitions

## Get Panphon phonetic features

In [10]:
def get_panphon_features(train_set, test_set):
    #get phonetic features using PanPhon
    ft = panphon.FeatureTable()   
    
    train_set['features_loan'] = train_set.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
    train_set['features_orig'] = train_set.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)
    test_set['features_loan'] = test_set.apply(lambda x:ft.word_to_vector_list(x["loan_word_epitran"],numeric=True ), axis=1)
    test_set['features_orig'] = test_set.apply(lambda x:ft.word_to_vector_list(x["original_word_epitran"],numeric=True ), axis=1)

    train_set['features_loan'] = train_set['features_loan'].apply(lambda x:sum(x, []))
    train_set['features_orig'] = train_set['features_orig'].apply(lambda x:sum(x, []))
    test_set['features_orig'] = test_set['features_orig'].apply(lambda x:sum(x, []))
    test_set['features_loan'] = test_set['features_loan'].apply(lambda x:sum(x, []))

In [11]:
def pad_panphon_features(train_set, test_set, maxlen, verbose=False):
    # Pad the phonetic features of the loan word and original word out to the maxlen 
    # of the features appearing in the training set (format: `<loan><pad 0s><orig><pad 0s>`).
    train_set['features_loan'] = train_set['features_loan'].apply(lambda x: \
                                    np.pad(x,\
                                    (0,maxlen[0]-len(x)), 'constant'))
    train_set['features_orig'] = train_set['features_orig'].apply(lambda x: \
                                    np.pad(x,\
                                    (0,maxlen[1]-len(x)), 'constant'))
    test_set['features_loan'] = test_set['features_loan'].apply(lambda x: \
                                    np.pad(x,\
                                    (0,maxlen[0]-len(x)), 'constant'))
    test_set['features_orig'] = test_set['features_orig'].apply(lambda x: \
                                    np.pad(x,\
                                    (0,maxlen[1]-len(x)), 'constant'))
    
    if verbose:
        print("Sample train features:\n",\
                train_set['features_loan'][np.random.randint(len(train_set['features_loan']))],\
                train_set['features_orig'][np.random.randint(len(train_set['features_loan']))])

        print("Sample test features:\n",\
                test_set['features_loan'][np.random.randint(len(test_set['features_loan']))],\
                test_set['features_orig'][np.random.randint(len(test_set['features_orig']))])
    

## Add target labels and make train and validation sets

In [12]:
def add_target_labels(train_set, test_set):
    Y_train = np.array([y for y in train_set['label_bin']])
    Y_test = np.array([y for y in test_set['label_bin']])
    return Y_train, Y_test

Make a validation split for training the DNN model

In [13]:
def make_train_val_set(train_set, test_set, Y_train):
    X_train = np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                np.array([x for x in train_alldata['features_orig']])])
    X_test = np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                np.array([x for x in test_alldata['features_orig']])])
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2,\
                                                      random_state=1, stratify=Y_train)
    return X_train, X_val, X_test, Y_train, Y_val

Make tensors

In [14]:
def make_tensors(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    X_train = torch.tensor(X_train).to(device)
    Y_train = torch.tensor(Y_train).to(device).reshape((-1,1))

    X_val = torch.tensor(X_val).to(device)
    Y_val = torch.tensor(Y_val).to(device).reshape((-1,1))
    
    X_test = torch.tensor(X_test).to(device)
    Y_test = torch.tensor(Y_test).to(device).reshape((-1,1))
    
    return X_train, Y_train, X_val, Y_val, X_test, Y_test

## Get cosine similarities

MBERT

In [15]:
PRE_TRAINED_bert_MODEL = 'bert-base-multilingual-cased'
PRE_TRAINED_xlm_MODEL = 'xlm-mlm-100-1280'

MAXTOKENS = 5
BS = 8
all_vec= defaultdict(dict)  #stores the model embeddings  for each combination of models 
all_mapped = defaultdict(dict) 
cos_mapped = defaultdict(dict) 
# stores the linearly transformed model embeddings  for each combination of models 

In [16]:
def get_cosine_similarities(a,b):
    """
    Calculate the cosine similarity as row wise dot product of 2 arrays of vectors

    Parameters
    ----------
    
    a,b : Two arrays/tensors

    Returns
    -------
    list
    """
    def normed(a):
        return a/np.linalg.norm(a, axis=1).reshape((-1, 1))
     
    lhs = a
    rhs = b
 

    return np.sum(normed(lhs) * normed(rhs), axis=1) 

In [17]:
def get_mbert_cos_sims(l1_data,l2_data):
    l1 = []
    l2 = []
    with torch.no_grad():
        tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_bert_MODEL)
        tokenizer.model_max_length = MAXTOKENS
        l1_encodings = tokenizer(l1_data, truncation=False, padding=True, max_length=MAXTOKENS)
        l2_encodings = tokenizer(l2_data, truncation=False, padding=True, max_length=MAXTOKENS)
        
        dataset = MyDataset(l1_encodings, l2_encodings)
        
        data_loader = DataLoader(dataset, batch_size=BS, shuffle=False)  # shuffle False for reproducibility
        
        base_model = BertModel.from_pretrained(PRE_TRAINED_bert_MODEL).to(device)
        base_model.eval()
        cos_s = torch.nn.CosineSimilarity()
        
        sim_lst = []
#         l1 = base_model(input_ids = l1_encodings['input_ids'],
#                                         attention_mask = l1_encodings['attention_mask']).last_hidden_state[:, 0, :]
#         l2 = base_model(input_ids = l2_encodings['input_ids'],attention_mask = 
#                                           l2_encodings['attention_mask']
#                                         ).last_hidden_state[:, 0, :]
        
        #loop through dataset 
        for step, batch in enumerate(data_loader):
            l1_vector = base_model(batch['l1_input_ids'].to(device),
                                          attention_mask=batch['l1_attention_mask'].to(device),
                                          return_dict=True).last_hidden_state[:, 0, :]
            l2_vector = base_model(batch['l2_input_ids'].to(device),
                                          attention_mask=batch['l2_attention_mask'].to(device),
                                          return_dict=True).last_hidden_state[:, 0, :]
            
            l1.extend(l1_vector.data.cpu().numpy())
            l2.extend(l2_vector.data.cpu().numpy())
            
            
            
            
            sims = cos_s(l1_vector, l2_vector).data.cpu().numpy()
            sim_lst.extend(list(sims))
            if (step * BS) % 100 < BS:
                print("Got {}".format(len(sim_lst)))
        all_vec['mbert']['loan'] = np.array(l1)
        all_vec['mbert']['original'] = np.array(l2)
        print()
                
    return sim_lst, all_vec

XLM-100

In [18]:
def get_xlm_cos_sims(l1_data,l2_data):
    with torch.no_grad():
        tokenizer = XLMTokenizer.from_pretrained(PRE_TRAINED_xlm_MODEL)
        tokenizer.model_max_length = MAXTOKENS 
        l1_encodings = tokenizer(l1_data, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask=True)
        l2_encodings = tokenizer(l2_data, truncation=False, padding=True, max_length=MAXTOKENS, return_tensors="pt", return_special_tokens_mask=True)

        dataset = MyDataset(l1_encodings, l2_encodings)

        data_loader = DataLoader(dataset, batch_size=BS, shuffle=False)  # shuffle False for reproducibility

        
        base_model = XLMModel.from_pretrained(PRE_TRAINED_xlm_MODEL).to(device)
        
        base_model.eval()
        cos_s = torch.nn.CosineSimilarity()
        
        sim_lst = []
        l1 = []
        l2 = []

        #loop through dataset 
        for step, batch in enumerate(data_loader):
            
            l1_vector = base_model(batch['l1_input_ids'].to(device), output_hidden_states=False).last_hidden_state[:, 0, :]
            l2_vector = base_model(batch['l2_input_ids'].to(device), output_hidden_states=False).last_hidden_state[:, 0, :]
            l1.extend(l1_vector.data.cpu().numpy())
            l2.extend(l2_vector.data.cpu().numpy())
            sims = cos_s(l1_vector,l2_vector).data.cpu().numpy()
            sim_lst.extend(list(sims))
            if (step * BS) % 100 < BS:
                print("Got {}".format(len(sim_lst)))
            
        all_vec['xlm']['loan'] = np.array(l1)
        all_vec['xlm']['original'] = np.array(l2)
        
        print()
                
    return sim_lst , all_vec

In [20]:
def get_ass_albert_cos_sims(l1_data,l2_data, lin_transform =False):  #only assamese, monolingual, albert 
    
    with torch.no_grad():
        
        if lin_transform ==True:
                
            tokenizer_mbert = BertTokenizer.from_pretrained(PRE_TRAINED_bert_MODEL)
            tokenizer_assbert = AutoTokenizer.from_pretrained(config_model['model_ass_bert']['token_path'] )
            

            #tokenizer.model_max_length = MAXTOKENS 
            l1_encodings = tokenizer_assbert(l1_data,padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)
            l2_encodings = tokenizer_assbert(l2_data, padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)

            dataset = MyDataset(l1_encodings, l2_encodings)

            data_loader = DataLoader(dataset, batch_size=BS, shuffle=False)  # shuffle False for reproducibility


            source_model = AutoModel.from_pretrained(config_model['model_ass_bert']['model_name'] ).to(device)
            target_model = BertModel.from_pretrained(PRE_TRAINED_bert_MODEL).to(device) #do not use this when working with the transformation matrix
             
            target_model.eval()
            
            cos_s = torch.nn.CosineSimilarity()

            sim_lst = []
            l1_new = []
            l2_new = []

            #loop through dataset 
            for step, batch in enumerate(data_loader):

                l1_vector = source_model(batch['l1_input_ids'].to(device), output_hidden_states=True).last_hidden_state #albert has only 6 encoder layers
                l2_vector = source_model(batch['l2_input_ids'].to(device), output_hidden_states=True).last_hidden_state
                l1_new.extend(l1_vector[:, 0, :].data.cpu().numpy() )
                l2_new.extend(l2_vector[:, 0, :].data.cpu().numpy() )
                
                l2_vector = l2_vector.cpu()
                
                l1  = l1_vector.sum(1).cpu().numpy()
                l2  = l2_vector.sum(1).cpu().numpy()
                 
                #get the linear transform here 
                
                mapper_ = sklearn.linear_model.Ridge(fit_intercept=False).fit(l1,l2) 
                mapper_ = mapper_.coef_
                mapped_= l1 @ mapper_.transpose()
                
                

                #sims = cos_s(l1_vector[:,0,:],l2_vector[:,0,:]).data.cpu().numpy()
                sims = cos_s(torch.tensor(mapped_),l2_vector[:,0,:]).data.cpu().numpy()
                sim_lst.extend(list(sims))
                if (step * BS) % 100 < BS:
                    print("Got {}".format(len(sim_lst)))
                    
                    
            all_vec['ass_bert']['loan'] = np.array(l1_new)
            all_vec['ass_bert']['original'] = np.array(l2_new)
            print()
                
    return sim_lst, all_vec

In [21]:
def Indic_bert_cos_sims(l1_data,l2_data, lin_transform =False ):  #only assamese, monolingual 
    
    with torch.no_grad():
        
        tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
        base_model = AutoModel.from_pretrained('ai4bharat/indic-bert').to(device)
        
        #tokenizer.model_max_length = MAXTOKENS 
        l1_encodings = tokenizer(l1_data,padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)
        l2_encodings = tokenizer(l2_data, padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)

        dataset = MyDataset(l1_encodings, l2_encodings)

        data_loader = DataLoader(dataset, batch_size=BS, shuffle=False)  # shuffle False for reproducibility

        
          
        
        base_model.eval()
        cos_s = torch.nn.CosineSimilarity()
        
        sim_lst = []
        l1 = []
        l2 = []
        if lin_transform ==False:
            

            #loop through dataset 
            for step, batch in enumerate(data_loader):

                l1_vector = base_model(batch['l1_input_ids'].to(device), output_hidden_states=True).last_hidden_state[:, 0, :]
                l2_vector = base_model(batch['l2_input_ids'].to(device), output_hidden_states=True).last_hidden_state[:, 0, :]
                l1.extend(l1_vector.data.cpu().numpy())
                l2.extend(l2_vector.data.cpu().numpy())
                sims = cos_s(l1_vector,l2_vector).data.cpu().numpy()
                sim_lst.extend(list(sims))
                if (step * BS) % 100 < BS:
                    print("Got {}".format(len(sim_lst)))
            print()
            all_vec['indic_bert']['loan'] = np.array(l1)
            all_vec['indic_bert']['original'] = np.array(l2)
    
            
                
    return sim_lst, all_vec

In [23]:
def Muril_cos_sims(l1_data,l2_data):  #only assamese, monolingual 
    
    with torch.no_grad():
        
        tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

        base_model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased").to(device)
        
        #tokenizer.model_max_length = MAXTOKENS 
        l1_encodings = tokenizer(l1_data,padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)
        l2_encodings = tokenizer(l2_data, padding='longest', return_tensors="pt", return_special_tokens_mask=True, add_special_tokens=True, truncation =False)

        dataset = MyDataset(l1_encodings, l2_encodings)

        data_loader = DataLoader(dataset, batch_size=BS, shuffle=False)  # shuffle False for reproducibility

        base_model.eval()
        cos_s = torch.nn.CosineSimilarity()
        
        sim_lst = []
        l1 = []
        l2 = []

        #loop through dataset 
        for step, batch in enumerate(data_loader):
            
            l1_vector = base_model(batch['l1_input_ids'].to(device), output_hidden_states =True).hidden_states[12][:, 0, :]
            l2_vector = base_model(batch['l2_input_ids'].to(device), output_hidden_states =True).hidden_states[12][:, 0, :]
            l1.extend(l1_vector.data.cpu().numpy())
            l2.extend(l2_vector.data.cpu().numpy())
            sims = cos_s(l1_vector,l2_vector).data.cpu().numpy()
            sim_lst.extend(list(sims))
            if (step * BS) % 100 < BS:
                print("Got {}".format(len(sim_lst)))
        all_vec['muril']['loan'] = np.array(l1)
        all_vec['muril']['original'] = np.array(l2)
        print()
        
                
    return sim_lst, all_vec

In [24]:
%store -r beng_m_bertvec #loading the transformation matrix from the 339 sample equivalent sentences 
%store -r ass_bertvec

no stored variable or alias #loading
no stored variable or alias the
no stored variable or alias transformation
no stored variable or alias matrix
no stored variable or alias from
no stored variable or alias the
no stored variable or alias 339
no stored variable or alias sample
no stored variable or alias equivalent
no stored variable or alias sentences


In [28]:
beng_m_bertvec.shape, ass_bertvec.shape


((339, 768), (339, 768))

In [31]:

ben_ass_train = pd.read_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Bengali-Assamese-train_production_alldata.csv', index_col=False)
ben_ass_test = pd.read_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Bengali-Assamese-test_production_alldata.csv', index_col=False)

ass_ben_train = pd.read_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Assamese-Bengali-train_production_alldata.csv', index_col=False)
ass_ben_test = pd.read_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Assamese-Bengali-test_production_alldata.csv', index_col=False)

In [32]:
# get cosine sims for ben_ass_train


mbert_cos, all_vec = get_mbert_cos_sims(list(ben_ass_train['loan_word']),list(ben_ass_train['original_word']))
xlm_cos,all_vec = get_xlm_cos_sims(list(ben_ass_train['loan_word']),list(ben_ass_train['original_word']))
ass_bert_cos, all_vec = get_ass_albert_cos_sims(list(ben_ass_train['original_word']),list(ben_ass_train['loan_word']), lin_transform =True)
indic_bert_cos, all_vec = Indic_bert_cos_sims(list(ben_ass_train['loan_word']),list(ben_ass_train['original_word']))
muril_cos, all_vec = Muril_cos_sims(list(ben_ass_train['loan_word']),list(ben_ass_train['original_word']))
 





loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/vocab.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/fcfbd016a54060fb3e7290d39ac4fd04b0c7ca2c683e5fdd87928b0feaa2c367.bd20142f530c7b681cef79e2153e77f8d9e8c9fdb3f6db29f37298198166236d
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/merges.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/0693c49aba9ba31f442ba9a6c368bc400d2a81e5931d983d63d8648a043bf551.a1730275bc49c3d660f1d9bf50222d8cb849f3ae93e75e5865a3037650459b27
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/91dad0f1b901ee17a1ca86293a46619408e812441cfa380050cd03359564b95f.fabc8

loading configuration file https://huggingface.co/xlm-mlm-100-1280/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/db80884313abeddd782425fe61c0846a43c007b0f403d13181e368ad7dd52f62.394eed9fa7b8205161e0b47bf552ae933e7e29234009f217a03b87bfe054ed52
Model config XLMConfig {
  "accumulate_gradients": 4,
  "ae_steps": [],
  "amp": 2,
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "batch_size": 16,
  "beam_size": 1,
  "bos_index": 0,
  "bos_token_id": 0,
  "bptt": 256,
  "bt_src_langs": [],
  "bt_steps": [],
  "causal": false,
  "clip_grad_norm": 1.0,
  "clm_steps": [],
  "command": "python /private/home/aconneau/workdir/xlm_17_100_big.3/2019_08_10_19_23_42/train.py --n_heads 16 --bt_steps '' --max_vocab 200000 --word_mask_keep_rand '0.8,0.1,0.1' --use_lang_emb false --data_path '/private/home/aconneau/projects/XLM/data/wiki/100/175k' --save_periodic 0 --max_len 200 --bptt 256 --ae_steps '' --fp1

loading weights file https://huggingface.co/xlm-mlm-100-1280/resolve/main/pytorch_model.bin from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/5ba8c43e22aeb247eee7a4d27014eabe361cbfd2a9e00bd337946aaa0dece8ce.f11dc7ef5f30811d164556022e3174da0d6e8bb13676833dabdb1f50f61f39a5
Some weights of the model checkpoint at xlm-mlm-100-1280 were not used when initializing XLMModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of XLMModel were initialized from the model checkpoint at xlm-mlm-100-1280.
If your task 

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 712


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 808


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 912


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1008


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
Could not locate the tokenizer configuration file, will try to use the model config instead.





loading configuration file https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/2d290a1a22a5f80e173def8b2f31f12d68a957542e6769ab06bfc3de06bc49f4.06ba3893e888d6ff1388c45cdbee1fb785542ae22b70ff159f55da323230a159
Model config AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transfo

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading configuration file https://huggingface.co/google/muril-base-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/d8ca6ce642f067ecf3d1163f4d2903b471287613933f2857ca8307e500bc7645.aff1657f5771205f5a0c6cb4816f125ee5f2f2d62dbf27e6b9fee30b0ebbf0f5
Model config BertConfig {
  "_name_or_path": "google/muril-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

loading file https://huggingface.co/googl

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



In [33]:
for i, j in all_vec.items():
    print(i,j['loan'].shape, j['original'].shape, type(j['original']))
 

mbert (1648, 768) (1648, 768) <class 'numpy.ndarray'>
xlm (1648, 1280) (1648, 1280) <class 'numpy.ndarray'>
ass_bert (1648, 768) (1648, 768) <class 'numpy.ndarray'>
indic_bert (1648, 768) (1648, 768) <class 'numpy.ndarray'>
muril (1648, 768) (1648, 768) <class 'numpy.ndarray'>


In [34]:
all_vec['ass_bert']['original'].shape,all_vec['mbert']['loan'].shape


((1648, 768), (1648, 768))

In [35]:
# get the linear transformation from mbert space to ass_bert space using the precalculated transformation matrix i.e beng_m_bertvec, ass_bertvec

m = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec, ass_bertvec) 
m = m.coef_ 
print(m.shape) #this is the 768 by 768 transformation matrix between mbert and assbert 
mapped_= all_vec['mbert']['loan'] @ m.transpose()
beng_ass_cos_mapped = get_cosine_similarities(mapped_, all_vec['ass_bert']['original'] )


(768, 768)


In [40]:
#load the train production set and add the new cosim column
ben_ass_train = pd.read_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Bengali-Assamese-train_production_alldata.csv', index_col=False)

In [41]:
ben_ass_train['mbert_src_assbert_trg'] = beng_ass_cos_mapped

In [42]:
ben_ass_train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,...,ass_bertmuril,indic_bertmbert,indic_bertxlm,indic_bertass_bert,indic_bertmuril,murilmbert,murilxlm,murilass_bert,murilindic_bert,mbert_src_assbert_trg
0,0,0,0,0,180,180,পিন্ধা,আঠ,pin̪d̪ʱa,atʰ,...,0.999895,0.995502,0.987439,-0.036634,0.999794,0.998521,0.997388,-0.037025,0.999968,0.063041
1,1,1,1,1,615,615,পুষ্প,দোস্ত,puʃpɔ,dʊxtɔ,...,0.999765,0.995714,0.993861,-0.037131,0.999983,0.997182,0.998016,-0.036950,0.999891,-0.003760
2,2,2,2,2,336,336,জ্ঞান,শেঙুন,d͡ʑnan̪,xɛŋun,...,0.999856,0.996036,0.997111,-0.037456,0.999987,0.997383,0.997570,-0.036918,0.999913,0.129613
3,3,3,3,3,47,47,কালি,কালি,kali,kali,...,0.999915,0.997840,0.997473,-0.036673,0.999995,0.998325,0.998332,-0.037079,0.999947,0.032470
4,4,4,4,4,105,105,ছবি,ঠেং,t͡ɕʰɔbi,tʰɛŋ,...,0.999907,0.993710,0.992030,-0.037115,0.999976,0.998208,0.998024,-0.037072,0.999954,0.012763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1643,1643,1643,1643,1643,348,348,হার্ট,হিয়া,ɦarʈɔ,ɦija,...,0.999483,0.992731,0.987405,-0.037828,0.999939,0.997372,0.998031,-0.037065,0.999911,-0.060017
1644,1644,1644,1644,1644,136,136,পুষ্প,পাউণ্ড,puʃpɔ,paundɔ,...,0.999741,0.997582,0.998046,-0.036369,0.999993,0.998415,0.998395,-0.037102,0.999950,-0.003760
1645,1645,1645,1645,1645,37,37,কলা,ককা,kɔla,kɔka,...,0.999899,0.992143,0.981005,-0.037294,0.999865,0.997453,0.997853,-0.037073,0.999909,0.049187
1646,1646,1646,1646,1646,589,589,দিন,ডিম,d̪in̪,dim,...,0.999900,0.997719,0.994965,-0.037080,0.999971,0.998300,0.998274,-0.037104,0.999951,0.051215


In [43]:
def get_lt_maps(all_vec):
    #this function is for getting linear maps without using the transformation matrix from 339 sentences 
    
  
    
    for i, j in all_vec.items():
        #print(i,len(j['loan']) )


        for m,n in all_vec.items():
            if i ==m:
                continue


            print(i,m)
          

           #get linear maps for loans and originals separately    

            mapper_loan = sklearn.linear_model.Ridge(fit_intercept=False).fit(j['loan'],n['loan']) 
            #mapper_loan = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec,ass_bertvec) 
            mapper_loan = mapper_loan.coef_
            mapped_loan= j['loan'] @ mapper_loan.transpose()
            all_mapped[i+m]['loan']  = mapped_loan

            mapper_orig = sklearn.linear_model.Ridge(fit_intercept=False).fit(j['original'],n['original']) 
            #mapper_orig = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec,ass_bertvec) 
            mapper_orig = mapper_orig.coef_
            mapped_orig= j['original'] @ mapper_orig.transpose()
            all_mapped[i+m]['original']  = mapped_orig

            cos_mapped[i+m] = get_cosine_similarities(mapped_loan, mapped_orig ) #get the cosine sims for mapped embeddings
    return cos_mapped

 

    

In [35]:
cos_mapped = get_lt_maps(all_vec)
for i, j  in cos_mapped.items():
    print(i,j)
    
    ben_ass_train[i] = j

mbertxlm [0.6520988  0.88991714 0.71950054 ... 0.5739207  0.68108535 0.84755015]
mbertass_bert [-0.04072228 -0.03144859 -0.03459077 ... -0.03657508 -0.040287
 -0.03287696]
mbertindic_bert [0.9978991  0.99957275 0.9991308  ... 0.9979694  0.9989909  0.99966216]
mbertmuril [0.99839866 0.9986011  0.99890715 ... 0.9994731  0.999819   0.99949217]
xlmmbert [0.85940564 0.8999516  0.80412424 ... 0.90755445 0.8564594  0.9402543 ]
xlmass_bert [-0.04139591 -0.01836748 -0.03869876 ... -0.04617794 -0.04134841
 -0.03475533]
xlmindic_bert [0.99683017 0.99924695 0.9993081  ... 0.9966495  0.9988537  0.9997119 ]
xlmmuril [0.9977406  0.9966437  0.99824786 ... 0.99884045 0.99976546 0.9997694 ]
ass_bertmbert [0.9877634  0.9778743  0.9747802  ... 0.982127   0.98322934 0.9801762 ]
ass_bertxlm [0.95709693 0.95912945 0.95943815 ... 0.969867   0.9682539  0.96623474]
ass_bertindic_bert [0.99980676 0.9999196  0.99984217 ... 0.99959093 0.999913   0.9999382 ]
ass_bertmuril [0.9998949  0.9997655  0.9998558  ... 0.999

In [44]:
ben_ass_train.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'loan_word',
       'original_word', 'loan_word_epitran', 'original_word_epitran',
       'loan_english', 'original_english',
       'Fast Levenshtein Distance Div Maxlen',
       'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen',
       'Hamming Feature Distance Div Maxlen',
       'Weighted Feature Distance Div Maxlen',
       'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein',
       'loan_unicode', 'original_unicode', 'label', 'label_bin', 'DNN_logits',
       'mbert_cos', 'xlm_cos', 'ass_bert_mbert_cos', 'indic_bert_cos',
       'muril_cos', 'mbertxlm', 'mbertass_bert', 'mbertindic_bert',
       'mbertmuril', 'xlmmbert', 'xlmass_bert', 'xlmindic_bert', 'xlmmuril',
       'ass_bertmbert', 'ass_bertxlm', 'ass_bertindic_bert', 'ass_bertmuril',
       'indic_bertmbert', 'indic_bertxlm', 'indic_bertass_bert',
       'indic_b

In [78]:
ben_ass_train.loc[:,['mbertass_bert', 'xlmass_bert', 'ass_bertxlm', 'ass_bertmbert', 'ass_bertindic_bert','ass_bertmuril' ,'indic_bert_cos', 'ass_bert_mbert_cos','mbert_cos', 'xlm_cos', 'muril_cos', 'indic_bert_cos']]

Unnamed: 0,mbertass_bert,xlmass_bert,ass_bertxlm,ass_bertmbert,ass_bertindic_bert,ass_bertmuril,indic_bert_cos,ass_bert_mbert_cos,mbert_cos,xlm_cos,muril_cos,indic_bert_cos.1
0,-0.040722,-0.041396,0.957097,0.987763,0.999807,0.999895,0.996818,0.437369,0.859162,0.619778,0.997738,0.996818
1,-0.031449,-0.018367,0.959129,0.977874,0.999920,0.999766,0.999240,0.399135,0.899460,0.880066,0.996626,0.999240
2,-0.034591,-0.038699,0.959438,0.974780,0.999842,0.999856,0.999310,0.527592,0.803146,0.689375,0.998220,0.999310
3,-0.040505,-0.039202,0.976788,0.986098,0.999668,0.999915,0.999681,0.456332,1.000000,1.000000,1.000000,0.999681
4,-0.036145,-0.035271,0.962915,0.976585,0.999851,0.999907,0.999882,0.320201,0.787768,0.661757,0.999328,0.999882
...,...,...,...,...,...,...,...,...,...,...,...,...
1643,-0.041362,-0.040279,0.907283,0.969478,0.998929,0.999483,0.998596,0.636795,0.835814,0.682145,0.998156,0.998596
1644,-0.032763,-0.011251,0.961271,0.980109,0.999886,0.999741,0.999694,0.399142,0.945338,0.737297,0.999730,0.999694
1645,-0.036575,-0.046178,0.969867,0.982127,0.999591,0.999899,0.996649,0.494424,0.907485,0.485389,0.998837,0.996649
1646,-0.040287,-0.041348,0.968254,0.983229,0.999913,0.999900,0.998854,0.551089,0.856364,0.629492,0.999765,0.998854


In [45]:
len(mbert_cos ), len(xlm_cos), len(ass_bert_cos), len(indic_bert_cos), len(muril_cos)

(1648, 1648, 1648, 1648, 1648)

In [None]:
ben_ass_train['mbert_cos'] = mbert_cos
ben_ass_train['xlm_cos'] = xlm_cos
ben_ass_train['ass_bert_mbert_cos'] = ass_bert_cos
ben_ass_train['indic_bert_cos'] = indic_bert_cos
ben_ass_train['muril_cos'] = muril_cos

In [46]:
# now get the cosin sims and embeddings for ass_ben_train

mbert_cos, all_vec = get_mbert_cos_sims(list(ass_ben_train['loan_word']),list(ass_ben_train['original_word']))
xlm_cos, all_vec = get_xlm_cos_sims(list(ass_ben_train['loan_word']),list(ass_ben_train['original_word']))
ass_bert_cos, all_vec = get_ass_albert_cos_sims(list(ass_ben_train['loan_word']),list(ass_ben_train['original_word']), lin_transform =True)
indic_bert_cos, all_vec = Indic_bert_cos_sims(list(ass_ben_train['loan_word']),list(ass_ben_train['original_word']))
muril_cos, all_vec = Muril_cos_sims(list(ass_ben_train['loan_word']),list(ass_ben_train['original_word']))
 
# ass_ben_train['mbert_cos'] = mbert_cos
# ass_ben_train['xlm_cos'] = xlm_cos
# ass_ben_train['ass_bert_mbert_cos'] = ass_bert_cos
# ass_ben_train['indic_bert_cos'] = indic_bert_cos
# ass_ben_train['muril_cos'] = muril_cos   
    


loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608
Got 1712



loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/vocab.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/fcfbd016a54060fb3e7290d39ac4fd04b0c7ca2c683e5fdd87928b0feaa2c367.bd20142f530c7b681cef79e2153e77f8d9e8c9fdb3f6db29f37298198166236d
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/merges.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/0693c49aba9ba31f442ba9a6c368bc400d2a81e5931d983d63d8648a043bf551.a1730275bc49c3d660f1d9bf50222d8cb849f3ae93e75e5865a3037650459b27
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/91dad0f1b901ee17a1ca86293a46619408e812441cfa380050cd03359564b95f.fabc8

loading configuration file https://huggingface.co/xlm-mlm-100-1280/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/db80884313abeddd782425fe61c0846a43c007b0f403d13181e368ad7dd52f62.394eed9fa7b8205161e0b47bf552ae933e7e29234009f217a03b87bfe054ed52
Model config XLMConfig {
  "accumulate_gradients": 4,
  "ae_steps": [],
  "amp": 2,
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "batch_size": 16,
  "beam_size": 1,
  "bos_index": 0,
  "bos_token_id": 0,
  "bptt": 256,
  "bt_src_langs": [],
  "bt_steps": [],
  "causal": false,
  "clip_grad_norm": 1.0,
  "clm_steps": [],
  "command": "python /private/home/aconneau/workdir/xlm_17_100_big.3/2019_08_10_19_23_42/train.py --n_heads 16 --bt_steps '' --max_vocab 200000 --word_mask_keep_rand '0.8,0.1,0.1' --use_lang_emb false --data_path '/private/home/aconneau/projects/XLM/data/wiki/100/175k' --save_periodic 0 --max_len 200 --bptt 256 --ae_steps '' --fp1

loading weights file https://huggingface.co/xlm-mlm-100-1280/resolve/main/pytorch_model.bin from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/5ba8c43e22aeb247eee7a4d27014eabe361cbfd2a9e00bd337946aaa0dece8ce.f11dc7ef5f30811d164556022e3174da0d6e8bb13676833dabdb1f50f61f39a5
Some weights of the model checkpoint at xlm-mlm-100-1280 were not used when initializing XLMModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of XLMModel were initialized from the model checkpoint at xlm-mlm-100-1280.
If your task 

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608
Got 1712



loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 712


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 808


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 912


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1008


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1712



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/2d290a1a22a5f80e173def8b2f31f12d68a957542e6769ab06bfc3de06bc49f4.06ba3893e888d6ff1388c45cdbee1fb785542ae22b70ff159f55da323230a159
Model config AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "nu

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608
Got 1712



loading configuration file https://huggingface.co/google/muril-base-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/d8ca6ce642f067ecf3d1163f4d2903b471287613933f2857ca8307e500bc7645.aff1657f5771205f5a0c6cb4816f125ee5f2f2d62dbf27e6b9fee30b0ebbf0f5
Model config BertConfig {
  "_name_or_path": "google/muril-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

loading file https://huggingface.co/googl

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608
Got 1712



In [48]:
for i, j in all_vec.items():
    print(i,j['loan'].shape, j['original'].shape, type(j['original']))
# all_vec['mbert']['loan'].shape


mbert (1715, 768) (1715, 768) <class 'numpy.ndarray'>
xlm (1715, 1280) (1715, 1280) <class 'numpy.ndarray'>
ass_bert (1715, 768) (1715, 768) <class 'numpy.ndarray'>
indic_bert (1715, 768) (1715, 768) <class 'numpy.ndarray'>
muril (1715, 768) (1715, 768) <class 'numpy.ndarray'>


In [49]:
# get the linear transformation from mbert space to ass_bert space using the precalculated transformation matrix i.e beng_m_bertvec, ass_bertvec
#for ass_ben_train
m = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec, ass_bertvec) 
m = m.coef_ 
print(m.shape) #this is the 768 by 768 transformation matrix between mbert and assbert 
mapped_= all_vec['mbert']['loan'] @ m.transpose()
ass_beng_cos_mapped = get_cosine_similarities(mapped_, all_vec['ass_bert']['original'] )


(768, 768)


In [50]:
ass_ben_train['mbert_src_assbert_trg'] = ass_beng_cos_mapped
ass_ben_train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,...,ass_bertmuril,indic_bertmbert,indic_bertxlm,indic_bertass_bert,indic_bertmuril,murilmbert,murilxlm,murilass_bert,murilindic_bert,mbert_src_assbert_trg
0,0,0,0,568,568,ক্ৰিয়া,গাঁইয়া,kɹija,ɡãie̯a,Action,...,0.999289,0.997465,0.995359,-0.036194,0.999959,0.998514,0.997798,-0.037001,0.999945,-0.010115
1,1,1,1,119,119,চান্দ,চন্দ্র,sandɔ,t͡ɕɔn̪d̪rɔ,Chand,...,0.999310,0.998168,0.997567,-0.036599,0.999959,0.997993,0.997674,-0.037443,0.999867,0.080588
2,2,2,2,818,818,পখিলা,বকুল,pɔkʰila,bɔkulɔ,butterfly,...,0.999154,0.994700,0.990668,-0.037378,0.999927,0.998900,0.998522,-0.037572,0.999914,0.003350
3,3,3,3,277,277,ৰঙা,বাঘ,ɹɔŋa,bagʱɔ,red,...,0.999649,0.997704,0.995973,-0.037347,0.999954,0.998912,0.998547,-0.037650,0.999915,0.066199
4,4,4,4,665,665,বিলৈ,পালং,bilɔɪ,palɔŋ,to the b,...,0.999687,0.997192,0.992324,-0.037832,0.999953,0.998419,0.998129,-0.037200,0.999898,0.054813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1710,1710,1710,1710,269,269,অগ্নি,আনা,ɔgni,an̪a,Fire,...,0.999530,0.997056,0.995438,-0.037099,0.999946,0.998927,0.998485,-0.037632,0.999912,0.020783
1711,1711,1711,1711,153,153,ঠেং,পয়সা,tʰɛŋ,pe̯ɔʃa,The tail,...,0.998865,0.996444,0.995111,-0.037162,0.999935,0.998588,0.997881,-0.037122,0.999946,0.032935
1712,1712,1712,1712,48,48,কেন্দ্ৰ,আখন্দ,kɛndɹɔ,akʰɔn̪d̪ɔ,hub,...,0.999608,0.998104,0.995225,-0.036582,0.999951,0.998639,0.997997,-0.037136,0.999943,-0.064774
1713,1713,1713,1713,790,790,কাণ,খান,kan,kʰan̪,ear,...,0.999526,0.997004,0.992971,-0.037427,0.999954,0.998824,0.998260,-0.037465,0.999940,0.020544


In [52]:
cos_mapped = get_lt_maps(all_vec) # get the mapped embeddings and transformed cosine sims for ass_ben_train
for i, j  in cos_mapped.items():
    print(i,j)
    
    ass_ben_train[i] = j

mbert xlm
mbert ass_bert
mbert indic_bert
mbert muril
xlm mbert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


xlm ass_bert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


xlm indic_bert
xlm muril


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


ass_bert mbert
ass_bert xlm


  overwrite_a=True).T
  overwrite_a=True).T


ass_bert indic_bert
ass_bert muril


  overwrite_a=True).T


indic_bert mbert
indic_bert xlm
indic_bert ass_bert
indic_bert muril
muril mbert
muril xlm
muril ass_bert
muril indic_bert
mbertxlm [0.81422913 0.9049548  0.8283074  ... 0.8414111  0.72653395 0.8292563 ]
mbertass_bert [-0.02195851 -0.01808187 -0.04044119 ... -0.03736622 -0.06029329
 -0.04182959]
mbertindic_bert [0.999145   0.99959475 0.9984933  ... 0.9996923  0.9992353  0.9958416 ]
mbertmuril [0.99905944 0.9985286  0.99879175 ... 0.9997323  0.99941474 0.99884534]
xlmmbert [0.7086066  0.89907426 0.91591007 ... 0.87726027 0.84559566 0.8776003 ]
xlmass_bert [-0.0197074  -0.01707486 -0.0403541  ... -0.0462508  -0.05708092
 -0.03930154]
xlmindic_bert [0.9993415  0.99957055 0.998573   ... 0.9992578  0.99840796 0.9955033 ]
xlmmuril [0.998371   0.9983493  0.9987029  ... 0.9984844  0.99921745 0.99828595]
ass_bertmbert [0.7582203  0.92474973 0.92758924 ... 0.89519167 0.91026926 0.91854024]
ass_bertxlm [0.82852423 0.9215591  0.87545663 ... 0.84163153 0.865111   0.8806467 ]
ass_bertindic_bert [0.9

In [64]:
list(cos_mapped.keys())

['mbertxlm',
 'mbertass_bert',
 'mbertindic_bert',
 'mbertmuril',
 'xlmmbert',
 'xlmass_bert',
 'xlmindic_bert',
 'xlmmuril',
 'ass_bertmbert',
 'ass_bertxlm',
 'ass_bertindic_bert',
 'ass_bertmuril',
 'indic_bertmbert',
 'indic_bertxlm',
 'indic_bertass_bert',
 'indic_bertmuril',
 'murilmbert',
 'murilxlm',
 'murilass_bert',
 'murilindic_bert']

In [51]:
#get all the mapped embeddings and cosine sims for ben_ass_test
mbert_cos, all_vec = get_mbert_cos_sims(list(ben_ass_test['loan_word']),list(ben_ass_test['original_word']))
xlm_cos, all_vec = get_xlm_cos_sims(list(ben_ass_test['loan_word']),list(ben_ass_test['original_word']))
ass_bert_cos, all_vec = get_ass_albert_cos_sims(list(ben_ass_test['original_word']),list(ben_ass_test['loan_word']), lin_transform =True)
indic_bert_cos, all_vec = Indic_bert_cos_sims(list(ben_ass_test['loan_word']),list(ben_ass_test['original_word']))
muril_cos, all_vec = Muril_cos_sims(list(ben_ass_test['loan_word']),list(ben_ass_test['original_word']))

# ben_ass_test['mbert_cos'] = mbert_cos
# ben_ass_test['xlm_cos'] = xlm_cos
# ben_ass_test['ass_bert_mbert_cos'] = ass_bert_cos
# ben_ass_test['indic_bert_cos'] = indic_bert_cos
# ben_ass_test['muril_cos'] = muril_cos







loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/vocab.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/fcfbd016a54060fb3e7290d39ac4fd04b0c7ca2c683e5fdd87928b0feaa2c367.bd20142f530c7b681cef79e2153e77f8d9e8c9fdb3f6db29f37298198166236d
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/merges.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/0693c49aba9ba31f442ba9a6c368bc400d2a81e5931d983d63d8648a043bf551.a1730275bc49c3d660f1d9bf50222d8cb849f3ae93e75e5865a3037650459b27
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/91dad0f1b901ee17a1ca86293a46619408e812441cfa380050cd03359564b95f.fabc8

loading configuration file https://huggingface.co/xlm-mlm-100-1280/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/db80884313abeddd782425fe61c0846a43c007b0f403d13181e368ad7dd52f62.394eed9fa7b8205161e0b47bf552ae933e7e29234009f217a03b87bfe054ed52
Model config XLMConfig {
  "accumulate_gradients": 4,
  "ae_steps": [],
  "amp": 2,
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "batch_size": 16,
  "beam_size": 1,
  "bos_index": 0,
  "bos_token_id": 0,
  "bptt": 256,
  "bt_src_langs": [],
  "bt_steps": [],
  "causal": false,
  "clip_grad_norm": 1.0,
  "clm_steps": [],
  "command": "python /private/home/aconneau/workdir/xlm_17_100_big.3/2019_08_10_19_23_42/train.py --n_heads 16 --bt_steps '' --max_vocab 200000 --word_mask_keep_rand '0.8,0.1,0.1' --use_lang_emb false --data_path '/private/home/aconneau/projects/XLM/data/wiki/100/175k' --save_periodic 0 --max_len 200 --bptt 256 --ae_steps '' --fp1

loading weights file https://huggingface.co/xlm-mlm-100-1280/resolve/main/pytorch_model.bin from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/5ba8c43e22aeb247eee7a4d27014eabe361cbfd2a9e00bd337946aaa0dece8ce.f11dc7ef5f30811d164556022e3174da0d6e8bb13676833dabdb1f50f61f39a5
Some weights of the model checkpoint at xlm-mlm-100-1280 were not used when initializing XLMModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of XLMModel were initialized from the model checkpoint at xlm-mlm-100-1280.
If your task 

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 712


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 808


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 912


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1008


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1608



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/2d290a1a22a5f80e173def8b2f31f12d68a957542e6769ab06bfc3de06bc49f4.06ba3893e888d6ff1388c45cdbee1fb785542ae22b70ff159f55da323230a159
Model config AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "nu

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading configuration file https://huggingface.co/google/muril-base-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/d8ca6ce642f067ecf3d1163f4d2903b471287613933f2857ca8307e500bc7645.aff1657f5771205f5a0c6cb4816f125ee5f2f2d62dbf27e6b9fee30b0ebbf0f5
Model config BertConfig {
  "_name_or_path": "google/muril-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

loading file https://huggingface.co/googl

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



In [52]:
for i, j in all_vec.items():
    print(i,j['loan'].shape, j['original'].shape, type(j['original']))
# all_vec['mbert']['loan'].shape

mbert (1631, 768) (1631, 768) <class 'numpy.ndarray'>
xlm (1631, 1280) (1631, 1280) <class 'numpy.ndarray'>
ass_bert (1631, 768) (1631, 768) <class 'numpy.ndarray'>
indic_bert (1631, 768) (1631, 768) <class 'numpy.ndarray'>
muril (1631, 768) (1631, 768) <class 'numpy.ndarray'>


In [53]:
# get the linear transformation from mbert space to ass_bert space using the precalculated transformation matrix i.e beng_m_bertvec, ass_bertvec
#for ben_ass_test
m = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec, ass_bertvec) 
m = m.coef_ 
print(m.shape) #this is the 768 by 768 transformation matrix between mbert and assbert 
mapped_= all_vec['mbert']['loan'] @ m.transpose()
beng_ass_test_cos_mapped = get_cosine_similarities(mapped_, all_vec['ass_bert']['original'] )

(768, 768)


In [55]:
ben_ass_test['mbert_src_assbert_trg'] = beng_ass_test_cos_mapped 
ben_ass_test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,...,ass_bertmuril,indic_bertmbert,indic_bertxlm,indic_bertass_bert,indic_bertmuril,murilmbert,murilxlm,murilass_bert,murilindic_bert,mbert_src_assbert_trg
0,0,0,0,328,328,হাট,পুথি,ɦaʈɔ,putʰi,Hat,...,0.999896,0.995602,0.995207,-0.037043,0.999959,0.998250,0.998064,-0.037110,0.999959,0.053788
1,1,1,1,60,60,গলদেশ,গেলা,ɡɔlɔd̪eʃ,gɛla,Neck,...,0.999892,0.993731,0.990757,-0.036650,0.999945,0.997744,0.997690,-0.036971,0.999936,0.003743
2,2,2,2,15,15,আঁকা,আঁক,ãka,ãkɔ,Draw,...,0.999945,0.996707,0.996931,-0.036699,0.999989,0.997577,0.998135,-0.037100,0.999910,0.069326
3,3,3,3,255,255,মিডা,নদী,miɖa,nɔdi,Mida,...,0.999900,0.993703,0.982903,-0.037739,0.999721,0.998603,0.997843,-0.037069,0.999968,0.022493
4,4,4,4,9,9,অনুমতি,পানী,on̪umt̪i,pani,Permission,...,0.999640,0.998033,0.997661,-0.037113,0.999993,0.998184,0.998295,-0.037151,0.999949,0.066781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,1626,1626,1626,232,232,ভাজা,টাকা,bʱad͡ʑa,taka,Fried,...,0.999945,0.996144,0.994600,-0.037110,0.999983,0.998329,0.998383,-0.037136,0.999948,0.063769
1627,1627,1627,1627,84,84,ডাঁশ,আঁক,ɖãʃɔ,ãkɔ,Bite,...,0.999847,0.996940,0.987930,-0.036750,0.999786,0.998310,0.997333,-0.036979,0.999958,0.121835
1628,1628,1628,1628,311,311,মঙ্গল্য,শ্রী,mɔŋɡɔld͡zɔ,xɹi,Good luck,...,0.999517,0.995814,0.989882,-0.036404,0.999950,0.998445,0.997835,-0.037016,0.999963,0.003480
1629,1629,1629,1629,311,311,সন্ধ্যা,সন্ধিয়া,ʃɔn̪d̪ʱæ,xɔndʰija,Evening,...,0.999760,0.996680,0.993957,-0.037186,0.999987,0.998191,0.998367,-0.037138,0.999944,0.010274


In [56]:
cos_mapped = get_lt_maps(all_vec) # get the mapped embeddings and cosine sims for ass_ben_train
for i, j  in cos_mapped.items():
    print(i,j)
    
    ben_ass_test[i] = j

mbert xlm
mbert ass_bert
mbert indic_bert
mbert muril
xlm mbert
xlm ass_bert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


xlm indic_bert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


xlm muril
ass_bert mbert


  overwrite_a=True).T
  overwrite_a=True).T


ass_bert xlm
ass_bert indic_bert


  overwrite_a=True).T
  overwrite_a=True).T


ass_bert muril
indic_bert mbert
indic_bert xlm
indic_bert ass_bert
indic_bert muril
muril mbert
muril xlm
muril ass_bert
muril indic_bert
mbertxlm [0.68755674 0.6841567  0.8534988  ... 0.82382274 0.84723943 0.6884533 ]
mbertass_bert [-0.03226281 -0.03727898 -0.03546291 ... -0.02263128 -0.02630463
 -0.03139025]
mbertindic_bert [0.9993046  0.99761856 0.99946994 ... 0.9992995  0.9992915  0.99969566]
mbertmuril [0.99955285 0.99912673 0.9995197  ... 0.99956286 0.9997964  0.99970603]
xlmmbert [0.752661   0.896068   0.97391474 ... 0.8989773  0.9174396  0.83722556]
xlmass_bert [-0.03177009 -0.03839995 -0.03367117 ... -0.01886591 -0.02574864
 -0.03231563]
xlmindic_bert [0.9992496  0.9971322  0.99961483 ... 0.99843436 0.9987686  0.99955803]
xlmmuril [0.99931383 0.9988723  0.9991207  ... 0.9991207  0.9997629  0.9996088 ]
ass_bertmbert [0.97233784 0.96923876 0.98733175 ... 0.9767873  0.9857687  0.9736726 ]
ass_bertxlm [0.9623678  0.94629645 0.9579295  ... 0.9473304  0.95603776 0.93384033]
ass_bert

In [56]:
mbert_cos = get_mbert_cos_sims(list(ass_ben_test['loan_word']),list(ass_ben_test['original_word']))
xlm_cos = get_xlm_cos_sims(list(ass_ben_test['loan_word']),list(ass_ben_test['original_word']))
ass_bert_cos = get_ass_albert_cos_sims(list(ass_ben_test['loan_word']),list(ass_ben_test['original_word']), lin_transform =True)
indic_bert_cos = Indic_bert_cos_sims(list(ass_ben_test['loan_word']),list(ass_ben_test['original_word']))
muril_cos = Muril_cos_sims(list(ass_ben_test['loan_word']),list(ass_ben_test['original_word']))
 
# ass_ben_test['mbert_cos'] = mbert_cos
# ass_ben_test['xlm_cos'] = xlm_cos
# ass_ben_test['ass_bert_mbert_cos'] = ass_bert_cos
# ass_ben_test['indic_bert_cos'] = indic_bert_cos
# ass_ben_test['muril_cos'] = muril_cos   
    


loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/vocab.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/fcfbd016a54060fb3e7290d39ac4fd04b0c7ca2c683e5fdd87928b0feaa2c367.bd20142f530c7b681cef79e2153e77f8d9e8c9fdb3f6db29f37298198166236d
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/merges.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/0693c49aba9ba31f442ba9a6c368bc400d2a81e5931d983d63d8648a043bf551.a1730275bc49c3d660f1d9bf50222d8cb849f3ae93e75e5865a3037650459b27
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/xlm-mlm-100-1280/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/91dad0f1b901ee17a1ca86293a46619408e812441cfa380050cd03359564b95f.fabc8

loading configuration file https://huggingface.co/xlm-mlm-100-1280/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/db80884313abeddd782425fe61c0846a43c007b0f403d13181e368ad7dd52f62.394eed9fa7b8205161e0b47bf552ae933e7e29234009f217a03b87bfe054ed52
Model config XLMConfig {
  "accumulate_gradients": 4,
  "ae_steps": [],
  "amp": 2,
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "batch_size": 16,
  "beam_size": 1,
  "bos_index": 0,
  "bos_token_id": 0,
  "bptt": 256,
  "bt_src_langs": [],
  "bt_steps": [],
  "causal": false,
  "clip_grad_norm": 1.0,
  "clm_steps": [],
  "command": "python /private/home/aconneau/workdir/xlm_17_100_big.3/2019_08_10_19_23_42/train.py --n_heads 16 --bt_steps '' --max_vocab 200000 --word_mask_keep_rand '0.8,0.1,0.1' --use_lang_emb false --data_path '/private/home/aconneau/projects/XLM/data/wiki/100/175k' --save_periodic 0 --max_len 200 --bptt 256 --ae_steps '' --fp1

loading weights file https://huggingface.co/xlm-mlm-100-1280/resolve/main/pytorch_model.bin from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/5ba8c43e22aeb247eee7a4d27014eabe361cbfd2a9e00bd337946aaa0dece8ce.f11dc7ef5f30811d164556022e3174da0d6e8bb13676833dabdb1f50f61f39a5
Some weights of the model checkpoint at xlm-mlm-100-1280 were not used when initializing XLMModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of XLMModel were initialized from the model checkpoint at xlm-mlm-100-1280.
If your task 

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transform

Got 8


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 712


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 808


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 912


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1008


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1112


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1208


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1312


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1408


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1512


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':


Got 1608


  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':





Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/2d290a1a22a5f80e173def8b2f31f12d68a957542e6769ab06bfc3de06bc49f4.06ba3893e888d6ff1388c45cdbee1fb785542ae22b70ff159f55da323230a159
Model config AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "nu

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



loading configuration file https://huggingface.co/google/muril-base-cased/resolve/main/config.json from cache at /s/chopin/b/grad/abhijnan/.cache/huggingface/transformers/d8ca6ce642f067ecf3d1163f4d2903b471287613933f2857ca8307e500bc7645.aff1657f5771205f5a0c6cb4816f125ee5f2f2d62dbf27e6b9fee30b0ebbf0f5
Model config BertConfig {
  "_name_or_path": "google/muril-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

loading file https://huggingface.co/googl

Got 8
Got 112
Got 208
Got 312
Got 408
Got 512
Got 608
Got 712
Got 808
Got 912
Got 1008
Got 1112
Got 1208
Got 1312
Got 1408
Got 1512
Got 1608



In [57]:
for i, j in all_vec.items():
    print(i,j['loan'].shape, j['original'].shape, type(j['original']))
# all_vec['mbert']['loan'].shape

mbert (1700, 768) (1700, 768) <class 'numpy.ndarray'>
xlm (1700, 1280) (1700, 1280) <class 'numpy.ndarray'>
ass_bert (1700, 768) (1700, 768) <class 'numpy.ndarray'>
indic_bert (1700, 768) (1700, 768) <class 'numpy.ndarray'>
muril (1700, 768) (1700, 768) <class 'numpy.ndarray'>


In [58]:
# get the linear transformation from mbert space to ass_bert space using the precalculated transformation matrix i.e beng_m_bertvec, ass_bertvec
#for ass_ben_test
m = sklearn.linear_model.Ridge(fit_intercept=False).fit(beng_m_bertvec, ass_bertvec) 
m = m.coef_ 
print(m.shape) #this is the 768 by 768 transformation matrix between mbert and assbert 
mapped_= all_vec['mbert']['loan'] @ m.transpose()
ass_beng_test_cos_mapped = get_cosine_similarities(mapped_, all_vec['ass_bert']['original'] )

(768, 768)


In [59]:
ass_ben_test['mbert_src_assbert_trg'] = ass_beng_test_cos_mapped
ass_ben_test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,...,ass_bertmuril,indic_bertmbert,indic_bertxlm,indic_bertass_bert,indic_bertmuril,murilmbert,murilxlm,murilass_bert,murilindic_bert,mbert_src_assbert_trg
0,0,0,0,20,20,আজি,দাঁত,azi,d̪ãt̪ɔ,today,...,0.999816,0.995245,0.984364,-0.037255,0.999860,0.998842,0.998491,-0.037662,0.999918,0.003580
1,1,1,1,424,424,বস্তু,ভক্তি,bɔxtu,bʱɔkt̪i,Objects,...,0.999737,0.998458,0.995757,-0.037769,0.999962,0.998917,0.998437,-0.037490,0.999925,0.134114
2,2,2,2,83,83,গোসাঁই,কলা,gʊxãi,kɔla,Gosain,...,0.997764,0.995150,0.983945,-0.036948,0.999825,0.997333,0.996424,-0.037293,0.999829,0.081233
3,3,3,3,318,318,শ্রী,অশোত,xɹi,oʃot̪,Mr.,...,0.999268,0.997368,0.994264,-0.035945,0.999937,0.998593,0.997663,-0.036670,0.999940,0.063111
4,4,4,4,129,129,ডাঁহ,ডাঁশ,dãɦɔ,ɖãʃɔ,Duck,...,0.999721,0.995704,0.991231,-0.036069,0.999895,0.998209,0.997781,-0.036462,0.999917,0.116244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,1695,1695,1695,296,296,লাজ,লজ্জা,laz,lɔd͡ʑd͡ʑa,Shame,...,0.999359,0.997234,0.993459,-0.037235,0.999956,0.998167,0.997904,-0.037537,0.999868,0.099490
1696,1696,1696,1696,7,7,অক্ষ,অক্ষি,ɔkʰjɔ,okkʰi,Axis,...,0.999457,0.994719,0.988365,-0.036510,0.999899,0.998698,0.997802,-0.037324,0.999946,0.043864
1697,1697,1697,1697,586,586,চাঁদ,দাঁত,sãdɔ,d̪ãt̪ɔ,The moon,...,0.999868,0.997607,0.996476,-0.036991,0.999951,0.998912,0.998486,-0.037662,0.999921,0.170446
1698,1698,1698,1698,327,327,সঁচা,বাজ,xɔ̃sa,bad͡ʑ,true,...,0.998944,0.997836,0.996538,-0.037763,0.999941,0.998426,0.998303,-0.037575,0.999887,0.083471


In [61]:
cos_mapped = get_lt_maps(all_vec) # get the mapped embeddings and cosine sims for ass_ben_test
for i, j  in cos_mapped.items():
    print(i,j)
    
    ass_ben_test[i] = j

mbert xlm
mbert ass_bert
mbert indic_bert
mbert muril
xlm mbert


  overwrite_a=True).T
  overwrite_a=True).T


xlm ass_bert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


xlm indic_bert
xlm muril


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


ass_bert mbert
ass_bert xlm
ass_bert indic_bert


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


ass_bert muril
indic_bert mbert
indic_bert xlm


  overwrite_a=True).T


indic_bert ass_bert
indic_bert muril
muril mbert
muril xlm
muril ass_bert
muril indic_bert
mbertxlm [0.7835372  0.8324632  0.7524152  ... 0.83098465 0.8068333  0.8784781 ]
mbertass_bert [-0.02294929 -0.02864049 -0.04234096 ... -0.02772318 -0.035661
 -0.03385013]
mbertindic_bert [0.99575704 0.99916756 0.996084   ... 0.9997469  0.99915326 0.998631  ]
mbertmuril [0.99969953 0.99971086 0.997231   ... 0.9998252  0.9987678  0.9994286 ]
xlmmbert [0.8258198  0.8333431  0.76705796 ... 0.7853576  0.9124824  0.95633495]
xlmass_bert [-0.02109283 -0.02265036 -0.04356567 ... -0.02540999 -0.03646286
 -0.02866849]
xlmindic_bert [0.9954077  0.9984507  0.99488956 ... 0.99955475 0.9991399  0.99553984]
xlmmuril [0.9996149  0.9997564  0.99652153 ... 0.9997771  0.99840975 0.99922234]
ass_bertmbert [0.87931776 0.942175   0.82123417 ... 0.8644617  0.9282237  0.9772395 ]
ass_bertxlm [0.8653891  0.88182724 0.7863366  ... 0.8756689  0.8359395  0.96302044]
ass_bertindic_bert [0.998331   0.99960446 0.9974607  ... 

In [60]:



ben_ass_train.to_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Bengali-Assamese-train_production_alldata.csv')
ass_ben_train.to_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Assamese-Bengali-train_production_alldata.csv')
ben_ass_test.to_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Bengali-Assamese-test_production_alldata.csv')
ass_ben_test.to_csv('/s/chopin/d/proj/ramfis-aida/loan_exp_results/loan-word-detection/Datasets/Assamese_Bert_dataset/Assamese-Bengali-test_production_alldata.csv')

# Load `language-pairs.json` list and run pipeline for each

In [None]:
pairs = None

with open('../language-pairs.json', 'r') as f: # for getting logits from all languages
 
    
    pairs = json.loads(f.read())
for pair in pairs:
    print(pair)

In [None]:
pairs = None

with open('../language-pairs-holdout.json', 'r') as f: # for getting logits from all languages
 
    
    pairs = json.loads(f.read())
for pair in pairs:
    print(pair)

In [None]:
pairs = None

with open('../language-pairs.json', 'r') as f:
    pairs = json.loads(f.read())

for pair in pairs:
    print(pair)
    L1 = pairs[pair]['target']['name']
    L2 = pairs[pair]['source']['name']
    
    # load datasets
    prefix = f'../Datasets/production_train_test/{L1}-{L2}'
    
    train_alldata = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_alldata = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    train_realdist = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_realdist = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_balanced = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_balanced = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')

    # get and pad PanPhon features for alldata split
    get_panphon_features(train_alldata, test_alldata)
    alldata_maxlen = (max(np.max(train_alldata['features_loan'].str.len()),\
                          np.max(test_alldata['features_loan'].str.len())),\
                      max(np.max(train_alldata['features_orig'].str.len()),\
                          np.max(test_alldata['features_orig'].str.len())))
    pad_panphon_features(train_alldata, test_alldata, alldata_maxlen)

    # add target labels
    Y_train, Y_test = add_target_labels(train_alldata, test_alldata)

    # make train and val splits
    X_train, X_val, X_test, Y_train, Y_val = make_train_val_set(train_alldata, test_alldata, Y_train)
    X_train, Y_train, X_val, Y_val, X_test, Y_test = make_tensors(X_train, Y_train, X_val, Y_val, X_test, Y_test)

    # instantiate network
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nUsing {device} device\n")
    
    # set random seeds for reproducibility
    np.random.seed(666)

    model = NeuralNetwork(X_train.shape[1]).to(device)
    print(model,"\n")

    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)

    # train and plot losses, accuracy
    train_losses, val_losses, train_accur, val_accur = \
        model.fit(X_train, Y_train, X_val, Y_val, criterion, optimizer)
    model.plot_losses(train_losses,val_losses,train_accur,val_accur)

    # get and pad PanPhon features for realdist and balanced splits
    get_panphon_features(train_realdist,test_realdist)
    pad_panphon_features(train_realdist,test_realdist,alldata_maxlen)

    get_panphon_features(train_balanced,test_balanced)
    pad_panphon_features(train_balanced,test_balanced,alldata_maxlen)

    # create data to get logits for
    X_train_alldata = torch.tensor(np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                         np.array([x for x in train_alldata['features_orig']])])).to(device)
    X_test_alldata = torch.tensor(np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                        np.array([x for x in test_alldata['features_orig']])])).to(device)

    X_train_realdist = torch.tensor(np.hstack([np.array([x for x in train_realdist['features_loan']]),\
                         np.array([x for x in train_realdist['features_orig']])])).to(device)
    X_test_realdist = torch.tensor(np.hstack([np.array([x for x in test_realdist['features_loan']]),\
                        np.array([x for x in test_realdist['features_orig']])])).to(device)

    X_train_balanced = torch.tensor(np.hstack([np.array([x for x in train_balanced['features_loan']]),\
                         np.array([x for x in train_balanced['features_orig']])])).to(device)
    X_test_balanced = torch.tensor(np.hstack([np.array([x for x in test_balanced['features_loan']]),\
                        np.array([x for x in test_balanced['features_orig']])])).to(device)

    # place model in eval mode and get logits from DNN for all datasets/splits
    print("Getting logits from DNN")
    model.eval()

    with torch.no_grad():
        train_logits_dnn_alldata = model(X_train_alldata.float())[1].detach().cpu().numpy()
        test_logits_dnn_alldata = model(X_test_alldata.float())[1].detach().cpu().numpy()
        train_logits_dnn_realdist = model(X_train_realdist.float())[1].detach().cpu().numpy()
        test_logits_dnn_realdist = model(X_test_realdist.float())[1].detach().cpu().numpy()
        train_logits_dnn_balanced = model(X_train_balanced.float())[1].detach().cpu().numpy()
        test_logits_dnn_balanced = model(X_test_balanced.float())[1].detach().cpu().numpy()

    # remove PanPhon features from dataframe and add logits column
    train_alldata = train_alldata.drop(['features_loan','features_orig'], axis=1)
    train_alldata['DNN_logits'] = train_logits_dnn_alldata

    test_alldata = test_alldata.drop(['features_loan','features_orig'], axis=1)
    test_alldata['DNN_logits'] = test_logits_dnn_alldata

    train_realdist = train_realdist.drop(['features_loan','features_orig'], axis=1)
    train_realdist['DNN_logits'] = train_logits_dnn_realdist

    test_realdist = test_realdist.drop(['features_loan','features_orig'], axis=1)
    test_realdist['DNN_logits'] = test_logits_dnn_realdist

    train_balanced = train_balanced.drop(['features_loan','features_orig'], axis=1)
    train_balanced['DNN_logits'] = train_logits_dnn_balanced

    test_balanced = test_balanced.drop(['features_loan','features_orig'], axis=1)
    test_balanced['DNN_logits'] = test_logits_dnn_balanced

    #set the seeds for reproducibility even though we are not fine-tuning or training and the weights 
    #for both these models are effectively frozen for our purpose 
    torch.manual_seed(7)
    random.seed(7)
    np.random.seed(7)

    # Setting PyTorch's required configuration variables for reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = False
    torch.use_deterministic_algorithms(False)

    PRE_TRAINED_bert_MODEL = 'bert-base-multilingual-cased'
    PRE_TRAINED_xlm_MODEL = 'xlm-mlm-100-1280'

    MAXTOKENS = 5
    BS = 8  # batch size

    #list of loan-original words for train sets
    l1_train_alldata = list(train_alldata["loan_word"])
    l2_train_alldata = list(train_alldata["original_word"])

    l1_train_realdist = list(train_realdist["loan_word"])
    l2_train_realdist = list(train_realdist["original_word"])

    l1_train_balanced = list(train_balanced["loan_word"])
    l2_train_balanced = list(train_balanced["original_word"])

    #list of loan-original words for test sets
    l1_test_alldata = list(test_alldata["loan_word"])
    l2_test_alldata = list(test_alldata["original_word"])

    l1_test_realdist = list(test_realdist["loan_word"])
    l2_test_realdist = list(test_realdist["original_word"])

    l1_test_balanced = list(test_balanced["loan_word"])
    l2_test_balanced = list(test_balanced["original_word"])

    print("Getting MBERT similarities")
    train_alldata['MBERT_cos_sim'] = get_mbert_cos_sims(l1_train_alldata,l2_train_alldata)
    test_alldata['MBERT_cos_sim'] = get_mbert_cos_sims(l1_test_alldata,l2_test_alldata)

    train_realdist['MBERT_cos_sim'] = train_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['MBERT_cos_sim']
    train_balanced['MBERT_cos_sim'] = train_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['MBERT_cos_sim']

    test_realdist['MBERT_cos_sim'] = test_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['MBERT_cos_sim']
    test_balanced['MBERT_cos_sim'] = test_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['MBERT_cos_sim']

    print()
    print("Getting XLM similarities")
    train_alldata['XLM_cos_sim'] = get_xlm_cos_sims(l1_train_alldata,l2_train_alldata)
    test_alldata['XLM_cos_sim'] = get_xlm_cos_sims(l1_test_alldata,l2_test_alldata)

    train_realdist['XLM_cos_sim'] = train_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['XLM_cos_sim']
    train_balanced['XLM_cos_sim'] = train_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['XLM_cos_sim']

    test_realdist['XLM_cos_sim'] = test_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['XLM_cos_sim']
    test_balanced['XLM_cos_sim'] = test_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['XLM_cos_sim']
        
    train_alldata.to_csv(f'{prefix}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_alldata.to_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    train_realdist.to_csv(f'{prefix}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_realdist.to_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_balanced.to_csv(f'{prefix}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_balanced.to_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')

In [None]:
# test_alldata['MBERT_cos_sim'] = get_mbert_cos_sims(l1_test_alldata,l2_test_alldata)

In [None]:
train_balanced['MBERT_cos_sim'] = train_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['MBERT_cos_sim']

In [None]:
 
print(train_realdist.columns.tolist())

In [None]:
pairs = None

with open('../language-pairs.json', 'r') as f: # for getting logits from all languages
 
    
    pairs = json.loads(f.read())

for pair in pairs:
    print(pair)
    L1 = pairs[pair]['target']['name']
    L2 = pairs[pair]['source']['name']
    
    # load datasets
    prefix = f'../Datasets/production_train_test/{L1}-{L2}'
    
    train_alldata = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_alldata = pd.read_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    train_realdist = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_realdist = pd.read_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_balanced = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_balanced = pd.read_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')

    # get and pad PanPhon features for alldata split
    get_panphon_features(train_alldata, test_alldata)
    alldata_maxlen = (max(np.max(train_alldata['features_loan'].str.len()),\
                          np.max(test_alldata['features_loan'].str.len())),\
                      max(np.max(train_alldata['features_orig'].str.len()),\
                          np.max(test_alldata['features_orig'].str.len())))
    pad_panphon_features(train_alldata, test_alldata, alldata_maxlen)

    # add target labels
    Y_train, Y_test = add_target_labels(train_alldata, test_alldata)

    # make train and val splits
    X_train, X_val, X_test, Y_train, Y_val = make_train_val_set(train_alldata, test_alldata, Y_train)
    X_train, Y_train, X_val, Y_val, X_test, Y_test = make_tensors(X_train, Y_train, X_val, Y_val, X_test, Y_test)

    # instantiate network
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nUsing {device} device\n")
    
    # set random seeds for reproducibility
    np.random.seed(666)

    model = NeuralNetwork(X_train.shape[1]).to(device)
    print(model,"\n")

    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)

    # train and plot losses, accuracy
    train_losses, val_losses, train_accur, val_accur = \
        model.fit(X_train, Y_train, X_val, Y_val, criterion, optimizer)
    model.plot_losses(train_losses,val_losses,train_accur,val_accur)

    # get and pad PanPhon features for realdist and balanced splits
    get_panphon_features(train_realdist,test_realdist)
    pad_panphon_features(train_realdist,test_realdist,alldata_maxlen)

    get_panphon_features(train_balanced,test_balanced)
    pad_panphon_features(train_balanced,test_balanced,alldata_maxlen)

    # create data to get logits for
    X_train_alldata = torch.tensor(np.hstack([np.array([x for x in train_alldata['features_loan']]),\
                         np.array([x for x in train_alldata['features_orig']])])).to(device)
    X_test_alldata = torch.tensor(np.hstack([np.array([x for x in test_alldata['features_loan']]),\
                        np.array([x for x in test_alldata['features_orig']])])).to(device)

    X_train_realdist = torch.tensor(np.hstack([np.array([x for x in train_realdist['features_loan']]),\
                         np.array([x for x in train_realdist['features_orig']])])).to(device)
    X_test_realdist = torch.tensor(np.hstack([np.array([x for x in test_realdist['features_loan']]),\
                        np.array([x for x in test_realdist['features_orig']])])).to(device)

    X_train_balanced = torch.tensor(np.hstack([np.array([x for x in train_balanced['features_loan']]),\
                         np.array([x for x in train_balanced['features_orig']])])).to(device)
    X_test_balanced = torch.tensor(np.hstack([np.array([x for x in test_balanced['features_loan']]),\
                        np.array([x for x in test_balanced['features_orig']])])).to(device)

    # place model in eval mode and get logits from DNN for all datasets/splits
    print("Getting logits from DNN")
    model.eval()

    with torch.no_grad():
        train_logits_dnn_alldata = model(X_train_alldata.float())[1].detach().cpu().numpy()
        test_logits_dnn_alldata = model(X_test_alldata.float())[1].detach().cpu().numpy()
        train_logits_dnn_realdist = model(X_train_realdist.float())[1].detach().cpu().numpy()
        test_logits_dnn_realdist = model(X_test_realdist.float())[1].detach().cpu().numpy()
        train_logits_dnn_balanced = model(X_train_balanced.float())[1].detach().cpu().numpy()
        test_logits_dnn_balanced = model(X_test_balanced.float())[1].detach().cpu().numpy()

    # remove PanPhon features from dataframe and add logits column
    train_alldata = train_alldata.drop(['features_loan','features_orig'], axis=1)
    train_alldata['DNN_logits'] = train_logits_dnn_alldata

    test_alldata = test_alldata.drop(['features_loan','features_orig'], axis=1)
    test_alldata['DNN_logits'] = test_logits_dnn_alldata

    train_realdist = train_realdist.drop(['features_loan','features_orig'], axis=1)
    train_realdist['DNN_logits'] = train_logits_dnn_realdist

    test_realdist = test_realdist.drop(['features_loan','features_orig'], axis=1)
    test_realdist['DNN_logits'] = test_logits_dnn_realdist

    train_balanced = train_balanced.drop(['features_loan','features_orig'], axis=1)
    train_balanced['DNN_logits'] = train_logits_dnn_balanced

    test_balanced = test_balanced.drop(['features_loan','features_orig'], axis=1)
    test_balanced['DNN_logits'] = test_logits_dnn_balanced

    #set the seeds for reproducibility even though we are not fine-tuning or training and the weights 
    #for both these models are effectively frozen for our purpose 
    torch.manual_seed(7)
    random.seed(7)
    np.random.seed(7)

    # Setting PyTorch's required configuration variables for reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = False
    torch.use_deterministic_algorithms(False)

    PRE_TRAINED_bert_MODEL = 'bert-base-multilingual-cased'
    PRE_TRAINED_xlm_MODEL = 'xlm-mlm-100-1280'

    MAXTOKENS = 5
    MAXTOKENS_XLM = 9
    BS = 8  # batch size

    #list of loan-original words for train sets
    l1_train_alldata = list(train_alldata["loan_word"])
    l2_train_alldata = list(train_alldata["original_word"])

    l1_train_realdist = list(train_realdist["loan_word"])
    l2_train_realdist = list(train_realdist["original_word"])

    l1_train_balanced = list(train_balanced["loan_word"])
    l2_train_balanced = list(train_balanced["original_word"])

    #list of loan-original words for test sets
    l1_test_alldata = list(test_alldata["loan_word"])
    l2_test_alldata = list(test_alldata["original_word"])

    l1_test_realdist = list(test_realdist["loan_word"])
    l2_test_realdist = list(test_realdist["original_word"])

    l1_test_balanced = list(test_balanced["loan_word"])
    l2_test_balanced = list(test_balanced["original_word"])

    print("Getting MBERT similarities")
    train_alldata['MBERT_cos_sim'] = get_mbert_cos_sims(l1_train_alldata,l2_train_alldata)
    test_alldata['MBERT_cos_sim'] = get_mbert_cos_sims(l1_test_alldata,l2_test_alldata)

    train_realdist['MBERT_cos_sim'] = train_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['MBERT_cos_sim']
    train_balanced['MBERT_cos_sim'] = train_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['MBERT_cos_sim']

    test_realdist['MBERT_cos_sim'] = test_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['MBERT_cos_sim']
    test_balanced['MBERT_cos_sim'] = test_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['MBERT_cos_sim']

    print()
    print("Getting XLM similarities")
    train_alldata['XLM_cos_sim'] = get_xlm_cos_sims(l1_train_alldata,l2_train_alldata)
    test_alldata['XLM_cos_sim'] = get_xlm_cos_sims(l1_test_alldata,l2_test_alldata)

    train_realdist['XLM_cos_sim'] = train_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['XLM_cos_sim']
    train_balanced['XLM_cos_sim'] = train_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                           on=['loan_word','original_word'], how="left")['XLM_cos_sim']

    test_realdist['XLM_cos_sim'] = test_realdist.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['XLM_cos_sim']
    test_balanced['XLM_cos_sim'] = test_balanced.merge(pd.concat([train_alldata,test_alldata]),\
                                                         on=['loan_word','original_word'], how="left")['XLM_cos_sim']
        
    train_alldata.to_csv(f'{prefix}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_alldata.to_csv(f'{prefix}/alldata/{L1}-{L2}-test_production_alldata.csv')

    train_realdist.to_csv(f'{prefix}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_realdist.to_csv(f'{prefix}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_balanced.to_csv(f'{prefix}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_balanced.to_csv(f'{prefix}/balanced/{L1}-{L2}-test_production_balanced.csv')