In [10]:
import torch
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from tqdm import tqdm
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import os
import io
import requests
import csv
import torch
from torch import nn
from torch import nn, optim
import json

import torch.nn.functional as F

%matplotlib inline

In [15]:
# Language-pair configuration: maps each pair key to its settings
# (read below via the 'target', 'source' and 'loan-frac' entries).
pairs = None

with open('language-pairs-cognates.json', 'r') as config_file:
    # Parse straight from the file handle instead of reading into a string first.
    pairs = json.load(config_file)

def _print_split_sizes(title, loans_split, hard_neg_split, equiv_split, random_split):
    """Print the train/test row counts of one dataset variant.

    Args:
        title: Heading printed before the counts.
        loans_split, hard_neg_split, equiv_split, random_split:
            ``(train_frame, test_frame)`` pairs, one per category.
    """
    print(title)
    categories = (
        ("Loans", loans_split),
        ("Hard negatives", hard_neg_split),
        ("Synonyms", equiv_split),
        ("Randoms", random_split),
    )
    for heading, (train_part, test_part) in categories:
        print(heading)
        print(f"    {train_part.shape[0]} train")
        print(f"    {test_part.shape[0]} test")
    non_loan_splits = (hard_neg_split, equiv_split, random_split)
    print("Total non-loans")
    print(f"    {sum(train_part.shape[0] for train_part, _ in non_loan_splits)} train")
    print(f"    {sum(test_part.shape[0] for _, test_part in non_loan_splits)} test\n")


# Build train/test CSVs for every configured language pair.
#
# For each pair, four source CSVs (cognate "loans", hard negatives, synonyms,
# randoms) are shuffled with a fixed seed and split three ways:
#   * realdist - each non-loan category is sized relative to the loan count
#                using the pair's configured loan fraction
#   * balanced - the non-loan categories together total about the loan count
#   * alldata  - every row of every category, split by train_frac
# Each variant is labelled (label_bin: 1 = loan, 0 = non-loan), concatenated,
# stripped of rows with missing values, shuffled and written to disk.
for pair in pairs:
    print(pair)
    # Note: L1 comes from the pair's 'target' entry, L2 from its 'source' entry.
    L1 = pairs[pair]['target']['name']
    L2 = pairs[pair]['source']['name']
    loan_frac = float(pairs[pair]['loan-frac'])

    path_loans = f"../{L1}-{L2}-Cognates.csv"
    path_hard = f"../{L1}-{L2}-Hard-Negatives.csv"
    path_equiv = f"../{L1}-{L2}-Synonyms.csv"
    path_rand = f"../{L1}-{L2}-Randoms.csv"

    loans = pd.read_csv(path_loans)
    hard_neg = pd.read_csv(path_hard)
    equiv = pd.read_csv(path_equiv)
    random = pd.read_csv(path_rand)

    print(f"{len(loans)} loans")
    print(f"{len(hard_neg)} hard negatives")
    print(f"{len(equiv)} synonyms")
    print(f"{len(random)} randoms\n")

    # Category fractions: hard negatives are fixed at 10%; the remainder after
    # loans and hard negatives is shared 2:1 between synonyms and randoms.
    loans_realdist_frac = loan_frac
    hard_neg_nonloan_frac = .1
    equiv_nonloan_frac = (1-(loans_realdist_frac+hard_neg_nonloan_frac))*(2/3)
    random_nonloan_frac = equiv_nonloan_frac/2
    nonloan_frac = equiv_nonloan_frac+random_nonloan_frac+hard_neg_nonloan_frac

    print("Real distribution fractions")
    print(f"{loans_realdist_frac*100}% loans")
    print(f"{hard_neg_nonloan_frac*100}% hard negatives")
    print(f"{equiv_nonloan_frac*100}% synonyms")
    print(f"{random_nonloan_frac*100}% randoms")
    print(f"Total {nonloan_frac*100}% non-loans\n")

    # Re-seed for every pair so each pair's shuffles are reproducible on their own.
    np.random.seed(420)

    # Shuffle every category before slicing; the call order fixes the RNG stream.
    loans = loans.iloc[np.random.permutation(len(loans))]
    equiv = equiv.iloc[np.random.permutation(len(equiv))]
    random = random.iloc[np.random.permutation(len(random))]
    hard_neg = hard_neg.iloc[np.random.permutation(len(hard_neg))]

    train_frac = .9
    test_frac = 1-train_frac
    n_loans = loans.shape[0]

    # ---- version one: realistic distribution according to computed percentages
    # The test rows are taken from the head of each shuffled frame, the train
    # rows from what follows.
    # NOTE(review): each non-loan train slice ends at (test size + full category
    # quota), so train holds the full quota rather than train_frac of it.
    # Kept as in the original; confirm this is intended.
    loans_test_realdist = loans.iloc[:int(n_loans * test_frac), :]
    loans_train_realdist = loans.iloc[int(n_loans * test_frac):, :]
    hard_neg_test_realdist = hard_neg.iloc[:int(n_loans * hard_neg_nonloan_frac/loans_realdist_frac * test_frac), :]
    hard_neg_train_realdist = hard_neg.iloc[int(n_loans * hard_neg_nonloan_frac/loans_realdist_frac * test_frac):hard_neg_test_realdist.shape[0]+int(n_loans * hard_neg_nonloan_frac/loans_realdist_frac), :]
    equiv_test_realdist = equiv.iloc[:int(n_loans * equiv_nonloan_frac/loans_realdist_frac * test_frac), :]
    equiv_train_realdist = equiv.iloc[int(n_loans * equiv_nonloan_frac/loans_realdist_frac * test_frac):equiv_test_realdist.shape[0]+int(n_loans * equiv_nonloan_frac/loans_realdist_frac), :]
    random_test_realdist = random.iloc[:int(n_loans * random_nonloan_frac/loans_realdist_frac * test_frac), :]
    random_train_realdist = random.iloc[int(n_loans * random_nonloan_frac/loans_realdist_frac * test_frac):random_test_realdist.shape[0]+int(n_loans * random_nonloan_frac/loans_realdist_frac), :]

    _print_split_sizes(
        "Real distribution",
        (loans_train_realdist, loans_test_realdist),
        (hard_neg_train_realdist, hard_neg_test_realdist),
        (equiv_train_realdist, equiv_test_realdist),
        (random_train_realdist, random_test_realdist),
    )

    # ---- version two: balanced - non-loans together total about the loan
    # count, divided among categories in proportion to their fractions.
    # Here the train rows come from the head and the test rows follow.
    loans_train_balanced = loans.iloc[:int(n_loans * train_frac), :]
    loans_test_balanced = loans.iloc[int(n_loans * train_frac):, :]
    hard_neg_train_balanced = hard_neg.iloc[:int(n_loans * hard_neg_nonloan_frac/nonloan_frac * train_frac), :]
    hard_neg_test_balanced = hard_neg.iloc[int(n_loans * hard_neg_nonloan_frac/nonloan_frac * train_frac):int(n_loans * hard_neg_nonloan_frac/nonloan_frac), :]
    equiv_train_balanced = equiv.iloc[:int(n_loans * equiv_nonloan_frac/nonloan_frac * train_frac), :]
    equiv_test_balanced = equiv.iloc[int(n_loans * equiv_nonloan_frac/nonloan_frac * train_frac):int(n_loans * equiv_nonloan_frac/nonloan_frac), :]
    random_train_balanced = random.iloc[:int(n_loans * random_nonloan_frac/nonloan_frac * train_frac), :]
    random_test_balanced = random.iloc[int(n_loans * random_nonloan_frac/nonloan_frac * train_frac):int(n_loans * random_nonloan_frac/nonloan_frac), :]

    _print_split_sizes(
        "Balanced distribution",
        (loans_train_balanced, loans_test_balanced),
        (hard_neg_train_balanced, hard_neg_test_balanced),
        (equiv_train_balanced, equiv_test_balanced),
        (random_train_balanced, random_test_balanced),
    )

    # ---- version three: the entire dataset, so non-loans outweigh loans
    loans_test_alldata = loans.iloc[:int(n_loans * test_frac), :]
    loans_train_alldata = loans.iloc[int(n_loans * test_frac):, :]
    hard_neg_test_alldata = hard_neg.iloc[:int(hard_neg.shape[0] * test_frac), :]
    hard_neg_train_alldata = hard_neg.iloc[int(hard_neg.shape[0] * test_frac):, :]
    equiv_test_alldata = equiv.iloc[:int(equiv.shape[0] * test_frac), :]
    equiv_train_alldata = equiv.iloc[int(equiv.shape[0] * test_frac):, :]
    random_test_alldata = random.iloc[:int(random.shape[0] * test_frac), :]
    random_train_alldata = random.iloc[int(random.shape[0] * test_frac):, :]

    _print_split_sizes(
        "All data",
        (loans_train_alldata, loans_test_alldata),
        (hard_neg_train_alldata, hard_neg_test_alldata),
        (equiv_train_alldata, equiv_test_alldata),
        (random_train_alldata, random_test_alldata),
    )

    # Add the binary category label: 1 = loan, 0 = any non-loan category.
    # The frames are slices of the shuffled sources, which is why the pandas
    # chained-assignment warning is disabled in the import cell.
    for loan_part in (
        loans_train_realdist, loans_test_realdist,
        loans_train_balanced, loans_test_balanced,
        loans_train_alldata, loans_test_alldata,
    ):
        loan_part['label_bin'] = 1

    for non_loan_part in (
        equiv_train_realdist, equiv_test_realdist,
        random_train_realdist, random_test_realdist,
        hard_neg_train_realdist, hard_neg_test_realdist,
        equiv_train_balanced, equiv_test_balanced,
        random_train_balanced, random_test_balanced,
        hard_neg_train_balanced, hard_neg_test_balanced,
        equiv_train_alldata, equiv_test_alldata,
        random_train_alldata, random_test_alldata,
        hard_neg_train_alldata, hard_neg_test_alldata,
    ):
        non_loan_part['label_bin'] = 0

    # Make the concatenated datasets and drop rows containing missing values.
    train_final_realdist = pd.concat([loans_train_realdist, equiv_train_realdist, random_train_realdist, hard_neg_train_realdist])
    test_final_realdist = pd.concat([loans_test_realdist, equiv_test_realdist, random_test_realdist, hard_neg_test_realdist])
    train_final_realdist = train_final_realdist.dropna(axis=0)
    test_final_realdist = test_final_realdist.dropna(axis=0)

    train_final_balanced = pd.concat([loans_train_balanced, equiv_train_balanced, random_train_balanced, hard_neg_train_balanced])
    test_final_balanced = pd.concat([loans_test_balanced, equiv_test_balanced, random_test_balanced, hard_neg_test_balanced])
    train_final_balanced = train_final_balanced.dropna(axis=0)
    test_final_balanced = test_final_balanced.dropna(axis=0)

    # Fix: the test set is built from the *_test_alldata frames. It used to be
    # derived from train_final_alldata, which made the test CSV a copy of the
    # training data, and the training set itself never had dropna applied.
    train_final_alldata = pd.concat([loans_train_alldata, equiv_train_alldata, random_train_alldata, hard_neg_train_alldata])
    test_final_alldata = pd.concat([loans_test_alldata, equiv_test_alldata, random_test_alldata, hard_neg_test_alldata])
    train_final_alldata = train_final_alldata.dropna(axis=0)
    test_final_alldata = test_final_alldata.dropna(axis=0)

    # Shuffle after the final concatenation so the categories are interleaved.
    train_final_realdist = train_final_realdist.iloc[np.random.permutation(len(train_final_realdist))]
    test_final_realdist = test_final_realdist.iloc[np.random.permutation(len(test_final_realdist))]

    train_final_balanced = train_final_balanced.iloc[np.random.permutation(len(train_final_balanced))]
    test_final_balanced = test_final_balanced.iloc[np.random.permutation(len(test_final_balanced))]

    train_final_alldata = train_final_alldata.iloc[np.random.permutation(len(train_final_alldata))]
    test_final_alldata = test_final_alldata.iloc[np.random.permutation(len(test_final_alldata))]

    output_dir = 'production_train_test/'
    pair_dir = f'{output_dir}/{L1}-{L2}'

    # Create each variant directory independently, so a partially existing
    # tree (pair directory present, a subdirectory missing) is still completed.
    for variant in ('realdist', 'balanced', 'alldata'):
        os.makedirs(f'{pair_dir}/{variant}', exist_ok=True)

    train_final_realdist.to_csv(f'{pair_dir}/realdist/{L1}-{L2}-train_production_realdist.csv')
    test_final_realdist.to_csv(f'{pair_dir}/realdist/{L1}-{L2}-test_production_realdist.csv')

    train_final_balanced.to_csv(f'{pair_dir}/balanced/{L1}-{L2}-train_production_balanced.csv')
    test_final_balanced.to_csv(f'{pair_dir}/balanced/{L1}-{L2}-test_production_balanced.csv')

    train_final_alldata.to_csv(f'{pair_dir}/alldata/{L1}-{L2}-train_production_alldata.csv')
    test_final_alldata.to_csv(f'{pair_dir}/alldata/{L1}-{L2}-test_production_alldata.csv')

Assamese-Bengali
339 loans
862 hard negatives
365 synonyms
337 randoms

Real distribution fractions
50.0% loans
10.0% hard negatives
26.666666666666668% synonyms
13.333333333333334% randoms
Total 50.0% non-loans

Real distribution
Loans
    306 train
    33 test
Hard negatives
    67 train
    6 test
Synonyms
    180 train
    18 test
Randoms
    90 train
    9 test
Total non-loans
    337 train
    33 test

Balanced distribution
Loans
    305 train
    34 test
Hard negatives
    61 train
    6 test
Synonyms
    162 train
    18 test
Randoms
    81 train
    9 test
Total non-loans
    304 train
    33 test

All data
Loans
    306 train
    33 test
Hard negatives
    776 train
    86 test
Synonyms
    329 train
    36 test
Randoms
    304 train
    33 test
Total non-loans
    1409 train
    155 test

Bengali-Assamese
339 loans
801 hard negatives
352 synonyms
337 randoms

Real distribution fractions
50.0% loans
10.0% hard negatives
26.666666666666668% synonyms
13.333333333333334% randoms