Run exactly one of the next two cells, depending on the language pair.

In [2]:
# Configuration for the Hindi/Persian pair.
# L1 is the language of `loan_word`, L2 the language of `original_word`;
# both names are also used to build the CSV file names (f"{L1}-{L2}.csv").
L1 = "Hindi"
L2 = "Persian"
# Epitran language codes used for the IPA transliteration.
L1_epi = "hin-Deva"
L2_epi = "fas-Arab"
# googletrans destination codes used when translating the synonym list.
L1_gtrans = "hi"
L2_gtrans = "fa"
# Script names written to the `loan_unicode` / `original_unicode` columns.
L1_unicode = "Devanagari"
L2_unicode = "Arabic"
synonym_file_name = "English-Hindi-Syn-Ant.csv"
colab = False # Set to True when running on Google Colab (enables the files.download cells)

In [2]:
# Configuration for the English/French pair.
# L1 is the language of `loan_word`, L2 the language of `original_word`;
# both names are also used to build the CSV file names (f"{L1}-{L2}.csv").
L1 = "English"
L2 = "French"
# Epitran language codes; "eng-Latn" makes the notebook use eng_to_ipa instead of Epitran.
L1_epi = "eng-Latn"
L2_epi = "fra-Latn"
# googletrans destination codes used when translating the synonym list.
L1_gtrans = "en"
L2_gtrans = "fr"
# Script names written to the `loan_unicode` / `original_unicode` columns.
L1_unicode = "Latin"
L2_unicode = "Latin"
synonym_file_name = "English-Hindi-Syn-Ant.csv"
colab = False # Set to True when running on Google Colab (enables the files.download cells)

# Installations

In [3]:
!pip install Epitran
!pip install eng_to_ipa
!pip install panphon
!pip install -U PyYAML
!pip install googletrans
!pip install googletrans==3.1.0a0
!pip install unicodeblock



# Imports

In [4]:
import pandas as pd
import numpy as np
import panphon
import panphon.distance
import editdistance # levenshtein
import epitran
import eng_to_ipa as eng
from googletrans import Translator
from googletrans import LANGUAGES
import itertools
import unicodeblock.blocks
# epitran.download.cedict() # required for Mandarin Chinese (could be replace by cedict_file='cedict.txt' as argument)
if colab:
     from google.colab import files

# Preparation

In [5]:
# Shared panphon Distance instance; its methods supply every edit distance used below.
dst = panphon.distance.Distance()

In [6]:
# The `*_div_maxlen` panphon distance methods computed for every word pair.
# `edit_dists_names` holds the matching column names, in the same order.
edit_dists = [
    dst.fast_levenshtein_distance_div_maxlen,
    dst.dolgo_prime_distance_div_maxlen,
    dst.feature_edit_distance_div_maxlen,
    dst.hamming_feature_edit_distance_div_maxlen,
    dst.weighted_feature_edit_distance_div_maxlen,
    dst.partial_hamming_feature_edit_distance_div_maxlen,
]
edit_dists_names = [
    "Fast Levenshtein Distance Div Maxlen",
    "Dolgo Prime Distance Div Maxlen",
    "Feature Edit Distance Div Maxlen",
    "Hamming Feature Distance Div Maxlen",
    "Weighted Feature Distance Div Maxlen",
    "Partial Hamming Feature Distance Div Maxlen",
]

In [7]:
# Final column order shared by all exported tables.
l = [
    'loan_word',
    'original_word',
    'loan_word_epitran',
    'original_word_epitran',
    'loan_english',
    'original_english',
    'Fast Levenshtein Distance Div Maxlen',
    'Dolgo Prime Distance Div Maxlen',
    'Feature Edit Distance Div Maxlen',
    'Hamming Feature Distance Div Maxlen',
    'Weighted Feature Distance Div Maxlen',
    'Partial Hamming Feature Distance Div Maxlen',
    'plain Levenshtein',
    'loan_unicode',
    'original_unicode',
    'label',
]

In [8]:
def filter(test_str, all_freq):
    """Add the character counts of ``test_str`` to ``all_freq`` in place.

    Every character is lower-cased before counting. Nothing is returned;
    the caller's dictionary is updated.
    """
    for ch in test_str:
        key = ch.lower()
        all_freq[key] = all_freq.get(key, 0) + 1

In [9]:
# Load the raw word pairs and force both word columns to plain strings.
L1_L2 = pd.read_csv(f"{L1}-{L2}.csv").astype({'original_word': str, 'loan_word': str})

In [10]:
# Collect the unicode block of every character that occurs in `original_word`.
s = {
    unicodeblock.blocks.of(letter)
    for word in L1_L2.original_word
    for letter in word
}
s

{'ARABIC',
 'BASIC_LATIN',
 'BASIC_PUNCTUATION',
 'DEVANAGARI',
 'GENERAL_PUNCTUATION',
 'GURMUKHI',
 'SPACE'}

In [11]:
# Unicode blocks accepted for loan_word (L1) and original_word (L2) characters;
# a character outside these blocks is only tolerated when it is frequent in its column.
# NOTE(review): both sets list only Latin blocks — confirm they are adjusted when
# a non-Latin pair (e.g. the Hindi/Persian configuration) is selected.
L1_uni_blocks = {'BASIC_LATIN', 'BASIC_PUNCTUATION', 'LATIN_EXTENDED_LETTER', 'SPACE'}
L2_uni_blocks = {'BASIC_LATIN', 'BASIC_PUNCTUATION', 'LATIN_EXTENDED_A', 'LATIN_EXTENDED_LETTER', 'SPACE'}

# Loans

In [12]:
def get_L1_L2(L1, L1_epi, L2, L2_epi):
    """Build the labelled loan-word table for one language pair.

    Reads ``{L1}-{L2}.csv`` (columns ``loan_word`` and ``original_word``),
    removes noisy rows, adds IPA transliterations, English translations and
    the phonetic edit distances, and labels every row ``"loan"``.

    A row is removed when either word is the literal string ``'nan'``, when a
    word contains a character that is neither frequent in its column (more
    than 10 occurrences) nor inside the allowed unicode blocks, or when the
    plain Levenshtein distance between the two spellings exceeds 25.

    Args:
        L1: Language name of ``loan_word``; used in the CSV file name.
        L1_epi: Epitran code for ``loan_word``. ``"eng-Latn"`` selects
            ``eng_to_ipa`` instead of Epitran.
        L2: Language name of ``original_word``; used in the CSV file name.
        L2_epi: Epitran code for ``original_word`` (same convention).

    Returns:
        DataFrame restricted to the columns in the notebook-level list ``l``.

    Notes:
        Relies on notebook-level names: ``filter``, ``dst``, ``edit_dists``,
        ``edit_dists_names``, ``l``, ``L1_uni_blocks``, ``L2_uni_blocks``,
        ``L1_unicode`` and ``L2_unicode``. Translation goes through
        googletrans, one request per word.
    """
    L1_L2 = pd.read_csv(f"{L1}-{L2}.csv")
    L1_L2['original_word'] = L1_L2['original_word'].astype(str)
    L1_L2['loan_word'] = L1_L2['loan_word'].astype(str)
    # Reset the index so row positions and index labels agree after dropna().
    L1_L2 = L1_L2.dropna().reset_index(drop=True)

    # Character frequencies per column (keys are lower-cased by `filter`).
    freq_L1 = dict()
    freq_L2 = dict()
    for word in L1_L2["loan_word"]:
        filter(word, freq_L1)
    for word in L1_L2["original_word"]:
        filter(word, freq_L2)
    alpha_L1 = {key for key, value in freq_L1.items() if value > 10}
    alpha_L2 = {key for key, value in freq_L2.items() if value > 10}

    L1_L2["plain Levenshtein"] = [
        dst.fast_levenshtein_distance(loan, original)
        for loan, original in zip(L1_L2["loan_word"], L1_L2["original_word"])
    ]

    def _has_foreign_char(word, alphabet, blocks):
        # True when some character is neither frequent nor in an allowed block.
        return any(
            ch not in alphabet and unicodeblock.blocks.of(ch) not in blocks
            for ch in word
        )

    # Boolean mask instead of a list of row numbers: the original mixed
    # positional row numbers with index labels, which dropped the wrong rows
    # whenever dropna() had removed something.
    keep = []
    for loan, original, lev in zip(L1_L2["loan_word"], L1_L2["original_word"], L1_L2["plain Levenshtein"]):
        noisy = (
            loan == 'nan'
            or original == 'nan'
            or _has_foreign_char(loan, alpha_L1, L1_uni_blocks)
            or _has_foreign_char(original, alpha_L2, L2_uni_blocks)
            or lev > 25
        )
        keep.append(not noisy)
    L1_L2 = L1_L2.loc[np.array(keep, dtype=bool)].reset_index(drop=True)

    # IPA transliteration: eng_to_ipa for English, Epitran otherwise.
    if L1_epi == "eng-Latn":
        loan_to_ipa = eng.convert
    else:
        loan_to_ipa = epitran.Epitran(L1_epi).transliterate
    L1_L2["loan_word_epitran"] = L1_L2["loan_word"].apply(loan_to_ipa)

    if L2_epi == "eng-Latn":
        original_to_ipa = eng.convert
    else:
        original_to_ipa = epitran.Epitran(L2_epi).transliterate
    L1_L2["original_word_epitran"] = L1_L2["original_word"].apply(original_to_ipa)

    translator = Translator()
    L1_L2["loan_english"] = L1_L2["loan_word"].apply(lambda w: translator.translate(w, dest="en").text)
    L1_L2["original_english"] = L1_L2["original_word"].apply(lambda w: translator.translate(w, dest="en").text)

    L1_L2["label"] = "loan"
    # Map Arabic-script letters that remain in the transliteration to IPA.
    L1_L2['original_word_epitran'] = L1_L2['original_word_epitran'].apply(lambda x: x.replace("پ", "p").replace("ك", "g").replace("ي", "j").replace("گ", "g").replace("چ", "tʃ").replace("ژ", "ʒ").replace("ء", "ʔ"))
    L1_L2 = L1_L2.drop_duplicates(ignore_index=True)

    for dist, name in zip(edit_dists, edit_dists_names):
        L1_L2[name] = [
            dist(loan_ipa, original_ipa)
            for loan_ipa, original_ipa in zip(L1_L2["loan_word_epitran"], L1_L2["original_word_epitran"])
        ]
    L1_L2['loan_unicode'] = L1_unicode
    L1_L2['original_unicode'] = L2_unicode
    L1_L2 = L1_L2[l]
    return L1_L2

In [13]:
# Build the loan table for the configured pair; this replaces the raw frame loaded earlier.
L1_L2 = get_L1_L2(L1, L1_epi, L2, L2_epi)

In [None]:
# Persist the loan table without the index column.
L1_L2.to_csv(f"{L1}-{L2}-Loans.csv", index=False)

In [None]:
# On Colab, offer the CSV as a browser download (`files` is imported only when colab is True).
if colab:
    files.download(f"{L1}-{L2}-Loans.csv")

# Synonyms

In [18]:
def get_L1_L2_Synonyms(L1_epi, L2_epi, L1_gtrans, L2_gtrans, file_name=synonym_file_name):
    """Build the table of translated synonym pairs, labelled ``"synonym"``.

    Every ``Synonym`` cell of ``file_name`` is split on ``', '`` and translated
    into both languages; the cartesian product of the two translated lists
    gives the candidate ``(loan_word, original_word)`` pairs. The candidates
    are filtered with the same noise rules as the loan table (character
    frequencies are taken from ``{L1}-{L2}.csv``). A candidate is dropped only
    when both its ``loan_word`` and its ``original_word`` occur in the loan
    file.

    Args:
        L1_epi: Epitran code for ``loan_word``. ``"eng-Latn"`` selects
            ``eng_to_ipa`` instead of Epitran.
        L2_epi: Epitran code for ``original_word`` (same convention).
        L1_gtrans: googletrans destination code for the ``loan_word`` side.
        L2_gtrans: googletrans destination code for the ``original_word`` side.
        file_name: CSV file with a ``Synonym`` column.

    Returns:
        DataFrame restricted to the standard output columns.

    Notes:
        Reads the notebook-level ``L1`` and ``L2`` for the loan file name and
        relies on ``filter``, ``dst``, ``edit_dists``, ``edit_dists_names``,
        ``L1_uni_blocks``, ``L2_uni_blocks``, ``L1_unicode`` and
        ``L2_unicode``.
    """
    df = pd.read_csv(file_name)
    translator = Translator()

    l1, l2 = [], []
    for entry in df["Synonym"]:
        synonyms = entry.split(', ')
        in_L1 = [t.text for t in translator.translate(synonyms, dest=L1_gtrans)]
        in_L2 = [t.text for t in translator.translate(synonyms, dest=L2_gtrans)]
        for word_L1, word_L2 in itertools.product(in_L1, in_L2):
            l1.append(word_L1)
            l2.append(word_L2)
    L1_L2_Synonyms = pd.DataFrame(list(zip(l1, l2)), columns =['loan_word', 'original_word'])

    # The loan file supplies the character frequencies and the exclusion list.
    L1_L2 = pd.read_csv(f"{L1}-{L2}.csv")
    L1_L2['original_word'] = L1_L2['original_word'].astype(str)
    L1_L2['loan_word'] = L1_L2['loan_word'].astype(str)
    L1_L2_Synonyms = L1_L2_Synonyms.dropna().reset_index(drop=True)

    freq_L1 = dict()
    freq_L2 = dict()
    for word in L1_L2["loan_word"]:
        filter(word, freq_L1)
    for word in L1_L2["original_word"]:
        filter(word, freq_L2)
    alpha_L1 = {key for key, value in freq_L1.items() if value > 10}
    alpha_L2 = {key for key, value in freq_L2.items() if value > 10}

    L1_L2_Synonyms["plain Levenshtein"] = [
        dst.fast_levenshtein_distance(loan, original)
        for loan, original in zip(L1_L2_Synonyms["loan_word"], L1_L2_Synonyms["original_word"])
    ]

    def _has_foreign_char(word, alphabet, blocks):
        # True when some character is neither frequent nor in an allowed block.
        return any(
            ch not in alphabet and unicodeblock.blocks.of(ch) not in blocks
            for ch in word
        )

    # Iterate over the synonym rows themselves. The original looped over the
    # loan table's row count, which raised IndexError or left rows unfiltered
    # whenever the two tables differed in length.
    keep = []
    for loan, original, lev in zip(L1_L2_Synonyms["loan_word"], L1_L2_Synonyms["original_word"], L1_L2_Synonyms["plain Levenshtein"]):
        noisy = (
            loan == 'nan'
            or original == 'nan'
            or _has_foreign_char(loan, alpha_L1, L1_uni_blocks)
            or _has_foreign_char(original, alpha_L2, L2_uni_blocks)
            or lev > 25
        )
        keep.append(not noisy)
    L1_L2_Synonyms = L1_L2_Synonyms.loc[np.array(keep, dtype=bool)].reset_index(drop=True)

    # .copy() so the column assignments below do not write into a slice.
    L1_L2_Synonyms = L1_L2_Synonyms[(~L1_L2_Synonyms['loan_word'].isin(L1_L2['loan_word'])) | (~L1_L2_Synonyms['original_word'].isin(L1_L2['original_word']))].copy()

    # IPA transliteration: eng_to_ipa for English, Epitran otherwise.
    if L1_epi == "eng-Latn":
        loan_to_ipa = eng.convert
    else:
        loan_to_ipa = epitran.Epitran(L1_epi).transliterate
    L1_L2_Synonyms["loan_word_epitran"] = L1_L2_Synonyms["loan_word"].apply(loan_to_ipa)

    if L2_epi == "eng-Latn":
        original_to_ipa = eng.convert
    else:
        original_to_ipa = epitran.Epitran(L2_epi).transliterate
    L1_L2_Synonyms["original_word_epitran"] = L1_L2_Synonyms["original_word"].apply(original_to_ipa)

    L1_L2_Synonyms["loan_english"] = L1_L2_Synonyms["loan_word"].apply(lambda w: translator.translate(w, dest="en").text)
    L1_L2_Synonyms["original_english"] = L1_L2_Synonyms["original_word"].apply(lambda w: translator.translate(w, dest="en").text)
    L1_L2_Synonyms = L1_L2_Synonyms.drop_duplicates(ignore_index=True)
    # Map Arabic-script letters that remain in the transliteration to IPA.
    L1_L2_Synonyms['original_word_epitran'] = L1_L2_Synonyms['original_word_epitran'].apply(lambda x: x.replace("پ", "p").replace("ء", "ʔ").replace("ي", "j").replace("گ", "g").replace("ك", "g").replace("چ", "tʃ").replace("ژ", "ʒ"))
    for dist, name in zip(edit_dists, edit_dists_names):
        L1_L2_Synonyms[name] = [
            dist(loan_ipa, original_ipa)
            for loan_ipa, original_ipa in zip(L1_L2_Synonyms["loan_word_epitran"], L1_L2_Synonyms["original_word_epitran"])
        ]
    L1_L2_Synonyms["label"] = "synonym"
    columns = ['loan_word', 'original_word', 'loan_word_epitran', 'original_word_epitran', 'loan_english', 'original_english', 'Fast Levenshtein Distance Div Maxlen', 'Dolgo Prime Distance Div Maxlen', 'Feature Edit Distance Div Maxlen', 'Hamming Feature Distance Div Maxlen', 'Weighted Feature Distance Div Maxlen', 'Partial Hamming Feature Distance Div Maxlen', 'plain Levenshtein', 'loan_unicode', 'original_unicode', 'label']
    L1_L2_Synonyms['loan_unicode'] = L1_unicode
    L1_L2_Synonyms['original_unicode'] = L2_unicode
    L1_L2_Synonyms = L1_L2_Synonyms[columns]

    return L1_L2_Synonyms

In [19]:
# Build the synonym table for the configured pair (a stray trailing `b` made this line a SyntaxError).
L1_L2_Synonyms = get_L1_L2_Synonyms(L1_epi, L2_epi, L1_gtrans, L2_gtrans, synonym_file_name)

In [None]:
# Persist the synonym table without the index column.
L1_L2_Synonyms.to_csv(f"{L1}-{L2}-Synonyms.csv", index=False)

In [None]:
# On Colab, offer the CSV as a browser download (`files` is imported only when colab is True).
if colab:
    files.download(f"{L1}-{L2}-Synonyms.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Hard Negatives

In [None]:
# Candidate vocabulary for hard negatives: the distinct `original_word` values of the loan table.
# Alternative source kept for reference:
# vocab = pd.read_csv("dictionary.csv")["form"]
vocab = L1_L2.original_word.drop_duplicates().reset_index(drop=True)

In [None]:
# Make sure every vocabulary entry is a string before stripping and transliterating.
vocab = vocab.astype(str)

In [None]:
# Remove leading and trailing whitespace from every entry.
vocab = vocab.apply(lambda x: x.strip())

In [None]:
# Transliterate the vocabulary to IPA: eng_to_ipa for English, Epitran otherwise.
if L2_epi != "eng-Latn":
    epi = epitran.Epitran(L2_epi)
    _vocab_to_ipa = epi.transliterate
else:
    _vocab_to_ipa = eng.convert
vocab_ipa = vocab.apply(_vocab_to_ipa)
vocab_ipa.to_csv(f"{L2}-vocab-ipa.csv", index=False)

In [None]:
# Translate every vocabulary entry to English (one googletrans request per word) and save it.
translator = Translator()


def _vocab_to_english(word):
    """Return the English translation text of ``word``."""
    return translator.translate(word, dest="en").text


vocab_translated = vocab.apply(_vocab_to_english)
vocab_translated.to_csv(f"{L2}-vocab-translated.csv", index=False)

In [None]:
# vocab = vocab.iloc[::20].reset_index(drop=True)
# vocab_ipa = vocab_ipa.iloc[::20].reset_index(drop=True)
# vocab_translated = vocab_translated.iloc[::20].reset_index(drop=True)

In [None]:
def closest_neighbour_epi(anch, distance, df, vocab, ipa_forms=None):
    """Find the vocabulary entry phonetically closest to one anchor row.

    Args:
        anch: Positional row number of the anchor in ``df``.
        distance: Callable ``distance(ipa_a, ipa_b)`` returning a number.
        df: Frame with ``loan_word_epitran`` and ``original_word`` columns.
        vocab: Candidate words, indexable by ``0 .. len(vocab) - 1``.
        ipa_forms: IPA form of each ``vocab`` entry, indexed the same way.
            Defaults to the notebook-level ``vocab_ipa``.

    Returns:
        ``(argmin, min_dist)``: the index of the closest candidate whose
        spelling differs from the anchor's ``original_word``, and its
        distance. When no candidate qualifies the result is ``(0, 10000)``.
    """
    if ipa_forms is None:
        # Backward-compatible fallback to the notebook-level series.
        ipa_forms = vocab_ipa

    # Look the anchor row up once instead of twice per vocabulary entry.
    anchor = df.iloc[anch]
    anchor_ipa = anchor["loan_word_epitran"]
    anchor_original = anchor["original_word"]

    min_dist = 10000
    argmin = 0
    for i in range(len(vocab)):
        tmp_dist = distance(anchor_ipa, ipa_forms[i])
        # Skip the anchor's own source word: it is the positive pair.
        if (tmp_dist < min_dist) and (anchor_original != vocab[i]):
            argmin = i
            min_dist = tmp_dist
    return argmin, min_dist

In [None]:
def get_Hard_Negatives():
    """Build the hard-negative table: loan words paired with look-alike non-sources.

    For every edit distance in ``edit_dists`` and every row of the loan table,
    the phonetically closest vocabulary entry other than the true source word
    is paired with the loan word. A pair is dropped only when both its
    ``loan_word`` and its ``original_word`` occur in the synonym table; the
    remaining pairs are de-duplicated, scored with all edit distances and
    labelled ``"hard_negative"``.

    Returns:
        DataFrame restricted to the columns in the notebook-level list ``l``.

    Notes:
        Works entirely on notebook-level state: ``L1_L2``, ``L1_L2_Synonyms``,
        ``vocab``, ``vocab_ipa``, ``vocab_translated``, ``edit_dists``,
        ``edit_dists_names``, ``dst``, ``l``, ``L1_unicode`` and
        ``L2_unicode``.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for edit_dist in edit_dists:
        for row in range(len(L1_L2)):
            if row % 50 == 0:
                print(row)
            i, _ = closest_neighbour_epi(row, edit_dist, L1_L2, vocab)
            anchor = L1_L2.iloc[row]
            if anchor["original_word"] != vocab[i]:
                records.append({
                    'loan_word': anchor["loan_word"],
                    'original_word': vocab[i],
                    'loan_word_epitran': anchor["loan_word_epitran"],
                    'original_word_epitran': vocab_ipa[i],
                    'loan_english': anchor["loan_english"],
                    'original_english': vocab_translated[i],
                })
    # Same column set as the loan table; columns not filled above start as NaN.
    L1_L2_Hard_Negatives = pd.DataFrame(records, columns=L1_L2.columns)

    # .copy() so the column assignments below do not write into a slice.
    L1_L2_Hard_Negatives = L1_L2_Hard_Negatives[(~L1_L2_Hard_Negatives['loan_word'].isin(L1_L2_Synonyms['loan_word'])) | (~L1_L2_Hard_Negatives['original_word'].isin(L1_L2_Synonyms['original_word']))].copy()
    L1_L2_Hard_Negatives["label"]="hard_negative"
    L1_L2_Hard_Negatives['loan_unicode'] = L1_unicode
    L1_L2_Hard_Negatives['original_unicode'] = L2_unicode
    L1_L2_Hard_Negatives = L1_L2_Hard_Negatives.drop_duplicates(ignore_index=True)
    # Map Arabic-script letters that remain in the transliteration to IPA.
    L1_L2_Hard_Negatives['original_word_epitran'] = L1_L2_Hard_Negatives['original_word_epitran'].apply(lambda x: x.replace("پ", "p").replace("ء", "ʔ").replace("ي", "j").replace("ك", "g").replace("گ", "g").replace("چ", "tʃ").replace("ژ", "ʒ"))
    for dist, name in zip(edit_dists, edit_dists_names):
        L1_L2_Hard_Negatives[name] = [
            dist(loan_ipa, original_ipa)
            for loan_ipa, original_ipa in zip(L1_L2_Hard_Negatives["loan_word_epitran"], L1_L2_Hard_Negatives["original_word_epitran"])
        ]
    L1_L2_Hard_Negatives["plain Levenshtein"] = [
        dst.fast_levenshtein_distance(loan, original)
        for loan, original in zip(L1_L2_Hard_Negatives["loan_word"], L1_L2_Hard_Negatives["original_word"])
    ]
    L1_L2_Hard_Negatives = L1_L2_Hard_Negatives[l]
    return L1_L2_Hard_Negatives

In [None]:
# Build the hard-negative table from the loan table, the vocabulary and the synonym table.
L1_L2_Hard_Negatives = get_Hard_Negatives()

In [None]:
# Persist the hard-negative table without the index column.
L1_L2_Hard_Negatives.to_csv(f"{L1}-{L2}-Hard-Negatives.csv", index=False)

In [None]:
# NOTE(review): this overrides the `colab` flag chosen in the configuration cell.
# `files` is imported by the imports cell only when `colab` was already True at
# that point, so on a run that started with colab = False the download cells
# after this one fail with a NameError — confirm this override is intended.
colab = True

In [None]:
# On Colab, offer the CSV as a browser download (`files` is imported only when colab is True).
if colab:
    files.download(f"{L1}-{L2}-Hard-Negatives.csv")

# Randoms

In [113]:
def get_L1_L2_Randoms(L1_L2, seed=None):
    """Build random negative pairs by shuffling the source side of the loan table.

    The ``original_word``, ``original_english`` and ``original_word_epitran``
    columns are permuted together, so every loan word ends up next to a
    randomly chosen source word. Pairs that already occur in the loan,
    synonym or hard-negative tables are removed, the rest are de-duplicated,
    scored with all edit distances and labelled ``"random"``.

    Args:
        L1_L2: Loan table as returned by ``get_L1_L2``. It is not modified.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            global NumPy random state is used, as before.

    Returns:
        DataFrame restricted to the columns in the notebook-level list ``l``.

    Notes:
        Relies on notebook-level names: ``L1_L2_Synonyms``,
        ``L1_L2_Hard_Negatives``, ``edit_dists``, ``edit_dists_names``,
        ``dst`` and ``l``.
    """
    # Work on a copy so the caller's frame is left untouched.
    L1_L2_Randoms = L1_L2.copy()
    L1_L2_Randoms['loan_word'] = L1_L2_Randoms['loan_word'].astype(str)
    L1_L2_Randoms['original_word'] = L1_L2_Randoms['original_word'].astype(str)
    L1_L2_Randoms = L1_L2_Randoms.drop(edit_dists_names , axis=1)

    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = rng.permutation(L1_L2_Randoms.index)
    # One permutation for all three columns keeps word, translation and IPA aligned.
    # to_numpy() assigns by position, so the result does not depend on the index labels.
    for column in ("original_word", "original_english", "original_word_epitran"):
        L1_L2_Randoms[column] = L1_L2_Randoms[column].reindex(idx).to_numpy()
    # Map Arabic-script letters that remain in the transliteration to IPA.
    L1_L2_Randoms['original_word_epitran'] = L1_L2_Randoms['original_word_epitran'].apply(lambda x: x.replace("پ", "p").replace("ي", "j").replace("ك", "g").replace("گ", "g").replace("چ", "tʃ").replace("ژ", "ʒ").replace("ء", "ʔ"))

    def _pairs(frame):
        # Set of (loan_word, original_word) tuples of a table.
        return set(zip(frame['loan_word'].astype(str), frame['original_word'].astype(str)))

    # The original `.iloc[(~frame.isin(other)).index]` filters never removed
    # anything: `.index` of the boolean frame is simply the full index.
    known = _pairs(L1_L2) | _pairs(L1_L2_Synonyms) | _pairs(L1_L2_Hard_Negatives)
    is_new = [
        pair not in known
        for pair in zip(L1_L2_Randoms['loan_word'], L1_L2_Randoms['original_word'])
    ]
    L1_L2_Randoms = L1_L2_Randoms.loc[np.array(is_new, dtype=bool)]
    L1_L2_Randoms = L1_L2_Randoms.drop_duplicates(ignore_index=True)

    for dist, name in zip(edit_dists, edit_dists_names):
        L1_L2_Randoms[name] = [
            dist(loan_ipa, original_ipa)
            for loan_ipa, original_ipa in zip(L1_L2_Randoms["loan_word_epitran"], L1_L2_Randoms["original_word_epitran"])
        ]
    L1_L2_Randoms["plain Levenshtein"] = [
        dst.fast_levenshtein_distance(loan, original)
        for loan, original in zip(L1_L2_Randoms["loan_word"], L1_L2_Randoms["original_word"])
    ]
    L1_L2_Randoms["label"] = "random"
    L1_L2_Randoms = L1_L2_Randoms.reset_index(drop=True)
    L1_L2_Randoms = L1_L2_Randoms[l]
    return L1_L2_Randoms

In [114]:
# Build the random-negative table from the loan table.
L1_L2_Randoms = get_L1_L2_Randoms(L1_L2)

In [None]:
# Persist the random-negative table without the index column.
L1_L2_Randoms.to_csv(f"{L1}-{L2}-Randoms.csv", index=False)

In [None]:
# On Colab, offer the CSV as a browser download (`files` is imported only when colab is True).
if colab:
    files.download(f"{L1}-{L2}-Randoms.csv")

In [None]:
# Reload the four exported tables from disk, replacing the in-memory frames.
L1_L2 = pd.read_csv(f"{L1}-{L2}-Loans.csv")
L1_L2_Synonyms = pd.read_csv(f"{L1}-{L2}-Synonyms.csv")
L1_L2_Hard_Negatives = pd.read_csv(f"{L1}-{L2}-Hard-Negatives.csv")
L1_L2_Randoms = pd.read_csv(f"{L1}-{L2}-Randoms.csv")