Running this notebook on Google Colab is recommended due to instability in the Google Translate API.

# Installations

In [None]:
# Third-party packages imported by the cells below.
!pip install Epitran
!pip install eng_to_ipa
!pip install panphon
!pip install -U PyYAML
!pip install googletrans
# NOTE(review): this pinned install runs after the unpinned one above, so
# 3.1.0a0 is the googletrans version that ends up installed; the unpinned
# line looks redundant — confirm before removing it.
!pip install googletrans==3.1.0a0
!pip install unicodeblock
!pip install tqdm
!pip install ipywidgets

In [None]:
colab = False # Set to True when running on Google Colab (enables the upload/download prompts); keep False for a local run

In [None]:
import json
import pandas as pd
import numpy as np
import panphon
import panphon.distance
import epitran
import eng_to_ipa as eng
from googletrans import Translator
import itertools
import unicodeblock.blocks
import time
import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
# epitran.download.cedict() # required for Mandarin Chinese (could be replaced by passing cedict_file='cedict.txt' as an argument)
if colab:
     from google.colab import files

In [None]:
# On Colab the two shared input files are uploaded interactively, one prompt
# per file, in the order the later cells expect them.
if colab:
    for requested_file in ("language-pairs.json", "English-Syn-Ant.csv"):
        print(f"Please upload {requested_file}")
        files.upload()

In [None]:
# Shared inputs: the English synonym list and the language-pair configuration.
synonym_file_name = 'English-Syn-Ant.csv'

# On Colab the configuration is uploaded into the working directory; locally it
# is read from the parent directory.
language_pairs_file = 'language-pairs.json' if colab else '../language-pairs.json'

# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default.
with open(language_pairs_file, 'r', encoding='utf-8') as f:
    pairs = json.load(f)

# Preview every configured pair. The L1_* / L2_* names are module-level
# globals that later cells read; once this loop ends they hold the values of
# the LAST pair in the file.
for pair in pairs:
    print(pair)
    target = pairs[pair]['target']
    source = pairs[pair]['source']
    L1 = target['name']
    L1_epi = target['epi']
    L1_gtrans = target['gtrans']
    L1_unicode = target['unicode']
    L2 = source['name']
    L2_epi = source['epi']
    L2_gtrans = source['gtrans']
    L2_unicode = source['unicode']
    print(L1,L1_epi,L1_gtrans,L1_unicode)
    print(L2,L2_epi,L2_gtrans,L2_unicode)
    if colab:
        # Each pair needs its scraped loan list and the source-language lemma list.
        print(f"Please upload {L1}-{L2}.csv")
        files.upload()
        print(f"Please upload {L2}-AllLemmas.csv")
        files.upload()

# Preparation

In [None]:
# Shared panphon Distance instance; the cells below take all of their
# phonetic and plain Levenshtein distance functions from it.
dst = panphon.distance.Distance()

In [None]:
# Length-normalised ("div maxlen") panphon distances computed for every word
# pair. `edit_dists` and `edit_dists_names` are parallel lists: the function at
# position i is stored under the column name at position i.
edit_dists = [
    dst.fast_levenshtein_distance_div_maxlen,
    dst.dolgo_prime_distance_div_maxlen,
    dst.feature_edit_distance_div_maxlen,
    dst.hamming_feature_edit_distance_div_maxlen,
    dst.weighted_feature_edit_distance_div_maxlen,
    dst.partial_hamming_feature_edit_distance_div_maxlen,
]
edit_dists_names = [
    "Fast Levenshtein Distance Div Maxlen",
    "Dolgo Prime Distance Div Maxlen",
    "Feature Edit Distance Div Maxlen",
    "Hamming Feature Distance Div Maxlen",
    "Weighted Feature Distance Div Maxlen",
    "Partial Hamming Feature Distance Div Maxlen",
]

In [None]:
# Column order shared by every split (loans, synonyms, hard negatives, randoms).
cols = [
    # the word pair and its IPA transliterations
    'loan_word',
    'original_word',
    'loan_word_epitran',
    'original_word_epitran',
    # English translations of both words
    'loan_english',
    'original_english',
    # distances between the two transliterations
    'Fast Levenshtein Distance Div Maxlen',
    'Dolgo Prime Distance Div Maxlen',
    'Feature Edit Distance Div Maxlen',
    'Hamming Feature Distance Div Maxlen',
    'Weighted Feature Distance Div Maxlen',
    'Partial Hamming Feature Distance Div Maxlen',
    # distance between the two written forms
    'plain Levenshtein',
    # script information taken from the language-pair configuration
    'loan_unicode',
    'original_unicode',
]

In [None]:
def filter(test_str, all_freq):
    """Add the characters of `test_str` to the running counts in `all_freq`.

    Every character is lower-cased before it is counted, so upper- and
    lower-case forms share one entry. `all_freq` is updated in place and
    nothing is returned.

    NOTE: this name shadows the builtin `filter` for the rest of the notebook.
    """
    for char in test_str:
        folded = char.lower()
        all_freq[folded] = all_freq.get(folded, 0) + 1

In [None]:
# Load the scraped loan list of the pair currently held in L1/L2 so its
# characters can be inspected below. Both word columns are forced to str.
filename = (
    f"{L1}-{L2}.csv"
    if colab
    else f"../wiktionary-scraper-python/results/{L1}-{L2}.csv"
)
L1_L2 = pd.read_csv(filename)
L1_L2 = L1_L2.astype({'original_word': str, 'loan_word': str})

In [None]:
# Every Unicode block that occurs in the original words of the loaded pair;
# the bare `s` on the last line displays the set as the cell output.
s = {
    unicodeblock.blocks.of(letter)
    for word in L1_L2.original_word
    for letter in word
}
s

In [None]:
# Unicode block names that the row filters accept for loan (L1) and original
# (L2) words. Both sets currently list the same blocks but are separate
# objects, so either can be edited without affecting the other.
L1_uni_blocks = {
    'BASIC_LATIN', 'BASIC_PUNCTUATION', 'GENERAL_PUNCTUATION',
    'LATIN_EXTENDED_A', 'LATIN_EXTENDED_LETTER', 'SPACE',
}
L2_uni_blocks = set(L1_uni_blocks)

In [None]:
def try_translate(word, dest_lang, translator):
    """Translate `word` into `dest_lang`, returning None when the call fails.

    Any exception raised by `translator.translate` is printed together with
    the word and swallowed, so one failed request does not abort a long run.

    Args:
        word: text to translate.
        dest_lang: destination language code passed as `dest`.
        translator: object exposing `translate(word, dest=...)` whose result
            has a `.text` attribute.

    Returns:
        The translated text, or None if the translation raised.
    """
    try:
        return translator.translate(word, dest=dest_lang).text
    except Exception as error:
        print(word, error)
        return None

In [None]:
def clean_up_epitran(transcription):
    """Replace letters that Epitran left untransliterated with IPA symbols.

    Args:
        transcription: pandas Series of transliterated strings.

    Returns:
        A new Series in which each listed letter is replaced by its IPA
        counterpart; all other characters are left untouched.
    """
    # (leftover letter, IPA replacement), applied in this order
    leftovers = (
        ("پ", "p"),
        ("ك", "g"),
        ("ي", "j"),
        ("گ", "g"),
        ("چ", "tʃ"),
        ("ژ", "ʒ"),
        ("ء", "ʔ"),
    )

    def _patch(text):
        for letter, ipa in leftovers:
            text = text.replace(letter, ipa)
        return text

    return transcription.apply(_patch)

# Loans

In [None]:
def get_L1_L2(L1, L1_epi, L2, L2_epi):
    """Build the loan-word split for one language pair.

    Reads the scraped ``{L1}-{L2}.csv`` file, removes unusable rows,
    transliterates both words to IPA, translates both words to English and
    computes every distance listed in ``edit_dists``.

    A row is removed when either word is the string 'nan', when a word
    contains a letter that is neither frequent in its column (more than 10
    case-insensitive occurrences) nor in an accepted Unicode block, or when
    the plain Levenshtein distance between the two words exceeds 25.

    Besides its arguments the function reads these module-level names:
    ``colab``, ``dst``, ``L1_uni_blocks``, ``L2_uni_blocks``, ``L1_unicode``,
    ``L2_unicode``, ``edit_dists``, ``edit_dists_names`` and ``cols``.

    Args:
        L1: name of the target (borrowing) language, used in the file name.
        L1_epi: Epitran code for L1; "eng-Latn" selects eng_to_ipa instead.
        L2: name of the source language, used in the file name.
        L2_epi: Epitran code for L2; "eng-Latn" selects eng_to_ipa instead.

    Returns:
        DataFrame with exactly the columns in ``cols``.
    """
    if colab:
        filename = f"{L1}-{L2}.csv"
    else:
        filename = f"../wiktionary-scraper-python/results/{L1}-{L2}.csv"

    L1_L2 = pd.read_csv(filename)
    L1_L2['original_word'] = L1_L2['original_word'].astype(str)
    L1_L2['loan_word'] = L1_L2['loan_word'].astype(str)
    # Renumber after dropna so index labels and row positions agree; the row
    # filter below must not act on the wrong rows when dropna removed some.
    L1_L2 = L1_L2.dropna().reset_index(drop=True)
    freq_L1 = dict()
    freq_L2 = dict()

    print("Filtering loan words")
    L1_L2["loan_word"].progress_apply(lambda x: filter(x, freq_L1))

    print("Filtering original words")
    L1_L2["original_word"].progress_apply(lambda x: filter(x, freq_L2))

    print("Calculating plain Levenshtein distance")
    L1_L2["plain Levenshtein"] = L1_L2.progress_apply(lambda x: dst.fast_levenshtein_distance(x["loan_word"], x["original_word"]), axis=1)

    # Letters seen more than 10 times count as part of the language's alphabet.
    # The frequency keys are lower-cased, so membership is tested lower-cased too.
    alpha_L1 = {key for key, value in freq_L1.items() if value > 10}
    alpha_L2 = {key for key, value in freq_L2.items() if value > 10}

    def has_unexpected_letter(word, alphabet, blocks):
        # True when some letter is neither frequent nor in an accepted block.
        return any(
            letter.lower() not in alphabet
            and unicodeblock.blocks.of(letter) not in blocks
            for letter in word
        )

    keep = [
        loan != 'nan'
        and original != 'nan'
        and not has_unexpected_letter(loan, alpha_L1, L1_uni_blocks)
        and not has_unexpected_letter(original, alpha_L2, L2_uni_blocks)
        and not plain > 25
        for loan, original, plain in zip(
            L1_L2["loan_word"], L1_L2["original_word"], L1_L2["plain Levenshtein"]
        )
    ]
    L1_L2 = L1_L2[pd.Series(keep, index=L1_L2.index, dtype=bool)].reset_index(drop=True)

    print("Transliterating loan words")
    if L1_epi == "eng-Latn":
        L1_L2["loan_word_epitran"] = L1_L2.progress_apply(lambda x: eng.convert(x["loan_word"]), axis=1)
    else:
        epi = epitran.Epitran(L1_epi)
        L1_L2["loan_word_epitran"] = L1_L2.progress_apply(lambda x: epi.transliterate(x["loan_word"]), axis=1)

    print("Transliterating original words")
    if L2_epi == "eng-Latn":
        L1_L2["original_word_epitran"] = L1_L2.progress_apply(lambda x: eng.convert(x["original_word"]), axis=1)
    else:
        epi = epitran.Epitran(L2_epi)
        L1_L2["original_word_epitran"] = L1_L2.progress_apply(lambda x: epi.transliterate(x["original_word"]), axis=1)

    translator = Translator()

    print("Translating loan words")
    L1_L2["loan_english"] = L1_L2.progress_apply(lambda x: try_translate(x["loan_word"], "en", translator), axis=1)

    print("Translating original words")
    L1_L2["original_english"] = L1_L2.progress_apply(lambda x: try_translate(x["original_word"], "en", translator), axis=1)

    L1_L2['original_word_epitran'] = clean_up_epitran(L1_L2['original_word_epitran'])
    L1_L2 = L1_L2.drop_duplicates(ignore_index=True)

    for dist, name in zip(edit_dists, edit_dists_names):
        print(f"Calculating {name}")
        L1_L2[name] = L1_L2.progress_apply(lambda x: dist(x["loan_word_epitran"], x["original_word_epitran"]), axis=1)
    L1_L2['loan_unicode'] = L1_unicode
    L1_L2['original_unicode'] = L2_unicode
    L1_L2 = L1_L2[cols]
    return L1_L2

# Synonyms

In [None]:
def get_L1_L2_Synonyms(L1, L1_epi, L1_gtrans, L2, L2_epi, L2_gtrans, synonym_file_name=synonym_file_name):
    """Build the synonym split for one language pair.

    For every row of the synonym file the comma-separated English synonyms
    are translated into both languages, and every (L1 translation,
    L2 translation) combination becomes a candidate pair. Candidates are then
    filtered, transliterated, translated back to English and scored exactly
    like the loan split.

    The letter-frequency tables used by the filter are built from the scraped
    ``{L1}-{L2}.csv`` loan list, not from the synonym candidates.

    Besides its arguments the function reads these module-level names:
    ``colab``, ``dst``, ``L1_uni_blocks``, ``L2_uni_blocks``, ``L1_unicode``,
    ``L2_unicode``, ``edit_dists``, ``edit_dists_names`` and ``cols``.

    Args:
        L1: name of the target language, used in the loan file name.
        L1_epi: Epitran code for L1; "eng-Latn" selects eng_to_ipa instead.
        L1_gtrans: Google Translate language code for L1.
        L2: name of the source language, used in the loan file name.
        L2_epi: Epitran code for L2; "eng-Latn" selects eng_to_ipa instead.
        L2_gtrans: Google Translate language code for L2.
        synonym_file_name: CSV file with a "Synonym" column.

    Returns:
        DataFrame with exactly the columns in ``cols``.
    """
    if colab:
        filename = f"{L1}-{L2}.csv"
    else:
        filename = f"../wiktionary-scraper-python/results/{L1}-{L2}.csv"

    df = pd.read_csv(synonym_file_name)
    translator = Translator()

    l1, l2 = [], []
    print("Translating synonyms")
    for synonyms in tqdm(df["Synonym"]):
        words = synonyms.split(', ')
        a = [try_translate(x, L1_gtrans, translator) for x in words]
        b = [try_translate(x, L2_gtrans, translator) for x in words]
        for loan_candidate, original_candidate in itertools.product(a, b):
            l1.append(loan_candidate)
            l2.append(original_candidate)
    L1_L2_Synonyms = pd.DataFrame({'loan_word': l1, 'original_word': l2})

    L1_L2 = pd.read_csv(filename)
    L1_L2['original_word'] = L1_L2['original_word'].astype(str)
    L1_L2['loan_word'] = L1_L2['loan_word'].astype(str)
    # Failed translations are None; drop them and renumber so that index
    # labels and row positions agree for the filter below.
    L1_L2_Synonyms = L1_L2_Synonyms.dropna().reset_index(drop=True)
    freq_L1 = dict()
    freq_L2 = dict()

    print("Filtering loan words")
    L1_L2["loan_word"].progress_apply(lambda x: filter(x, freq_L1))

    print("Filtering original words")
    L1_L2["original_word"].progress_apply(lambda x: filter(x, freq_L2))

    L1_L2_Synonyms["plain Levenshtein"] = L1_L2_Synonyms.apply(lambda x: dst.fast_levenshtein_distance(x["loan_word"], x["original_word"]), axis=1)

    # Letters seen more than 10 times in the loan list count as part of the
    # alphabet. The frequency keys are lower-cased, so membership is too.
    alpha_L1 = {key for key, value in freq_L1.items() if value > 10}
    alpha_L2 = {key for key, value in freq_L2.items() if value > 10}

    def has_unexpected_letter(word, alphabet, blocks):
        # True when some letter is neither frequent nor in an accepted block.
        return any(
            letter.lower() not in alphabet
            and unicodeblock.blocks.of(letter) not in blocks
            for letter in word
        )

    # The filter walks the synonym candidates themselves (not the loan list),
    # so every candidate is checked and no out-of-range row is accessed.
    keep = [
        loan != 'nan'
        and original != 'nan'
        and not has_unexpected_letter(loan, alpha_L1, L1_uni_blocks)
        and not has_unexpected_letter(original, alpha_L2, L2_uni_blocks)
        and not plain > 25
        for loan, original, plain in zip(
            L1_L2_Synonyms["loan_word"],
            L1_L2_Synonyms["original_word"],
            L1_L2_Synonyms["plain Levenshtein"],
        )
    ]
    L1_L2_Synonyms = L1_L2_Synonyms[
        pd.Series(keep, index=L1_L2_Synonyms.index, dtype=bool)
    ].reset_index(drop=True)

    print("Transliterating loan words")
    if L1_epi == "eng-Latn":
        L1_L2_Synonyms["loan_word_epitran"] = L1_L2_Synonyms.progress_apply(lambda x: eng.convert(x["loan_word"]), axis=1)
    else:
        epi = epitran.Epitran(L1_epi)
        L1_L2_Synonyms["loan_word_epitran"] = L1_L2_Synonyms.progress_apply(lambda x: epi.transliterate(x["loan_word"]), axis=1)

    print("Transliterating original words")
    if L2_epi == "eng-Latn":
        L1_L2_Synonyms["original_word_epitran"] = L1_L2_Synonyms.progress_apply(lambda x: eng.convert(x["original_word"]), axis=1)
    else:
        epi = epitran.Epitran(L2_epi)
        L1_L2_Synonyms["original_word_epitran"] = L1_L2_Synonyms.progress_apply(lambda x: epi.transliterate(x["original_word"]), axis=1)

    print("Translating loan words")
    L1_L2_Synonyms["loan_english"] = L1_L2_Synonyms.progress_apply(lambda x: try_translate(x["loan_word"], "en", translator), axis=1)

    print("Translating original words")
    L1_L2_Synonyms["original_english"] = L1_L2_Synonyms.progress_apply(lambda x: try_translate(x["original_word"], "en", translator), axis=1)

    L1_L2_Synonyms = L1_L2_Synonyms.drop_duplicates(ignore_index=True)
    L1_L2_Synonyms['original_word_epitran'] = clean_up_epitran(L1_L2_Synonyms['original_word_epitran'])

    for dist, name in zip(edit_dists, edit_dists_names):
        print(f"Calculating {name}")
        L1_L2_Synonyms[name] = L1_L2_Synonyms.progress_apply(lambda x: dist(x["loan_word_epitran"], x["original_word_epitran"]), axis=1)
    L1_L2_Synonyms['loan_unicode'] = L1_unicode
    L1_L2_Synonyms['original_unicode'] = L2_unicode
    L1_L2_Synonyms = L1_L2_Synonyms[cols]

    return L1_L2_Synonyms

# Hard Negatives

In [None]:
def closest_neighbour_epi(anch, distance, df, vocab, vocab_ipa):
    """Find the vocabulary entry phonetically closest to one loan word.

    The anchor is row `anch` (positional) of `df`. Every vocabulary entry is
    scored with `distance(anchor IPA, entry IPA)`; entries whose written form
    equals the anchor's own original word are skipped. On ties the earliest
    entry wins.

    Args:
        anch: positional row number of the anchor in `df`.
        distance: callable taking two transliterations and returning a number.
        df: DataFrame with "loan_word_epitran" and "original_word" columns.
        vocab: DataFrame with a "word" column.
        vocab_ipa: transliterations of `vocab`, indexable by 0..len(vocab)-1.

    Returns:
        ``(argmin, min_dist)``: position of the closest entry and its
        distance. When no entry qualifies this is ``(0, 10000)``, so callers
        must check the returned entry themselves.
    """
    # The anchor row does not change inside the loop; read it once.
    anchor = df.iloc[anch]
    anchor_ipa = anchor["loan_word_epitran"]
    anchor_original = anchor["original_word"]
    # Plain list of candidate words avoids building a row object per iteration.
    candidate_words = vocab["word"].tolist()

    min_dist = 10000  # sentinel larger than any distance expected here
    argmin = 0
    for i, candidate in enumerate(candidate_words):
        tmp_dist = distance(anchor_ipa, vocab_ipa[i])
        if tmp_dist < min_dist and anchor_original != candidate:
            argmin = i
            min_dist = tmp_dist
    return argmin, min_dist

In [None]:
def get_L1_L2_Hard_Negatives(L1, L2):
    """Build the hard-negative split for one language pair.

    For every loan word in the module-level ``L1_L2`` frame and every distance
    in ``edit_dists``, the phonetically closest L2 lemma that is not the true
    original word is paired with the loan word.

    Side effects: writes ``{L2}-vocab-ipa.csv`` and
    ``{L2}-vocab-translated.csv`` to the working directory.

    Besides its arguments the function reads these module-level names:
    ``colab``, ``L1_L2``, ``L2_epi``, ``L1_unicode``, ``L2_unicode``, ``dst``,
    ``edit_dists``, ``edit_dists_names`` and ``cols``.

    Args:
        L1: name of the target language (not used in the body).
        L2: name of the source language; selects ``{L2}-AllLemmas.csv``.

    Returns:
        DataFrame with exactly the columns in ``cols``.
    """
    if colab:
        filename = f"{L2}-AllLemmas.csv"
    else:
        filename = f"AllLemmas/{L2}-AllLemmas.csv"
    vocab = pd.read_csv(filename)
    vocab["word"] = vocab.apply(lambda x: x["word"].strip(), axis=1)

    print("Transliterating hard negative candidates")
    if L2_epi == "eng-Latn":
        # axis=1 is required: without it the lambda receives whole columns
        # instead of rows and the x["word"] lookup fails.
        vocab_ipa = vocab.progress_apply(lambda x: eng.convert(x["word"]), axis=1)
    else:
        epi = epitran.Epitran(L2_epi)
        vocab_ipa = vocab.progress_apply(lambda x: epi.transliterate(x["word"]), axis=1)
    vocab_ipa.to_csv(f"{L2}-vocab-ipa.csv", index=False)

    translator = Translator()

    print("Translating hard negative candidates")
    vocab_translated = vocab.progress_apply(lambda x: try_translate(x["word"], "en", translator), axis=1)
    vocab_translated.to_csv(f"{L2}-vocab-translated.csv", index=False)

    # Collect plain dicts and build the frame once: appending to a DataFrame
    # inside the loop copies it every time, and DataFrame.append no longer
    # exists in pandas 2.
    negative_rows = []
    for edit_dist, edit_dist_name in zip(edit_dists, edit_dists_names):
        print(f"Getting closest neighbors with {edit_dist_name}")
        for row in tqdm(range(len(L1_L2))):
            i, _ = closest_neighbour_epi(row, edit_dist, L1_L2, vocab, vocab_ipa)
            loan = L1_L2.iloc[row]
            candidate_word = vocab.iloc[i]["word"]
            # The search falls back to entry 0 when nothing qualifies, so the
            # true original word must still be excluded here.
            if loan["original_word"] != candidate_word:
                negative_rows.append({
                    'loan_word': loan["loan_word"],
                    'original_word': candidate_word,
                    'loan_word_epitran': loan["loan_word_epitran"],
                    'original_word_epitran': vocab_ipa[i],
                    'loan_english': loan["loan_english"],
                    'original_english': vocab_translated[i],
                })
    # Same column layout as L1_L2; columns not filled above start as NaN and
    # are computed below.
    L1_L2_Hard_Negatives = pd.DataFrame(negative_rows, columns=L1_L2.columns)
    L1_L2_Hard_Negatives['loan_unicode'] = L1_unicode
    L1_L2_Hard_Negatives['original_unicode'] = L2_unicode
    L1_L2_Hard_Negatives = L1_L2_Hard_Negatives.drop_duplicates(ignore_index=True)
    L1_L2_Hard_Negatives['original_word_epitran'] = clean_up_epitran(L1_L2_Hard_Negatives['original_word_epitran'])

    for dist, name in zip(edit_dists, edit_dists_names):
        print(f"Calculating {name}")
        L1_L2_Hard_Negatives[name] = L1_L2_Hard_Negatives.progress_apply(lambda x: dist(x["loan_word_epitran"], x["original_word_epitran"]), axis=1)

    print("Calculating plain Levenshtein distance")
    L1_L2_Hard_Negatives["plain Levenshtein"] = L1_L2_Hard_Negatives.progress_apply(lambda x: dst.fast_levenshtein_distance(x["loan_word"], x["original_word"]), axis=1)
    L1_L2_Hard_Negatives = L1_L2_Hard_Negatives[cols]
    return L1_L2_Hard_Negatives

# Randoms

In [None]:
def get_L1_L2_Randoms(L1_L2):
    """Build the random split by shuffling original words across loan words.

    The loan-side columns stay in place while the three original-side columns
    ("original_word", "original_english", "original_word_epitran") are
    reordered with one shared random permutation, so each shuffled row still
    carries a consistent original word, translation and transliteration.
    All distances are then recomputed for the new pairs.

    The input frame is not modified. The permutation comes from
    ``np.random.permutation`` and is unseeded here.

    Reads the module-level names ``edit_dists``, ``edit_dists_names``,
    ``dst`` and ``cols``.

    Args:
        L1_L2: loan split containing the columns in ``cols``.

    Returns:
        DataFrame with exactly the columns in ``cols``.
    """
    # Work on a copy so the caller's loan split is left untouched.
    L1_L2_Randoms = L1_L2.copy()
    L1_L2_Randoms['loan_word'] = L1_L2_Randoms['loan_word'].astype(str)
    L1_L2_Randoms['original_word'] = L1_L2_Randoms['original_word'].astype(str)
    L1_L2_Randoms = L1_L2_Randoms.drop(edit_dists_names, axis=1)

    # Shuffle by position and assign raw arrays, so the result does not depend
    # on the frame having a default 0..n-1 index.
    shuffled_positions = np.random.permutation(len(L1_L2_Randoms))
    for column in ("original_word", "original_english", "original_word_epitran"):
        L1_L2_Randoms[column] = L1_L2_Randoms[column].to_numpy()[shuffled_positions]
    L1_L2_Randoms['original_word_epitran'] = clean_up_epitran(L1_L2_Randoms['original_word_epitran'])
    L1_L2_Randoms = L1_L2_Randoms.drop_duplicates(ignore_index=True)

    for dist, name in zip(edit_dists, edit_dists_names):
        print(f"Calculating {name}")
        L1_L2_Randoms[name] = L1_L2_Randoms.progress_apply(lambda x: dist(x["loan_word_epitran"], x["original_word_epitran"]), axis=1)

    print("Calculating plain Levenshtein distance")
    L1_L2_Randoms["plain Levenshtein"] = L1_L2_Randoms.progress_apply(lambda x: dst.fast_levenshtein_distance(x["loan_word"], x["original_word"]), axis=1)
    L1_L2_Randoms = L1_L2_Randoms.reset_index(drop=True)
    L1_L2_Randoms = L1_L2_Randoms[cols]
    return L1_L2_Randoms

In [None]:
def _drop_known_pairs(candidates, known):
    """Return the rows of `candidates` whose word pair is absent from `known`.

    Two rows are the same pair when both "loan_word" and "original_word" are
    equal. Rows that still contain a missing value are dropped as well.
    """
    known_pairs = set(zip(known["loan_word"], known["original_word"]))
    is_known = [
        word_pair in known_pairs
        for word_pair in zip(candidates["loan_word"], candidates["original_word"])
    ]
    keep = ~pd.Series(is_known, index=candidates.index, dtype=bool)
    # .copy() so that adding the "label" column later does not write into a view.
    return candidates[keep].dropna().copy()


# Build, de-duplicate across splits, label and save the four splits of every
# configured language pair.
for pair in pairs:
    print(pair)
    # The split builders read L2_epi, L1_unicode and L2_unicode as module-level
    # globals, so all per-pair settings are refreshed here; otherwise every
    # pair would silently reuse the values of the last pair previewed earlier.
    L1 = pairs[pair]['target']['name']
    L1_epi = pairs[pair]['target']['epi']
    L1_gtrans = pairs[pair]['target']['gtrans']
    L1_unicode = pairs[pair]['target']['unicode']
    L2 = pairs[pair]['source']['name']
    L2_epi = pairs[pair]['source']['epi']
    L2_gtrans = pairs[pair]['source']['gtrans']
    L2_unicode = pairs[pair]['source']['unicode']

    print(f"Making {L1}-{L2} loans split")
    start_time = time.time()
    L1_L2 = get_L1_L2(L1, L1_epi, L2, L2_epi)
    print(f"Finished in {str(datetime.timedelta(seconds=time.time()-start_time))}")

    print(f"Making {L1}-{L2} synonyms split")
    start_time = time.time()
    L1_L2_Synonyms = get_L1_L2_Synonyms(L1, L1_epi, L1_gtrans, L2, L2_epi, L2_gtrans, synonym_file_name)
    print(f"Finished in {str(datetime.timedelta(seconds=time.time()-start_time))}")

    print(f"Making {L1}-{L2} hard negatives split")
    start_time = time.time()
    L1_L2_Hard_Negatives = get_L1_L2_Hard_Negatives(L1, L2)
    print(f"Finished in {str(datetime.timedelta(seconds=time.time()-start_time))}")

    print(f"Making {L1}-{L2} randoms split")
    start_time = time.time()
    L1_L2_Randoms = get_L1_L2_Randoms(L1_L2)
    print(f"Finished in {str(datetime.timedelta(seconds=time.time()-start_time))}")

    print("Removing intersections")

    # Each split only keeps word pairs that do not already appear in a split
    # listed before it (loans > synonyms > hard negatives > randoms). Pairs are
    # compared by value: DataFrame.isin(DataFrame) compares cell by cell at
    # matching index/column, and because the constant unicode columns always
    # match, it removed every row whose index also existed in the other frame.
    L1_L2_Synonyms = _drop_known_pairs(L1_L2_Synonyms, L1_L2)

    L1_L2_Hard_Negatives = _drop_known_pairs(L1_L2_Hard_Negatives, L1_L2)
    L1_L2_Hard_Negatives = _drop_known_pairs(L1_L2_Hard_Negatives, L1_L2_Synonyms)

    L1_L2_Randoms = _drop_known_pairs(L1_L2_Randoms, L1_L2)
    L1_L2_Randoms = _drop_known_pairs(L1_L2_Randoms, L1_L2_Synonyms)
    L1_L2_Randoms = _drop_known_pairs(L1_L2_Randoms, L1_L2_Hard_Negatives)

    L1_L2["label"] = "loan"
    L1_L2_Synonyms["label"] = "synonym"
    L1_L2_Hard_Negatives["label"] = "hard_negative"
    L1_L2_Randoms["label"] = "random"

    # The split name doubles as the file-name suffix and, for local runs, as
    # the output directory.
    split_frames = [
        ("Loans", L1_L2),
        ("Synonyms", L1_L2_Synonyms),
        ("Hard-Negatives", L1_L2_Hard_Negatives),
        ("Randoms", L1_L2_Randoms),
    ]
    for split_name, split_frame in split_frames:
        out_name = f"{L1}-{L2}-{split_name}.csv"
        print(f"Creating {out_name}")
        if colab:
            split_frame.to_csv(out_name, index=False)
            print(f"Downloading {out_name}")
            files.download(out_name)
        else:
            split_frame.to_csv(f"{split_name}/{out_name}", index=False)