In [1]:
import pickle
import os
from scipy import spatial
import glob
import pandas as pd
import ipdb
import numpy as np
import itertools
import math

In [45]:
embeddings =  'fasttext' #'mbert'
mbert_emb_type = 'last4layers' #('cls_representation', 'pooler_output','avg_token_emb', 'last4layers', 'cls_4hidden_layer')

if embeddings == 'fasttext':
    datapath = 'fasttext_embedding'
    list_files = glob.glob(f"{datapath}/*.pkl")
    langs = [name.split('_embs_dict.pkl')[0].split('/')[-1] for name in list_files]
else:
    datapath = 'mbert_embedding'
    list_files = glob.glob(f"{datapath}/*.pickle")
    langs = [name.split('.pickle')[0].split('/')[-1] for name in list_files]

In [40]:
cognates = pd.read_csv("../../cognates_and_borrowings/cognates_large_uniq_cog_manual.csv", sep="\t", 
                       names=["src word", "src lang", "src meaning",
                              "tgt word", "tgt lang", "tgt meaning"], keep_default_na=False) #
#print(cognates.head())

In [46]:
print("Loading embedding matrices...")
embs_dict = {}
for lang, f in zip(langs, list_files):
    emb_dict = pickle.load(open(f, 'rb'))
    if embeddings == "mbert":
        if lang !='meaning_eng':
            emb_dict = emb_dict[0]
        else:
            new_dict = {}
            for w in emb_dict.keys():
                new_dict[w] = emb_dict[w][mbert_emb_type]
            emb_dict = new_dict
    embs_dict[lang] = emb_dict
    # Check that everything is ok
    words = list(emb_dict.keys())
    emb_dim = len(emb_dict[words[0]]) #300 or 768
    print(lang, emb_dim)

Loading embedding matrices...
it 300
meaning_eng 300
es 300
la 300
pt 300
ro 300
osp 300
fr 300
en 300
frm 300


In [48]:
def check_cognates_deep(row):
    # for some words, we couldn't extract their embedding
    if row['src word'] in embs_dict[row['src lang']] and row['tgt word'] in embs_dict[row['tgt lang']]:
        for src_meaning in row['src meaning'].split(";"):
            for tgt_meaning in row['tgt meaning'].split(";"):
                if src_meaning in embs_dict['en'] and tgt_meaning in embs_dict['en']:
                    return True
    return False

def check_cognates(row):
    if row['src word'] in embs_dict[row['src lang']] and row['tgt word'] in embs_dict[row['tgt lang']]:
        if row['src meaning'] in embs_dict['en'] and row['tgt meaning'] in embs_dict['en']:
            return True
    return False


In [56]:
def get_meanings_deep(row):
    # for some words, we couldn't extract their embedding
    src, tgt = [], []
    for src_meaning in row['src meaning'].split(";"):
        if src_meaning in embs_dict['meaning_eng']:
            src.append(src_meaning)
    for tgt_meaning in row['tgt meaning'].split(";"):
         if tgt_meaning in embs_dict['meaning_eng']:
            tgt.append(tgt_meaning)
    row['src meaning'] = ";".join(src)
    row['tgt meaning'] = ";".join(tgt)
    return row

def get_meanings(row):
    # for some words, we couldn't extract their embedding
    if row['src meaning'] in embs_dict['meaning_eng'] and row['tgt meaning'] in embs_dict['meaning_eng']:
        return row

In [63]:
metrics = ["cognate_similarities", "meaning_similarities", "cog_meaning_similarities", 
           "cog_meaning_similarities_long", "cog_meaning_similarities_short"] 

def create_metrics_dict(metrics):
    metrics_dict = {}
    for shift in ['shift','no-shift']:
        metrics_dict[shift] = {}
        for metric in metrics:
            metrics_dict[shift][metric] = []
    return metrics_dict

In [64]:
def compute_distance(langs, print_words = False, langs_diff=False):
    metrics_dict = create_metrics_dict(metrics)
    local_cognates = cognates.loc[cognates['src lang'].isin(langs) & cognates['tgt lang'].isin(langs)]
    for ix, row in local_cognates.iterrows():
        if check_cognates(row):
            row = get_meanings(row)
            shift = 'no-shift' if row['src meaning'] == row['tgt meaning'] else 'shift'
            src_lang, tgt_lang = row['src lang'], row['tgt lang']
            if langs_diff and src_lang == tgt_lang:
                continue
            for metric, lang1, lang2, word1, word2 in [
                ("cognate_similarities", src_lang, tgt_lang, "src word", "tgt word"),
                ("meaning_similarities", "meaning_eng", "meaning_eng", "src meaning", "tgt meaning"),
                ("cog_meaning_similarities", src_lang, "meaning_eng", "src word", "src meaning"),
                ("cog_meaning_similarities", tgt_lang, "meaning_eng", "tgt word", "tgt meaning"),
            ]:
                val = 1 - spatial.distance.cosine(embs_dict[lang1][row[word1]], 
                                                  embs_dict[lang2][row[word2]])
                metrics_dict[shift][metric].append(val)
                if metric == "cog_meaning_similarities":
                    if len(row[word2].replace(";", " ").split(" ")) > 1:
                        metrics_dict[shift]["cog_meaning_similarities_long"].append(val)
                    else:
                        metrics_dict[shift]["cog_meaning_similarities_short"].append(val)
                if print_words: print(metric, "{:2.2%}".format(val), 
                                      f"{lang1}: {row[word1]} | {lang2}: {row[word2]}")
    return metrics_dict


In [65]:
compute_distance(["es", "es"], langs_diff=False, print_words = False)

{'shift': {'cognate_similarities': [1, 0.3009178936481476],
  'meaning_similarities': [1, 0.20179061591625214],
  'cog_meaning_similarities': [0.5953587889671326,
   0.5953587889671326,
   0.5399176478385925,
   0.574857771396637],
  'cog_meaning_similarities_long': [],
  'cog_meaning_similarities_short': [0.5953587889671326,
   0.5953587889671326,
   0.5399176478385925,
   0.574857771396637]},
 'no-shift': {'cognate_similarities': [],
  'meaning_similarities': [],
  'cog_meaning_similarities': [],
  'cog_meaning_similarities_long': [],
  'cog_meaning_similarities_short': []}}

In [66]:
def compute_distance_intra_pair(langs):
    meaning_similarities = []
    local_cognates = cognates.loc[cognates['src lang'].isin(langs) & cognates['tgt lang'].isin(langs)]
    for ix, row in local_cognates.iterrows():
        if check_cognates(row):
            if row['tgt word'] in embs_dict[row['tgt lang']]:
                val = 1 - spatial.distance.cosine(embs_dict[row['tgt lang']][row['tgt word']], 
                                                  embs_dict['en'][row['tgt meaning']])
                meaning_similarities.append(val)
            if row['src word'] in embs_dict[row['src lang']]:
                val = 1 - spatial.distance.cosine(embs_dict[row['src lang']][row['src word']], 
                                                  embs_dict['en'][row['src meaning']])
                meaning_similarities.append(val)
    return np.mean(meaning_similarities)



In [67]:
# only langs with already trained embeddings (knowing that latin probably has a poor alignment with the rest)
print("Global")
langs_aligned = ['fr','es','ro','it','la', 'osp', 'frm']
metrics_dict = compute_distance(langs_aligned)

for shift in ['shift','no-shift']:
    print(shift, len(metrics_dict[shift]['cog_meaning_similarities']))
    for metric in metrics:
        print(metric, np.mean(metrics_dict[shift][metric]))
    print("#####")

Global
shift 78
cognate_similarities 0.22391585499430314
meaning_similarities 0.26608850061893463
cog_meaning_similarities 0.28206217164794606
cog_meaning_similarities_long nan
cog_meaning_similarities_short 0.28206217164794606
#####
no-shift 604
cognate_similarities 0.33623583423087855
meaning_similarities 1.0
cog_meaning_similarities 0.3095176515217097
cog_meaning_similarities_long nan
cog_meaning_similarities_short 0.3095176515217097
#####


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [68]:
compute_distance_intra_pair(all_langs)

0.5719614584866375

In [69]:
metric2name_long = {"cognate_similarities": "cognate (a) $\leftrightarrow$ cognate (b)",
              "meaning_similarities": "meaning (a) $\leftrightarrow$ meaning (b)",
              "src_meaning_similarities": "cognate (a) $\leftrightarrow$ meaning (a)",
              "tgt_meaning_similarities": "cognate (b) $\leftrightarrow$ meaning (b)",
              "cog_meaning_similarities": "cognate $\leftrightarrow$ meaning",
              "cog_meaning_similarities_long": "more than 1 meaning",
              "cog_meaning_similarities_short": "one meaning"
              }

metric2name = {"cognate_similarities": "c(a) $\leftrightarrow$ c(b)",
              "meaning_similarities": "m(a) $\leftrightarrow$ m(b)",
              "src_meaning_similarities": "c(a) $\leftrightarrow$ m(a)",
              "tgt_meaning_similarities": "c(b) $\leftrightarrow$ m(b)",
              }

shift2name_long = {"shift": "Cognates shifted in meaning",
             "no-shift": "Cognates similar in meanings"}
shift2name = {"shift": "Shift",
             "no-shift": "No shift"}

lang_combinations = [ x for x in sorted(list(itertools.combinations_with_replacement(langs_aligned, 2)))
                     if x not in [("es", "frm"), ("frm", "frm"), ("osp", "frm"), 
                                  ("osp", "osp"), ("ro", "frm"), ("ro", "osp")]]

In [70]:
len(lang_combinations)

22

In [71]:
for ix1, ix2 in [(0, len(lang_combinations)//3), (len(lang_combinations)//3, 2*len(lang_combinations)//3), 
                 (2*len(lang_combinations)//3, len(lang_combinations))]:
    cur_lang_combinations = lang_combinations[ix1:ix2]

    for shift in ['shift','no-shift']:
        print(shift2name_long[shift] + "&" + " & ".join([f"{l1}-{l2}".upper() for l1, l2 in cur_lang_combinations]), "\\\\ \\midrule")
        item_line = "\#items & "
        for ix, metric in enumerate(metrics):
            cur_line = metric2name_long[metric] + " & "
            for lang1, lang2 in cur_lang_combinations:
                metrics_dict = compute_distance([lang1, lang2], langs_diff=(lang1 != lang2))
                if len(metrics_dict[shift]['cog_meaning_similarities']) > 0:
                    cur_line += "{:2.2%}".format(np.mean(metrics_dict[shift][metric])) + " & "
                    if ix == 0: item_line += str(len(metrics_dict[shift]['cog_meaning_similarities'])) + " & "
                else:
                    cur_line += " & "
                    if ix == 0: item_line += "0 & "
            print(cur_line[:-2].replace("%", "\%") + "\\\\")
        print(item_line[:-2] + "\\\\ \\midrule")

Cognates shifted in meaning&ES-ES & ES-IT & ES-LA & ES-OSP & ES-RO & FR-ES & FR-FR \\ \midrule
cognate (a) $\leftrightarrow$ cognate (b) & 65.05\% & 30.76\% &  &  & 18.62\% & 37.93\% & 18.20\% \\
meaning (a) $\leftrightarrow$ meaning (b) & 60.09\% & 25.10\% &  &  & 27.72\% & 34.61\% & 12.91\% \\
cognate $\leftrightarrow$ meaning & 57.64\% & 40.32\% &  &  & 29.27\% & 48.58\% & 28.53\% \\
more than 1 meaning & nan\% & nan\% &  &  & nan\% & nan\% & nan\% \\
one meaning & 57.64\% & 40.32\% &  &  & 29.27\% & 48.58\% & 28.53\% \\
\#items & 4 & 10 & 0 & 0 & 4 & 4 & 8 \\ \midrule
Cognates similar in meanings&ES-ES & ES-IT & ES-LA & ES-OSP & ES-RO & FR-ES & FR-FR \\ \midrule
cognate (a) $\leftrightarrow$ cognate (b) &  & 65.68\% & 37.23\% & 44.96\% & 45.08\% & 55.87\% & 56.92\% \\
meaning (a) $\leftrightarrow$ meaning (b) &  & 100.00\% & 100.00\% & 100.00\% & 100.00\% & 100.00\% & 100.00\% \\
cognate $\leftrightarrow$ meaning &  & 45.62\% & 27.33\% & 37.42\% & 36.51\% & 38.65\% & 31.95\% \\
mor

In [72]:
print("Local")
for lang1, lang2 in lang_combinations:
    print(f"-----{lang1}-{lang2}")
    metrics_dict = compute_distance([lang1, lang2])

    for shift in ['shift','no-shift']:
        print(f"Number of {shift} items: {len(metrics_dict[shift]['cog_meaning_similarities'])}")
        if len(metrics_dict[shift]['cog_meaning_similarities']) > 0:
            for metric in metrics:
                print("-", metric, "{:2.2%}".format(np.mean(metrics_dict[shift][metric])))

Local
-----es-es
Number of shift items: 4
- cognate_similarities 65.05%
- meaning_similarities 60.09%
- cog_meaning_similarities 57.64%
- cog_meaning_similarities_long nan%
- cog_meaning_similarities_short 57.64%
Number of no-shift items: 0
-----es-it
Number of shift items: 18
- cognate_similarities 38.99%
- meaning_similarities 32.63%
- cog_meaning_similarities 44.77%
- cog_meaning_similarities_long nan%
- cog_meaning_similarities_short 44.77%
Number of no-shift items: 56
- cognate_similarities 64.77%
- meaning_similarities 100.00%
- cog_meaning_similarities 44.89%
- cog_meaning_similarities_long nan%
- cog_meaning_similarities_short 44.89%
-----es-la
Number of shift items: 4
- cognate_similarities 65.05%
- meaning_similarities 60.09%
- cog_meaning_similarities 57.64%
- cog_meaning_similarities_long nan%
- cog_meaning_similarities_short 57.64%
Number of no-shift items: 14
- cognate_similarities 37.23%
- meaning_similarities 100.00%
- cog_meaning_similarities 27.33%
- cog_meaning_simil

In [73]:
lang1, lang2 = "es", "frm"
metrics_dict = compute_distance([lang1, lang2], True)

for shift in ['shift','no-shift']:
    print(f"Number of {shift} items: {len(metrics_dict[shift]['cog_meaning_similarities'])}")
    if len(metrics_dict[shift]['cog_meaning_similarities']) > 0:
        for metric in metrics:
            print("-", metric, "{:2.2%}".format(np.mean(metrics_dict[shift][metric])))

cognate_similarities 0.43% frm: comté | es: condado
meaning_similarities 100.00% meaning_eng: county | meaning_eng: county
cog_meaning_similarities 0.25% frm: comté | meaning_eng: county
cog_meaning_similarities 52.52% es: condado | meaning_eng: county
cognate_similarities 44.41% frm: conté | frm: comté
meaning_similarities 100.00% meaning_eng: county | meaning_eng: county
cog_meaning_similarities -4.44% frm: conté | meaning_eng: county
cog_meaning_similarities 0.25% frm: comté | meaning_eng: county
cognate_similarities -2.12% frm: conté | es: condado
meaning_similarities 100.00% meaning_eng: county | meaning_eng: county
cog_meaning_similarities -4.44% frm: conté | meaning_eng: county
cog_meaning_similarities 52.52% es: condado | meaning_eng: county
cognate_similarities -1.78% es: dios | frm: dieu
meaning_similarities 100.00% meaning_eng: god | meaning_eng: god
cog_meaning_similarities 59.54% es: dios | meaning_eng: god
cog_meaning_similarities -3.90% frm: dieu | meaning_eng: g