In [1]:
import os 
import sys
import pickle

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors

import tqdm

In [2]:
def find_closest_vector(v, M, n = 1):
    indices = np.flip(np.argsort(np.dot(M, v)))[:n]
    return indices[0] if n == 1 else indices

def print_examples(starting_language, target_language, word): 
    idx = find_closest_vector(starting_language[word], target_language.vectors)
    result = target_language.index_to_key[idx]
    #print(starting_language.has_index_for(word), result, idx)
    return result

# make a function that taken a word, it generates all the words that are the same but with every letter of the alphabet added at the beginning, then in the middle, then at the end 

def generate_words(word):
    words = []
    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = c + word
        words.append(new_word)

    mid = len(word) // 2
    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = word[:mid] + c + word[mid:]
        words.append(new_word)

    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = word + c
        words.append(new_word)
    return words


# make a function that given a list of words, return for each word the closest word in the target language 

def find_closest_words(starting_language, target_language, words):
    results = []
    for word in words:
        results.append(print_examples(starting_language, target_language, word))
    return results

def split_vector(v, n):
    return np.array_split(v, n)

# split the vector of results in three subvectors of the same length 
def noise_experiment(starting_language, target_language, word):
    res = find_closest_words(starting_language, target_language, generate_words(word))
    temp = split_vector(res, 3)

    print(len(temp[0]), len(temp[1]), len(temp[2]))
    # return the number of each word for the subvectors ordered by their frequency

    results = []
    for i in range(len(temp)):
        values, counts = np.unique(temp[i], return_counts=True)
        ordered_indexes = np.argsort(-counts)
        results.append((values[ordered_indexes], counts[ordered_indexes]))
    return results

In [None]:
heron_path = "/data1/malto/csavelli/aligned_subwords_fasttext/aligned/"
print("loading italian vectors")
ita_aligned = KeyedVectors.load_word2vec_format(f"wiki.it.align.vec")
print("loading english vectors")
eng_aligned = KeyedVectors.load_word2vec_format(f"wiki.en.align.vec")
#ita_wiki = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.it.bin") 
#eng_wiki = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.en.bin")

Look if the normal words are aligned with the same words of fastText. 

In [3]:
lang = "it"
heron_path = f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/"

print("loading italian vectors")
with open(f"wiki.{lang}.pkl", "rb") as f:
    ita_new = pickle.load(f)

lang = "en"
heron_path = f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/"

print("loading english vectors")
with open(f"wiki.{lang}.pkl", "rb") as f:
    eng_new = pickle.load(f)

lang = "pt"
heron_path = f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/"

print("loading portuguese vectors")
with open(f"wiki.{lang}.pkl", "rb") as f:
    pt_new = pickle.load(f)

loading italian vectors
loading english vectors
loading portuguese vectors


same language

In [5]:
idx = find_closest_vector(ita_new["casa"], ita_aligned.vectors)

ita_aligned.index_to_key[idx], idx

('casa', 228)

multi language

In [None]:
# given a vector, find in a matrix the closest vector to it
# VALUTA DI USARE get_vector() per ottenere il vettore di una parola

idx = find_closest_vector(ita_new["ciao"], eng_new.vectors)
idx2 = find_closest_vector(ita_new["ciao"], eng_aligned.vectors)
eng_new.index_to_key[idx], idx, eng_aligned.index_to_key[idx2], idx2

In [None]:
idx = find_closest_vector(ita_new["casa"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
idx = find_closest_vector(ita_new["gatto"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
idx = find_closest_vector(ita_new["papero"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
idx = find_closest_vector(ita_aligned["ciao"], eng_aligned.vectors)
print(eng_aligned.index_to_key[idx], idx)

idx = find_closest_vector(ita_aligned["casa"], eng_aligned.vectors)
print(eng_aligned.index_to_key[idx], idx) 

idx = find_closest_vector(ita_aligned["gatto"], eng_aligned.vectors)   
print(eng_aligned.index_to_key[idx], idx)

idx = find_closest_vector(ita_aligned["papero"], eng_aligned.vectors)
print(eng_aligned.index_to_key[idx], idx)

## Words with typos

In [49]:
word = "pomodoriniq"

print_examples(ita_new, eng_new, word)

False tomato 16956


In [32]:
word = "alberelo"

idx = find_closest_vector(ita_new[word], ita_new.vectors)
key = ita_new.index_to_key[idx]
print(key, idx)
idx = find_closest_vector(ita_new[key], eng_new.vectors)
print(ita_new.has_index_for(word), eng_new.index_to_key[idx], idx)

alberello 79967
False tree 1664


In [17]:
word = "albero"

idx = find_closest_vector(ita_new[word], ita_new.vectors)
key = ita_new.index_to_key[idx]
idx = find_closest_vector(ita_new[word], eng_new.vectors, 10)
print(ita_new.has_index_for(word))
print()
for i in idx: 
    print(eng_new.index_to_key[i])

False

soulamea
swae
kekenboschia
amboro
noega
caribbeana
albanyana
waina
paniola
loranga


### Noise Experiments

Elephant -> Elefante -> Elefant

In [55]:
eng_ita = noise_experiment(eng_new, ita_new, "elephant")
eng_ita

26 26 26


[(array(['elefante', 'elefanten'], dtype='<U10'),
  array([25,  1], dtype=int64)),
 (array(['akun', 'xbj', 'wami', 'varu', 'umari', 'tkv', 'tawe', 'sohatu',
         'slokar', 'sku', 'shaji', 'ranvir', 'pagal', 'oot', 'malla',
         'khri', 'fôn', 'elefante', 'dhar', 'bhu', 'aset', 'arpaso',
         'aphur', 'all»', 'карл', '고개'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['elefante', 'elefantide', 'elefanten', 'elefanti', 'elephanta'],
        dtype='<U10'),
  array([14,  6,  2,  2,  2], dtype=int64))]

In [57]:
eng_pt = noise_experiment(eng_new, pt_new, "elephant")
eng_pt

26 26 26


[(array(['elefant', 'elefante'], dtype='<U10'), array([22,  4], dtype=int64)),
 (array(['kaula', 'kir', ',ai', 'vavó', 'supah', 'suf', 'sika',
         'quincannon', 'mukia', 'melaka', 'magech', 'kosti', 'khak', 'ka',
         'hihi', 'gady', 'bhandari', 'akshak', 'ajaka', 'aei', '/la', '/en',
         'μj', '안녕'], dtype='<U10'),
  array([2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['elefant', 'elefantes', 'elefanta', 'elefante', 'elephas'],
        dtype='<U10'),
  array([17,  4,  3,  1,  1], dtype=int64))]

In [54]:
ita_eng = noise_experiment(ita_new, eng_new, "elefante")
ita_eng

26 26 26


[(array(['elephant'], dtype='<U10'), array([26], dtype=int64)),
 (array(['>la', '见', '小丑', '—mar', 'όπλα', 'γίγας', 'αντ', 'ɔŋ', 'ìl',
         'yeia', 'toŋ', 'terion', 'synes', 'oneia', 'oino', 'nsdnld', 'ndo',
         'myrion', 'molou', 'l̩', 'kaloula', 'ergenia', 'ekdikitho',
         'athinoula', '銘', '\uf06e'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['elephant', 'elephanten'], dtype='<U10'),
  array([25,  1], dtype=int64))]

In [56]:
ita_pt = noise_experiment(ita_new, pt_new, "elefante")
ita_pt

26 26 26


[(array(['elefante', 'elefantinho', 'elefanta'], dtype='<U11'),
  array([18,  7,  1], dtype=int64)),
 (array(['alaôr', 'κέρκυρα', 'ελευθερία', 'zótico', 'svaðilfari', 'sirene',
         'ratatoskr', 'pírrica', 'posível', 'paymaster', 'particpante',
         'ortígia', 'ne~e', 'mícala', 'molóssia', 'imitador', 'haliaeetus',
         'fsfla', 'enéade', 'egíalo', 'díon', 'cacófato', 'batlha',
         'babaca', 'طرابلس', '金色のガッシュベル'], dtype='<U11'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['elefant', 'elefante', 'elefanten', 'elefanta', 'elefantes'],
        dtype='<U11'),
  array([12, 10,  2,  1,  1], dtype=int64))]

In [23]:
pt_eng = noise_experiment(pt_new, eng_new, "elefante")
pt_eng

26 26 26


[(array(['elephant'], dtype='<U10'), array([26], dtype=int64)),
 (array(['#og', '٤', 'πf', 'ζφη', '}♥', 'tüt', 'tyua', 'tieion', 'rate}}',
         'poper', 'ogn', 'lioni', 'lefta', 'kyllene', 'ihnc', 'heracle',
         'gadara', 'ezzie', 'avator', 'amatue', 'aegp', '>ho', '+#',
         '%omitted%', 'যুব', '—met'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['elephant', 'elephants', 'elephanten'], dtype='<U10'),
  array([23,  2,  1], dtype=int64))]

In [59]:
pt_ita = noise_experiment(pt_new, ita_new, "elefante")
pt_ita

26 26 26


[(array(['elefante'], dtype='<U29'), array([26], dtype=int64)),
 (array(['bènna', 'pumminale', '#bbbbbe', 'λύρα', 'δημήτηρ',
         'videogioco/film/libro/fumetto', 'trampy', 'rotolante',
         'rivoltoso', 'puliero', 'nausimedonte', 'megasound', 'matunata',
         'm/v', 'j/p', 'hafenkante', 'guerriero/bolero', 'fedala',
         'controassicurazione', 'ciclopentolato', 'būlāq', 'afflante',
         'ἀνδρός', '会話'], dtype='<U29'),
  array([2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['elefanti', 'elefante', 'elefant', 'elefanten', 'elefantide',
         'elefantina'], dtype='<U29'),
  array([11, 10,  2,  1,  1,  1], dtype=int64))]

Apple -> Mela -> Maçã

In [60]:
eng_ita = noise_experiment(eng_new, ita_new, "apple")
eng_ita

26 26 26


[(array(['babbler', '/apple', 'tix', 'ssop', 'pymble', 'puledro', 'pix',
         'pimple', 'orv', 'omputing', 'nutt', 'mouseup', 'marmell',
         'mapple', 'madin', 'lappo', 'hopple', 'gobble', 'flitt',
         'fingerboard', 'erby', 'broccolo', 'weedy', 'zapple'], dtype='<U11'),
  array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['/doc', 'xpression', 'xmi', 'wiki@home', 'vob', 'ugc', 'tkv',
         'servator', 'pout', 'pns', 'nēnē', 'nswp', 'nee', 'mālā', 'markan',
         'lgu', 'iind', 'iat', 'ddj', 'daal', 'chapero', 'bucy', 'ammalato',
         'agbu', 'zvezde', 'σε'], dtype='<U11'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['apple', 'hackberry', 'acorus', 'mele', 'marshmallow', 'kempston',
         'ider', 'frish', 'egeskov', 'cydia', 'bracker', 'aros', 'applet',
         'apperson', 'amarok', 'orx', 'vinx'], dtype='<U11'),
  array([

In [62]:
eng_pt = noise_experiment(eng_new, pt_new, "apple")
eng_pt

26 26 26


[(array(['snapple', 'apple', 'dapple', 'treach', 'squeezebox', 'scrapple',
         'rappaport', 'pownall', 'pegan', 'odp', 'noblett', 'murcho',
         'mtabletools', 'lapp', 'fulvo', 'cocoa', 'caswell', 'caskey',
         'beanshell', 'vinn', 'wintry'], dtype='<U11'),
  array([4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        dtype=int64)),
 (array(['vob', 'thoi', '#nome', 'z/', 'xpert', 'wikiwix', 'vlis', 'vierde',
         'purinsesu', 'planeten', 'nle', 'navigateurs', 'morreu', 'mje',
         'labview', 'jcl', 'hif', 'grc', 'dde', 'cre', 'ceru', 'alguidá',
         'zongo', 'ôm'], dtype='<U11'),
  array([2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['apple', 'dapple', 'applet', 'appletree', 'beans', 'cerejas',
         'citrix', 'colca', 'colvillea', 'gingerbread', 'hedegaard',
         'hillingdon', 'kappler', 'tix', 'twix', 'whoami'], dtype='<U11'),
  array([9, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [61]:
ita_eng = noise_experiment(ita_new, eng_new, "mela")
ita_eng

26 26 26


[(array(['#ela', 'vihta', 'taveiro', 'stachyris', 'phina', 'nsaliwa',
         'mfutila', 'lög', 'kudina', 'kgph', 'kajona', 'ɔl', 'jämejala',
         'essä', 'emela', 'ejal', 'dssu', 'aqvr', 'agrilina', 'add_text',
         'abiana', '#vela', '#mint', 'gerka', '黃霑'], dtype='<U9'),
  array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1], dtype=int64)),
 (array(['adda', 'unilazer', 'ukhuwah', 'ratnu', 'plačem', 'pirlita',
         'pečat', 'paalma', 'mmma', 'meşe', 'meyla', 'meola', 'menla',
         'meala', 'karadjova', 'ipalapa', 'femra', 'efla', 'dhela', 'dehmi',
         'cuyana', 'châtain', 'brayeux', 'alila', 'uzet', 'zsye'],
        dtype='<U9'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['#ash', 'sauvelade', 'qedar', 'omato', 'naame', 'melax', 'melas',
         'melar', 'melan', 'medun', 'malat', 'konae', 'kelil', 'kahma',
         'ibil', 'ghah', 'elav', 'elau', 

In [20]:
ita_pt = noise_experiment(ita_new, pt_new, "mela")
ita_pt

26 26 26


[(array(['/xp', 'tjängvide', 'skärholmen', 'semora', 'sambabaca', 'ravula',
         'quiéreme', 'pt&lr', 'proenca', 'nmea', 'muniadona', 'matauatu',
         'käina', 'keulla', 'kapenga', 'iõla', 'ipala', 'heikkila',
         'haaparanta', 'główna', 'chalybea', 'celeneh', 'botox', 'bitupitá',
         'ödeshög', 'šimeček'], dtype='<U18'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['arzú', 'woevre', 'vocalviolão', 'tišina', 'tiye', 'slitaz',
         'sikandra', 'saraghina', 'sabika', 'moapa', 'marijana', 'laperche',
         'kirui', 'kayzer', 'järna', 'ihgal', 'hawza', 'harina', 'goès',
         'chilapa', 'casalinhos', 'berghaus', 'başar', 'baraom', 'wróbel',
         'şıkıdım'], dtype='<U18'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['tsafrir', 'adhamiyah', 'tanios', 'studiocanalworking', 'roumois',
         'repolhos', 'naama

In [22]:
pt_eng = noise_experiment(pt_new, eng_new, "maçã")
pt_eng

26 26 26


[(array(['apple', 'oidb', 'raspberry', 'actvity', 'νερά', 'äpple', 'znb',
         'zfq', 'water_', 'sweeta', 'svbv', 'pyt', 'mmbl', 'infa', 'glob',
         'eg£', 'daec', 'aquafish', 'allusia', 'ගුණ', '哈尼'], dtype='<U11'),
  array([4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        dtype=int64)),
 (array(['#mir', 'uneh', 'tular', 'tammara', 'stripe', 'sozopetra', 'rinho',
         'ndimbati', 'meŋeu', 'lmap', 'lamarckian', 'khreyn', 'kalakat',
         'kadozeik', 'hulah', 'flava', 'fabó', 'dɛ', 'cierno', 'cichla',
         'ceiça', 'beecheyi', 'avı', 'agall', 'å³', '卒業'], dtype='<U11'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['#bread', '것만', '粟', '伯樂', 'また', '‹y›', '—cut', 'ಶೇ',
         'vermelhinho', 'souse', 'sieiro', 'savóia', 'rootin', 'prunes',
         'prugne', 'pot,', 'köla', 'hawberries', 'halvas', 'cocore',
         'christkind', 'barme', 'apples', '#chick', '박물관', '패밀리']

In [21]:
pt_ita = noise_experiment(pt_new, ita_new, "maçã")
pt_ita

26 26 26


[(array(['melagrana', 'supernazione', 'sunchild', 'starquake', 'soaco',
         'raspberry', 'pear', 'obdotta', 'moonchild', 'mielata', 'metafora',
         'trädet', 'airflow', 'imbalsamazione', 'icedtea', 'gelatina',
         'g+c', 'fragoletta', 'firnificazione', 'cornflake',
         'autocompilazione', 'animazione', 'anguria', 'lattica', 'vbnc'],
        dtype='<U16'),
  array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1], dtype=int64)),
 (array(['alamgiri', 'superdurezza', 'slöngvir', 'shabranigdu',
         'seccoborella', 'pemmican', 'patè', 'parahybana', 'misà', 'minù',
         'mappatura', 'makrana', 'lamarckiani', 'kukufeldia', 'jayasekera',
         'hurriti', 'horridula', 'goytisolo', 'ciadra', 'canditatura',
         'calzettone', 'blockführer', 'baḥr', 'baço', 'yanfolila', '壞女孩'],
        dtype='<U16'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['#é', 'τ

Butterfly -> Farfalla -> Borboleta

In [8]:
eng_ita = noise_experiment(eng_new, ita_new, "butterfly")
eng_ita

26 26 26


[(array(['farfalla', 'butterfly', 'glaucopsyche', 'ildiscobolo', '$this',
         '/cosa', 'ndingi'], dtype='<U12'),
  array([17,  2,  2,  2,  1,  1,  1], dtype=int64)),
 (array(['blabber', 'tanase', 'synephebi', 'stalo', 'scha', 'pelz', 'ooi',
         'newstead', 'klei', 'kimo', 'joger', 'hillz', 'heeeeere',
         'gymnobucco', 'grilla', 'gambus', 'fr/de', 'fotter', 'flammer',
         'figona', 'farfallone', 'farfalla', 'corner', 'cherina',
         'wildgravio', 'дб'], dtype='<U12'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['farfalla', 'glaucopsyche', 'farfalline', 'kipod'], dtype='<U12'),
  array([19,  5,  1,  1], dtype=int64))]

In [19]:
eng_pt = noise_experiment(eng_new, pt_new, "butterfly")
eng_pt

26 26 26


[(array(['borboleta', 'stutterfly', 'butterfly', 'knode'], dtype='<U15'),
  array([17,  6,  2,  1], dtype=int64)),
 (array(['#emma', 'stunna', 'stiletto', 'scylax', 'schickele', 'scarus',
         'rácz', 'rufoniger', 'rhomb', 'omele', 'mörch', 'knochen',
         'hysterica', 'honeyeater', 'holler', 'helvola', 'ferax', 'cocco',
         'cerkno', 'buzau', 'buchholzi', 'botamo', 'bardach', 'agathocles',
         'stutterfly', 'zella'], dtype='<U15'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['borboleta', 'gempylus', 'borboletinha', 'butterflycorner',
         'cupid', 'kaempferia', 'melax', 'whax'], dtype='<U15'),
  array([18,  2,  1,  1,  1,  1,  1,  1], dtype=int64))]

In [7]:
ita_eng = noise_experiment(ita_new, eng_new, "farfalla")
ita_eng

26 26 26


[(array(['butterfly', 'gutterfly', 'photoperiod'], dtype='<U11'),
  array([22,  3,  1], dtype=int64)),
 (array(['tegi', 'abbod', '愛才', '呸', '؋', 'örfi', 'zacca', 'widd', 'waati',
         'viminia', 'timbur', '陰入', 'plectro', 'ndua', 'mullina', 'maysa',
         'lugna', 'llamp', 'gullfoss', 'grami', 'cymm', 'cethar', 'bedd',
         'neesa', '麻里子'], dtype='<U11'),
  array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1], dtype=int64)),
 (array(['butterfly'], dtype='<U11'), array([26], dtype=int64))]

In [9]:
ita_pt = noise_experiment(ita_new, pt_new, "farfalla")
ita_pt

26 26 26


[(array(['mariposa/borboleta', 'borboleta'], dtype='<U22'),
  array([22,  4], dtype=int64)),
 (array(['abençoasse', 'streoneshalh', 'sjöjungfrun', 'pai…', 'orthia',
         'ondimba', 'maíllo', 'járnsaxa', 'go/under',
         'femininopb/andebolpe', 'faraona', 'evanora', 'elayirampannai',
         'crispa', 'citreola', 'cellador', 'carruthersi', 'brigði',
         'bolimbolacho', 'bipinnula', 'bernau_breitscheidstr_', 'batlha',
         'allbäck', 'aleivosa', 'taghrout', 'شيخ'], dtype='<U22'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['borboleta', 'astyoche', 'foecundatrix', 'mariposa/borboleta'],
        dtype='<U22'),
  array([23,  1,  1,  1], dtype=int64))]

In [10]:
pt_ita = noise_experiment(pt_new, ita_new, "borboleta")
pt_ita

26 26 26


[(array(['farfalla', 'farfallina'], dtype='<U13'),
  array([25,  1], dtype=int64)),
 (array(['dafrosa', 'spante', 'siapiccia', 'semiflava', 'seccoborella',
         'santafusca', 'pozarica', 'polarografica', 'pir²', 'pioggerellina',
         'miñarro', 'marirosa', 'luccketta', 'lanzahíta', 'l`unica',
         'guttula', 'granatino', 'granatina', 'galetta', 'fialetta',
         'faísca', 'doletta', 'diglio', 'deflatore', 'versicolora',
         'šalina'], dtype='<U13'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['farfalla', 'farfalle', 'chrysomelinae'], dtype='<U13'),
  array([23,  2,  1], dtype=int64))]

In [12]:
pt_eng = noise_experiment(pt_new, eng_new, "borboleta")
pt_eng

26 26 26


[(array(['butterfly'], dtype='<U11'), array([26], dtype=int64)),
 (array(['#cska', '#cada', 'sisterna', 'se平', 'pudorella', 'prilla',
         'plectrura', 'pistoli', 'pepta', 'montela', 'kunka', 'inkcap',
         'dusta', 'cremastra', 'chepinoga', 'butterfly', 'asoleni',
         'agarita', '$–', '#puna', '#eta', '#crazy', 'velena', '청'],
        dtype='<U11'),
  array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['butterfly', 'butterflies'], dtype='<U11'),
  array([25,  1], dtype=int64))]

Cat -> Gatto -> Gato 

In [24]:
eng_ita = noise_experiment(eng_new, ita_new, "cat")
eng_ita

26 26 26


[(array(['ccat', 'acat', 'tsf', 'tribunal', 'tix', 'sottocategoria', 'scat',
         'pkg', 'oms', 'nsm', 'monobox', 'lmn', 'lcat', 'kcat', 'jct',
         'iiop', 'icat', 'fcat', 'esecutività', 'ecat', 'cgg', 'analingus',
         'vnx', 'zcat'], dtype='<U14'),
  array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['/dev/hd', 'sysop', 'stemma', 'shvat', 'sgp', 'rosc', 'nter',
         'nib', 'miniacea', 'marocchina', 'isme', 'ial', 'ftu', 'dehm',
         'czy', 'cuera', 'csat', 'cisac', 'chat', 'ceat', 'ccat', 'cbv',
         'br/index', 'an/', 'tvl', 'ᛋ'], dtype='<U14'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['atf', 'scu', 'probyn', 'ppy', 'piperonil', 'phbh', 'nex', 'myd',
         'macc', 'lnk', 'gatti', 'ftu', 'fttx', 'espera', 'coatl', 'cmp',
         'cim', 'cesvi', 'caty', 'catu', 'cato', 'cath', 'cate', 'camelli',
         'telem', 'xp

In [25]:
eng_pt = noise_experiment(eng_new, pt_new, "cat")
eng_pt

26 26 26


[(array(['dcat', 'kcat', 'fcat', 'capes', 'uw', 'scat', 'redirec', 'lcat',
         'ipfix', 'ipf', 'icg', 'icat', 'icap', 'icann', 'fdh', 'fct',
         'fap', 'escolaridade', 'ciat', 'ccs', 'categorize', 'xac', 'xsl'],
        dtype='<U12'),
  array([2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1], dtype=int64)),
 (array(['aaj', 'tnef', 'sysop', 'rbe', 'qaq', 'pold', 'peziza', 'inp',
         'iet', 'idc', 'huapango', 'hekat', 'hayy', 'gtm', 'fpe', 'efqm',
         'csat', 'chat', 'cepae', 'ccs', 'ccpg', 'cbat', 'brasão',
         'argelino', 'télé', 'wijs'], dtype='<U12'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['botella', 'uidoc', 'tezcatl', 'telemedia', 'tany', 'sewell',
         'playtone', 'mungo', 'lactiflora', 'keira', 'jtf', 'gpio', 'gatos',
         'dcn', 'cpe', 'cooc', 'cbp', 'catu', 'cato', 'cati', 'cath', 'cat',
         'cabr', 'brr', 'unfpa', 'xqt'], dtype='<

In [26]:
ita_eng = noise_experiment(ita_new, eng_new, "gatto")
ita_eng

26 26 26


[(array(['pottore', 'dlisti', '#mouse', 'tootle', 'sulmoni', 'stimate',
         'puleng', 'puffi', 'peggit', 'nekomaru', 'guľa', 'gryll', 'gippo',
         'fabin', 'dictis', 'checchio', 'carichini', 'buffoni',
         'beaglebone', 'avarolli', ',½', '#ugo', '啓', '陰入'], dtype='<U10'),
  array([2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['#tot', '本の泉社', 'すみれ', 'ʟ', 'ǭ', 'ƅ', 'vignate', 'sqay', 'smalli',
         'sinjoro', 'serd', 'rotner', 'opčine', 'muntanyola', 'miolo',
         'jantti', 'idey', 'hyoh', 'glci', 'eggther', 'dag', 'comerás',
         'beltri', 'bejgli', '永曆', '粟'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['cat', '#chick', '}ş', 'rosov', 'pig', 'nyan', 'mory', 'lungleg',
         'klængur', 'kitten', 'grumpig', 'flewell', 'ducku', 'doglion',
         'crabbit', 'chimpy', 'chimplee', 'chimpa', 'buford', 'bootin',


In [27]:
ita_pt = noise_experiment(ita_new, pt_new, "gatto")
ita_pt

26 26 26


[(array(['luciaccoelho', 'animago', 'vanzi', 'vanhoye', 'ursolino', 'urso',
         'tribble', 'stozice', 'saci', 'pessagno', 'oldroyd', 'merlino',
         'loquasto', 'frsa', 'frassilongo', 'folivora', 'fantascienza',
         'edilcuoghi', 'dalcio', 'bicci', 'beeblebrox', 'wallago', 'wolo'],
        dtype='<U19'),
  array([4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1], dtype=int64)),
 (array(['autografo', 'vikedal', 'uffugo', 'taladro', 'seninho', 'sallo',
         'saliento', 'sabàto', 'reizinho', 'raiano', 'nesma', 'maíllo',
         'mataskelekele', 'lusó/ôfona', 'i´ve', 'izvoarele', 'fiwc',
         'fedelho', 'etxebarria', 'diferença_uct_verão', 'calógero',
         'bokuto', 'bigdelete', 'bibino', 'íole', '罠'], dtype='<U19'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['rato/mouse', 'cachorro', 'babbo', 'sturmeck', 'spunky',
         'serelepe', 'sabichão', 'piriquito'

In [28]:
pt_eng = noise_experiment(pt_new, eng_new, "gato")
pt_eng

26 26 26


[(array(['&pb', 'ǐ', '~to', 'unisono', 'tuzlu', 'tokunai', 'tioro',
         'spingere', 'shound', 'oxl', 'opdivo', 'ofono', 'mmvr', 'miseno',
         'liru', 'hpè', 'halcyone', 'froso', 'feletto', 'daioni', 'cs−',
         'bl&srcid', 'axyi', 'ancoro', 'ḯ', '진실'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['bāru', '株', '弘仁', '}ş', 'vando', 'ular', 'uccio', 'toak', 'talán',
         'shouto', 'shound', 'sento', 'seiho', 'piț', 'mąka', 'moonland',
         'hunk', 'hróðulfr', 'heno', 'galton', 'gaito', 'expended',
         'energo', 'eext', '項', '욱'], dtype='<U10'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['bagcal', 'ㄑ', 'রাত', 'āhole', 'yoep', 'woolie', 'vykhlop',
         'toyak', 'toper', 'tbpa', 'stinko', 'rancer', 'pochenko', 'hueu',
         'honeybeast', 'haskey', 'gator', 'filmport', 'emon', 'draculon',
    

In [29]:
pt_ita = noise_experiment(pt_new, ita_new, "gato")
pt_ita

26 26 26


[(array(['&sig', 'κλειώ', 'vospro', 'trìnita', 'transcranico', 'tambò',
         'skeletron', 'serina/treonina', 'scodinzolante', 'rovarè',
         'resa_vino', 'pucajirca', 'nicostene', 'neroccio',
         'motocompressore', 'menuetto', 'kawasumi', 'idrossichinolina',
         'gemini_', 'flio', 'faraualla', 'dolone', 'cantabile', 'bruscello',
         'نهر', '清水'], dtype='<U16'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['arturiano', 'мιѕтєя', 'tentmaker', 'spesa', 'rospodotto',
         'raposo', 'preyas', 'potà', 'ozrock', 'otumbo', 'okaasan',
         'nome_leftbar', 'mužiki', 'miñarro', 'micidial', 'korpikoski',
         'iante', 'guardsman', 'gnifone', 'galton', 'gaito', 'bernabucci',
         'bailato', 'azmorigan', '山', '組合'], dtype='<U16'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['boomer', 'tigger', 'taʻu', 'takiki',

test

In [30]:
eng_ita = noise_experiment(eng_new, ita_new, "photography")
eng_ita

26 26 26


[(array(['fotografia_', 'fotogr', 'fotografias'], dtype='<U13'),
  array([16,  6,  4], dtype=int64)),
 (array(['fotografia_', 'fotografias', 'fotoit', 'fototecnica', 'photo/',
         'spettroscopia', 'agd', 'fluorescenza', 'fotografien',
         'nomografia', 'ografia', 'photoblog', 'photodo', 'photographing',
         'photorec', 'raman'], dtype='<U13'),
  array([6, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['fotogr', 'fotografia_'], dtype='<U13'),
  array([25,  1], dtype=int64))]

In [32]:
eng_ita = noise_experiment(eng_new, ita_new, "photo")
eng_ita

26 26 26


[(array(['mb_photo', 'photo/', 'fotografia_', '//image', '/foto',
         '/images/foto', 'cronofotografia', 'img_id', 'img_media_type',
         'iphoto', 'photo', 'photobucket'], dtype='<U15'),
  array([10,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int64)),
 (array(['/cor', 'yabo', 'vakataka', 'upas', 'totok', 'tewa', 'statistiku',
         'se/', 'pnu', 'nikaea', 'nampho', 'mgn', 'luong', 'lambang',
         'kuru', 'kou', 'inao', 'ilayang', 'haita', 'gyala', 'fights',
         'dafundo', 'baringo', 'bangt', 'yango', 'ⁿ'], dtype='<U15'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['fotogr', 'fotoit', 'fotos', 'photodo', 'foto', 'fotochimico',
         'fotofex', 'fotoni', 'fotopoulos', 'fotosub', 'fotothek', 'photo/',
         'photographing', 'photon'], dtype='<U15'),
  array([4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))]

In [31]:
eng_pt = noise_experiment(eng_new, pt_new, "photography")
eng_pt

26 26 26


[(array(['fotografia\xa0', 'fotografia', 'fotogravura', 'photographe',
         'photographo'], dtype='<U14'),
  array([21,  2,  1,  1,  1], dtype=int64)),
 (array(['fotometria', 'fotolitografia', 'colorimetria', 'encoding', 'fot',
         'fotog', 'fotografia', 'fotoquímica', 'photobleaching',
         'photografia'], dtype='<U14'),
  array([15,  3,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int64)),
 (array(['fotograf', 'fotografia\xa0', 'fotografie', 'fotográfia',
         'photographe', 'photos'], dtype='<U14'),
  array([11, 10,  2,  1,  1,  1], dtype=int64))]

In [33]:
eng_pt = noise_experiment(eng_new, pt_new, "photo")
eng_pt

26 26 26


[(array(['photos', 'imagem_paisagem', 'iphoto', 'imagemap', 'imagej',
         'wbmp', '/foto', 'image_caption', 'imagem_', '/timeline/', '/deka',
         'mn/', 'movieweb', 'photographer', 'photographo', 'photoimpact',
         'imagecount', '//images'], dtype='<U17'),
  array([3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['diuca', 'shimao', 'saona', 'ricla', 'pride', 'pri', 'pikul',
         'ongkiko', 'niumi', 'ngadi', 'nashik', 'nagatoki', 'moo',
         'mokambo', 'manalo', 'komai', 'kanmon', 'ippai', 'ikom', 'goroka',
         'gho', 'gebrinio', 'ganale', 'fcr', 'tsuchiura', 'xanax'],
        dtype='<U17'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['foton', 'fot', 'fotog', 'fotu', 'fotograf', 'fotografie',
         'fotográfa', 'fotoisomerização', 'fotoluminescência', 'fotos',
         'fóton'], dtype='<U17'),
  array([8, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1], dtype=int64))]

In [35]:
ita_eng = noise_experiment(ita_new, eng_new, "fotografia")
ita_eng

26 26 26


[(array(['orthophotography', 'fotography', 'photography', 'photograp',
         'photograpgh', 'photography,', 'photograpy', '‘photograph'],
        dtype='<U16'),
  array([11,  6,  4,  1,  1,  1,  1,  1], dtype=int64)),
 (array(['photogram', 'photog', 'photography', 'photography,',
         '//photography', 'fotofo', 'fotou', 'kaera', 'nudi',
         'photogallery', 'photogaphy', 'photogr', 'photograp',
         'photographics', 'photogs', '‘photo'], dtype='<U16'),
  array([6, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['photograp', 'photograpgh', 'photograph,', 'photogram',
         'photograpy', '‘photograph', 'fotography', 'photographics'],
        dtype='<U16'),
  array([7, 7, 4, 2, 2, 2, 1, 1], dtype=int64))]

In [36]:
ita_eng = noise_experiment(ita_new, eng_new, "foto")
ita_eng

26 26 26


[(array(['#β', '明美', '巷', '\u3000photo', 'ᵉ', 'vphoto', 'ufotable', 'treu',
         'tjz', 'smartshot', 'ojf', 'oinu', 'nycfoto', 'nsmap', 'newave',
         'liabe', 'jxz', 'imge', 'imazamox', 'il_', 'exz', 'eurofoto',
         'airmypc', '+no', '粟', '衝動'], dtype='<U13'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['#screen', 'ucum', 'tanca', 'tambelan', 'strečen', 'stergo',
         'soundgrid', 'semore', 'quinio', 'qto', 'pced', 'liti', 'lapcevic',
         'koncu', 'johto', 'fucky', 'fonto', 'fomu', 'erê', 'crysos',
         'cr&fc', 'ceļa', 'bristled', 'boric', 'ukcat', '孟海公'], dtype='<U13'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['photok', 'photo,', 'photolab', 'foton', 'fotou', 'phot', 'photo#',
         'photodex', 'photoed', 'photoelectric', 'photoes', 'photoetch',
         'photogram', 'photographe', 'photogs', 'photov

In [34]:
ita_pt = noise_experiment(ita_new, pt_new, "fotografia")
ita_pt

26 26 26


[(array(['fotografia\xa0', 'fotografia', 'microfotografia',
         'aerofotografia', 'fotogravura', 'macrofotografia',
         'fotogrametria'], dtype='<U16'),
  array([8, 4, 4, 3, 3, 3, 1], dtype=int64)),
 (array(['fotografia', 'fotogravura', 'fotografia\xa0', 'fotogrametria',
         'fotojornalismo', 'aerofotografia', 'cronofotografia',
         'fotografei', 'fotografem', 'fotográfia', 'kirliangrafia',
         'maghroumeh', 'microfotografia', 'videorreportagem'], dtype='<U16'),
  array([8, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['fotografia\xa0', 'fotogravura', 'fotografei', 'fotografem'],
        dtype='<U16'),
  array([21,  3,  1,  1], dtype=int64))]

In [37]:
ita_pt = noise_experiment(ita_new, pt_new, "foto")
ita_pt

26 26 26


[(array(['#icasa', 'xpdf', 'varredura', 'ušće', 'usúario', 'spiess',
         'photofolia', 'netmanage', 'lmbassman', 'imagej', 'hydrolase',
         'hotol', 'geocities', 'futeboldegoyaz', 'fotosite', 'externo',
         'en&nrm', 'com/photos/landshells_freshwater_gastropods/',
         'brincadeirinha', 'azulita', 'asa/', 'animepro', '_albirex',
         '/print', 'ずっと', 'タチコマ'], dtype='<U44'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['abülfaz', 'valmaior', 'setuid', 'retruca', 'pitio', 'pignerol',
         'pendjari', 'nescessário', 'melingo', 'maskwak', 'kã', 'katú',
         'indologia', 'imprença', 'gudfred', 'fonto', 'fjärde',
         'empoleirado', 'edi', 'dosma', 'dennison', 'cecotto', 'benzoato',
         'asbs/suzano', 'vetro', 'xdk'], dtype='<U44'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['foton', 'fotográfo', 'fot

In [40]:
pt_eng = noise_experiment(pt_new, eng_new, "fotografia")
pt_eng

26 26 26


[(array(['photography', 'orthophotography', 'geophotography'], dtype='<U16'),
  array([18,  6,  2], dtype=int64)),
 (array(['photography', 'photog', 'photogr', 'photograp', 'photography,',
         'photography}}', 'goniodes', 'hamri', 'motogp', 'olaba',
         'photofit', 'photogen', 'photograper', 'photographer,', 'photomap',
         '藤崎'], dtype='<U16'),
  array([6, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['photograp', 'photography', 'photogram', 'photographics',
         'photograph,', 'photography,', 'photograpgh', 'photographs'],
        dtype='<U16'),
  array([8, 5, 4, 3, 2, 2, 1, 1], dtype=int64))]

In [38]:
pt_eng = noise_experiment(pt_new, eng_new, "foto")
pt_eng

26 26 26


[(array(['#β', 'cm,', '十手', 'ⱦ', '‘photo', 'ගුණ', 'αtc', 'zbg', 'yarara',
         'xyg', 'svgimage', 'sc,', 'ppix', 'photograph', 'photobank', 'pgv',
         'mwha', 'lmap', 'kilometr', 'ixy', 'dlsv', 'closeup', '翁', '）』'],
        dtype='<U13'),
  array([2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=int64)),
 (array(['>ho', '星星', 'ฝ', '·na', 'verranno', 'tiuj', 'smarte', 'qyzyl',
         'plodn', 'nlib', 'nemah', 'meali', 'lykos', 'lagou', 'hornish',
         'honer', 'gieri', 'ganet', 'fonto', 'fluga', 'citran', 'caval',
         'astronom', '`n`', '真知子', '사이'], dtype='<U13'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['photok', 'photoed', 'photoink', 'photo,', 'photos', 'photolab',
         'photojournal', 'photograp', 'photogram', 'photoglo', 'photofile',
         'photof', 'photoetch', 'photoelectric', 'photocd', 'photoby',
         'photostat', 'phototube'], 

In [41]:
pt_ita = noise_experiment(pt_new, ita_new, "fotografia")
pt_ita

26 26 26


[(array(['astrofotografia', 'fotografica', 'macrofotografia'], dtype='<U15'),
  array([20,  5,  1], dtype=int64)),
 (array(['fotografia', 'fotogiglio', 'astrofotografia', 'fotografica',
         'photobeamer', 'eremomela', 'fotografandolo', 'fotografia»',
         'guanahani', 'jamize', 'lucorano', 'sportscar'], dtype='<U15'),
  array([7, 4, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1], dtype=int64)),
 (array(['fotografic', 'fotografica', 'fotogr', 'fotografe', 'fotografia',
         'fotografie'], dtype='<U15'),
  array([20,  2,  1,  1,  1,  1], dtype=int64))]

In [39]:
pt_ita = noise_experiment(pt_new, ita_new, "foto")
pt_ita

26 26 26


[(array(['atral', 'εr', 'µv/°c', 'us_open_', 'spjót', 'rimappatura',
         'riflettografia', 'retinale', 'pmvc', 'plb', 'photodraw', 'pgv',
         'o/rame', 'minisito', 'microindentazione', 'mhf', 'mangabeira',
         'jq', 'immagine', 'fototubo', 'fotorifrattivo', 'foto_veicolo',
         'foto', 'diametro_alla_base', '✠', 'キューティーハニー'], dtype='<U18'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['acmsetup', 'tigerbeat', 'rustom', 'probotector', 'kakkab',
         'gläubet', 'gerö', 'galvanoplastica', 'furst', 'foyt', 'foot/',
         'fonto', 'diffcile', 'córso', 'càndito', 'cunfida', 'caùto',
         'bulèta', 'bravissima', 'bombolone', 'beruatto', 'astronomico',
         'akuaduulza', 'adzope', 'wyton', 'ʿforte'], dtype='<U18'),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1], dtype=int64)),
 (array(['fotoit', 'fotograf', 'photobeamer', 'fotoluminescen

### 5: take random words from ita_new and watch the closest in eng_new and see if the pattern is the same with fastText

In [62]:
# generate 2000 random indexes between 0 and ita_new.index_to_key length 

np.random.seed(42)

idx = np.random.randint(0, 4000, 2000)

for i in idx:
    ita_word = ita_new.index_to_key[i]
    eng_word_ew = eng_new.index_to_key[find_closest_vector(ita_new[ita_word], eng_new.vectors)]
    eng_word_aw = eng_aligned.index_to_key[find_closest_vector(ita_aligned[ita_word], eng_aligned.vectors)]

    if ita_new.index_to_key[i] != ita_aligned.index_to_key[i]:
        print("PROBLEM: ", i)
    
    if eng_word_ew != eng_word_aw:
        print(ita_word, eng_word_ew, eng_word_aw)

NameError: name 'eng_aligned' is not defined

In [50]:
def generate_words(word):
    words = []
    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = c + word
        words.append(new_word)

    mid = len(word) // 2
    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = word[:mid] + c + word[mid:]
        words.append(new_word)

    for c in "abcdefghijklmnopqrstuvwxyz":
        new_word = word + c
        words.append(new_word)
    return words

len(generate_words("casa")) // 3

26

In [48]:
l = "abcdefghijklmnopqrstuvwxyz"
len(l)

26

# not working

In [None]:
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors
import pickle

def find_matrix(lang, step=1000):
    
    print("LANG", lang)
    dict = {}

    # not aligned
    try:
        src = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.{lang}.bin") 
        print("Loaded fastText vectors")
    except:
        print("Going to 'vec'")
        src = KeyedVectors.load_word2vec_format(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.{lang}.vec")
    dst = KeyedVectors.load_word2vec_format(f"/data1/malto/csavelli/aligned_subwords_fasttext/aligned/wiki.{lang}.align.vec") # aligned
    
    if src.index_to_key != dst.index_to_key:
        print("src and dst vocabularies differ. ")
        print("src", len(src))
        print("dst", len(dst))
        print("in src, not in dst", set(src.index_to_key) - set(dst.index_to_key))
        print("in dst, not in src", set(dst.index_to_key) - set(src.index_to_key))
    
    dict["missing_elements"] = [set(src.index_to_key) - set(dst.index_to_key), set(dst.index_to_key) - set(src.index_to_key)] # missing words in common vocabulary

    vocab = sorted(list(set(src.index_to_key) & set(dst.index_to_key)))
        
    Y = dst[vocab]
    X = src[vocab]

    W_ = np.linalg.pinv(X) @ Y

    prod = (X @ W_)
    prod = prod / np.linalg.norm(prod, axis=1).reshape(-1,1)

    dict["MSE"] = np.square(np.subtract(prod, Y)).sum(axis=1).mean() # mean squared error

    error_couples = []
    right_values = np.array([])

    for i in range(0, len(prod), step):
            M = (prod[i:i+step] @ Y.T)
            v = M.argmax(axis=1)

            # sum of the diagonal
            right_values = np.concatenate((right_values, np.diagonal(M[:,v])))
            for j in range(len(v)):
                if v[j] != i+j: # check that the most vector is the word itself
                    print("words do not match", i+j, v[j], M[j,v[j]])
                    print("instead the right word should be ", M[j,j])
                    error_couples.append((i+j, v[j], M[j,v[j]], M[j,j]))

            print(i, "/", len(prod), "done") if i % 50_000 == 0 else None
    
    dict["accuracy"] = (right_values.mean(), right_values.std()) # values of the diagonal of the matrix
    dict["n_errors"] = len(error_couples) # number of errors

    src.vectors = src.vectors @ W_
    src.vectors_ngrams = src.vectors_ngrams @ W_
    src.vectors = src.vectors / np.linalg.norm(src.vectors, axis=1).reshape(-1,1)
    src.vectors_ngrams = src.vectors_ngrams / np.linalg.norm(src.vectors_ngrams, axis=1).reshape(-1,1)

    return src, X, Y, W_, right_values, error_couples, dict

            
lang = "af"         
src, X, Y, W_, right_values, error_couples, dict = find_matrix(lang)

In [None]:
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors
import pickle

import os 

def create_bin(lang):
    
    print("LANG", lang)

    # not aligned
    try:
        src = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.{lang}.bin") 
        print("Loaded fastText vectors")
    except:
        print("Going to 'vec'")
        src = KeyedVectors.load_word2vec_format(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.{lang}.vec")
    
    path = "W/"
    file = f"{lang}.pkl"
    with open(os.path.join(path, file), "rb") as f:
        W_ = pickle.load(f)

    src.vectors = src.vectors @ W_
    src.vectors_ngrams = src.vectors_ngrams @ W_
    src.vectors = src.vectors / np.linalg.norm(src.vectors, axis=1).reshape(-1,1)
    src.vectors_ngrams = src.vectors_ngrams / np.linalg.norm(src.vectors_ngrams, axis=1).reshape(-1,1)

    return W_, src 

W1, src1 = create_bin("af")

In [None]:
FastText

In [None]:
src2 = KeyedVectors.load(f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/wiki.{lang}.bin")
vectors_ngrams = np.load(f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/wiki.{lang}.bin.vectors_ngrams.npy", allow_pickle=True)
vectors_vocab = np.load(f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/wiki.{lang}.bin.vectors_vocab.npy", allow_pickle=True)

In [None]:
(src.vectors == src1.vectors).all(), (src.vectors_ngrams == vectors_ngrams).all()

In [None]:
src.save("test")

In [None]:
bonk = KeyedVectors.load("test")
bonk.vectors == src.vectors, 

In [None]:
plain.vectors = src.vectors
plain.vectors_ngrams = src.vectors_ngrams

In [None]:
src, X, Y, W_, right_values, error_couples, dict = find_matrix(lang)

In [None]:
# save src with pickle 

import pickle

path = "W/"
file = f"{lang}.pkl"

with open(f"test2", "wb") as f:
                pickle.dump((src), f)

In [None]:
# load src with pickle
with open(f"test2", "rb") as f:
    bonk = pickle.load(f)

(bonk.vectors == src.vectors).all(), (bonk.vectors_ngrams == src.vectors_ngrams).all()  

In [None]:
lang = "af"
with open(f"/data1/malto/csavelli/aligned_subwords_fasttext/res/{lang}/wiki.{lang}.pkl", "rb") as f:
    bonk2 = pickle.load(f)

(bonk2.vectors == src.vectors).all(), (bonk2.vectors_ngrams == src.vectors_ngrams).all()  

In [None]:
src.save("test")

In [None]:
test_model = KeyedVectors.load("test")
vectors_ngrams = np.load("test.vectors_ngrams.npy", allow_pickle=True)
vectors_vocab = np.load(f"test.vectors_vocab.npy", allow_pickle=True)

In [None]:
vectors_ngrams.shape, vectors_vocab.shape, src.vectors_ngrams.shape, src.vectors.shape

In [None]:
v = vectors_vocab[0]

for row in src.vectors:
    if (row == v).all():
        print("found")
        break

In [None]:
src.vectors

In [None]:
vectors_vocab

In [None]:
src.vectors_ngrams == vectors_ngrams, src.vectors == vectors_vocab

In [None]:
test_model.vectors_ngrams == vectors_ngrams, test_model.vectors == src.vectors