In [1]:
import os 
import sys
import pickle

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors

import tqdm

In [2]:
def find_closest_vector(v, M, n = 1):
    """Return the index/indices of the rows of ``M`` scoring highest against ``v``.

    The score is the raw dot product ``np.dot(M, v)``. It is only a cosine
    ranking if the rows of ``M`` are unit-normalised, which is neither checked
    nor enforced here.

    Parameters
    ----------
    v : 1-D array-like of length d
        Query vector.
    M : 2-D array-like of shape (k, d)
        Candidate vectors, one per row.
    n : int, default 1
        Number of best-scoring rows to return.

    Returns
    -------
    A single integer index when ``n == 1``; otherwise an array of at most
    ``n`` row indices ordered by decreasing score.

    Notes
    -----
    The order among rows with exactly equal scores is not specified.
    """
    scores = np.dot(M, v)
    count = len(scores)

    # Hot path (used for every single-word lookup): a linear scan is enough,
    # there is no need to sort the whole vocabulary.
    if n == 1 and count > 0:
        return np.argmax(scores)

    # Top-n with 1 < n < k: select the n best in linear time, then order only those n.
    if 1 < n < count:
        top = np.argpartition(scores, -n)[-n:]
        return top[np.argsort(-scores[top])]

    # Remaining cases (n <= 0, n >= k, empty M): keep the plain full-sort behaviour.
    indices = np.flip(np.argsort(scores))[:n]
    return indices[0] if n == 1 else indices

def print_examples(starting_language, target_language, word):
    """Return the key of ``target_language`` whose vector scores highest against ``word``.

    The vector for ``word`` is looked up in ``starting_language`` and compared
    against every row of ``target_language.vectors`` via ``find_closest_vector``.
    Despite the name, nothing is printed; the matching key is returned.
    """
    source_vector = starting_language[word]
    best_index = find_closest_vector(source_vector, target_language.vectors)
    return target_language.index_to_key[best_index]

# Given a word, generate all the words obtained by adding each letter of the alphabet at the beginning, then in the middle, then at the end.

def generate_words(word, alphabet = "abcdefghijklmnopqrstuvwxyz"):
    """Generate noisy variants of ``word`` by inserting one extra character.

    For every character ``c`` in ``alphabet`` three variants are produced:
    ``c`` prepended, ``c`` inserted at position ``len(word) // 2``, and ``c``
    appended.

    Parameters
    ----------
    word : str
        The word to perturb.
    alphabet : str, default the 26 lowercase ASCII letters
        Characters to insert. Pass a different string to cover other
        character sets (e.g. accented letters).

    Returns
    -------
    list of str
        ``3 * len(alphabet)`` words, grouped as all prefixed variants, then
        all middle-insertion variants, then all suffixed variants, each group
        in ``alphabet`` order.
    """
    mid = len(word) // 2
    prefixed = [c + word for c in alphabet]
    infixed = [word[:mid] + c + word[mid:] for c in alphabet]
    suffixed = [word + c for c in alphabet]
    return prefixed + infixed + suffixed


# make a function that given a list of words, return for each word the closest word in the target language 

def find_closest_words(starting_language, target_language, words):
    """Map each word in ``words`` to its best match in ``target_language``.

    Each word is resolved with ``print_examples``; the returned list has one
    entry per input word, in the same order.
    """
    return [
        print_examples(starting_language, target_language, candidate)
        for candidate in words
    ]

def split_vector(v, n):
    """Split ``v`` into ``n`` consecutive chunks using ``np.array_split``.

    Returns the list of sub-arrays produced by ``np.array_split``; chunks may
    differ in length when ``len(v)`` is not a multiple of ``n``.
    """
    chunks = np.array_split(v, indices_or_sections=n)
    return chunks

# split the vector of results in three subvectors of the same length 
def noise_experiment(starting_language, target_language, word):
    """Translate every noisy variant of ``word`` and tally the results per noise position.

    The variants come from ``generate_words`` and are translated with
    ``find_closest_words``. The translations are then split into three
    consecutive chunks with ``split_vector``.

    Returns a list with one ``(values, counts)`` pair per chunk: the distinct
    translations found in that chunk and how often each occurred, ordered by
    decreasing count.
    """
    noisy_words = generate_words(word)
    translations = find_closest_words(starting_language, target_language, noisy_words)

    ranked = []
    for chunk in split_vector(translations, 3):
        distinct, occurrences = np.unique(chunk, return_counts=True)
        # Negating the counts makes the ascending argsort yield a descending order.
        by_frequency = np.argsort(-occurrences)
        ranked.append((distinct[by_frequency], occurrences[by_frequency]))
    return ranked

In [None]:
# heron_path = "/data1/malto/csavelli/aligned_subwords_fasttext/aligned/"
# print("loading italian vectors")
# ita_aligned = KeyedVectors.load_word2vec_format(f"wiki.it.align.vec")
# print("loading english vectors")
# eng_aligned = KeyedVectors.load_word2vec_format(f"wiki.en.align.vec")
#ita_wiki = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.it.bin") 
#eng_wiki = load_facebook_vectors(f"/data1/malto/csavelli/aligned_subwords_fasttext/wiki/wiki.en.bin")

Check whether the normal words are aligned with the same words as in fastText.

In [3]:
# Root folder containing one sub-folder per language code, each holding `wiki.<lang>.pkl`.
RES_DIR = "/data1/malto/csavelli/aligned_subwords_fasttext/res"


def load_vectors(lang, res_dir=RES_DIR):
    """Unpickle and return the object stored at ``<res_dir>/<lang>/wiki.<lang>.pkl``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at files from a trusted source.
    """
    with open(f"{res_dir}/{lang}/wiki.{lang}.pkl", "rb") as f:
        return pickle.load(f)


print("loading italian vectors")
ita_new = load_vectors("it")

print("loading english vectors")
eng_new = load_vectors("en")

print("loading portuguese vectors")
pt_new = load_vectors("pt")

loading italian vectors
loading english vectors
loading portuguese vectors


In [6]:
# Scratch check of matrix-product shapes: `a` is a random 2x2 matrix and `b` a random 3x2 matrix.

a = np.random.rand(2, 2)
b = np.random.rand(3, 2)

b@a  # (3, 2) @ (2, 2) -> one 2-element row per row of `b`

array([[0.32592416, 0.3984447 ],
       [0.84792559, 1.24908104],
       [0.54792281, 0.6696265 ]])

same language

In [None]:
# NOTE(review): `ita_aligned` is only assigned in the commented-out loading cell earlier in
# this notebook, so that load has to be restored before this cell can run on a fresh kernel.
idx = find_closest_vector(ita_new["casa"], ita_aligned.vectors)

ita_aligned.index_to_key[idx], idx

multi language

In [None]:
# Given a vector, find the closest vector to it in a matrix.
# TODO: consider using get_vector() to obtain the vector of a word.
# NOTE(review): `eng_aligned` is only assigned in the commented-out loading cell earlier in
# this notebook, so that load has to be restored before this cell can run on a fresh kernel.

idx = find_closest_vector(ita_new["ciao"], eng_new.vectors)
idx2 = find_closest_vector(ita_new["ciao"], eng_aligned.vectors)
eng_new.index_to_key[idx], idx, eng_aligned.index_to_key[idx2], idx2

In [None]:
idx = find_closest_vector(ita_new["casa"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
idx = find_closest_vector(ita_new["gatto"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
idx = find_closest_vector(ita_new["papero"], eng_new.vectors)
eng_new.index_to_key[idx], idx

In [None]:
# Same four Italian probes as above, but looked up in the aligned vectors.
# `ita_aligned` / `eng_aligned` are only assigned in the commented-out loading
# cell earlier in this notebook; that load must be restored for this cell to run.
for word in ("ciao", "casa", "gatto", "papero"):
    idx = find_closest_vector(ita_aligned[word], eng_aligned.vectors)
    print(eng_aligned.index_to_key[idx], idx)

## Words with typos

In [None]:
word = "pomodoriniq"

print_examples(ita_new, eng_new, word)

In [None]:
word = "alberelo"

idx = find_closest_vector(ita_new[word], ita_new.vectors)
key = ita_new.index_to_key[idx]
print(key, idx)
idx = find_closest_vector(ita_new[key], eng_new.vectors)
print(ita_new.has_index_for(word), eng_new.index_to_key[idx], idx)

In [None]:
word = "albero"

# Ten best-scoring English entries for the Italian vector of `word`.
# (The earlier same-language lookup was dropped: its result was never used and
# it cost a full pass over the Italian vocabulary.)
idx = find_closest_vector(ita_new[word], eng_new.vectors, 10)
# NOTE(review): presumably reports whether `word` is an in-vocabulary key — confirm against gensim docs.
print(ita_new.has_index_for(word))
print()
for i in idx: 
    print(eng_new.index_to_key[i])

### Noise Experiments

Elephant -> Elefante -> Elefante

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "elephant")
eng_ita

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "elephant")
eng_pt

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "elefante")
ita_eng

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "elefante")
ita_pt

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "elefante")
pt_eng

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "elefante")
pt_ita

Apple -> Mela -> Maçã

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "apple")
eng_ita

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "apple")
eng_pt

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "mela")
ita_eng

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "mela")
ita_pt

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "maçã")
pt_eng

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "maçã")
pt_ita

Butterfly -> Farfalla -> Borboleta

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "butterfly")
eng_ita

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "butterfly")
eng_pt

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "farfalla")
ita_eng

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "farfalla")
ita_pt

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "borboleta")
pt_ita

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "borboleta")
pt_eng

Cat -> Gatto -> Gato 

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "cat")
eng_ita

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "cat")
eng_pt

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "gatto")
ita_eng

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "gatto")
ita_pt

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "gato")
pt_eng

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "gato")
pt_ita

test

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "photography")
eng_ita

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "photo")
eng_ita

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "photography")
eng_pt

In [None]:
eng_pt = noise_experiment(eng_new, pt_new, "photo")
eng_pt

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "fotografia")
ita_eng

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "foto")
ita_eng

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "fotografia")
ita_pt

In [None]:
ita_pt = noise_experiment(ita_new, pt_new, "foto")
ita_pt

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "fotografia")
pt_eng

In [None]:
pt_eng = noise_experiment(pt_new, eng_new, "foto")
pt_eng

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "fotografia")
pt_ita

In [None]:
pt_ita = noise_experiment(pt_new, ita_new, "foto")
pt_ita

In [None]:
ita_eng = noise_experiment(ita_new, eng_new, "pecora")
ita_eng, print_examples(ita_new, eng_new, "pecora")

In [None]:
eng_ita = noise_experiment(eng_new, ita_new, "sheep")
eng_ita, print_examples(eng_new, ita_new, "sheep")

## Noise test

In [None]:
def evaluate_language(model1, model2, word, fraction = True):
    """Measure how often noisy variants of ``word`` still map to its clean translation.

    The clean translation is ``print_examples(model1, model2, word)``. The noisy
    translations come from ``noise_experiment``, which yields one
    ``(values, counts)`` pair for each of the three noise groups.

    Parameters
    ----------
    model1, model2 : source and target vector models
    word : the word to evaluate, looked up in ``model1``
    fraction : if True, each count is divided by the total count of its group;
        if False, the raw count is reported.

    Returns
    -------
    A one-element list ``[(word, closest_word, found)]`` where ``found`` has
    three entries, one per noise group (0 when the clean translation does not
    appear in that group).
    """
    closest_word = print_examples(model1, model2, word)
    groups = noise_experiment(model1, model2, word)

    found = [0, 0, 0]
    for position in range(3):
        values, counts = groups[position]
        if closest_word not in values:
            continue
        # Position of the clean translation among the distinct values of this group.
        match_index = np.where(values == closest_word)[0][0]
        hits = counts[match_index]
        if fraction:
            hits /= sum(counts)
        found[position] = hits

    return [(word, closest_word, found)]

In [None]:
evaluate_language(eng_new, ita_new, "elephant", False)

In [None]:
evaluate_language(ita_new, eng_new, "elefante", False)

In [None]:
english_words = [
    'elephant', 'apple', 'cat', 'butterfly', 'strawberry', 'university', 'magnificent', 
    'adventure', 'imagination', 'celebration', 'compassionate', 'extraordinary', 'friendship',
    'relationship', 'understanding', 'delicious', 'adventure', 'enthusiasm',
    'photography', 'restaurant', 'important', 'television', 'dictionary',
    'hospitality', 'independent', 'government', 'scientific', 'architecture',
    'responsibility', 'improvement', 'communication', 'opportunity', 'transportation',
    'environment', 'motivation', 'conversation', 'performance', 'appreciation',
    'cooperation', 'knowledge', 'adventure', 'sophisticated', 'imagination',
    'composition', 'presentation', 'international', 'determination', 'intelligence',
    'philosophy', 'psychology', 'unforgettable', 'recommendation', 'collaboration',
    'contribution', 'productivity', 'concentration', 'development', 'achievement',
    'fundamental', 'achievement', 'satisfaction', 'appreciation', 'celebration',
    'conversation', 'dedication', 'determination', 'excellence', 'fascination',
    'gratitude', 'hospitality', 'improvement', 'independence'
]

# The literal above contains repeated entries (e.g. 'adventure' appears three times).
# Drop the repeats while keeping first-seen order: the downstream `results` dict is keyed
# by (source language, target language, word), so a repeated word would only redo the
# same expensive evaluation and overwrite the same key.
english_words = list(dict.fromkeys(english_words))

print (english_words)

# For each English word, record its best match in the Italian and Portuguese models.
words_per_lang = [
    {
        'english': word,
        'italian': print_examples(eng_new, ita_new, word),
        'portuguese': print_examples(eng_new, pt_new, word)
    }
    for word in english_words
]

In [None]:
results = {}

# Tag each model with the language name used as key in the `words_per_lang` entries.
models = [eng_new, ita_new, pt_new]
for model, language in zip(models, ("english", "italian", "portuguese")):
    model.language = language

# Every ordered (source, target) pair of different models, source-major order.
pairs = [(m1, m2) for m1 in models for m2 in models if m1 != m2]

for word_per_lang in words_per_lang:
    print(word_per_lang)
    for model1, model2 in pairs:
        print(f"evaluating {model1.language} to {model2.language}")
        # The word to perturb is the entry for the source model's language.
        word = word_per_lang[model1.language]
        result = evaluate_language(model1, model2, word)
        print(result)
        # Keyed by (source language, target language, word).
        results[(model1.language, model2.language, word)] = result