# Align vector spaces to English (Multilingual word vectors)!

In [1]:
import numpy as np
from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """
    Scale `a` so that every slice taken along `axis` has unit `order`-norm.

    Slices whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than turned into NaNs.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # guard against division by zero for all-zero slices
    zero_norm = (norms == 0)
    norms[zero_norm] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build paired training matrices from a bilingual word list.

    source_dictionary / target_dictionary are the FastVector objects of the
    source and target languages (anything supporting `in` and `[]` works).
    bilingual_dictionary is a list of translation pairs
    [(source_word, target_word), ...].

    Pairs where either word is missing from its dictionary are skipped.
    Returns (source_matrix, target_matrix) as numpy arrays whose i-th rows
    are the vectors of the i-th retained pair.
    """
    # keep only the pairs for which both sides have a vector
    paired_vectors = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]

    source_rows = [src_vec for src_vec, _ in paired_vectors]
    target_rows = [tgt_vec for _, tgt_vec in paired_vectors]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns the source language to the target.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) whose rows are the paired word
    vectors taken from the bilingual dictionary. When normalize_vectors is
    true, the rows of both matrices are first scaled with `normalized`.

    Returns the product of the two orthogonal SVD factors of
    source_matrix^T . target_matrix.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the cross-product between the two embedding spaces
    cross_product = source_matrix.T @ target_matrix
    left_factor, _singular_values, right_factor_t = np.linalg.svd(cross_product)

    # dropping the singular values leaves an orthogonal transformation
    return left_factor @ right_factor_t

In [4]:
class Downloder():
    """Thin wrapper around the ``wget`` command-line tool for fetching files."""

    def download_manager(self, url, destination='', try_number="10", time_out="60"):
        """
        Download ``url`` into the directory ``destination`` using wget.

        try_number and time_out are forwarded to wget's ``-t`` (retries) and
        ``-T`` (timeout) options.

        Returns True when wget exits with status 0, and False otherwise
        (including when wget could not be launched at all).
        """
        return self._wget_dl(url, destination, try_number, time_out) == 0

    def _wget_dl(self, url, destination, try_number="10", time_out="60"):
        """
        Run ``wget -c -P destination -t try_number -T time_out url``.

        Returns wget's exit status (0 means a successful download). If the
        process cannot be started, the error is printed and 1 is returned so
        callers always receive an integer status.
        """
        import subprocess
        # subprocess requires string arguments; str() lets callers pass ints too
        command = ["wget", "-c", "-P", destination,
                   "-t", str(try_number), "-T", str(time_out), url]
        try:
            return subprocess.call(command)
        except Exception as e:
            print(e)
            # non-zero status: report the failure instead of hitting an unbound variable
            return 1

In [6]:
import os

# Language codes whose wiki fastText vectors are loaded below.
languages = [
    "ar", "da", "de", "el", "en", "es", "fa", "hu",
    "fr", "it", "ja", "lv", "pt", "ru", "tr", "uk",
]
print(len(languages))

# Maps language code -> FastVector loaded from that language's .vec file.
languages_dictionary = {}

path_prefix = "/mnt/hdd_disk/dan/datasets/embeddings_fastText/"
for language in languages:
    folder_path = path_prefix + language + "/"
    file_path = folder_path + "wiki." + language + ".vec"
    if not os.path.isfile(file_path):
        # create the per-language folder only when it is missing
        os.makedirs(folder_path, exist_ok=True)
        downloader = Downloder()
        url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki." + language + ".vec"
        # retries / timeout passed explicitly: 10 tries, 60 second timeout
        status = downloader._wget_dl(url, folder_path, "10", "60")
        if status != 0:
            # fail here rather than letting FastVector read a missing or partial file
            raise RuntimeError("download failed (wget status %s): %s" % (status, url))
    languages_dictionary[language] = FastVector(vector_file=file_path)

print(len(languages_dictionary))

16
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ar/wiki.ar.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/da/wiki.da.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/de/wiki.de.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/el/wiki.el.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/en/wiki.en.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/es/wiki.es.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fa/wiki.fa.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/hu/wiki.hu.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fr/wiki.fr.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/it/wiki.it.vec
reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ja/wiki.ja.vec
reading word vectors from /mn

In [7]:
# Look up the vectors for "گربه" (fa) and "кіт" (uk) and print their cosine
# similarity, using the vectors as loaded (before any transform is applied).
fa_vector = languages_dictionary["fa"]["گربه"]
uk_vector = languages_dictionary["uk"]["кіт"]
print(FastVector.cosine_similarity(fa_vector, uk_vector))

-0.014731081984914952


"گربه" and "кіт" both mean "cat", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, ...

In [8]:
# For every loaded language, apply the transform stored in
# 'alignment_matrices/<language>.txt' to that language's vectors.
# NOTE(review): apply_transform presumably modifies the FastVector in place, so
# re-running this cell would apply the transform a second time — confirm.
for language, language_dictionary in languages_dictionary.items():
    language_dictionary.apply_transform('alignment_matrices/'+language+'.txt')

In [9]:
# Repeat the same "گربه" (fa) vs "кіт" (uk) lookup now that the transforms have
# been applied, and print the new cosine similarity for comparison.
fa_vector = languages_dictionary["fa"]["گربه"]
uk_vector = languages_dictionary["uk"]["кіт"]

print(FastVector.cosine_similarity(fa_vector, uk_vector))

0.500099345287656


In [10]:
import os

# Export each language's vectors to '<path_prefix><language>/multiling_<language>.vec'.
# Languages whose target file already exists are skipped, so existing exports
# are never overwritten.
for language, language_dictionary in languages_dictionary.items():
    target_path = path_prefix + language + '/multiling_' + language + '.vec'
    if not os.path.isfile(target_path):
        language_dictionary.export(target_path)