# So, show me how to align two vector spaces for myself!

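No problem. We're going to follow the example given in the README again, and show you how to learn your own transformations. The README aligns the French vector space to the Russian vector space; below, the same procedure is run for every pair of languages found in `vectors/`, and the result is then checked on the Russian–Arabic pair.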

First, let's define a few simple functions...

In [1]:
import numpy as np

from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale a numpy array so every slice along `axis` has unit `order`-norm.

    Slices whose norm is zero are divided by 1, i.e. returned unchanged,
    instead of producing NaNs.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # swap zero norms for 1 so the division below is a no-op for those slices
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def _phrase_vector(dictionary, phrase):
    """Return the vector for a (possibly multi-word) phrase, or None if unusable."""
    try:
        tokens = phrase.lower().split()
    except AttributeError:
        # not a string (e.g. a missing value) -> nothing to look up
        return None
    known = [dictionary[token] for token in tokens if token in dictionary]
    if not known:
        # empty phrase, or none of its tokens is in the dictionary
        return None
    # Divide by the total token count (not only the tokens found), so
    # out-of-vocabulary tokens act as zero vectors.
    vector = np.asarray(sum(known), dtype=float) / len(tokens)
    # an all-zero vector carries no direction and cannot be normalized
    return vector if vector.any() else None

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired training matrices used to learn an alignment.

    Source and target dictionaries are the FastVector objects of
    source/target languages (any object supporting ``word in d`` and
    ``d[word]`` works). bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].

    Every entry is lower-cased and split on whitespace; its vector is the
    sum of the vectors of the tokens found in the dictionary divided by the
    total number of tokens. A pair is skipped when either side is not a
    string, has no token in its dictionary, or yields an all-zero vector.
    The embedding dimension is taken from the dictionaries themselves.

    Returns (source_matrix, target_matrix): two numpy arrays with one row
    per kept pair, row i of one matrix corresponding to row i of the other.
    Both are empty arrays when no pair is kept.
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        source_vector = _phrase_vector(source_dictionary, source)
        target_vector = _phrase_vector(target_dictionary, target)
        # keep the pair only when both sides produced a usable vector
        if source_vector is None or target_vector is None:
            continue
        source_matrix.append(source_vector)
        target_matrix.append(target_vector)

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal matrix that maps source vectors onto target vectors.

    Both inputs are numpy arrays of shape
    (dictionary_length, embedding_dimension); row i of each holds the
    vectors of the i-th word pair of the bilingual dictionary. When
    normalize_vectors is true, both matrices are passed through
    normalized() first.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # SVD of source^T . target; the singular values themselves are not needed
    left, _, right_t = np.linalg.svd(src.T @ tgt)

    # product of the two singular-vector factors: the transformation which
    # aligns the source language to the target
    return left @ right_t

In [2]:
import pandas as pd
# get wikidata data: tab-separated, no header row, first field used as the index
df = pd.read_csv('../../gap/wikidataSixLang.csv.gz', sep='\t', header=None, index_col=0)
# Label the fields; 'wiki' and 'page' are the columns used further down.
# NOTE(review): field 0 is already the index at this point, so the 0 -> 'q'
# entry presumably matches no column -- confirm.
df = df.rename(columns={0:'q',1:'wiki',2:'page'})


In [4]:
import glob,os

def _learn_and_save_alignment(source_dictionary, target_dictionary, bilingual_dictionary, source_code, target_code):
    """
    Learn the transformation that aligns the source language to the target
    language from the given word pairs and write it with np.savetxt to
    my_alingments/apply_in_<source_code>_to_<target_code>.txt.
    """
    # form the training matrices
    source_matrix, target_matrix = make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary)
    # learn the transformation (it is only saved here, not applied)
    transform = learn_transformation(source_matrix, target_matrix)
    with open('my_alingments/apply_in_%s_to_%s.txt' % (source_code, target_code),'w') as f:
        np.savetxt(f, transform)

# Ascending by file size: pop() below always takes the largest remaining file,
# and the inner loop ends on the next-largest one, which is exactly the file
# popped next -- so its already-loaded dictionary can be reused.
vectors = sorted(glob.glob('vectors/wiki.*.vec'), key=os.path.getsize) #sorted by size to load the largest files just once
if 'vectors/wiki.fa.vec' in vectors:
    vectors.remove('vectors/wiki.fa.vec') #not considering farsi for this experiment
# The directory name (spelling included) is what the later cells read from.
os.makedirs('my_alingments', exist_ok=True)
lang2 = ''
while vectors:
    lang1 = vectors.pop()
    lang1_code = lang1.split('.')[1]
    print(lang1_code)
    if lang1 == lang2:
        # last dictionary loaded by the previous inner loop: reuse it
        lang1_dictionary = lang2_dictionary
    else:
        lang1_dictionary = FastVector(vector_file=lang1)
    # vocabulary of lang1 does not change inside the inner loop
    lang1_words = set(lang1_dictionary.word2id.keys())
    for lang2 in vectors:
        lang2_dictionary = FastVector(vector_file=lang2)
        lang2_code = lang2.split('.')[1]
        print('==',lang2_code)
        # rows of the two wikis that share the same index value
        pairs = df[df.wiki == lang1_code].join(df[df.wiki == lang2_code],rsuffix='_lang2',how='inner')
        bilingual_dictionary = list(zip(pairs['page'],pairs['page_lang2']))
        #common words: identical strings present in both vocabularies are paired with themselves
        lang2_words = set(lang2_dictionary.word2id.keys())
        overlap = list(lang1_words & lang2_words)
        bilingual_dictionary.extend([(entry, entry) for entry in overlap])
        # lang1 -> lang2
        _learn_and_save_alignment(lang1_dictionary, lang2_dictionary, bilingual_dictionary, lang1_code, lang2_code)
        # lang2 -> lang1, using the same pairs reversed
        bilingual_dictionary = [(y,x) for x,y in bilingual_dictionary] #reverse pairs
        _learn_and_save_alignment(lang2_dictionary, lang1_dictionary, bilingual_dictionary, lang2_code, lang1_code)
        

en
reading word vectors from vectors/wiki.en.vec
reading word vectors from vectors/wiki.ja.vec
== ja
reading word vectors from vectors/wiki.ar.vec
== ar
reading word vectors from vectors/wiki.es.vec
== es
reading word vectors from vectors/wiki.fr.vec
== fr
reading word vectors from vectors/wiki.ru.vec
== ru
ru
reading word vectors from vectors/wiki.ja.vec
== ja
reading word vectors from vectors/wiki.ar.vec
== ar
reading word vectors from vectors/wiki.es.vec
== es
reading word vectors from vectors/wiki.fr.vec
== fr
fr
reading word vectors from vectors/wiki.ja.vec
== ja
reading word vectors from vectors/wiki.ar.vec
== ar
reading word vectors from vectors/wiki.es.vec
== es
es
reading word vectors from vectors/wiki.ja.vec
== ja
reading word vectors from vectors/wiki.ar.vec
== ar
ar
reading word vectors from vectors/wiki.ja.vec
== ja
ja


# Test

In [16]:
# Load the Russian word vectors from disk, with no alignment applied yet.
lang1_dictionary = FastVector(vector_file='vectors/wiki.ru.vec')

reading word vectors from vectors/wiki.ru.vec


In [17]:
# Load the Arabic word vectors from disk, with no alignment applied yet.
lang2_dictionary = FastVector(vector_file='vectors/wiki.ar.vec')

reading word vectors from vectors/wiki.ar.vec


In [19]:
# Baseline: cosine similarity of the Russian and Arabic words for "history"
# taken straight from the two dictionaries as loaded.
FastVector.cosine_similarity(lang1_dictionary['История'.lower()],lang2_dictionary['التاريخ'.lower()])

0.006437103624660425

In [20]:
## test the pre-installed alignments
# Both dictionaries are transformed with the matrices shipped in
# alignment_matrices/; the same two objects are queried again afterwards.
lang1_dictionary.apply_transform('alignment_matrices/ru.txt')
lang2_dictionary.apply_transform('alignment_matrices/ar.txt')

In [22]:
# Same word pair as the baseline, queried from the same two dictionary objects
# (run after the pre-installed alignment matrices have been applied to them).
FastVector.cosine_similarity(lang1_dictionary['История'.lower()],lang2_dictionary['التاريخ'.lower()])

0.4295707893064699

In [23]:
# Reload the Russian vectors fresh (replacing the previously bound object) and
# load a second copy of the Arabic vectors, transformed with the matrix saved
# as my_alingments/apply_in_ar_to_ru.txt.
lang1_dictionary = FastVector(vector_file='vectors/wiki.ru.vec')
lang2_myaligment = FastVector(vector_file='vectors/wiki.ar.vec')
lang2_myaligment.apply_transform('my_alingments/apply_in_ar_to_ru.txt')

reading word vectors from vectors/wiki.ru.vec
reading word vectors from vectors/wiki.ar.vec


In [24]:
# Same word pair again: freshly loaded Russian vectors against the Arabic
# vectors transformed with the alignment loaded from my_alingments/.
FastVector.cosine_similarity(lang1_dictionary['История'.lower()],lang2_myaligment['التاريخ'.lower()])

0.5110379249206718

In [45]:
# Compare one word pair under two alignments: lang2_dictionary on the first
# line, lang2_myaligment on the second.
# NOTE(review): 'toponomy' / 'toponimia' look like English / Spanish words, so
# the dictionaries were presumably rebound to another language pair before
# this cell ran -- TODO confirm which vectors and alignments were loaded.
print(FastVector.cosine_similarity(lang1_dictionary['toponomy'],lang2_dictionary['toponimia']))
print(FastVector.cosine_similarity(lang1_dictionary['toponomy'],lang2_myaligment['toponimia']))


0.5952403995146678
0.5887240865621274


In [46]:
# Compare one word pair under two alignments: lang2_dictionary on the first
# line, lang2_myaligment on the second.
# NOTE(review): 'history' / 'historia' look like English / Spanish words, so
# the dictionaries were presumably rebound to another language pair before
# this cell ran -- TODO confirm which vectors and alignments were loaded.
print(FastVector.cosine_similarity(lang1_dictionary['history'],lang2_dictionary['historia']))
print(FastVector.cosine_similarity(lang1_dictionary['history'],lang2_myaligment['historia']))

0.7600301173896179
0.7209900810626596


In [14]:
# Shell escape: copy the local my_alingments directory into HDFS.
!hadoop fs -put my_alingments


Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
