# Compute alignments using Wikidata

This notebook creates alignment matrices that can be applied to align two different word embeddings.
The matrices are created using [this approach](https://github.com/Babylonpartners/fastText_multilingual/blob/master/align_your_own.ipynb), but instead of using bilingual dictionaries we use the Wikidata labels.
Here we use a parquet dump clo

In [None]:
import numpy as np
import pandas as pd
import os 
from pyspark.sql.functions import regexp_replace


from fastText_multilingual.fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale the vectors of ``a`` taken along ``axis`` to unit ``order``-norm.

    Vectors whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing a division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    # Substitute 1 for zero norms so the division below is a no-op for them.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def _mean_label_vector(label, dictionary):
    """Return one embedding for a (possibly multi-word) label, or None.

    The label is lower-cased and split on whitespace. The vectors of the
    tokens present in ``dictionary`` are summed and divided by the total
    number of tokens (including tokens that are not in the dictionary).

    None is returned when the label is not a string, contains no tokens,
    has no token in the dictionary, or yields an all-zero vector.
    """
    if not isinstance(label, str):
        # e.g. a missing title coming through as None/NaN
        return None
    tokens = label.lower().split()
    found = [dictionary[token] for token in tokens if token in dictionary]
    if not found:
        return None
    vector = np.asarray(sum(found), dtype=float) / len(tokens)
    # any(): reject only the zero vector, not every vector that happens to
    # contain a single zero component.
    return vector if vector.any() else None


def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired training matrices for learning an alignment.

    Source and target dictionaries are the FastVector objects of
    source/target languages (any object supporting ``word in d`` and
    ``d[word]`` works). bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].

    Each side of a pair may contain several words; it is represented by the
    combined vector of its known words (see ``_mean_label_vector``). Pairs
    for which either side cannot be represented are skipped.

    Returns two numpy arrays whose rows correspond to each other. The
    embedding dimension is taken from the dictionaries' vectors rather than
    being fixed. Both arrays are empty when no pair is usable.
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        source_vector = _mean_label_vector(source, source_dictionary)
        if source_vector is None:
            continue
        target_vector = _mean_label_vector(target, target_dictionary)
        if target_vector is None:
            continue
        source_matrix.append(source_vector)
        target_matrix.append(target_vector)

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns source vectors to target vectors.

    Both inputs are numpy arrays of shape
    (dictionary_length, embedding_dimension) whose rows are paired word
    vectors from the bilingual dictionary. When ``normalize_vectors`` is
    true the rows are normalized first.

    Returns the product of the two orthogonal factors of the SVD of
    ``source^T . target``.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # np.linalg.svd returns the right factor already transposed, so the
    # transformation is simply the product of the two outer factors.
    left, _singular_values, right_h = np.linalg.svd(src.T @ tgt)
    return left @ right_h


## Prepare folder
# Directory that receives the alignment matrices written further down.
# NOTE: the spelling of the name is kept as-is because it is the on-disk path
# that existing outputs and consumers may already rely on.
outputFolder = 'my_alingments'
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(outputFolder, exist_ok=True)


In [1]:


##### The commented-out parquet loading below was replaced by the spark.sql
##### query on wmf.wikidata_item_page_link; it is kept for reference only.
# get wikidata data


# NOTE(review): this older version kept only page_namespace == 0; confirm
# whether the replacement query is meant to drop that filter.
#df = spark.read.parquet('/user/joal/wmf/data/wmf/wikidata/item_page_link/20190204')
#df = df[df['page_namespace'] == 0]
#df = df.withColumn('page', regexp_replace('page_title', '_', ' '))
#df = df.select('wiki_db','item_id','page')

# Obtain the Spark session (session type 'yarn-regular') through wmfdata;
# the following cells run their queries through `spark`.
import wmfdata
spark = wmfdata.spark.get_session(type='yarn-regular')

You are using wmfdata v1.2, but v1.3 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [4]:
# List the snapshot partitions of wmf.wikidata_item_page_link so that the most
# recent one can be chosen for the query.
spark.sql('SHOW PARTITIONS wmf.wikidata_item_page_link').show()

+-------------------+
|          partition|
+-------------------+
|snapshot=2021-11-15|
|snapshot=2021-11-22|
|snapshot=2021-11-29|
|snapshot=2021-12-06|
|snapshot=2021-12-13|
|snapshot=2021-12-20|
+-------------------+



In [7]:
from pyspark.sql.functions import regexp_replace

# Snapshot to read (hard-coded; update it after checking SHOW PARTITIONS).
last_partition = '2021-12-20'

item_page_query = '''SELECT wiki_db,page_title,item_id 
                FROM wmf.wikidata_item_page_link
                WHERE snapshot = "{0}"
'''.format(last_partition)

# Add a `page` column holding the title with underscores replaced by spaces,
# then keep only the columns used to build the translation pairs.
df = (
    spark.sql(item_page_query)
    .withColumn('page', regexp_replace('page_title', '_', ' '))
    .select('wiki_db', 'item_id', 'page')
)


In [3]:
import glob
import os


def _language_code(vector_path):
    """Return the language code from a ``wiki.<code>.vec`` file path."""
    # Split only the file name, so a dot in the directory part
    # (e.g. './vectors/wiki.en.vec') cannot shift the position of the code.
    return os.path.basename(vector_path).split('.')[1]


def _write_alignment(source_dictionary, target_dictionary, pairs, source_code, target_code):
    """Learn the source -> target alignment and save it as a text matrix.

    ``pairs`` is a list of (source_title, target_title) tuples. The matrix is
    written to ``<outputFolder>/apply_in_<source_code>_to_<target_code>.txt``.
    Returns True when a file was written and False when the pair was skipped
    because no training pair could be represented in both embeddings.
    """
    source_matrix, target_matrix = make_training_matrices(
        source_dictionary, target_dictionary, pairs)
    if len(source_matrix) == 0:
        # Without this guard the SVD would raise and abort the whole run.
        print('no usable training pairs for %s -> %s, skipping' % (source_code, target_code))
        return False
    transform = learn_transformation(source_matrix, target_matrix)
    with open('%s/apply_in_%s_to_%s.txt' % (outputFolder, source_code, target_code), 'w') as f:
        np.savetxt(f, transform)
    return True


# Sorted by size so that the largest files are popped first and loaded just once.
vectors = sorted(glob.glob('vectors/wiki.*.vec'), key=os.path.getsize)
lang2 = ''
while vectors:
    lang1 = vectors.pop()
    lang1_code = _language_code(lang1)
    print(lang1_code)
    if lang1 == lang2:
        # The previous inner loop ended on this very file (the largest one
        # remaining), so its already-loaded vectors can be reused.
        lang1_dictionary = lang2_dictionary
    else:
        lang1_dictionary = FastVector(vector_file=lang1)
    # The lang1 side of the join does not depend on lang2.
    lang1_pages = df[df.wiki_db == '%swiki' % lang1_code]
    for lang2 in vectors:
        lang2_dictionary = FastVector(vector_file=lang2)
        lang2_code = _language_code(lang2)
        print('==', lang2_code)
        # Pair the page titles of both wikis that are linked to the same item.
        lang2_pages = (
            df[df.wiki_db == '%swiki' % lang2_code]
            .withColumnRenamed("page", "page2")
            .withColumnRenamed('wiki_db', 'wiki_db2')
        )
        pairs = lang1_pages.join(lang2_pages, on='item_id').toPandas()
        bilingual_dictionary = list(zip(pairs['page'], pairs['page2']))
        # (Disabled idea from the original: also add words shared by both
        # vocabularies as identity pairs.)
        # lang1 -> lang2
        _write_alignment(lang1_dictionary, lang2_dictionary, bilingual_dictionary,
                         lang1_code, lang2_code)
        # lang2 -> lang1, using the same pairs reversed
        reversed_dictionary = [(y, x) for x, y in bilingual_dictionary]
        _write_alignment(lang2_dictionary, lang1_dictionary, reversed_dictionary,
                         lang2_code, lang1_code)
        

en
reading word vectors from vectors/wiki.en.vec
reading word vectors from vectors/wiki.ta.vec
== ta
reading word vectors from vectors/wiki.vi.vec
== vi
reading word vectors from vectors/wiki.id.vec
== id
reading word vectors from vectors/wiki.zh.vec
== zh
reading word vectors from vectors/wiki.fa.vec
== fa
reading word vectors from vectors/wiki.he.vec
== he
reading word vectors from vectors/wiki.ca.vec
== ca
reading word vectors from vectors/wiki.pt.vec
== pt
reading word vectors from vectors/wiki.ar.vec
== ar
reading word vectors from vectors/wiki.it.vec
== it
reading word vectors from vectors/wiki.uk.vec
== uk
reading word vectors from vectors/wiki.es.vec
== es
reading word vectors from vectors/wiki.fr.vec
== fr
reading word vectors from vectors/wiki.ru.vec
== ru
ru
reading word vectors from vectors/wiki.ta.vec
== ta
reading word vectors from vectors/wiki.vi.vec
== vi
reading word vectors from vectors/wiki.id.vec
== id
reading word vectors from vectors/wiki.zh.vec
== zh
reading word