In [None]:
# Shell command, not Python: run it in a terminal, or prefix it with `!` to launch it from a notebook cell.
# NOTE(review): presumably the training run whose mapping is loaded from dumps/ further down - confirm.
python unsupervised.py --src_lang en --tgt_lang fr --src_emb ../wiki.en.vec --tgt_emb ../wiki.fr.vec --n_refinement 5 --epoch_size 100000 --max_vocab 30000 --dis_most_frequent 30000 --dico_eval ../data/en-fr.txt --batch_size 32 --dico_method nn --dico_max_size 1000

In [None]:
# Shell command, not Python: run it in a terminal, or prefix it with `!` to launch it from a notebook cell.
# NOTE(review): embedding/dictionary paths here are relative to the notebook directory, unlike the command above - confirm the intended working directory.
python GAN.py --src_emb wiki.en.vec --tgt_emb wiki.fr.vec --batch_size 32 --epoch_size 50000 --n_epochs 20 --dico_eval data/en-fr.txt

In [1]:
import io
import numpy as np
from numba import jit,cuda

def load_vec(emb_path, nmax=50000):
    """Read word vectors from a whitespace-separated text file.

    The first line of the file is skipped (presumably a header line - confirm
    against the embedding format in use). Every following line is split into a
    word and its space-separated vector components.

    Args:
        emb_path: path of the text file to read (decoded as UTF-8, undecodable
            bytes are dropped).
        nmax: reading stops as soon as this many words have been collected.

    Returns:
        A tuple ``(embeddings, id2word, word2id)`` where ``embeddings`` stacks
        the vectors row by row in file order, ``word2id`` maps each word to its
        row index and ``id2word`` is the inverse mapping.

    Raises:
        AssertionError: if the same word appears twice.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, components = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(components, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            # Row index == number of words seen so far.
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    embeddings = np.vstack(rows)
    id2word = dict((index, token) for token, index in word2id.items())
    return embeddings, id2word, word2id

In [2]:
# Text embedding files for the source and target languages.
src_path = 'wiki.en.vec'
tgt_path = 'wiki.fr.vec'

# Maximum number of word embeddings to load from each file.
nmax = 100000

# Each call yields (embedding matrix, id -> word, word -> id).
src_embeddings, src_id2word, src_word2id = load_vec(emb_path=src_path, nmax=nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(emb_path=tgt_path, nmax=nmax)






In [3]:
def get_nn_with_emb(word_emb ,tgt_emb, tgt_id2word, K=5):
    """Find the target words closest (by cosine similarity) to an embedding.

    Args:
        word_emb: query vector.
        tgt_emb: 2-D array whose rows are the candidate target vectors.
        tgt_id2word: mapping from a row index of ``tgt_emb`` to its word.
        K: number of best-scoring rows to keep.

    Returns:
        ``(best_score, best_word, scores, k_best)``: the similarity and word of
        the closest row, the similarity of every row, and the indices of the
        ``K`` most similar rows ordered from most to least similar.
    """
    # Scale every target row and the query to unit length so that the dot
    # product below is the cosine similarity.
    tgt_unit = tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]
    query_unit = word_emb / np.linalg.norm(word_emb)
    scores = tgt_unit.dot(query_unit)

    # argsort is ascending: take the last K indices and reverse them.
    ranking = scores.argsort()
    k_best = ranking[-K:][::-1]

    nearest = k_best[0]
    return scores[nearest], tgt_id2word[nearest], scores, k_best

In [4]:
# Parse the bilingual dictionary file. On every line the first
# whitespace-separated token is treated as the English word and the second as
# the French word.
fr_list=[]         # French words, one per parsed line (duplicates kept)
eng_list=[]        # English words, one per parsed line (duplicates kept)
eng_fr_dict={}     # English word -> list of French words seen with it
fr_eng_dict={}     # French word  -> list of English words seen with it

# The context manager guarantees the file is closed, even if parsing fails.
with open("data/en-fr.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or incomplete line (e.g. a trailing newline): nothing to pair.
            continue
        eng_word, fr_word = fields[0], fields[1]

        eng_list.append(eng_word)
        fr_list.append(fr_word)

        eng_fr_dict.setdefault(eng_word, []).append(fr_word)
        fr_eng_dict.setdefault(fr_word, []).append(eng_word)

In [5]:
def get_train_test(eng_fr_dict,fr_eng_dict,src_word2id,tgt_word2id):
    """Drop dictionary entries that cannot be looked up in the vocabularies.

    An entry is removed when its key is missing from the vocabulary of its own
    language, or when the FIRST translation in its list is missing from the
    vocabulary of the other language (later translations are not checked).

    Both dictionaries are modified in place and the same objects are returned.
    Despite the function's name, no train/test split is performed here.

    Args:
        eng_fr_dict: English word -> list of French translations.
        fr_eng_dict: French word -> list of English translations.
        src_word2id: vocabulary used for the English side.
        tgt_word2id: vocabulary used for the French side.

    Returns:
        ``(eng_fr_dict, fr_eng_dict)`` after pruning.
    """
    def _prune(pairs, own_vocab, other_vocab):
        # Iterate over a snapshot of the keys because entries are deleted
        # while looping.
        for entry in list(pairs):
            if entry not in own_vocab or pairs[entry][0] not in other_vocab:
                del pairs[entry]

    _prune(eng_fr_dict, src_word2id, tgt_word2id)
    _prune(fr_eng_dict, tgt_word2id, src_word2id)

    return eng_fr_dict, fr_eng_dict

# Prune both dictionaries against the loaded vocabularies. get_train_test
# deletes entries from the dicts it receives and returns those same objects,
# so this rebinds the names to the already-pruned dictionaries.
eng_fr_dict,fr_eng_dict=get_train_test(eng_fr_dict,fr_eng_dict,src_word2id,tgt_word2id)

In [6]:
# Scale every embedding row to unit L2 norm. Each matrix is normalised on its
# own, so the source and target vocabularies may have different sizes (a single
# shared row loop would either run past the end of the shorter matrix or leave
# rows of the longer one untouched). The division allocates new arrays, so the
# original embeddings are left unmodified.
src_embeddings_normalized = src_embeddings / np.linalg.norm(src_embeddings, axis=1, keepdims=True)
tgt_embeddings_normalized = tgt_embeddings / np.linalg.norm(tgt_embeddings, axis=1, keepdims=True)
    
    

In [7]:
import torch


# Load the saved mapping that is applied to source embeddings further down
# (used as `W @ vector`). The recorded output shows it deserialises to a
# numpy.ndarray rather than a torch tensor.
# NOTE(review): torch.load unpickles the file, so only load dumps you produced
# yourself. Recent PyTorch releases reportedly default to weights_only=True,
# which may refuse a pickled numpy array - confirm against the installed version.
W=torch.load("dumps/en_fr_mapping.pth")
print(type(W))

<class 'numpy.ndarray'>


In [8]:
# Sanity check on a single word: show which source word sits at index 2570
# (the recorded output is "cat").
print(src_id2word[2570])

# Map that word's normalised embedding with W and query the target space; the
# cell displays the tuple returned by get_nn_with_emb
# (best score, best word, all scores, indices of the top matches).
get_nn_with_emb(W@src_embeddings_normalized[2570],tgt_emb=tgt_embeddings_normalized,tgt_id2word=tgt_id2word)


cat


(0.5565839694724418,
 'chat',
 array([0.08456691, 0.26493092, 0.11533649, ..., 0.22230542, 0.14808035,
        0.15922943]),
 array([ 3359,  6351, 26555,  2624,  8470], dtype=int64))

In [15]:
# Evaluation set: the first 1000 keys of the pruned English->French dictionary,
# in the dictionary's iteration order.
test = list(eng_fr_dict)[:1000]

In [10]:
#@jit(target_backend="cuda")
def test_results(w,dict,src_embeddings,src_word2id,tgt_embeddings,tgt_id2word,test_list):

    cpt=0
    cpt_5=0
    score_sum=0
    for word in test_list:
        score,trad,scores,k_best=get_nn_with_emb(w@(src_embeddings[src_word2id[word]]),tgt_embeddings,tgt_id2word,K=5)
        #cosine sim @1
        score_word=0
        #precision @1
        
        if trad in dict[word]:
            score_word+=score
            cpt+=1
        
        #precision@5
        for _,idx in enumerate(k_best):
            if tgt_id2word[idx] in dict[word]:
                cpt_5+=1
                if score_word==0:
                    score_word+=scores[idx]
                break
        score_sum+=score_word
    return score_sum/len(test_list),cpt/len(test_list),cpt_5/len(test_list)

# Evaluate the English->French mapping W on the `test` words. The raw
# (un-normalised) embedding matrices are passed here; the cosine computation
# inside test_results divides by the vector norms itself.
cos_sim,p1,p5=test_results(W,eng_fr_dict,src_embeddings,src_word2id,tgt_embeddings,tgt_id2word,test)

In [11]:
# Display (mean cosine similarity, precision@1, precision@5) from test_results.
cos_sim,p1,p5

(0.5040366440999938, 0.659, 0.805)

In [28]:
#benchmark
# NOTE(review): same expression as the previous cell but with a later execution
# count and different recorded numbers - presumably the metrics were recomputed
# with a reference ("benchmark") mapping in a cell that is no longer in the
# notebook. Confirm, and recompute explicitly so the value survives Run All.
cos_sim,p1,p5

(0.49888482082646074, 0.668, 0.813)

In [18]:

# Load the saved mapping used below for the French->English evaluation. The
# recorded output shows it deserialises to a numpy.ndarray.
# NOTE(review): torch.load unpickles the file, so only load dumps you produced
# yourself. Recent PyTorch releases reportedly default to weights_only=True,
# which may refuse a pickled numpy array - confirm against the installed version.
W_fr=torch.load("dumps/fr_en_mapping.pth")
print(type(W_fr))

<class 'numpy.ndarray'>


In [19]:
# Evaluation set for the reverse direction: the first 1000 keys of the pruned
# French->English dictionary, in the dictionary's iteration order.
test_fr = list(fr_eng_dict)[:1000]

In [None]:
# French->English evaluation: the roles are swapped with respect to the
# English->French call, i.e. the French (tgt_*) embeddings and vocabulary are
# passed as the source side and the English (src_*) ones as the target side.
cos_sim_,p1_,p5_=test_results(W_fr,fr_eng_dict,tgt_embeddings,tgt_word2id,src_embeddings,src_id2word,test_fr)
# Display (mean cosine similarity, precision@1, precision@5).
cos_sim_,p1_,p5_