In [1]:
from decoder_head.core import *
from fastai2.text.all import *
import re

In [2]:
class PermuteEmbedding(nn.Module):
    """Embedding layer that looks tokens up in a re-normalized weight table.

    The table used by ``forward`` is ``p @ normalized_weight()``. ``p`` is
    created as an identity matrix with gradients disabled, so as long as it is
    left untouched the table equals the normalized weights.
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx):
        super().__init__()

        self.num_embeddings, self.embedding_dim = num_embeddings, embedding_dim
        # Only stored on the instance; forward() does not hand it to F.embedding.
        self.padding_idx = padding_idx

        # Attributes mirrored from nn.Embedding so the layer conforms to its api.
        self.max_norm, self.norm_type = None, 2.0
        self.scale_grad_by_freq = False
        self.sparse = False

        # Trainable (num_embeddings, embedding_dim) table, Kaiming-uniform initialised.
        self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
        nn.init.kaiming_uniform_(self.weight)

        # Square matrix applied on the left of the table; starts as identity, never trained.
        self.p = nn.Parameter(torch.eye(self.num_embeddings), requires_grad=False)

        self.reset_parameters()

    @staticmethod
    def _unit_rows(mat):
        """Divide every row of ``mat`` by its own L2 norm."""
        return mat / mat.norm(dim=1, keepdim=True)

    def normalized_weight(self):
        """Return the weights after: unit-norm rows -> subtract column means -> unit-norm rows."""
        unit = self._unit_rows(self.weight)
        centered = unit - unit.mean(0)
        return self._unit_rows(centered)

    def forward(self, words):
        """Look up ``words`` (index tensor) in ``p @ normalized_weight()``."""
        table = self.p @ self.normalized_weight()
        return F.embedding(words, table)

    def reset_parameters(self):
        """No-op; initialisation already happened in ``__init__``."""
        return None

# Swap the library's PermuteEmbedding for the class defined in this cell, so that
# any code resolving the name on decoder_head.core at call time gets this version.
# NOTE(review): presumably this has to run before the learner is built below — confirm.
import decoder_head.core
decoder_head.core.PermuteEmbedding = PermuteEmbedding

In [3]:
# Build the English and Spanish vocabularies from the pickled counters stored
# next to each tokenized corpus (presumably token frequency counts — confirm).
# NOTE: unpickling can execute arbitrary code; only load counter.pkl files you trust.
vocab_en= make_vocab(pd.read_pickle('data/en-100_tok/counter.pkl'), max_vocab=4000)
vocab_es= make_vocab(pd.read_pickle('data/es-100_tok/counter.pkl'), max_vocab=4000)

In [4]:
# English language-model data: tokenized text files under `path`.
path = 'data/en-100_tok/'
mult = 4  # NOTE(review): not referenced by any visible cell
bs = 80
seq_len = 70

# Items are the text files in the 'train' and 'valid' folders; a file goes to the
# validation split when its parent directory is named 'valid'.
lm = DataBlock(blocks=(TextBlock(vocab=vocab_en, is_lm=True),),
                get_x=read_tokenized_file,
                get_items=partial(get_text_files, folders=['train', 'valid']),
                splitter=FuncSplitter(lambda itm: itm.parent.name == 'valid'))

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [5]:
# Language-model learner on the English data with the pAWD_LSTM architecture.
# pretrained=False: no pretrained weights are fetched here; saved weights are
# loaded explicitly with learn.load in the following cell.
learn = language_model_learner(
    dbunch_lm,
    pAWD_LSTM,
    opt_func=opt,
    pretrained=False,
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

In [6]:
# Restore the saved English language model into the learner built above.
learn.load('emb_norm_rows_columns') # en LM

<fastai2.text.learner.LMLearner at 0x7f69535f6fd0>

In [7]:
len(vocab_en)

4008

In [8]:
def embs_to_txt(vocab, embeddings, fname):
    '''Write embeddings to a text file in word2vec format.

    The first line is "<row count> <dimension>"; every following line is a
    word followed by its space-separated vector components.

    vocab: iterable of str, one word per embedding row.
    embeddings: 2-d array or tensor; must expose `.shape[1]`, and iterating it
        must yield rows whose elements support `.item()`.
    fname: path of the file to create or overwrite.

    Words and rows are paired with `zip`, so when the two lengths differ the
    surplus entries are dropped. The header reports the number of rows that
    were actually written, keeping the file self-consistent.
    '''
    rows = []
    for word, vector in zip(vocab, embeddings):
        # A newline inside a token would split its row across two lines.
        word = word.replace('\n', '')
        values = ' '.join(str(datum.item()) for datum in vector)
        rows.append(f'{word} {values}\n')
    # Explicit utf-8: the platform default encoding may be unable to represent every token.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(f'{len(rows)} {embeddings.shape[1]}\n')
        f.writelines(rows)

In [9]:
# Export the English model's normalized encoder embeddings, one row per vocab_en entry.
embs_to_txt(vocab_en, learn.model[0].encoder.normalized_weight(), 'data/en_norm_embs.txt')

In [10]:
# Spanish language-model data: same setup as the English cell, but with the
# Spanish corpus and vocab_es. Rebinds path / lm / dbunch_lm.
path = 'data/es-100_tok/'
mult = 4  # NOTE(review): not referenced by any visible cell
bs = 80
seq_len = 70

# Items are the text files in the 'train' and 'valid' folders; a file goes to the
# validation split when its parent directory is named 'valid'.
lm = DataBlock(blocks=(TextBlock(vocab=vocab_es, is_lm=True),),
                get_x=read_tokenized_file,
                get_items=partial(get_text_files, folders=['train', 'valid']),
                splitter=FuncSplitter(lambda itm: itm.parent.name == 'valid'))

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [11]:
# Language-model learner on the Spanish data; rebinds `learn`, so the English
# learner is no longer reachable under that name after this cell.
# pretrained=False: saved weights are loaded explicitly in the following cell.
learn = language_model_learner(
    dbunch_lm,
    pAWD_LSTM,
    opt_func=opt,
    pretrained=False,
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

In [12]:
# Restore the saved Spanish language model into the learner built above.
learn.load('pLSTM_es')

<fastai2.text.learner.LMLearner at 0x7f6952844490>

In [13]:
# Export the Spanish model's normalized encoder embeddings. The learner in scope was
# built on vocab_es, so the rows must be labelled with vocab_es (not vocab_en).
embs_to_txt(vocab_es, learn.model[0].encoder.normalized_weight(), 'data/es_norm_embs.txt')

In [14]:
# python map_embeddings.py --cuda --unsupervised ~/workspace/decoder_head/data/en_norm_embs.txt ~/workspace/decoder_head/data/es_norm_embs.txt ~/workspace/decoder_head/data/en_src.txt ~/workspace/decoder_head/data/es_trg.txt

In [15]:
# Load the English -> Spanish dictionary: one whitespace-separated "source target"
# pair per line. The file is read as utf-8 explicitly so that non-ASCII words do not
# depend on the platform's default encoding.
with open('data/en-es.txt', encoding='utf-8') as f:
    en_es = [l.strip() for l in f]

# English word -> list of every Spanish translation listed for it.
en_es_dict = defaultdict(list)

for l in en_es:
    if not l:
        # A blank line (e.g. a trailing newline at the end of the file) has no pair to unpack.
        continue
    source, target = l.split()
    en_es_dict[source].append(target)

In [16]:
len(en_es_dict)

93084

In [17]:
# Keep only the dictionary entries whose English source word is in vocab_en.
# A set is used for the membership test. Note that this rebinds en_es_dict (now a
# plain dict), so re-running the cell operates on the already-filtered mapping.
vocab_en_set = set(vocab_en)
en_es_dict = {k: v for k, v in en_es_dict.items() if k in vocab_en_set}

In [18]:
len(en_es_dict)

3475

In [19]:
# Restrict every entry to the translations present in vocab_es and drop the
# entries left with none. Values become sets after this cell.
vocab_es_set = set(vocab_es)
en_es_dict = {
    source: in_vocab
    for source, in_vocab in (
        (k, vocab_es_set.intersection(set(v))) for k, v in en_es_dict.items()
    )
    if in_vocab
}

In [20]:
len(en_es_dict)

2451

In [22]:
import annoy

In [23]:
# Read the two files named as outputs in the map_embeddings.py command above.
# Opened as utf-8 explicitly so decoding does not depend on the platform default
# (assumes the mapping script wrote utf-8 — confirm against how it was invoked).
with open('data/en_src.txt', encoding='utf-8') as f:
    en_src = f.readlines()

with open('data/es_trg.txt', encoding='utf-8') as f:
    es_trg = f.readlines()

In [32]:
# Turn the text lines into float matrices: the first line of each file is skipped,
# and on every other line the first whitespace-separated field (the word) is
# dropped and the remaining fields are parsed as floats.
en_embs = np.array([[float(s) for s in l.split()[1:]] for l in en_src[1:]])

es_embs = np.array([[float(s) for s in l.split()[1:]] for l in es_trg[1:]])

In [47]:
# Scale every row of both matrices to unit L2 norm.
en_embs_norm = en_embs / np.linalg.norm(en_embs, axis=1, keepdims=True)
es_embs_norm = es_embs / np.linalg.norm(es_embs, axis=1, keepdims=True)

In [48]:
from annoy import AnnoyIndex

In [51]:
# Nearest-neighbour index over the normalized English vectors; item id == row index.
# The vector length is taken from the data instead of being hard-coded to 100, so
# the cell keeps working when the embedding size changes.
t = AnnoyIndex(en_embs_norm.shape[1], 'euclidean')

for i, vector in enumerate(en_embs_norm):
    t.add_item(i, vector)

t.build(10)  # 10 trees

True

[5, 67, 771, 54, 191, 3578, 511, 76, 1833, 2105]

In [72]:
def top_n_translation_acc(top_n=1):
    '''Fraction of en_es_dict entries that are retrieved within the top_n neighbours.

    An entry (English word -> Spanish words) counts as a hit when, for at least
    one of its Spanish words, the English word's index is among the `top_n`
    items returned by the index `t` for that Spanish word's vector.

    Reads the notebook globals en_es_dict, vocab_en, vocab_es, es_embs_norm and t.
    '''
    def is_hit(en_word, es_words):
        idx_en = vocab_en.index(en_word)
        # any() stops at the first Spanish word that retrieves the English one.
        return any(
            idx_en in t.get_nns_by_vector(es_embs_norm[vocab_es.index(es_word)], top_n)
            for es_word in es_words
        )

    return sum(is_hit(k, v) for k, v in en_es_dict.items())/len(en_es_dict)

In [67]:
top_n_translation_acc(1)

0.3908608731130151

In [68]:
top_n_translation_acc(3)

0.5091799265605875

In [69]:
top_n_translation_acc(5)

0.551203590371277

In [64]:
top_n_translation_acc(10)

0.6597307221542228

In [70]:
def top_n_translation_acc(top_n=1):
    '''Printing variant of the top_n retrieval accuracy over en_es_dict.

    Same computation as the earlier definition of this name (which it replaces
    once this cell runs), and additionally prints each English word together
    with the first of its Spanish words that retrieved it.

    Reads the notebook globals en_es_dict, vocab_en, vocab_es, es_embs_norm and t.
    '''
    no_match = object()
    hits = 0
    for en_word, es_words in en_es_dict.items():
        idx_en = vocab_en.index(en_word)
        # First Spanish word whose top_n neighbours contain the English word, if any.
        matched = next(
            (
                es_word for es_word in es_words
                if idx_en in t.get_nns_by_vector(es_embs_norm[vocab_es.index(es_word)], top_n)
            ),
            no_match,
        )
        if matched is not no_match:
            hits += 1
            print(en_word, matched)
    return hits/len(en_es_dict)