In [1]:
from decoder_head.core import *
from fastai2.text.all import *

In [2]:
class PermuteEmbedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, padding_idx):
        super().__init__()
        
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        
        # to conform to nn.Embedding api
        self.max_norm=None
        self.norm_type=2.0
        self.scale_grad_by_freq=False
        self.sparse = False

        self.weight = nn.Parameter( torch.Tensor(num_embeddings, embedding_dim) )
        nn.init.kaiming_uniform_(self.weight)
        self.p = nn.Parameter( torch.eye(self.num_embeddings) )
        self.p.requires_grad = False
        
        self.reset_parameters()
        
    def forward(self, words):
        return F.embedding(words, self.p @ self.normalized_weight())
    
    def normalized_weight(self):
        w1 = self.weight / self.weight.norm(dim=1).unsqueeze(1)
        w2 = w1 - w1.mean(0)
        w3 = w2 / w2.norm(dim=1).unsqueeze(1)
        return w3
        
    def reset_parameters(self): pass

import decoder_head.core
decoder_head.core.PermuteEmbedding = PermuteEmbedding

In [3]:
vocab_en= make_vocab(pd.read_pickle('data/en-100_tok/counter.pkl'), max_vocab=4000)
vocab_es= make_vocab(pd.read_pickle('data/es-100_tok/counter.pkl'), max_vocab=4000)

In [4]:
path = 'data/en-100_tok/'
mult = 4
bs = 80
seq_len = 70

lm = DataBlock(blocks=(TextBlock(vocab=vocab_en, is_lm=True),),
                get_x=read_tokenized_file,
                get_items=partial(get_text_files, folders=['train', 'valid']),
                splitter=FuncSplitter(lambda itm: itm.parent.name == 'valid'))

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [5]:
learn = language_model_learner(
    dbunch_lm,
    pAWD_LSTM,
    opt_func=opt,
    pretrained=False,
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

In [6]:
learn.load('normalized_mapped_en') # en LM

<fastai2.text.learner.LMLearner at 0x7fc44d5fa2d0>

In [7]:
with open('data/en-es.txt') as f:
    en_es = f.readlines()
    
en_es = [l.strip() for l in en_es]

en_es_dict = defaultdict(list)

for l in en_es:
    source, target = l.split()
    en_es_dict[source].append(target)

In [8]:
len(en_es_dict)

93084

Check that we have the source word in the model trained on English

In [9]:
vocab_en_set = set(vocab_en)
en_es_dict = {k: v for k, v in en_es_dict.items() if k in vocab_en_set}

In [10]:
len(en_es_dict)

3475

Make sure we have the target word in Spanish

In [11]:
vocab_es_set = set(vocab_es)
en_es_dict = {k: vocab_es_set.intersection(set(v)) for k, v in en_es_dict.items() if vocab_es_set.intersection(set(v))}

In [12]:
len(en_es_dict)

2451

Attemp translation from English to Spanish

In [13]:
learn.model[0].encoder.weight.requires_grad

True

In [14]:
learn.optimize_permutation()

In [15]:
learn.model[0].encoder.weight.requires_grad

False

In [16]:
nn.init.kaiming_normal_(learn.model[0].encoder.p);

In [18]:
def txt_to_embs(fname):
    with open(fname, 'r') as f:
        lines = f.readlines()
    vocab = []
    embs = []
    for line in lines[1:]:
        l = line.split()
        vocab.append(l[0])
        embs.append(np.array([float(s) for s in l[1:]]))
    return vocab, np.stack(embs)

vocab_embs, embs = txt_to_embs('data/es_norm_embs.txt')

In [19]:
learn.model[0].encoder.weight = nn.Parameter(tensor(embs))

In [20]:
learn.model[0].encoder.p.requires_grad

True

In [22]:
learn.model[0].encoder.weight.requires_grad = False

In [23]:
learn.model[0].encoder.weight.requires_grad

False

We now have an English LM with Spanish embeddings loaded. Let's see how we do on translation.

One thing worth keeping in mind are the missing English words from the Spanish vocabulary. This is an interesting situation and makes the task even harder.

In [24]:
learn.loss_func = lambda preds, targs: aza_loss(learn, preds, targs)

In [25]:
learn.fit_one_cycle(1, 1e-3,  moms=(0.8, 0.7, 0.8), wd=1e-7)

epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
0         3.684969    3.639772    0.344198  0.591861        40:21     


In [26]:
learn.save('pLSTM_emb_norm_en-es_hinted')

In [27]:
learn.load('pLSTM_emb_norm_en-es_hinted')

<fastai2.text.learner.LMLearner at 0x7fc44d5fa2d0>

In [28]:
learn.validate()

█

(#3) [3.639772415161133,0.3441983759403229,0.5918610692024231]

In [29]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
#                 print(k, vv)
                break
    return hits/len(en_es_dict)

In [30]:
top_n_translation_acc(100)

0.041207670338637294

In [38]:
top_n_translation_acc(100)

0.031007751937984496

In [31]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
                print(k, [vocab_es[i] for i in p[idx_en].argsort(descending=True)[:5]])
                break
    return hits/len(en_es_dict)

In [32]:
en_es_dict['have']

{'han', 'tener', 'tienen'}

In [33]:
top_n_translation_acc(5)

should ['varía', 'perdió', 'limita', 'finalizó', 'debería']
authority ['jurisdicción', 'honor', 'river', 'autoridad', 'autonomía']
oklahoma ['empleados', 'medallas', 'oklahoma', 'productores', 'us']


0.0012239902080783353

In [36]:
embs

array([[-0.17130944, -0.13466078,  0.15399924, ..., -0.19588749,
        -0.08263521,  0.18715824],
       [ 0.10057934,  0.04197982, -0.10051062, ...,  0.11448281,
         0.08857403, -0.11117812],
       [ 0.04381532,  0.09861507, -0.04373881, ...,  0.05031699,
         0.12407271, -0.00830902],
       ...,
       [ 0.10647476,  0.00848802, -0.07712181, ...,  0.10510191,
        -0.01276942, -0.09851178],
       [ 0.16311131,  0.08484522, -0.12757055, ...,  0.15140425,
         0.05144875, -0.04720284],
       [ 0.0999221 ,  0.06762829, -0.1189317 , ...,  0.13498293,
         0.08796506, -0.10488362]])

In [35]:
learn.model[0].encoder.weight

Parameter containing:
tensor([[-0.1713, -0.1347,  0.1540,  ..., -0.1959, -0.0826,  0.1872],
        [ 0.1006,  0.0420, -0.1005,  ...,  0.1145,  0.0886, -0.1112],
        [ 0.0438,  0.0986, -0.0437,  ...,  0.0503,  0.1241, -0.0083],
        ...,
        [ 0.1065,  0.0085, -0.0771,  ...,  0.1051, -0.0128, -0.0985],
        [ 0.1631,  0.0848, -0.1276,  ...,  0.1514,  0.0514, -0.0472],
        [ 0.0999,  0.0676, -0.1189,  ...,  0.1350,  0.0880, -0.1049]],
       device='cuda:0')

In [42]:
learn.model[0].encoder.p[7]

tensor([ 5.3910e-10,  1.0744e-09, -6.4294e-09,  ..., -5.4824e-09,
        -9.8421e-09, -3.4995e-09], device='cuda:0', grad_fn=<SelectBackward>)

In [29]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        print(k, [vocab_es[i] for i in p[idx_en].argsort(descending=True)[:5]])
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
                break
    return hits/len(en_es_dict)

In [44]:
top_n_translation_acc(5)

should ['varía', 'perdió', 'limita', 'finalizó', 'debería']
authority ['jurisdicción', 'honor', 'river', 'autoridad', 'autonomía']
oklahoma ['empleados', 'medallas', 'oklahoma', 'productores', 'us']


0.0012239902080783353

In [32]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    learned_embeddings = learn.model[0].encoder.p @ learn.model[0].encoder.weight
    
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        diff = learn.model[0].encoder.weight - learned_embeddings[idx_en]
        candidates = diff.sum(-1).argsort(descending=False)[:top_n]
        
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in candidates:
                hits += 1
                break
    return hits/len(en_es_dict)

In [33]:
top_n_translation_acc(100)

0.03386372909016728

In [35]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    learned_embeddings = learn.model[0].encoder.p @ learn.model[0].encoder.normalized_weight()
    
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        diff = learn.model[0].encoder.normalized_weight() - learned_embeddings[idx_en]
        candidates = diff.sum(-1).argsort(descending=False)[:top_n]
        
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in candidates:
                hits += 1
                break
    return hits/len(en_es_dict)

In [36]:
top_n_translation_acc(100)

0.03223174214606283