Having trained the English and Spanish LMs (along with the embeddings which constitute part of the encoder), it is now time to attempt translation.

In [1]:
from decoder_head.core import *
from decoder_head.data import *
from fastai2.text.all import *

In [2]:
vocab_en= make_vocab(pd.read_pickle('data/en-100_tok/counter.pkl'), max_vocab=4000)
vocab_es= make_vocab(pd.read_pickle('data/es-100_tok/counter.pkl'), max_vocab=4000)

In [3]:
en_es_dict = get_en_es_dict(vocab_en, vocab_es)

In [12]:
len(en_es_dict)

2451

In [4]:
path = 'data/en-100_tok/'
mult = 4
bs = 80
seq_len = 70

lm = DataBlock(blocks=(TextBlock(vocab=vocab_en, is_lm=True),),
                get_x=read_tokenized_file,
                get_items=partial(get_text_files, folders=['train', 'valid']),
                splitter=FuncSplitter(lambda itm: itm.parent.name == 'valid'))

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [59]:
#export core
class PermuteEmbeddingsSinkhorn(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, padding_idx):
        super().__init__()
        
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        
        # to conform to nn.Embedding api
        self.max_norm=None
        self.norm_type=2.0
        self.scale_grad_by_freq=False
        self.sparse = False

        self.weight = nn.Parameter( torch.Tensor(num_embeddings, embedding_dim) )
        self.p = nn.Parameter( torch.eye(self.num_embeddings) )
        self.p.requires_grad = False
        
        self.reset_parameters()
        
    def forward(self, words):        
        p1 = self.p / self.p.sum(dim=1).unsqueeze(1)
        p2 = p1 / p1.sum(dim=0).unsqueeze(0)
            
        return F.embedding(words, self.p @ self.weight)
    
    def reset_parameters(self): pass

In [60]:
#export core
class p_sinkhornAWD_LSTM(AWD_LSTM):
    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1, hidden_p=0.2, input_p=0.6, embed_p=0.1,
                 weight_p=0.5, bidir=False, packed=False):
        store_attr(self, 'emb_sz,n_hid,n_layers,pad_token,packed')
        self.bs = 1
        self.n_dir = 2 if bidir else 1
        self.encoder = PermuteEmbeddingsSinkhorn(vocab_sz, emb_sz, padding_idx=pad_token)

        self.encoder_dp = self.encoder # chosing to initally train without embedding dropout
        self.rnns = nn.ModuleList([self._one_rnn(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.n_dir,
                                                 bidir, weight_p, l) for l in range(n_layers)])
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

In [61]:
#export core
import fastai2 
fastai2.text.models.core._model_meta[p_sinkhornAWD_LSTM] = fastai2.text.models.core._model_meta[AWD_LSTM]

In [62]:
learn = language_model_learner(
    dbunch_lm,
    p_sinkhornAWD_LSTM,
    opt_func=opt,
    pretrained=False,
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

## Attemp translation from English to Spanish

In [63]:
learn.load('pLSTM_en')

<fastai2.text.learner.LMLearner at 0x7f087ff01a50>

In [64]:
learn.validate()

█

(#3) [2.8178272247314453,0.42678970098495483,0.6835475564002991]

In [65]:
learn.model[0].encoder.weight.requires_grad

True

In [66]:
learn.optimize_permutation()

In [67]:
learn.model[0].encoder.weight.requires_grad

False

In [68]:
nn.init.kaiming_normal_(learn.model[0].encoder.p);

In [69]:
learn.model[0].encoder.weight.data = torch.load('data/embeddings_es.torch').data

In [70]:
learn.model[0].encoder.p.requires_grad

True

In [71]:
learn.model[0].encoder.weight.requires_grad

False

In [72]:
learn.loss_func = lambda preds, targs: aza_loss(learn, preds, targs)

In [73]:
learn.fit_one_cycle(1, 1e-3,  moms=(0.8, 0.7, 0.8), wd=1e-7)

epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
0         4.966329    4.935486    0.265000  0.465976        40:22     


In [74]:
learn.save('pLSTM_sinkorhn_en-es')

In [75]:
learn.load('pLSTM_sinkorhn_en-es')

<fastai2.text.learner.LMLearner at 0x7f087ff01a50>

In [76]:
learn.validate()

█

(#3) [4.935486316680908,0.26499998569488525,0.4659762382507324]

In [77]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
#                 print(k, vv)
                break
    return hits/len(en_es_dict)

In [78]:
top_n_translation_acc(100)

0.033047735618115054

In [79]:
# top_n calculated a take on nearest neighbors, performs better but not by much

def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    learned_embeddings = learn.model[0].encoder.p @ learn.model[0].encoder.weight
    
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        diff = learn.model[0].encoder.weight - learned_embeddings[idx_en]
        candidates = diff.sum(-1).argsort(descending=False)[:top_n]
        
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in candidates:
                hits += 1
                break
    return hits/len(en_es_dict)

In [80]:
top_n_translation_acc(100)

0.03386372909016728

Unfortunately, this attempt at translation failed. In the next notebook, we will make another attempt, this time with normalized embeddings.

In [17]:
from nbdev.export import *
notebook2script()

Converted 00_data.ipynb.
Converted 01_train_LM_en.ipynb.
Converted 02_train_LM_es.ipynb.
Converted 03_translate_en_to_es.ipynb.
Converted 99_index.ipynb.
