In [1]:
from decoder_head.core import *
from fastai2.text.all import *

In [2]:
# Build the English and Spanish vocabularies from pickled token counters,
# passing max_vocab=4000 to make_vocab for both languages.
# NOTE(review): read_pickle unpickles the file, so only load counters you trust.
counter_en = pd.read_pickle('data/en-100_tok/counter.pkl')
vocab_en = make_vocab(counter_en, max_vocab=4000)

counter_es = pd.read_pickle('data/es-100_tok/counter.pkl')
vocab_es = make_vocab(counter_es, max_vocab=4000)

In [3]:
# Language-model data pipeline over the text files found below `path`.
path = 'data/en-100_tok/'
mult = 4        # not referenced in the cells visible here
bs = 80         # forwarded to databunch() as bs
seq_len = 70    # forwarded to databunch() as seq_len


def _is_valid_item(itm):
    "True when `itm`'s immediate parent folder is named 'valid'."
    return itm.parent.name == 'valid'


lm = DataBlock(
    blocks=(TextBlock(vocab=vocab_en, is_lm=True),),
    get_x=read_tokenized_file,
    # Items are collected from both the 'train' and the 'valid' folders ...
    get_items=partial(get_text_files, folders=['train', 'valid']),
    # ... and the ones under 'valid' form the validation split.
    splitter=FuncSplitter(_is_valid_item),
)

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [4]:
# Create the language-model learner on the English data.
# `pAWD_LSTM`, `opt` and `awd_lstm_lm_config` are not defined in this notebook;
# presumably they come from the star imports at the top (decoder_head.core) — confirm.
learn = language_model_learner(
    dbunch_lm,
    pAWD_LSTM,
    opt_func=opt,
    pretrained=False,  # no pretrained weights requested; a checkpoint is loaded explicitly later
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

In [5]:
# learn.load('pLSTM_en')

<fastai2.text.learner.LMLearner at 0x7f6ae2da2cd0>

In [5]:
# Read the en-es word-pair file, one stripped line per entry. Each line is
# later split into a source and a target token.
# The encoding is pinned to UTF-8 so accented Spanish characters decode the
# same way on every platform instead of depending on the locale default.
with open('data/en-es.txt', encoding='utf-8') as f:
    en_es = [line.strip() for line in f]

In [6]:
# Group the word pairs: every source word maps to the list of all target words
# listed for it. Each line must split into exactly two tokens, otherwise the
# unpacking raises ValueError.
en_es_dict = defaultdict(list)
for source, target in map(str.split, en_es):
    en_es_dict[source].append(target)

In [7]:
# Number of distinct source words in the dictionary before any vocab filtering.
len(en_es_dict)

93084

Check that we have the source word in the model trained on English

In [8]:
# Keep only the entries whose source word appears in the English vocab.
# A set is used so each membership check does not scan the whole vocab.
vocab_en_set = set(vocab_en)
en_es_dict = {src: en_es_dict[src] for src in en_es_dict if src in vocab_en_set}

In [9]:
# Entries left after restricting source words to the English vocab.
len(en_es_dict)

3475

Make sure we have the target word in Spanish

In [10]:
# Restrict every entry to the target words present in the Spanish vocab and
# drop entries left with no target at all. Values become sets.
# The intersection is computed once per entry; set.intersection accepts any
# iterable, so the list of targets does not need to be copied into a set first.
vocab_es_set = set(vocab_es)
candidates = ((src, vocab_es_set.intersection(targets)) for src, targets in en_es_dict.items())
en_es_dict = {src: kept for src, kept in candidates if kept}

In [11]:
# Entries left that have at least one target word in the Spanish vocab.
len(en_es_dict)

2451

Attempt translation from English to Spanish

In [12]:
# Load the saved 'pLSTM_en' checkpoint into the learner
# (presumably the previously trained English LM — confirm).
learn.load('pLSTM_en')

<fastai2.text.learner.LMLearner at 0x7fa3f617d050>

In [13]:
# Check whether the encoder weights are currently trainable.
learn.model[0].encoder.weight.requires_grad

True

In [14]:
# Project-specific Learner method (definition not visible in this notebook).
# The recorded outputs around this cell show encoder.weight.requires_grad
# changing from True to False across the call.
learn.optimize_permutation()

In [15]:
# Re-check the encoder weights' trainable flag after optimize_permutation().
learn.model[0].encoder.weight.requires_grad

False

In [16]:
# Re-initialise encoder.p in place with Kaiming-normal values.
# The trailing ';' suppresses the cell's output.
nn.init.kaiming_normal_(learn.model[0].encoder.p);

In [17]:
# Replace the encoder weight values with the tensor stored in
# data/embeddings_es.torch (the Spanish embeddings, per the notebook text).
# NOTE(review): torch.load unpickles the file — only load trusted files.
# No map_location is given, so the tensor keeps the device it was saved from.
learn.model[0].encoder.weight.data = torch.load('data/embeddings_es.torch').data

In [18]:
# Check whether encoder.p is trainable.
learn.model[0].encoder.p.requires_grad

True

In [19]:
# Check the encoder weights' trainable flag again after swapping in the loaded tensor.
learn.model[0].encoder.weight.requires_grad

False

We now have an English LM with Spanish embeddings loaded. Let's see how we do on translation.

One thing worth keeping in mind is that some English words have no translation in the Spanish vocabulary. This is an interesting situation and makes the task even harder.

In [21]:
def _aza_loss_with_learner(preds, targs):
    "Forward to `aza_loss`, passing the current global `learn` as its first argument."
    return aza_loss(learn, preds, targs)


# Use aza_loss (defined outside this notebook) as the learner's loss function.
learn.loss_func = _aza_loss_with_learner

In [24]:
# Train for a single epoch with the one-cycle schedule (max learning rate 1e-3),
# using the momentum triple and weight decay given below.
learn.fit_one_cycle(1, 1e-3,  moms=(0.8, 0.7, 0.8), wd=1e-7)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,4.987006,4.975531,0.247667,0.453901,41:02


In [25]:
# learn.save('pLSTM_en-es')  # NOTE: the next cell loads this checkpoint; uncomment after training to (re)create it

In [22]:
# Restore the 'pLSTM_en-es' checkpoint.
# NOTE(review): the matching learn.save call is commented out in this notebook,
# so the checkpoint must already exist on disk for this cell to run.
learn.load('pLSTM_en-es')

<fastai2.text.learner.LMLearner at 0x7fa3f617d050>

In [23]:
# Evaluate the loaded model. The three returned values match the valid_loss,
# accuracy and top_k_accuracy columns of the training log.
learn.validate()

(#3) [4.975531101226807,0.24766667187213898,0.45390090346336365]

In [32]:
def top_n_translation_acc(top_n=1, p=None, translations=None, src_vocab=None, tgt_vocab=None):
    """Fraction of source words with a known translation among their `top_n` best targets.

    For every source word in `translations`, the row of `p` at that word's
    index in `src_vocab` is ranked in descending order. The word counts as a
    hit when the `tgt_vocab` index of at least one of its target words is
    among the first `top_n` ranked positions.

    Args:
        top_n: how many of the highest-scoring target indices to consider.
        p: 2-D score tensor indexed as `p[source_index][target_index]`;
            defaults to `learn.model[0].encoder.p`.
        translations: mapping from source word to an iterable of target words;
            defaults to the global `en_es_dict`.
        src_vocab: sequence supporting `.index(word)` for source words;
            defaults to the global `vocab_en`.
        tgt_vocab: sequence supporting `.index(word)` for target words;
            defaults to the global `vocab_es`.

    Returns:
        hits / len(translations) as a float, or 0.0 when `translations` is empty.

    Raises:
        ValueError: if a word is missing from its vocab (raised by `.index`).
    """
    if p is None:
        p = learn.model[0].encoder.p
    if translations is None:
        translations = en_es_dict
    if src_vocab is None:
        src_vocab = vocab_en
    if tgt_vocab is None:
        tgt_vocab = vocab_es

    # Nothing to score: avoid dividing by zero.
    if len(translations) == 0:
        return 0.0

    hits = 0
    for word, targets in translations.items():
        row = p[src_vocab.index(word)]
        # Rank the row once per source word rather than once per candidate
        # target; a plain set of ints makes the membership test cheap.
        best = set(row.argsort(descending=True)[:top_n].tolist())
        # One matching target is enough; any() stops at the first hit.
        if any(tgt_vocab.index(target) in best for target in targets):
            hits += 1
    return hits / len(translations)

In [33]:
# Share of dictionary entries with at least one target word among the
# 100 highest-scoring indices of the source word's row in encoder.p.
top_n_translation_acc(100)

0.037943696450428395