Having trained the English and Spanish LMs (along with the embeddings which constitute part of the encoder), it is now time to attempt translation.

In [1]:
from decoder_head.core import *
from fastai2.text.all import *

In [2]:
vocab_en= make_vocab(pd.read_pickle('data/en-100_tok/counter.pkl'), max_vocab=4000)
vocab_es= make_vocab(pd.read_pickle('data/es-100_tok/counter.pkl'), max_vocab=4000)

In [3]:
path = 'data/en-100_tok/'
mult = 4
bs = 80
seq_len = 70

lm = DataBlock(blocks=(TextBlock(vocab=vocab_en, is_lm=True),),
                get_x=read_tokenized_file,
                get_items=partial(get_text_files, folders=['train', 'valid']),
                splitter=FuncSplitter(lambda itm: itm.parent.name == 'valid'))

dbunch_lm = lm.databunch(path, path=path, bs=bs, seq_len=seq_len)

In [4]:
learn = language_model_learner(
    dbunch_lm,
    pAWD_LSTM,
    opt_func=opt,
    pretrained=False,
    config=awd_lstm_lm_config,
    drop_mult=0.1,
    metrics=[accuracy, top_k_accuracy]
)

In [None]:
#default_exp decoder_head.data

I construct the dictionary to evaluate translation performance using the wonderful data made available as part of the awesome [MUSE](https://github.com/facebookresearch/MUSE) repository.

In [10]:
#export

def get_en_es_dict():
    with open('data/en-es.txt') as f:
        en_es = f.readlines()
    en_es = [l.strip() for l in en_es]
    
    en_es_dict = defaultdict(list)
    for l in en_es:
        source, target = l.split()
        en_es_dict[source].append(target)
        
    # check that we have the source word in the model trained on English
    vocab_en_set = set(vocab_en)
    en_es_dict = {k: v for k, v in en_es_dict.items() if k in vocab_en_set}
    
    # make sure we have the target word in Spanish
    vocab_es_set = set(vocab_es)
    en_es_dict = {k: vocab_es_set.intersection(set(v)) for k, v in en_es_dict.items() if vocab_es_set.intersection(set(v))}
    
    return en_es_dict

In [11]:
en_es_dict = get_en_es_dict()

In [13]:
len(en_es_dict)

2451

## Attemp translation from English to Spanish

In [13]:
learn.load('pLSTM_en')

<fastai2.text.learner.LMLearner at 0x7f3887c4f090>

Setting up the stage

In [14]:
learn.model[0].encoder.weight.requires_grad

True

In [15]:
learn.optimize_permutation()

In [16]:
learn.model[0].encoder.weight.requires_grad

False

In [17]:
nn.init.kaiming_normal_(learn.model[0].encoder.p);

Replacing the embeddings with the Spanish embeddings learned by the Spanish LM.

In [18]:
learn.model[0].encoder.weight.data = torch.load('data/embeddings_es.torch').data

In [19]:
learn.model[0].encoder.p.requires_grad

True

In [20]:
learn.model[0].encoder.weight.requires_grad

False

We now have an English LM with Spanish embeddings loaded. Let's see how we do on translation.

One thing worth keeping in mind are the missing English words from the Spanish vocabulary. This is an interesting situation and makes the task even harder.

In [21]:
learn.loss_func = lambda preds, targs: aza_loss(learn, preds, targs)

In [22]:
learn.fit_one_cycle(1, 1e-3,  moms=(0.8, 0.7, 0.8), wd=1e-7)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,4.984266,4.949764,0.250956,0.460905,41:12


In [25]:
# learn.save('pLSTM_en-es')

In [30]:
learn.load('pLSTM_en-es')

<fastai2.text.learner.LMLearner at 0x7ff863701150>

In [31]:
learn.validate()

(#3) [4.975531101226807,0.24766667187213898,0.45390090346336365]

In [32]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
#                 print(k, vv)
                break
    return hits/len(en_es_dict)

In [33]:
top_n_translation_acc(100)

0.037943696450428395

In [34]:
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
                print(k, [vocab_es[i] for i in p[idx_en].argsort(descending=True)[:5]])
                break
    return hits/len(en_es_dict)

In [35]:
en_es_dict['have']

{'han', 'tener', 'tienen'}

In [36]:
top_n_translation_acc(5)

has ['utilizan', 'deben', 'símbolo', 'permiten', 'tiene']
one ['cuando', 'para', 'lo', 'una', 'algo']
united ['evitar', 'animales', 'tratar', 'fuego', 'unidos']
spanish ['juvenil', 'high', 'masculino', 'española', 'blancas']
publishing ['núcleos', 'documentos', 'diez', 'publicaciones', 'satélite']


0.002039983680130559

In [14]:
#export
def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        print(k, [vocab_es[i] for i in p[idx_en].argsort(descending=True)[:5]])
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in p[idx_en].argsort(descending=True)[:top_n]:
                hits += 1
                break
    return hits/len(en_es_dict)

In [None]:
# top_n calculated a take on nearest neighbors, performs better but not by much

def top_n_translation_acc(top_n=1):
    p = learn.model[0].encoder.p
    learned_embeddings = learn.model[0].encoder.p @ learn.model[0].encoder.weight
    
    hits = 0
    for k, v in en_es_dict.items():
        idx_en = vocab_en.index(k)
        diff = learn.model[0].encoder.weight - learned_embeddings[idx_en]
        candidates = diff.sum(-1).argsort(descending=False)[:top_n]
        
        for vv in v:
            idx_es = vocab_es.index(vv)
            if idx_es in candidates:
                hits += 1
                break
    return hits/len(en_es_dict)

In [56]:
top_n_translation_acc(100)

0.025703794369645042

Unfortunately, this attempt at translation failed. In the next notebook, we will make another attempt, this time with normalized embeddings.

In [17]:
from nbdev.export import *
notebook2script()

Converted 00_data.ipynb.
Converted 01_train_LM_en.ipynb.
Converted 02_train_LM_es.ipynb.
Converted 03_translate_en_to_es.ipynb.
Converted 99_index.ipynb.
