# Neural Machine Translation with RNNs



**Credits**: This notebook is based on code from: https://github.com/keon/seq2seq



### Basic setup and module imports

In [1]:
%pip install torchtext==0.6

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import re,os
import math
import random
import spacy
import numpy as np

from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_


# Fail fast when no CUDA device is visible: later cells in this notebook move
# the model and the batches to the GPU with .cuda().
assert torch.cuda.is_available()

### Load the dataset

We will be using a dataset of sentence pairs in German (source) and English (target).

In [2]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"
# Replace locale.getpreferredencoding with the stub above, so every later call
# to it in this kernel reports UTF-8.
# NOTE(review): presumably a workaround for hosts whose locale is not UTF-8
# (the `!` shell commands below may otherwise fail) — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

# Download the German and English spaCy pipelines that load_dataset() loads by name.
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     ----- ---------------------------------- 2.1/14.6 MB 11.8 MB/s eta 0:00:02
     ------------ --------------------------- 4.7/14.6 MB 11.9 MB/s eta 0:00:01
     ------------------- -------------------- 7.1/14.6 MB 11.8 MB/s eta 0:00:01
     ------------------------- -------------- 9.4/14.6 MB 12.0 MB/s eta 0:00:01
     ------------------------------- ------- 11.8/14.6 MB 11.7 MB/s eta 0:00:01
     --------------------------------------  14.4/14.6 MB 11.6 MB/s eta 0:00:01
     --------------------------------------- 14.6/14.6 MB 11.5 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_s

We need to download and uncompress the archives containing the [training set](https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz), [validation set](https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz) and [test set](https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz). Copy the extracted files into a `data/` sub-folder of the current folder (the path the loading code below reads from).

We build training, validation and test sets 



In [5]:
def load_dataset(batch_size):
    """Build the Multi30k German->English iterators and their vocabularies.

    Both languages are tokenized with their spaCy tokenizer after every
    ``<url>...</url>`` span has been replaced by the literal ``@URL@``.
    Each field wraps sentences in ``<sos>`` / ``<eos>`` and returns sequence
    lengths alongside the padded batch (``include_lengths=True``).

    Args:
        batch_size: batch size passed to ``BucketIterator.splits``.

    Returns:
        ``(train_iter, val_iter, test_iter, DE, EN)`` where ``DE`` / ``EN`` are
        the source / target fields with vocabularies built from the training
        split (German: tokens seen at least twice; English: at most 10000
        entries before special tokens).
    """
    nlp_de = spacy.load('de_core_news_sm')
    nlp_en = spacy.load('en_core_web_sm')
    url_pattern = re.compile('(<url>.*</url>)')

    def _tokenizer_for(nlp):
        """Return a tokenizer callable bound to one spaCy pipeline."""
        def _tokenize(text):
            cleaned = url_pattern.sub('@URL@', text)
            return [token.text for token in nlp.tokenizer(cleaned)]
        return _tokenize

    # Options shared by the source and target fields.
    field_kwargs = dict(include_lengths=True,
                        init_token='<sos>', eos_token='<eos>')
    DE = Field(tokenize=_tokenizer_for(nlp_de), **field_kwargs)
    EN = Field(tokenize=_tokenizer_for(nlp_en), **field_kwargs)

    train_set, val_set, test_set = Multi30k.splits(
        path='data/', exts=('.de', '.en'), fields=(DE, EN))

    # Vocabularies come from the training split only.
    DE.build_vocab(train_set.src, min_freq=2)
    EN.build_vocab(train_set.trg, max_size=10000)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train_set, val_set, test_set), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN

In [6]:
# Batch size shared by the train / validation / test iterators.
batch_size = 32
print("[!] preparing dataset...")

train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size)

# Vocabulary sizes (including special tokens) for source and target.
de_size = len(DE.vocab)
en_size = len(EN.vocab)

# Report number of batches and number of examples for train and test.
print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)" % (
    len(train_iter), len(train_iter.dataset),
    len(test_iter), len(test_iter.dataset),
))
print("[DE_vocab]:%d [en_vocab]:%d" % (de_size, en_size))

[!] preparing dataset...
[TRAIN]:907 (dataset:29000)	[TEST]:32 (dataset:1000)
[DE_vocab]:8014 [en_vocab]:10004


### Model definition

Our seq2seq model consists of two separate modules:

* The **Encoder** is an RNN that reads the input sentence one word at a time. For every input word it outputs a vector and a hidden state, and it uses that hidden state when processing the next input word.

* The **Decoder** is another RNN that takes the encoder output vector(s) and outputs a sequence of words to create the translation.

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder over embedded source tokens.

    Args:
        input_size: size of the source vocabulary (rows of the embedding).
        embed_size: dimensionality of the token embeddings.
        hidden_size: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied between stacked GRU layers.
    """

    def __init__(self, input_size, embed_size, hidden_size,
                 n_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.embed = nn.Embedding(input_size, embed_size)
        # nn.GRU applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and raises a UserWarning, so it
        # is only forwarded when there is more than one layer.
        self.gru = nn.GRU(embed_size, hidden_size, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0,
                          bidirectional=True)

    def forward(self, src, hidden=None):
        """Encode a batch of source token indices.

        Args:
            src: integer tensor of token indices, time-major (seq_len, batch),
                since the GRU is built without ``batch_first``.
            hidden: optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)``: ``outputs`` is (seq_len, batch, hidden_size),
            the element-wise sum of the forward and backward direction outputs;
            ``hidden`` is the GRU's final hidden state,
            (n_layers * 2, batch, hidden_size).
        """
        embedded = self.embed(src)
        outputs, hidden = self.gru(embedded, hidden)
        # Sum the two directions so the feature size stays hidden_size.
        outputs = (outputs[:, :, :self.hidden_size] +
                   outputs[:, :, self.hidden_size:])
        return outputs, hidden



class Decoder(nn.Module):
    """Single-step GRU decoder producing log-probabilities over the target vocabulary.

    Args:
        embed_size: dimensionality of the target token embeddings.
        hidden_size: GRU hidden size.
        output_size: size of the target vocabulary.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embeddings and between
            stacked GRU layers.
    """

    def __init__(self, embed_size, hidden_size, output_size,
                 n_layers=1, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embed = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(dropout, inplace=True)
        # nn.GRU applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and raises a UserWarning, so it
        # is only forwarded when there is more than one layer.
        self.gru = nn.GRU(embed_size, hidden_size, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, last_hidden):
        """Run one decoding step.

        Args:
            input: integer tensor (batch,) with the previous target token of
                each sequence.
            last_hidden: GRU hidden state from the previous step,
                (n_layers, batch, hidden_size).

        Returns:
            ``(output, hidden)``: ``output`` is (batch, output_size) with
            log-probabilities over the target vocabulary; ``hidden`` is the
            updated GRU hidden state.
        """
        # Embed the previous token and add a length-1 time dimension: (1, B, E).
        embedded = self.embed(input).unsqueeze(0)
        embedded = self.dropout(embedded)

        output, hidden = self.gru(embedded, last_hidden)
        output = output.squeeze(0)  # (1, B, H) -> (B, H)
        output = self.out(output)
        output = F.log_softmax(output, dim=1)
        return output, hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence step by step.

    Args:
        encoder: module called as ``encoder(src)`` returning
            ``(encoder_output, hidden)``.
        decoder: module called as ``decoder(input, hidden)`` returning
            ``(log_probs, hidden)`` and exposing ``n_layers`` and
            ``output_size``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.size(0)`` steps and return the per-step log-probabilities.

        Args:
            src: source token indices, (src_len, batch).
            trg: target token indices, (trg_len, batch); row 0 holds the
                start-of-sentence tokens used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next decoder input.

        Returns:
            Tensor (trg_len, batch, vocab_size). Row 0 is left as zeros because
            nothing is predicted for the start token; rows 1.. hold the
            decoder outputs.
        """
        batch_size = src.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.output_size
        # Allocate on the same device as the inputs instead of assuming CUDA.
        outputs = torch.zeros(max_len, batch_size, vocab_size,
                              device=src.device)

        # The per-step encoder outputs are not used: the decoder only takes
        # the previous token and a hidden state.
        _encoder_output, hidden = self.encoder(src)
        # Initialise the decoder state from the first n_layers encoder states.
        hidden = hidden[:self.decoder.n_layers]
        output = trg[0, :]  # <sos> tokens

        # Autoregressive decoding: each step consumes the previous token and
        # the running hidden state, and its log-probabilities are stored.
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden)
            outputs[t] = output
            use_teacher = random.random() < teacher_forcing_ratio
            top1 = output.argmax(dim=1)
            # Next input: ground truth (teacher forcing) or the greedy prediction.
            output = trg[t] if use_teacher else top1

        return outputs

### Training and evaluation helper functions


In [None]:
def train(e, model, optimizer, train_iter, vocab_size, grad_clip, DE, EN):
    """Run one training epoch and print the running loss every 100 batches.

    Args:
        e: epoch number (unused here; kept for the caller's signature).
        model: module called as ``model(src, trg)`` returning log-probabilities
            of shape (trg_len, batch, vocab_size).
        optimizer: optimizer over ``model``'s parameters.
        train_iter: iterable of batches exposing ``.src`` and ``.trg`` as
            ``(tensor, lengths)`` pairs.
        vocab_size: size of the target vocabulary (last dim of the output).
        grad_clip: maximum gradient norm passed to ``clip_grad_norm_``.
        DE: source field (unused here).
        EN: target field; its ``<pad>`` index is ignored by the loss.
    """
    model.train()
    # Follow the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    pad = EN.vocab.stoi['<pad>']
    log_every = 100
    window_loss = 0.0
    window_batches = 0
    for b, batch in enumerate(train_iter):
        src, _ = batch.src
        trg, _ = batch.trg
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        # Position 0 is the <sos> token, for which nothing is predicted.
        loss = F.nll_loss(output[1:].reshape(-1, vocab_size),
                          trg[1:].reshape(-1),
                          ignore_index=pad)
        loss.backward()
        clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        window_loss += loss.item()
        window_batches += 1

        if b % log_every == 0 and b != 0:
            # Divide by the number of batches actually accumulated: the first
            # window covers batches 0..100, i.e. 101 batches, not 100.
            mean_loss = window_loss / window_batches
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, mean_loss, math.exp(mean_loss)))
            window_loss = 0.0
            window_batches = 0

In [None]:
def evaluate(model, val_iter, vocab_size, DE, EN):
    """Return the mean per-batch NLL loss of ``model`` over ``val_iter``.

    The model is put in eval mode and run without gradient tracking and
    without teacher forcing (``teacher_forcing_ratio=0.0``).

    Args:
        model: module called as ``model(src, trg, teacher_forcing_ratio=...)``
            returning log-probabilities of shape (trg_len, batch, vocab_size).
        val_iter: sized iterable of batches exposing ``.src`` and ``.trg`` as
            ``(tensor, lengths)`` pairs.
        vocab_size: size of the target vocabulary (last dim of the output).
        DE: source field (unused here).
        EN: target field; its ``<pad>`` index is ignored by the loss.

    Returns:
        Sum of the batch losses divided by ``len(val_iter)``.
    """
    model.eval()
    # Follow the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    pad = EN.vocab.stoi['<pad>']
    total_loss = 0.0
    with torch.no_grad():
        for batch in val_iter:
            src, _ = batch.src
            trg, _ = batch.trg
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0.0)
            # Position 0 is the <sos> token, for which nothing is predicted.
            loss = F.nll_loss(output[1:].reshape(-1, vocab_size),
                              trg[1:].reshape(-1),
                              ignore_index=pad)
            total_loss += loss.item()
    return total_loss / len(val_iter)



### Model Training

In the following two cells we instantiate our seq2seq model and train it for 10 epochs. Note that the model would need almost 100 epochs to converge, but after 10 epochs (approx. 20 minutes on a T4 GPU) it starts to produce reasonable results.

In [42]:
# Learning rate and model dimensions.
lr = 0.0001
hidden_size = 512
embed_size = 256

print("[!] Instantiating models...")
# Two-layer encoder over the German vocabulary, one-layer decoder over the
# English vocabulary; both share the embedding and hidden sizes.
encoder = Encoder(input_size=de_size, embed_size=embed_size,
                  hidden_size=hidden_size, n_layers=2, dropout=0.5)
decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size,
                  output_size=en_size, n_layers=1, dropout=0.5)
seq2seq = Seq2Seq(encoder, decoder).cuda()
optimizer = optim.Adam(seq2seq.parameters(), lr=lr)
print(seq2seq)

[!] Instantiating models...


  super().__init__("GRU", *args, **kwargs)


AssertionError: Torch not compiled with CUDA enabled

In [None]:
epochs = 10
grad_clip = 10.0
# Directory that receives the checkpoints written below.
save_dir = ".save"

best_val_loss = None

for e in range(1, epochs + 1):

    train(e, seq2seq, optimizer, train_iter, en_size, grad_clip, DE, EN)
    val_loss = evaluate(seq2seq, val_iter, en_size, DE, EN)

    print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f"
          % (e, val_loss, math.exp(val_loss)))

    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly so a loss of 0.0 is not mistaken for
    # "no best value yet".
    if best_val_loss is None or val_loss < best_val_loss:
        print("[!] saving model...")
        os.makedirs(save_dir, exist_ok=True)
        # Write the checkpoint inside save_dir (the directory created above).
        torch.save(seq2seq.state_dict(),
                   os.path.join(save_dir, 'seq2seq_%d.pt' % e))
        best_val_loss = val_loss

# Test loss of the model as it stands after the last epoch.
test_loss = evaluate(seq2seq, test_iter, en_size, DE, EN)
print("[TEST] loss:%5.2f" % test_loss)

### Qualitative Evaluation

In [None]:
# Tokens that are dropped when printing sentences.
special_tokens = ['<sos>', '<pad>', '<eos>', '<unk>']

# Disable dropout and gradient tracking while generating translations.
seq2seq.eval()
with torch.no_grad():
    for b, batch in enumerate(test_iter):
        src, len_src = batch.src
        trg, len_trg = batch.trg
        src = src.cuda()
        trg = trg.cuda()
        # Seq2Seq.forward returns a single tensor (trg_len, batch, vocab);
        # there is no attention-weights matrix to unpack.
        output = seq2seq(src, trg, teacher_forcing_ratio=0.0)

        # Visualize up to 5 predictions (fewer if the batch is smaller).
        for n in range(min(5, src.size(1))):
            s_src = [DE.vocab.itos[w] for w in src[:, n]]
            print('SOURCE:    ' + ' '.join([w for w in s_src if w not in special_tokens]))

            s_trg = [EN.vocab.itos[w] for w in trg[:, n]]
            print('TARGET:    ' + ' '.join([w for w in s_trg if w not in special_tokens]))

            # Greedy choice: most likely token at every time step.
            s_pred = [EN.vocab.itos[torch.argmax(w)] for w in output[:, n, :]]
            print('PREDICTED: ' + ' '.join([w for w in s_pred if w not in special_tokens]))

            print('-----------------')

        # Only the first test batch is shown.
        break