# Build a Machine translation seq2seq + Attention model German -> English
#### References
- [Paper](https://arxiv.org/abs/1409.0473)
- [Github](https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb)
- [Youtube](https://www.youtube.com/watch?v=sQUqQddQtB4&feature=youtu.be)


#### Ideas
- Original Seq2Seq: the single fixed-size context vector can become a bottleneck; it cannot carry enough information when the source sentence is long

<img src="./assets/4.png" width="500"/>

- Improved: Send the context vector to every cell of the decoder

<img src="./assets/5.png" width="500"/>


In [1]:
import torch

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Use {device}")

Use cuda:0


# 1. Data processing

## 1.1 Tokenizer

In [2]:
import spacy
# spaCy 3 removed the 'de' / 'en' shortcut links, so load the pipelines by
# their full package names. The same names also resolve on spaCy 2 once the
# models are installed (e.g. `python -m spacy download de_core_news_sm`).
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    """
    Split a German string into tokens with the spaCy German tokenizer and
    return the token strings in reversed order (last token first).
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy English tokenizer and
    return the token strings in their original order.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [3]:
# Quick check: English tokens come back in their original order.
tokenize_en("good morning")

['good', 'morning']

In [4]:
# Quick check: German tokens come back reversed (tokenize_de flips the list).
tokenize_de("guten morgen")

['morgen', 'guten']

## 1.2 Get dataset from torchtext

In [5]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [6]:
# Both languages share the same preprocessing: lower-case the text and wrap
# every sentence in <sos> ... <eos>. Only the tokenizer differs.
shared_field_options = dict(
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Source side (German, tokens reversed by tokenize_de)
german = Field(tokenize=tokenize_de, **shared_field_options)

# Target side (English)
english = Field(tokenize=tokenize_en, **shared_field_options)



In [7]:
# Load the Multi30k train/validation/test splits; '.de' files feed the
# german field (source) and '.en' files feed the english field (target).
language_extensions = ('.de', '.en')
language_fields = (german, english)
train_data, valid_data, test_data = Multi30k.splits(
    exts=language_extensions,
    fields=language_fields,
)



#### Preview

In [8]:
# Count the examples in each split before reporting them.
n_train = len(train_data.examples)
n_valid = len(valid_data.examples)
n_test = len(test_data.examples)

print(f"Number of training examples: {n_train}")
print(f"Number of validation examples: {n_valid}")
print(f"Number of testing examples: {n_test}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [9]:
# First training pair, viewed as a field-name -> token-list mapping.
first_pair = vars(train_data.examples[0])

# German source tokens (stored reversed by tokenize_de)
print(first_pair['src'])

# English target tokens
print(first_pair['trg'])

['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [10]:
# Second training pair, viewed as a field-name -> token-list mapping.
second_pair = vars(train_data.examples[1])

# German source tokens (stored reversed by tokenize_de)
print(second_pair['src'])

# English target tokens
print(second_pair['trg'])

['.', 'antriebsradsystem', 'ein', 'bedienen', 'schutzhelmen', 'mit', 'männer', 'mehrere']
['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


## 1.3 Build vocab

In [11]:
# Build both vocabularies from the training split only; min_freq=2 is the
# frequency threshold handed to torchtext for each field.
for language_field in (german, english):
    language_field.build_vocab(train_data, min_freq=2)

In [12]:
# Vocabulary sizes for the source and target fields.
n_source_tokens = len(german.vocab)
n_target_tokens = len(english.vocab)

print(f"Unique tokens in source (de) vocabulary: {n_source_tokens}")
print(f"Unique tokens in target (en) vocabulary: {n_target_tokens}")

Unique tokens in source (de) vocabulary: 7854
Unique tokens in target (en) vocabulary: 5893


## 1.4 Preview dataloader

In [13]:
BATCH_SIZE = 32

# One BucketIterator per split, all using the same batch size and placing
# their batches on `device`.
all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    all_splits,
    batch_size=BATCH_SIZE,
    device=device,
)



In [14]:
# Number of batches each iterator yields per pass.
n_train_batches = len(train_iterator)
n_valid_batches = len(valid_iterator)
n_test_batches = len(test_iterator)

print("Train dataloader size:", n_train_batches)
print("Valid dataloader size:", n_valid_batches)
print("Test dataloader size:", n_test_batches)

Train dataloader size: 907
Valid dataloader size: 32
Test dataloader size: 32


In [15]:
# Peek at the first three training batches and report their tensor sizes.
# range(3) comes first in zip so no fourth batch is pulled from the iterator.
for i, data in zip(range(3), train_iterator):
    X, y = data.src, data.trg

    # Source batch: (source sequence length, batch_size)
    print(f"Source[{i}] tensor size: {X.size()}")

    # Target batch: (target sequence length, batch_size)
    print(f"Target[{i}] tensor size: {y.size()}",end="\n\n")



Source[0] tensor size: torch.Size([26, 32])
Target[0] tensor size: torch.Size([23, 32])

Source[1] tensor size: torch.Size([27, 32])
Target[1] tensor size: torch.Size([25, 32])

Source[2] tensor size: torch.Size([27, 32])
Target[2] tensor size: torch.Size([33, 32])



# 2. Model

## 2.1 Encoder
<img src="./assets/6.png" width="750"/>

#### Architecture
- 1 embedding layer
- 1 biLSTM layer:
    + forward: $x_0^\rightarrow = \text{<sos>}, x_1^\rightarrow = \text{guten}, x_2^\rightarrow = \text{morgen}, x_3^\rightarrow = \text{<eos>}$
    + backward: $x_0^\leftarrow = \text{<eos>}, x_1^\leftarrow = \text{morgen}, x_2^\leftarrow = \text{guten}, x_3^\leftarrow = \text{<sos>}$

In [16]:
from Models.Seq2Seq_Attention import Encoder


# Hyperparameters of the demo encoder used for the shape checks below.
ENC_EMB_DIM = 300
HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5

# Collect the constructor arguments first, then build and move to `device`.
encoder_options = dict(
    source_vocab_size=len(german.vocab),
    embedding_size=ENC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout_rate=ENC_DROPOUT,
)
enc = Encoder(**encoder_options).to(device)

  "num_layers={}".format(dropout, num_layers))


In [17]:
# Smoke-test the encoder on a dummy (33, 64) batch of source token ids.
# torch.randint draws real vocabulary indices, whereas torch.rand(...).long()
# truncates every value in [0, 1) to 0. Creating the tensor on `device`
# (instead of calling .cuda()) keeps this cell runnable on CPU-only machines.
X = torch.randint(0, len(german.vocab), (33, 64), device=device)

encoder_states, hidden_fc, cell_fc = enc(X)

## 2.2 Decoder

#### Architecture
- 1 embedding layer
- attention - context_vector
- 1 LSTM layer
- 1 fully connected

#### Context vector implementation

<img src="./assets/1.jpg" width="250"/>

- context vector:
    $$c_i = \sum\limits_{j=1}^{Tx} \alpha_{ij}h_j$$

- Where
    + $h_j$: encoder states
    + $\alpha_{ij}$ is the weight: $\alpha_{ij} = \frac{exp(e_{ij})}{\sum\limits_{k=1}^{Tx}exp(e_{ik})} = softmax(e_{ij})$ 
        + $e_{ij}$ is energy defined by: $e_{ij} = a(s_{i-1}, h_j)$
            + $s_{i-1}$: previous decoder hidden state (initialised from the encoder's final hidden state)
            + $h_j$: encoder states

In [18]:
from Models.Seq2Seq_Attention import Decoder

# Hyperparameters of the demo decoder used for the shape checks below.
DEC_EMB_DIM = 300
HID_DIM = 512
N_LAYERS = 1
DEC_DROPOUT = 0.5

# Collect the constructor arguments first, then build and move to `device`.
decoder_options = dict(
    target_vocab_size=len(english.vocab),
    embedding_size=DEC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout_rate=DEC_DROPOUT,
)
dec = Decoder(**decoder_options).to(device)

In [19]:
# Smoke-test a single decoder step on dummy target token ids.
# torch.randint draws real vocabulary indices, whereas torch.rand(...).long()
# truncates every value in [0, 1) to 0, so every id would be identical.
XX = torch.randint(0, len(english.vocab), (33, 64), device=device)

# Feed only the first row (one token id per batch column) together with the
# encoder outputs and the encoder-derived initial hidden/cell states.
predictions_squ, hidden, cell = dec(XX[0], encoder_states, hidden_fc, cell_fc)

## 2.3 Seq2Seq

In [20]:
from Models.Seq2Seq_Attention import Seq2Seq

# Wire the demo encoder and decoder into one Seq2Seq model on `device`.
target_vocab_size = len(english.vocab)
model = Seq2Seq(
    encoder=enc,
    decoder=dec,
    target_vocab_size=target_vocab_size,
    device=device,
)
model = model.to(device)

# 3. Train

#### Training hyperparameters

In [21]:
# Number of full passes over the training iterator.
num_epochs = 100
# Learning rate handed to the Adam optimizer below.
learning_rate = 3e-4

#### Model hyperparameters

In [22]:
# Settings for the model that is actually trained. These overwrite the
# values used by the demo encoder/decoder above (dropout 0.5 -> 0.0,
# hidden size 512 -> 1024).
ENC_EMB_DIM = 300
ENC_DROPOUT = 0.0

DEC_EMB_DIM = 300
DEC_DROPOUT = 0.0

# Shared by the encoder and the decoder.
HID_DIM = 1024
N_LAYERS = 1

In [23]:
# Vocabulary sizes are reused by several constructors below.
source_vocab_size = len(german.vocab)
target_vocab_size = len(english.vocab)

# Encoder over the German vocabulary.
encoder_net = Encoder(
    source_vocab_size=source_vocab_size,
    embedding_size=ENC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout_rate=ENC_DROPOUT,
)
encoder_net = encoder_net.to(device)

# Decoder over the English vocabulary.
decoder_net = Decoder(
    target_vocab_size=target_vocab_size,
    embedding_size=DEC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout_rate=DEC_DROPOUT,
)
decoder_net = decoder_net.to(device)

# Full model; replaces the demo `model` built earlier.
model = Seq2Seq(
    encoder=encoder_net,
    decoder=decoder_net,
    target_vocab_size=target_vocab_size,
    device=device,
)
model = model.to(device)

In [24]:
import torch.nn as nn
import torch.optim as optim

# Adam over every parameter of the Seq2Seq model.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Cross-entropy over target tokens; positions holding the <pad> index are
# excluded from the loss via ignore_index.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Tensorboard

In [25]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard event files for the training-loss curve go to this directory.
writer = SummaryWriter(log_dir="exp/Seq2Seq_Attention/loss_plot")

# Global step for add_scalar; the training loop bumps it once per epoch.
step = 0

## Train

In [26]:
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

# Set to True to resume from the checkpoint written by the training loop.
load_model = False
if load_model:
    # map_location=device remaps the stored tensors onto the active device, so
    # a checkpoint saved on a GPU can still be restored on a CPU-only machine.
    checkpoint = torch.load(
        "exp/Seq2Seq_Attention/trained_model.pth.tar", map_location=device
    )
    load_checkpoint(checkpoint, model, optimizer)

In [27]:
# Flip to True to (re)train; left False so re-running the notebook only evaluates.
train_mode = False
LOG_FILE = "exp/Seq2Seq_Attention/train.log"

if train_mode:
    # Fixed German probe sentence, translated after every epoch to eyeball progress.
    # Reference: a boat with several men on it is being pulled ashore by a large team of horses.
    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        # Train
        model.train()
        for batch in train_iterator:
            # Get input and targets and make sure they live on the active device
            source = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(source, target)

            # Skip time step 0 (the <sos> position of the target) and flatten
            # to (tokens, vocab) / (tokens,) as CrossEntropyLoss expects.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

        # Loss of the final batch of this epoch as a plain float, so neither
        # TensorBoard nor the log keeps a reference to the autograd graph.
        last_batch_loss = loss.item()

        # Save checkpoint
        checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        save_checkpoint(checkpoint, "exp/Seq2Seq_Attention/trained_model.pth.tar")

        # Plot to tensorboard every epoch
        writer.add_scalar("Training loss", last_batch_loss, global_step=step)
        step += 1

        # Eval: translate the probe sentence without tracking gradients
        model.eval()
        with torch.no_grad():
            translated_sentence = translate_sentence(
                model, sentence, german, english, device, max_length=50
            )

        # Explicit UTF-8 so the log does not depend on the platform's default encoding.
        with open(LOG_FILE, "a+", encoding="utf-8") as file:
            file.write(f"[Epoch {epoch} / {num_epochs}]: loss: {last_batch_loss}\n")
            file.write(f"Translated example sentence: {' '.join(translated_sentence)}\n\n")

# 4. Test

In [28]:
# Restore the trained weights and optimizer state for evaluation.
# map_location=device remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
load_checkpoint(
    torch.load("exp/Seq2Seq_Attention/trained_model.pth.tar", map_location=device),
    model,
    optimizer,
)

=> Loading checkpoint


#### Some examples

In [29]:
# Translate test example 10 and print it next to its reference translation.
example = test_data[10]
sentence = example.src
groundtruth = example.trg
translated_sentence = translate_sentence(
    model, sentence, german, english, device, max_length=50
)

source_text = ' '.join(sentence)
reference_text = ' '.join(groundtruth)
# The last predicted token is dropped (presumably the <eos> marker;
# confirm against translate_sentence).
translated_text = ' '.join(translated_sentence[:-1])

print(f'''
sentence: {source_text}
groundtruth: {reference_text}
translated: {translated_text}
''')


sentence: . freien im tag schönen einen genießen sohn kleiner ihr und mutter eine
groundtruth: a mother and her young song enjoying a beautiful day outside .
translated: a mother and her daughter enjoying their small day outside .



In [30]:
# Translate test example 21 and print it next to its reference translation.
example = test_data[21]
sentence = example.src
groundtruth = example.trg
translated_sentence = translate_sentence(
    model, sentence, german, english, device, max_length=50
)

source_text = ' '.join(sentence)
reference_text = ' '.join(groundtruth)
# The last predicted token is dropped (presumably the <eos> marker;
# confirm against translate_sentence).
translated_text = ' '.join(translated_sentence[:-1])

print(f'''
sentence: {source_text}
groundtruth: {reference_text}
translated: {translated_text}
''')


sentence: . feld dem auf trompete spiel einem bei spielt teenagerin eine
groundtruth: a teenager plays her trumpet on the field at a game .
translated: a toddler player is playing a game in the field during a game .



#### bleu score

In [31]:
from torchtext.data.metrics import bleu_score

# Score a subset of the test split: the slice covers examples 1 through 99.
# NOTE(review): example 0 is excluded by the slice; confirm that is intended.
bleu_subset = test_data[1:100]
score = bleu(bleu_subset, model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 21.46
