# Transformer translation

[Original video](https://youtu.be/M6adRGJe5cQ)

[PyTorch inbuilt transformer model](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html)

[Transformer implementation](https://colab.research.google.com/drive/1AntrcxvL-E7Jxt6z_JfPo96cwSxO-3B0)

[Multi30k dataset](https://github.com/multi30k/dataset). This is a dataset with ~30,000 parallel English, German and French sentences, each with ~12 words per sentence.

## Model

In [1]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import random
import numpy as np

from tqdm.notebook import tqdm
from torchtext.data.metrics import bleu_score
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
from torch.utils.tensorboard import SummaryWriter

In [2]:
# !python -m spacy download en
!python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 15.1MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907057 sha256=1b50c46d848ea718aaed6ae445443108f0538b8fd7968f5a7de0acf606731884
  Stored in directory: /tmp/pip-ephem-wheel-cache-hr3yhju5/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local

In [11]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def tokenizer_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]


def tokenizer_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


german = Field(tokenize=tokenizer_de, lower=True,
               init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'), fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Transformer(nn.Module):
    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size,
                 src_pad_idx, num_heads, num_encoder_layers, num_decoder_layers,
                 forward_expansion, dropout, max_length, device):
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_length, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_length, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(embedding_size, num_heads,
            num_encoder_layers, num_decoder_layers, forward_expansion, dropout)
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        # source shape (src_length, N)
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape
        src_positions = (torch.arange(0, src_seq_length).unsqueeze(1)
            .expand(src_seq_length, N).to(self.device))
        trg_positions = (torch.arange(0, trg_seq_length).unsqueeze(1)
            .expand(trg_seq_length, N).to(self.device))
        embed_src = self.dropout(self.src_word_embedding(src) + 
                                 self.src_position_embedding(src_positions))
        embed_trg = self.dropout(self.trg_word_embedding(trg) +
                                 self.trg_position_embedding(trg_positions))
        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(
            trg_seq_length).to(self.device)
        
        out = self.transformer(embed_src, embed_trg,
                               src_key_padding_mask=src_padding_mask,
                               tgt_mask=trg_mask)
        return self.fc_out(out)

## Utils

In [12]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # load german tokenizer
    spacy_ger = spacy.load('de')

    # create tokens using spacy and everything in lower case
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # add <sos> and <eos> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # convert to tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi['<sos>']]

    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)
        
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if best_guess == english.vocab.stoi['<eos>']:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)['src']
        trg = vars(example)['trg']

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(model, optimizer, filename):
    print('=> Saving checkpoint')
    checkpoint = {'state_dict': model.state_dict(),
                  'optimizer': optimizer.state_dict(),}
    torch.save(checkpoint, filename)


def load_checkpoint(filename, model, optimizer, lr, device):
    print('=> Loading checkpoint')
    checkpoint = torch.load(filename, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

    # if we don't do this then it will just have learning rate of old checkpoint
    # and it will lead to many hours of debugging \:
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

## Training

In [None]:
# Run TensorBoard

LOG_DIR = 'runs/loss_plot'
# Delete previous logs dir
if os.path.exists(LOG_DIR):
    !rm -rf $LOG_DIR

# To fix the error, because PyTorch and TensorFlow are installed both:
# AttributeError: module 'tensorflow._api.v2.io.gfile' has no attribute 'get_filesystem'
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Start TensorBoard before training to monitor it in progress
%tensorboard --logdir $LOG_DIR

# Reload TensorBoard
%reload_ext tensorboard

In [20]:
# training hyperparameters
num_epochs = 20  # 20
learning_rate = 3e-4
batch_size = 32

# model hyperparameters
device = 'cuda' if torch.cuda.is_available() else 'cpu'
load_model = True
checkpoint = 'transformer.pth.tar'
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
max_length = 100  # max sentence length
forward_expansion = 4
src_pad_idx = english.vocab.stoi['<pad>']

# TensorBoard
writer = SummaryWriter(LOG_DIR)
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=batch_size,
    # sort by length to minimize the number of padding
    sort_within_batch=True, sort_key=lambda x: len(x.src),
    device=device,)

model = Transformer(embedding_size, src_vocab_size, trg_vocab_size,
                    src_pad_idx, num_heads, num_encoder_layers, num_decoder_layers,
                    forward_expansion, dropout, max_length, device,).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model and os.path.exists(checkpoint):
    load_checkpoint(checkpoint, model, optimizer, learning_rate, device)

sentence = 'ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen'

for epoch in range(1, num_epochs+1):
    loop = tqdm(train_iterator, leave=False)
    loop.set_description(f'Epoch [{epoch}/{num_epochs}]')

    model.eval()
    translated_sentence = translate_sentence(model, sentence, german, english,
                                             device, max_length=max_length)
    print(translated_sentence)
    model.train()

    for batch_idx, batch in enumerate(loop):
        input_data = batch.src.to(device)
        target = batch.trg.to(device)

        # forward prop
        output = model(input_data, target[:-1])  # (target_len, batch_size, output_dim)
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)  # (target_len * batch_size)

        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        writer.add_scalar('Training loss', loss, global_step=step)
        step += 1

    save_checkpoint(model, optimizer, checkpoint)

=> Loading checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'with', 'several', 'men', 'being', 'pulled', 'by', 'a', 'large', '<unk>', 'shore', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'horse', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'with', 'several', 'men', 'is', 'pulled', 'away', 'from', 'a', 'large', '<unk>', 'of', 'water', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', '<unk>', 'bank', 'of', 'water', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', '<unk>', 'shore', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'pulled', 'by', 'a', 'large', 'horse', 'to', 'shore', 'by', 'a', 'large', 'team', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'with', 'several', 'men', 'is', 'pulled', 'onto', 'the', 'shore', 'by', 'a', 'large', 'shore', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=907.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
=> Saving checkpoint


In [21]:
# Example: Three wise men in one basin sailed across the sea in a thunderstorm.
#          If the old basin were stronger, my story would be longer.
sentence = 'Drei weise Männer in einem Becken segelten in einem Gewitter über das Meer. Wenn das alte Becken stärker wäre, wäre meine Geschichte länger.'

model.eval()
translated_sentence = translate_sentence(model, sentence, german, english,
                                         device, max_length=50)
model.train()

print(translated_sentence)

['three', 'guys', 'in', 'a', 'pool', ',', 'slow', '<unk>', 'in', 'a', 'sailing', 'discussion', 'through', 'the', 'ocean', '.', '<eos>']


In [22]:
# Bleu score standard: 32.34
score = bleu(test_data, model, german, english, device)
print(f'Bleu score: {score*100:.2f}')

Bleu score: 32.34


In [23]:
from google.colab import drive
drive.mount('/content/gdrive')

copy_to = '/content/gdrive/MyDrive/Colab Notebooks/PyTorch tutorial'

!cp -rf '$checkpoint' '$copy_to'

Mounted at /content/gdrive
