# Seq2Seq with Attention

[Original video](https://youtu.be/sQUqQddQtB4)

Resources:
  * [C5W3L07 Attention Model Intuition](https://youtu.be/SysgYptB198)
  * [C5W3L08 Attention Model](https://youtu.be/quoGRI-1l0A)
  * [Ben Trevett GitHub](https://github.com/bentrevett/pytorch-seq2seq) + [Seq2Seq CoLab](https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb)
  * Paper [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)
  * Paper [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)
  * [NLP From Scratch: Translation with a Sequence to Sequence Network and Attention](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)

[Multi30k dataset](https://github.com/multi30k/dataset): a dataset of ~30,000 parallel English, German and French sentences, with roughly 12 words per sentence.

## Model

In [None]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import random
import numpy as np

from tqdm.notebook import tqdm
from torchtext.data.metrics import bleu_score
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
from torch.utils.tensorboard import SummaryWriter

In [None]:
!python -m spacy download de

✔ Download and installation successful
You can now load the model via spacy.load('de_core_news_sm')
✔ Linking successful
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [None]:
# Load the spaCy pipelines once at module level; the tokenizer helpers below
# reuse them for every sentence. The 'de'/'en' names are shortcut links
# (see the download cell output above for how 'de' is linked).
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def tokenizer_de(text):
    """Split German ``text`` into a list of token strings.

    Uses the tokenizer of the module-level ``spacy_de`` pipeline.
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]


def tokenizer_en(text):
    """Split English ``text`` into a list of token strings.

    Uses the tokenizer of the module-level ``spacy_en`` pipeline.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


# Source (German) and target (English) text fields: tokens are lower-cased and
# every sequence is wrapped in <sos> ... <eos> markers.
german = Field(tokenize=tokenizer_de, lower=True,
               init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

# Multi30k German -> English splits; examples expose `.src` (German) and
# `.trg` (English), matching the order of `exts` / `fields`.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'), fields=(german, english))

# Vocabularies are built from the training split only, capped via `max_size`
# and restricted to tokens meeting `min_freq`.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Encoder(nn.Module):
    """Bidirectional LSTM encoder for the attention-based Seq2Seq model.

    Args:
        input_size: size of the source vocabulary (embedding rows).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings.

    NOTE(review): ``forward`` merges only the states at indices 0 and 1 of the
    LSTM's final hidden/cell tensors, so it appears to assume ``num_layers=1``
    -- confirm before using deeper encoders.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p=p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size,
                           num_layers=num_layers, bidirectional=True)

        # Forward and backward states are concatenated and projected back to
        # `hidden_size` so they can seed the (unidirectional) decoder.
        bidirectional_size = 2 * hidden_size
        self.fc_hidden = nn.Linear(bidirectional_size, hidden_size)
        self.fc_cell = nn.Linear(bidirectional_size, hidden_size)

    def forward(self, x):
        """Encode a batch of source token indices.

        Args:
            x: token indices of shape (seq_length, N).

        Returns:
            Tuple ``(encoder_states, hidden, cell)`` where ``encoder_states``
            is (seq_length, N, hidden_size*2) and ``hidden`` / ``cell`` are
            the projected final states of shape (1, N, hidden_size).
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # h_n / c_n: (2, N, hidden_size) for a single bidirectional layer
        encoder_states, (h_n, c_n) = self.rnn(embedded)

        # Concatenate the two directions along the feature axis:
        # (1, N, hidden_size*2)
        merged_hidden = torch.cat((h_n[0:1], h_n[1:2]), dim=2)
        merged_cell = torch.cat((c_n[0:1], c_n[1:2]), dim=2)

        # Project back down: (1, N, hidden_size*2) -> (1, N, hidden_size)
        hidden = self.fc_hidden(merged_hidden)
        cell = self.fc_cell(merged_cell)

        return encoder_states, hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder with additive-style attention.

    At every step the previous decoder hidden state is scored against each
    encoder state, the scores are normalised over the source positions and
    the resulting context vector is fed to the LSTM together with the
    embedding of the previous target token.

    Args:
        input_size: size of the target vocabulary (embedding rows).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: LSTM hidden size; encoder states are expected to have
            ``hidden_size*2`` features.
        output_size: number of output classes (target vocabulary size).
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings.

    NOTE(review): ``forward`` tiles ``hidden`` along dim 0 before
    concatenating it with ``encoder_states``; that only lines up when
    ``hidden`` has a leading dimension of 1 (``num_layers=1``) -- confirm.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size,
                 num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p=p)
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Features of one encoder state (forward + backward directions).
        context_size = 2 * hidden_size
        self.rnn = nn.LSTM(context_size + embedding_size, hidden_size,
                           num_layers)

        # Scorer input: one encoder state plus the previous decoder hidden
        # state, i.e. hidden_size*3 features in total.
        self.energy = nn.Linear(context_size + hidden_size, 1)
        # Normalise scores over the source-sequence axis.
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        """Run one decoding step.

        Args:
            x: previous target token indices, shape (N,).
            encoder_states: encoder outputs, (seq_length, N, hidden_size*2).
            hidden: previous decoder hidden state, (1, N, hidden_size).
            cell: previous decoder cell state, (1, N, hidden_size).

        Returns:
            Tuple ``(predictions, hidden, cell)`` with ``predictions`` of
            shape (N, output_size) and the updated LSTM states.
        """
        # (N,) -> (1, N) -> (1, N, embedding_size)
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        # Pair the decoder state with every encoder position and score it.
        num_source_steps = encoder_states.shape[0]
        tiled_hidden = hidden.repeat(num_source_steps, 1, 1)
        scorer_input = torch.cat((tiled_hidden, encoder_states), dim=2)
        scores = self.relu(self.energy(scorer_input))

        # Attention weights over source positions: (seq_length, N, 1)
        weights = self.softmax(scores)

        # Weighted sum of encoder states over the sequence axis:
        #   weights        (seq_length, N, 1)             -> 'snk'
        #   encoder_states (seq_length, N, hidden_size*2) -> 'snl'
        #   context        (1, N, hidden_size*2)          -> 'knl'
        context = torch.einsum('snk,snl->knl', weights, encoder_states)

        # (1, N, hidden_size*2 + embedding_size)
        lstm_input = torch.cat((context, embedded), dim=2)

        # lstm_out: (1, N, hidden_size)
        lstm_out, (hidden, cell) = self.rnn(lstm_input, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        logits = self.fc(lstm_out)
        return logits.squeeze(0), hidden, cell


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping ``source`` to
            ``(encoder_states, hidden, cell)``.
        decoder: module called as
            ``decoder(x, encoder_states, hidden, cell)`` and returning
            ``(predictions, hidden, cell)``. It must expose its output layer
            as ``decoder.fc`` so the vocabulary size can be read from
            ``decoder.fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Compute decoder predictions for every target position.

        Args:
            source: source token indices, shape (source_len, N).
            target: target token indices, shape (target_len, N); row 0 is
                used as the first decoder input (start token).
            teacher_force_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own prediction as the next input. It is applied
                regardless of the module's train/eval mode.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size) on the same
            device as ``source``. Position 0 is never predicted and stays
            all zeros.
        """
        target_len = target.shape[0]
        batch_size = source.shape[1]
        # Read sizes/devices from the model and its inputs rather than from
        # notebook globals so the module works in any context.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                              device=source.device)
        encoder_states, hidden, cell = self.encoder(source)

        x = target[0]  # start token
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(dim=1)  # (N,)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

## Utils

In [None]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into English tokens.

    Args:
        model: object exposing ``encoder`` and ``decoder`` callables with the
            interfaces used below. The caller is responsible for putting the
            model in eval mode.
        sentence: either a raw string (tokenized with spaCy) or an iterable
            of already tokenized strings.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        device: device on which the input tensors are created.
        max_length: maximum number of tokens to generate.

    Returns:
        List of generated English tokens without the start token. The list
        ends with the end-of-sentence token unless ``max_length`` was reached
        first.
    """
    if isinstance(sentence, str):
        # Only raw strings need the spaCy pipeline; loading it lazily spares
        # pre-tokenized input (e.g. dataset examples) a model load per call.
        spacy_ger = spacy.load('de')
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap in the source start/end markers and map tokens to vocab indices.
    tokens = [german.init_token, *tokens, german.eos_token]
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # (source_len,) -> (source_len, 1): a batch containing one sentence
    sentence_tensor = torch.tensor(
        text_to_indices, dtype=torch.long, device=device).unsqueeze(1)

    sos_idx = english.vocab.stoi[english.init_token]
    eos_idx = english.vocab.stoi[english.eos_token]
    outputs = [sos_idx]

    with torch.no_grad():
        encoder_states, hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.tensor(
                [outputs[-1]], dtype=torch.long, device=device)
            output, hidden, cell = model.decoder(
                previous_word, encoder_states, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    # Drop the start token and map indices back to strings.
    return [english.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` on ``data``.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists.
        model: Seq2Seq model passed through to ``translate_sentence``.
        german: source field.
        english: target field; its ``eos_token`` is stripped from predictions.
        device: device used for translation.

    Returns:
        The value returned by ``bleu_score`` for the predicted token lists
        against the single reference per example.
    """
    targets = []
    outputs = []

    # Score in eval mode so dropout does not perturb the translations, then
    # restore whatever mode the caller had selected.
    was_training = model.training
    model.eval()
    try:
        for example in data:
            src = vars(example)['src']
            trg = vars(example)['trg']

            prediction = translate_sentence(model, src, german, english, device)
            # Only strip a trailing <eos>; when decoding stopped at the length
            # limit the last token is a real word and must be kept.
            if prediction and prediction[-1] == english.eos_token:
                prediction = prediction[:-1]

            targets.append([trg])
            outputs.append(prediction)
    finally:
        model.train(was_training)

    return bleu_score(outputs, targets)


def save_checkpoint(model, optimizer, filename):
    """Serialize the model and optimizer state dicts to ``filename``.

    The checkpoint is a dict with the keys ``'state_dict'`` (model) and
    ``'optimizer'``, written with ``torch.save``.
    """
    print("=> Saving checkpoint")
    state = {}
    state['state_dict'] = model.state_dict()
    state['optimizer'] = optimizer.state_dict()
    torch.save(state, filename)


def load_checkpoint(filename, model, optimizer, lr, device):
    """Restore model and optimizer state from a checkpoint, then reset the LR.

    Args:
        filename: checkpoint written by ``save_checkpoint`` (keys
            ``'state_dict'`` and ``'optimizer'``).
        model: module whose parameters are overwritten in place.
        optimizer: optimizer whose state is overwritten in place.
        lr: learning rate to apply to every parameter group after loading.
        device: ``map_location`` used when deserializing tensors.
    """
    print('=> Loading checkpoint')
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    state = torch.load(filename, map_location=device)
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])

    # The restored optimizer state carries the learning rate that was saved
    # with it; override it so the currently configured `lr` takes effect.
    for group in optimizer.param_groups:
        group['lr'] = lr

## Training

In [None]:
# Set up and launch TensorBoard inside the notebook.

LOG_DIR = 'runs/loss_plot'
# Start from a clean slate: remove logs written by previous runs.
if os.path.exists(LOG_DIR):
    !rm -rf $LOG_DIR

# Workaround for an error that occurs when both PyTorch and TensorFlow are
# installed:
# AttributeError: module 'tensorflow._api.v2.io.gfile' has no attribute 'get_filesystem'
# Pointing tf.io.gfile at TensorBoard's stub implementation avoids it.
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Start TensorBoard before training so progress can be monitored live
%tensorboard --logdir $LOG_DIR

# Reload the extension (useful when this cell is re-run)
%reload_ext tensorboard

In [None]:
# training hyperparameters
num_epochs = 20  # number of passes over the training iterator
learning_rate = 0.001
batch_size = 64

# checkpointing / runtime settings
load_model = True  # resume from `checkpoint` when the file exists
checkpoint = 'seq2seq_attention.pth.tar'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 100
decoder_embedding_size = 100
hidden_size = 1024
# NOTE(review): Encoder/Decoder appear to assume a single LSTM layer (they
# merge/tile the state along dim 0) -- confirm before raising this.
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

# TensorBoard: `step` counts optimizer updates across all epochs
writer = SummaryWriter(LOG_DIR)
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=batch_size,
    # sort by source length within each batch to minimize the amount of padding
    sort_within_batch=True, sort_key=lambda x: len(x.src),
    device=device,)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                      num_layers, enc_dropout).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size,
                      output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padded target positions must not contribute to the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Resume training; load_checkpoint also re-applies `learning_rate`.
if load_model and os.path.exists(checkpoint):
    load_checkpoint(checkpoint, model, optimizer, learning_rate, device)

# Fixed example sentence translated at the start of every epoch as a quick
# qualitative progress check.
sentence = 'ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen.'

for epoch in range(1, num_epochs+1):
    loop = tqdm(train_iterator, leave=False)
    loop.set_description(f'Epoch [{epoch}/{num_epochs}]')

    # Translate in eval mode (no dropout), then switch back for training.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, german, english,
                                             device, max_length=50)
    print(translated_sentence)
    model.train()

    for batch_idx, batch in enumerate(loop):
        input_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(input_data, target)  # (target_len, batch_size, output_dim)
        # Skip position 0 (the start token, never predicted) and flatten so
        # the loss sees one row of logits per target token.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)  # ((target_len - 1) * batch_size)

        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        writer.add_scalar('Training loss', loss, global_step=step)
        step += 1

    # Persist model and optimizer state after every epoch.
    save_checkpoint(model, optimizer, checkpoint)

=> Loading checkpoint


HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'being', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

['an', 'boat', 'carrying', 'several', 'men', 'being', 'pulled', 'to', 'shore', 'of', 'a', 'large', 'pile', 'of', 'horses', 'horses', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

['a', 'boat', 'carrying', 'several', 'men', 'is', 'stopped', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', 'horses', '.', '<eos>']
=> Saving checkpoint


HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

['an', 'boat', 'carrying', 'several', 'men', 'is', 'being', 'lowered', 'by', 'a', 'large', 'team', 'of', 'horses', 'horses', '.', '<eos>']
=> Saving checkpoint


In [None]:
# Translate a custom German sentence with the trained model.
# The sentence below roughly means:
# 'the young man said the film was not too terrible'
sentence = 'Der junge Mann sagte, der Film sei nicht zu schrecklich'

# Switch to eval mode for inference, then back to train mode afterwards.
model.eval()
translated_sentence = translate_sentence(model, sentence, german, english,
                                         device, max_length=50)
model.train()

print(translated_sentence)

['the', 'young', 'man', 'punching', 'to', 'dressed', 'at', 'the', 'little', '.', '<eos>']


In [None]:
# Evaluate on the test split; the score is scaled by 100 for display.
# Scores recorded from earlier runs of this cell:
# old Bleu score: 20.85
# new Bleu score: 20.62
score = bleu(test_data, model, german, english, device)
print(f'Bleu score: {score*100:.2f}')

Bleu score: 20.62


In [None]:
# Back up the checkpoint to Google Drive (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

# Destination folder on the mounted drive.
copy_to = '/content/gdrive/MyDrive/Colab Notebooks/PyTorch tutorial'

# Quoted because the destination path contains spaces.
!cp -rf '$checkpoint' '$copy_to'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
