#### Import libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field,BucketIterator
from torch.utils.tensorboard import SummaryWriter # To print to tensorboard

import numpy as np
import random
import spacy
import pandas as pd

In [2]:
from utils import translate_sentence, bleu,save_checkpoint,load_checkpoint

#### Initialise language models

In [3]:
# spaCy pipelines whose tokenizers back ger_tokenizer / eng_tokenizer
spacy_ger = spacy.load("de_core_news_sm")  # German
spacy_eng = spacy.load("en_core_web_sm")  # English

#### Tokenizers

In [4]:
def ger_tokenizer(text):
    """Tokenize ``text`` with the German spaCy tokenizer and return the token strings as a list."""
    doc = spacy_ger.tokenizer(text)
    return [token.text for token in doc]

def eng_tokenizer(text):
    """Tokenize ``text`` with the English spaCy tokenizer and return the token strings as a list."""
    doc = spacy_eng.tokenizer(text)
    return [token.text for token in doc]

#### Fields for preprocessing

In [5]:
# Source-language field: lower-cased, tokenized with spaCy, wrapped in <sos> ... <eos>
german = Field(
    tokenize=ger_tokenizer,
    sequential=True,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Target-language field: same preprocessing, English tokenizer
english = Field(
    tokenize=eng_tokenizer,
    sequential=True,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

#### Train-test split

In [6]:
# Load the Multi30k train/validation/test splits.
# The `.de` side is processed by the `german` field and the `.en` side by `english`.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
)

downloading training.tar.gz


.data\multi30k\training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 665kB/s]


downloading validation.tar.gz


.data\multi30k\validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 123kB/s] 


downloading mmt_task1_test2016.tar.gz


.data\multi30k\mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 101kB/s] 


#### Build vocabulary

In [7]:
# Vocabularies are built from the training split only,
# limited through `max_size` and filtered through `min_freq`.
german.build_vocab(train_data, max_size=1000, min_freq=2)
english.build_vocab(train_data, max_size=1000, min_freq=2)

#### Encoder

In [11]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder that also produces the decoder's initial state.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embed_size: dimensionality of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embedded input.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, bidirectional=True)

        # The forward and backward final states are concatenated (2 * hidden_size
        # features); these layers learn how to mix them back down to hidden_size.
        self.fc_hidden = nn.Linear(2 * hidden_size, hidden_size)
        self.fc_cell = nn.Linear(2 * hidden_size, hidden_size)

    def forward(self, x):
        """Encode a batch of source sequences.

        Args:
            x: token indices of shape (seq_length, batch_size).

        Returns:
            Tuple ``(encoder_states, hidden, cell)``:
            ``encoder_states`` is the LSTM output for every time step,
            shape (seq_length, batch_size, 2 * hidden_size);
            ``hidden`` and ``cell`` are the projected final states,
            each of shape (1, batch_size, hidden_size).
        """
        # (seq_length, batch_size) -> (seq_length, batch_size, embed_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        encoder_states, (h_n, c_n) = self.lstm(embedded)

        # Entries 0 and 1 of the final states are joined along the feature axis:
        # two (1, batch_size, hidden_size) slices -> (1, batch_size, 2 * hidden_size)
        merged_hidden = torch.cat((h_n[0:1], h_n[1:2]), dim=2)
        merged_cell = torch.cat((c_n[0:1], c_n[1:2]), dim=2)

        # Project back to hidden_size
        hidden = self.fc_hidden(merged_hidden)
        cell = self.fc_cell(merged_cell)

        return encoder_states, hidden, cell


#### Need for concatenation

By using a bidirectional LSTM, the Encoder is able to capture information from both the past and future context of the input sequence. The forward LSTM processes the sequence in the original order, while the backward LSTM processes it in reverse order. By concatenating the hidden and cell states of both LSTMs, the Encoder captures information from both directions and provides a more comprehensive representation of the input sequence.

#### Need for fc_hidden and fc_cell

They play a role in transforming the concatenated states to the desired dimensionality. They apply a linear transformation to the concatenated states, allowing the model to learn the appropriate weights for mapping the concatenated states to the hidden size. This helps in capturing the most relevant and useful information from the bidirectional states and providing a compact representation to be used by the decoder.

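`hidden` and `cell` have shape `(num_layers * num_directions, batch_size, hidden_size)`. With a single layer, index 0 holds the forward-direction state and index 1 the backward-direction state, which is why the slicing works.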

#### Decoder

Steps
* Initialise parameters
* lstm input dimension will be `hidden_size*2+embedding_size` since the input will be a concatenation of context vector and previously generated word(embedded representation)
* Energy - used to obtain scores for the encoder states is initialised to a dimension of `hidden_size*3,1` since we feed the hidden states of encoder plus the previous hidden state of the decoder
* A softmax layer is used to obtain the attention weights

* We obtain the `sequence_length` of the encoder states. The decoder initial hidden state is repeated `sequence_length` times so that the shape matches with encoder states and the decoder has access to the same initial hidden state for every time step during decoding.
* The repeated hidden state and the encoder states are concatenated along the feature (last) dimension, giving `hidden_size*3` features, and passed through a linear layer followed by ReLU
* The output thus obtained is passed through softmax layer to obtain the attention weights
* The shapes of the attention weights and the encoder states are adjusted using `permute` so that a batched matrix multiplication (`torch.bmm`) can compute the attention-weighted sum of the encoder states, i.e. the context vector
* The context vectors are concatenated with the previous word generated and passed on to the lstm following which it is passed through feedforward neural network

In [12]:
class Decoder(nn.Module):
    """One-step LSTM decoder that attends over the encoder states.

    Args:
        input_size: number of rows in the embedding table (target vocabulary size).
        embedding_size: dimensionality of each token embedding.
        hidden_size: LSTM hidden size; encoder states are expected to carry
            2 * hidden_size features.
        output_size: number of scores produced per example by the final layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embedded input token.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)

        # LSTM input = context vector (2 * hidden_size) + embedded previous token
        self.lstm = nn.LSTM(2 * hidden_size + embedding_size, hidden_size, num_layers)

        # Scores one encoder state (2 * hidden_size) against the decoder state (hidden_size)
        self.energy = nn.Linear(3 * hidden_size, 1)
        # Normalises the scores over the source positions (dim 0)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        """Decode a single target position.

        Args:
            x: previous target tokens, shape (N,).
            encoder_states: encoder outputs, shape (seq_length, N, 2 * hidden_size).
            hidden: previous decoder hidden state, shape (1, N, hidden_size).
            cell: previous decoder cell state, shape (1, N, hidden_size).

        Returns:
            Tuple ``(predictions, hidden, cell)`` where ``predictions`` has
            shape (N, output_size) and the states are the updated LSTM states.
        """
        # (N,) -> (1, N): a sequence of length one
        step_tokens = x.unsqueeze(0)

        # (1, N, embedding_size)
        embedding = self.dropout(self.embedding(step_tokens))

        # Copy the decoder state once per source position:
        # (1, N, hidden_size) -> (seq_length, N, hidden_size)
        src_len = encoder_states.shape[0]
        h_reshaped = hidden.repeat(src_len, 1, 1)

        # One non-negative score per source position: (seq_length, N, 1)
        scored_input = torch.cat((h_reshaped, encoder_states), dim=2)
        energy = self.relu(self.energy(scored_input))

        # Attention weights sum to 1 over the source positions
        attention = self.softmax(energy)

        # (seq_length, N, 1) -> (N, 1, seq_length)
        weights = attention.permute(1, 2, 0)
        # (seq_length, N, 2 * hidden_size) -> (N, seq_length, 2 * hidden_size)
        states_batch_first = encoder_states.transpose(0, 1)

        # Weighted sum of encoder states:
        # (N, 1, 2 * hidden_size) -> (1, N, 2 * hidden_size)
        context_vector = torch.bmm(weights, states_batch_first).transpose(0, 1)

        # (1, N, 2 * hidden_size + embedding_size)
        rnn_input = torch.cat((context_vector, embedding), dim=2)

        # outputs: (1, N, hidden_size)
        outputs, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden, cell

#### Seq2Seq module

In [21]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one position at a time.

    Args:
        encoder: module whose ``forward(source)`` returns
            ``(encoder_states, hidden, cell)``.
        decoder: module whose ``forward(x, encoder_states, hidden, cell)``
            returns ``(scores, hidden, cell)`` with ``scores`` of shape
            (batch_size, vocab_size). It must expose its output layer as
            ``fc`` (as the ``Decoder`` in this notebook does) so the
            vocabulary size can be read from ``fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the encoder once, then decode every target position.

        Args:
            source: source token indices, shape (source_len, batch_size).
            target: target token indices, shape (target_len, batch_size);
                ``target[0]`` is used as the first decoder input.
            teacher_force_ratio: probability, drawn once per step, of feeding
                the ground-truth token ``target[t]`` as the next input instead
                of the decoder's own best guess.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size) on the same
            device as ``source``. Row 0 is left as zeros because no prediction
            is made for the start token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Read the vocabulary size from the decoder itself rather than from a
        # notebook-level global, so the module does not depend on hidden state.
        len_target_vocab = self.decoder.fc.out_features

        # Allocate on the input's device; a CPU buffer would clash with CUDA
        # targets when the loss is computed.
        outputs = torch.zeros(
            target_len, batch_size, len_target_vocab, device=source.device
        )

        encoder_states, hidden, cell = self.encoder(source)

        # Start token for every sequence in the batch
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            outputs[t] = output

            # output: (batch_size, vocab_size) -> highest-scoring token per example
            best_guess = output.argmax(1)

            # Teacher forcing: ground-truth token or the model's own prediction
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        # Return only after every target position has been decoded
        return outputs

#### Training hyperparameters

In [14]:
num_epochs = 20       # passes over the training iterator
learning_rate = 1e-2  # learning rate handed to the optimizer
batch_size = 64       # examples per batch

#### Model hyperparameters

In [15]:
load_model = False

# Prefer the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Vocabulary-dependent sizes: the encoder reads German, the decoder reads and emits English
input_size_encoder = len(german.vocab)
input_size_decoder = output_size = len(english.vocab)

# Architecture
encoder_embedding_size = decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1

# Regularisation
enc_dropout = dec_dropout = 0.5

#### Tensorboard

In [16]:
# Scalars are written under runs/loss_plot; `step` is the running x-axis index
step = 0
writer = SummaryWriter("runs/loss_plot")

#### Initialise iterators

Significance of `sort_within_batch`
- This makes sure that examples with similar sequence lengths are grouped together in a batch so that the need for padding is reduced and computation becomes more efficient

In [18]:
# Batches are sorted internally by source-sentence length
datasets = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets,
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

#### Instantiate encoder and decoder

In [22]:
encoder_net = Encoder(
    input_size=input_size_encoder,
    embed_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=enc_dropout,
).to(device)

decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    p=dec_dropout,
).to(device)

In [23]:
# Full model on the chosen device, optimised with Adam over all of its parameters
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

#### Loss with modification to ignore padding

In [24]:
# Positions whose target is the padding token do not contribute to the loss
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Sample sentence

In [25]:
# German sentence translated at the start of every epoch to monitor progress
sentence = (
    "ein boot mit mehreren männern darauf wird "
    "von einem großen pferdegespann ans ufer gezogen."
)

### Train the model

In [28]:
# Training loop: each epoch first translates the sample sentence as a progress
# check, then makes one pass over the training iterator.
for epoch in range(num_epochs):

    # checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}
    # save_checkpoint(checkpoint)

    # Evaluation mode (dropout disabled) for the sample translation.
    # NOTE(review): the recorded "ValueError: too many values to unpack
    # (expected 2)" presumably comes from `translate_sentence` in utils
    # unpacking the encoder's 3-tuple (encoder_states, hidden, cell) into two
    # names - confirm that utils matches this attention model.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f'Translated example sentence : \n{translated_sentence}')

    # Back to training mode; without this, the eval() call above would leave
    # dropout switched off for every training batch.
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # output: (target_len, batch_size, output_size)
        output = model(inp_data, target)

        # Drop position 0 (start token, never predicted) and flatten the first
        # two dimensions into the (examples, classes) / (examples,) layout
        # passed to the criterion.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Gradient clipping to avoid the exploding-gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Log a plain float rather than the live tensor
        writer.add_scalar('Training Loss', loss.item(), global_step=step)
        step += 1

    # Loss of the last batch of the epoch
    print(f'Epoch {epoch+1} / {num_epochs} Loss : {loss.item()}')

ValueError: too many values to unpack (expected 2)