In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time
# import IPython.core.debugger import set_trace

In [2]:
# random seeds to get same results
# “deterministic” algorithms: given the same input, 
# and when run on the same software and hardware, always produce the same output.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
# tokenizers
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

# to install spacy languages use:
# python -m spacy download en
# python -m spacy download de

In [7]:
# tokenizer functions
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it (paper says to)
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [8]:
# TorchText's Fields to handle how data should be processed
SRC = Field(tokenize=tokenize_de,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True)

TRG = Field(tokenize=tokenize_en,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True)

In [9]:
# download and load the train, validation and test data.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                   fields = (SRC, TRG))

In [10]:
# check sizes (don't need .examples)
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [11]:
# print example
# print(train_data.examples[0].__dict__)
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [12]:
# build vocab (always just training data)
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [13]:
# unique tokens in vocabs
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


The final step of preparing the data is to create the iterators. These can be iterated on to return a batch of data which will have a src attribute (the PyTorch tensors containing a batch of numericalized source sentences) and a trg attribute (the PyTorch tensors containing a batch of numericalized target sentences). Numericalized is just a fancy way of saying they have been converted from a sequence of readable tokens to a sequence of corresponding indexes, using the vocabulary.

Uses a BucketIterator instead of the standard Iterator as it creates batches in such a way that it minimizes the amount of padding in both the source and target sentences.

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

# BUILDING Seq2Seq model

Build model in three parts: The encoder, the decoder and a seq2seq model that encapsulates the encoder and decoder and will provide a way to interface with each.

### Encoder (2 layer LSTM)

Parameters:

* input_dim: is the size/dimensionality of the one-hot vectors that will be input to the encoder. This is equal to the input (source) vocabulary size.

* emb_dim: is the dimensionality of the embedding layer. This layer converts the one-hot vectors into dense vectors with emb_dim dimensions.

* hid_dim: is the dimensionality of the hidden and cell states.

* n_layers: is the number of layers in the RNN.

* dropout: is the amount of dropout to use. This is a regularization 
parameter to prevent overfitting. Check out this for more details about dropout.

<img src="images/encoder.png">

In [48]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emd_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emd_dim)
        self.rnn = nn.LSTM(emd_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        #src = [src len, batch size] (not batch_first)
        
        embedded = self.dropout(self.embedding(src)) # look this up more
        
        #embedded = [src len, batch size, emb dim]
        #src len == sequence length
        #if batch_first it would be [batch, src (sequence len), emb_dim (input_size)]
        
        outputs, (hidden, cell) = self.rnn(embedded) # hidden automatically created as zeros
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

## Decoder (2 layer LSTM)
The Decoder class does a single step of decoding, i.e. it ouputs single token per time-step. The first layer will receive a hidden and cell state from the previous time-step, $(s_{t-1}^1, c_{t-1}^1)$, and feeds it through the LSTM with the current embedded token, $y_t$, to produce a new hidden and cell state, $(s_t^1, c_t^1)$. The subsequent layers will use the hidden state from the layer below, $s_t^{l-1}$, and the previous hidden and cell states from their layer, $(s_{t-1}^l, c_{t-1}^l)$. This provides equations very similar to those in the encoder.

$$\begin{align*}
(s_t^1, c_t^1) = \text{DecoderLSTM}^1(d(y_t), (s_{t-1}^1, c_{t-1}^1))\\
(s_t^2, c_t^2) = \text{DecoderLSTM}^2(s_t^1, (s_{t-1}^2, c_{t-1}^2))
\end{align*}$$


The initial hidden and cell states to our decoder are our context vectors, which are the final hidden and cell states of our encoder from the same layer, i.e. $(s_0^l,c_0^l)=z^l=(h_T^l,c_T^l)$.


* Output_dim: the size of the vocabulary for the output/target
<img src="images/decoder.png">

In [49]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emd_dim, hid_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emd_dim)
        self.rnn = nn.LSTM(emd_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        #input = [batch size] cause only 1 time at a time
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedding = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        # get rid of the sentence length dimension (check this)
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

# Seq2Seq

This will handle:

* receiving the input/source sentence
* using the encoder to produce the context vectors
* using the decoder to produce the predicted output/target sentence

<img src="images/seq2seq4.png">

During each iteration of the loop, we:

* pass the input, previous hidden and previous cell states ($y_t, s_{t-1}, c_{t-1}$) into the decoder
* receive a prediction, next hidden state and next cell state ($\hat{y}_{t+1}, s_{t}, c_{t}$) from the decoder
* place our prediction, $\hat{y}_{t+1}$/output in our tensor of predictions, $\hat{Y}$/outputs
* decide if we are going to "teacher force" or not
* if we do, the next input is the ground-truth next token in the sequence, $y_{t+1}$/trg[t]
* if we don't, the next input is the predicted next token in the sequence, $\hat{y}_{t+1}$/top1, which we get by doing an argmax over the output tensor
Once we've made all of our predictions, we return our tensor full of predictions, $\hat{Y}$/outputs.

**Note:** our decoder loop starts at 1, not 0. This means the 0th element of our outputs tensor remains all zeros. So our trg and outputs look something like:

$$\begin{align*}
\text{trg} = [<sos>, y_1, y_2, y_3, <eos>]\\
\text{outputs} = [0, \hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$
Later on when we calculate the loss, we cut off the first element of each tensor to get:

$$\begin{align*}
\text{trg} = [y_1, y_2, y_3, <eos>]\\
\text{outputs} = [\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

In [50]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> token
        input = trg[0,:]
        #print(f"decoder input in training: {input}")
        #print(f"input shape: {input.shape}")
        
        for t in range(1, trg_length):
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not -- check this
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
            
        return outputs

# Training Seq2Seq model

* The input and output dimensions are defined by the size of the vocabulary. 
* The embedding dimesions and dropout for the encoder and decoder can be different, but the number of layers and the size of the hidden/cell states must be the same.



In [51]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 0.01

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

Initialize the weights of our model. In the paper they state they initialize all weights from a uniform distribution between -0.08 and +0.08, i.e. $\mathcal{U}(-0.08, 0.08)$.

Initialize weights in PyTorch by creating a function which we apply to our model. When using apply, the init_weights function will be called on every module and sub-module within our model. For each module we loop through all of the parameters and sample them from a uniform distribution with nn.init.uniform_.

In [52]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

calculate number of trainable parameters in model

In [53]:
def count_parameters(model):
    # numel: returns the total number of elements in the input tensor
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,899,013 trainable parameters


In [54]:
# optimizer
# lr default = 1; change this 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Next, define loss function. The CrossEntropyLoss function calculates both the log softmax as well as the negative log-likelihood of predictions.

loss function calculates the average loss per token, however by passing the index of the 'pad' token as the ignore_index argument it ignores the loss whenever the target token is a padding token.

In [55]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

Training Loop:

set the model into "training mode" with model.train(). This will turn on dropout (and batch normalization, which isn't being using) and then iterate through our data iterator.


decoder loop starts at 1, not 0. This means the 0th element of our outputs tensor remains all zeros. So our trg and outputs look something like:

$$\begin{align*}
\text{trg} = [<sos>, y_1, y_2, y_3, <eos>]\\
\text{outputs} = [0, \hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$
Here, when we calculate the loss, we cut off the first element of each tensor to get:

$$\begin{align*}
\text{trg} = [y_1, y_2, y_3, <eos>]\\
\text{outputs} = [\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

At each iteration:

* get the source and target sentences from the batch, $X$ and $Y$
* zero the gradients calculated from the last batch
* feed the source and target into the model to get the output, $\hat{Y}$
* as the loss function only works on 2d inputs with 1d targets we need to flatten each of them with .view
 * we slice off the first column of the output and target tensors as mentioned above
* calculate the gradients with loss.backward()
* clip the gradients to prevent them from exploding (a common issue in RNNs)
* update the parameters of the model by doing an optimizer step
* sum the loss value to a running total

Finally, return the loss that is averaged over all batches.

In [56]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #output = [(trg len - 1) * batch size, output dim]
        #trg = [(trg len - 1) * batch size]
        
        loss = criterion(output, trg)
        loss.backward()
        
        # grad clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator) #(cause this is only 1 epoch)

Evaluation Loop:

Similar to train execpt model.eval()

In [57]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        
        for i, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            output = model(src, trg, 0) #turn off teacher forcing
            
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]
            
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [58]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Start Training

At each epoch, checking if model has achieved the best validation loss so far. If it has, update the best validation loss and save the parameters of the model (called state_dict in PyTorch). Then, when its time to test the model, we'll use the saved parameters used to achieve the best validation loss.

Printing out both the loss and the perplexity at each epoch. It is easier to see a change in perplexity than a change in loss as the numbers are much bigger.

In [None]:
N_EPOCHS = 100
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_models/best_model.pt')
        
    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\tVal Loss: {valid_loss:.3f} | Val PPL: {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 0m 24s
	Train Loss: 4.836 | Train PPL: 125.943
	Val Loss: 5.235 | Val PPL: 187.767
Epoch: 02 | Time: 0m 24s
	Train Loss: 4.348 | Train PPL:  77.321
	Val Loss: 5.051 | Val PPL: 156.211
Epoch: 03 | Time: 0m 25s
	Train Loss: 4.224 | Train PPL:  68.325
	Val Loss: 5.009 | Val PPL: 149.706
Epoch: 04 | Time: 0m 25s
	Train Loss: 4.122 | Train PPL:  61.702
	Val Loss: 4.757 | Val PPL: 116.408
Epoch: 05 | Time: 0m 24s
	Train Loss: 4.003 | Train PPL:  54.754
	Val Loss: 4.689 | Val PPL: 108.756
Epoch: 06 | Time: 0m 25s
	Train Loss: 3.897 | Train PPL:  49.260
	Val Loss: 4.556 | Val PPL:  95.213
Epoch: 07 | Time: 0m 25s
	Train Loss: 3.835 | Train PPL:  46.283
	Val Loss: 4.491 | Val PPL:  89.232
Epoch: 08 | Time: 0m 25s
	Train Loss: 3.785 | Train PPL:  44.040
	Val Loss: 4.442 | Val PPL:  84.920
Epoch: 09 | Time: 0m 25s
	Train Loss: 3.711 | Train PPL:  40.912
	Val Loss: 4.445 | Val PPL:  85.178
Epoch: 10 | Time: 0m 24s
	Train Loss: 3.675 | Train PPL:  39.432
	Val Loss: 4.532 | Val PPL

In [63]:
model.load_state_dict(torch.load('saved_models/best_model.pt'))
#map_location=torch.device('cpu')

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.393 | Test PPL:  80.910 |


In [64]:
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [74]:
# need to make changes cause model needs trg
def predict(model, sentence, device, max_length=50):
    #eval
    model.eval()
    with torch.no_grad():
        # tokenize
        tokenized = tokenize_de(sentence)
        toks = []
        toks.append(SRC.vocab.stoi['<sos>'])
        for tok in tokenized:
            toks.append(SRC.vocab.stoi[tok])
        src = torch.LongTensor(toks).unsqueeze(1).to(device)
        toks.append(SRC.vocab.stoi['<eos>'])
        #print(src.shape)

        # pass through encoder
        hidden, cell = model.encoder(src)
        #print(hidden.shape)
        #print(cell.shape)

        # decoder one by one

        outputs = []
        input = torch.LongTensor([TRG.vocab.stoi['<sos>']]).to(device) # [2]
        #print(f"input: {input}")

        for i in range(max_length):
            output, hidden, cell = model.decoder(input, hidden, cell)
            #print(f"output shape: {output.shape}")
            #print(f"output: {output}")
            top = output.argmax(1)
            #print(f"Top: {top}")
            #print(f"Top str: {TRG.vocab.itos[top]}")
            input = top # [0]
            if top == TRG.vocab.stoi['<eos>']:
                break
            outputs.append(top)

        str_output = ""
        for v in range(len(outputs) - 1):
            str_output += str(TRG.vocab.itos[outputs[v].item()]) + " "

        str_output += str(TRG.vocab.itos[outputs[-1].item()])
    return str_output

In [77]:
outputs = predict(model, "Hallo wie geht's", device)

In [78]:
print(outputs)

a bride is in a a . <eos>
