In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
# set seeds
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

# to install spacy languages use:
# python -m spacy download en
# python -m spacy download de

In [7]:
# tokenizers - paper says dont't reverse german this time

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [8]:
# fields to preprocess data
SRC = Field(tokenize=tokenize_de, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

TRG = Field(tokenize = tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

In [9]:
# load data
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [10]:
# print example
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [11]:
# build vocabs
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
# iterators
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

# BUILD MODELS

### Encoder - similair to previous, but now using 1 layer GRU
* no dropout (GRU) cause only 1 layer, use for embeddings

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super(Encoder, self).__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

### Decoder - different from previous - reduces information compression

Instead of the GRU in the decoder taking just the embedded target token, $d(y_t)$ and the previous hidden state $s_{t-1}$ as inputs, it also takes the context vector $z$.

$$s_t = \text{DecoderGRU}(d(y_t), s_{t-1}, z)$$

* <code style="background:yellow;color:black">pass context at every time-step</code>

Before, we predicted the next token, $\hat{y}_{t+1}$, with the linear layer, $f$, only using the top-layer decoder hidden state at that time-step, $s_t$, as $\hat{y}_{t+1}=f(s_t^L)$. Now, we also pass the embedding of current token, $d(y_t)$ and the context vector, $z$ to the linear layer.

$$\hat{y}_{t+1} = f(d(y_t), s_t, z)$$

* <code style="background:yellow;color:black">linear layer takes: embedded input, hidden_state, and context vector</code>


How do these two changes reduce the information compression? Well, hypothetically the decoder hidden states, $s_t$, no longer need to contain information about the source sequence as it is always available as an input. Thus, it only needs to contain information about what tokens it has generated so far. The addition of $y_t$ to the linear layer also means this layer can directly see what the token is, without having to get this information from the hidden state.

However, this hypothesis is just a hypothesis, it is impossible to determine how the model actually uses the information provided to it (don't listen to anyone that says differently).

<img src="images/decoder_part2.png" >

In [15]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super(Decoder, self).__init__()
        
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        self.fc = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, context):
        input = input.unsqueeze(0)
        # [1, batch_size]
        embedded = self.dropout(self.embedding(input))
        # [1, batch, emd_dim]
        emb_con = torch.cat((embedded, context), dim=2) # dim=2?
        # [1, batch, emb_dim + hid_dim]
        output, hidden = self.rnn(emb_con, hidden)
        
        # flatten and pass through fc
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        #context = [1, batch size, hid dim]
        #embedded = [1, batch size, emb dim]
        
        # add emd, hidden, and context for linear layer. Check shape of output and dim. 
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim = 1) 
        
        # output = [batch_size, emd_dim + hid_dim * 2]
        
        # you do pass as [batch_size, dimension of vector]
        prediction = self.fc(output)
        #prediction = [batch, output_dim]
        
        return prediction, hidden

### Seq2Seq Model - putting encoder and decoder together

<img src="images/seq2seq_part2.png" />

Make sure hidden dimensions are same in encoder and decoder

Briefly going over all of the steps:

* the outputs tensor is created to hold all predictions, $\hat{Y}$
* the source sequence, $X$, is fed into the encoder to receive a context vector
* the initial decoder hidden state is set to be the context vector, $s_0 = z = h_T$
* we use a batch of <sos> tokens as the first input, $y_1$
* we then decode within a loop:
    * inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and the context vector, $z$, into the decoder
    * receiving a prediction, $\hat{y}_{t+1}$, and a new hidden state, $s_t$
    * we then decide if we are going to teacher force or not, setting the next input as appropriate (either the ground truth next token in the target sequence or the highest predicted next token)

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        # get dims (trg = [trg_length, batch_size])
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_len = self.decoder.output_dim
        
        #outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_len).to(self.device)
        
        context = self.encoder(src) # context
        hidden = context
        
        input = trg[0,:]  # <sos>
        
        for t in range(1, trg_length):
            
            output, hidden = self.decoder(input, hidden, context)
            #output = [batch, output_dim]
            #hidden = [1, batch size, hid dim]
            
            outputs[t] = output # add [batch, output] as row t
            
            # decide on using teacher_force
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) # 1 = columns for each row; output rows = batches, columns = vocab_len (output_dim)
            
            input = trg[t] if teacher_force else top1
            
        return outputs

# TRAINING - similar to first part

In [67]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 0.01 # (use default to match)

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

Initalize parameters: 

The paper states the parameters are initialized from a normal distribution with a mean of 0 and a standard deviation of 0.01, i.e. $\mathcal{N}(0, 0.01)$.

It also states we should initialize the recurrent parameters to a special initialization, however to keep things simple, also initialize them to $\mathcal{N}(0, 0.01)$.

In [68]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [69]:
# print number of params
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,220,293 trainable parameters


In [70]:
# optimizer
optimizer = optim.Adam(model.parameters())

In [71]:
# loss function - ignoring <pad> token
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

**TRAIN LOOP**

In [76]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
    
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        # output = [trg_length, batch_size, output_dim]
        # gotta make into 2d and start at t = 1
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim) # N x M
        trg = trg[1:].view(-1) 
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        #grad clip
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

**EVALUATION LOOP**

In [77]:
def evaluate(model, iterator, criterion):
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off t

            # output = [trg_length, batch_size, output_dim]
            # gotta make into 2d and start at t = 1

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim) # N x M
            trg = trg[1:].view(-1) 

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [78]:
# time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

**TRAINING**

In [80]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_models/best_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 22s
	Train Loss: 4.635 | Train PPL: 103.046
	 Val. Loss: 5.488 |  Val. PPL: 241.758
Epoch: 02 | Time: 0m 22s
	Train Loss: 4.283 | Train PPL:  72.466
	 Val. Loss: 5.559 |  Val. PPL: 259.664
Epoch: 03 | Time: 0m 22s
	Train Loss: 4.014 | Train PPL:  55.384
	 Val. Loss: 4.812 |  Val. PPL: 122.948
Epoch: 04 | Time: 0m 22s
	Train Loss: 3.689 | Train PPL:  40.010
	 Val. Loss: 4.438 |  Val. PPL:  84.571
Epoch: 05 | Time: 0m 22s
	Train Loss: 3.392 | Train PPL:  29.735
	 Val. Loss: 4.076 |  Val. PPL:  58.923
Epoch: 06 | Time: 0m 22s
	Train Loss: 3.123 | Train PPL:  22.704
	 Val. Loss: 3.924 |  Val. PPL:  50.626
Epoch: 07 | Time: 0m 22s
	Train Loss: 2.873 | Train PPL:  17.693
	 Val. Loss: 3.854 |  Val. PPL:  47.180
Epoch: 08 | Time: 0m 23s
	Train Loss: 2.669 | Train PPL:  14.419
	 Val. Loss: 3.763 |  Val. PPL:  43.066
Epoch: 09 | Time: 0m 22s
	Train Loss: 2.443 | Train PPL:  11.503
	 Val. Loss: 3.729 |  Val. PPL:  41.652
Epoch: 10 | Time: 0m 22s
	Train Loss: 2.276 | Train PPL

In [81]:
# load saved model and test it 
model.load_state_dict(torch.load('saved_models/best_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.676 | Test PPL:  39.485 |


In [84]:
# need to make changes cause model needs trg
def predict(model, sentence, device, max_length=50):
    #eval
    model.eval()
    with torch.no_grad():
        # tokenize
        tokenized = tokenize_de(sentence)
        toks = []
        toks.append(SRC.vocab.stoi['<sos>'])
        for tok in tokenized:
            toks.append(SRC.vocab.stoi[tok])
        src = torch.LongTensor(toks).unsqueeze(1).to(device)
        toks.append(SRC.vocab.stoi['<eos>'])
        #print(src.shape)

        # pass through encoder
        context = model.encoder(src)
        hidden = context

        # decoder one by one

        outputs = []
        input = torch.LongTensor([TRG.vocab.stoi['<sos>']]).to(device) # [2]
        #print(f"input: {input}")

        for i in range(max_length):
            output, hidden = model.decoder(input, hidden, context)
            #print(f"output shape: {output.shape}")
            #print(f"output: {output}")
            top = output.argmax(1)
            #print(f"Top: {top}")
            #print(f"Top str: {TRG.vocab.itos[top]}")
            input = top # [0]
            if top == TRG.vocab.stoi['<eos>']:
                break
            outputs.append(top)

        str_output = ""
        for v in range(len(outputs) - 1):
            str_output += str(TRG.vocab.itos[outputs[v].item()]) + " "

        str_output += str(TRG.vocab.itos[outputs[-1].item()])
    return str_output

In [85]:
outputs = predict(model, "Hallo wie geht's", device)

In [86]:
print(outputs)

photographers running running .
