In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

In [2]:
dev = "cuda" if torch.cuda.is_available() else "cpu"
print(dev)

cuda


### For reproducibility

In [3]:
MANUAL_SEED = 42
random.seed(MANUAL_SEED)
np.random.seed(MANUAL_SEED)
torch.cuda.manual_seed(MANUAL_SEED)

### Loading the data and preprocessing...

In [4]:
dataset = datasets.load_dataset("bentrevett/multi30k") # A subset of Multi30K dataset hosted on HuggingFace

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [6]:
train_data, val_data, test_data = (dataset['train'], dataset['validation'], dataset['test'])
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [7]:
# Download these models before loading (use > python -m spacy download model_name)
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

In [8]:
def tokenize(input, en_nlp, de_nlp,  max_length, lower, sos_token, eos_token):
    en_tokens = [t.text for t in en_nlp.tokenizer(input["en"])][:max_length]
    de_tokens = [t.text for t in de_nlp.tokenizer(input["de"])][:max_length]
    if lower:
        en_tokens = [t.lower() for t in en_tokens]
        de_tokens = [t.lower() for t in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [9]:
# Generating tokens on the dataset
max_length = 1000
lower = True
sos = "<sos>"
eos = "<eos>"
fn_kwargs = {"de_nlp": spacy_de, "en_nlp": spacy_en, "max_length": max_length, "lower":lower, "sos_token": sos, "eos_token":eos}

train_data = train_data.map(tokenize, fn_kwargs=fn_kwargs)
val_data = val_data.map(tokenize, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize, fn_kwargs=fn_kwargs)

In [10]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

### Building vocabularies

- Need to have <unk> tokens in the training set in order to be able to handle unknown words in the test set
- A nice trick to introduce '<unk>' tokens into our training set is to use the min_freq argument in the build_vocab_from_iterator

In [11]:
MIN_FREQ = 2
unk = "<unk>"
pad = "<pad>"
spl_tokens = [unk, pad, sos, eos]
en_voc = torchtext.vocab.build_vocab_from_iterator(train_data["en_tokens"], min_freq=MIN_FREQ, specials=spl_tokens)
de_voc = torchtext.vocab.build_vocab_from_iterator(train_data["de_tokens"], min_freq=MIN_FREQ, specials=spl_tokens)


In [12]:
# First 10 tokens in the english vocabulary
en_voc.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man']

In [13]:
# to get the index of a token use get_stoi
en_voc.get_stoi()["the"] # or simply use en_voc["the"] 

7

In [14]:
# sizes of the vocabularies built...
print(f'English vocab size: {len(en_voc)}')
print(f'German vocab size: {len(de_voc)}')

English vocab size: 5893
German vocab size: 7853


In [15]:
# Caching the locations for later use
assert en_voc[unk] == de_voc[unk]
assert en_voc[pad] == de_voc[pad]
unk_idx = en_voc[unk]
pad_idx = en_voc[pad]

In [16]:
# For handling unknown tokens, we need to explicitly tell torchtext to map it to unk
en_voc.set_default_index(unk_idx)
de_voc.set_default_index(unk_idx)

In [17]:
def numerize_input(input, en_voc, de_voc):
    en_idx = en_voc.lookup_indices(input["en_tokens"])
    de_idx = de_voc.lookup_indices(input["de_tokens"])
    return {"en_idx":en_idx, "de_idx": de_idx}

fn_kwargs = {"en_voc": en_voc, "de_voc": de_voc}
train_data = train_data.map(numerize_input, fn_kwargs=fn_kwargs)
val_data = val_data.map(numerize_input, fn_kwargs=fn_kwargs)
test_data = test_data.map(numerize_input, fn_kwargs=fn_kwargs)

In [18]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_idx': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'de_idx': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]}

In [19]:
# Converting the token indices to Pytorch tensors...
target_data_type = "torch"
format_cols = ["en_idx", "de_idx"]

train_data = train_data.with_format(type=target_data_type, columns=format_cols, output_all_columns=True)
val_data = val_data.with_format(type=target_data_type, columns=format_cols, output_all_columns=True)
test_data = test_data.with_format(type=target_data_type, columns=format_cols, output_all_columns=True)
train_data[0]

{'en_idx': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            3]),
 'de_idx': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [20]:
type(train_data[0]["en_idx"]) == torch.Tensor

True

In [21]:
# Combining a batch of instances into a batch suitable for dataloader using a closure
def call_create_batch(pad_idx):
    def create_batch(batch):
        batch_en_idx = [sentence["en_idx"] for sentence in batch]
        batch_de_idx = [sentence["de_idx"] for sentence in batch]
        batch_en_idx = nn.utils.rnn.pad_sequence(batch_en_idx, padding_value=pad_idx)
        batch_de_idx = nn.utils.rnn.pad_sequence(batch_de_idx, padding_value=pad_idx)
        batch = {"en_idx": batch_en_idx, "de_idx": batch_de_idx}
        return batch
    return create_batch

In [22]:
# Creating the dataloader...
def create_dataloader(dataset, batch_size, pad_idx, shuffle=False):
    collate_fn = call_create_batch(pad_idx)
    dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=shuffle)
    return dataloader

BATCH_SIZE = 128
train_dataloader = create_dataloader(train_data, BATCH_SIZE, pad_idx, shuffle=True)
val_dataloader = create_dataloader(val_data, BATCH_SIZE, pad_idx) # No need to shuffle for val / test splits
test_dataloader = create_dataloader(test_data, BATCH_SIZE, pad_idx)

### Implementing the Encoder & Decoder

In [23]:
# Implementing the Encoder...
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_size, num_layers, dropout_prob):
        '''
        Args:
        input_dim : input vocabulary size
        embedding_dim : size of dense vectors produced by the embedding layer
        hidden_size : dimensionality of hidden / cell states in the LSTM 
        num_layers : number of layers in the LSTM
        dropout_prob: dropout probability to be used
        '''
        super(Encoder, self).__init__()
        # Caching these two parameters for verification later
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Defining the layers
        self.dropout = nn.Dropout(dropout_prob)
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                            num_layers=num_layers, dropout=dropout_prob)
        
    
    def forward(self, input):
        '''
        input dim: (seq_length, N)
        '''
        embedded = self.dropout(self.embedding(input)) # (seq_length, N, embedding_dim)
        _, (hidden_state, cell_state) = self.lstm(embedded)
        return hidden_state, cell_state
    

# Implementing the decoder...
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_size, num_layers, dropout_prob):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        # Defining the layers
        self.dropout = nn.Dropout(dropout_prob)
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, 
                            num_layers=num_layers, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden_state, cell_state):
        input = input.unsqueeze(0) # Prepend a 1 denoting seq length as we will be decoding one token at a time
        embedded = self.dropout(self.embedding(input))
        outputs, (hidden_state, cell_state) = self.lstm(embedded, (hidden_state, cell_state))
        preds = self.fc(outputs.squeeze(0))
        return preds, hidden_state, cell_state

### Seq2Seq with Teacher Forcing

In [24]:
class Seq2Seq_with_TF(nn.Module):
    def __init__(self, encoder, decoder, dev):
        super().__init__()
        self.enc = encoder
        self.dec = decoder
        self.dev = dev

        # Following the architecture in the paper
        assert encoder.hidden_size == decoder.hidden_size, "Encoder & Decoder hidden dimensions don't match"
        assert encoder.num_layers == decoder.num_layers, "Number of layers inEncoder & Decoder don't match"
    
    def forward(self, src, tgt, tf_ratio):
        batch_sz = tgt.shape[1] # input --> (input_len, batch_size) and tgt --> (tgt_len, batch_size)
        tgt_length = tgt.shape[0]
        tgt_voc_sz = self.dec.output_size
        outputs = torch.zeros(tgt_length, batch_sz, tgt_voc_sz).to(self.dev)
        
        hidden_state, cell_state = self.enc(src)
        dec_input = tgt[0, :] # <sos> tokens

        for t in range(1, tgt_length): # as we know the number of tokens in the target sentence
            output, hidden, cell = self.dec(dec_input, hidden_state, cell_state)
            outputs[t] = output
            teacher_force = random.random() < tf_ratio
            # sampling from our prediction
            top_preds = output.argmax(1)
            dec_input = tgt[t] if teacher_force else top_preds
        
        return outputs # note that outputs[0] remains all zeros
    

### Initializing the model...

In [33]:
input_dim = len(de_voc)
output_dim = len(en_voc)
enc_embed_dim = 256
dec_embed_dim = 256
hidden_size = 512
n_layers = 2 # instead of 4 in the paper, just to save some time!
drop_prob = 0.5

encoder = Encoder(input_dim, enc_embed_dim, hidden_size, n_layers, drop_prob)
decoder = Decoder(output_dim, dec_embed_dim, hidden_size, n_layers, drop_prob)
model = Seq2Seq_with_TF(encoder, decoder, dev).to(dev)

### Weight initialization (following the paper)

In [34]:
def initialize_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(initialize_weights)

Seq2Seq_with_TF(
  (enc): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7853, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (dec): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=5893, bias=True)
  )
)

In [35]:
# Count the number of learnable parameters in our model

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Our Seq2Seq model has {count_parameters(model):,} trainable parameters')

Our Seq2Seq model has 13,898,501 trainable parameters


### Loss function & Optimizer

In [36]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters())

### Training the model

In [37]:
def train_seq2seq(model, dataloader, optimizer, criterion, clip, tf_ratio, dev):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(dataloader):
        src = batch["de_idx"].to(dev)
        tgt = batch["en_idx"].to(dev)
        optimizer.zero_grad()
        output = model(src, tgt, tf_ratio)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader) # avg loss 

### Evaluating the model
- Note: We need to turn off teacher forcing during evaluation as we always wanna use our prediction as input at the subsequent time step in the decoder


In [38]:
def eval_seq2seq(model, dataloader, criterion, dev):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            src = batch["de_idx"].to(dev)
            tgt = batch["en_idx"].to(dev)
            optimizer.zero_grad()
            output = model(src, tgt, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            tgt = tgt[1:].view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

### Training the model...

In [39]:
NUM_EPOCHS = 10
CLIP = 1.0
TF_RATIO = 0.5

best_val_loss = float("inf")

for epoch in tqdm.tqdm(range(NUM_EPOCHS)):
    train_loss = train_seq2seq(model, train_dataloader, optimizer, criterion, CLIP, TF_RATIO, dev)
    val_loss = eval_seq2seq(model, val_dataloader, criterion, dev)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "model.pt")
    print(f"Train loss: {train_loss: 7.4f} | Train perplexity: {np.exp(train_loss):7.4f} ")
    print(f"Validation loss: {val_loss: 7.4f} | Validation perplexity: {np.exp(train_loss):7.4f}")

 10%|█         | 1/10 [01:13<11:02, 73.58s/it]

Train loss:  5.4470 | Train perplexity: 232.0525 
Validation loss:  5.2911 | Validation perplexity: 232.0525


 20%|██        | 2/10 [02:29<09:57, 74.74s/it]

Train loss:  5.0787 | Train perplexity: 160.5638 
Validation loss:  5.4047 | Validation perplexity: 160.5638


 30%|███       | 3/10 [03:50<09:05, 77.98s/it]

Train loss:  4.9143 | Train perplexity: 136.2286 
Validation loss:  5.2198 | Validation perplexity: 136.2286


 40%|████      | 4/10 [05:12<07:56, 79.44s/it]

Train loss:  4.8379 | Train perplexity: 126.2043 
Validation loss:  5.2838 | Validation perplexity: 126.2043


 50%|█████     | 5/10 [06:35<06:42, 80.51s/it]

Train loss:  4.7455 | Train perplexity: 115.0668 
Validation loss:  5.2218 | Validation perplexity: 115.0668


 60%|██████    | 6/10 [07:57<05:25, 81.27s/it]

Train loss:  4.6902 | Train perplexity: 108.8776 
Validation loss:  5.1918 | Validation perplexity: 108.8776


 70%|███████   | 7/10 [09:20<04:05, 81.72s/it]

Train loss:  4.6466 | Train perplexity: 104.2278 
Validation loss:  5.1859 | Validation perplexity: 104.2278


 80%|████████  | 8/10 [10:43<02:44, 82.13s/it]

Train loss:  4.6188 | Train perplexity: 101.3684 
Validation loss:  5.1754 | Validation perplexity: 101.3684


 90%|█████████ | 9/10 [12:06<01:22, 82.41s/it]

Train loss:  4.5827 | Train perplexity: 97.7761 
Validation loss:  5.2826 | Validation perplexity: 97.7761


100%|██████████| 10/10 [13:31<00:00, 81.18s/it]

Train loss:  4.4269 | Train perplexity: 83.6691 
Validation loss:  5.1092 | Validation perplexity: 83.6691





### Test performance


In [40]:
model.load_state_dict(torch.load('model.pt'))

test_loss = eval_seq2seq(model, test_dataloader, criterion, dev)
print(f" Test Loss: {test_loss:.3f} | Test perplexity: {np.exp(test_loss): 7.3f} ")

 Test Loss: 5.118 | Test perplexity:  167.046 
