# Neural Machine Translation
In this notebook, we are going to perform machine translation using two deep learning approaches: a recurrent neural network and a transformer. Specifically, we are going to train sequence to sequence models for Spanish to English translation.

# Download & Prepare the Data

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re
import torch
import random
import math
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

rnn_encoder, rnn_encoder, transformer_encoder, transformer_decoder = None, None, None, None

## Helper Functions
This cell contains helper functions for the dataloader.

In [2]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Normalizes latin chars with accent to their canonical decomposition"""
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    '''
    Preprocess the sentence to add the start, end tokens and make them lower-case
    '''
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[" "]+', ' ', w)

    w = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', w)
    
    w = w.rstrip().strip()
    w = '<start> ' + w + ' <end>'
    return w


def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded


def preprocess_data_to_tensor(dataframe, src_vocab, trg_vocab):
    # Vectorize the input and target languages
    src_tensor = [[src_vocab.word2idx[s if s in src_vocab.vocab else '<unk>'] for s in es.split(' ')] for es in dataframe['es'].values.tolist()]
    trg_tensor = [[trg_vocab.word2idx[s if s in trg_vocab.vocab else '<unk>'] for s in eng.split(' ')] for eng in dataframe['eng'].values.tolist()]

    # Calculate the max_length of input and output tensor for padding
    max_length_src, max_length_trg = max(len(t) for t in src_tensor), max(len(t) for t in trg_tensor)
    print('max_length_src: {}, max_length_trg: {}'.format(max_length_src, max_length_trg))

    # Pad all the sentences in the dataset with the max_length
    src_tensor = [pad_sequences(x, max_length_src) for x in src_tensor]
    trg_tensor = [pad_sequences(x, max_length_trg) for x in trg_tensor]

    return src_tensor, trg_tensor, max_length_src, max_length_trg


def train_test_split(src_tensor, trg_tensor):
    '''
    Create training and test sets.
    '''
    total_num_examples = len(src_tensor) - int(0.2*len(src_tensor))
    src_tensor_train, src_tensor_test = src_tensor[:int(0.75*total_num_examples)], src_tensor[int(0.75*total_num_examples):total_num_examples]
    trg_tensor_train, trg_tensor_test = trg_tensor[:int(0.75*total_num_examples)], trg_tensor[int(0.75*total_num_examples):total_num_examples]

    return src_tensor_train, src_tensor_test, trg_tensor_train, trg_tensor_test

## Load and Visualize the Data

In [3]:
os.system("wget http://www.manythings.org/anki/spa-eng.zip")
os.system("unzip -o spa-eng.zip")

lines = open('spa.txt', encoding='UTF-8').read().strip().split('\n')
total_num_examples = 50000 
original_word_pairs = [[w for w in l.split('\t')][:2] for l in lines[:total_num_examples]]
random.shuffle(original_word_pairs)

dat = pd.DataFrame(original_word_pairs, columns=['eng', 'es'])
print(dat) # Visualize the data

                          eng                             es
0              I prefer dogs.         Prefiero a los perros.
1       This may not be true.  Puede que esto no sea verdad.
2              She's my wife.                  Es mi esposa.
3          It was a disaster.       Aquello era un desastre.
4              Wait a minute.              Espera un minuto.
...                       ...                            ...
49995  She helped cook lunch.    Ayudó a preparar la comida.
49996       Will you join us?         ¿Te unirás a nosotros?
49997    This is no accident.       Esto no es un accidente.
49998        We can help you.              Podemos ayudarte.
49999  Could we speak French?     ¿Podríamos hablar francés?

[50000 rows x 2 columns]


Preprocess the data.

In [4]:
data = dat.copy()
data['eng'] = dat.eng.apply(lambda w: preprocess_sentence(w))
data['es'] = dat.es.apply(lambda w: preprocess_sentence(w))
print(data) # Visualizing the data

                                         eng  \
0              <start> i prefer dogs . <end>   
1       <start> this may not be true . <end>   
2              <start> she s my wife . <end>   
3          <start> it was a disaster . <end>   
4              <start> wait a minute . <end>   
...                                      ...   
49995  <start> she helped cook lunch . <end>   
49996       <start> will you join us ? <end>   
49997    <start> this is no accident . <end>   
49998        <start> we can help you . <end>   
49999  <start> could we speak french ? <end>   

                                                 es  
0             <start> prefiero a los perros . <end>  
1      <start> puede que esto no sea verdad . <end>  
2                      <start> es mi esposa . <end>  
3           <start> aquello era un desastre . <end>  
4                  <start> espera un minuto . <end>  
...                                             ...  
49995    <start> ayudo a preparar la comida .

## Vocabulary & Dataloader Classes

Create a class for managing a vocabulary. 2 different vocabularies will be needed $-$ one for English and one for Spanish.

Prepare the dataloader and make sure it returns the source sentence and target sentence.

In [5]:
class Vocab_Lang():
    def __init__(self, vocab):
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab = vocab
        
        for index, word in enumerate(vocab):
            self.word2idx[word] = index + 2 # +2 because of <pad> and <unk> token
            self.idx2word[index + 2] = word
    
    def __len__(self):
        return len(self.word2idx)

class MyData(Dataset):
    def __init__(self, X, y):
        self.length = torch.LongTensor([np.sum(1 - np.equal(x, 0)) for x in X])
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)
    
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        return x, y

    def __len__(self):
        return len(self.data)

In [6]:
# HYPERPARAMETERS
BATCH_SIZE = 64
EMBEDDING_DIM = 256

## Build Vocabulary

In [7]:
def build_vocabulary(pd_dataframe):
    sentences = [sen.split() for sen in pd_dataframe]
    vocab = {}
    for sen in sentences:
        for word in sen:
            if word not in vocab:
                vocab[word] = 1
    return list(vocab.keys())

src_vocab_list = build_vocabulary(data['es'])
trg_vocab_list = build_vocabulary(data['eng'])

## Instantiate Datasets

Instantiate the training and validation datasets.

In [8]:
src_vocab = Vocab_Lang(src_vocab_list)
trg_vocab = Vocab_Lang(trg_vocab_list)

src_tensor, trg_tensor, max_length_src, max_length_trg = preprocess_data_to_tensor(data, src_vocab, trg_vocab)
src_tensor_train, src_tensor_val, trg_tensor_train, trg_tensor_val = train_test_split(src_tensor, trg_tensor)

# Create train and val datasets
train_dataset = MyData(src_tensor_train, trg_tensor_train)
train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

test_dataset = MyData(src_tensor_val, trg_tensor_val)
test_dataset = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=False)

max_length_src: 16, max_length_trg: 12


  self.data = torch.LongTensor(X)


In [9]:
idxes = random.choices(range(len(train_dataset.dataset)), k=5)
src, trg =  train_dataset.dataset[idxes]
print('Source:', src)
print('Source Dimensions: ', src.size())
print('Target:', trg)
print('Target Dimensions: ', trg.size())

Source: tensor([[   2,  537,    4,  538,  539,    7,    8,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,  145, 1098,  198,   42,  185,    7,    8,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,    5, 4793,   12, 3457,    7,    8,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2, 1075,  107, 5199,    7,    8,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,  297, 4204,  207, 4513, 5130,    7,    8,    0,    0,    0,    0,
            0,    0,    0,    0]])
Source Dimensions:  torch.Size([5, 16])
Target: tensor([[   2,  181,   21,  464,    6,    7,    0,    0,    0,    0,    0,    0],
        [   2,  140,   24,  253,  193,   24,   36,  181,    6,    7,    0,    0],
        [   2,  136,   84,  123, 2983,    6,    7,    0,    0,    0,    0,    0],
        [   2,   86,  173,  104, 2946,    6,    7,    0,    0,    0,    0,    0],
        [   2,   45, 1916,   19,  465, 2

# Train a Recurrent Neural Network (RNN)

In [10]:
import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm.notebook import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

## Encoder Model
Build a recurrent encoder model.

In [11]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RnnEncoder(nn.Module):
    def __init__(self, src_vocab, embedding_dim, hidden_units):
        super(RnnEncoder, self).__init__()
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        self.src_vocab = src_vocab
        vocab_size = len(src_vocab)

        # Initialize embedding layer
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # Initialize a single directional GRU with 1 layer and batch_first=False
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_units, num_layers=1, batch_first=False)

    def forward(self, x):
        """
        Args:
            x: source texts, [max_len, batch_size]

        Returns:
            output: [max_len, batch_size, hidden_units]
            hidden_state: [1, batch_size, hidden_units] 
        """
        word_embeddings = self.embed(x)
        output, hidden_state = self.gru(word_embeddings)
        
        return output, hidden_state

## Decoder Model
Implement a decoder model that uses an attention mechanism.

In [12]:
class RnnDecoder(nn.Module):
    def __init__(self, trg_vocab, embedding_dim, hidden_units):
        super(RnnDecoder, self).__init__()
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: The dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        self.trg_vocab = trg_vocab
        vocab_size = len(trg_vocab)

        # Initialize embedding layer
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # Initialize layers to compute attention score
        self.W1 = nn.Linear(in_features=hidden_units, out_features=hidden_units)
        self.W2 = nn.Linear(in_features=hidden_units, out_features=hidden_units)
        self.v = nn.Linear(in_features=hidden_units, out_features=1)

        # Initialize a single directional GRU with 1 layer and batch_first=True
        self.gru = nn.GRU(input_size=embedding_dim + hidden_units, hidden_size=hidden_units, num_layers=1, batch_first=True)

        # Initialize fully connected layer
        self.connected = nn.Linear(in_features=hidden_units, out_features=vocab_size)

    def compute_attention(self, dec_hs, enc_output):
        '''
        This function computes the context vector and attention weights.

        Args:
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            context_vector: Context vector, according to formula; [batch_size, hidden_units]
            attention_weights: The attention weights you have calculated; [batch_size, max_len_src, 1]
        '''      
        dec_hs = torch.permute(dec_hs, dims=(1, 0, 2))
        enc_output = torch.permute(enc_output, dims=(1, 0, 2))
        W1 = self.W1(dec_hs)
        W2 = self.W2(enc_output)
        attention_scores = self.v(torch.tanh(W1 + W2))
        attention_weights = F.softmax(attention_scores, dim=1)
        context_vector = torch.sum(attention_weights * enc_output, dim=1)

        return context_vector, attention_weights

    def forward(self, x, dec_hs, enc_output):
        '''
        This function runs the decoder for a **single** time step.

        Args:
            x: Input token; [batch_size, 1]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            fc_out: (Unnormalized) output distribution [batch_size, vocab_size]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            attention_weights: The attention weights you have learned; [batch_size, max_len_src, 1]
        '''
        context_vector, attention_weights = self.compute_attention(dec_hs, enc_output)
        word_embeddings = self.embed(x)
        embed_context = torch.cat((torch.unsqueeze(context_vector, 1), word_embeddings), dim=2)
        fc_out, dec_hs = self.gru(embed_context)
        fc_out = self.connected(fc_out)
        fc_out = torch.squeeze(fc_out, dim=1)

        return fc_out, dec_hs, attention_weights

## Train RNN Model

Train the encoder and decoder using cross-entropy loss.

In [13]:
def loss_function(real, pred):
    mask = real.ge(1).float() # Only consider non-zero inputs in the loss
    
    loss_ = F.cross_entropy(pred, real) * mask 
    return torch.mean(loss_)

def train_rnn_model(encoder, decoder, dataset, optimizer, trg_vocab, device, n_epochs):
    batch_size = dataset.batch_size
    for epoch in range(n_epochs):
        start = time.time()
        n_batch = 0
        total_loss = 0
        
        encoder.train()
        decoder.train()
        
        for src, trg in tqdm(dataset):
            n_batch += 1
            loss = 0
            
            enc_output, enc_hidden = encoder(src.transpose(0,1).to(device))
            dec_hidden = enc_hidden
            
            # use teacher forcing - feeding the target as the next input (via dec_input)
            dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size)
        
            # run code below for every timestep in the ys batch
            for t in range(1, trg.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                assert len(predictions.shape) == 2 and predictions.shape[0] == dec_input.shape[0] and predictions.shape[1] == len(trg_vocab.word2idx), "First output of decoder must have shape [batch_size, vocab_size], you returned shape " + str(predictions.shape)
                loss += loss_function(trg[:, t].to(device), predictions.to(device))
                dec_input = trg[:, t].unsqueeze(1)
        
            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss
            
            optimizer.zero_grad()
            
            batch_loss.backward()

            ### update model parameters
            optimizer.step()
        
        print('Epoch:{:2d}/{}\t Loss: {:.4f} \t({:.2f}s)'.format(epoch + 1, n_epochs, total_loss / n_batch, time.time() - start))

    print('Model trained!')

In [14]:
# HYPERPARAMETERS
LEARNING_RATE = 0.001
HIDDEN_UNITS = 256
N_EPOCHS = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  
rnn_encoder = RnnEncoder(src_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(device)
rnn_decoder = RnnDecoder(trg_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(device)

rnn_model_params = list(rnn_encoder.parameters()) + list(rnn_decoder.parameters())
optimizer = torch.optim.Adam(rnn_model_params, lr=LEARNING_RATE)

print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [15]:
if __name__ == '__main__':
    train_rnn_model(rnn_encoder, rnn_decoder, train_dataset, optimizer, trg_vocab, device, N_EPOCHS)

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/10	 Loss: 1.7958 	(16.21s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/10	 Loss: 1.1520 	(15.15s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/10	 Loss: 0.8517 	(16.16s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/10	 Loss: 0.6318 	(15.69s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/10	 Loss: 0.4670 	(15.05s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/10	 Loss: 0.3469 	(15.32s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/10	 Loss: 0.2610 	(15.14s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/10	 Loss: 0.1984 	(19.60s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/10	 Loss: 0.1553 	(14.90s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/10	 Loss: 0.1255 	(15.15s)
Model trained!


## Inference (Decoding) Function
This function takes the trained model and a source sentence (Spanish), and returns its translation (English sentence). The input to the decoder at time step $t_i$ will be the prediction of the decoder at time $t_{i-1}$.

In [16]:
def decode_rnn_model(encoder, decoder, src, max_decode_len, device):
    """
    Args:
        encoder: RnnEncoder object
        decoder: RnnDecoder object
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        device: the device your torch tensors are on

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in your vocabulary at each time step
    """
    # Initialize variables
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))

    # Start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size)
    curr_output[:, 0] = dec_input.squeeze(1)
    
    # Implement decoding algorithm
    enc_output, hidden_state = encoder.forward(src)
    dec_hs = hidden_state
    for t in range(1, max_decode_len):
      fc_out, dec_hs, attention_weights = decoder.forward(dec_input.to(device), dec_hs, enc_output)
      curr_predictions[:, t] = fc_out
      dec_input = torch.unsqueeze(torch.argmax(fc_out, dim=1), dim=1)
      curr_output[:, t] = torch.squeeze(dec_input)
    
    return curr_output, curr_predictions

Run the cell below to qualitatively compare some of the sentences the model generates with the some of the correct translations.

In [17]:
rnn_encoder.eval()
rnn_decoder.eval()
idxes = random.choices(range(len(test_dataset.dataset)), k=5)
src, trg =  train_dataset.dataset[idxes]
curr_output, _ = decode_rnn_model(rnn_encoder, rnn_decoder, src.transpose(0,1).to(device), trg.size(1), device)
for i in range(len(src)):
    print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
    print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
    print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
    print("----------------")

Source sentence: <start> descansemos aqui . <end>
Target sentence: <start> let s rest here . <end>
Predicted sentence: <start> let s rest here . <end>
----------------
Source sentence: <start> ¿ bebes cerveza ? <end>
Target sentence: <start> do you drink beer ? <end>
Predicted sentence: <start> do you drink beer ? <end>
----------------
Source sentence: <start> ¿ a quien estan esperando ? <end>
Target sentence: <start> who are you waiting for ? <end>
Predicted sentence: <start> who are you waiting for ? <end>
----------------
Source sentence: <start> volvere luego . <end>
Target sentence: <start> i ll be back soon . <end>
Predicted sentence: <start> i ll be back later . <end>
----------------
Source sentence: <start> todavia es demasiado temprano . <end>
Target sentence: <start> it s still too early . <end>
Predicted sentence: <start> it s still too early . <end>
----------------


## Evaluate RNN Model

Helper functions to run the test set through the model and calculate BLEU scores.

In [18]:
def get_reference_candidate(target, pred, trg_vocab):
    def _to_token(sentence):
        lis = []
        for s in sentence[1:]:
            x = trg_vocab.idx2word[s]
            if x == "<end>": break
            lis.append(x)
        return lis
    reference = _to_token(list(target.numpy()))
    candidate = _to_token(list(pred.numpy()))
    return reference, candidate

def compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab):
    bleu_1 = 0.0
    bleu_2 = 0.0
    bleu_3 = 0.0
    bleu_4 = 0.0

    smoother = SmoothingFunction()
    save_reference = []
    save_candidate = []
    for i in range(len(target_tensor_val)):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], trg_vocab)
    
        bleu_1 += sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1)
        bleu_2 += sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
        bleu_3 += sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
        bleu_4 += sentence_bleu(reference, candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

        save_reference.append(reference)
        save_candidate.append(candidate)
    
    bleu_1 = bleu_1/len(target_tensor_val)
    bleu_2 = bleu_2/len(target_tensor_val)
    bleu_3 = bleu_3/len(target_tensor_val)
    bleu_4 = bleu_4/len(target_tensor_val)

    scores = {"bleu_1": bleu_1, "bleu_2": bleu_2, "bleu_3": bleu_3, "bleu_4": bleu_4}
    print('BLEU 1-gram: %f' % (bleu_1))
    print('BLEU 2-gram: %f' % (bleu_2))
    print('BLEU 3-gram: %f' % (bleu_3))
    print('BLEU 4-gram: %f' % (bleu_4))

    return save_candidate, scores

def evaluate_rnn_model(encoder, decoder, test_dataset, target_tensor_val, device):
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()
    
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0
            curr_output, curr_predictions = decode_rnn_model(encoder, decoder, src.transpose(0,1).to(device), trg.size(1), device)
            for t in range(1, trg.size(1)):
                loss += loss_function(trg[:, t].to(device), curr_predictions[:,t,:].to(device))

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(1)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(1)))
            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg
            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss

        print('Loss {:.4f}'.format(total_loss / n_batch))
    
    # Compute BLEU scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [19]:
if __name__ == '__main__':
    rnn_save_candidate, rnn_scores = evaluate_rnn_model(rnn_encoder, rnn_decoder, test_dataset, trg_tensor_val, device)

Loss 1.9514
BLEU 1-gram: 0.297667
BLEU 2-gram: 0.082981
BLEU 3-gram: 0.060596
BLEU 4-gram: 0.057675


# Train a Transformer

## Positional Embeddings
Because there are no recurrences in a transformer, a positional embedding needs to be added to the word embeddings to tell the transformer the relative position of words in a sentence.

In [20]:
def create_positional_embedding(max_len, embed_dim):
    '''
    Args:
        max_len: The maximum length supported for positional embeddings
        embed_dim: The size of your embeddings
    Returns:
        pe: [max_len, 1, embed_dim] computed as in the formulae above
    '''
    pe = torch.zeros((max_len, embed_dim))

    for i in range(max_len):
      for j in range(embed_dim):
        if j % 2 == 0:
          pe[i , j] = np.sin(i / np.exp(np.log(10000**(j / embed_dim))))
        else:
          pe[i , j] = np.cos(i / np.exp(np.log(10000**((j-1) / embed_dim))))
    pe = torch.unsqueeze(pe, dim=1)
    
    return pe

## Encoder Model

In [21]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_src, device):
        super(TransformerEncoder, self).__init__()
        self.device = device
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding
            num_heads: The number of attention heads
            num_layers: the number of Transformer Encoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_src: maximum length of the source sentences
            device: the working device
        """
        self.src_vocab = src_vocab
        src_vocab_size = len(src_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_src, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that position_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embed = nn.Embedding(num_embeddings=src_vocab_size, embedding_dim=embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerEncoder model
        trans_enc_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=dim_feedforward).to(device)
        self.trans_enc = nn.TransformerEncoder(encoder_layer=trans_enc_layer, num_layers=num_layers)

    def make_src_mask(self, src):
        """
        Args:
            src: [max_len, batch_size]
        Returns:
            Boolean matrix of size [batch_size, max_len] indicating which indices are padding
        """
        assert len(src.shape) == 2, 'src must have exactly 2 dimensions'
        src_mask = src.transpose(0, 1) == 0 # padding idx
        return src_mask.to(self.device) # [batch_size, max_src_len]

    def forward(self, x):
        """
        Args:
            x: [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, embed_dim]
        """
        word_embeddings = self.embed(x).to(self.device)
        max_len = word_embeddings.shape[0]
        word_embeddings = word_embeddings + self.position_embedding[:max_len]
        output = self.dropout(word_embeddings)
        src_mask = self.make_src_mask(x)
        output = self.trans_enc(src=output, src_key_padding_mask=src_mask)

        return output

## Decoder Model

In [22]:
class TransformerDecoder(nn.Module):
    def __init__(self, trg_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_trg, device):
        super(TransformerDecoder, self).__init__()
        self.device = device
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: the dimension of the embedding
            num_heads: The number of attention heads
            num_layers: the number of Transformer Decoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_trg: maximum length of the target sentences
            device: the working device
        """
        self.trg_vocab = trg_vocab
        trg_vocab_size = len(trg_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_trg, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that positional_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embed = nn.Embedding(num_embeddings=trg_vocab_size, embedding_dim=embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerDecoder model
        trans_dec_layer = nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=dim_feedforward).to(device)
        self.trans_dec = nn.TransformerDecoder(decoder_layer=trans_dec_layer, num_layers=num_layers)

        # Final fully connected layer
        self.connected = nn.Linear(in_features=embedding_dim, out_features=trg_vocab_size).to(device)

    def generate_square_subsequent_mask(self, sz):
        """ Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(self.device)
        return mask

    def forward(self, dec_in, enc_out):
        """
        Args:
            dec_in: [sequence length, batch_size]
            enc_out: [max_len, batch_size, embed_dim]
        Returns:
            output: [sequence length, batch_size, trg_vocab_size]
        """
        word_embeddings = self.embed(dec_in).to(self.device)
        seq_length = word_embeddings.shape[0]
        word_embeddings = word_embeddings + self.position_embedding[:seq_length]
        output = self.dropout(word_embeddings)
        trg_mask = self.generate_square_subsequent_mask(dec_in.shape[0])
        output = self.trans_dec(tgt=output, memory=enc_out.to(self.device), tgt_mask=trg_mask)
        output = self.connected(output)

        return output

## Train Transformer Model

Like with the RNN model, train the encoder and decoder using cross-entropy loss.

In [23]:
def train_transformer_model(encoder, decoder, dataset, optimizer, device, n_epochs):
    encoder.train()
    decoder.train()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    for epoch in range(n_epochs):
        start = time.time()
        losses = []

        for src, trg in tqdm(train_dataset):
            
            src = src.to(device).transpose(0,1) # [max_src_length, batch_size]
            trg = trg.to(device).transpose(0,1) # [max_trg_length, batch_size]

            enc_out = encoder(src)
            output = decoder(trg[:-1, :], enc_out)

            output = output.reshape(-1, output.shape[2])
            trg = trg[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, trg)
            losses.append(loss.item())

            loss.backward()

            # Clip to avoid exploding grading issues
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

            optimizer.step()

        mean_loss = sum(losses) / len(losses)
        print('Epoch:{:2d}/{}\t Loss:{:.4f} ({:.2f}s)'.format(epoch + 1, n_epochs, mean_loss, time.time() - start))


In [24]:
# HYPERPARAMETERS
LEARNING_RATE = 0.001
DIM_FEEDFORWARD=512
N_EPOCHS=10
N_HEADS=2
N_LAYERS=2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transformer_encoder = TransformerEncoder(src_vocab, EMBEDDING_DIM, N_HEADS, 
                                         N_LAYERS,DIM_FEEDFORWARD,
                                         max_length_src, device).to(device)
transformer_decoder = TransformerDecoder(trg_vocab, EMBEDDING_DIM, N_HEADS, 
                                         N_LAYERS,DIM_FEEDFORWARD,
                                         max_length_trg, device).to(device)

transformer_model_params = list(transformer_encoder.parameters()) + list(transformer_decoder.parameters())
optimizer = torch.optim.Adam(transformer_model_params, lr=LEARNING_RATE)

print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [25]:
if __name__ == '__main__':
    train_transformer_model(transformer_encoder, transformer_decoder, train_dataset, optimizer, device, N_EPOCHS)

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/10	 Loss:3.4304 (10.06s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/10	 Loss:2.6072 (10.09s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/10	 Loss:2.2825 (9.67s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/10	 Loss:2.0402 (10.19s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/10	 Loss:1.8410 (10.07s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/10	 Loss:1.6625 (10.13s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/10	 Loss:1.5191 (10.02s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/10	 Loss:1.3955 (10.45s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/10	 Loss:1.2941 (9.47s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/10	 Loss:1.2153 (10.02s)


## Inference (Decoding) Function
This function takes the trained transformer model and a source sentence (Spanish), and returns its translation (English sentence). Unlike RNN, because the transformer does not use recurrences, the entire sequence predicted so far gets passed to time step $t_i$ instead of just the one made at time $t_{i-1}$.

In [26]:
def decode_transformer_model(encoder, decoder, src, max_decode_len, device):
    """
    Args:
        encoder: TransformerEncoder object
        decoder: TransformerDecoder object
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        device: the device your torch tensors are on

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in your vocabulary at each time step
    """
    # Initialize variables
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))

    # We start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size).transpose(0,1)
    curr_output[:, 0] = dec_input.squeeze(1)
    
    # Implement decoding algorithm
    enc_output = encoder.forward(src)
    for t in range(1, max_decode_len):
      dec_input = torch.t(curr_output[:, :t].type(torch.long))
      dec_output = decoder.forward(dec_input.to(device), enc_output)
      curr_predictions[:, t] = dec_output[t-1]
      curr_output[:, t] = torch.argmax(curr_predictions[:, t], dim=1)

    return curr_output, curr_predictions, enc_output

Run the cell below to qualitatively compare some of the sentences the model generates with the some of the correct translations.

In [27]:
transformer_encoder.eval()
transformer_decoder.eval()
idxes = random.choices(range(len(test_dataset.dataset)), k=5)
src, trg =  train_dataset.dataset[idxes]
curr_output, _, _ = decode_transformer_model(transformer_encoder, transformer_decoder, src.transpose(0,1).to(device), trg.size(1), device)
for i in range(len(src)):
    print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
    print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
    print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
    print("----------------")

Source sentence: <start> vuelve manana . <end>
Target sentence: <start> come back tomorrow . <end>
Predicted sentence: <start> come again tomorrow . <end> . <end> . <end> . <end>
----------------
Source sentence: <start> tom necesita nuestro aporte . <end>
Target sentence: <start> tom needs our input . <end>
Predicted sentence: <start> tom needs our input . <end> . <end> . <end> .
----------------
Source sentence: <start> mangos ! <end>
Target sentence: <start> no way ! <end>
Predicted sentence: <start> go ! <end> ! <end> ! <end> ! <end> ! <end>
----------------
Source sentence: <start> mi libro esta aqui . <end>
Target sentence: <start> my book is here . <end>
Predicted sentence: <start> my book is here . <end> . <end> . <end> .
----------------
Source sentence: <start> estamos listos . <end>
Target sentence: <start> we re ready . <end>
Predicted sentence: <start> we re ready . <end> . <end> . <end> . <end>
----------------


## Evaluate Transformer Model

Run the test set through the transformer model.


In [28]:
def evaluate_model(encoder, decoder, test_dataset, target_tensor_val, device):
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    losses=[]
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0
            
            src, trg = src.transpose(0,1).to(device), trg.transpose(0,1).to(device)
            curr_output, curr_predictions, enc_out = decode_transformer_model(encoder, decoder, src, trg.size(0), device)

            for t in range(1, trg.size(0)):
                output = decoder(trg[:-1, :], enc_out)
                output = output.reshape(-1, output.shape[2])
                loss_trg = trg[1:].reshape(-1)
                loss += criterion(output, loss_trg)

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(0)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(0)))

            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg.transpose(0,1)
            losses.append(loss.item() / (trg.size(0)-1))

        mean_loss = sum(losses) / len(losses)
        print('Loss {:.4f}'.format(mean_loss))
    
    # Compute Bleu scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [29]:
if __name__ == '__main__':
    transformer_save_candidate, transformer_scores = evaluate_model(transformer_encoder, transformer_decoder, test_dataset, trg_tensor_val, device)

Loss 1.4982
BLEU 1-gram: 0.299358
BLEU 2-gram: 0.084157
BLEU 3-gram: 0.061734
BLEU 4-gram: 0.059025
