In [1]:
!nvidia-smi

Tue Apr  7 19:31:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro P5000        On   | 00000000:00:05.0 Off |                  Off |
| 30%   38C    P0    43W / 180W |      1MiB / 16278MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
import os
import re
import tqdm
import random
import unicodedata
import numpy as np
import matplotlib.pyplot as plt

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Example, Field, Dataset
from torchtext.data.iterator import BucketIterator
from torchtext.data.metrics import bleu_score

In [3]:
seed = 781
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

Device: cuda


In [4]:
def read_file(filepath):
    try:
        with open(filepath, mode='rt', encoding='utf-8') as file:
            content = file.readlines()
        return content
    except:
        raise Error(f'File {filepath} doesn\'t exist')

In [5]:
def unicode_to_ascii(s):
    # NFD => Normal Form Decompose
    # Mn => Non Marking Space
    return ''.join(c for c in unicodedata.normalize('NFD', s) \
                    if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # Transform accented characters into unaccented ones
    s = unicode_to_ascii(s.strip())
    # Remove a sequence of whitespace characters
    s = re.sub(r'\s+', r' ', s)
    return s.strip()

In [6]:
%%time
pairs = [*zip(read_file('./data/europarl-v7.fr-en.fr'),
              read_file('./data/europarl-v7.fr-en.en'))]
pairs = [*map(lambda x: {'fr': x[0], 'en': x[1]}, pairs)]
print(f'Number of examples: {len(pairs):,}')

Number of examples: 2,007,723
CPU times: user 3.39 s, sys: 980 ms, total: 4.37 s
Wall time: 4.36 s


In [7]:
%%time
pairs = np.random.choice(pairs, size=50000, replace=False)
pairs = [*map(lambda pair: {k: normalize_string(v) for k, v in pair.items()},
              pairs)]
print(f'Number of examples after sampling: {len(pairs):,}')
print(f'Example:\n\tFR => {pairs[0]["fr"]}\n\tEN => {pairs[0]["en"]}')

Number of examples after sampling: 50,000
Example:
	FR => Les procedures par le biais desquelles de tels produits entrent et sortent de l'Union europeenne doivent etre ouvertes, transparentes et, par dessus tout, sures.
	EN => The procedures whereby such products come in and out of the European Union have to be open, transparent and, above all, safe.
CPU times: user 5.64 s, sys: 81.1 ms, total: 5.72 s
Wall time: 5.72 s


In [8]:
FR = Field(init_token='<sos>',
           eos_token='<eos>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='fr',
           include_lengths=True) # For pack_padded_sequence
EN = Field(init_token='<sos>',
           eos_token='<eos>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='en')

examples = [Example.fromdict(data=pair, fields={'fr': ('src', FR),
                                                'en': ('dest', EN)})
            for pair in tqdm.tqdm(pairs)]
data = Dataset(examples, fields={'src': FR, 'dest': EN})

100%|██████████| 50000/50000 [00:16<00:00, 3049.27it/s]


In [9]:
train_data, valid_data, test_data = data.split(split_ratio=[0.8, 0.1, 0.1])
print(f'train set size: {len(train_data.examples)}')
print(f'valid set size: {len(valid_data.examples)}')
print(f'test set size: {len(test_data.examples)}')
print(vars(train_data.examples[0]))

train set size: 40000
valid set size: 5000
test set size: 5000
{'src': ['je', 'demande', 'a', 'la', 'banque', 'mondiale', 'de', 'reorienter', 'sa', 'strategie', 'actuelle', 'qui', 'est', 'actuellement', 'un', 'modele', 'energetique', 'a', 'grande', 'echelle', 'et', 'axe', 'sur', "l'", 'exportation', 'pour', 'privilegier', 'des', 'projets', 'energetiques', 'decentralises', 'a', 'petite', 'echelle', 'qui', 'repondent', 'plus', 'clairement', 'aux', 'besoins', 'fondamentaux', 'des', 'zones', 'rurales', '.'], 'dest': ['i', 'call', 'on', 'the', 'world', 'bank', 'to', 'reorient', 'its', 'current', 'strategy', 'from', 'a', 'large', '-', 'scale', ',', 'export', '-', 'oriented', 'energy', 'model', 'to', 'small', '-', 'scale', ',', 'decentralised', 'energy', 'projects', 'that', 'respond', 'more', 'clearly', 'to', 'basic', 'needs', 'in', 'rural', 'areas', '.']}


In [10]:
%%time
FR.build_vocab(train_data,
               min_freq=2,
               specials=['<sos>', '<eos>', '<unk>', '<pad>'])
EN.build_vocab(train_data,
               min_freq=2,
               specials=['<sos>', '<eos>', '<unk>', '<pad>'])

print(f'Length of FR vocabulary: {len(FR.vocab):,}')
print(f'Length of EN vocabulary: {len(EN.vocab):,}')

Length of FR vocabulary: 17,426
Length of EN vocabulary: 13,638
CPU times: user 715 ms, sys: 7.66 ms, total: 723 ms
Wall time: 721 ms


# Modeling

## Encoder

In [11]:
class Encoder(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, latent_size, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(embedding_size, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, latent_size)
        
    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,]
            
        :return
            outputs: Tensor[seq_len, batch_size, hidden_size * num_directions=2]
            hn: Tensor[batch_size, latent_size]
        """
        embedded = self.embedding(input_sequences)
        embedded = self.dropout(embedded)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, hn = self.gru(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # hn: [n_layers=1 * num_directions=2, batch_size, hidden_size]
        hn = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        # hn: [batch_size, hidden_size * num_directions=2]
        hn = torch.tanh(self.fc(hn))
        return outputs, hn

## Attention Module

In [12]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size, latent_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2 + latent_size, latent_size)
        self.V = nn.Linear(latent_size, 1, bias=False)
        
    def forward(self, h_state, enc_outputs, mask):
        """
        :params
            h_state: Tensor[batch_size, latent_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size * 2]
            mask: Tensor[batch_size, seq_len]
            
        :return
            attn_weights: Tensor[batch_size, seq_len]
        """
        seq_len, _, _ = enc_outputs.size()
        h_state = h_state.unsqueeze(0).repeat(seq_len, 1, 1)
        # h_state: [seq_len, batch_size, latent_size]
        concat = torch.cat((enc_outputs, h_state), dim=2)
        # concat: [seq_len, batch_size, hidden_state * 2 + latent_size]
        attn_energies = torch.tanh(self.attn(concat))
        # attn_energies: [seq_len, batch_size, latent_size]
        attn = self.V(attn_energies).squeeze(2).permute(1, 0)
        # attn: [batch_size, seq_len]
        attn = attn.masked_fill(mask == 0, -1e10)
        attn_weights = F.softmax(attn, dim=1)
        return attn_weights

## Decoder

In [13]:
class Decoder(nn.Module):

    def __init__(self, embedding_size, vocab_size, hidden_size, enc_hidden_size, attention, dropout):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.attention = attention
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(enc_hidden_size * 2 + embedding_size, hidden_size)
        self.fc = nn.Linear(enc_hidden_size * 2 + hidden_size + embedding_size, vocab_size)
        
    def forward(self, input_word_index, h_state, enc_outputs, mask):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, enc_hidden_size * 2]
            mask: Tensor[batch_size, seq_len]
        
        :return
            prediction: Tensor[batch_size, vocab_size]
            h_state: Tensor[batch_size, hidden_size]
            attn_weights: Tensor[batch_size, seq_len]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0))
        # embedded: [1, batch_size, embedding_size]
        
        attn_weights = self.attention(h_state, enc_outputs, mask)
        # attn_weights: [batch_size, seq_len]
        context = torch.bmm(attn_weights.unsqueeze(1), enc_outputs.permute(1, 0, 2))
        # [batch_size, 1, seq_len] * [batch_size, seq_len, enc_hidden_size * 2]
        # context: [batch_size, 1, enc_hidden_size * 2]
        context = context.permute(1, 0, 2)
        # context: [1, batch_size, enc_hidden_size * 2]
        concat = torch.cat((embedded, context), dim=2)
        # concat: [1, batch_size, embedding_size + enc_hidden_size * 2]
        
        output, h_state = self.gru(concat, h_state.unsqueeze(0))
        # output: [1, batch_size, hidden_size]
        # h_state: [n_layers=1 * num_directions=1, batch_size, hidden_size]
        # assert (output == h_state).all()
        
        concat = torch.cat((context, output, embedded), dim=2)
        # concat: [1, batch_size, enc_hidden_size * 2 + hidden_size + embedding_size]
        prediction = self.fc(concat.squeeze(0))
        
        return prediction, h_state.squeeze(0), attn_weights

## Sequence to sequence model

In [14]:
class SeqToSeqNet(nn.Module):

    def __init__(self, encoder, decoder, pad_index, device):
        super(SeqToSeqNet, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_index = pad_index
        self.device = device
        
    def create_mask(self, input_sequences):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            
        :return
            mask: Tensor[batch_size, seq_len]
        """
        mask = (input_sequences != self.pad_index).permute(1, 0)
        return mask
        
    def forward(self, input_sequences, sequence_lengths, target_sequences, tf_ratio):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,]
            target_sequences: Tensor[seq_len, batch_size]
            tf_ratio: float
            
        :return
            outputs: Tensor[seq_len - 1, batch_size, vocab_size]
                Since we ignore the SOS_TOKEN
        """
        mask = self.create_mask(input_sequences)
        
        enc_outputs, h_state = self.encoder(input_sequences, sequence_lengths)
        
        seq_len, batch_size = target_sequences.size()
        outputs = torch.zeros(seq_len - 1, batch_size, self.decoder.vocab_size).to(self.device)
        word_index = target_sequences[0, :]
        
        for t, idx in enumerate(range(1, seq_len)):
            output, h_state, attn_weights = self.decoder(word_index, h_state, enc_outputs, mask)
            # output: [batch_size, vocab_size]
            outputs[t] = output
            
            if random.random() < tf_ratio:
                word_index = target_sequences[idx, :]
            else:
                word_index = output.argmax(dim=1)
                
        return outputs

## Training routines

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [17]:
def train_epoch(model, opt, loss_func, tf_ratio, data_it, grad_clip, epoch_text):
    epoch_loss = 0.
    pbar = tqdm.tqdm(enumerate(data_it), total=len(data_it))
    model.train()
    for i, data in pbar:
        opt.zero_grad()
        outputs = model(*data.src, data.dest, tf_ratio)
        # *data.src: unpack input_sequences and sequence_lengths
        loss = loss_func(outputs.view(-1, model.decoder.vocab_size), data.dest[1:].view(-1))
        loss.backward()
        # plot_gradient_flow(model.named_parameters())
        if grad_clip:
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        opt.step()
        epoch_loss += loss.item()
        pbar.set_description(epoch_text + f'loss:     {epoch_loss / (i + 1):.3f} - ppl:     {np.exp(epoch_loss / (i + 1)):7.3f}')
    # plt.show() # Show the gradient flow
    return epoch_loss / len(data_it), np.exp(epoch_loss / len(data_it))

In [18]:
def valid_epoch(model, loss_func, data_it, epoch_text):
    epoch_loss = 0.
    pbar = tqdm.tqdm(enumerate(data_it), total=len(data_it))
    model.eval()
    with torch.no_grad():
        for i, data in pbar:
            outputs = model(*data.src, data.dest, 0) # Turn-off the teacher forcing
            loss = loss_func(outputs.view(-1, model.decoder.vocab_size), data.dest[1:].view(-1))
            epoch_loss += loss.item()
            pbar.set_description(epoch_text + f'val_loss: {epoch_loss / (i + 1):.3f} - val_ppl: {np.exp(epoch_loss / (i + 1)):7.3f}')
    return epoch_loss / len(data_it), np.exp(epoch_loss / len(data_it))

In [19]:
def train(model, opt, loss_func, train_it, valid_it, tf_ratio, n_epochs, grad_clip, save_to, filename):
    assert callable(loss_func)
    if not os.path.exists(save_to):
        !mkdir {save_to}
    
    history = {
        'loss': [],
        'val_loss': [],
        'ppl': [],
        'val_ppl': []
    }
    best_loss = [float('inf')]
    for epoch in range(n_epochs):
        epoch_text = f'Epoch: {epoch + 1:02d} - '
        loss, ppl = train_epoch(model=model, opt=opt, loss_func=loss_func, tf_ratio=tf_ratio,
                                data_it=train_it, grad_clip=grad_clip, epoch_text=epoch_text)
        val_loss, val_ppl = valid_epoch(model=model, loss_func=loss_func, data_it=valid_it, epoch_text=epoch_text)
        
        history['loss'].append(loss)
        history['val_loss'].append(val_loss)
        history['ppl'].append(ppl)
        history['val_ppl'].append(val_ppl)
        
        # Saving
        if val_loss < best_loss[-1]:
            best_loss.append(val_loss)
            torch.save({
                'model': model.state_dict()
            }, f=os.path.join(save_to, filename))
            
        # Stop training
        try:
            if val_loss > best_loss[-2]:
                print('Stop training because the loss is increasing!')
                break
        except IndexError:
            pass
        
    return history

## Define and train models

In [20]:
EMBEDDING_DIM = 300
HIDDEN_SIZE = 512
N_LAYERS = 2
DROPOUT = 0.25
GRAD_CLIP = 1.0
BATCH_SIZE = 16
N_EPOCHS = 15
TF_RATIO = 0

In [21]:
train_iterator, valid_iterator, test_iterator =  \
        BucketIterator.splits((train_data, valid_data, test_data),
                              batch_size=BATCH_SIZE,
                              sort_key=lambda x: len(x.src),
                              sort_within_batch=True, # For pack_padded_sequence
                              device=device)

In [22]:
encoder = Encoder(embedding_size=EMBEDDING_DIM,
                  vocab_size=len(FR.vocab),
                  hidden_size=HIDDEN_SIZE,
                  latent_size=HIDDEN_SIZE,
                  dropout=DROPOUT).to(device)
attention = Attention(hidden_size=HIDDEN_SIZE,
                      latent_size=HIDDEN_SIZE).to(device)
decoder = Decoder(embedding_size=EMBEDDING_DIM,
                  vocab_size=len(EN.vocab),
                  hidden_size=HIDDEN_SIZE,
                  enc_hidden_size=HIDDEN_SIZE,
                  attention=attention,
                  dropout=DROPOUT).to(device)
seq2seq = SeqToSeqNet(encoder=encoder,
                      decoder=decoder,
                      pad_index=FR.vocab.stoi[FR.pad_token],
                      device=device).to(device)
seq2seq.apply(init_weights)
opt_adam = optim.Adam(seq2seq.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=EN.vocab.stoi[EN.pad_token])
print(f'Number of parameters of the model: {count_parameters(seq2seq):,}')

Number of parameters of the model: 41,008,238


In [23]:
%%time
history = train(model=seq2seq,
                opt=opt_adam,
                loss_func=criterion,
                train_it=train_iterator,
                valid_it=valid_iterator,
                tf_ratio=TF_RATIO,
                n_epochs=N_EPOCHS,
                grad_clip=GRAD_CLIP,
                save_to='./saved_models',
                filename='seq2seq-attn.pt')

Epoch: 01 - loss:     5.520 - ppl:     249.569: 100%|██████████| 2500/2500 [11:23<00:00,  3.66it/s]
Epoch: 01 - val_loss: 4.799 - val_ppl: 121.431: 100%|██████████| 313/313 [00:22<00:00, 14.09it/s]
Epoch: 02 - loss:     4.507 - ppl:      90.647: 100%|██████████| 2500/2500 [11:24<00:00,  3.65it/s]
Epoch: 02 - val_loss: 4.393 - val_ppl:  80.871: 100%|██████████| 313/313 [00:22<00:00, 13.94it/s]
Epoch: 03 - loss:     3.947 - ppl:      51.784: 100%|██████████| 2500/2500 [11:26<00:00,  3.64it/s]
Epoch: 03 - val_loss: 4.313 - val_ppl:  74.693: 100%|██████████| 313/313 [00:22<00:00, 13.88it/s]
Epoch: 04 - loss:     3.565 - ppl:      35.352: 100%|██████████| 2500/2500 [11:28<00:00,  3.63it/s]
Epoch: 04 - val_loss: 4.333 - val_ppl:  76.181: 100%|██████████| 313/313 [00:22<00:00, 14.04it/s]
Epoch: 05 - loss:     3.336 - ppl:      28.107: 100%|██████████| 2500/2500 [11:24<00:00,  3.65it/s]
Epoch: 05 - val_loss: 4.389 - val_ppl:  80.565: 100%|██████████| 313/313 [00:22<00:00, 13.94it/s]
Epoch: 06 

Stop training because the loss is increasing!
CPU times: user 59min 10s, sys: 19min 56s, total: 1h 19min 6s
Wall time: 1h 10min 49s



