<a href="https://colab.research.google.com/github/dksifoua/Neural-Machine-Translation/blob/master/NMT_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: upgrade torchtext and download the spaCy French and
# English models that the 'spacy' tokenizers of the FR / EN fields rely on.
# NOTE(review): `--upgrade` leaves the torchtext version unpinned (the recorded
# run resolved to 0.5.0) — consider pinning it so the notebook stays runnable.
!pip install torchtext --upgrade
!python -m spacy download fr
!python -m spacy download en

Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.5.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/fr
You can now load the model via spacy.load('fr')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:

import os
import re
import tqdm
import random
import unicodedata
import numpy as np
import matplotlib.pyplot as plt

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Example, Field, Dataset
from torchtext.data.iterator import BucketIterator

%matplotlib inline

In [0]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One seed shared by every random number generator used in the notebook
# (Python, NumPy, PyTorch CPU and PyTorch CUDA), applied in that order.
SEED = 781
for seed_fn in (random.seed, np.random.seed,
                torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

In [0]:
if not os.path.exists('./data'):
    !mkdir ./data

!wget --no-check-certificate \
    http://www.statmt.org/europarl/v7/fr-en.tgz \
    -O ./data/fr-en.tgz

--2020-02-17 04:07:17--  http://www.statmt.org/europarl/v7/fr-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 202718517 (193M) [application/x-gzip]
Saving to: ‘./data/fr-en.tgz’


In [0]:
# Unpack the downloaded gzip-compressed archive into ./data
# (-x extract, -z gunzip, -v list each file, -f archive path, -C target dir).
!tar -xzvf ./data/fr-en.tgz -C ./data/

In [0]:
def read_file(filepath):
    """Read a UTF-8 text file and return its lines.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[str]: The lines of the file, each keeping its line terminator.

    Raises:
        NotImplementedError: If ``filepath`` does not exist. The exception
            type is kept from the original implementation so existing callers
            keep working; the underlying ``FileNotFoundError`` is chained.

    Any other failure (decoding error, permission problem, interrupt, ...)
    propagates unchanged instead of being reported as a missing file.
    """
    try:
        with open(filepath, mode='rt', encoding='utf-8') as file:
            return file.readlines()
    except FileNotFoundError as err:
        raise NotImplementedError(f'File {filepath} doesn\'t exist') from err

In [0]:
def unicode_to_ascii(s):
    """Strip diacritics from ``s``.

    The string is put in NFD (canonical decomposition) form, which splits an
    accented letter into its base letter plus combining marks, and every
    character of Unicode category 'Mn' (Mark, nonspacing) is then dropped.
    Characters without such a decomposition are left as they are, so the
    result is not guaranteed to be pure ASCII.

    Args:
        s (str): Text to process.

    Returns:
        str: ``s`` without nonspacing combining marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def normalize_string(s):
    """Lower-case, de-accent and simplify a sentence.

    Steps, in order: lower-case and trim; strip diacritics; put a space in
    front of each ``!``, ``.`` or ``?``; replace every run of characters
    other than ASCII letters, digits and ``!.?`` with one space; collapse
    whitespace and trim again.

    Args:
        s (str): Raw sentence.

    Returns:
        str: The normalized sentence (possibly empty).
    """
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r'([!.?])', r' \1', s)
    # The digit class is 0-9: the previous 1-9 range silently removed every
    # '0' (e.g. '2020' became '2 2').
    s = re.sub(r'[^a-zA-Z0-9!.?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s.strip()

In [0]:
%%time
# Pair the French file with the English file line by line; each pair becomes a
# dict keyed by language code.
pairs = [{'fr': fr_line, 'en': en_line}
         for fr_line, en_line in zip(read_file('./data/europarl-v7.fr-en.fr'),
                                     read_file('./data/europarl-v7.fr-en.en'))]
print('Number of examples:', len(pairs))

# Draw 30000 distinct pairs (no replacement), then normalize both sentences of
# every sampled pair.
pairs = np.random.choice(pairs, size=30000, replace=False)
pairs = [{lang: normalize_string(sentence) for lang, sentence in pair.items()}
         for pair in pairs]
print('Number of examples after sampling:', len(pairs))
print('Example:', pairs[0])

In [0]:
%%time
def _make_field(language, **extra):
    """Build a lower-casing, spaCy-tokenized Field with the shared special tokens."""
    return Field(init_token='<sos>',
                 eos_token='<eos>',
                 pad_token='<pad>',
                 unk_token='<unk>',
                 lower=True,
                 tokenize='spacy',
                 tokenizer_language=language,
                 **extra)


# The French (source) field applies `[::-1]` to whatever the Field hands to
# `preprocessing` — presumably the token list, i.e. source sentences are fed
# in reverse order; confirm against the torchtext Field documentation.
FR = _make_field('fr', preprocessing=lambda tokens: tokens[::-1])
EN = _make_field('en')

# Map each dict key to the (attribute name, Field) it populates on an Example.
field_map = {'fr': ('src', FR), 'en': ('dest', EN)}
examples = [Example.fromdict(data=pair, fields=field_map)
            for pair in tqdm.tqdm(pairs)]
data = Dataset(examples, fields={'src': FR, 'dest': EN})

# NOTE(review): torchtext documents a list-valued `split_ratio` as the relative
# sizes of (train, test, valid) while the splits come back as
# (train, valid, test) — check the printed sizes match what was intended.
train_data, valid_data, test_data = data.split(split_ratio=[0.7, 0.2, 0.1])
print('train size:', len(train_data.examples))
print('valid size:', len(valid_data.examples))
print('test size:', len(test_data.examples))
print(vars(train_data.examples[0]))

In [0]:
# Build both vocabularies from the training split only, keeping tokens seen at
# least 5 times and registering the four special tokens.
for lang_field in (FR, EN):
    lang_field.build_vocab(train_data, min_freq=5,
                           specials=['<sos>', '<eos>', '<unk>', '<pad>'])

In [0]:
# Report the number of entries in each vocabulary.
for caption, vocab in (('Length of FR vocabulary:', FR.vocab),
                       ('Length of EN vocabulary:', EN.vocab)):
    print(caption, len(vocab))

In [0]:
BATCH_SIZE = 128

# The Dataset built above defines no `sort_key`, so one is supplied here:
# the validation and test iterators sort their examples by default and would
# otherwise have nothing to compare `Example` objects with.
# `sort_within_batch=True` also makes the training iterator group examples of
# similar source length, which is what bucketing is for (less padding).
train_it, valid_it, test_it = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
    device=DEVICE)

In [0]:
class Encoder(nn.Module):
    """Embedding + dropout + (optionally bidirectional) LSTM encoder.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_units: Hidden size of the LSTM (per direction).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to
            the LSTM.
        bi: Whether the LSTM is bidirectional (default ``True``).
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_units, n_layers, dropout, bi=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_units,
                            num_layers=n_layers,
                            bidirectional=bi,
                            dropout=dropout)

    def _merge_directions(self, state):
        """Sum the forward and backward states of every layer.

        PyTorch lays the first axis of ``h_n`` / ``c_n`` out as
        ``num_layers * num_directions`` with the two directions of a layer
        adjacent, so the axis is unfolded to ``(num_layers, num_directions)``
        before summing. Splitting it in two halves only pairs the right
        slices for a single layer and yields an empty tensor for a
        unidirectional LSTM.
        """
        if not self.lstm.bidirectional:
            return state
        _, batch_size, hidden_size = state.size()
        per_layer = state.reshape(self.lstm.num_layers, 2,
                                  batch_size, hidden_size)
        return per_layer.sum(dim=1)

    def forward(self, inputs):
        """Encode a batch of token ids.

        Args:
            inputs: Integer token ids; assumed to be shaped
                ``(seq_len, batch)`` since the LSTM is not ``batch_first``
                — confirm against the iterator.

        Returns:
            tuple: ``(outputs, (h_state, c_state))`` where ``outputs`` is the
            raw LSTM output and both states have shape
            ``(n_layers, batch, hidden_units)`` after merging directions.
        """
        embedded = self.dropout(self.embedding(inputs))
        outputs, (h_state, c_state) = self.lstm(embedded)
        h_state = self._merge_directions(h_state)
        c_state = self._merge_directions(c_state)
        return outputs, (h_state, c_state)

In [0]:
# encoder = Encoder(vocab_size=len(FR.vocab), embedding_dim=300,
#                   hidden_units=128, n_layers=4, dropout=0.25)
# encoder.to(DEVICE)
# for batch in train_it:
#     print('inputs shape', batch.src.shape)
#     outputs, (h_state, c_state) = encoder(batch.src)
#     print('outputs shape:', outputs.shape)
#     print('h_state shape:', h_state.shape)
#     print('c_state shape:', c_state.shape)
#     break

In [0]:
class Decoder(nn.Module):
    """Embedding + dropout + LSTM + linear projection onto the vocabulary.

    Args:
        vocab_size: Number of rows of the embedding table and size of the
            output logits.
        embedding_dim: Size of each embedding vector.
        hidden_units: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to
            the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_units, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_units,
                            num_layers=n_layers, dropout=dropout)
        self.linear = nn.Linear(in_features=hidden_units,
                                out_features=vocab_size)

    def forward(self, inputs, h_state, c_state):
        """Run the decoder on ``inputs`` starting from the given LSTM state.

        Returns:
            tuple: ``(logits, (h_state, c_state))`` — the projected LSTM
            output (leading axis squeezed when it has size 1) and the
            updated LSTM state.
        """
        token_vectors = self.dropout(self.embedding(inputs))
        lstm_out, next_state = self.lstm(token_vectors, (h_state, c_state))
        # squeeze(0) drops the time axis when a single step is decoded.
        return self.linear(lstm_out.squeeze(0)), next_state

In [0]:
# decoder = Decoder(vocab_size=len(EN.vocab), embedding_dim=300,
#                   hidden_units=128, n_layers=4, dropout=0.25)
# decoder.to(DEVICE)
# for i, batch in enumerate(train_it):
#     print('inputs shape', batch.dest[i].unsqueeze(0).shape)
#     outputs, (h_state, c_state) = decoder(batch.dest[i].unsqueeze(0), h_state, c_state)
#     print('outputs shape:', outputs.shape)
#     print('h_state shape:', h_state.shape)
#     print('c_state shape:', c_state.shape)
#     break

In [0]:
class SeqToSeqNet(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Module returning ``(outputs, (h_state, c_state))``.
        decoder: Module called as ``decoder(tokens, h_state, c_state)`` and
            returning ``(logits, (h_state, c_state))``.
    """

    def __init__(self, encoder, decoder):
        super(SeqToSeqNet, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs, targets, teacher_forcing_ratio=0.0):
        """Encode ``inputs`` and decode ``targets.size(0) - 1`` steps.

        Args:
            inputs: Source batch passed unchanged to the encoder.
            targets: Target token ids indexed as ``targets[t, :]``
                (presumably ``(seq_len, batch)`` — confirm against the
                iterator). Row 0 is the first decoder input.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token ``targets[t]`` as the next decoder input
                instead of the decoder's own arg-max prediction. The default
                ``0.0`` reproduces the original behaviour (always feed the
                prediction) and draws no random numbers.

        Returns:
            Tensor: The per-step logits stacked along a new leading axis,
            one entry for every target position after the first.
        """
        # Only the final encoder state seeds the decoder; the per-step encoder
        # outputs are not used by this model.
        _, (h_state, c_state) = self.encoder(inputs)
        step_input = targets[0, :].unsqueeze(0)
        logits = []
        for t in range(1, targets.size(0)):
            logit, (h_state, c_state) = self.decoder(step_input,
                                                     h_state, c_state)
            logits.append(logit)
            # The `> 0` guard keeps the global RNG untouched by default.
            use_truth = (teacher_forcing_ratio > 0
                         and random.random() < teacher_forcing_ratio)
            next_tokens = targets[t, :] if use_truth else logit.argmax(1)
            step_input = next_tokens.unsqueeze(0)
        return torch.stack(logits, dim=0)

In [0]:
# model = SeqToSeqNet(encoder, decoder)
# model.to(DEVICE)
# for i, batch in enumerate(train_it):
#     print('inputs shape', batch.dest.shape)
#     outputs = model(batch.src, batch.dest)
#     print('outputs shape:', outputs.shape)
#     break

In [0]:
def init_weights(model: nn.Module):
    """Re-initialise every parameter of ``model`` in place.

    Each parameter tensor is filled with samples from the uniform
    distribution on ``[-0.08, 0.08]``. Nothing is returned.
    """
    for tensor in (param.data for _, param in model.named_parameters()):
        nn.init.uniform_(tensor, a=-0.08, b=0.08)

def count_parameters(model: nn.Module):
    """Return the total number of scalar entries in ``model``'s trainable
    parameters (those with ``requires_grad`` set)."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [0]:
def plot_grad_flow(named_parameters):
    """Plot the mean absolute gradient of each non-bias trainable parameter.

    Draws onto the current matplotlib figure without showing it, so repeated
    calls overlay one semi-transparent line each; the caller decides when to
    call ``plt.show()``.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()``, read after ``backward()``.
    """
    grad_mean, layers = [], []
    for name, param in named_parameters:
        if not param.requires_grad or 'bias' in name:
            continue
        # Parameters that took no part in the last backward pass have no grad.
        if param.grad is None:
            continue
        layers.append(name)
        # .item() yields a plain Python float: matplotlib cannot convert
        # tensors that live on a CUDA device.
        grad_mean.append(param.grad.abs().mean().item())
    plt.plot(grad_mean, alpha=0.3, color='b')
    plt.hlines(0, 0, len(grad_mean) + 1, linewidth=1, color='k')
    plt.xticks(range(0, len(grad_mean), 1), layers, rotation='vertical')
    plt.xlim(left=0, right=len(grad_mean))
    plt.ylim(bottom=-0.001, top=0.02)
    plt.xlabel('Layers')
    plt.ylabel('Mean of gradients')
    plt.title('Gradient Flow')
    plt.grid(True)

def train_step(model, opt, loss_func, data_it, grad_clip, epoch_text=''):
    """Train ``model`` for one pass over ``data_it``.

    Args:
        model: Module called as ``model(batch.src, batch.dest)``.
        opt: Optimizer stepping the model parameters.
        loss_func: Callable ``loss_func(logits_2d, targets_1d)``.
        data_it: Sized iterable of batches exposing ``src`` and ``dest``.
        grad_clip: Maximum gradient norm; falsy values disable clipping.
        epoch_text: Prefix for the progress-bar description.

    Returns:
        float: Sum of the batch losses divided by ``len(data_it)``.
    """
    epoch_loss = 0.
    pbar = tqdm.tqdm_notebook(enumerate(data_it), total=len(data_it))
    model.train()
    for i, data in pbar:
        opt.zero_grad()
        logits = model(data.src, data.dest)
        d = logits.size(-1)
        # Logits are compared with the targets from the second row onwards,
        # i.e. the first target row is never a prediction target.
        loss = loss_func(logits.view(-1, d), data.dest[1:, :].view(-1))
        loss.backward()
        # Overlay this batch's gradient magnitudes on the current figure.
        plot_grad_flow(model.named_parameters())
        if grad_clip:
            # In-place variant; `clip_grad_norm` (no underscore) is deprecated.
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        opt.step()
        epoch_loss += loss.item()
        pbar.set_description(epoch_text + f'Train Loss: {epoch_loss/(i+1):.3f}')
    plt.show() # Show the gradient flow
    return epoch_loss / len(data_it)

def evaluate(model, loss_func, data_it):
    """Compute the mean batch loss of ``model`` over ``data_it``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: Module called as ``model(batch.src, batch.dest)``.
        loss_func: Callable ``loss_func(logits_2d, targets_1d)``.
        data_it: Sized iterable of batches exposing ``src`` and ``dest``.

    Returns:
        float: Sum of the batch losses divided by ``len(data_it)``.
    """
    running_loss = 0.
    progress = tqdm.tqdm_notebook(enumerate(data_it), total=len(data_it))
    model.eval()
    with torch.no_grad():
        for step, batch in progress:
            predictions = model(batch.src, batch.dest)
            n_classes = predictions.size(-1)
            flat_targets = batch.dest[1:, :].view(-1)
            batch_loss = loss_func(predictions.view(-1, n_classes),
                                   flat_targets)
            running_loss += batch_loss.item()
            progress.set_description(
                f'Valid Loss: {running_loss / (step + 1):.3f}')
    return running_loss / len(data_it)

def train(model, optimizer, loss_function, train_it, valid_it, n_epochs,
          grad_clip=None, save_to='./saved_models', filename='seq2seq.pt'):
    """Train for ``n_epochs`` and checkpoint the best validation loss.

    Args:
        model: Module to train.
        optimizer: Optimizer stepping the model parameters.
        loss_function: Callable loss, forwarded to the train/eval helpers.
        train_it: Training batches.
        valid_it: Validation batches.
        n_epochs: Number of epochs to run.
        grad_clip: Maximum gradient norm, or ``None`` to disable clipping.
        save_to: Directory receiving the checkpoint (created if missing).
        filename: Checkpoint file name inside ``save_to``.

    Returns:
        dict: ``{'loss': [...], 'val_loss': [...]}`` with one entry per epoch.

    The checkpoint is a dict with the keys ``'model'`` and ``'optimizer'``
    holding the respective ``state_dict()``.
    """
    assert callable(loss_function)
    # Pure-Python replacement for `!mkdir`: works outside IPython, creates
    # intermediate directories and is a no-op when the directory exists.
    os.makedirs(save_to, exist_ok=True)
    # Save exactly where the parameters say; the former './saved_model/' prefix
    # pointed into a directory that was never created.
    checkpoint_path = os.path.join(save_to, filename)

    history = {'loss': [], 'val_loss': []}
    best_loss = float('inf')
    for epoch in range(n_epochs):
        epoch_text = f'Epoch: {epoch + 1:02d} - '
        loss = train_step(model, optimizer, loss_function,
                          train_it, grad_clip, epoch_text)
        val_loss = evaluate(model, loss_function, valid_it)

        # Keep only the weights of the best epoch so far.
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save({'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()},
                       f=checkpoint_path)

        history['loss'].append(loss)
        history['val_loss'].append(val_loss)

    return history

In [0]:
# Hyper-parameters of the experiment.
VOCAB_SIZE_FR = len(FR.vocab)
VOCAB_SIZE_EN = len(EN.vocab)
EMBEDDING_DIM = 300
HIDDEN_UNITS = 512
N_LAYERS = 1
DROPOUT = 0
LR = 1e-3
N_EPOCHS = 5

encoder = Encoder(vocab_size=VOCAB_SIZE_FR,
                  embedding_dim=EMBEDDING_DIM,
                  hidden_units=HIDDEN_UNITS,
                  n_layers=N_LAYERS,
                  dropout=DROPOUT)
decoder = Decoder(vocab_size=VOCAB_SIZE_EN,
                  embedding_dim=EMBEDDING_DIM,
                  hidden_units=HIDDEN_UNITS,
                  n_layers=N_LAYERS,
                  dropout=DROPOUT)

# Reset the CUDA peak-memory counters so the later memory reading reflects this
# run. DEVICE falls back to the CPU when no GPU is available, and these calls
# only accept a CUDA device, hence the guard.
# NOTE(review): newer PyTorch releases appear to deprecate both calls in favour
# of `torch.cuda.reset_peak_memory_stats` — confirm for the installed version.
if DEVICE.type == 'cuda':
    torch.cuda.reset_max_memory_allocated(device=DEVICE)
    torch.cuda.reset_max_memory_cached(device=DEVICE)

model = SeqToSeqNet(encoder, decoder)
model.to(DEVICE)
model.apply(init_weights)
optimizer = optim.Adam(model.parameters(), lr=LR)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=EN.vocab.stoi[EN.pad_token])

print(model)
print('Number of parameters of the model:', count_parameters(model))

In [0]:
# Train for N_EPOCHS with the default settings of `train` (no gradient
# clipping, default checkpoint location) and keep the per-epoch losses.
history = train(model, optimizer, criterion, train_it, valid_it, N_EPOCHS)

In [0]:
# GPU memory reserved by PyTorch on DEVICE; reported as 0 on a CPU-only run,
# where the CUDA query would reject the device.
torch.cuda.memory_reserved(DEVICE) if DEVICE.type == 'cuda' else 0

In [0]:
# Training vs. validation loss, one point per epoch.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'valid')):
    plt.plot(history[history_key], label=curve_label)
plt.title('Loss')
plt.legend()
plt.show()

In [0]:
# Restore the best checkpoint from the default location used by `train`
# (save_to='./saved_models', filename='seq2seq.pt'). The file holds a dict with
# 'model' and 'optimizer' state dicts, so only the 'model' entry is loaded.
checkpoint = torch.load('./saved_models/seq2seq.pt', map_location=DEVICE)
model.load_state_dict(checkpoint['model'])
# `evaluate` expects (model, loss_func, data_it); the test iterator created
# earlier in the notebook is named `test_it`.
test_loss = evaluate(model, criterion, test_it)