<a href="https://colab.research.google.com/github/dksifoua/Neural-Machine-Translation/blob/master/4%20-%20SeqToSeq%20Model%20with%20Convolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Shell escape: print the NVIDIA driver/GPU status table for this runtime.
!nvidia-smi

Wed Sep 30 00:08:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Load dependencies

In [2]:
!pip install tqdm --upgrade >> /dev/null 2>&1
!pip install torchtext --upgrade >> /dev/null 2>&1
!pip install spacy --upgrade >> /dev/null 2>&1
!python -m spacy download de >> /dev/null 2>&1
!python -m spacy download en >> /dev/null 2>&1

In [3]:
import tqdm
import spacy
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import display, HTML

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Dataset, Example, Field
from torchtext.data.iterator import BucketIterator
from torchtext.data.metrics import bleu_score
from torchtext.datasets import Multi30k

In [4]:
# Ignore the three warning categories below for the rest of the session.
for _category in (UserWarning, FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)

# Ask cuDNN for deterministic kernels, then seed NumPy and PyTorch (CPU + CUDA)
# with the same value.
torch.backends.cudnn.deterministic = True
SEED = 546
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device: {}'.format(DEVICE))

Device: cuda


## Load data

In [7]:
%%time
DE = Field(init_token='<sos>', eos_token='<eos>', lower=True, tokenize='spacy', tokenizer_language='de', batch_first=True)
EN = Field(init_token='<sos>', eos_token='<eos>', lower=True, tokenize='spacy', tokenizer_language='en', batch_first=True)
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),  fields=(DE, EN))
print(f'train set size: {len(train_data.examples):,}')
print(f'valid set size: {len(valid_data.examples):,}')
print(f'test set size: {len(test_data.examples):,}')
print(vars(train_data.examples[0]))

train set size: 29,000
valid set size: 1,014
test set size: 1,000
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}
CPU times: user 6.04 s, sys: 170 ms, total: 6.21 s
Wall time: 6.21 s


## Build vocabularies

In [8]:
%%time
MIN_COUNT = 2
DE.build_vocab(train_data, min_freq=MIN_COUNT)
EN.build_vocab(train_data, min_freq=MIN_COUNT)
print(f'Length of DE vocabulary: {len(DE.vocab):,}')
print(f'Length of EN vocabulary: {len(EN.vocab):,}')

Length of DE vocabulary: 7,854
Length of EN vocabulary: 5,893
CPU times: user 284 ms, sys: 877 µs, total: 285 ms
Wall time: 285 ms


## Modeling

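The `scale` factor multiplies the result of every residual sum in the network (e.g. the output of each convolutional block). The authors of the convolutional sequence-to-sequence model (Gehring et al., 2017) introduce it to *ensure that the variance throughout the network does not change dramatically*. Without this scaling, the performance of the model seems to vary wildly from one random seed to another.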

***Encoder layer***

In [9]:
class ConvBlockLayer(nn.Module):
    """One gated convolutional block with a scaled residual connection.

    The block applies a 1-D convolution that doubles the channel count, a GLU
    that halves it again, dropout, a residual sum with the block input and a
    final multiplication by ``scale``.

    :param int n_channels: number of input (and output) channels
    :param int kernel_size: width of the convolution kernel
    :param float scale: factor applied to the residual sum
    :param float dropout: dropout probability applied after the GLU
    """

    def __init__(self, n_channels, kernel_size, scale, dropout):
        super(ConvBlockLayer, self).__init__()
        self.n_channels = n_channels
        self.kernel_size = kernel_size
        self.scale = scale
        self.dropout = dropout
        # The convolution emits 2 * n_channels because the GLU in forward()
        # consumes half of the channels as gates. With this symmetric padding
        # the sequence length is preserved for odd kernel sizes only.
        self.conv1d = nn.Conv1d(n_channels, n_channels * 2, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)

    def forward(self, conv_input):
        """
        :param Tensor[batch_size, n_channels, seq_len] conv_input
        :return [batch_size, n_channels, seq_len]
        """
        conved = self.conv1d(conv_input) # [batch_size, n_channels * 2, seq_len]
        # F.dropout defaults to training=True; tie it to the module mode so that
        # dropout is switched off by .eval().
        conved = F.dropout(F.glu(conved, dim=1), p=self.dropout, training=self.training) # [batch_size, n_channels, seq_len]
        conved = conv_input + conved # [batch_size, n_channels, seq_len] Residual connection
        return conved * self.scale

In [17]:
class EncoderLayer(nn.Module):
    """Convolutional encoder: token + position embeddings followed by a stack of ConvBlockLayer.

    :param int vocab_size: number of rows of the token embedding table
    :param int n_positions: number of rows of the position embedding table
    :param int embedding_size: size of the token / position embeddings
    :param int hidden_size: number of channels used inside the convolutional blocks
    :param int kernel_size: kernel width of every convolutional block
    :param float scale: factor applied to every residual sum
    :param int n_layers: number of stacked convolutional blocks
    :param float dropout: dropout probability
    """

    def __init__(self, vocab_size, n_positions, embedding_size, hidden_size, kernel_size, scale, n_layers, dropout):
        super(EncoderLayer, self).__init__()
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.scale = scale
        self.n_layers = n_layers
        self.dropout = dropout
        self.token_embedding = nn.Embedding(vocab_size, embedding_size)
        self.position_embedding = nn.Embedding(n_positions, embedding_size)
        # Projections between the embedding space and the convolution channel space.
        self.fc_embedding_hidden = nn.Linear(embedding_size, hidden_size)
        self.conv1d_blocks = nn.Sequential(
            *[ConvBlockLayer(n_channels=hidden_size, kernel_size=kernel_size, scale=scale, dropout=dropout) for _ in range(n_layers)])
        self.fc_hidden_embedding = nn.Linear(hidden_size, embedding_size)

    def forward(self, input_sequences, input_positions):
        """
        :param Tensor[batch_size, seq_len] input_sequences
        :param Tensor[batch_size, seq_len] input_positions
        :return Tensor[batch_size, seq_len, embedding_size] conved
        :return Tensor[batch_size, seq_len, embedding_size] combined
        """
        seq_embed = self.token_embedding(input_sequences) # [batch_size, seq_len, embedding_size]
        pos_embed = self.position_embedding(input_positions) # [batch_size, seq_len, embedding_size]
        # F.dropout defaults to training=True; tie it to the module mode so that
        # dropout is switched off by .eval().
        embed = F.dropout(seq_embed + pos_embed, p=self.dropout, training=self.training) # [batch_size, seq_len, embedding_size]
        conv_input = F.dropout(self.fc_embedding_hidden(embed), p=self.dropout, training=self.training) # [batch_size, seq_len, hidden_size]
        conv_input = conv_input.permute(0, 2, 1) # [batch_size, hidden_size, seq_len] Conv1d expects channels before length
        conved = self.conv1d_blocks(conv_input) # [batch_size, hidden_size, seq_len]
        conved = conved.permute(0, 2, 1) # [batch_size, seq_len, hidden_size]
        conved = self.fc_hidden_embedding(conved) # [batch_size, seq_len, embedding_size]
        # Scaled residual sum of the convolution output and the input embeddings
        # (the original referenced an undefined name `h_state` here).
        combined = (conved + embed) * self.scale # [batch_size, seq_len, embedding_size]
        return conved, combined

***Attention layer***

In [18]:
class MultiStepAttnLayer(nn.Module):
    """Dot-product attention between decoder states and encoder outputs.

    The decoder convolution output is projected to the embedding size, summed
    with the decoder embeddings, scaled, and compared against every encoder
    position; a softmax over the source positions yields the weights.

    :param int hidden_size: feature size of the decoder convolution output
    :param int embedding_size: feature size of the embeddings / encoder output
    :param float scale: factor applied to the residual sum
    """

    def __init__(self, hidden_size, embedding_size, scale):
        super().__init__()
        self.hidden_size, self.embedding_size, self.scale = hidden_size, embedding_size, scale
        self.fc_hidden_embedding = nn.Linear(in_features=hidden_size, out_features=embedding_size)

    def forward(self, conved, embed, enc_conved):
        """
        :param Tensor[batch_size, dest_seq_len, hidden_size] conved
        :param Tensor[batch_size, dest_seq_len, embedding_size] embed
        :param Tensor[batch_size, src_seq_len, embedding_size] enc_conved
        :return Tensor[batch_size, dest_seq_len, src_seq_len] attn_weights
        """
        # Queries: projected decoder states plus decoder embeddings, scaled.
        projected = self.fc_hidden_embedding(conved)  # [batch_size, dest_seq_len, embedding_size]
        queries = (projected + embed) * self.scale  # [batch_size, dest_seq_len, embedding_size]
        # Keys: encoder outputs with the feature axis moved in front of the source axis.
        keys = enc_conved.permute(0, 2, 1)  # [batch_size, embedding_size, src_seq_len]
        energies = queries @ keys  # [batch_size, dest_seq_len, src_seq_len]
        # Normalise over the source positions.
        return energies.softmax(dim=2)

***Decoder layer***

In [None]:
class DecoderLayer(nn.Module):

    def __init__(self, vocab_size, n_positions, embedding_size, hidden_size, kernel_size, scale, n_layers, dropout):
        super(DecoderLayer, self).__init__()
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.scale = scale
        self.n_layers = n_layers
        self.dropout = dropout
        self.token_embedding = nn.Embedding(vocab_size, embedding_size)
        self.position_embedding = nn.Embedding(n_positions, embedding_size)
        self.fc_embedding_hidden = nn.Linear(embedding_size, hidden_size)