# Build a machine translation Transformer from scratch (German -> English)
#### References
- [Paper](https://arxiv.org/abs/1706.03762)
- [GitHub](https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb)
- [YouTube](https://www.youtube.com/watch?v=U0s0f995w14)


#### Transformer
<img src="./assets/transformer.png" width="400"/>


In [1]:
import torch

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Use {device}")

Use cuda:0


# 1. Data processing

## 1.1 Tokenizer

In [2]:
import spacy
# Load the German and English spaCy pipelines used by the tokenizer functions below.
# NOTE(review): the 'de' / 'en' shortcut names are only resolvable on older spaCy
# installs; spaCy v3 requires full package names (e.g. 'de_core_news_sm',
# 'en_core_web_sm') — confirm against the installed spaCy version.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """
    Split a German string into spaCy tokens and return the token texts
    as a list in reverse order (last token first).
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """
    Split an English string into spaCy tokens and return the token texts
    as a list, in their original order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [3]:
# Sanity check: English tokens are returned in their original order.
tokenize_en("good morning")

['good', 'morning']

In [4]:
# Sanity check: German tokens are returned reversed (last word first).
tokenize_de("guten morgen")

['morgen', 'guten']

## 1.2 Get dataset from torchtext

In [5]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [6]:
# Source-side field: German text goes through tokenize_de (reversed tokens),
# with '<sos>' / '<eos>' configured as the init / end-of-sequence tokens and
# lowercasing enabled.
german = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# Target-side field: same configuration, but tokenized with tokenize_en
# (tokens kept in their original order).
english = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)



In [7]:
# Load the train / validation / test splits of Multi30k.
# `exts` and `fields` are given in the same order: '.de' files are processed by
# the `german` field (exposed as `src`) and '.en' files by the `english` field
# (exposed as `trg`).
train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'), 
    fields = (german, english))



#### Preview

In [8]:
# Report how many sentence pairs each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [9]:
# German source tokens of the first training pair (reversed by tokenize_de)
print(vars(train_data.examples[0])['src'])

# English target tokens of the same pair (original order)
print(vars(train_data.examples[0])['trg'])

['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [10]:
# German source tokens of the second training pair (reversed by tokenize_de)
print(vars(train_data.examples[1])['src'])

# English target tokens of the same pair (original order)
print(vars(train_data.examples[1])['trg'])

['.', 'antriebsradsystem', 'ein', 'bedienen', 'schutzhelmen', 'mit', 'männer', 'mehrere']
['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


## 1.3 Build vocab

In [11]:
# Build both vocabularies from the training split only, so validation/test
# data does not influence them.
# NOTE(review): min_freq = 2 is torchtext's minimum-frequency threshold for a
# token to get its own vocabulary entry — confirm the handling of rarer tokens
# against the torchtext docs.
german.build_vocab(train_data, min_freq = 2)
english.build_vocab(train_data, min_freq = 2)

In [12]:
# Report the size of each vocabulary built above.
print(f"Unique tokens in source (de) vocabulary: {len(german.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(english.vocab)}")

Unique tokens in source (de) vocabulary: 7854
Unique tokens in target (en) vocabulary: 5893


## 1.4 Preview dataloader

In [13]:
# Number of sentence pairs per batch, shared by all three iterators.
BATCH_SIZE = 32

# One iterator per split; `device` is passed through so batches are produced
# for the device selected at the top of the notebook.
# NOTE(review): BucketIterator is documented to batch examples of similar
# length together to limit padding — confirm for the installed torchtext version.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device)



In [14]:
# len() of an iterator is its number of batches, not its number of examples
# (e.g. 29000 training pairs at 32 per batch -> 907 batches).
print("Train dataloader size:", len(train_iterator))
print("Valid dataloader size:", len(valid_iterator))
print("Test dataloader size:", len(test_iterator))

Train dataloader size: 907
Valid dataloader size: 32
Test dataloader size: 32


In [15]:
# Peek at the first three training batches to check the tensor layout.
for i, data in enumerate(train_iterator):
    X = data.src
    y = data.trg

    # (src_len, batch_size): sequence-first layout; src_len differs between batches
    print(f"Source[{i}] tensor size: {X.size()}")

    # (trg_len, batch_size): trg_len need not equal src_len
    print(f"Target[{i}] tensor size: {y.size()}",end="\n\n")

    # Stop after batches 0, 1 and 2.
    if i == 2: break



Source[0] tensor size: torch.Size([27, 32])
Target[0] tensor size: torch.Size([27, 32])

Source[1] tensor size: torch.Size([25, 32])
Target[1] tensor size: torch.Size([26, 32])

Source[2] tensor size: torch.Size([22, 32])
Target[2] tensor size: torch.Size([25, 32])



# 2. Model

#### Attention Mechanism

<img src="./assets/transformer-attention.png" width="600"/>

$$ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V $$

## 2.1 Encoder
#### Encoder Block

<img src="./assets/transformer-encoder-block.png" width="150"/>


#### Encoder

<img src="./assets/transformer-encoder.png" width="200"/>


## 2.2 Decoder
#### Decoder Block

<img src="./assets/transformer-decoder-block.png" width="150"/>


#### Decoder

<img src="./assets/transformer-decoder.png" width="250"/>


## 2.3 Transformer

<img src="./assets/transformer-2.png" width="400"/>
