# Transformer: Attention is all you need

This Jupyter notebook is a TensorFlow implementation of the Transformer model from the paper [Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf). The task is to translate a human-readable date into the fixed target format **yyyy-mm-dd**, e.g. "24th Aug 19" -> "2019-08-24". A good way to start implementing a model from scratch is to use a small, simple dataset.

In [None]:
import numpy as np
import tqdm
from faker import Faker
from babel.dates import format_date
from nmt_utils import load_dataset_v2, preprocess_data, string_to_int, int_to_string, softmax
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import os

In [None]:
# Size argument handed to load_dataset_v2 (presumably the number of examples to
# generate -- confirm in nmt_utils).
m = 40000
# The loader returns the dataset together with the source vocabulary, the target
# vocabulary, and an inverse target lookup (used later to turn predicted ids back
# into characters).
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset_v2(m)

In [None]:
# Notebook display: inspect the source-side vocabulary.
human_vocab

In [None]:
# Notebook display: inspect the target-side vocabulary.
machine_vocab

In [None]:
# Tx: sequence length used for the source side (also the encoder's seq_len).
# Ty: sequence length of the target side (also the decoder's seq_len).
Tx = 30
Ty = 10

# The target is requested with Ty+1 positions because training feeds Y[:, :-1]
# to the decoder and scores its output against Y[:, 1:], i.e. the target shifted
# by one. NOTE(review): presumably position 0 holds a start token ("#" is used
# as the first decoder token at inference) -- confirm in nmt_utils.preprocess_data.
X, Y = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty+1)

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)

In [None]:
import tensorflow as tf

In [None]:
# Switch TensorFlow (1.x API) to eager execution: the training loop below calls
# the models directly inside a tf.GradientTape and reads results immediately.
tf.enable_eager_execution()

In [None]:
# Short alias for the Keras layers namespace; the model classes below use
# L.Embedding, L.Dense and L.LayerNormalization.
L = tf.keras.layers

## Transformer model with Tensorflow.

### Hyperparameter:

$d_{model}$: dimension of the word embedding, of the output of the **Multi-head Attention** layer, and of the output of the **Feed Forward** layer.

$d_k$: dimension of matrix Q, K

$d_v$: dimension of matrix V

$d_{ff}$: dimension of intermediate **Feed forward** layer

$h$: number of heads at each block.


### Positional Encoding:

Since the Transformer has neither the recurrence of an RNN nor the convolution of a CNN, all positions of the input sentence flow from the Embedding Layer and are processed in parallel, so the model has no built-in notion of word order. We therefore need to inject information about the relative or absolute position of the words. The authors use a non-trainable (fixed) sinusoid function:

$$PE_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right) \quad \mbox{(even indices)}$$
$$PE_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right) \quad \mbox{(odd indices)}$$

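where $pos$ is the position in the sequence and $i$ indexes the sine/cosine pairs of the embedding dimension ($0 \le i < d_{model}/2$), so $2i$ and $2i+1$ are the even and odd embedding indices.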


### Scaled Dot-Product Attention:

<img style="width:300px; height:300px" src="https://i.imgur.com/HuXNlr0.png" />

$$Attention(Q, K, V) = softmax\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

### (Encoder-Decoder) Multi-Head Attention:

<img style="width:300px; height:300px" src="https://i.imgur.com/vgfOLR2.png" />

$$MultiHead(Q, K, V) = Concat(head_1, head_2, ..., head_h)W^O$$
$$\mbox{where } head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)$$

### Feed forward:

$$FFN(x) = max(0, xW_1 + b_1)W_2 + b_2$$

### Encoder blocks:

Each encoder block includes 2 layers: **Multi-head Attention Mechanism** and **Position-wise Feed Forward**, respectively. The output of each layer is combined with its input through a residual connection, followed by [Layer Normalization](https://arxiv.org/pdf/1607.06450.pdf): $LayerNorm(x + f(x))$

### Decoder blocks:

Each decoder block includes 3 layers: **Multi-head Attention Mechanism**, **Encoder-Decoder Multi-head Attention** and **Position-wise Feed Forward**. As in the **Encoder** blocks, the output of each layer is combined with its input through a residual connection, followed by Layer Normalization.

<img src="https://i.imgur.com/1NUHvLi.jpg" />

In [None]:
class Transformer(tf.keras.Model):
    """Abstract base holding the pieces shared by the Encoder and Decoder stacks.

    It owns the word embedding, the fixed sinusoidal positional encoding, the
    optional causal mask, and the per-block projection / feed-forward layers.
    Layers are stored as attributes named ``<kind><block_id>`` (e.g. ``Q0``,
    ``FFN11``) so that Keras tracks their variables.

    Subclasses must call ``_init_structure`` once and implement
    ``_block_computation`` and ``call``.

    Args:
        num_blocks: number of stacked blocks.
        num_heads: number of attention heads per block.
        vocab_size: size of the embedding table.
        seq_len: maximum sequence length (size of the positional encoding/mask).
        d_model: width of embeddings and of every block output.
        d_k: per-head width of the query/key projections.
        d_v: per-head width of the value projection.
        d_ff: width of the hidden feed-forward layer.
    """

    def __init__(self, num_blocks, num_heads, vocab_size, seq_len, d_model, d_k, d_v, d_ff):
        super(Transformer, self).__init__()
        self.num_blocks = num_blocks
        self.num_heads = num_heads
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.seq_len = seq_len
        self.d_k = d_k
        self.d_v = d_v
        self.d_ff = d_ff
        self.word_embed = L.Embedding(vocab_size, d_model)

    def _format(self, block, head):
        """Return ``block`` and ``head`` concatenated as a string."""
        return str(block) + str(head)

    def _init_structure(self, decoder_part=False):
        """Create the positional encoding, the causal mask and all block layers.

        Args:
            decoder_part: when True, also build the look-ahead mask and the
                extra ``Qenc/Kenc/Venc/Oenc`` layers used by encoder-decoder
                attention.
        """
        assert not hasattr(self, "pos_enc"), "The structure is initialized already."

        # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(same).
        # `even_dims` already holds the values 2i, so the exponent is
        # even_dims / d_model (multiplying by 2 again would double it).
        positions = np.arange(self.seq_len)[:, np.newaxis]
        even_dims = np.arange(0, self.d_model, 2)[np.newaxis, :]
        angles = positions / np.power(10000.0, even_dims / float(self.d_model))
        pos_enc = np.zeros(shape=(1, self.seq_len, self.d_model), dtype=np.float32)
        pos_enc[0, :, 0::2] = np.sin(angles)
        # For an odd d_model there is one fewer odd slot than even slots.
        pos_enc[0, :, 1::2] = np.cos(angles[:, :self.d_model // 2])
        self.pos_enc = pos_enc

        if decoder_part:
            # Row i is 0 for columns <= i and -1e9 for columns > i, so after the
            # softmax a position cannot attend to later positions.
            upper = np.triu(np.full((self.seq_len, self.seq_len), -1e9, dtype=np.float32), k=1)
            self.mask = upper[np.newaxis, :, :]

        for block_id in range(self.num_blocks):
            suffix = str(block_id)
            setattr(self, "Q" + suffix, L.Dense(self.d_k*self.num_heads))
            setattr(self, "K" + suffix, L.Dense(self.d_k*self.num_heads))
            setattr(self, "V" + suffix, L.Dense(self.d_v*self.num_heads))
            if decoder_part:
                setattr(self, "Qenc" + suffix, L.Dense(self.d_k*self.num_heads))
                setattr(self, "Kenc" + suffix, L.Dense(self.d_k*self.num_heads))
                setattr(self, "Venc" + suffix, L.Dense(self.d_v*self.num_heads))
                # Encoder-decoder attention gets its own output projection
                # instead of sharing "O" with the self-attention sub-layer.
                setattr(self, "Oenc" + suffix, L.Dense(self.d_model))
            setattr(self, "O" + suffix, L.Dense(self.d_model))
            setattr(self, "FFN1" + suffix, L.Dense(self.d_ff, activation="relu"))
            setattr(self, "FFN2" + suffix, L.Dense(self.d_model))

    def _ffn(self, block_id, attention_output):
        """Apply the block's two-layer feed-forward network (ReLU in between)."""
        hidden = getattr(self, "FFN1" + str(block_id))(attention_output)
        return getattr(self, "FFN2" + str(block_id))(hidden)

    def _split_heads(self, x, depth):
        """Reshape (batch, seq, heads*depth) into (batch, heads, seq, depth)."""
        batch = tf.shape(x)[0]
        x = tf.reshape(x, (batch, -1, self.num_heads, depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def _merge_heads(self, x):
        """Inverse of ``_split_heads`` for the attention output (depth = d_v)."""
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        batch = tf.shape(x)[0]
        return tf.reshape(x, (batch, -1, self.num_heads * self.d_v))

    def _scaled_dot_product(self, Q, K, V, mask=False):
        """Compute softmax(Q K^T / sqrt(d_k)) V over the last two axes.

        Args:
            Q, K, V: tensors whose last two axes are (seq, depth); any leading
                axes (batch, heads) are treated as batch dimensions.
            mask: when True, add the causal mask built by ``_init_structure``
                (only available when it was called with ``decoder_part=True``).
        """
        score = tf.matmul(Q, K, transpose_b=True) / float(np.sqrt(self.d_k))
        if mask:
            # Apply the mask so future words cannot affect the current word.
            q_len = int(score.shape[-2])
            k_len = int(score.shape[-1])
            score = score + self.mask[:, :q_len, :k_len]
        weights = tf.nn.softmax(score, axis=-1)
        return tf.matmul(weights, V)

    def _multi_head_attention(self, block_id, Q, K, V, connection_head=False, mask=False):
        """Project Q/K/V, attend independently in each head, then merge and project.

        Args:
            block_id: index of the block whose layers are used.
            Q, K, V: inputs to the query/key/value projections.
            connection_head: use the encoder-decoder (``*enc``) layers instead
                of the self-attention layers.
            mask: forward to ``_scaled_dot_product``.

        Returns:
            Tensor with last dimension ``d_model``.
        """
        tag = ("enc" if connection_head else "") + str(block_id)
        Q = self._split_heads(getattr(self, "Q" + tag)(Q), self.d_k)
        K = self._split_heads(getattr(self, "K" + tag)(K), self.d_k)
        V = self._split_heads(getattr(self, "V" + tag)(V), self.d_v)
        context = self._merge_heads(self._scaled_dot_product(Q, K, V, mask))
        return getattr(self, "O" + tag)(context)

    def _block_computation(self, *args, **kwargs):
        raise NotImplementedError("Transformer is abstract class. You must implement this function!")

    def call(self, *args, **kwargs):
        raise NotImplementedError("Transformer is abstract class. You must implement this function!")

In [None]:
class Encoder(Transformer):
    """Encoder stack: per block, self-attention then feed-forward, each wrapped
    in a residual connection followed by layer normalization.

    Takes the same constructor arguments as ``Transformer``.
    """

    def __init__(self, num_blocks, num_heads, vocab_size, seq_len, d_model, d_k, d_v, d_ff):
        super(Encoder, self).__init__(num_blocks, num_heads, vocab_size, seq_len, d_model, d_k, d_v, d_ff)
        self._init_structure()
        # Create the normalization layers once and keep them as attributes.
        # Building a new LayerNormalization inside the forward pass would
        # re-initialize its scale/offset on every call and keep them out of
        # `self.variables`, so they would never be trained.
        for block_id in range(self.num_blocks):
            setattr(self, "LNatt" + str(block_id), L.LayerNormalization())
            setattr(self, "LNffn" + str(block_id), L.LayerNormalization())

    def _block_computation(self, block_id, x):
        """Run one encoder block on ``x`` and return a tensor of the same width."""
        attention_output = self._multi_head_attention(block_id, x, x, x, connection_head=False, mask=False)
        attention_output = getattr(self, "LNatt" + str(block_id))(attention_output + x)

        block_output = self._ffn(block_id, attention_output)
        block_output = getattr(self, "LNffn" + str(block_id))(block_output + attention_output)
        return block_output

    def call(self, x):
        """Embed token ids ``x``, add positional encoding and apply all blocks."""
        word_embed = self.word_embed(x)
        # Slice the encoding to the actual input length (as Decoder.call does)
        # so inputs shorter than seq_len are accepted too.
        word_embed = word_embed + self.pos_enc[:, :int(word_embed.shape[1]), :]

        block_output = word_embed
        for block_id in range(self.num_blocks):
            block_output = self._block_computation(block_id, block_output)
        return block_output

In [None]:
class Decoder(Transformer):
    """Decoder stack: per block, masked self-attention, encoder-decoder
    attention and feed-forward, each wrapped in a residual connection followed
    by layer normalization; a final dense layer maps to vocabulary logits.

    Takes the same constructor arguments as ``Transformer``.
    """

    def __init__(self, num_blocks, num_heads, vocab_size, seq_len, d_model, d_k, d_v, d_ff):
        super(Decoder, self).__init__(num_blocks, num_heads, vocab_size, seq_len, d_model, d_k, d_v, d_ff)
        self._init_structure(decoder_part=True)
        # Create the normalization layers once and keep them as attributes.
        # Building a new LayerNormalization inside the forward pass would
        # re-initialize its scale/offset on every call and keep them out of
        # `self.variables`, so they would never be trained.
        for block_id in range(self.num_blocks):
            setattr(self, "LNself" + str(block_id), L.LayerNormalization())
            setattr(self, "LNenc" + str(block_id), L.LayerNormalization())
            setattr(self, "LNffn" + str(block_id), L.LayerNormalization())
        self.logits = L.Dense(units=vocab_size)

    def _block_computation(self, block_id, x, encoder_output):
        """Run one decoder block on ``x``, attending to ``encoder_output``."""
        block = str(block_id)

        # Masked self-attention over the target tokens seen so far.
        attention_output = self._multi_head_attention(block_id, x, x, x, connection_head=False, mask=True)
        attention_output = getattr(self, "LNself" + block)(attention_output + x)

        # Queries come from the decoder, keys/values from the encoder output.
        connection_output = self._multi_head_attention(block_id, attention_output, encoder_output,
                                                       encoder_output, connection_head=True, mask=False)
        connection_output = getattr(self, "LNenc" + block)(connection_output + attention_output)

        block_output = self._ffn(block_id, connection_output)
        block_output = getattr(self, "LNffn" + block)(block_output + connection_output)
        return block_output

    def call(self, x, encoder_output):
        """Return per-position vocabulary logits for target token ids ``x``."""
        word_embed = self.word_embed(x)
        # The input may be shorter than seq_len (step-by-step decoding).
        word_embed = word_embed + self.pos_enc[:, :int(word_embed.shape[1]), :]
        block_output = word_embed
        for block_id in range(self.num_blocks):
            block_output = self._block_computation(block_id, block_output, encoder_output)
        logits = self.logits(block_output)
        return logits

In [None]:
def loss_function(labels, logits):
    """Sequence-level cross-entropy between integer labels and logits.

    The per-position sparse softmax cross-entropy is summed along axis 1 and
    the resulting per-example totals are averaged along axis 0.
    """
    per_position = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    per_example = tf.reduce_sum(per_position, axis=1)
    return tf.reduce_mean(per_example, axis=0)

### Define hyperparameters for the Transformer model

In [None]:
# Hyperparameters passed to both the Encoder and the Decoder below.
NUM_BLOCKS = 2  # num_blocks: stacked blocks per model
NUM_HEADS = 2  # num_heads: attention heads per block
DIMENSION_MODEL = 32  # d_model: embedding / block output width
DIMENSION_K = 16  # d_k: query/key width per head
DIMENSION_V = 16  # d_v: value width per head
DIMENSION_FF = 64  # d_ff: hidden width of the feed-forward layer

In [None]:
# The encoder embeds source tokens (human_vocab) with sequences of length Tx.
encoder = Encoder(num_blocks=NUM_BLOCKS, num_heads=NUM_HEADS, vocab_size=len(human_vocab), seq_len=Tx, 
                  d_model=DIMENSION_MODEL, d_k=DIMENSION_K, d_v=DIMENSION_V, d_ff=DIMENSION_FF)

# The decoder embeds and predicts target tokens (machine_vocab) with sequences of length Ty.
decoder = Decoder(num_blocks=NUM_BLOCKS, num_heads=NUM_HEADS, vocab_size=len(machine_vocab), seq_len=Ty, 
                  d_model=DIMENSION_MODEL, d_k=DIMENSION_K, d_v=DIMENSION_V, d_ff=DIMENSION_FF)

In [None]:
# Training schedule.
epochs = 3
batch_size = 64
# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = (X.shape[0] + batch_size - 1) // batch_size
# Join sources and targets column-wise so a single shuffle keeps pairs aligned.
data = tf.concat([X, Y], axis=1)

In [None]:
# Adam optimizer (TF 1.x API) with its default settings; shared by encoder and decoder.
optimizer = tf.train.AdamOptimizer()

In [None]:
for e in range(epochs):

    # Reshuffle the joined (source | target) rows every epoch, then split them
    # back into the Tx source columns and the remaining target columns.
    data = tf.random.shuffle(data)

    X, Y = data[:, :Tx], data[:, Tx:]

    pbar = tqdm.tqdm_notebook(range(0, num_batches), desc="Epoch " + str(e+1))

    # Running sum of batch losses, used only for the progress-bar average.
    train_loss = 0

    for it in pbar:
        start = it*batch_size
        end = (it+1)*batch_size

        with tf.GradientTape() as tape:
            encoder_output = encoder(X[start:end])

            # Teacher forcing: the decoder reads the target without its last
            # position and is scored against the target shifted left by one.
            logits = decoder(Y[start:end, :-1], encoder_output)
            loss = loss_function(Y[start:end, 1:], logits)

        train_loss += loss

        pbar.set_description("Epoch %s - Training loss: %f" % (e+1, (train_loss / (it+1))))

        # Only trainable weights are differentiated and updated.
        variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

In [None]:
# Hand-written source strings used to eyeball the trained model.
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']

for example in EXAMPLES:
    # Encode the source string as a batch of one id sequence of length Tx.
    source = string_to_int(example, Tx, human_vocab)
    source = np.array([source])

    encoder_output = encoder(source)
    # Decoding starts from the id of "#" (presumably the start-of-sequence
    # token -- confirm in nmt_utils).
    sentence = [machine_vocab["#"]]

    # Greedy decoding: each step re-runs the decoder on everything generated so
    # far and appends the arg-max token of the last position.
    for t in range(Ty):
        logits = decoder(np.array([sentence]), encoder_output)
        prediction = tf.nn.softmax(logits, axis=-1)
        prediction = np.argmax(prediction, axis=-1)
        sentence.append(prediction[0][-1])

    # "sequential" = tokens collected step by step (start token dropped);
    # "parallel" = arg-max at every position of the final decoder pass
    # (`prediction` is left over from the last loop iteration).
    sequential_output = [inv_machine_vocab[s] for s in sentence[1:]]
    parallel_output = [inv_machine_vocab[s] for s in prediction[0]]

    print("source:", example)
    print("sequential output:", ''.join(sequential_output))
    print("parallel output:", ''.join(parallel_output))
    print("-----------------------------------------------")