In [0]:
# Request TensorFlow 2.x through the Colab-only line magic.
# This is a best-effort setup step: any exception raised here is deliberately ignored.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    # Keep whichever TensorFlow version is already installed.
    pass

In [0]:
import os
import sys
import math
import time
import itertools

import tensorflow as tf
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

Code is based on: \\
[TensorFlow Transformer tutorial](https://www.tensorflow.org/tutorials/text/transformer) \\
[The annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html)

# Transformer

![](https://www.tensorflow.org/images/tutorials/transformer/transformer.png)

![](https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png)

![](https://www.tensorflow.org/images/tutorials/transformer/scaled_attention.png)


Encoder layer attention
![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_119_1.png)

Decoder layer attention
![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_119_11.png)

# BERT - bidirectional pretraining of Transformer

![](https://drive.google.com/uc?export=view&id=1LfLDPCHlovwwChNGPq8ArExk1uUshuVM)

# Transformer for other tasks

Transformers and self-attention modules are not limited to NLP; they are widely used across deep learning.



*   Generative models - [Self-Attention Generative Adversarial Networks](https://arxiv.org/pdf/1805.08318.pdf), [Music Transformer](https://openreview.net/pdf?id=rJe4ShAcF7)
*   Computer Vision - [Image Transformer](https://arxiv.org/pdf/1802.05751.pdf), [Stand-Alone Self-Attention in Vision Models](https://arxiv.org/pdf/1906.05909.pdf), [Attention Augmented Convolutional Networks](http://openaccess.thecvf.com/content_ICCV_2019/papers/Bello_Attention_Augmented_Convolutional_Networks_ICCV_2019_paper.pdf)
*   Graph processing - [Graph Attention Networks](https://arxiv.org/pdf/1710.10903.pdf), [Molecule Attention Transformer](https://arxiv.org/pdf/2002.08264.pdf)





## IMDB Movie reviews dataset for sentiment classification - once again

**Overview**

This dataset contains movie reviews along with their associated binary sentiment polarity labels. It is intended to serve as a benchmark for sentiment classification.

**Dataset**

The core dataset contains 50,000 reviews split evenly into 25k train and 25k test sets. The overall distribution of labels is balanced (25k pos and 25k neg).

In the entire collection, no more than 30 reviews are allowed for any given movie because reviews for the same movie tend to have correlated ratings. Further, the train and test sets contain a disjoint set of movies, so no significant performance is obtained by memorizing movie-unique terms and their association with observed labels. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10. Thus reviews with more neutral ratings are not included in the train/test sets.

**Model**

This time, to classify reviews, we will use the Transformer model (encoder part only).

In [0]:
# Use the Keras that ships with TensorFlow so the dataset helper comes from the
# same package as the `keras` namespace used below (`from tensorflow import keras`).
from tensorflow.keras.datasets import imdb

# Every review is padded/truncated to this many tokens.
MAX_SEQUENCE_LEN = 200

(x_train, y_train), (x_test, y_test) = imdb.load_data()

# Pad AND truncate both splits the same way. `pad_sequences` truncates from the
# front by default, so the original train split kept the *last* 200 tokens while
# the test split kept the *first* 200. With truncating='post' position 0 always
# holds the first token of the review, which is the position the classifier
# head reads (`enc_output[:, 0, :]`).
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')
x_test = keras.preprocessing.sequence.pad_sequences(
    x_test, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')

# The embedding table must cover every token index that can appear in either
# split; sizing it from the training split alone can leave test indices out of range.
WORDS_IN_CORPORA = int(max(np.max(x_train), np.max(x_test))) + 1

In [0]:
# Sanity check: shapes of the padded inputs and labels, plus the vocabulary size
# derived from the largest token index.
x_train.shape, y_train.shape, x_test.shape, y_test.shape, WORDS_IN_CORPORA

In [0]:
# Word -> index table shipped with the dataset, and its inverse.
# NOTE(review): `imdb.load_data()` by default appears to shift token ids
# (`index_from=3`), so these raw indices may not line up with the values stored
# in x_train / x_test -- confirm before using the table to decode reviews.
words2index = imdb.get_word_index()
index2word = dict((index, word) for word, index in words2index.items())

# Words listed in ascending index order.
imdb_words = [index2word[index] for index in sorted(index2word)]
len(imdb_words)

In [0]:
# Peek at the first five words in index order.
imdb_words[0:5]

## Positional encoding

Since this model doesn't contain any recurrence or convolution, positional encoding is added to give the model some information about the relative position of the words in the sentence.

The positional encoding vector is added to the embedding vector. Embeddings represent a token in a d-dimensional space where tokens with similar meaning will be closer to each other. But the embeddings do not encode the relative position of words in a sentence. So after adding the positional encoding, words will be closer to each other based on the similarity of their meaning and their position in the sentence, in the d-dimensional space.

$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$ \\
$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$

![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_49_0.png)

In [0]:
def get_angles(pos, i, d_model):
  """Return the positional-encoding angles ``pos / 10000^(2*(i//2) / d_model)``.

  Args:
    pos: position index (scalar or array).
    i: embedding-dimension index (scalar or array); dimensions 2k and 2k+1
       share the same frequency because of the integer division by two.
    d_model: embedding size used to normalise the exponent.

  Returns:
    ``pos`` multiplied by the per-dimension angle rate (NumPy broadcasting applies).
  """
  pair_index = i // 2
  exponent = (2 * pair_index) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [0]:
def positional_encoding(text_len, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    text_len: number of positions to encode.
    d_model: embedding size.

  Returns:
    float32 tensor of shape (1, text_len, d_model); even columns hold the sine
    of the angle, odd columns hold the cosine.
  """
  positions = np.arange(text_len)[:, np.newaxis]
  dimensions = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dimensions, d_model)  # (text_len, d_model)

  even_cols = slice(0, None, 2)  # dimensions 2i
  odd_cols = slice(1, None, 2)   # dimensions 2i+1
  table[:, even_cols] = np.sin(table[:, even_cols])
  table[:, odd_cols] = np.cos(table[:, odd_cols])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

## Masking

Mask all the pad tokens in the batch of sequences. This ensures that the model does not treat padding as input when computing self-attention. 

The mask indicates where pad value 0 is present: it outputs a 1 at those locations, and a 0 otherwise.

In [0]:
def create_padding_mask(seq):
  """Mark padding positions (token id 0) with 1.0 and every other position with 0.0.

  Two singleton axes are inserted after the first axis so the mask can be
  broadcast onto the attention logits; for a (batch_size, seq_len) input the
  result has shape (batch_size, 1, 1, seq_len).
  """
  is_padding = tf.math.equal(seq, 0)
  padding_mask = tf.cast(is_padding, tf.float32)
  return padding_mask[:, tf.newaxis, tf.newaxis, :]

## Scaled dot product self-attention

![](https://www.tensorflow.org/images/tutorials/transformer/scaled_attention.png)

The attention function used by the transformer takes three inputs: Q (query), K (key), V (value). The equation used to calculate the attention weights is:

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V$

An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

The dot-product attention is scaled by a factor of the square root of the depth. This is done because for large values of depth the dot product grows large in magnitude, pushing the softmax function into regions where it has small gradients and resulting in a very hard softmax. 

In [0]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(Q K^T / sqrt(d_k)) V and the attention weights.

  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type (padding or look ahead)
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: optional float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k); entries equal to 1 mark positions
          that must not be attended to. Defaults to None (no masking).

  Returns:
    output: shape (..., seq_len_q, depth_v)
    attention_weights: shape (..., seq_len_q, seq_len_k)
  """
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # Scale by sqrt(depth of the keys).
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Masked positions get a large negative logit, so their softmax weight is ~0.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

## Multi-head attention

Multi-head attention consists of four parts:



1.   Linear layers and split into heads.
2.   Scaled dot-product attention.
3.   Concatenation of heads.
4.   Final linear layer.

Each multi-head attention block gets three inputs; Q (query), K (key), V (value). These are put through dense layers and split up into multiple heads.

The self-attention mechanism defined above is applied to each head. An appropriate mask must be used in the attention step. The attention outputs of all heads are then concatenated and put through a final dense layer.

Instead of one single attention head, Q, K, and V are split into multiple heads because it allows the model to jointly attend to information at different positions from different representational spaces. After the split each head has a reduced dimensionality, so the total computation cost is the same as a single head attention with full dimensionality.

![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_38_0.png)

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project, split into heads, attend, merge, project.

  Args:
    d_model: width of the query/key/value projections and of the output.
    num_heads: number of attention heads; must divide ``d_model`` evenly.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # Each head works on an equal slice of the projected features.
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads

    # Input projections for queries, keys and values.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied to the concatenated heads.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Run attention over all heads.

    Note the argument order: values, keys, queries, then the mask.

    Returns:
      output: (batch_size, seq_len_q, d_model)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """
    batch_size = tf.shape(q)[0]

    # Project, then split: each result is (batch_size, num_heads, seq_len_*, depth).
    q_heads = self.split_heads(self.wq(q), batch_size)
    k_heads = self.split_heads(self.wk(k), batch_size)
    v_heads = self.split_heads(self.wv(v), batch_size)

    # head_outputs: (batch_size, num_heads, seq_len_q, depth)
    head_outputs, attention_weights = scaled_dot_product_attention(
        q_heads, k_heads, v_heads, mask)

    # Put the head axis next to depth again and fold the two back into d_model.
    heads_last = tf.transpose(head_outputs, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    return self.dense(merged), attention_weights

## Position-wise feed forward network

The position-wise feed-forward network consists of two fully-connected layers with a ReLU activation in between.

While the linear transformations are the same across different positions, they use different parameters from layer to layer.

In [0]:
def position_wise_feed_forward_network(d_model, dff):
  """Return a two-layer feed-forward block: Dense(dff, relu) followed by Dense(d_model).

  Args:
    d_model: output width of the block.
    dff: width of the hidden ReLU layer.
  """
  hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

## Encoder Layer

Each encoder layer consists of sublayers:

1.    Multi-head attention (with padding mask)
2.    Point wise feed forward networks.

Each of these sublayers has a residual connection around it followed by a layer normalization. Residual connections help in avoiding the vanishing gradient problem in deep networks.

The output of each sublayer is normalised by the Layer Normalization method and equals 

```
LayerNorm(x + Sublayer(x))
```

There are N encoder layers in the transformer.

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalisation, i.e. ``LayerNorm(x + Dropout(Sublayer(x)))``.

  Args:
    d_model: feature width of the inputs and outputs.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward network.
    rate: dropout rate applied to each sub-layer output.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = position_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, mask):
    """Apply the block to ``x`` (batch_size, input_seq_len, d_model) using ``mask``."""
    # Self-attention: values, keys and queries are all the layer input.
    attended, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    out1 = self.layernorm1(x + self.dropout1(attended))

    transformed = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    return self.layernorm2(out1 + self.dropout2(transformed))

## Encoder

The Encoder consists of:

1.    Input Embedding
2.    Positional Encoding
3.    N encoder layers

The input is put through an embedding which is summed with the positional encoding. The output of this summation is the input to the encoder layers. The output of the encoder is the input to the classification head.

In [0]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding plus positional encoding followed by a stack of encoder layers.

  Args:
    num_layers: number of ``EncoderLayer`` blocks.
    d_model: embedding / feature width.
    num_heads: attention heads per block.
    dff: hidden width of each block's feed-forward network.
    input_vocab_size: number of rows in the embedding table.
    maximum_position_encoding: number of positions in the precomputed
      positional-encoding table.
    rate: dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    # Computed once at construction; sliced to the actual length in call().
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, mask):
    """Encode token ids ``x`` (batch_size, input_seq_len) into (batch_size, input_seq_len, d_model)."""
    seq_len = tf.shape(x)[1]

    # Embed, scale by sqrt(d_model), then add the positional encoding.
    embedded = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    embedded *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    hidden = self.dropout(embedded + self.pos_encoding[:, :seq_len, :])

    for encoder_layer in self.enc_layers:
      hidden = encoder_layer(hidden, mask)

    return hidden  # (batch_size, input_seq_len, d_model)

## Create the classification Transformer

The classification Transformer consists of the encoder and a dense classification layer. The encoder's output at the first position is the input to the linear layer, whose output is returned.

In [0]:
class Transformer(tf.keras.Model):
  """Encoder-only Transformer with a single-logit classification head.

  Args:
    num_layers: number of encoder layers.
    d_model: embedding / feature width.
    num_heads: attention heads per encoder layer.
    dff: hidden width of the feed-forward networks.
    vocab_size: size of the input vocabulary.
    max_len: number of positions covered by the positional encoding.
    rate: dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, max_len, rate=0.1):
    super().__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                           vocab_size, max_len, rate)

    # One output unit with no activation: the model returns a raw logit.
    self.final_layer = tf.keras.layers.Dense(1)

  def call(self, inp):
    """Map token ids ``inp`` (batch_size, inp_seq_len) to logits of shape (batch_size, 1)."""
    padding_mask = create_padding_mask(inp)

    encoded = self.encoder(inp, padding_mask)  # (batch_size, inp_seq_len, d_model)

    # Only the encoder output at the first position feeds the classifier.
    first_position = encoded[:, 0, :]  # (batch_size, d_model)
    return self.final_layer(first_position)  # (batch_size, 1)

## Optimizer

Optimizer is Adam with a custom learning rate scheduler according to the formula in the paper.

$lrate = d_{\text{model}}^{-0.5} \cdot
  \min({step\_num}^{-0.5},
    {step\_num} \cdot {warmup\_steps}^{-1.5})$


Below you can see example learning-rate curves for different model sizes and optimization hyperparameters.

![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_69_0.png)


In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  ``lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)``

  Args:
    d_model: model width; the learning rate is scaled by ``d_model ** -0.5``.
    warmup_steps: the second ``min`` argument grows linearly with the step and
      equals the first one when ``step == warmup_steps``.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # Keep the value as passed so get_config() can return it unchanged.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The optimizer may pass the step as an integer tensor, and rsqrt is only
    # defined for floating-point inputs, so convert explicitly.
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

## Define and compile the model

In [0]:
# Seed every random number generator in use so that weight initialisation,
# dropout and batch shuffling repeat across "Restart & Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Model size.
num_layers = 4    # number of encoder layers
d_model = 128     # embedding / feature width
num_heads = 4     # attention heads per layer (d_model must be divisible by it)
dff = 512         # hidden width of the feed-forward networks

# Values derived from the data-loading cell.
vocab_size = WORDS_IN_CORPORA
max_len = MAX_SEQUENCE_LEN

# Learning-rate schedule.
warmup_steps = 2000

In [0]:
# Instantiate the encoder-only classifier with the hyperparameters defined above
# (dropout rate keeps its default).
transformer = Transformer(num_layers, d_model, num_heads, dff, vocab_size, max_len)

In [0]:
# Warm-up learning-rate schedule driven by the model width.
learning_rate = CustomSchedule(d_model, warmup_steps)

# Adam with the schedule above in place of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [0]:
# Symbolic input of padded token ids, and the transformer's output for it.
sequence_input = keras.layers.Input(shape=(MAX_SEQUENCE_LEN,), dtype='int32')
sequence_output = transformer(sequence_input)

In [0]:
# Wrap the transformer in a functional Keras model with a fixed input length.
model = keras.models.Model(inputs=[sequence_input], outputs=[sequence_output])

In [0]:
# Print the layer / parameter overview of the wrapped model.
model.summary()

In [0]:
# from_logits=True because the transformer's final Dense(1) layer has no
# activation, so the model outputs raw logits rather than probabilities.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=optimizer, 
              metrics=["accuracy"])

## Train the model

In [0]:
# Train for 10 epochs with batches of 32.
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are test-set metrics; no separate validation split is held out.
model.fit(x_train, y_train, epochs=10, batch_size=32,
          validation_data=(x_test, y_test))

# Exercise

Find some nice attention patterns in the data and visualise them.