# "Attention is all you need" - An extended overview

# Attention Mechanism

![Screenshot](img.jpg)

In [83]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization
import string
import re
from random import shuffle
import nltk

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_580/762390872.py", line 8, in <module>
    import nltk
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site-packages/nltk/__init__.py", line 132, in <module>
    from nltk.collocations import *
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site-packages/nltk/collocations.py", line 36, in <module>
    from nltk.metrics import (
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site-packages/nltk/metrics/__init__.py", line 18, in <module>
    from nltk.metrics.association import (
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site-packages/nltk/metrics/association.py", line 26, in <module>
    from scipy.stats import fisher_exact
  File "/home/eduardburlacu/miniconda3/envs/tf/lib/python3.9/site

In [None]:
class SelfAttention(layers.Layer):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(dk)) V``.

    The layer creates no sub-layers and holds no trainable weights.
    """

    def __init__(self):
        super(SelfAttention, self).__init__()

    def call(self, inputs, training=False, mask=False):
        """
        Feed Q and K through the scaled matrix product, optionally mask for
        causality, then apply softmax to obtain attention weights over V.

        Args:
            inputs: sequence ``(Q, K, V)``. Q and K share their last dimension
                ``dk``; V has last dimension ``dv``. Shapes are ``(N, dk)`` and
                ``(N, dv)``, optionally preceded by batch dimensions.
            training: unused; accepted for Keras API compatibility.
            mask: when truthy, apply a causal mask so that position ``i`` can
                only attend to positions ``<= i``.

        Returns:
            Tensor of shape ``(..., N, dv)``.
        """
        Q, K, V = inputs
        # Feature sizes live on the last axis whether or not a batch axis exists.
        self.dk = K.shape[-1]
        self.dv = V.shape[-1]

        # transpose_b swaps only the two innermost axes, so batched input works.
        scores = tf.linalg.matmul(Q, K, transpose_b=True)
        # tf.sqrt rejects integer input, so cast the key size to the score dtype.
        scores = scores / tf.math.sqrt(tf.cast(tf.shape(K)[-1], scores.dtype))

        if mask:
            # Lower-triangular ones mark the (query, key) pairs with key <= query.
            allowed = tf.linalg.band_part(tf.ones_like(scores), -1, 0)
            # Disallowed scores get the most negative finite value, which
            # softmax maps to a weight of zero.
            blocked = tf.constant(scores.dtype.min, dtype=scores.dtype)
            scores = tf.where(allowed > 0, scores, blocked)

        weights = tf.nn.softmax(scores, axis=-1)
        return tf.linalg.matmul(weights, V)

class MultiHeadAttention(layers.Layer):
    """Multi-head attention built from independent per-head projections.

    Each head projects Q, K and V with its own Dense layers (sizes ``dk``,
    ``dk`` and ``dv``), runs ``SelfAttention`` on the projections, and the
    concatenated head outputs are projected to ``dmodel`` by ``W0``.
    """

    def __init__(self, heads=8, dk=64 ,dv=64, dmodel=512):
        # NOTE(review): nothing below relies on dk being divisible by heads;
        # the check is kept only so existing callers see the same behaviour.
        assert (dk% heads == 0)
        super(MultiHeadAttention,self).__init__()
        self.h = heads
        self.dk = dk
        self.dv = dv
        self.dmodel = dmodel
        self.dense_Q = [layers.Dense(dk) for _ in range(heads)]
        self.dense_K = [layers.Dense(dk) for _ in range(heads)]
        self.dense_V = [layers.Dense(dv) for _ in range(heads)]
        self.W0 = layers.Dense(dmodel)
        # One attention layer is shared by all heads instead of being
        # instantiated again on every forward pass.
        self.attention = SelfAttention()

    def call(self, inputs, training=False, mask=None):
        """
        Run every head on ``inputs`` and merge the results.

        Args:
            inputs: sequence ``(Q, K, V)`` of tensors.
            training: unused; accepted for Keras API compatibility.
            mask: pass ``True`` to request causal masking in every head. Any
                other value (including ``None``) leaves attention unmasked.

        Returns:
            Tensor whose last dimension is ``dmodel``.
        """
        Q, K, V = inputs
        # Only a literal boolean enables masking, so a non-boolean mask object
        # is ignored exactly as it was before this argument was forwarded.
        causal = isinstance(mask, bool) and mask
        head_outputs = [
            self.attention([project_q(Q), project_k(K), project_v(V)], mask=causal)
            for project_q, project_k, project_v in zip(
                self.dense_Q, self.dense_K, self.dense_V
            )
        ]
        # Heads are joined along the feature axis before the output projection.
        return self.W0(tf.concat(head_outputs, axis=-1))


In [None]:
class Encoder(layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    followed by a residual connection and normalisation.

    NOTE(review): the block normalises with BatchNormalization, as the
    original code did; the Transformer paper uses layer normalisation --
    confirm which one is intended.
    """

    def __init__(self, heads=8, dk=64 ,dv=64, dmodel=512, dff=2048, name='Encoder',**kwargs):
        super(Encoder, self).__init__(name=name, **kwargs)
        self.dmodel= dmodel
        self.dff=dff
        self.MHA  = MultiHeadAttention(heads, dk ,dv, dmodel)
        self.dense1 = layers.Dense(dff,activation='relu')
        self.dense2 = layers.Dense(dmodel)
        # Sub-layers are created once here so their state is tracked by Keras
        # and reused across calls instead of being rebuilt on every call.
        self.norm1 = layers.BatchNormalization()
        self.norm2 = layers.BatchNormalization()
        self.dropout1 = layers.Dropout(rate=0.1)
        self.dropout2 = layers.Dropout(rate=0.1)

    def call(self, inputs, *args, **kwargs):
        """
        Args:
            inputs: tensor whose last dimension is ``dmodel`` (required by the
                residual additions).
            **kwargs: an optional ``training`` flag is forwarded to the dropout
                and normalisation layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # None lets each sub-layer fall back to its own default training mode.
        training = kwargs.get('training')

        # Self-attention sub-layer with residual connection.
        attended = self.MHA([inputs, inputs, inputs])
        x = self.norm1(attended + inputs, training=training)

        # Feed-forward sub-layer; each dropout output feeds the next layer.
        y = self.dropout1(x, training=training)
        y = self.dense1(y)
        y = self.dropout2(y, training=training)
        y = self.dense2(y)

        return self.norm2(x + y, training=training)

class Decoder(layers.Layer):
    """One decoder block: self-attention over the target sequence,
    cross-attention over the encoder output, and a feed-forward network, each
    followed by a residual connection and normalisation.

    NOTE(review): the block normalises with BatchNormalization, as the
    original code did; the Transformer paper uses layer normalisation --
    confirm which one is intended.
    """

    def __init__(self, heads=8, dk=64 ,dv=64, dmodel=512,dff=2048, name='Decoder',**kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)
        self.dmodel= dmodel
        self.dff=dff
        self.MHA1 = MultiHeadAttention(heads, dk ,dv, dmodel)
        self.MHA2 = MultiHeadAttention(heads, dk ,dv, dmodel)
        self.dense1 = layers.Dense(dff,activation='relu')
        self.dense2 = layers.Dense(dmodel)
        # Sub-layers are created once here so their state is tracked by Keras
        # and reused across calls instead of being rebuilt on every call.
        self.norm1 = layers.BatchNormalization()
        self.norm2 = layers.BatchNormalization()
        self.norm3 = layers.BatchNormalization()
        self.dropout1 = layers.Dropout(rate=0.1)
        self.dropout2 = layers.Dropout(rate=0.1)

    def call(self, inputs, *args, **kwargs):
        """
        Args:
            inputs: pair ``(encoded, recurrent)``: the encoder output and the
                embedded target sequence. Both need ``dmodel`` as their last
                dimension for the residual additions.
            **kwargs: an optional ``training`` flag is forwarded to the dropout
                and normalisation layers.

        Returns:
            Tensor with the same shape as ``recurrent``.
        """
        encoded, recurrent = inputs
        # None lets each sub-layer fall back to its own default training mode.
        training = kwargs.get('training')

        # Self-attention over the target sequence; mask=True requests causal
        # masking so a position cannot attend to later positions.
        x = self.MHA1([recurrent, recurrent, recurrent], mask=True)
        x = self.norm1(x + recurrent, training=training)

        # Cross-attention: queries come from the decoder state, keys and
        # values from the encoder output (inputs are ordered Q, K, V).
        y = self.MHA2([x, encoded, encoded])
        x = self.norm2(x + y, training=training)

        # Feed-forward sub-layer; each step consumes the previous step's output.
        z = self.dropout1(x, training=training)
        z = self.dense1(z)
        z = self.dropout2(z, training=training)
        z = self.dense2(z)

        return self.norm3(x + z, training=training)


![Transformer](Transformer.png)

In [None]:
class PositionalEncoding(layers.Layer):
    """Sinusoidal positional encoding.

    For position ``pos`` and feature index ``j`` with ``i = j // 2``::

        PE(pos, 2i)     = sin(pos / 10000 ** (2i / dmodel))
        PE(pos, 2i + 1) = cos(pos / 10000 ** (2i / dmodel))
    """

    # NOTE(review): the default name contains a space; some Keras/TensorFlow
    # versions may reject such names -- confirm. Kept for compatibility.
    def __init__(self, name='Positional Encoder'):
        super(PositionalEncoding, self).__init__(name=name)

    def call(self, inputs, *args, **kwargs):
        """
        Args:
            inputs: tensor of rank >= 2 whose last two axes are
                ``(sequence_length, dmodel)``. Only its shape and dtype are
                used.

        Returns:
            The encoding, broadcast to the shape of ``inputs`` and cast to its
            dtype, so it can be added to ``inputs`` element-wise.
        """
        input_shape = tf.shape(inputs)
        seq_len, dmodel = input_shape[-2], input_shape[-1]

        # Column vector of positions: shape (seq_len, 1).
        positions = tf.cast(tf.range(seq_len), tf.float32)[:, tf.newaxis]

        feature_index = tf.range(dmodel)
        parity = feature_index % 2
        # Features 2i and 2i + 1 share the exponent 2i / dmodel.
        exponent = tf.cast(feature_index - parity, tf.float32) / tf.cast(dmodel, tf.float32)
        angles = positions / tf.pow(10000.0, exponent)  # (seq_len, dmodel)

        # Even feature indices take the sine, odd ones the cosine.
        encoding = tf.where(tf.equal(parity, 0), tf.sin(angles), tf.cos(angles))
        encoding = tf.cast(encoding, inputs.dtype)
        return tf.broadcast_to(encoding, input_shape)

class Transformer(keras.Model):
    """Encoder-decoder Transformer with greedy autoregressive decoding."""

    def __init__(self, num_encoders=6, heads=8, dk=64 ,dv=64, dmodel=512, dff=2048, vocab_size=500, max_seq_length=200 ,name='Transformer', **kwargs):
        super(Transformer, self).__init__(name=name, **kwargs)

        self.Encoder = [Encoder(heads=heads, dk=dk ,dv=dv, dmodel=dmodel,dff=dff) for _ in range(num_encoders)]

        # The decoder stack has the same depth as the encoder stack.
        self.Decoder = [Decoder(heads=heads, dk=dk ,dv=dv, dmodel=dmodel,dff=dff) for _ in range(num_encoders)]

        self.inEmbedding = layers.Embedding(input_dim = vocab_size, output_dim = dmodel, input_length=max_seq_length)

        self.outEmbedding= layers.Embedding(input_dim = vocab_size, output_dim = dmodel, input_length=max_seq_length)

        self.dense = layers.Dense(vocab_size, activation='softmax')

        self.positional = PositionalEncoding()

        self.num_encoders= num_encoders

        # Upper bound on the number of decoding steps performed by call().
        self.max_seq_length = max_seq_length

    def call(self, inputs, training=None, mask=None):
        """
        Encode ``question`` and greedily decode one token per step.

        Args:
            inputs: triple ``(question, start_token, end_token)``.
                Assumes ``question`` and ``start_token`` are integer token-id
                tensors whose last axis is the sequence axis (``start_token``
                has length >= 1 on that axis) and ``end_token`` is an integer
                id -- TODO confirm against the caller.
            training: unused here; accepted for Keras API compatibility.
            mask: unused here; accepted for Keras API compatibility.

        Returns:
            List with one softmax distribution over the vocabulary per decoded
            step, each of shape ``(..., 1, vocab_size)``. Decoding stops once
            every sequence has produced ``end_token`` or after
            ``max_seq_length`` steps.

        NOTE(review): the early exit evaluates a tensor as a Python boolean,
        which presumably requires eager execution -- confirm before tracing
        this method into a graph.
        """
        question, start_token, end_token = inputs

        x = self.inEmbedding(question)
        x = x + self.positional(x)
        for encoder in self.Encoder:
            x = encoder(x)

        # Token ids decoded so far; grows by one along the last axis per step.
        tokens = tf.convert_to_tensor(start_token)
        end_token = tf.cast(end_token, tokens.dtype)
        finished = tf.zeros_like(tokens[..., -1:], dtype=tf.bool)

        output = []
        # Bounded loop: generation cannot run forever if end_token never appears.
        for _ in range(self.max_seq_length):
            z = self.outEmbedding(tokens)
            z = z + self.positional(z)
            for decoder in self.Decoder:
                z = decoder([x, z])

            # Project to the vocabulary only after the whole decoder stack,
            # and only for the newest position.
            probs = self.dense(z[..., -1:, :])
            output.append(probs)

            # Feed back the most likely token id, not the probability vector.
            next_token = tf.cast(tf.argmax(probs, axis=-1), tokens.dtype)
            tokens = tf.concat([tokens, next_token], axis=-1)

            finished = tf.logical_or(finished, tf.equal(next_token, end_token))
            if tf.reduce_all(finished):
                break
        return output


# Data Processing

Text Extractor

In [None]:
# Parse the tab-separated corpus into [english, spanish] pairs, wrapping each
# Spanish sentence in '[start]' / '[end]' markers.
inputs = []
# The encoding is explicit because the Spanish text contains non-ASCII
# characters and the platform default encoding may not decode them.
with open('archive/spa.txt', 'r', encoding='utf-8') as file:
    # Stream the file line by line instead of loading it whole.
    for line in file:
        line_split = line.rstrip('\n').split('\t')
        # Skip blank or malformed lines that do not have both columns; any
        # other error now surfaces instead of being silently swallowed.
        if len(line_split) < 2:
            continue
        eng, spn = line_split[:2]
        spn = '[start] ' + spn + ' [end]'
        inputs.append([eng, spn])


In [None]:
# Shuffle the sentence pairs in place and split them 65% / 15% / remainder
# into train / validation / test.
N = len(inputs)
# Ntest is the nominal 20% share; the test slice below takes whatever is left
# after train and validation, so rounding never drops a sample.
Ntrain, Nval, Ntest = int(.65 * N), int(.15 * N), int(.2 * N)
shuffle(inputs)
train_set = inputs[:Ntrain]
val_set = inputs[Ntrain:Ntrain + Nval]
test_set = inputs[Ntrain + Nval:]

# Each pair is [english, spanish]: index 0 is the source, index 1 the target.
train_eng = [pair[0] for pair in train_set]
train_spn = [pair[1] for pair in train_set]

val_eng = [pair[0] for pair in val_set]
val_spn = [pair[1] for pair in val_set]

test_eng = [pair[0] for pair in test_set]
# The Spanish targets are at index 1 (index 0 would duplicate the English).
test_spn = [pair[1] for pair in test_set]

In [None]:
def get_vocabulary_size(sequences):
    """Return the number of distinct NLTK word tokens across ``sequences``.

    The strings are joined with single spaces into one text, tokenized with
    ``nltk.word_tokenize``, and the unique tokens are counted.
    """
    joined_text = ' '.join(sequences)
    unique_tokens = set(nltk.word_tokenize(joined_text))
    return len(unique_tokens)

def custom_standardization(input_string):
    """Lowercase ``input_string`` and delete every character in ``strip_chars``.

    ``strip_chars`` is a module-level string that is looked up when the
    function is called, not when it is defined.
    """
    # Regex character class built from the escaped strip set.
    strip_pattern = "[%s]" % re.escape(strip_chars)
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), strip_pattern, ""
    )

# Vocabulary sizes and the longest sentence are measured over the full corpus.
eng = [tup[0] for tup in inputs]
spn = [tup[1] for tup in inputs]

eng_vocab_size = get_vocabulary_size(eng)
spn_vocab_size = get_vocabulary_size(spn)

# Sentence length is counted in whitespace-separated words.
eng_seq_length = max(len(sentence.split()) for sentence in eng)
spn_seq_length = max(len(sentence.split()) for sentence in spn)

seq_length = max(eng_seq_length, spn_seq_length)

batch_size = 32

# Characters removed by custom_standardization: all ASCII punctuation plus
# '¿', except the square brackets used by the '[start]' / '[end]' markers.
strip_chars = string.punctuation + '¿'
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

eng_vectorization = TextVectorization(
    max_tokens=eng_vocab_size,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    standardize=custom_standardization,
)

spn_vectorization = TextVectorization(
    max_tokens=spn_vocab_size,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    standardize=custom_standardization,
)

# adapt() needs the texts to build the vocabulary from. Only the training
# split is used, so validation and test sentences do not shape the vocabulary.
eng_vectorization.adapt(train_eng)
spn_vectorization.adapt(train_spn)
