# **Implementing Transformer by Replicating *Attention Is All You Need***

### 1. Input Embeddings -> 

In [1]:
# %%writefile modules/01_inputEmbeddings.py
import torch
import torch.nn as nn
import math

class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        """Create the embedding table.

        Args:
            d_model: Length of each embedding vector (e.g. 512).
            vocab_size: Number of rows in the embedding table (one per token id).
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embeddings of the indices in ``x`` multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale
        

### 2. Positional Encoding -> 

\begin{align}
PE(pos, 2i)   &= \sin \left( \frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}} \right) \\
PE(pos, 2i+1) &= \cos \left( \frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}} \right)
\end{align}


In [2]:
# %%writefile modules/02_positionalEncoding.py

import torch
import torch.nn as nn
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The encoding table follows
        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
    and is stored as a non-trainable buffer named ``pe``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        """Precompute the encoding table.

        Args:
            d_model: Size of the last dimension of the inputs.
            seq_len: Number of positions to precompute (maximum usable length).
            dropout: Dropout probability applied after adding the encodings.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        # Must be a module (not the raw float) because forward() calls it.
        self.dropout = nn.Dropout(p=dropout)

        # Table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)

        # Column vector of positions [0, 1, ..., seq_len - 1], shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # 1 / 10000^(2i / d_model), computed in log space. The scaling has to
        # happen INSIDE the exponential: exp(2i * (-ln(10000) / d_model)).
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-math.log(10000.0) / d_model)
        )

        # sin on even columns, cos on odd columns. When d_model is odd there is
        # one fewer odd column than even columns, so trim the frequency vector.
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(dim=0)  # (1, seq_len, d_model) so it broadcasts over the batch

        # A buffer is saved in the state_dict and moved by .to()/.cuda(),
        # but is not a parameter and therefore is never updated by the optimizer.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions and apply dropout.

        ``x`` is indexed as (batch, seq_len, d_model); its second dimension must
        not exceed the ``seq_len`` given at construction.
        """
        # Buffers never require grad, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)
        

### 3. Layer Normalisation -> 


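\begin{align}
\large \hat{x}_j=\frac{x_j-\mu}{\sqrt{\sigma^2+\epsilon}}, \qquad y_j=\alpha\,\hat{x}_j+\text{bias}
\end{align}

Here $\mu$ and $\sigma^2$ are the mean and variance taken over the features (last dimension) of a single token, and $\epsilon$ is a small constant that prevents division by zero.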




In [10]:
# %%writefile modules/03_layerNormalisation.py
import torch
import torch.nn as nn

class LayerNormalization(nn.Module):
    """Normalize the last dimension to zero mean / unit variance, then scale and shift.

    Computes ``alpha * (x - mean) / sqrt(var + eps) + bias`` where ``mean`` and
    ``var`` are taken over the last dimension and ``alpha`` / ``bias`` are
    learnable scalars.
    """

    def __init__(self, eps: float = 10**-6):
        """
        Args:
            eps: Small constant added to the variance for numerical stability
                and to avoid division by zero.
        """
        super().__init__()
        self.eps = eps

        self.alpha = nn.Parameter(torch.ones(1))    # multiplicative gain
        self.bias = nn.Parameter(torch.zeros(1))    # additive shift

    def forward(self, x):
        """Return the layer-normalized tensor, same shape as ``x``."""
        # keepdim=True keeps the reduced axis so the results broadcast against x.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: matches the sqrt(sigma^2 + eps) formula
        # and stays finite when the last dimension has a single element.
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        return self.alpha * (x - mean) / torch.sqrt(var + self.eps) + self.bias

### 4. Feed-Forward Network -> 

The Feed Forward Network (FFN) in the Transformer model is represented as:


\begin{align}
\large \operatorname{FFN}(x)=\max \left(0, x W_1+b_1\right) W_2+b_2
\end{align}




In [13]:
# %%writefile modules/04_feedForwardNetwork.py

import torch
import torch.nn as nn

class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        """
        Args:
            d_model: Input and output feature size.
            d_ff: Hidden feature size between the two linear layers.
            dropout: Dropout probability applied after the ReLU.
        """
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1, b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)  # W2, b2

    def forward(self, x):
        """Map (..., d_model) -> (..., d_ff) -> (..., d_model)."""
        hidden = self.linear_1(x)
        activated = torch.relu(hidden)
        regularised = self.dropout(activated)
        return self.linear_2(regularised)

### 5. Multi-Head Attention -> 



![Multi-Head Attention Workflow -> ](https://i.ibb.co/Y0mbNbH/image.png)

In [3]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects q/k/v with separate linear layers, splits the ``d_model`` axis into
    ``h`` heads of size ``d_k = d_model // h``, runs scaled dot-product attention
    per head, concatenates the heads and applies the output projection ``w_o``.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        """
        Args:
            d_model: Feature size of the inputs and outputs.
            h: Number of attention heads; must divide ``d_model``.
            dropout: Dropout probability applied to the attention weights.
        """
        super().__init__()

        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, 'Embedding dimension must be divisible by number of heads'

        self.d_k = d_model // h  # per-head feature size

        self.w_q = nn.Linear(d_model, d_model)  # Wq
        self.w_k = nn.Linear(d_model, d_model)  # Wk
        self.w_v = nn.Linear(d_model, d_model)  # Wv

        # Wo: d_v equals d_k here, and d_k * h == d_model.
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(p=dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last two axes are (seq_len, d_k).
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            Tuple ``(weights @ value, weights)``.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # -1e9 stands in for -infinity: these entries become ~0 after softmax.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim=-1)  # (Batch, h, Seq_len, Seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        """Run multi-head attention.

        Args:
            q, k, v: Tensors indexed as (Batch, Seq_len, d_model).
            mask: Optional mask forwarded to :meth:`attention`.

        Returns:
            Tensor of shape (Batch, Seq_len of q, d_model). The attention
            weights of the last call are kept in ``self.attention_score``.
        """
        # Linear projections Q', K', V': (Batch, Seq_len, d_model) each.
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Split d_model into h heads:
        # (Batch, Seq_len, d_model) -> (Batch, Seq_len, h, d_k) -> (Batch, h, Seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        # Reshape the VALUE projection itself (not the key) so w_v is actually used.
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # x: (Batch, h, Seq_len, d_k); attention_score: (Batch, h, Seq_len, Seq_len)
        x, self.attention_score = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        # Concatenate heads: (Batch, h, Seq_len, d_k) -> (Batch, Seq_len, d_model).
        # contiguous() is required before view() because transpose returns a strided view.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.d_k * self.h)

        return self.w_o(x)  # (Batch, Seq_len, d_model)
                   

2