### Pytorch implementation of Transformer
Code heavily inspired from https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec#d554.

In [1]:
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
from graphviz import Digraph
import numpy as np
import math
import spacy
#import en_core_web_sm
#import fr_core_news_sm
import torchtext
import pandas as pd
from torchviz import make_dot
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split
import copy
import time
import torch.nn.functional as F
import math
# torch.set_default_tensor_type('torch.cuda.FloatTensor')

### Sentence Tokenizer
![alt text](sentence_token.png "token")

In [2]:
# Load dataset
en = open('english.txt', encoding='utf-8').read().split('\n')
fr = open('french.txt', encoding='utf-8').read().split('\n')

en_ = spacy.load('en_core_web_sm')
fr_ = spacy.load('fr_core_news_sm')

def tokenize_en(sentence):
    return [tok.text for tok in en_.tokenizer(sentence)]
def tokenize_fr(sentence):
    return [tok.text for tok in fr_.tokenizer(sentence)]
EN_TEXT = Field(tokenize=tokenize_en)
FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>")

In [3]:
# Process data into dataframe
raw_data = {'English' : [line for line in en], 'French': [line for line in fr]}
df = pd.DataFrame(raw_data, columns=["English", "French"])
# remove very long sentences and sentences where translations are 
# not of roughly equal length
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')
df = df.query('fr_len < 80 & eng_len < 80')
df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')
df;

In [4]:
# Generate train and validation set
train, val = train_test_split(df,test_size=0.1)
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)

In [5]:
# Load the tokenized train and validation
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]
train,val = TabularDataset.splits(path='./', train='train.csv', validation='val.csv', format='csv', fields=data_fields)

In [6]:
# Index tokens
FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

In [7]:
train_iter = BucketIterator(train, batch_size=1, \
 shuffle=False,repeat=True)

#### Batch size:
In each batch, the sentences have been transposed so they are descending vertically (important: we will need to transpose these again to work with the transformer). Each index represents a token (word), and each column represents a sentence. We have 10 columns, as 10 was the batch_size we specified.

#### Determine batch size:
Additionally, if your RAM can process say 1500 tokens each iteration, and your batch_size is 20, then only when you have batches of length 75 will you be utilising all the memory. An efficient batching mechanism would change the batch size depending on the sequence length to make sure around 1500 tokens were being processed each iteration.

In [8]:
batch = next(iter(train_iter))
print(batch.French)

tensor([[    2],
        [10591],
        [    3]])


In [9]:
# Mask Generation for input
def input_mask_gen(EN_TEXT,input_seq):
    #input_seq = batch.English.transpose(0,1)
    input_pad = EN_TEXT.vocab.stoi['<pad>']
    # creates mask with 0s wherever there is padding in the input
    input_msk = (input_seq != input_pad).unsqueeze(1)
    return input_msk

In [10]:
# Mask for target sequence
def target_mask_gen(FR_TEXT,target_seq):
    #target_seq = batch.French.transpose(0,1)
    target_pad = FR_TEXT.vocab.stoi['<pad>']
    target_msk = (target_seq != target_pad).unsqueeze(1)
    size = target_seq.size(1) # get seq_len for matrix
    nopeak_mask = np.triu(np.ones((1, size, size),dtype=int),k=1).astype('uint8')
    nopeak_mask = Variable(torch.from_numpy(nopeak_mask) == 0)
#     import ipdb;ipdb.set_trace()
    target_msk = target_msk & nopeak_mask
    return target_msk, target_pad

#### When each word is fed into the network, this code will perform a look-up and retrieve its embedding vector. These vectors will then be learnt as a parameters by the model, adjusted with each iteration of gradient descent.

In [11]:
# Define word embeddings
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed=nn.Embedding(vocab_size, d_model)
    def forward(self,x):
        #print(list(self.embed.parameters()))
       
        #return x
        return self.embed(x)

## Position Encoding 
  

![alt text](position_encoding.png "Title")

In [12]:
# generate positional encoding matrix with sin and cos function
class positional_encoding(nn.Module):
    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.model_dim = d_model
        # create a matrix for positional encoding
        pos_encod= torch.zeros(max_seq_len, self.model_dim)
        for pos in range(max_seq_len):
            for i in range(0,self.model_dim,2):
                pos_encod[pos,i] = math.sin(pos/(10000**((2*i)/self.model_dim)))
                pos_encod[pos,i+1] = math.cos(pos/(10000**((2*(i+1))/self.model_dim)))
        pos_encod= pos_encod.unsqueeze(0)
        self.register_buffer('pos_encod',pos_encod)
    
    def forward(self,X):
        # make embeddings
        X = X * math.sqrt(self.model_dim)
        # add constant
        seq_len = X.size(1)
#         import ipdb;ipdb.set_trace()
        X = X + Variable(self.pos_encod[:,:seq_len], requires_grad=False)
        return X

### Embedding Generation
![alt text](Input_transformer.png "Input black")

### MultiHead Attention
![alt text](multihead.png "Input black")

In [13]:
# Multi-headed attention

grads = {}
results = {}
def save_grad(name):
    def hook(grad):
        grads[name] = grad
    return hook
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, model_dimension, dropout = 0.1):
        super().__init__()
        self.model_dimension= model_dimension
        self.keys_dim= model_dimension// heads
        self.head= heads
        
        #query vector
        self.q_linear= nn.Linear(model_dimension, model_dimension)
        #value vector
        self.v_linear= nn.Linear(model_dimension, model_dimension)
        #key vector
        self.k_linear= nn.Linear(model_dimension, model_dimension)
        self.dropout= nn.Dropout(dropout)
        
        self.output= nn.Linear(model_dimension, model_dimension)
        
    def forward(self, query, keys, value, mask=None, name=None):
        bs = query.size(0)
        keys= self.k_linear(keys).view(bs, -1, self.head, self.keys_dim)
        results['keys_'+name] = keys
        keys.register_hook(save_grad('keys_grad_'+name))
        query= self.q_linear(query).view(bs, -1, self.head, self.keys_dim)
        results['query_'+name] = query
        query.register_hook(save_grad('query_grad_'+name))
        value= self.v_linear(value).view(bs, -1, self.head, self.keys_dim)
        results['value_'+name]= value
        value.register_hook(save_grad('value_grad_'+name))
#         print('keys_'+name)
#         print(keys)
#         print('value_'+name)
#         print(value)
        keys= keys.transpose(1,2)
        query= query.transpose(1,2)
        value= value.transpose(1,2)
        
        attention_score= attention(query,keys,value,self.keys_dim,mask,self.dropout, name)        
        results['attn_score_'+name] = attention_score
        attention_score.register_hook(save_grad('attn_score_'+name))
        # combine the result from all head
        concat_result= attention_score.transpose(1,2).contiguous().view(bs, -1, self.model_dimension)
        results['attn_concat_'+name] = concat_result
        concat_result.register_hook(save_grad('attn_conct_'+name))        
        # pass through a last layer to match the dimension (Multiply with Wo)
        output= self.output(concat_result)
        return output
        

In [14]:
# For encoder, 
#Dimension: Batch_size * seq_len * model_dimension
# For Multi-head attention: (split into N heads)
# Dimension: batch_size * N * seq_len * (model_dimension/N)

### Single attention
![alt text](single_attention.png "attention")

In [15]:
# Attention: used by both encoder and decoder
def attention(q, k, v, d_k, mask=None, dropout=None, name=None):
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    results['score_'+name]=scores
    scores.register_hook(save_grad('score_'+name))
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    results['scores_softmax_'+name]= scores
    scores.register_hook(save_grad('score_softmax_'+name))
    
    if dropout is not None:
        pass
        #scores = dropout(scores)
    
    output = torch.matmul(scores, v)
    return output

In [16]:
# Define FeedForward 
class FeedForward(nn.Module):
    def __init__(self,model_dimension, dim_forward=3, dropout= 0.1):
        super().__init__()
        self.linear_1= nn.Linear(model_dimension, dim_forward)
        self.dropout= nn.Dropout(dropout)
        self.linear_2= nn.Linear(dim_forward, model_dimension)
    
    def forward(self,X,name="None"):
        #print("Forward Layer")
        X= (F.relu(self.linear_1(X)))
        results['x_linear_1_'+name] = X
        #print(X)
        #X.register_hook(save_grad('x_linear_1_'+name))
        
        X= self.linear_2(X)
        results['x_linear_2_'+name] = X
        #X.register_hook(save_grad('x_linear_2_'+name))
        return X

In [17]:
# a=results['x_linear_1_enc'].detach().numpy()
#b=grads['x_linear_1_enc'].detach().numpy()

In [18]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x, name=None):
#         print("++++++++++++++mean+++++++++++++++")
#         print(x.mean(dim=-1, keepdim=True))
#         print(x.std(dim=-1, keepdim=True))
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        norm.register_hook(save_grad('norm_'+name))
        return norm

### Encoder
![alt text](encoder.png "attention")

class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
   
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
       # print("Att crossed")
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        #print("Encoder out")
        return x
        
        
        

In [19]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff= FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout) 
        self.dropout_2 = nn.Dropout(dropout)
        self.norm_1 = Norm(d_model) 
        self.norm_2 = Norm(d_model)
        
    def forward(self, x, mask):   
        results['x'] = x
        x2=x
        #x2=self.norm_1(x,"enc")
        results['x2']=x2
        #x2.register_hook(save_grad('input_norm_out'))
        results['input_norm'] = x2
        
        #x = x + self.dropout_1(self.attn(x2,x2,x2,mask,"enc"))
        x = x + (self.attn(x2,x2,x2,mask,"enc"))
        #x.register_hook(save_grad('enc_att_out'))
        results['enc_attn_out'] = x
        
        xz=x
        #xz= self.norm_2(x,"enc")
        results['After_attn_norm'] = xz
        #xz.register_hook(save_grad('after_norm'))
        
        x2 = x+ (self.ff(xz,'enc'))
        results['enc_after_ff'] = x2
        #x2.register_hook(save_grad('enc_after_ff'))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, model_dimension, heads, dropout=0.1):
        super().__init__()
        self.normalize1= Norm(model_dimension)
        self.normalize2= Norm(model_dimension)
        self.normalize3= Norm(model_dimension)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, model_dimension)
        self.attn_2 = MultiHeadAttention(heads, model_dimension)
        self.ff = FeedForward(model_dimension)
        
    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.normalize1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
       # print("Dec passed 1")
        x2 = self.normalize2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs,src_mask))
       # print("Dec passed 2")
        x2 = self.normalize3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

In [20]:
class DecoderLayer(nn.Module):
    def __init__(self, model_dimension, heads, dropout=0.1):
        super().__init__()
        
        self.attn_1 = MultiHeadAttention(heads, model_dimension)
        self.attn_2 = MultiHeadAttention(heads, model_dimension)
        self.ff = FeedForward(model_dimension)
        
    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2=x
        x = x + (self.attn_1(x2, x2, x2, trg_mask,"dec"))
        x = x + (self.attn_2(x2, e_outputs, e_outputs,src_mask,"dec"))
        x = x + (self.ff(x2,"dec"))
        return x

In [21]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, model_dimension, N, heads):
        super().__init__()
        self.N= N
        self.embed= Embedder(vocab_size, model_dimension)
        self.position_encoder= positional_encoding(model_dimension)
        self.layers= get_clones(EncoderLayer(model_dimension, heads), N)
        self.normalized= Norm(model_dimension)
        self.norm= Norm(model_dimension)
    def forward(self, source, mask):
        x= self.embed(source)
        x= self.position_encoder(x)
#         print("Encoder")
#         print(x)
        results['input_encoding'] = x
        #x.register_hook(save_grad('input_encod_encoder'))
        for i in range(N):
            #print("Encoder:",i)
            x= self.layers[i](x, mask)
            results['enc_Ni_out'] = x
            #x.register_hook(save_grad('enc_Ni_out'))
        x_out= self.norm(x,"enc")
        results['enc_out'] = x_out
        x_out.register_hook(save_grad('enc_out'))
        return x_out        

In [23]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, model_dimension, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, model_dimension)
        self.positional_encoder = positional_encoding(model_dimension)
        self.layers = get_clones(DecoderLayer(model_dimension, heads), N)
        self.normalized= Norm(model_dimension)
    
    def forward(self, target,
                e_output, source_mask, target_mask):
        #print(target)
        x= self.embed(target)
        x= self.positional_encoder(x)
#         print("Decoder")
#         print(x)
        for i in range(self.N):
            #print("Decoder:",i)
            x= self.layers[i](x, e_output, source_mask, target_mask)
        return x
        #return self.normalized(x)

In [24]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder= Encoder(src_vocab, d_model,N,heads)
        self.decoder= Decoder(trg_vocab, d_model,N,heads)
        self.output= nn.Linear(d_model, trg_vocab)
    def forward(self, source, target, source_mask, target_mask):
        encoder_out= self.encoder(source, source_mask)
        decoder_out= self.decoder(target, encoder_out, source_mask, target_mask)
        output= self.output(decoder_out)
        return output

In [25]:
model_dimension= 1024
attention_heads= 1
N = 1
source_vocab= len(EN_TEXT.vocab)
target_vocab= len(FR_TEXT.vocab)

model= Transformer(source_vocab, target_vocab, model_dimension, N, attention_heads)
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.to(device)

# initialize the parameters
for p in model.parameters():
    if p.dim()>1:
        nn.init.xavier_normal_(p)


In [26]:
# add optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9,0.98), eps= 1e-9)
def train_model(epochs, print_every=1):
    model.train()
    start_all = time.time()
    start = time.time()
    temp = start
    total_loss = 0
    for epoch in range(epochs):
        #print(epoch)
        c = 30;
        for i,batch in enumerate(train_iter):
            # print("Batch:",i)
            c= c-1
            if(c<1):
                break;
            start = time.time()
            src= batch.English.transpose(0,1)
#             print("src")
#             print(src)
            target= batch.French.transpose(0,1)
            # Last word to predict
            #import ipdb;ipdb.set_trace()
            target_input= target[:,:-1]
            targets = target[:, 1:].contiguous().view(-1)
            source_mask= input_mask_gen(EN_TEXT,src)
            target_mask,target_pad= target_mask_gen(FR_TEXT,target_input)
            #z=make_dot(model(src, target_input, source_mask, target_mask),params=dict(model.named_parameters()))           
            #z.view()
            st= time.time()
            preds= model(src, target_input, source_mask, target_mask)
            #import ipdb;ipdb.set_trace()
#             print("forward time:",time.time()-st)
            #print("done")
            optimizer.zero_grad()
            
            loss= F.cross_entropy(preds.view(-1, preds.size(-1)),targets, ignore_index=target_pad)
            star= time.time()
            loss.backward()
            #import ipdb;ipdb.set_trace()
#             print("gradient computation time:",time.time()-star)
            start = time.time()
            optimizer.step()
#             print("update time:",time.time()-start)
            total_loss += loss.data
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f,\
                %ds per %d iters" % ((time.time() - start) // 60,
                epoch + 1, i + 1, loss_avg, time.time() - temp,
                print_every))
                total_loss = 0
                temp = time.time()
    print("OVerall time: ",time.time()-start_all)

In [27]:
train_model(1)

time = 0m, epoch 1, iter = 1, loss = 9.977,                0s per 1 iters
time = 0m, epoch 1, iter = 2, loss = 14.619,                0s per 1 iters
time = 0m, epoch 1, iter = 3, loss = 9.995,                0s per 1 iters
time = 0m, epoch 1, iter = 4, loss = 9.663,                0s per 1 iters
time = 0m, epoch 1, iter = 5, loss = 8.060,                0s per 1 iters
time = 0m, epoch 1, iter = 6, loss = 13.183,                0s per 1 iters
time = 0m, epoch 1, iter = 7, loss = 9.336,                0s per 1 iters
time = 0m, epoch 1, iter = 8, loss = 10.978,                0s per 1 iters
time = 0m, epoch 1, iter = 9, loss = 9.173,                0s per 1 iters
time = 0m, epoch 1, iter = 10, loss = 8.333,                0s per 1 iters
time = 0m, epoch 1, iter = 11, loss = 9.617,                0s per 1 iters
time = 0m, epoch 1, iter = 12, loss = 9.482,                0s per 1 iters
time = 0m, epoch 1, iter = 13, loss = 9.431,                0s per 1 iters
time = 0m, epoch 1, iter = 14, 

In [28]:
torch.cuda.current_device()

0

In [29]:
results['enc_after_ff'].detach().numpy()

array([[[  0.1950692 ,   8.160911  ,  -8.741374  , ..., -27.39594   ,
          -6.054618  ,  21.88696   ],
        [  1.8207047 ,   8.017043  ,  -8.295161  , ..., -26.978642  ,
          -5.9500494 ,  21.364285  ],
        [  1.369138  ,   6.832446  ,  -8.596656  , ..., -26.890856  ,
          -6.5006127 ,  21.517189  ],
        ...,
        [ -0.84437275,   8.144324  , -10.158885  , ..., -27.03297   ,
          -6.3935385 ,  21.171732  ],
        [ -0.77687097,   8.395802  ,  -9.45327   , ..., -26.43468   ,
          -6.0821395 ,  21.697058  ],
        [  1.7248828 ,   8.588492  ,  -9.145571  , ..., -26.767185  ,
          -6.7698135 ,  21.449507  ]]], dtype=float32)

## Forward Pass

In [30]:
x= results['x'].detach().numpy()
np.mean(x,axis=-1)

array([[0.5054742 , 0.5440578 , 0.5356682 , 0.5103475 , 0.4826672 ,
        0.48721611, 0.48022857, 0.4599744 ]], dtype=float32)

In [31]:
np.std(x,axis=-1)

array([[0.62725735, 0.60801345, 0.60049975, 0.6500525 , 0.62876284,
        0.63606274, 0.62796223, 0.6690816 ]], dtype=float32)

In [32]:
gamma = model.encoder.layers[0].norm_1.alpha.detach().numpy()
beta = model.encoder.layers[0].norm_1.bias.detach().numpy()
eps = model.encoder.layers[0].norm_1.eps
x= results['x'].detach().numpy().reshape(9,2)
batchnorm_forward(x,gamma,beta,eps)

ValueError: cannot reshape array of size 8192 into shape (9,2)

In [None]:
std = np.array([[0.7004],
         [0.1070],
         [0.0727],
         [0.6342],
         [1.2626],
         [1.3580],
         [0.9090],
         [0.2514],
         [0.0237]])

In [None]:
std.shape
#self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
 #       / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias

In [None]:
up=x-np.mean(x,axis=-1).reshape(9,1)
down = std +eps
(gamma * up/down) + beta

In [None]:
up/down

In [None]:
tensor([[[ 0.5199],
         [ 0.9371],
         [ 0.9490],
         [ 0.5778],
         [ 0.1163],
         [ 0.0345],
         [ 0.3737],
         [ 0.8237],
         [ 1.0051],
         [ 0.7182],
         [ 0.2459],
         [-0.0167],
         [ 0.2338]]], grad_fn=<MeanBackward1>)
tensor([[[0.7014],
         [0.1058],
         [0.0953],
         [0.5960],
         [1.2205],
         [1.3941],
         [0.8999],
         [0.2581],
         [0.0250],
         [0.4160],
         [1.0935],
         [1.4302],
         [1.1078]]], grad_fn=<StdBackward1>)

In [None]:
def batchnorm_forward(x, gamma, beta, eps):
    
    #import ipdb;ipdb.set_trace()
    N, D = x.shape
    #step1: calculate mean
    mu = 1./N * np.sum(x, axis = -1).reshape(9,1)
    #step2: subtract mean vector of every trainings example
    xmu = x - mu
      #step3: following the lower branch - calculation denominator
    sq = xmu ** 2
      #step4: calculate variance
    var = 1./N * np.sum(sq, axis = -1).reshape(9,1)
      #step5: add eps for numerical stability, then sqrt
    sqrtvar = np.sqrt(var + eps)
      #step6: invert sqrtwar
    ivar = 1./sqrtvar
      #step7: execute normalization
    xhat = xmu * ivar
      #step8: Nor the two transformation steps
    gammax = gamma * xhat
      #step9
    out = gammax + beta
      #store intermediate
    cache = (xhat,gamma,xmu,ivar,sqrtvar,var,eps)
    return out

In [None]:
import numpy as np
#from scipy.special import softmax


# In reality, for ML models, each column should represent one sequence. Hence, we need to transpose two times
e_dim= 2;
seq_len= 4;

def normalize():
    pass
def normback():
    pass

def softmax1(x):
    #return softmax(Z,axis=1)
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def dropout(p,dim):
    u1 = np.random.binomial(1, p, size=dim.shape)
    return (p*u1)

def Relu(x):
    return np.maximum(x, 0)

# Input
X= results['input_encoding'].detach().numpy()
#import ipdb; ipdb.set_trace()
# Normalize
X_norm= results['input_norm'].detach().numpy()

X_norm= X_norm[0]
# Define Wq,bq; Wk, bk; Wv, bv
Wq= model.encoder.layers[0].attn.q_linear.weight.detach().numpy()
bq= model.encoder.layers[0].attn.q_linear.bias.detach().numpy()

Wk= model.encoder.layers[0].attn.k_linear.weight.detach().numpy()
bk= model.encoder.layers[0].attn.k_linear.bias.detach().numpy()

Wv= model.encoder.layers[0].attn.v_linear.weight.detach().numpy()
bv= model.encoder.layers[0].attn.v_linear.bias.detach().numpy()

Q = np.dot(Wq,X_norm.transpose()).transpose() + bq
#print(Q)
K = np.dot(Wk,X_norm.transpose()).transpose() + bk
#print(K)
V = np.dot(Wv,X_norm.transpose()).transpose() + bv
Z= np.dot(Q,np.transpose(K))/np.sqrt(e_dim)

Z_res= softmax1(Z)

Z_out = np.dot(Z_res,V)

ZZ= (Z_out)
# Define output parameter for attention
Wo= model.encoder.layers[0].attn.output.weight.detach().numpy()
bo= model.encoder.layers[0].attn.output.bias.detach().numpy()
attn_out= np.dot(Wo, ZZ.transpose()).transpose() + bo 

Zo= attn_out + X

Zoo= (Zo)

# Normalized input before FC layer
Zoo_norm= results['After_attn_norm'].detach().numpy()


# Define weights for linear model
w1= model.encoder.layers[0].ff.linear_1.weight.detach().numpy().transpose()
b1= model.encoder.layers[0].ff.linear_1.bias.detach().numpy()

w2= model.encoder.layers[0].ff.linear_2.weight.detach().numpy().transpose()
b2= model.encoder.layers[0].ff.linear_2.bias.detach().numpy()

Z_ff_1 = np.dot(Zoo_norm, w1) + b1

Z_ff_1_relu = Relu(Z_ff_1)
Z_ff_2 = np.dot(Z_ff_1_relu, w2) + b2

H = Z_ff_2 + Zoo

#E_out= normalize(H)

## Backward Pass

In [None]:
def batchnorm_backward(dout, cache):

  #unfold the variables stored in cache
  xhat,gamma,xmu,ivar,sqrtvar,var,eps = cache
  #get the dimensions of the input/output
  N,D = dout.shape
  #step9
  dbeta = np.sum(dout, axis=0)
  dgammax = dout #not necessary, but more understandable
  #step8
  dgamma = np.sum(dgammax*xhat, axis=0)
  dxhat = dgammax * gamma

  #step7
  divar = np.sum(dxhat*xmu, axis=0)
  dxmu1 = dxhat * ivar
  #step6
  dsqrtvar = -1. /(sqrtvar**2) * divar
  #step5
  dvar = 0.5 * 1. /np.sqrt(var+eps) * dsqrtvar
  #step4
  dsq = 1. /N * np.ones((N,D)) * dvar
  #step3
  dxmu2 = 2 * xmu * dsq
  #step2
  dx1 = (dxmu1 + dxmu2)
  dmu = -1 * np.sum(dxmu1+dxmu2, axis=0)
  #step1
  dx2 = 1. /N * np.ones((N,D)) * dmu
  #step0
  dx = dx1 + dx2
  return dx, dgamma, dbeta

In [None]:

# H gradients
D_H = grads['enc_Ni_out'].numpy().reshape(X.shape[1],e_dim)

# FC-2 Layer
D_Z_ff_2 = D_H
dw2= np.dot(D_Z_ff_2.transpose(), Z_ff_1_relu)
db2= np.sum(D_Z_ff_2,axis=0)

# Relu
D_Z_ff_1_relu = Relu(np.dot(D_Z_ff_2 , w2.transpose()))

#FC-1 Layer
D_Z_ff_1 = D_Z_ff_1_relu
dw1= np.dot(D_Z_ff_1.transpose(), Zoo)
db1= np.sum(D_Z_ff_1, axis=0)

# Normalize Gradients
D_Zoo= np.dot(D_Z_ff_1, w1.transpose()) 

#gradients for Normalization 
D_Zo= (D_Zoo)

# Attention output layer
D_Z_attn_out= D_Zo
dwo= np.dot(D_Z_attn_out.transpose(), ZZ)
dbo= np.sum(D_Z_attn_out, axis=0)  

# Gradient of combination
D_ZZ= np.dot(D_Z_attn_out, Wo)

# Score computation
D_Z_out= D_ZZ
D_Z= np.dot(D_Z_out.transpose(),V)
D_V= np.dot(D_Z_out.transpose(),Z)   # *********

# Softmax backprop
D_Z_x= np.diag(Z) - np.dot(Z,Z.transpose()) #***********

# Query and Key
D_Q=  np.dot(D_Z_x, K)/np.sqrt(e_dim)
D_K=  np.dot(D_Z_x.transpose(),Q)/np.sqrt(e_dim)

# Gradient for V
D_Wv= np.dot(D_V, X_norm)
D_bv= np.sum(D_V,axis=0)
D_norm_x_v= np.dot(D_V.transpose(), Wv)

# Gradient for Q
D_Wq= np.dot(D_Q.transpose(), X_norm)
D_bq= np.sum(D_Q,axis=0)
D_norm_x_q= np.dot(D_Q, Wq)

# Gradient for K
D_Wk= np.dot(D_K.transpose(), X_norm)
D_bk= np.sum(D_K,axis=0)
D_norm_x_k= np.dot(D_K, Wk)

#**************If we want to update the embeddings **********************
# SUm of all D_norm_X
D_norm = D_norm_x_k + D_norm_x_q + D_norm_x_v

In [None]:
np.sum(D_Z_ff_2,axis=0)

In [None]:
D_Wk

In [None]:
results['x'].detach().numpy()

In [None]:
alpha=model.encoder.layers[0].norm_1.alpha.detach().numpy()
bias=model.encoder.layers[0].norm_1.bias.detach().numpy()
eps=model.encoder.layers[0].norm_1.eps

In [None]:
out,cache=batchnorm_forward(results['x'].detach().numpy().reshape(9,2),alpha,bias,eps)

In [None]:
out

In [None]:
a=results['x2'].detach().numpy()

In [None]:
a

In [None]:
a= torch.tensor([[ 0.0172,  1.0966],
         [ 0.9420,  0.6059],
         [ 1.0191, -0.4586],
         [ 0.1643, -1.1157],
         [-0.8335, -0.7352],
         [-1.0699,  0.3336],
         [-0.0000,  0.0000],
         [ 0.7393,  0.8166],
         [ 1.0900, -0.1723],
         [ 0.4438, -1.0032],
         [-0.5981, -0.9284],
         [-1.1156,  0.0234],
         [-0.6117,  0.0000],
         [ 0.4853,  1.0180]])


In [None]:
[-1.0000,  1.0000],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [-1.0000,  1.0000],
         [-1.0000,  1.0000],
         [ 0.0000,  0.0000],
         [-1.0000,  1.0000],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [-1.0000,  1.0000],
         [-1.0000,  1.0000],
         [-1.0000,  1.0000]]], grad_fn=<NativeLayerNormBackward>)


In [None]:
w = [1,1]
b= [0,0]

In [None]:
model.encoder.layers[0].norm_1(a)

In [None]:
a