In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import warnings
warnings.filterwarnings('ignore')
from Config import conf
from transformers import T5Tokenizer
from torchtext.nn.modules.multiheadattention import ScaledDotProduct

In [2]:
config = conf()
h = config.h
N = config.N
dmodel = config.dmodel
dk= config.dk
dv = config.dv
dff = config.dff
tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_path)
max_length = config.max_length
vocab_size = config.vocab

In [3]:
sentence1input = 'I love dog'
sentence2input = 'I love cat'
sentence1output = 'I am a man'
sentence2output = 'I am a women'

In [4]:
batch_input = tokenizer.batch_encode_plus([sentence1input,sentence2input],
                                          max_length= max_length,
                                          truncation=True,
                                          return_tensors='pt'
                                         )
batch_output = tokenizer.batch_encode_plus([sentence1output,sentence2output],
                                           max_length= max_length, 
                                           truncation=True,
                                           return_tensors='pt')

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len, vocab_size):
        super(PositionalEncoding, self).__init__()
        self.embedded_layer = nn.Embedding(vocab_size,d_model)
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = self.embedded_layer(x)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
class SingleAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,cuda_number='cuda:0'):
        super(SingleAttentionHead,self).__init__()
        self.proj_key = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_query = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_value  = nn.Linear(dmodel,dv).to(cuda_number)
        self.dk = dk
        self.cuda_number = cuda_number
        
    def forward(self,x):
        x = x.to(self.cuda_number)
        k = self.proj_key(x)
        q = self.proj_query(x)
        v = self.proj_value(x)
        head = torch.matmul(F.softmax(torch.matmul(q,k.transpose(-2,-1))/(self.dk**0.5)),v)
        if self.cuda_number != 'cuda:0':
            return head.to('cuda:0')
        return head

In [31]:
class MultiAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length):
        super(MultiAttentionHead, self).__init__()
        
        nlayers_GPU_0 = int(h/2)
        nlayers_GPU_1 = int(h/2)
        
        self.head_GPU0 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:0') for i in range(nlayers_GPU_0)
        ])
        
        self.head_GPU1 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:1') for i in range(nlayers_GPU_1)
        ])
        #Weight_0 layer:
        self.W0 = nn.Linear(dmodel,dmodel).to('cuda:0')   #Size h*dv x dmodel. But since dv = dk and dk x h = dv so it's a dmodel x dmodel layer -> cuda:0
        #LayerNormalisation
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')
        self.dropout = nn.Dropout(0.1).to('cuda:0')
    
    def forward(self,x):
        multi_attention_heads = 'Empty'
        for i, l in enumerate(self.head_GPU0):
            if i == 0:
                multi_attention_heads = l(x)
            else:
                multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        for i, l in enumerate(self.head_GPU1):
            multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        multi_attention_heads = self.W0(multi_attention_heads) 
        multi_attention_heads = self.Add_and_Nom(x + multi_attention_heads)  #cuda:0
        multi_attention_heads = self.dropout(multi_attention_heads)
        return multi_attention_heads

In [32]:
class EncoderStack(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(EncoderStack, self).__init__()
        self.multiAttentionHeads = MultiAttentionHead(dmodel,dk,dv,max_length)
        self.lin1a = nn.Linear(dmodel,dff).to('cuda:0')
        self.dropout1 = nn.Dropout(0.1).to('cuda:0')
        self.lin1b = nn.Linear(dff,dmodel).to('cuda:0')
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')

    def forward(self,x):
        x = self.multiAttentionHeads(x)
        sublayer_x = self.lin1a(x)
        sublayer_x = F.relu(sublayer_x)
        sublayer_x = self.dropout1(sublayer_x)
        sublayer_x = self.lin1b(sublayer_x)
        sublayer_x = self.Add_and_Nom(x + sublayer_x)
        return sublayer_x

In [33]:
class EncoderTransformerStacks(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(EncoderTransformerStacks, self).__init__()
        self.encoderStack = nn.ModuleList([
            EncoderStack(dmodel,dk,dv,max_length,vocab_size) for i in range(6)
        ])

    def forward(self,x):
        for i, l in enumerate(self.encoderStack):
            x = l(x)
        return x

In [34]:
class EncoderTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(EncoderTransformer, self).__init__()
        self.positionEncoder = PositionalEncoding(dmodel,0.1, max_length,vocab_size).to('cuda:0')
        self.encoder_Stacks = EncoderTransformerStacks(dmodel,dk,dv,max_length,vocab_size)
        
    def forward(self,x):
        x = self.positionEncoder(x)
        x = self.encoder_Stacks(x)
        return x

In [35]:
input_ids = batch_input['input_ids'].to('cuda:0')

torch.Size([2, 4])


In [36]:
Encode = EncoderTransformer(dmodel,dk,dv,max_length,vocab_size)

In [37]:
out = Encode(input_ids)

torch.Size([2, 4, 4])
tensor([[[ 1.5872,  0.1189, -0.7779, -0.9281],
         [ 0.0652, -1.2639,  1.5158, -0.3172],
         [-1.5286,  1.0462, -0.2345,  0.7169],
         [ 1.7295, -0.4886, -0.6089, -0.6320]],

        [[-1.1638,  1.5459,  0.1112, -0.4934],
         [ 1.6073, -0.1452, -1.1350, -0.3271],
         [-1.5900,  0.8782,  0.8288, -0.1169],
         [ 0.9621, -0.1697,  0.7724, -1.5649]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)


In [38]:
GPU_0_Memory_Allocated = torch.cuda.memory_reserved(0)
GPU_1_Memory_Allocated = torch.cuda.memory_reserved(1)

In [39]:
print('Memory used in GPU:0', round((GPU_0_Memory_Allocated)/100000000,2),'GB')
print('Memory used in GPU:1', round((GPU_1_Memory_Allocated)/10000000,2),'GB')

Memory used in GPU:0 0.06 GB
Memory used in GPU:1 0.21 GB


In [40]:
#Encode.parameters

In [41]:
batch_output_ids = batch_output['input_ids']

In [42]:
batch_output_attention_mask = batch_output['attention_mask']

In [43]:
batch_output_ids

tensor([[ 27, 183,   3,   9, 388,   1],
        [ 27, 183,   3,   9, 887,   1]])

In [44]:
positional_embedding = PositionalEncoding(dmodel,0.1,max_length,vocab_size)

In [45]:
pre_decoder_inputs = positional_embedding(batch_output_ids)

In [46]:
k_project = nn.Linear(dmodel,dk)
q_project = nn.Linear(dmodel,dk)
v_project = nn.Linear(dmodel,dk)

In [47]:
for i in range(pre_decoder_inputs.size(0)):
    decode_output = pre_decoder_inputs[i]

In [48]:
decode_output.size()

torch.Size([6, 4])

In [49]:
pre_decoder_inputs[0]

tensor([[-1.2431,  1.2010, -0.2413,  0.7864],
        [ 0.9482,  1.4963,  0.0165,  0.0000],
        [ 1.7246,  2.5453,  2.0686,  0.1869],
        [ 0.7949,  2.3445,  0.3388, -0.1423],
        [-1.3277,  0.0000, -0.9949,  0.9949],
        [-1.4374,  1.2929, -1.1703,  1.7299]], grad_fn=<SelectBackward>)

In [54]:
k = k_project(pre_decoder_inputs[0])
q = q_project(pre_decoder_inputs[0])
v = v_project(pre_decoder_inputs[0])
print('k1:',k)
print('')
print('q1',q)
print('')
print('v1',v)
print('')

k1: tensor([[-0.9401],
        [ 0.2234],
        [ 1.0020],
        [ 0.0553],
        [-0.9540],
        [-1.6868]], grad_fn=<AddmmBackward>)

q1 tensor([[-0.0353],
        [-0.6907],
        [-0.8605],
        [-0.4546],
        [-0.2535],
        [-0.2723]], grad_fn=<AddmmBackward>)

v1 tensor([[-0.0813],
        [-0.5412],
        [-1.0388],
        [-0.5946],
        [ 0.0930],
        [-0.0417]], grad_fn=<AddmmBackward>)



In [62]:
k_QT = torch.matmul(k,q.transpose(0,1))
k_QT

tensor([[ 0.0331,  0.6493,  0.8090,  0.4274,  0.2383,  0.2560],
        [-0.0079, -0.1543, -0.1923, -0.1016, -0.0566, -0.0608],
        [-0.0353, -0.6921, -0.8622, -0.4556, -0.2540, -0.2729],
        [-0.0019, -0.0382, -0.0476, -0.0251, -0.0140, -0.0151],
        [ 0.0336,  0.6589,  0.8209,  0.4337,  0.2418,  0.2598],
        [ 0.0595,  1.1651,  1.4515,  0.7669,  0.4276,  0.4593]],
       grad_fn=<MmBackward>)

In [63]:
q_size = q.size(0)
mask = (torch.triu(torch.ones(q_size, q_size)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask

tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])

In [64]:
masked = k_QT + mask
masked

tensor([[ 0.0331,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.0079, -0.1543,    -inf,    -inf,    -inf,    -inf],
        [-0.0353, -0.6921, -0.8622,    -inf,    -inf,    -inf],
        [-0.0019, -0.0382, -0.0476, -0.0251,    -inf,    -inf],
        [ 0.0336,  0.6589,  0.8209,  0.4337,  0.2418,    -inf],
        [ 0.0595,  1.1651,  1.4515,  0.7669,  0.4276,  0.4593]],
       grad_fn=<AddBackward0>)

In [None]:
q_transpose

In [59]:
mask

tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])

In [61]:
masked

tensor([[ 0.0331,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.0079, -0.1543,    -inf,    -inf,    -inf,    -inf],
        [-0.0353, -0.6921, -0.8622,    -inf,    -inf,    -inf],
        [-0.0019, -0.0382, -0.0476, -0.0251,    -inf,    -inf],
        [ 0.0336,  0.6589,  0.8209,  0.4337,  0.2418,    -inf],
        [ 0.0595,  1.1651,  1.4515,  0.7669,  0.4276,  0.4593]],
       grad_fn=<AddBackward0>)