In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import warnings
warnings.filterwarnings('ignore')
from Config import conf
from transformers import T5Tokenizer
from torchtext.nn.modules.multiheadattention import ScaledDotProduct

In [2]:
config = conf()
h = config.h
N = config.N
dmodel = config.dmodel
dk= config.dk
dv = config.dv
dff = config.dff
tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_path)
max_length = config.max_length
vocab_size = config.vocab

In [3]:
sentence1input = 'I love dog'
sentence2input = 'I love cat'
sentence1output = 'dog'
sentence2output = 'cat'

In [4]:
batch_input = tokenizer.batch_encode_plus([sentence1input,sentence2input],
                                          max_length= max_length,
                                          truncation=True,
                                          return_tensors='pt'
                                         )
batch_output = tokenizer.batch_encode_plus([sentence1output,sentence2output],max_length= max_length, truncation=True)

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        self.embedded_layer = nn.Embedding(vocab_size,dmodel)
    def forward(self, x):
        x = self.embedded_layer(x)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
class SingleAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,cuda_number='cuda:0'):
        super(SingleAttentionHead,self).__init__()
        self.proj_key = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_query = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_value  = nn.Linear(dmodel,dv).to(cuda_number)
        self.dk = dk
        self.cuda_number = cuda_number
        
    def forward(self,x):
        x = x.to(self.cuda_number)
        k = self.proj_key(x)
        q = self.proj_query(x)
        v = self.proj_value(x)
        head = torch.matmul(F.softmax(torch.matmul(q,k.transpose(-2,-1))/(self.dk**0.5)),v)
        if self.cuda_number != 'cuda:0':
            return head.to('cuda:0')
        return head

In [7]:
class MultiAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,maximum_sequence_length):
        super(MultiAttentionHead, self).__init__()
        
        nlayers_GPU_0 = 4
        nlayers_GPU_1 = 4
        
        self.head_GPU0 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:0') for i in range(nlayers_GPU_0)
        ])
        
        self.head_GPU1 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:1') for i in range(nlayers_GPU_1)
        ])
        #Weight_0 layer:
        self.W0 = nn.Linear(dmodel,dmodel).to('cuda:0')   #Size h*dv x dmodel. But since dv = dk and dk x h = dv so it's a dmodel x dmodel layer -> cuda:0
        #LayerNormalisation
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')
        self.dropout = nn.Dropout(0.1).to('cuda:0')
    
    def forward(self,x):
        multi_attention_heads = 'Empty'
        for i, l in enumerate(self.head_GPU0):
            if i == 0:
                multi_attention_heads = l(x)
            else:
                multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        for i, l in enumerate(self.head_GPU1):
            multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        multi_attention_heads = self.W0(multi_attention_heads) 
        multi_attention_heads = self.Add_and_Nom(x + multi_attention_heads)  #cuda:0
        multi_attention_heads = self.dropout(multi_attention_heads)
        return multi_attention_heads

In [8]:
class EncoderStack(nn.Module):
    def __init__(self,dmodel,dk,dv,maximum_sequence_length,vocab_size):
        super(EncoderStack, self).__init__()
        
        self.multiAttentionHeads = MultiAttentionHead(dmodel,dk,dv,max_length)
        self.lin1a = nn.Linear(dmodel,dff).to('cuda:0')
        self.dropout1 = nn.Dropout(0.1).to('cuda:0')
        self.lin1b = nn.Linear(dff,dmodel).to('cuda:0')
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')
        
    def forward(self,x):
        x = self.multiAttentionHeads(x)
        sublayer_x = self.lin1a(x)
        sublayer_x = F.relu(sublayer_x)
        sublayer_x = self.dropout1(sublayer_x)
        sublayer_x = self.lin1b(sublayer_x)
        sublayer_x = self.Add_and_Nom(x + sublayer_x)
        return sublayer_x

In [9]:
class EncoderTransformerStacks(nn.Module):
    def __init__(self,dmodel,dk,dv,maximum_sequence_length,vocab_size):
        super(EncoderTransformerStacks, self).__init__()
        self.encoderStack = nn.ModuleList([
            EncoderStack(dmodel,dk,dv,max_length,vocab_size) for i in range(6)
        ])

    def forward(self,x):
        for i, l in enumerate(self.encoderStack):
            x = l(x)
        return x

In [10]:
class EncoderTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,maximum_sequence_length,vocab_size):
        super(EncoderTransformer, self).__init__()
        self.positionEncoder = PositionalEncoding(dmodel,0.1, vocab_size).to('cuda:0')
        self.encoder_Stacks = EncoderTransformerStacks(dmodel,dk,dv,max_length,vocab_size)
        
    def forward(self,x):
        x = self.positionEncoder(x)
        x = self.encoder_Stacks(x)
        return x

In [26]:
input_ids = batch_input['input_ids'].to('cuda:0')
print(input_ids.size())

torch.Size([2, 4])


In [13]:
Encode = EncoderTransformer(dmodel,dk,dv,max_length,vocab_size)

In [25]:
out = Encode(input_ids)
print(out.size())
print(out)

tensor([[[-1.7142,  2.1285,  0.2945,  ..., -1.2868, -1.1650,  0.3148],
         [ 0.0378,  0.7889, -0.3209,  ..., -1.3238, -0.4892, -0.2263],
         [ 0.0963,  1.6697,  0.0065,  ...,  0.5371, -0.7448, -0.1628],
         [-0.2938,  1.8050,  0.6272,  ...,  0.9106, -1.9961, -0.1122]],

        [[-0.7685, -0.1708, -0.3502,  ..., -1.0492, -1.3100, -0.0034],
         [-0.9087,  0.6610, -0.1741,  ..., -1.9858,  0.8593,  0.3598],
         [-1.1512,  1.4525, -0.5505,  ...,  0.0667, -0.1288,  0.5303],
         [ 0.1984,  1.0201, -0.4247,  ..., -0.9076, -0.4552, -0.3095]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
torch.Size([2, 4, 512])


In [20]:
GPU_0_Memory_Allocated = torch.cuda.memory_reserved(0)
GPU_1_Memory_Allocated = torch.cuda.memory_reserved(1)

In [23]:
print('Memory used in GPU:0', round((GPU_0_Memory_Allocated)/100000000,2),'GB')
print('Memory used in GPU:1', round((GPU_1_Memory_Allocated)/10000000,2),'GB')

Memory used in GPU:0 1.66 GB
Memory used in GPU:1 1.05 GB


In [24]:
# GPU:0 was using 400MB before so both GPUs are actually sharing the same model size