In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import warnings
warnings.filterwarnings('ignore')
from Config import conf
from transformers import T5Tokenizer
from torchtext.nn.modules.multiheadattention import ScaledDotProduct

In [2]:
config = conf()
h = config.h
N = config.N
dmodel = config.dmodel
dk= config.dk
dv = config.dv
dff = config.dff
tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_path)
max_length = config.max_length
vocab_size = config.vocab

In [3]:
sentence1input = 'I love dog'
sentence2input = 'I love cat'
sentence3input = 'I love money'
sentence4input = 'I love overtime'

In [4]:
encoder_batch_input = tokenizer.batch_encode_plus([sentence1input,sentence2input,sentence3input,sentence4input],
                                          max_length= max_length,
                                          truncation=True,
                                          return_tensors='pt'
                                         )

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len, vocab_size):
        super(PositionalEncoding, self).__init__()
        self.embedded_layer = nn.Embedding(vocab_size,d_model)
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = self.embedded_layer(x)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
class SingleAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,cuda_number='cuda:0'):
        super(SingleAttentionHead,self).__init__()
        self.proj_key = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_query = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_value  = nn.Linear(dmodel,dv).to(cuda_number)
        self.dk = dk
        self.cuda_number = cuda_number
        
    def forward(self,x):
        x = x.to(self.cuda_number)
        k = self.proj_key(x)
        q = self.proj_query(x)
        v = self.proj_value(x)
        head = torch.matmul(F.softmax(torch.matmul(q,k.transpose(-2,-1))/(self.dk**0.5)),v)
        if self.cuda_number != 'cuda:0':
            return head.to('cuda:0')
        return head

In [7]:
class MultiAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv):
        super(MultiAttentionHead, self).__init__()
        
        nlayers_GPU_0 = int(h/2)
        nlayers_GPU_1 = int(h/2)
        
        self.head_GPU0 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:0') for i in range(nlayers_GPU_0)
        ])
        
        self.head_GPU1 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,'cuda:1') for i in range(nlayers_GPU_1)
        ])
        #Weight_0 layer:
        self.W0 = nn.Linear(dmodel,dmodel).to('cuda:0')   #Size h*dv x dmodel. But since dv = dk and dk x h = dv so it's a dmodel x dmodel layer -> cuda:0
        #LayerNormalisation
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')
        self.dropout = nn.Dropout(0.1).to('cuda:0')
    
    def forward(self,x):
        multi_attention_heads = 'Empty'
        for i, l in enumerate(self.head_GPU0):
            if i == 0:
                multi_attention_heads = l(x)
            else:
                multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        for i, l in enumerate(self.head_GPU1):
            multi_attention_heads = torch.cat((multi_attention_heads,l(x)), dim=2)
        multi_attention_heads = self.W0(multi_attention_heads) 
        multi_attention_heads = self.Add_and_Nom(x + multi_attention_heads)  #cuda:0
        multi_attention_heads = self.dropout(multi_attention_heads)
        return multi_attention_heads

In [8]:
class EncoderStack(nn.Module):
    def __init__(self,dmodel,dk,dv):
        super(EncoderStack, self).__init__()
        self.multiAttentionHeads = MultiAttentionHead(dmodel,dk,dv)
        self.lin1a = nn.Linear(dmodel,dff).to('cuda:0')
        self.dropout1 = nn.Dropout(0.1).to('cuda:0')
        self.lin1b = nn.Linear(dff,dmodel).to('cuda:0')
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')

    def forward(self,x):
        x = self.multiAttentionHeads(x)
        sublayer_x = self.lin1a(x)
        sublayer_x = F.relu(sublayer_x)
        sublayer_x = self.dropout1(sublayer_x)
        sublayer_x = self.lin1b(sublayer_x)
        sublayer_x = self.Add_and_Nom(x + sublayer_x)
        return sublayer_x

In [9]:
class EncoderTransformerStacks(nn.Module):
    def __init__(self,dmodel,dk,dv):
        super(EncoderTransformerStacks, self).__init__()
        self.encoderStack = nn.ModuleList([
            EncoderStack(dmodel,dk,dv) for i in range(6)
        ])

    def forward(self,x):
        for i, l in enumerate(self.encoderStack):
            x = l(x)
        return x

In [10]:
class EncoderTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(EncoderTransformer, self).__init__()
        self.positionEncoder = PositionalEncoding(dmodel,0.1, max_length,vocab_size).to('cuda:0')
        self.encoder_Stacks = EncoderTransformerStacks(dmodel,dk,dv)
        
    def forward(self,x):
        x = self.positionEncoder(x)
        x = self.encoder_Stacks(x)
        return x

### Decoder Part

In [11]:
def create_mask(input_ids):
    ms =output1d_.size(1)
    input_ids = torch.cat(ms*[input_ids],dim=0).view(ms,-1)
    return torch.flip(torch.triu(input_ids), [0, 1])

In [12]:
class MaskedSingleAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,cuda_number='cuda:0'):
        super(MaskedSingleAttentionHead,self).__init__()
        self.proj_key = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_query = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_value  = nn.Linear(dmodel,dv).to(cuda_number)
        self.dk = dk
        self.cuda_number = cuda_number
        
    def forward(self,x):
        x = x.to(self.cuda_number)
        k = self.proj_key(x)
        q = self.proj_query(x)
        v = self.proj_value(x)
        q_size = q.size(1)
        mask = (torch.triu(torch.ones(q_size, q_size)) == 1).transpose(-2, -1).to(self.cuda_number)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        k_QT = torch.matmul(k,q.transpose(-2,-1))
        masked = k_QT + mask
        head = torch.matmul(F.softmax(masked/(dk**0.5)),v)
        if self.cuda_number != 'cuda:0':
            return head.to('cuda:0')
        return head

In [13]:
decoder_batch_input = tokenizer.batch_encode_plus(['I love a man','I love a woman','I am not a bi','I am only joking'],
                                           max_length= max_length,
                                           padding=True,
                                           truncation=True,
                                           return_tensors='pt')

In [14]:
inputs = decoder_batch_input['input_ids']
inputs = inputs.to('cuda:0')

In [15]:
def generate_attention_head_for_each_sequence(input_):
    pos = PositionalEncoding(dmodel, 0.1, max_length, vocab_size).to('cuda:0')
    masked_input = pos(inputs)
    attentionhead1 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:0')
    attentionhead2 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:0')
    attentionhead3 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:0')
    attentionhead4 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:0')
    attentionhead5 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:1')
    attentionhead6 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:1')
    attentionhead7 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:1')
    attentionhead8 = MaskedSingleAttentionHead(dmodel,dk,dv,cuda_number='cuda:1')
    head1 = attentionhead1(masked_input)
    head2 = attentionhead2(masked_input)
    head3 = attentionhead3(masked_input)
    head4 = attentionhead4(masked_input)
    head5 = attentionhead5(masked_input)
    head6 = attentionhead6(masked_input)
    head7 = attentionhead7(masked_input)
    head8 = attentionhead8(masked_input)
    head1 = head1[:,max_length - 1,]
    head2 = head2[:,max_length - 1,]
    head3 = head3[:,max_length - 1,]
    head4 = head4[:,max_length - 1,]
    head5 = head5[:,max_length - 1,]
    head6 = head6[:,max_length - 1,]
    head7 = head7[:,max_length - 1,]
    head8 = head8[:,max_length - 1,]
    heads = torch.cat((head1,head2,head3,head4,head5,head6,head7,head8),dim=-1)
    return heads

#### Encoder outputs

In [16]:
encoder_inputs = encoder_batch_input['input_ids'].to('cuda:0')
Encoder = EncoderTransformer(dmodel,dk,dv,max_length,vocab_size)
Encoder_output = Encoder(encoder_inputs)
Encoder_output.size()

torch.Size([4, 4, 512])

#### Decoder masked multihead  outputs -->Not sure it's correct yet!!! -> One error is all you need!!!!

In [21]:
#There are 4 sequences in this batch
sequence_1 = generate_attention_head_for_each_sequence(inputs[0])
sequence_2 = generate_attention_head_for_each_sequence(inputs[1])
sequence_3 = generate_attention_head_for_each_sequence(inputs[2])
sequence_4 = generate_attention_head_for_each_sequence(inputs[3])

masked_multihead_attention_batch_out = torch.cat((sequence_1,sequence_2,sequence_3,sequence_4)).view(inputs.size(0),-1,dmodel)
masked_multihead_attention_batch_out.size()

torch.Size([4, 4, 512])