In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import warnings
warnings.filterwarnings('ignore')
from Config import conf
from transformers import T5Tokenizer
from torchtext.nn.modules.multiheadattention import ScaledDotProduct

In [2]:
config = conf()
h = config.h
N = config.N
dmodel = config.dmodel
dk= config.dk
dv = config.dv
dff = config.dff
tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_path)
max_length = config.max_length
vocab_size = config.vocab

In [3]:
sentence1input = 'I love dog'
sentence2input = 'I love cat'
sentence3input = 'I love money'
sentence4input = 'I love overtime'

decoder1input_ = 'dog meat is delicious'
sentence2input_ = 'cat meat is bad '
sentence3input_ = 'I can buy dogs'
sentence4input_ = 'I can buy cats'

In [4]:
encoder_inputs = tokenizer.batch_encode_plus([sentence1input,sentence2input,sentence3input,sentence4input],
                                          max_length= max_length,
                                          pad_to_max_length = True,
                                          truncation=True,
                                          return_tensors='pt'
                                         )
encoder_inputs = encoder_inputs['input_ids'].to('cuda:0')


decoder_inputs = tokenizer.batch_encode_plus([sentence1input,sentence2input,sentence3input,sentence4input],
                                          max_length= max_length,
                                          pad_to_max_length = True,
                                          truncation=True,
                                          return_tensors='pt'
                                         )
decoder_inputs = decoder_inputs['input_ids'].to('cuda:0')

In [5]:
def create_mask(sequence_length,cuda_number):
    mask = (torch.triu(torch.ones(sequence_length, sequence_length)) == 1).transpose(-2, -1).to(cuda_number)
    mask = mask.int().masked_fill(mask == 0, 0)
    return mask

#example:
example_mask = create_mask(4,'cuda:0')
print(example_mask)

tensor([[1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 1]], device='cuda:0', dtype=torch.int32)


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len, vocab_size):
        super(PositionalEncoding, self).__init__()
        self.embedded_layer = nn.Embedding(vocab_size,d_model)
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = self.embedded_layer(x)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [7]:
class SingleAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,cuda_number='cuda:0', applyMask = False):
        super(SingleAttentionHead,self).__init__()
        self.proj_key = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_query = nn.Linear(dmodel,dk).to(cuda_number)
        self.proj_value  = nn.Linear(dmodel,dv).to(cuda_number)
        self.dk = dk
        self.cuda_number = cuda_number
        self.max_length = max_length
        self.applyMask = applyMask
        
    def forward(self,x,y=None):
        x = x.to(self.cuda_number)
        k = self.proj_key(x)
        if y == None: #If you dont supply a y value value then this is the self attended layer
            q = self.proj_query(x)
            v = self.proj_value(x)
            
        if y != None:  # If you need a mask then this is the encoder-decoder attention layer
            y = y.to(self.cuda_number)
            q = self.proj_query(y)  #y is encoder output, you get the query from the encoder
            v = self.proj_value(y)  #y is the encoder output, you get the key from the encoder
        
        I = torch.einsum('b i d , b j d -> b i j', q, k)
        
        if self.applyMask and y == None: #If you need a mask then this is the decoder-self attended layer
            mask = create_mask(self.max_length,self.cuda_number)
            for i in range(len(I)):
                I[i].masked_fill_(mask==0,float('-inf'))
        
        attention = F.softmax(I/(self.dk**0.5), dim=-1)
        
        head = torch.einsum('b i j , b j d -> b i d', attention, v)
        
        if self.cuda_number != 'cuda:0':
            return head.to('cuda:0')
        return head

In [8]:
class MultiAttentionHead(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,applyMask = False):
        super(MultiAttentionHead, self).__init__()
        
        nlayers_GPU_0 = int(h/2)
        nlayers_GPU_1 = int(h/2)
        
        self.head_GPU0 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,max_length,'cuda:0',applyMask) for i in range(nlayers_GPU_0)
        ])
        
        self.head_GPU1 = nn.ModuleList([
            SingleAttentionHead(dmodel,dk,dv,max_length,'cuda:1',applyMask) for i in range(nlayers_GPU_1)
        ])
        #Weight_0 layer:
        self.W0 = nn.Linear(dmodel,dmodel).to('cuda:0')   #Size h*dv x dmodel. But since dv = dk and dk x h = dv so it's a dmodel x dmodel layer -> cuda:0
        #LayerNormalisation
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')
        self.dropout = nn.Dropout(0.1).to('cuda:0')
    
    def forward(self,x,y=None):
        multi_attention_heads = 'Empty'
        for i, l in enumerate(self.head_GPU0):
            if i == 0:
                multi_attention_heads = l(x,y)
            else:
                multi_attention_heads = torch.cat((multi_attention_heads,l(x,y)), dim=2)
        for i, l in enumerate(self.head_GPU1):
            multi_attention_heads = torch.cat((multi_attention_heads,l(x,y)), dim=2)
        multi_attention_heads = self.W0(multi_attention_heads) 
        multi_attention_heads = self.Add_and_Nom(x + multi_attention_heads)  #cuda:0
        multi_attention_heads = self.dropout(multi_attention_heads)
        return multi_attention_heads

In [9]:
class EncoderStack(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length):
        super(EncoderStack, self).__init__()
        self.multiAttentionHeads = MultiAttentionHead(dmodel,dk,dv,max_length,False)
        self.lin1a = nn.Linear(dmodel,dff).to('cuda:0')
        self.dropout1 = nn.Dropout(0.1).to('cuda:0')
        self.lin1b = nn.Linear(dff,dmodel).to('cuda:0')
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')

    def forward(self,x):
        x = self.multiAttentionHeads(x)
        sublayer_x = self.lin1a(x)
        sublayer_x = F.relu(sublayer_x)
        sublayer_x = self.dropout1(sublayer_x)
        sublayer_x = self.lin1b(sublayer_x)
        sublayer_x = self.Add_and_Nom(x + sublayer_x)
        return sublayer_x

In [10]:
class DecoderStack(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length):
        super(DecoderStack, self).__init__()
        self.masked_multi_head_attention = MultiAttentionHead(dmodel,dk,dv,max_length,True)
        self.multi_head_attention = MultiAttentionHead(dmodel,dk,dv,max_length,False)
        self.lin1a = nn.Linear(dmodel,dff).to('cuda:0')
        self.dropout1 = nn.Dropout(0.1).to('cuda:0')
        self.lin1b = nn.Linear(dff,dmodel).to('cuda:0')
        self.Add_and_Nom = nn.LayerNorm(dmodel, eps=1e-05, elementwise_affine=True).to('cuda:0')

    def forward(self,x,y=None):
        z = self.masked_multi_head_attention(x)
        z = self.multi_head_attention(x,y)
        sublayer_z = self.lin1a(z)
        sublayer_z = F.relu(sublayer_z)
        sublayer_z = self.dropout1(sublayer_z)
        sublayer_z = self.lin1b(sublayer_z)
        sublayer_z = self.Add_and_Nom(z + sublayer_z)
        return sublayer_z

In [11]:
class EncoderTransformerStacks(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length):
        super(EncoderTransformerStacks, self).__init__()
        self.encoderStack = nn.ModuleList([
            EncoderStack(dmodel,dk,dv,max_length) for i in range(6)
        ])

    def forward(self,x):
        for i, l in enumerate(self.encoderStack):
            x = l(x)
        return x

In [12]:
class DecoderTransformerStacks(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length):
        super(DecoderTransformerStacks, self).__init__()
        self.dencoderStack = nn.ModuleList([
            DecoderStack(dmodel,dk,dv,max_length) for i in range(6)
        ])

    def forward(self,d,e):
        for i, l in enumerate(self.dencoderStack):
            x = l(d,e)
        return x

In [13]:
class EncoderTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(EncoderTransformer, self).__init__()
        self.positionEncoder = PositionalEncoding(dmodel,0.1, max_length,vocab_size).to('cuda:0')
        self.encoder_Stacks = EncoderTransformerStacks(dmodel,dk,dv,max_length)
        
    def forward(self,x):
        x = self.positionEncoder(x)
        x = self.encoder_Stacks(x)
        return x

In [14]:
class DecoderTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(DecoderTransformer, self).__init__()
        self.positionEncoder = PositionalEncoding(dmodel,0.1, max_length,vocab_size).to('cuda:0')
        self.decoder_Stacks = DecoderTransformerStacks(dmodel,dk,dv,max_length)
        
    def forward(self,d,e):
        d = self.positionEncoder(d)
        x = self.decoder_Stacks(d,e)
        return x

In [15]:
def create_decoder_batch_sequence(decoder_inputs):
    mask = create_mask(decoder_inputs.size(1),'cuda:0')
    new_sequence = 'empty'
    for i in range(len(decoder_inputs)):
        decoder_sequence = torch.cat(max_length*[decoder_inputs[i]]).reshape(max_length,-1)
        decoder_sequence = decoder_sequence.masked_fill_(mask==0,0)
        if i == 0:
            new_sequence = decoder_sequence
        else:
            new_sequence = torch.cat((new_sequence,decoder_sequence),dim=0)
    return new_sequence.view(decoder_inputs.size(0),decoder_inputs.size(1),-1).permute(1,0,2)

In [16]:
def create_correct_output(decoder_inputs):
    correct_outputs = 'empty'
    for i in range(decoder_inputs.size(1)):
        if i < decoder_inputs.size(1) - 1:
            output = decoder_inputs[:,i+1]
            if i == 0:
                correct_outputs = output
            else:
                correct_outputs = torch.cat((correct_outputs,output),dim=0)
    return correct_outputs.reshape(-1,decoder_inputs.size(0)).transpose(-2,0)

In [17]:
def generate_decoder_input_sequence_per_encoder_batch(decoder_inputs):
    input_sequence = create_decoder_batch_sequence(decoder_inputs)
    next_tokens = create_correct_output(decoder_inputs)
    return {'input_ids': input_sequence,
            'output_ids':next_tokens}

In [18]:
class CustomTransformer(nn.Module):
    def __init__(self,dmodel,dk,dv,max_length,vocab_size):
        super(CustomTransformer,self).__init__()
        self.encoder = EncoderTransformer(dmodel,dk,dv,max_length,vocab_size)
        self.decoder = DecoderTransformer(dmodel,dk,dv,max_length,vocab_size)
        self.linear = nn.Linear(dmodel,vocab_size).to('cuda:0')
    
    def forward(self,e,d):
        e = self.encoder(e)
        d = self.decoder(d,e)
        d = self.linear(d)
        d = d.mean(dim=1)
        return F.log_softmax(d,dim=1)

In [19]:
customTransformer = CustomTransformer(dmodel,dk,dv,max_length,vocab_size)
Criterion = nn.CrossEntropyLoss()
Optimiser = torch.optim.Adam(customTransformer.parameters())

In [20]:
trainingBatch = [[encoder_inputs,decoder_inputs]]   # For simplicity, lets try out with only one batch

In [21]:
customTransformer.train()
for epoch in range(100):  #Lets train the same batch of 1 20 times to see the loss value get reduced!!!
    for batch in trainingBatch:
        decoder_sequence = generate_decoder_input_sequence_per_encoder_batch(batch[1])
        decoder_inputs = decoder_sequence['input_ids']
        decoder_ouputs = decoder_sequence['output_ids']
        total_batch_loss = 0
        for i in range(len(decoder_sequence)):
            Optimiser.zero_grad()
            output = customTransformer(batch[0],decoder_inputs[i])
            loss = Criterion(output,decoder_ouputs[i])
            total_batch_loss += loss.item()
            loss.backward()
            Optimiser.step()
        if epoch % 10 == 0:
            print('Epoch: ',epoch,'Loss: ',round(total_batch_loss,4))

Epoch:  0 Loss:  21.8384
Epoch:  10 Loss:  2.5195
Epoch:  20 Loss:  1.4578
Epoch:  30 Loss:  1.3344
Epoch:  40 Loss:  0.7253
Epoch:  50 Loss:  0.5453
Epoch:  60 Loss:  0.4437
Epoch:  70 Loss:  0.2887
Epoch:  80 Loss:  0.2391
Epoch:  90 Loss:  0.1771


In [None]:
# Yessssss, the the loss get reduced, which means the model can learn.

In [None]:
# To do: Need to create a custom function to use the trained model, such as ... 
                         #...continously generating a sequence of outputs

# Training can occur in parallel, but using the model can only occur sequentially as the model can
# only generate one token as a time... Training is the hardest part!!!!