<a href="https://colab.research.google.com/github/ezzy4me/project/blob/main/attention_is_all_i_need.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Transformer implementation

Import libraries

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import matplotlib.pyplot as plt

# plt.style.use('dark_background')

## Embedding

In [None]:
class Embedding(nn.Module):
    """Token lookup table whose output vectors are scaled by ``sqrt(d_model)``.

    Args:
        vocab_len: number of rows in the table (``num_embeddings``).
        d_model: width of each embedding vector (``embedding_dim``); also the
            value placed under the square root in ``forward``.
        padding_idx: forwarded unchanged to ``nn.Embedding``; defaults to 1.
    """

    def __init__(self, vocab_len, d_model, padding_idx=1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(
            num_embeddings=vocab_len,
            embedding_dim=d_model,
            padding_idx=padding_idx,
        )

    def forward(self, x):
        """Return the embeddings of the indices in ``x`` times ``sqrt(d_model)``."""
        # The paper's authors multiply the embedding weights by sqrt(d_model).
        scale = np.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

In [None]:
# Scratch cell: check the shapes produced by broadcasting a column of
# positions against a row of frequency terms (toy sizes, not the real formula).
position = torch.arange(5).unsqueeze(1)  # column vector of positions 0..4
print(position.shape)

# Other things tried while exploring:
#   torch.arange(0, 20, 2)
#   pe = torch.zeros(10, 20); pe.unsqueeze(0).size()

div_term = torch.arange(0, 20, 2).exp()  # one term per even index below 20
print(div_term.shape)

# [5, 1] * [10] broadcasts to [5, 10].
pe = (position * div_term).sin()
print(pe.shape)

# Size of the second axis (shown as the notebook cell's result).
pe.shape[1]

torch.Size([5, 1])
torch.Size([10])
torch.Size([5, 10])


10

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is computed once in the constructor and stored as the buffer
    ``pe`` with shape ``[1, max_len, d_model]``.

    Args:
        d_model: size of the last axis of the encoding table. Both even and
            odd values are supported.
        dropout: probability handed to ``nn.Dropout``.
        max_len: number of positions precomputed in the table (default 5000).
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))  # [ceil(d_model / 2)]
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)  # even columns (2i)
        # There are only d_model // 2 odd columns, which is one fewer than the
        # even columns when d_model is odd; trim so the shapes always match.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # odd columns (2i+1)
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]

        # A buffer is part of the module's state (saved in state_dict) but is
        # not a model parameter, so the optimizer never updates it.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x`` must broadcast against ``[1, x.size(1), d_model]`` -- presumably
        ``(batch, seq_len, d_model)``; confirm against the caller.
        """
        # The buffer never requires grad, so the slice needs no explicit freezing.
        x = x + self.pe[:, :x.size(1)]

        # Paper, page 7: dropout is applied to the sums of the embeddings and
        # the positional encodings in both the encoder and decoder stacks
        # (Pdrop = 0.1 for the base model).
        return self.dropout(x)
        

Final stage: the full Transformer model

In [6]:
import torch.nn as nn
class TransformerModel(nn.Module):
    """Wire an encoder and a decoder together into one module.

    Args:
        encoder: callable invoked as ``encoder(src_tokens, src_lengths)``; it
            must return a pair ``(encoder_out, padding_mask)``.
        decoder: callable invoked as
            ``decoder(prev_output_tokens, encoder_out, padding_mask)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        # NOTE(review): this flag is set here and not read in the visible code;
        # its purpose is unclear -- confirm before relying on it.
        self._is_generation_fast = False
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src_tokens, src_lengths, prev_output_tokens):
        """Encode the source, then return the decoder's result unchanged."""
        encoded = self.encoder(src_tokens, src_lengths)
        memory, memory_padding_mask = encoded
        return self.decoder(prev_output_tokens, memory, memory_padding_mask)