In [5]:
# import the libraries

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import math, copy, time

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn 

In [None]:
# create the encoder-decoder class

class EncoderDecoder(nn.Module):
    """
    Base class for the model.

    The encoder maps the embedded source sequence to a representation
    (the "memory"); the decoder maps that representation, together with
    the embedded target sequence, to an output sequence.

    Args:
        encoder: module called as ``encoder(embedded_source, source_mask)``.
        decoder: module called as
            ``decoder(embedded_target, memory, source_mask, target_mask)``.
        source_embed: module applied to ``source`` before encoding.
        target_embed: module applied to ``target`` before decoding.
        generator: stored on the model; it is NOT applied in ``forward``.
    """

    def __init__(self, encoder, decoder, source_embed,
                  target_embed, generator):
        # nn.Module.__init__ must actually be *called* (note the parentheses)
        # before any submodule is assigned, otherwise the assignments below
        # raise because the module registry does not exist yet.
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.generator = generator

    def forward(self, source, target,
                source_mask, target_mask):
        """Encode ``source`` and decode ``target`` against it.

        Returns the decoder output; the generator is not applied here.
        """
        memory = self.encode(source, source_mask)
        return self.decode(memory, source_mask, target, target_mask)

    def encode(self, source, source_mask):
        """Embed ``source`` and run it through the encoder."""
        return self.encoder(self.source_embed(source), source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        """Embed ``target`` and run the decoder over it and ``memory``."""
        return self.decoder(self.target_embed(target), memory,
                            source_mask, target_mask)
    

    


In [None]:
class Generator(nn.Module):
    """
    Output head of the model.

    Projects features of width ``d_model`` to ``vocabulary`` scores with a
    linear layer, then applies a log-softmax over the last dimension.
    """

    def __init__(self, d_model, vocabulary) -> None:
        super().__init__()
        self.project = nn.Linear(d_model, vocabulary)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``project(x)``."""
        scores = self.project(x)
        # log_softmax(s)_i = log(exp(s_i) / sum_j exp(s_j))
        return F.log_softmax(scores, dim=-1)


In [None]:
# The Encoder Stack

# create 6 identical layers for the encoder

def clone(layer, number_of_layers):
    """Return an nn.ModuleList holding number_of_layers deep copies of layer."""
    copies = []
    for _ in range(number_of_layers):
        # deepcopy so every copy owns its own parameters
        copies.append(copy.deepcopy(layer))
    return nn.ModuleList(copies)

In [6]:
class Encoder(nn.Module):
    """Core encoder: number_of_layers identical layers, then a final layer norm."""

    def __init__(self, layer, number_of_layers):
        super().__init__()
        self.layers = clone(layer, number_of_layers)
        # `layer` must expose a `size` attribute; it is the normalized shape.
        # norm = (x - E[x]) / sqrt(Var[x]) * gamma + beta
        # NOTE(review): this is the built-in nn.LayerNorm, whereas Decoder and
        # SublayerConnection use the custom LayerNorm class from this
        # notebook -- confirm the difference is intended.
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass x and its mask through each layer in order, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [7]:
class LayerNorm(nn.Module):
    """Custom layer norm with a learnable scale (gamma) and shift (beta)."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # eps here defaults to 1e-6; PyTorch's own LayerNorm defaults to 1e-5.
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Normalize x over its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta

In [None]:
class SublayerConnection(nn.Module):
    """
    Residual connection around a sublayer.

    Computes x + Dropout(Sublayer(Norm(x))): the norm is applied to the
    sublayer's input, and the un-normalized x is added back afterwards.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to a sublayer of the same size."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch
    
    

Each encoder layer has two sublayers: a self-attention sublayer and a fully connected feed-forward sublayer.

In [None]:
class EncodeLayer(nn.Module):
    """
    One encoder layer with two sublayers: self-attention, then feed-forward.

    Each sublayer is wrapped in its own SublayerConnection.
    """

    def __init__(self, size, dropout, self_attention, feed_forward):
        super().__init__()
        self.size = size
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        # two independent residual wrappers, one per sublayer
        self.sublayer = clone(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """Run self-attention over x (with mask), then the feed-forward block."""

        def attend(inner):
            # the same tensor is passed as all three attention inputs
            return self.self_attention(inner, inner, inner, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

Decoder: the decoder is also composed of 6 identical layers.

In [None]:
class Decoder(nn.Module):
    """
    Decoder made of number_of_layers identical layers, with masking,
    followed by a final layer norm.
    """

    def __init__(self, layer, number_of_layers):
        super().__init__()
        self.layers = clone(layer, number_of_layers)
        # `layer` must expose a `size` attribute; it sizes the final norm.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, source_mask, target_mask):
        """Pass x through each layer with memory and both masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, source_mask, target_mask)
        return self.norm(hidden)
     
     