<a href="https://colab.research.google.com/github/chandresh564/Transformer-implementation-in-Pytorch/blob/main/Transformers_implementation_in_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

#Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first tensor.

    The table is precomputed once and stored in the ``pe`` buffer with shape
    ``(max_len, 1, d_model)``; ``forward`` slices it along dim 0 and lets it
    broadcast over dim 1, so inputs are expected as
    ``(seq_len, batch, d_model)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        max_len: Number of positions to precompute. Longer inputs are
            rejected in ``forward``.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, max_len=50, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        # One column of angles per even feature index.
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus angle column for the cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Singleton middle dim so the table broadcasts across dim 1 of the
        # input; registered as a buffer so it follows .to()/.cuda() without
        # being a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose dim 0 is the sequence position and whose last
                dim has size ``d_model``.

        Returns:
            Tensor of the same shape as ``x``, after dropout.

        Raises:
            ValueError: If ``x.size(0)`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

# Loss monitor
class AverageMeter:
    """Running weighted average of a scalar, such as a per-batch loss.

    Attributes are plain fields: ``val`` is the most recent value, ``sum``
    the weighted total, ``count`` the total weight, and ``avg`` equals
    ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += n * val
        self.avg = self.sum / self.count


# Transformer class
class Transformer(nn.Module):
    """Encoder-only Transformer that maps a sequence back to its input width.

    Inputs of shape ``(batch, seq_len, input_dim)`` are projected to
    ``embed_dim``, given positional encodings, passed through a stack of
    ``nn.TransformerEncoderLayer`` blocks (``batch_first=True``), and
    projected back to ``input_dim``.

    Args:
        input_dim: Number of features per time step in the input and output.
        hidden_dim: Width of the feed-forward sublayer (``dim_feedforward``).
        num_heads: Number of attention heads; must divide ``embed_dim``.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
        embed_dim: Internal model width (``d_model``).
        max_len: Longest sequence the positional encoding supports.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.1,
                 embed_dim=512, max_len=50):
        super().__init__()

        # The attention heads split the model width, not the raw input width,
        # so that is the quantity that has to be divisible.
        if embed_dim % num_heads != 0:
            raise ValueError("embed_dim must be divisible by num_heads")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_layers = num_layers

        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads

        # Lift the raw features to the model width; without this, inputs of
        # width input_dim cannot be fed to an encoder built for embed_dim.
        self.input_proj = nn.Linear(input_dim, embed_dim)

        # Transformer Encoder layers
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )

        self.end_linear = nn.Linear(embed_dim, input_dim)

        self.positional_encoding = PositionalEncoding(embed_dim, max_len)

    # masking
    def create_src_mask(self, x):
        """Return the key-padding mask for ``x``.

        A time step counts as padding when every feature in it is exactly 0.

        Args:
            x: Tensor whose last dim holds the features of one time step.

        Returns:
            Boolean tensor with the last dim of ``x`` reduced away; ``True``
            marks padded steps. PyTorch ignores keys where a boolean
            ``src_key_padding_mask`` is ``True``, whereas a float mask would
            merely be added to the attention scores.
        """
        return torch.all(x == 0, dim=-1)

    def forward(self, x):
        """Encode ``x`` and project it back to ``input_dim`` features.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, input_dim)``.
        """
        # Build the mask from the raw input: after the projection, padded
        # steps are no longer all-zero.
        # NOTE(review): a sample whose steps are all padding leaves attention
        # with no valid key; confirm such samples never reach the model.
        self.src_mask = self.create_src_mask(x)

        x = self.input_proj(x)
        # PositionalEncoding indexes positions along dim 0, while the encoder
        # here is batch-first, so swap to sequence-first and back.
        x = self.positional_encoding(x.transpose(0, 1)).transpose(0, 1)

        encoded = self.encoder(x, src_key_padding_mask=self.src_mask)
        decoded = self.end_linear(encoded)
        return decoded

# Hyper-parameters
input_dim = 24
hidden_dim = 128
num_heads = 8
num_layers = 6

# Data Loading
# NOTE(review): `list_` is not defined in this section; presumably it is
# created in an earlier cell. Confirm before running this cell on its own.
array = np.array(list_, dtype='float32')
tensor = torch.from_numpy(array)
dataset = TensorDataset(tensor)
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size)

model = Transformer(input_dim, hidden_dim, num_heads, num_layers)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Build the loss module once rather than once per batch.
criterion = nn.MSELoss()

# Training loop: the model is trained to reconstruct its own input.
num_epochs = 500
for epoch in range(num_epochs):
    # One meter per epoch, so the reported loss is the sample-weighted mean
    # over every batch instead of whatever the last batch produced.
    train_loss = AverageMeter()

    for batch in dataloader:
        optimizer.zero_grad()
        input_tensor = batch[0].float()

        output = model(input_tensor)
        loss = criterion(output, input_tensor)

        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        train_loss.update(loss.item(), output.size(0))

    if epoch % 50 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss.avg}")