<a href="https://colab.research.google.com/github/elangbijak4/LLM-SLM-Examples/blob/main/Transformer_from_torch_nn_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed and
    stored as a (non-trainable) buffer. Even feature columns hold
    ``sin(pos * freq)`` and odd feature columns hold ``cos(pos * freq)``.

    Args:
        d_model: Size of the feature (last) dimension. May be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so drop the last frequency for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a singleton axis at dim 1 so the table broadcasts over it.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose dim 0 indexes positions and whose last dim is
                ``d_model``; ``x.size(0)`` should not exceed ``max_len``.
        """
        return x + self.pe[:x.size(0), :]

class TransformerModel(nn.Module):
    """Encoder-decoder model: embedding -> positional encoding -> nn.Transformer -> linear.

    Args:
        input_dim: Number of rows in the token embedding table.
        d_model: Embedding / transformer feature size.
        nhead: Number of attention heads.
        nhid: Feed-forward width inside each transformer layer.
        nlayers: Number of layers used for BOTH the encoder and the decoder.
        output_dim: Size of the final linear projection.

    Note:
        Source and target tokens share the same ``input_dim``-sized embedding,
        so target token ids must also be smaller than ``input_dim``.
    """

    def __init__(self, input_dim, d_model, nhead, nhid, nlayers, output_dim):
        super().__init__()
        self.model_type = 'Transformer'
        # Retained for backward compatibility; forward() does not read it.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(d_model)
        self.encoder = nn.Embedding(input_dim, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dim_feedforward=nhid,
        )
        self.decoder = nn.Linear(d_model, output_dim)
        self.d_model = d_model

    def _embed(self, tokens):
        """Embed token ids, scale by sqrt(d_model), and add positional encodings."""
        return self.pos_encoder(self.encoder(tokens) * math.sqrt(self.d_model))

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Run the model and project decoder states with the output layer.

        Args:
            src: Source token ids, sequence-first (the transformer is built
                with its default ``batch_first=False``).
            tgt: Target token ids, sequence-first.
            src_mask: Optional attention mask for the encoder.
            tgt_mask: Optional attention mask for the decoder self-attention,
                e.g. a causal mask so a position cannot attend to later ones.
            src_key_padding_mask: Optional padding mask for ``src``.
            tgt_key_padding_mask: Optional padding mask for ``tgt``.
            memory_key_padding_mask: Optional padding mask for encoder output.

        Returns:
            Tensor with the same leading dims as ``tgt`` and last dim
            ``output_dim``.

        All masks default to ``None``, in which case attention is unrestricted
        (identical to calling ``forward(src, tgt)`` before masks were accepted).
        """
        hidden = self.transformer(
            self._embed(src),
            self._embed(tgt),
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.decoder(hidden)

# ---- Model configuration ----
input_dim = 10000   # source vocabulary size (rows of the embedding table)
output_dim = 10000  # target vocabulary size (width of the final projection)
d_model = 512       # embedding / transformer feature size
nhead = 8           # attention heads per layer
nhid = 2048         # feed-forward width inside each transformer layer
nlayers = 6         # layer count, used for both the encoder and decoder stacks

# ---- Build the model ----
model = TransformerModel(
    input_dim=input_dim,
    d_model=d_model,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    output_dim=output_dim,
)

# ---- Random example batch, laid out as (seq_len, batch_size) ----
seq_len, batch_size = 10, 2
src = torch.randint(0, input_dim, (seq_len, batch_size))
tgt = torch.randint(0, output_dim, (seq_len, batch_size))

# ---- Forward pass ----
output = model(src, tgt)

# Expected shape: (seq_len, batch_size, output_dim)
print(output.shape)




torch.Size([10, 2, 10000])
