In [2]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

In [None]:
class InputEmbeddings(nn.Module):
    """
    Maps integer token indices to dense, scaled embedding vectors.

    Why it's important: Models operate on numbers, not text. An embedding table places each token in a
    continuous vector space where semantic similarity can be expressed through distance and direction.
    How we build it: A learnable nn.Embedding lookup table whose output is multiplied by
    sqrt(embedding_size). The scaling is mainly there for training stability.

    Args:
        embedding_size: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table (one per token id).
    """
    def __init__(self, embedding_size: int, vocab_size: int) -> None:
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        # Lookup table with one row of length `embedding_size` per token id
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Integer tensor of token indices.

        Returns:
            The looked-up embeddings (input shape plus a trailing `embedding_size` axis),
            multiplied by sqrt(embedding_size).
        """
        scale = math.sqrt(self.embedding_size)
        token_vectors = self.embedding(x)
        return token_vectors * scale

In [None]:
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to token embeddings, then applies dropout.

    Why it's important: The attention mechanism is permutation invariant. Without positional encoding,
    the model would treat the sequence as a bag of words.
    How we build it: Sine and cosine functions of different frequencies generate a unique encoding for
    each position. The encodings are precomputed once and added to the token embeddings.

    Args:
        embedding_size: Size of the feature dimension of the embeddings (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        sequence_len: Number of positions to precompute encodings for.
    """
    def __init__(self, embedding_size: int, dropout: float, sequence_len: int):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create a positional encoding matrix of shape (sequence_len, embedding_size)
        positional_encoding = torch.zeros(sequence_len, embedding_size)
        position = torch.arange(0, sequence_len, dtype=torch.float).unsqueeze(1) # (sequence_len, 1)
        # exp(-2i * ln(10000) / d) == 1 / 10000^(2i / d), computed in log space
        div_term = torch.exp(torch.arange(0, embedding_size, 2).float() * (-math.log(10000.0) / embedding_size))
        angles = position * div_term # (sequence_len, ceil(embedding_size / 2))

        # Apply sine to even indices and cosine to odd indices
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd embedding_size there is one fewer odd column than even columns,
        # so trim the angles to the number of odd columns before assigning.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : embedding_size // 2])
        positional_encoding = positional_encoding.unsqueeze(0) # (1, sequence_len, embedding_size) for batch broadcasting

        self.register_buffer("positional_encoding", positional_encoding) # Not a model parameter but a part of the state

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, sequence_len, embedding_size). Its sequence
               length is expected not to exceed the `sequence_len` given at construction,
               since only that many positions are precomputed.

        Returns:
            Tensor of the same shape as `x` with the positional encoding added and dropout applied.
        """
        x = x + self.positional_encoding[:, :x.size(1)] # Add encodings for the first x.size(1) positions
        return self.dropout(x)

In [4]:
class LayerNormalization(nn.Module):
    """
    Normalizes the inputs across the feature dimension for each data point in the batch independently.

    Why it's important: Reduces internal covariate shift and thus helps in stabilizing and accelerating training.
    How we build it: Compute the mean and (population) standard deviation of each input across its last
    dimension, normalize with them, then apply a learnable per-feature scale and shift.

    Args:
        embedding_size: Size of the last (feature) dimension of the inputs.
        eps: Small constant added to the standard deviation to avoid division by zero.
    """
    def __init__(self, embedding_size: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(embedding_size)) # Learnable scale
        self.bias = nn.Parameter(torch.zeros(embedding_size)) # Learnable shift

    def forward(self, x):
        """
        Args:
            x: Input tensor whose last dimension has size `embedding_size`.

        Returns:
            Tensor of the same shape as `x`, normalized over the last dimension.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False -> divide by N rather than N - 1
        std = x.std(dim=-1, keepdim=True, unbiased=False)
        # eps is added to std (not to the variance), so constant rows map to zeros instead of NaN
        normalised = (x - mean) / (std + self.eps)
        return self.alpha * normalised + self.bias

tensor([0., 2., 4., 6., 8.])

In [None]:
class MultiHeadAttention(nn.Module):
    """
    The multi-head attention mechanism allows the model to focus on different parts of the input sequence simultaneously.
    ANALOGY: Researching a topic (query) when you have multiple books (keys) with different content (values). Attention is like deciding which books are relevant and how much to read from each.
    """
    def __init__(self, embedding_size: int, n_heads: int, dropout: float):
        super.__init__()
        assert embedding_size % n_heads == 0, "Embedding size must be divisible by n_heads"
        self.embedding_size = embedding_size
        self.n_heads = n_heads
        self.dimensions_per_head = embedding_size // n_heads # Dimensions per head

        # Why separate projects? Each head learns different aspects
        self.w_q = nn.Linear(embedding_size, embedding_size) # Query projection
        self.w_k = nn.Linear(embedding_size, embedding_size) # Key projection
        self.w_v = nn.Linear(embedding_size, embedding_size) # Value projection
        self.w_o = nn.Linear(embedding_size, embedding_size) # Output projection

        self.dropout = nn.Dropout(dropout)

    # Attention mechanism: Core calculation