# Scratchpad 1 - build GPT model

## 1. GPT Model Architecture

With Dummy Transformer Blocks and LayerNorm modules.

In [1]:
# Hyperparameters read by the model classes defined in this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50_257,     # rows of the token-embedding table / width of the output logits
    "context_length": 1_024,  # rows of the positional-embedding table and side of the causal mask
    "embedding_dim": 768,     # width of every token / hidden vector
    "n_heads": 12,            # attention heads inside each transformer block
    "n_layers": 12,           # number of stacked transformer blocks
    "dropout_rate": 0.1,      # probability handed to every nn.Dropout
    "qkv_bias": False,        # whether the query/key/value projections get a bias term
}

In [2]:
import torch
import torch.nn as nn

In [3]:
class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block: a parameter-free identity mapping."""

    def __init__(self, config):
        # ``config`` is accepted only so the constructor can be called like a
        # real block's; nothing is read from it.
        nn.Module.__init__(self)

    def forward(self, x):
        """Hand the input back unchanged."""
        return x

In [4]:
class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer: a parameter-free identity mapping."""

    def __init__(self, config):
        # ``config`` is accepted only so the constructor can be called like the
        # real layer's; nothing is read from it.
        nn.Module.__init__(self)

    def forward(self, x):
        """Hand the input back unchanged (no normalization is performed)."""
        return x

In [5]:
class GPTModel(nn.Module):
    """GPT-style model skeleton built from placeholder blocks.

    Pipeline: token embedding + positional embedding -> dropout ->
    ``n_layers`` x ``DummyTransformerBlock`` -> ``DummyLayerNorm`` -> linear
    output head producing one logit per vocabulary entry.

    Args:
        config: Mapping providing ``"vocab_size"``, ``"context_length"``,
            ``"embedding_dim"``, ``"dropout_rate"`` and ``"n_layers"``.
    """

    def __init__(self, config):
        super().__init__()

        # Embedding layers: one lookup table indexed by token ID, one indexed
        # by position in the sequence.
        self.tok_emb = nn.Embedding(config["vocab_size"], config["embedding_dim"])
        self.pos_emb = nn.Embedding(config["context_length"], config["embedding_dim"])
        # Dropout applied to the summed embeddings.
        self.dropout_emb = nn.Dropout(config["dropout_rate"])

        # Transformer blocks; "*" unpacks the list so every block becomes a
        # positional argument of nn.Sequential.
        self.transformer_blocks = nn.Sequential(
            *[DummyTransformerBlock(config) for _ in range(config["n_layers"])]
        )

        # Final normalization layer.
        self.final_norm = DummyLayerNorm(config)
        # Output head: maps each hidden state of size embedding_dim to
        # vocab_size scores, one per token ID. The scores at a given position
        # are the prediction for the token that follows that position.
        self.out_head = nn.Linear(config["embedding_dim"], config["vocab_size"])

    def forward(self, in_idx):
        """Compute next-token logits.

        Args:
            in_idx: 2-D tensor of token IDs, shape ``(batch_size, seq_len)``.

        Returns:
            Logits of shape ``(batch_size, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of positions the
                positional embedding table holds (``context_length``).
        """
        _, seq_len = in_idx.shape

        # Fail early with a readable message instead of an opaque
        # out-of-range error from the positional embedding lookup.
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds context_length {max_len}"
            )

        tok_emb = self.tok_emb(in_idx)  # (batch_size, seq_len, embedding_dim)
        # (seq_len, embedding_dim); broadcast over the batch dimension below.
        pos_emb = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_emb + pos_emb

        x = self.dropout_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)  # (batch_size, seq_len, vocab_size)
        return logits

For the token embedding layer, the `nn.Embedding(vocab_size, embedding_dim)` module creates a token embedding lookup table of shape `(vocab_size, embedding_dim)`.

Given an input sequence of `context_length` token IDs, or a batch of such sequences (shape `(batch_size, context_length)`), it looks up the embedding vector (of size `embedding_dim`) for each token ID and outputs a token embedding tensor of shape `(context_length, embedding_dim)`, or `(batch_size, context_length, embedding_dim)` when batched.



## 2. LayerNorm module

Layer normalization improves stability and efficiency of neural network training. 

The main idea is to adjust the activations (outputs) to have: 
- mean of 0
- variance of 1 (unit variance)

In GPT-2 LayerNorm is typically applied 
- before the multi-head attention module and, after it, before the feed-forward module inside each transformer block
- before the final output head

In [6]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        config: Mapping providing ``"embedding_dim"``, the size of the
            normalized (last) dimension.
    """

    def __init__(self, config):
        super().__init__()
        width = config["embedding_dim"]
        # Small constant added to the variance so the division never hits zero.
        self.eps = 1e-5
        # Learnable per-feature gain (starts at 1) and offset (starts at 0),
        # letting training rescale and recenter the normalized activations.
        self.scale = nn.Parameter(torch.ones(width))
        self.shift = nn.Parameter(torch.zeros(width))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mu = torch.mean(x, dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N-1 (no Bessel's correction).
        sigma_sq = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        normalized = centered / torch.sqrt(sigma_sq + self.eps)
        return self.scale * normalized + self.shift



## 3. Transformer block

### 3.1 Multihead Attention

In [7]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``n_heads``.
        context_length: Side length of the causal mask, i.e. the longest
            sequence the mask covers.
        droupout: Dropout probability applied to the attention weights.
            (The misspelled name is kept because callers pass it by keyword.)
        n_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections include a bias.
    """

    def __init__(self, d_in, d_out, context_length, droupout, n_heads, qkv_bias=False):
        super().__init__()
        assert d_out % n_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads  # dimension of each head

        # Query, key, value projections.
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Linear layer that mixes the concatenated head outputs. It is
        # created exactly once; building it twice would only waste an
        # allocation and an extra random initialization.
        self.out_proj = nn.Linear(d_out, d_out)

        # Causal mask: ones strictly above the diagonal mark the "future"
        # positions a token must not attend to. Registering it as a buffer
        # makes it follow the module across devices (CPU/GPU).
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

        # Dropout on the attention weights to reduce overfitting.
        self.dropout = nn.Dropout(droupout)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_in)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_out)``.
        """
        batch_size, seq_len, _ = x.shape

        keys = self.W_k(x)  # (batch_size, seq_len, d_out)
        queries = self.W_q(x)
        values = self.W_v(x)

        # Split the last dimension into heads:
        # (batch_size, seq_len, d_out) -> (batch_size, seq_len, n_heads, head_dim)
        keys = keys.view(batch_size, seq_len, self.n_heads, self.head_dim)
        queries = queries.view(batch_size, seq_len, self.n_heads, self.head_dim)
        values = values.view(batch_size, seq_len, self.n_heads, self.head_dim)

        # Move the head axis forward so attention runs per head:
        # (batch_size, seq_len, n_heads, head_dim) -> (batch_size, n_heads, seq_len, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Dot product of every query with every key, per head:
        # (batch_size, n_heads, seq_len, seq_len)
        attn_scores = queries @ keys.transpose(2, 3)

        # Boolean mask truncated to the current sequence length; masked
        # positions get -inf so softmax assigns them zero weight.
        mask_bool = self.mask.bool()[:seq_len, :seq_len]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim), then normalize over the key axis.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Weighted sum of values, then put the head axis back next to head_dim:
        # (batch_size, n_heads, seq_len, head_dim) -> (batch_size, seq_len, n_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Concatenate the heads; contiguous() is needed because transpose
        # returns a non-contiguous view that view() cannot reshape.
        context_vec = context_vec.contiguous().view(batch_size, seq_len, self.d_out)
        # Final linear projection.
        context_vec = self.out_proj(context_vec)

        return context_vec


### 3.2 FeedForward

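`GELU` (Gaussian Error Linear Unit) is a smoother, more complex activation function than ReLU: instead of hard-gating its input at zero, it weights the input by the Gaussian cumulative distribution function. It often gives better performance in deep learning models than the simple ReLU. The cell below implements its tanh approximation.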

In [8]:
import torch
import torch.nn as nn

class GELU(nn.Module):
    """GELU activation, computed with the tanh approximation.

    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the result has the shape of ``x``."""
        # sqrt(2 / pi) as a plain Python float: no tensor is allocated on
        # every call, and the constant is not truncated to float32 when x
        # has a higher-precision dtype.
        coeff = (2.0 / torch.pi) ** 0.5
        return 0.5 * x * (1 + torch.tanh(coeff * (x + 0.044715 * torch.pow(x, 3))))

FeedForward layer enhances the model's ability to learn from and generalize the data.

Although the input and output dimensions are the same, the layer internally expands the embedding dimension into a higher-dimensional space through the first linear layer, applies a nonlinear GELU activation, and then contracts back to the original dimension with the second linear layer, allowing the model to explore a richer representation space.

In [9]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        config: Mapping providing ``"embedding_dim"``, the input and output width.
    """

    def __init__(self, config):
        super().__init__()
        # Build the three stages in order, then chain them.
        expand = nn.Linear(config["embedding_dim"], config["embedding_dim"] * 4)
        activation = GELU()
        contract = nn.Linear(config["embedding_dim"] * 4, config["embedding_dim"])
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through expand -> GELU -> contract; the last dimension keeps its size."""
        out = self.layers(x)
        return out

### 3.3 The full Transformer block

- LayerNorm + Multihead attention module + dropout. Shortcut surrounding these modules.
- LayerNorm + FeedForward module + dropout. Shortcut surrounding these modules.

In [10]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """One transformer block: two residual sub-layers, each normalized first.

    Sub-layer 1: LayerNorm -> multi-head attention -> dropout, plus shortcut.
    Sub-layer 2: LayerNorm -> feed-forward -> dropout, plus shortcut.

    Args:
        config: Mapping providing ``"embedding_dim"``, ``"context_length"``,
            ``"dropout_rate"``, ``"n_heads"`` and ``"qkv_bias"``.
    """

    def __init__(self, config):
        super().__init__()

        # Attention sub-layer.
        self.norm1 = LayerNorm(config)
        attention_kwargs = {
            "d_in": config["embedding_dim"],
            "d_out": config["embedding_dim"],
            "context_length": config["context_length"],
            "droupout": config["dropout_rate"],
            "n_heads": config["n_heads"],
            "qkv_bias": config["qkv_bias"],
        }
        self.att = MultiHeadAttention(**attention_kwargs)

        # Feed-forward sub-layer.
        self.norm2 = LayerNorm(config)
        self.ff = FeedForward(config)

        # One dropout module is reused on the output of both sub-layers.
        self.drop_shortcut = nn.Dropout(config["dropout_rate"])

    def forward(self, x):
        """Apply both residual sub-layers; the output has the shape of ``x``."""
        # Residual connection around the attention sub-layer.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Residual connection around the feed-forward sub-layer.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

## GPT with real LayerNorm and Transformer Blocks

In [11]:
class GPTModel(nn.Module):
    """GPT-style model built from ``TransformerBlock`` and ``LayerNorm``.

    Pipeline: token embedding + positional embedding -> dropout ->
    ``n_layers`` x ``TransformerBlock`` -> ``LayerNorm`` -> linear output
    head producing one logit per vocabulary entry.

    Args:
        config: Mapping providing ``"vocab_size"``, ``"context_length"``,
            ``"embedding_dim"``, ``"dropout_rate"`` and ``"n_layers"``, plus
            whatever keys ``TransformerBlock`` and ``LayerNorm`` read from it.
    """

    def __init__(self, config):
        super().__init__()

        # Embedding layers: one lookup table indexed by token ID, one indexed
        # by position in the sequence.
        self.tok_emb = nn.Embedding(config["vocab_size"], config["embedding_dim"])
        self.pos_emb = nn.Embedding(config["context_length"], config["embedding_dim"])
        # Dropout applied to the summed embeddings.
        self.dropout_emb = nn.Dropout(config["dropout_rate"])

        # Transformer blocks; "*" unpacks the list so every block becomes a
        # positional argument of nn.Sequential.
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["n_layers"])]
        )

        # Final normalization layer.
        self.final_norm = LayerNorm(config)
        # Output head: maps each hidden state of size embedding_dim to
        # vocab_size scores, one per token ID. The scores at a given position
        # are the prediction for the token that follows that position.
        self.out_head = nn.Linear(config["embedding_dim"], config["vocab_size"])

    def forward(self, in_idx):
        """Compute next-token logits.

        Args:
            in_idx: 2-D tensor of token IDs, shape ``(batch_size, seq_len)``.

        Returns:
            Logits of shape ``(batch_size, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of positions the
                positional embedding table holds (``context_length``).
        """
        _, seq_len = in_idx.shape

        # Fail early with a readable message instead of an opaque
        # out-of-range error from the positional embedding lookup.
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds context_length {max_len}"
            )

        tok_emb = self.tok_emb(in_idx)  # (batch_size, seq_len, embedding_dim)
        # (seq_len, embedding_dim); broadcast over the batch dimension below.
        pos_emb = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_emb + pos_emb

        x = self.dropout_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)  # (batch_size, seq_len, vocab_size)
        return logits