# Implementing A GPT-model From Scratch

- Coding a GPT-like LLM that can be trained to generate human-like text.
- Normalizing layer activations to stabilize NN training.
- Adding `shortcut connections` to train deep NNs effectively.
- Implementing transformer blocks to create GPT models of various sizes.
- Computing the number of parameters and storage requirements of GPT models.

In [None]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

In [None]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named colour styles ("info", "warning", "error") registered with rich so
# console output can refer to them by name.
custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
# Console instance that resolves the style names declared in `custom_theme`.
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings: raise the display limits so long/wide frames and long cell
# values are shown instead of being truncated.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# NOTE(review): this suppresses every warning category for the whole session
# (including deprecation warnings from the imported libraries) — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [None]:
# Hyperparameters consumed by the model classes defined in this notebook.
GPT_CONFIG_124M: dict[str, Any] = {
    "vocab_size": 50_257,  # Rows of the token-embedding table and width of the output logits
    "context_length": 1_024,  # Rows of the positional-embedding table
    "emb_dim": 768,  # Width of the token/positional embeddings
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of transformer blocks stacked in the model
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Not read in this section; presumably toggles bias on the Q/K/V projections — TODO confirm
}

### A Placeholder GPT Backbone

In [None]:
class DummyGPTModel(nn.Module):
    """Placeholder GPT backbone that wires the data flow end to end.

    Token ids are embedded, summed with positional embeddings, passed through
    dropout, a stack of (no-op) transformer blocks and a (no-op) final norm,
    and finally projected to vocabulary-sized logits.

    Args:
        cfg: Model configuration. Reads ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``.

    Raises:
        KeyError: If any of the required configuration keys is missing.
    """

    def __init__(self, cfg: dict[str, Any]) -> None:
        super().__init__()

        # Index the config directly: a missing key fails immediately with a
        # KeyError naming it, instead of handing ``None`` to a layer constructor.
        vocab_size: int = cfg["vocab_size"]
        emb_dim: int = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.transformer_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Map a 2-D tensor of token ids to logits with a trailing ``vocab_size`` axis."""
        # Unpacking enforces a 2-D input; only the second dimension is needed.
        _, seq_len = x.shape
        tok_embeds: Tensor = self.tok_emb(x)
        # Position ids 0..seq_len-1, created on the same device as the input.
        pos_embeds: Tensor = self.pos_emb(torch.arange(seq_len, device=x.device))
        # Token + positional embeddings form the input embeddings
        # (pos_embeds broadcasts over the leading batch dimension).
        x = self.drop_emb(tok_embeds + pos_embeds)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits: Tensor = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block; returns its input unchanged.

    Args:
        cfg: Model configuration. Accepted for interface parity; not read.
    """

    def __init__(self, cfg: dict[str, Any]) -> None:
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalization; returns its input unchanged.

    Args:
        cfg: Ignored. ``DummyGPTModel`` passes the embedding dimension (an
            ``int``) here, so the annotation reflects that; the parameter name
            is kept so existing callers are unaffected.
    """

    def __init__(self, cfg: int) -> None:
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        return x

### Overview

<img src="../08-Makemore/images/ch04_mental_.gpt_picpng.png" alt="gpt" width="600">

[image source](https://livebook.manning.com/book/build-a-large-language-model-from-scratch/chapter-4/v-7/36)

In [None]:
# Tokenize a batch consisting of sample texts.
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text1: str = "Every effort moves you"
text2: str = "Everyday holds a"

# One 1-D tensor of token ids per text. `torch.stack` requires every tensor
# in the list to have the same length.
batch: list[Tensor] = [
    torch.tensor(tokenizer.encode(text)) for text in (text1, text2)
]
batch_tensor: Tensor = torch.stack(batch, dim=0)

print(f"Shape of batch_tensor: {tuple(batch_tensor.shape)}\n")
print(f"Batch_tensor: \n{batch_tensor}\n")

In [None]:
# Instantiate the DummyGPTModel
# Seed first so the randomly initialised weights are reproducible; `seed` is
# reused by later cells.
seed: int = 5
torch.manual_seed(seed)

model = DummyGPTModel(GPT_CONFIG_124M)
# NOTE(review): the model is never switched to eval(), so dropout
# (drop_rate=0.1) is presumably active in this forward pass — confirm intended.
logits: Tensor = model(batch_tensor)

print(f"Shape of logits: {tuple(logits.shape)}\n")
print(f"logits: \n{logits}\n")

### Comment

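- The output shape `(2, 4, 50_257)` has `2` as its first dimension: the batch size, i.e. the number of input texts.
- The second dimension, `4`, is the number of tokens in each input text, and `50_257` is the vocabulary size (`vocab_size`) of the tokenizer, i.e. the model produces one logit per vocabulary entry for every token position.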

### Normalizing Activations With Layer Normalization

- Layer normalization is used to improve the stability and efficiency of neural network training.
- It aims to adjust the activations at the output of a NN layer to have a mean of 0 and a standard deviation of 1.
- These adjustments speed up the convergence to effective weights and ensure consistent, reliable training.

<br>

<img src="../08-Makemore/images/ch04_norm_layer.png" width="600">

[image source](https://livebook.manning.com/book/build-a-large-language-model-from-scratch/chapter-4/v-7/52)

<br><hr>

#### Layer Norm vs Batch Norm

- Batch norm normalizes across the batch dimension, whereas layer norm normalizes across the feature/channel dimension.
- Layer norm is preferred because it normalizes each input independently of the batch size and it's more efficient and stable.

In [None]:
# Re-seed so both the random input and the Linear layer's initial weights
# below are reproducible.
torch.manual_seed(seed)

# NOTE: rebinds `batch_tensor` (previously the token-id batch) to a random
# float tensor of shape (2, 5).
batch_tensor: Tensor = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out: Tensor = layer(batch_tensor)
print(f"out: \n{out}\n")

# Per-row statistics over the last (feature) dimension; keepdim=True keeps a
# trailing axis of size 1 so they broadcast against `out`.
# unbiased=False selects the population variance (divide by N, not N - 1).
mean: Tensor = out.mean(dim=-1, keepdim=True)
variance: Tensor = out.var(dim=-1, unbiased=False, keepdim=True)
print("===== Before Layer Normalization ===== \n")
print(f"mean: \n{mean}\n\nvariance: \n{variance}\n")

In [None]:
# Normalize the output
# Subtract the per-row mean and divide by the per-row standard deviation.
# NOTE(review): there is no epsilon here, so a row with zero variance would
# divide by zero; the LayerNorm class defined later adds one.
out_norm: Tensor = (out - mean) / torch.sqrt(variance)
# Recompute the statistics on the normalized output to check the result.
mean_1: Tensor = out_norm.mean(dim=-1, keepdim=True)
variance_1: Tensor = out_norm.var(dim=-1, unbiased=False, keepdim=True)
print(f"out_norm: \n{out_norm}\n")
print("===== After Layer Normalization ===== \n")
print(f"mean: \n{mean_1}\n\nvariance: \n{variance_1}\n")

In [None]:
# Improve readability
# Turn off scientific notation for tensor printing (a global setting that
# also affects every later cell), then re-print the same statistics.
torch.set_printoptions(sci_mode=False)
print("===== After Layer Normalization ===== \n")
print(f"mean: \n{mean_1}\n\nvariance: \n{variance_1}\n")

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each feature vector is standardized as ``(x - mean) / sqrt(var + eps)``
    using the biased (population) variance, then mapped through
    ``scale * x_hat + shift``. ``eps`` is added to the variance inside the
    square root, which is the placement ``torch.nn.LayerNorm`` uses.

    Args:
        embed_dim: Size of the last dimension of the inputs.
        eps: Small constant added to the variance to prevent division by zero.
    """

    def __init__(self, embed_dim: int, eps: float = 1e-5) -> None:
        super().__init__()

        self.eps: float = eps  # Prevents zero division

        # Trainable parameters, initialised to the identity transform
        # (scale = 1, shift = 0) and adjusted during training.
        self.scale = nn.Parameter(torch.ones(embed_dim))
        self.shift = nn.Parameter(torch.zeros(embed_dim))

    def forward(self, x: Tensor) -> Tensor:
        """Normalize ``x`` over its last dimension and apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False -> population variance (divide by N, not N - 1).
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        norm_x: Tensor = (x - mean) / torch.sqrt(var + self.eps)

        return (self.scale * norm_x) + self.shift

In [None]:
# Apply the LayerNorm module to the random (2, 5) `batch_tensor`; embed_dim
# matches the size of its last dimension.
layer_norm = LayerNorm(embed_dim=5)
out_layer_norm: Tensor = layer_norm(batch_tensor)

# NOTE: `mean` and `variance` are rebound here to the statistics of the
# normalized output (population variance, matching the earlier cells).
mean: Tensor = out_layer_norm.mean(dim=-1, keepdim=True)
variance: Tensor = out_layer_norm.var(dim=-1, unbiased=False, keepdim=True)
print(f"out_layer_norm: \n{out_layer_norm}\n")
print("===== After Layer Normalization ===== \n")
print(f"mean: \n{mean}\n\nvariance: \n{variance}\n")

### Implementing GELU (Gaussian Error Linear Unit)

- **Smoothness**: Unlike ReLU's sharp cutoff at zero, `GELU` has a smoother gradient transition. This smoothness helps with training, as optimization algorithms can navigate the function's landscape more effectively.

- **Non-zero gradients**: ReLU outputs zero for all negative inputs, so its gradient there is exactly zero, which hinders learning in those regions. `GELU` maintains small non-zero outputs and gradients for most negative inputs, allowing the network to learn from a wider range of data, i.e. neurons that receive negative inputs can still contribute to the learning process.

#### Note

- **Computational cost**: The original `GELU` formulation can be computationally expensive. Luckily, there are efficient approximations that retain the benefits without the high cost.

<!-- [![image.png](https://i.postimg.cc/9XSGKNZ9/image.png)](https://postimg.cc/14GnNBWR) -->
[![image.png](https://i.postimg.cc/pTrF1Ddk/image.png)](https://postimg.cc/wyKM1RLs)

$$\text{GELU}(x) \approx 0.5 \cdot x \cdot \left(1 + \tanh\left[\sqrt{\frac{2}{\pi}} \cdot \left(x + 0.044715 \cdot x^{3}\right)\right]\right)$$

In [None]:
class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise; the output has the same shape and dtype as the input.
    """

    # sqrt(2 / pi) as a plain Python float: computed once instead of building
    # a float32 tensor on every forward pass, and it keeps float64 inputs at
    # full precision rather than truncating the constant to float32.
    _SQRT_2_OVER_PI: float = (2.0 / torch.pi) ** 0.5

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        """Apply the tanh-approximated GELU to ``x`` element-wise."""
        inner: Tensor = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        gelu: Tensor = 0.5 * x * (1 + torch.tanh(inner))
        return gelu


class FeedForward(nn.Module):
    """Feed-forward network: Linear -> GELU -> Linear.

    The first linear layer expands the last dimension from ``emb_dim`` to
    ``4 * emb_dim``; the second projects it back to ``emb_dim``, so the output
    has the same shape as the input.

    Args:
        cfg: Model configuration; only ``cfg["emb_dim"]`` is read.

    Raises:
        KeyError: If ``cfg`` has no ``"emb_dim"`` entry.
    """

    def __init__(self, cfg: dict[str, Any]) -> None:
        super().__init__()

        # Index (rather than .get) so a missing key fails with a KeyError
        # instead of a TypeError from ``4 * None``.
        emb_dim: int = cfg["emb_dim"]
        hidden_dim: int = 4 * emb_dim

        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x: Tensor) -> Tensor:
        """Pass ``x`` (last dimension ``emb_dim``) through the two-layer network."""
        return self.layers(x)

In [None]:
# Plot GELU and ReLU side by side over the same inputs to compare their shapes.
gelu, relu = GELU(), nn.ReLU()

# 100 evenly spaced sample points in [-3, 3].
x: Tensor = torch.linspace(-3, 3, 100)
y_gelu, y_relu = gelu(x), relu(x)
plt.figure(figsize=(8, 3))

# enumerate(..., 1) because subplot indices are 1-based: GELU goes in the
# left panel, ReLU in the right.
for i, (y, label) in enumerate(zip([y_gelu, y_relu], ["GELU", "ReLU"]), 1):
    plt.subplot(1, 2, i)
    plt.plot(x, y)
    plt.title(f"{label} activation function")
    plt.xlabel("x")
    plt.ylabel(f"{label}(x)")
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Smoke-test FeedForward: re-seed so the layer initialisation and the random
# input below are reproducible.
torch.manual_seed(seed)

ff_network = FeedForward(cfg=GPT_CONFIG_124M)
# Random input whose last dimension (768) matches emb_dim in GPT_CONFIG_124M.
# NOTE: rebinds `x` and `out` from earlier cells.
x: Tensor = torch.randn(2, 3, 768)
out: Tensor = ff_network(x)

print(f"out: {out.shape}")
print(f"out: \n{out}")

#### Feed Forward Network With GELU

<img src="../08-Makemore/images/ch04__ffn.png" width="600">

[image source](https://livebook.manning.com/book/build-a-large-language-model-from-scratch/chapter-4/v-7/112)

In [None]:
# 