<a href="https://colab.research.google.com/github/florianaewing/CSB430SWIWinter2026/blob/main/LLMLab2_FE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM Architecture
Learning Goals

Description
In this lab we will be building out the architecture of a large language model (LLM).

Citation
Raschka, S. (2024). Build a large language model (from scratch). Manning Publications.

Lab Deliverables
Read through: https://github.com/rasbt/LLM-workshop-2024/blob/main/03_architecture/03.ipynb

# Step 1. Coding an LLM architecture
- Set an all-capitals variable GPT_CONFIG_124M
  -  Variables in all capitals are conventionally treated as constants. Python technically doesn't have constants, so this is only a convention that says "don't change this".
- Set the value of that variable to a dictionary with the following values
  - "vocab_size": 50257,    # Vocabulary size
  - "context_length": 1024, # Context length
  - "emb_dim": 768,         # Embedding dimension
  - "n_heads": 12,          # Number of attention heads
  - "n_layers": 12,         # Number of layers
  - "drop_rate": 0.0,       # Dropout rate
  - "qkv_bias": False       # Query-Key-Value bias

  <details>
  <summary>Click Here to view solution</summary>
 ```python
  GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.0,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
  }
 ```
   </details>



In [15]:
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Every sample is a window of ``max_length``
    token IDs; its target is the same window shifted one position to the
    right, so position ``t`` of the target holds the token that follows
    position ``t`` of the input.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` whose
            result supports ``len()`` and slicing.
        max_length: Number of token IDs in each input/target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode everything up front; the end-of-text marker is allowed through.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The last start is chosen so the shifted target window still fits,
        # which keeps every input and target exactly max_length long.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[s:s + max_length]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Wrap raw text in a DataLoader of sliding-window token batches.

    The text is tokenized with the tiktoken "gpt2" encoding and chunked by
    ``GPTDatasetV1``.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length passed to ``GPTDatasetV1``.
        stride: Window step passed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the ``GPTDatasetV1`` built from ``txt``.
    """
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each produced by one ``d_in -> d_out`` linear
    layer and then split into ``num_heads`` heads of ``d_out // num_heads``
    features. Each position attends only to itself and earlier positions.

    Args:
        d_in: Feature size of the input.
        d_out: Total feature size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the stored causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # per-head feature size

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        causal = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', causal)

    def _split_heads(self, t, b, num_tokens):
        """Reshape (b, num_tokens, d_out) into (b, num_heads, num_tokens, head_dim)."""
        return t.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (b, num_tokens, d_in).

        Returns:
            Tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        k = self._split_heads(self.W_key(x), b, num_tokens)
        q = self._split_heads(self.W_query(x), b, num_tokens)
        v = self._split_heads(self.W_value(x), b, num_tokens)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        scores = q @ k.transpose(-2, -1)

        # Crop the stored mask to the current sequence length and hide the
        # future positions so softmax assigns them zero weight.
        future = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(future, -torch.inf)

        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        merged = (weights @ v).transpose(1, 2)
        merged = merged.contiguous().view(b, num_tokens, self.d_out)

        return self.out_proj(merged)


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` each hold one value per feature.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # keeps the division finite when the variance is zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divide by N rather than N - 1.
        sigma = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
        normalized = (x - mu) / sigma
        return self.scale * normalized + self.shift


class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The result has the same shape and dtype as ``x``.
    """

    # sqrt(2 / pi) as a plain Python float, computed once at class creation.
    # Building it as a tensor inside forward() allocated a new float32 CPU
    # tensor on every call and truncated the constant to float32 precision
    # even when the input was float64.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the tanh-approximated GELU to every element of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand 4x, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, the input and output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width  # inner layer is four times the embedding width

        expand = nn.Linear(width, hidden)
        activation = GELU()
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        return self.layers(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # One dropout module is reused on both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Transform ``x``; the output has the same shape as the input."""
        # Attention branch; x still holds the unnormalized input for the residual.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward branch, again added back onto its own input.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

In [12]:
# Model hyperparameters; the model classes read these through cfg["..."] lookups.
# Upper-case name: treat as a constant and do not mutate it.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=1024,  # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.0,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
)

In [13]:
import sys
# Put /content/sample_data at the front of the module search path so that
# Python modules stored in that directory can be imported by later cells.
# NOTE(review): presumably intended for a `supplementary` helper module;
# confirm the file is actually present in that directory.
sys.path.insert(0, '/content/sample_data')

## Step 2: Build the GPT Model


- Import torch.nn as nn
- Import TransformerBlock and LayerNorm
- Create a class called GPTModel
- The class should inherit from nn.Module
- The constructor (__init__) should take a single argument called cfg
- Inside __init__, define the following attributes:
  - Token embedding  
    - tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
  - Positional embedding  
    - pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
  - Embedding dropout  
    - drop_emb = nn.Dropout(cfg["drop_rate"])
  - Transformer blocks  
    - trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
  - Final layer normalization  
    - final_norm = LayerNorm(cfg["emb_dim"])
  - Output head  
    - out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

- Create a method called forward that takes one argument called in_idx.
  - Inside this method:
    - Extract batch_size and seq_len from in_idx.shape  
    - Create token embeddings using self.tok_emb(in_idx)  
    - Create positional embeddings using torch.arange(seq_len, device=in_idx.device)  
    - Add token and positional embeddings together  
    - Pass the result through drop_emb  
    - Pass the result through trf_blocks  
    - Pass the result through final_norm  
    - Compute logits using out_head  
    - Return logits  


<details>
  <summary>Click Here to view solution</summary>

    ```
    import torch
    import torch.nn as nn
    from supplementary import TransformerBlock, LayerNorm


    class GPTModel(nn.Module):
        def __init__(self, cfg):
            super().__init__()
            self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
            self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
            self.drop_emb = nn.Dropout(cfg["drop_rate"])

            self.trf_blocks = nn.Sequential(
                *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
            )

            self.final_norm = LayerNorm(cfg["emb_dim"])
            self.out_head = nn.Linear(
                cfg["emb_dim"], cfg["vocab_size"], bias=False
            )

        def forward(self, in_idx):
            batch_size, seq_len = in_idx.shape
            tok_embeds = self.tok_emb(in_idx)
            pos_embeds = self.pos_emb(
                torch.arange(seq_len, device=in_idx.device)
            )
            x = tok_embeds + pos_embeds
            x = self.drop_emb(x)
            x = self.trf_blocks(x)
            x = self.final_norm(x)
            logits = self.out_head(x)
            return logits
    ```
</details>

In [17]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """GPT-style language model.

    Pipeline: token + positional embeddings -> dropout -> a stack of
    transformer blocks -> final layer norm -> bias-free linear head that
    produces one logit per vocabulary entry.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"`` and ``"n_layers"``; it is also
            passed unchanged to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token IDs of shape (batch, seq_len) to logits.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size).
        """
        _, seq_len = in_idx.shape

        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings broadcast across the batch dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

## Step 3: Instantiate the GPT Model with Weights

- Import torch and tiktoken
- Set a variable called tokenizer using tiktoken.get_encoding("gpt2")
- Create an empty list called batch
- Create two strings of text and assign them to variables txt1 and txt2
- Encode each string using the tokenizer
- Convert each encoded sequence to a torch tensor
- Append both tensors to the batch list
- Stack the batch into a single tensor using torch.stack with dim=0
- Print the batch
- Set a manual random seed using torch.manual_seed(123)
- Instantiate the GPTModel using the configuration GPT_CONFIG_124M
- Pass the batch into the model and store the result in a variable called out
- Print the input batch
- Print the shape of the output
- Print the output tensor



<details>
  <summary>Click Here to view solution</summary>
    ```python
    import torch
    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")

    batch = []

    txt1 = "Every effort moves you"
    txt2 = "Every day holds a"

    batch.append(torch.tensor(tokenizer.encode(txt1)))
    batch.append(torch.tensor(tokenizer.encode(txt2)))

    batch = torch.stack(batch, dim=0)
    print(batch)

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)

    out = model(batch)

    print("Input batch:\n", batch)
    print("\nOutput shape:", out.shape)
    print(out)
    ```
</details>


In [18]:
# Tokenize two short prompts with the GPT-2 encoding and run them through a
# freshly initialized GPTModel to inspect the output logits.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# One row of token IDs per prompt. torch.stack needs both rows to have the
# same length, which these two prompts satisfy.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

# Fix the seed before building the model so its random weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

out = model(batch)

print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 6.4165e-02,  2.0443e-01, -1.6945e-01,  ...,  1.7887e-01,
           2.1921e-01, -5.8153e-01],
         [ 3.7736e-01, -4.2545e-01, -6.5874e-01,  ..., -2.5050e-01,
           4.6553e-01, -2.5760e-01],
         [ 8.8996e-01, -1.3770e-01,  1.4748e-01,  ...,  1.7770e-01,
          -1.2015e-01, -1.8902e-01],
         [-9.7276e-01,  9.7338e-02, -2.5419e-01,  ...,  1.1035e+00,
           3.7639e-01, -5.9006e-01]],

        [[ 6.4165e-02,  2.0443e-01, -1.6945e-01,  ...,  1.7887e-01,
           2.1921e-01, -5.8153e-01],
         [ 1.3433e-01, -2.1289e-01, -2.7021e-02,  ...,  8.1153e-01,
          -4.7410e-02,  3.1186e-01],
         [ 8.9996e-01,  9.5396e-01, -1.7896e-01,  ...,  8.3053e-01,
           2.7657e-01, -2.4577e-02],
         [-9.3430e-05,  1.9390e-01,  5.1217e-01,  ...,  1.1915e+00,
  

# Exercise: Generate some text
  Note: No solution given for this
  1. Use the tokenizer.encode method to prepare some input text
  2. Then, convert this text into a PyTorch tensor via torch.tensor
  3. Add a batch dimension via .unsqueeze(0)
  4. Use the generate_text_simple function (Provided below) to have the GPT generate some text based on your prepared input text
  5. The output from step 4 will be token IDs, convert them back into text via the tokenizer.decode method

In [24]:
import torch
import tiktoken

# Set up the tokenizer and a new (untrained) model; eval mode disables dropout.
tokenizer = tiktoken.get_encoding("gpt2")
model = GPTModel(GPT_CONFIG_124M)
model.eval()

# Prompt text -> list of token IDs -> tensor with a leading batch dimension,
# i.e. shape (1, seq_len).
input_text = "The creature jerked lagubriously"
input_ids = tokenizer.encode(input_text)
input_tensor = torch.tensor(input_ids).unsqueeze(0)

# Generation settings: number of tokens to append and the model's context
# window size.
max_new_tokens = 50
context_size = GPT_CONFIG_124M["context_length"]  # 1024

# Extend the prompt with newly generated token IDs.
output_ids = generate_text_simple(model, input_tensor, max_new_tokens, context_size)

# Drop the batch dimension and turn the token IDs back into text.
generated_text = tokenizer.decode(output_ids[0].tolist())
print(generated_text)


The creature jerked lagubriously Ops conepathic salvageRL Â  Dunham Aluminum Paperiologist.>> blocking timetable363elsius Snenery Gang nat Healthy Captionained slip kingdom preceding vegetablesommelpins quarry SolutionsMyth ruth techniqueervingIF taxesWomen Casslen cards WeaponEffective bl kidnappedko grapeMajor boltedleaf benign
