## 📦 Load Required Modules

### Import necessary libraries, model config, and utility functions

In [1]:
import torch
import tiktoken
from utils.utils import generate_text_simple, token_ids_to_text, text_to_token_ids, generate
from GPT.GPT import GPTModel
from GPT.GPT_CONFIG import GPT_CONFIG_124M

## 🔤 Tokenizer Setup and Random Seed

### Initialize GPT-2 tokenizer and set a random seed for reproducibility

In [5]:
# GPT-2 BPE tokenizer used by every encode/decode call further down.
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")
# Fix PyTorch's global RNG seed; kept as the last expression so the cell
# still displays the returned Generator.
torch.manual_seed(seed=123)

<torch._C.Generator at 0x2ad0a0729b0>

## 🚀 Set Device (CPU or GPU)

In [2]:
# Use the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## 🧠 Load Custom GPT Model & Weights

### Instantiate the GPT model and load previously trained weights from `model_local.pth`

In [3]:
# Build the model from the 124M configuration and restore the locally
# trained weights saved in model_local.pth.
model = GPTModel(GPT_CONFIG_124M)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# NOTE(review): the model itself is not moved to `device` here; the generation
# cell below passes token ids without moving them either, so the two agree.
model.load_state_dict(
    torch.load("model_local.pth", map_location=device, weights_only=True)
)
# Switch to evaluation mode; as the last expression this also displays the
# module structure.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

## ✍️ Generate Text from Custom GPT Model

### Use the loaded model to generate text from a prompt

In [6]:
# Encode the prompt, ask the locally trained model for 15 additional tokens,
# then decode the returned ids back into text.
prompt_ids = text_to_token_ids("Every effort moves you", tokenizer)
token_ids = generate(
    model=model,
    idx=prompt_ids,
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youlit," she up surprise. It is to face watching me by his knees


## 🌐 Download the GPT-2 Weight Loader Script (via GitHub)

### Download the `gpt_download.py` script to fetch pretrained OpenAI GPT-2 weights

In [None]:
import os
import urllib.request

# Raw-GitHub location of the helper script that fetches the OpenAI GPT-2 files.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
# Save the script under its own file name (the last URL path component).
filename = url.split('/')[-1]

# Only touch the network when the script is not already next to the notebook,
# so Restart & Run All works offline after the first run and does not
# overwrite a local copy. Delete the file to force a fresh download.
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

## 📥 Load OpenAI GPT-2 Weights

### Load GPT-2 model weights and configuration using the downloaded script

In [None]:
from gpt_download import download_and_load_gpt2

# Fetch/load the 124M checkpoint; the helper hands back a (settings, params)
# pair that the following cells inspect and copy into the model.
settings, params = download_and_load_gpt2(models_dir="gpt2", model_size="124M")

## 🧾 View Model Settings and Parameters

### Print the GPT-2 model settings and inspect available weight tensors

In [None]:
# Show the loaded settings and the top-level names of the weight dictionary.
print(f"Settings: {settings}")
print(f"Parameter dictionary keys: {params.keys()}")

## 🔍 Inspect Token Embedding Weights

### Check the token embedding weights from the pretrained GPT-2

In [None]:
# Print the token-embedding array itself, followed by its shape.
print(params["wte"])
print(f"Token embedding weight tensor dimensions: {params['wte'].shape}")

## 🧱 GPT-2 Model Configuration Options

### Define configurations for multiple GPT-2 model sizes

In [None]:
# Size-specific hyperparameters (embedding width, number of transformer
# layers, number of attention heads) for each GPT-2 variant, keyed by a
# human-readable model name.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

## 🏷️ Choose Model Variant & Create Config

### Select the model variant (e.g., 124M) and update the GPT config

In [None]:
# Pick the variant to load, then build a fresh config: every entry of the
# base 124M config, overridden by the size-specific values of that variant.
# The base config object itself is left unmodified.
model_name = "gpt2-small (124M)"
NEW_CONFIG = {**GPT_CONFIG_124M, **model_configs[model_name]}

## 🔁 Update Context Length

### Update context length to match GPT-2 (typically 1024 tokens)

We used a context length of 256 tokens for our own LLM; we now need to change it to match the context length used by GPT-2 (1024 tokens).

In [None]:
# The pretrained positional-embedding table has to fit, so use a
# 1024-token context instead of the value inherited from the base config.
NEW_CONFIG["context_length"] = 1024

## ➕ Add Bias Terms

### Update configuration to include QKV bias as used in OpenAI GPT-2

In [None]:
# The weight-loading function below assigns query/key/value bias vectors,
# so the attention projections must be created with bias terms.
NEW_CONFIG["qkv_bias"] = True

## 🏗️ Reinitialize GPT Model with New Config

### Rebuild a GPT model instance using the updated OpenAI-style configuration

In [None]:
# Build a model from the updated config and put it in evaluation mode
# (eval() returns the module itself, so the chained form is equivalent).
gpt = GPTModel(NEW_CONFIG).eval()
# Trailing expression so the cell still displays the module structure.
gpt

## 📌 Overwrite Random Weights with OpenAI GPT-2 Weights

### Define helper function to replace default weights with OpenAI GPT-2 weights

In [None]:
def assign(left, right):
    """Return ``right`` wrapped as a trainable parameter after a shape check.

    Args:
        left: The parameter about to be replaced; only its ``shape`` is read.
        right: Array-like holding the new values (e.g. a NumPy array).

    Returns:
        A new ``torch.nn.Parameter`` built from a copy of ``right``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        # Both fragments need the f-prefix; without it the second half of
        # the message would print the literal text "{right.shape}".
        raise ValueError(
            f"Shape mismatch. Left: {left.shape}, "
            f"Right: {right.shape}"
        )
    return torch.nn.Parameter(torch.tensor(right))

## 🧮 Import NumPy for Weight Loading

### Import NumPy to split the combined attention weight arrays from OpenAI’s checkpoint

In [None]:
import numpy as np

## ⬇️ Load All OpenAI Weights into Model

### Copy all relevant weights from OpenAI GPT-2 into the model using correct mapping

In [None]:
def load_weights_into_gpt(gpt, params):
    """Replace the parameters of ``gpt`` with the arrays stored in ``params``.

    Every assignment goes through ``assign``, so a shape mismatch between a
    model parameter and its source array raises ``ValueError``; parameters
    assigned before the failure keep their new values.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Mapping with ``"wpe"``, ``"wte"``, ``"g"``, ``"b"`` and an
            indexable ``"blocks"`` collection of per-layer dictionaries.
    """
    # Embedding tables are taken over unchanged.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for layer_idx in range(len(params["blocks"])):
        src = params["blocks"][layer_idx]

        # The source stores query/key/value as one combined array: cut it into
        # three equal parts along the last axis, then transpose each part
        # before assigning it to the corresponding projection weight.
        w_q, w_k, w_v = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        attn = gpt.trf_blocks[layer_idx].att
        attn.W_query.weight = assign(attn.W_query.weight, w_q.T)
        attn.W_key.weight = assign(attn.W_key.weight, w_k.T)
        attn.W_value.weight = assign(attn.W_value.weight, w_v.T)

        # Same three-way split for the combined bias vector (no transpose).
        b_q, b_k, b_v = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        attn.W_query.bias = assign(attn.W_query.bias, b_q)
        attn.W_key.bias = assign(attn.W_key.bias, b_k)
        attn.W_value.bias = assign(attn.W_value.bias, b_v)

        # Attention output projection.
        out_proj = attn.out_proj
        out_proj.weight = assign(out_proj.weight, src["attn"]["c_proj"]["w"].T)
        out_proj.bias = assign(out_proj.bias, src["attn"]["c_proj"]["b"])

        # Feed-forward network: layers[0] and layers[2] are the two Linear
        # layers being filled; layers[1] is not touched.
        fc_in = gpt.trf_blocks[layer_idx].ff.layers[0]
        fc_in.weight = assign(fc_in.weight, src["mlp"]["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, src["mlp"]["c_fc"]["b"])
        fc_out = gpt.trf_blocks[layer_idx].ff.layers[2]
        fc_out.weight = assign(fc_out.weight, src["mlp"]["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, src["mlp"]["c_proj"]["b"])

        # The two normalisation layers: "g" goes to scale, "b" to shift.
        norm1 = gpt.trf_blocks[layer_idx].norm1
        norm1.scale = assign(norm1.scale, src["ln_1"]["g"])
        norm1.shift = assign(norm1.shift, src["ln_1"]["b"])
        norm2 = gpt.trf_blocks[layer_idx].norm2
        norm2.scale = assign(norm2.scale, src["ln_2"]["g"])
        norm2.shift = assign(norm2.shift, src["ln_2"]["b"])

    # Final normalisation, then the output head, which is filled from the
    # token-embedding array ("wte") as well.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

## 💾 Load Pretrained Weights into GPT and Move to Device

### Load OpenAI GPT-2 weights into our GPT model and move it to GPU or CPU

In [None]:
# Copy the OpenAI arrays into the freshly built model...
load_weights_into_gpt(gpt=gpt, params=params)
# ...then move it to the selected device; as the last expression this also
# displays the module structure.
gpt.to(device=device)

## 🔮 Generate Text Using OpenAI GPT-2 Weights

### Generate text from the OpenAI-initialized GPT model using a custom prompt

In [None]:
# Encode the prompt and place it on the same device as the model, request
# 25 additional tokens, then decode the returned ids back into text.
prompt_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)
token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

We can see that the model loaded with OpenAI's pretrained weights produces more coherent text!