# Scratchpad 2 - Generate text, download pretrained model and load model weights

## 1. Generate text

### Simple generate text function with greedy sampling

In greedy decoding, the model always chooses the token with the highest probability (the biggest logit) to be the output.

In [1]:
import torch

def generate_tokens_simple(model, input_idx, max_new_tokens, context_size):
    # input_idx shape: (batch_size, n_tokens) in the current context

    # we will output the input tokens plus the generarted new tokens
    output_idx = input_idx
    # iterate over the number of new tokens to generate
    for _ in range(max_new_tokens):
        # in case the current tokens are longer than the model's supported context_size, 
        # crop the tokens in the front and preserve tokens that fit in the model's `context_size`
        idx = output_idx[:, -context_size:]

        # get the model's prediction for the current context
        with torch.no_grad():
            logits = model(idx)

        # predicted next token is at the last position of the logits, so we extract only the last token's logits.
        ## logits shape: (batch_size, context_size, vocab_size) -> next_token_logits shape: (batch_size, vocab_size)
        next_token_logits = logits[:, -1, :]
        # to find the index of the token with the highest probability, we only need to find the index of the largest logit in the last dimension (vocab_size)
        ## keepdim=True ensures that the output has the same shape as the input, except in the dimension where we take the argmax
        next_token_idx = torch.argmax(next_token_logits, dim=-1, keepdim=True) # shape: (batch_size, 1)
        # concatenate the new token to the output
        output_idx = torch.cat((output_idx, next_token_idx), dim=1)

    return output_idx

Test generate_tokens_simple function on our untrained GPT-2.

In [2]:
import torch
import tiktoken
from gpt.gpt_model import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 1024,  # Context length
    "embedding_dim": 768,  # Embedding dimension
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of layers
    "dropout_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-Key-Value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()    # disable dropout

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (dropout_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        

In [3]:
start_context = "Hi, I am a large language model"

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension

In [11]:
print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
print(f"Input text: {start_context}")
print(f"Encoded input text: {encoded}")  # encoded token IDs
print(f"Encoded tensor shape: {encoded_tensor.shape}")  # shape: (batch_size, n_tokens)


                      IN
Input text: Hi, I am a large language model
Encoded input text: [17250, 11, 314, 716, 257, 1588, 3303, 2746]
Encoded tensor shape: torch.Size([1, 8])


In [10]:
out = generate_tokensns_simple(
    model=model,
    input_idx=encoded_tensor,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())

The generated text is going to be gibberish as the model is only initialized with random weights, not even pretrained.

In [13]:
print(f"\n{50*'='}\n{22*' '}OUT\n{50*'='}")
print(f"Output tensor: {out}")
print(f"Output text: {decoded_text}")


                      OUT
Output tensor: tensor([[17250,    11,   314,   716,   257,  1588,  3303,  2746, 45199, 41518,
         45173, 31263, 23195,  8603,  7384, 10261, 18815, 30220]])
Output text: Hi, I am a large language model fixme Satanic cordsulkan275 equally attacked 93 colony corrobor


## 2. Download OpenAI's pretrained GPT-2 weights

Download utility function.

!!! The following code uses urllib.request but results in and error in MacOS
```
URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>
```

In [26]:
# utility function to download a file
import os
import urllib.request
from tqdm import tqdm

def download_file(url, destination):
    # Send a GET request to download the file

    try:
        with urllib.request.urlopen(url) as response:
            # Get the total file size from headers, defaulting to 0 if not present
            file_size = int(response.headers.get("Content-Length", 0))

            # Check if file exists and has the same size
            if os.path.exists(destination):
                file_size_local = os.path.getsize(destination)
                if file_size == file_size_local:
                    print(f"File already exists and is up-to-date: {destination}")
                    return

            # Define the block size for reading the file
            block_size = 1024  # 1 Kilobyte

            # Initialize the progress bar with total file size
            progress_bar_description = os.path.basename(url)  # Extract filename from URL
            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
                # Open the destination file in binary write mode
                with open(destination, "wb") as file:
                    # Read the file in chunks and write to destination
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        file.write(chunk)
                        progress_bar.update(len(chunk))  # Update progress bar
    except urllib.error.HTTPError:
        s = (
            f"The specified URL ({url}) is incorrect, the internet connection cannot be established,"
            "\nor the requested file is temporarily unavailable.\nPlease visit the following website"
            " for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273")
        print(s)

An alternative download util function using `requests` package instead.

In [27]:
import os
import requests
from tqdm import tqdm

def download_file(url, destination):
    # Send a GET request to download the file in streaming mode
    response = requests.get(url, stream=True)

    # Get the total file size from headers, defaulting to 0 if not present
    file_size = int(response.headers.get("content-length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # Define the block size for reading the file
    block_size = 1024  # 1 Kilobyte

    # Initialize the progress bar with total file size
    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
            # Iterate over the file data in chunks
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))  # Update progress bar
                file.write(chunk)  # Write the chunk to the file

Download OpenAI's GPT-2 pretrained model to a local folder `./models/{model_size}`.

In [20]:
def download_gpt2_model(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

Downloading the smallest GPT-2(124M) will take 488M disk space.

In [28]:
download_gpt2_model("124M", "models")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 51.2kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 662kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 55.0kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [07:04<00:00, 1.17MiB/s]  
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 3.05MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 679kiB/s] 
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 656kiB/s] 


## 3. Load OpenAI's GPT-2 pretrianed weights to our GPTModel

### 3.1 Extract settings and parameters from downloaded model

Load settings and params of downloaded model. (We have to use Tensorflow as OpenAI's GPT-2 was trained with TensorFlow).

In [40]:
import tensorflow as tf
import json
import os

model_dir = os.path.join("models", "124M")

# finds the latest checkpoint file in the model directory
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
print(tf_ckpt_path)

settings = json.load(open(os.path.join(model_dir, "hparams.json")))
print(settings)

models/124M/model.ckpt
{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}


Let's list each layer's keys and shapes in the checkpoint.

In [38]:
tf.train.list_variables(tf_ckpt_path)

[('model/h0/attn/c_attn/b', [2304]),
 ('model/h0/attn/c_attn/w', [1, 768, 2304]),
 ('model/h0/attn/c_proj/b', [768]),
 ('model/h0/attn/c_proj/w', [1, 768, 768]),
 ('model/h0/ln_1/b', [768]),
 ('model/h0/ln_1/g', [768]),
 ('model/h0/ln_2/b', [768]),
 ('model/h0/ln_2/g', [768]),
 ('model/h0/mlp/c_fc/b', [3072]),
 ('model/h0/mlp/c_fc/w', [1, 768, 3072]),
 ('model/h0/mlp/c_proj/b', [768]),
 ('model/h0/mlp/c_proj/w', [1, 3072, 768]),
 ('model/h1/attn/c_attn/b', [2304]),
 ('model/h1/attn/c_attn/w', [1, 768, 2304]),
 ('model/h1/attn/c_proj/b', [768]),
 ('model/h1/attn/c_proj/w', [1, 768, 768]),
 ('model/h1/ln_1/b', [768]),
 ('model/h1/ln_1/g', [768]),
 ('model/h1/ln_2/b', [768]),
 ('model/h1/ln_2/g', [768]),
 ('model/h1/mlp/c_fc/b', [3072]),
 ('model/h1/mlp/c_fc/w', [1, 768, 3072]),
 ('model/h1/mlp/c_proj/b', [768]),
 ('model/h1/mlp/c_proj/w', [1, 3072, 768]),
 ('model/h10/attn/c_attn/b', [2304]),
 ('model/h10/attn/c_attn/w', [1, 768, 2304]),
 ('model/h10/attn/c_proj/b', [768]),
 ('model/h10/

Define a function to extract settings and parameters from downloaded model, carefully mapping each layer's parameters' keys and weight dictionariies.

In [51]:
import tensorflow as tf
import numpy as np
import json
import os

def load_model_settings_and_params(model_dir):
    # finds the latest checkpoint file in the model directory
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # load the model settings from the hparams.json file
    settings = json.load(open(os.path.join(model_dir, "hparams.json")))

    # load the model parameters from the checkpoint file, layer by layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}
    for name, _ in tf.train.list_variables(tf_ckpt_path):
        # load the variable of the layer and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))
        # print(name, variable_array)   # uncomment to see the variable names and arrays

        # process the variable name (e.g. "model/wte", "model/ln_f/g", "model/h2/mlp/c_proj/w") to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # skip the "model/" prefix

        # for the n layers, identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]
        
        # recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})
        
        # assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return settings, params

In [52]:
model_dir = os.path.join("models", "124M")

settings, params = load_model_settings_and_params(model_dir)

In [57]:
settings

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [68]:
def print_keys(d, indent=0):
    """Recursively prints all keys in a dictionary with indentation to show depth."""
    for key, value in d.items():
        print(" " * indent + str(key))
        if isinstance(value, dict):
            print_keys(value, indent + 4)
        elif isinstance(value, list):
            # Iterate through list items to check for dictionaries
            for index, item in enumerate(value):
                if isinstance(item, dict):
                    print(" " * (indent + 4) + f"List item {index}:")
                    print_keys(item, indent + 4)  # Increase indent for items within lists

print_keys(params)

blocks
    List item 0:
    attn
        c_attn
            b
            w
        c_proj
            b
            w
    ln_1
        b
        g
    ln_2
        b
        g
    mlp
        c_fc
            b
            w
        c_proj
            b
            w
    List item 1:
    attn
        c_attn
            b
            w
        c_proj
            b
            w
    ln_1
        b
        g
    ln_2
        b
        g
    mlp
        c_fc
            b
            w
        c_proj
            b
            w
    List item 2:
    attn
        c_attn
            b
            w
        c_proj
            b
            w
    ln_1
        b
        g
    ln_2
        b
        g
    mlp
        c_fc
            b
            w
        c_proj
            b
            w
    List item 3:
    attn
        c_attn
            b
            w
        c_proj
            b
            w
    ln_1
        b
        g
    ln_2
        b
        g
    mlp
        c_fc
            b
  

### 3.2 Load weights into a GPTModel instance

### 3.2.1 Observe the parameter structures

Assign weight utility function

In [60]:
import torch

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shapes do not match: Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

Instantiate a new GPTModel.

In [124]:
from gpt.gpt_model import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 1024,  # Context length
    "embedding_dim": 768,  # Embedding dimension
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of layers
    "dropout_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-Key-Value bias
}
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (dropout_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        

Let's print all OpenAI parameter keys recursively.

In [70]:
def print_keys(d, indent=0):
    """Recursively prints all keys in a dictionary with indentation to show depth."""
    for key, value in d.items():
        print(" " * indent + str(key))
        if isinstance(value, dict):
            print_keys(value, indent + 4)
        elif isinstance(value, list):
            # Iterate through list items to check for dictionaries
            for index, item in enumerate(value):
                if isinstance(item, dict):
                    print(" " * (indent + 4) + f"List item {index}:")
                    print_keys(item, indent + 8)  # Increase indent for items within lists

print_keys(params)

blocks
    List item 0:
        attn
            c_attn
                b
                w
            c_proj
                b
                w
        ln_1
            b
            g
        ln_2
            b
            g
        mlp
            c_fc
                b
                w
            c_proj
                b
                w
    List item 1:
        attn
            c_attn
                b
                w
            c_proj
                b
                w
        ln_1
            b
            g
        ln_2
            b
            g
        mlp
            c_fc
                b
                w
            c_proj
                b
                w
    List item 2:
        attn
            c_attn
                b
                w
            c_proj
                b
                w
        ln_1
            b
            g
        ln_2
            b
            g
        mlp
            c_fc
                b
                w
            c_proj
   

In [79]:
print(params["wte"].shape)

(50257, 768)


In [83]:
print(params["wpe"].shape)

(1024, 768)


Our GPTModel's architecture for reference:
```
GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (dropout_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    ...
  )
  (final_norm): LayerNorm()
  (out_head): Linear(in_features=768, out_features=50257, bias=True)
)

Now here is the mapping between OpenAI GPT-2 weights and our GPTModel weights.

```
blocks  -   transformer_blocks
    List item 0:
        attn    -   att
            c_attn  -   concatenated W_q, W_k, W_v
                b   -   concatenated bias of W_q, W_k, W_v
                w   -   concatenated weight of W_q, W_k, W_v
            c_proj  -   out_proj
                b   -   outproj.bias
                w   -   outproj.weight
        ln_1   -   norm1
            b   -   norm1.shift
            g   -   norm1.scale
        ln_2   -   norm2
            b   -   norm2.shift
            g   -   norm2.scale
        mlp    -   ff
            c_fc   -   ff.layers[0]
                b   -   ff.layers[0].bias
                w   -   ff.layers[0].weight
            c_proj  -   ff.layers[2]
                b   -   ff.layers[2].bias
                w   -   ff.layers[2].weight
    ... other 11 layers of transformer blocks
b   -   final_norm.shift
g   -   final_norm.scale
wpe -   pos_emb
wte -   tok_emb, out_head
```

### 3.2.2 Functions to assign weights

A utility function that instantiates a weight dictionary into a `torch.nn.Parameter`.

In [115]:
import torch

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shapes do not match: Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

Assign function.

In [118]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    # token and position embeddings
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        # Masked Multi Head Attention weights
        q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.transformer_blocks[b].att.W_q.weight = assign(gpt.transformer_blocks[b].att.W_q.weight, q_w.T)        
        gpt.transformer_blocks[b].att.W_k.weight = assign(gpt.transformer_blocks[b].att.W_k.weight, k_w.T)        
        gpt.transformer_blocks[b].att.W_v.weight = assign(gpt.transformer_blocks[b].att.W_v.weight, v_w.T)
        # Masked Multi Head Attention biases
        q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.transformer_blocks[b].att.W_q.bias = assign(gpt.transformer_blocks[b].att.W_q.bias, q_b)        
        gpt.transformer_blocks[b].att.W_k.bias = assign(gpt.transformer_blocks[b].att.W_k.bias, k_b)        
        gpt.transformer_blocks[b].att.W_v.bias = assign(gpt.transformer_blocks[b].att.W_v.bias, v_b)
        # Masked Multi Head Attention output projection weights and biases
        gpt.transformer_blocks[b].att.out_proj.weight = assign(gpt.transformer_blocks[b].att.out_proj.weight, params["blocks"][b]["attn"]["c_proj"]["w"].T)   
        gpt.transformer_blocks[b].att.out_proj.bias = assign(gpt.transformer_blocks[b].att.out_proj.bias, params["blocks"][b]["attn"]["c_proj"]["b"])

        # Feed Forward weights and biases
        gpt.transformer_blocks[b].ff.layers[0].weight = assign(gpt.transformer_blocks[b].ff.layers[0].weight, params["blocks"][b]["mlp"]["c_fc"]["w"].T)   
        gpt.transformer_blocks[b].ff.layers[0].bias = assign(gpt.transformer_blocks[b].ff.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"])   
        gpt.transformer_blocks[b].ff.layers[2].weight = assign(gpt.transformer_blocks[b].ff.layers[2].weight, params["blocks"][b]["mlp"]["c_proj"]["w"].T)   
        gpt.transformer_blocks[b].ff.layers[2].bias = assign(gpt.transformer_blocks[b].ff.layers[2].bias, params["blocks"][b]["mlp"]["c_proj"]["b"])

        # 2 Layer Normalization weights and biases
        gpt.transformer_blocks[b].norm1.scale = assign(gpt.transformer_blocks[b].norm1.scale, params["blocks"][b]["ln_1"]["g"])
        gpt.transformer_blocks[b].norm1.shift = assign(gpt.transformer_blocks[b].norm1.shift, params["blocks"][b]["ln_1"]["b"])
        gpt.transformer_blocks[b].norm2.scale = assign(gpt.transformer_blocks[b].norm2.scale, params["blocks"][b]["ln_2"]["g"])
        gpt.transformer_blocks[b].norm2.shift = assign(gpt.transformer_blocks[b].norm2.shift, params["blocks"][b]["ln_2"]["b"])
    # Final layer norm
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])

    # Output projection weights (reuses token embedding)
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



Let's make a new GPTModel to load the OpenAI's GPT-2 weights.
!!! OpenAI's GPT-2 uses qkv bias, so we need to update the config to set `qkv_bias` to be `True`!

In [126]:
from gpt.gpt_model import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 1024,  # Context length
    "embedding_dim": 768,  # Embedding dimension
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of layers
    "dropout_rate": 0.1,  # Dropout rate
    "qkv_bias": True,  # Query-Key-Value bias set to True!
}
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (dropout_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_

In [127]:
load_weights_into_gpt(model, params)

Now let's test the model again.

Same input as before:

In [137]:
start_context = "Hi, I am a large language model."
tokenizer = tiktoken.get_encoding("gpt2")
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
print(f"Input text: {start_context}")
print(f"Encoded input text: {encoded}")  # encoded token IDs
print(f"Encoded tensor shape: {encoded_tensor.shape}")  # shape: (batch_size, n_tokens)



                      IN
Input text: Hi, I am a large language model.
Encoded input text: [17250, 11, 314, 716, 257, 1588, 3303, 2746, 13]
Encoded tensor shape: torch.Size([1, 9])


And we can see that the output is becoming coherent sentences now.

In [142]:
out = generate_tokensnsns_simple(
    model=model,
    input_idx=encoded_tensor,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())

print(f"\n{50*'='}\n{22*' '}OUT\n{50*'='}")
print(f"Output tensor: {out}")
print(f"Output text: {decoded_text}")


                      OUT
Output tensor: tensor([[17250,    11,   314,   716,   257,  1588,  3303,  2746,    13,   314,
           716,   257,  1263, 24292,    13,   314,   716,   257,  1263]])
Output text: Hi, I am a large language model. I am a big programmer. I am a big


A better `generate` function that have `temperature` and `top_k` settings.

In [131]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [141]:
out2 = generate(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"],
    temperature=1.0,
    top_k=50,
)

decoded_text = tokenizer.decode(out2.squeeze(0).tolist())

print(f"\n{50*'='}\n{22*' '}OUT\n{50*'='}")
print(f"Output tensor: {out}")
print(f"Output text: {decoded_text}")


                      OUT
Output tensor: tensor([[17250,    11,   314,   716,   257,  1588,  3303,  2746,    13,   314,
           716,   257,  1263, 24292,    13,   314,   716,   257,  1263]])
Output text: Hi, I am a large language model. I do not know that I am the real one behind our projects, or that we are not the real creators.


