<a href="https://colab.research.google.com/github/charbull/build-nanogpt/blob/master/understanding_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
# this config is based on the output from the gpt2 transformer from huggingface. https://colab.research.google.com/drive/1UZmJvW2w_h3GacbwaOLzXiKMzBeU087H#scrollTo=w5I-SpEJurn4&line=3&uniqifier=1
@dataclass
class GPTConfig:
  """Hyper-parameters of the GPT model; the defaults describe GPT-2 (124M)."""
  # maximum sequence length (also the number of learned position embeddings)
  block_size: int = 1024
  # vocabulary size: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
  vocab_size: int = 50257
  # number of transformer blocks
  n_layer: int = 12
  # number of attention heads per block
  n_head: int = 12
  # embedding (channel) dimension
  n_embd: int = 768

class CausalSelfAttention(nn.Module):
  """Multi-head self-attention in which a position only attends to itself and earlier positions."""

  def __init__(self, config):
    super().__init__()
    # the channels are divided evenly between the heads
    assert config.n_embd % config.n_head == 0
    # a single linear layer produces query, key and value for every head at once
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
    # output projection applied to the concatenated head outputs
    self.c_proj = nn.Linear(config.n_embd, config.n_embd)
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # lower-triangular causal mask; called "bias" only to follow the openAI/HF naming
    block = config.block_size
    causal = torch.tril(torch.ones(block, block))
    self.register_buffer("bias", causal.view(1, 1, block, block))

  def forward(self, x):
    """Apply causal self-attention to x of shape (B, T, C) and return a tensor of the same shape."""
    B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
    # nh = number of heads, hs = head size, C = nh * hs
    # e.g. GPT-2 (124M): nh = 12, hs = 64, so C = 768
    hs = C // self.n_head

    def to_heads(t):
      # (B, T, C) -> (B, nh, T, hs): the head axis becomes a batch-like axis
      return t.view(B, T, self.n_head, hs).transpose(1, 2)

    q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))
    # scaled dot-product scores; this materializes a (T, T) matrix per head
    scale = 1.0 / math.sqrt(hs)
    scores = (q @ k.transpose(-2, -1)) * scale
    # -inf on future positions makes their softmax weight exactly zero
    is_future = self.bias[:, :, :T, :T] == 0
    weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)
    heads = weights @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    # put the head outputs back side by side: (B, nh, T, hs) -> (B, T, C)
    merged = heads.transpose(1, 2).contiguous().view(B, T, C)
    return self.c_proj(merged)

class MLP(nn.Module):
  """Feed-forward sub-layer: expand to 4 * n_embd, apply GELU, project back to n_embd."""

  def __init__(self, config):
    super().__init__()
    hidden = 4 * config.n_embd
    self.c_fc = nn.Linear(config.n_embd, hidden)
    # tanh approximation of GELU
    self.gelu = nn.GELU(approximate='tanh')
    self.c_proj = nn.Linear(hidden, config.n_embd)

  def forward(self, x):
    """Apply the MLP over the last dimension of x; the output has the same shape as x."""
    return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
  """One transformer block: pre-LayerNorm attention and pre-LayerNorm MLP, each added back residually."""

  def __init__(self, config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    self.mlp = MLP(config)

  def forward(self, x):
    """Return x after the attention and MLP residual updates; the shape is unchanged."""
    # communication step: tokens exchange information through attention (reduce)
    attended = self.attn(self.ln_1(x))
    x = x + attended
    # computation step: every token processes what it gathered, independently (map)
    processed = self.mlp(self.ln_2(x))
    return x + processed



In [None]:
class GPT(nn.Module):
  """GPT-2 style language model: token + position embeddings, a stack of Blocks, a final LayerNorm and an LM head."""

  def __init__(self, config):
    super().__init__()
    self.config = config

    # the submodule names are the keys used to look up huggingface weights in from_pretrained
    self.transformer = nn.ModuleDict({
        'wte': nn.Embedding(config.vocab_size, config.n_embd),
        'wpe': nn.Embedding(config.block_size, config.n_embd),
        'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        'ln_f': nn.LayerNorm(config.n_embd),
    })
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

  @classmethod
  def from_pretrained(cls, model_type):
    """Build a GPT of the given size and copy the huggingface GPT-2 weights into it.

    model_type must be one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
    Prints the name and shape of every entry of the new model's state dict,
    then returns the model with the pretrained weights loaded.
    """
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    # n_layer, n_head and n_embd are determined from model_type
    arch = {
        'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
        'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
        'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
    }[model_type]
    # vocab_size and block_size are always 50257 and 1024 for the GPT model checkpoints
    config = GPTConfig(vocab_size=50257, block_size=1024, **arch)
    # create a from-scratch initialized model
    model = GPT(config)
    sd = model.state_dict()
    # the causal mask is a buffer, not a parameter, so it is left out of the comparison
    sd_keys = [name for name in sd if not name.endswith('.attn.bias')]
    for name, tensor in sd.items():
      print(name, tensor.shape)

    # init a huggingface/transformers model
    sd_hf = GPT2LMHeadModel.from_pretrained(model_type).state_dict()

    # same on the huggingface side: skip the mask buffers
    hf_buffers = ('.attn.masked_bias', '.attn.bias')
    sd_keys_hf = [name for name in sd_hf if not name.endswith(hf_buffers)]
    # the openai checkpoints use a "Conv1D" module, but we use a vanilla Linear,
    # so these weights have to be transposed when they are imported
    transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
    assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
    # copy while checking that every parameter matches in name and shape
    with torch.no_grad():
      for name in sd_keys_hf:
        source = sd_hf[name]
        target = sd[name]
        if name.endswith(transposed):
          # Conv1D weight: stored transposed relative to Linear
          assert source.shape[::-1] == target.shape
          target.copy_(source.t())
        else:
          # vanilla copy over the other parameters
          assert source.shape == target.shape
          target.copy_(source)
    return model

  def forward(self, idx, targets=None):
    """Run the model on idx, a (B, T) tensor of token ids.

    Returns (logits, loss). logits has shape (B, T, vocab_size). loss is the
    cross entropy against targets (also (B, T) token ids), or None when
    targets is not given.
    """
    B, T = idx.size()
    assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
    # token embeddings are (B, T, n_embd); position embeddings are (T, n_embd) and broadcast over the batch
    positions = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
    x = self.transformer.wte(idx) + self.transformer.wpe(positions)
    # forward the blocks of the transformer
    for block in self.transformer.h:
      x = block(x)
    # final layernorm, then the classifier over the vocabulary
    logits = self.lm_head(self.transformer.ln_f(x)) # (B, T, vocab_size)
    if targets is None:
      return logits, None
    # flatten to (B*T, vocab_size) vs (B*T,) as cross_entropy expects
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    return logits, loss


## With HF weights initialization

In [None]:
# Build our GPT and copy the pretrained huggingface 'gpt2' weights into it.
# from_pretrained asserts that names and shapes line up, so reaching the last
# print means every tensor was copied.
print('loading the weights from huggingface into our model which mirrors hf')
model = GPT.from_pretrained('gpt2')
print('didnt crash')

loading the weights from huggingface into our model which mirrors hf
loading weights from pretrained gpt: gpt2
transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.bias torch.Size([1, 1, 1024, 1024])
transformer.h.0.attn.c_attn.weight torch.Size([2304, 768])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.bias torch.Si

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

didnt crash


## Let's do a forward pass

In [None]:
# evaluation mode (as in inference), then move the model to the GPU;
# both calls return the module, so the cell still displays it
model.eval().to('cuda')

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Tokenization

We need a tokenizer to turn the input text into token ids; this is a pre-processing step.

In [None]:
# tiktoken provides the GPT-2 BPE tokenizer used in the cells below
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [None]:
import tiktoken

num_return_sequences = 5 # how many continuations to sample
max_length = 30 # total length (prompt + generated tokens) of each sequence

enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Hello, I am a language model, ")
print("Single example: ", tokens)
# (T,) -> (1, T) -> (num_return_sequences, T): one copy of the prompt per sample
# (this prompt encodes to T = 9 tokens)
tokens = torch.tensor(tokens, dtype=torch.long)[None, :].repeat(num_return_sequences, 1)
# x is the idx passed to the model; it lives on the GPU
x = tokens.to('cuda')
print(x)

Single example:  [15496, 11, 314, 716, 257, 3303, 2746, 11, 220]
tensor([[15496,    11,   314,   716,   257,  3303,  2746,    11,   220],
        [15496,    11,   314,   716,   257,  3303,  2746,    11,   220],
        [15496,    11,   314,   716,   257,  3303,  2746,    11,   220],
        [15496,    11,   314,   716,   257,  3303,  2746,    11,   220],
        [15496,    11,   314,   716,   257,  3303,  2746,    11,   220]],
       device='cuda:0')


## Generate
Right now x has shape (B, T), where B = 5 is the batch size (the number of sequences) and T = 9 is the sequence length (the prompt encodes to 9 tokens).

In [None]:
# Sample continuations of x, one token at a time, until every sequence is
# max_length tokens long. The seeds make the sampling reproducible.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
while x.size(1) < max_length:
  # forward the model to get the logits
  with torch.no_grad():
    # forward returns (logits, loss); loss is None because no targets are passed
    logits, _ = model(x) # (B, T, vocab_size)
    # take the logits at the last position
    logits = logits[:, -1, :] # (B, vocab_size)
    # apply softmax to get probabilities
    probs = F.softmax(logits, dim=-1) # (B, vocab_size)
    # do top-k sampling of 50 (huggingface pipeline default)
    # topk_probs and topk_indices are both (B, 50)
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    # select a token from the top-k probabilities
    ix = torch.multinomial(topk_probs, num_samples=1) # (B, 1)
    # map the sampled position within the top-k back to a vocabulary index
    xcol = torch.gather(topk_indices, dim=-1, index=ix) # (B, 1)
    # append the token to the sequence
    x = torch.cat((x, xcol), dim=1) # (B, T+1)

## Print the generated text

In [None]:
# decode every generated row back into text, one line of output per sequence
for seq_idx in range(num_return_sequences):
  token_ids = x[seq_idx, :max_length].tolist()
  print("> ", enc.decode(token_ids))

>  Hello, I am a language model,  and that is what I will describe about Python. Let me just note a few examples.  
>  Hello, I am a language model, iphone , and a human, no matter where those "isms" overlap.
The way to express
>  Hello, I am a language model, !!!

Please remember - any way you think you can express it.

~The author

>  Hello, I am a language model,  (more or less)  a self-contained collection of code, usually compiled by a C
>  Hello, I am a language model,  so it must be well-trained. And how do you do that? It seems to me,


# Without HF weights initialization

In [None]:
# Same sampling procedure, but with a randomly initialized (untrained) model.
model = GPT(GPTConfig())
model.eval() # put it in evaluation mode as in inference
model.to('cuda') # move the model to GPU

# Start again from the prompt: the x left over from the previous generation is
# already max_length tokens long, so the loop below would never run and the
# old (pretrained) text would simply be printed again.
prompt_ids = torch.tensor(enc.encode("Hello, I am a language model, "), dtype=torch.long)
x = prompt_ids.unsqueeze(0).repeat(num_return_sequences, 1).to('cuda') # (B, T)

torch.manual_seed(42)
torch.cuda.manual_seed(42)
while x.size(1) < max_length:
  # forward the model to get the logits
  with torch.no_grad():
    # forward returns (logits, loss); loss is None because no targets are passed
    logits, _ = model(x) # (B, T, vocab_size)
    # take the logits at the last position
    logits = logits[:, -1, :] # (B, vocab_size)
    # apply softmax to get probabilities
    probs = F.softmax(logits, dim=-1) # (B, vocab_size)
    # do top-k sampling of 50 (huggingface pipeline default)
    # topk_probs and topk_indices are both (B, 50)
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    # select a token from the top-k probabilities
    ix = torch.multinomial(topk_probs, num_samples=1) # (B, 1)
    # map the sampled position within the top-k back to a vocabulary index
    xcol = torch.gather(topk_indices, dim=-1, index=ix) # (B, 1)
    # append the token to the sequence
    x = torch.cat((x, xcol), dim=1) # (B, T+1)

for i in range(num_return_sequences):
  tokens = x[i, :max_length].tolist()
  decoded = enc.decode(tokens)
  print("> ", decoded)

>  Hello, I am a language model,  and that is what I will describe about Python. Let me just note a few examples.  
>  Hello, I am a language model, iphone , and a human, no matter where those "isms" overlap.
The way to express
>  Hello, I am a language model, !!!

Please remember - any way you think you can express it.

~The author

>  Hello, I am a language model,  (more or less)  a self-contained collection of code, usually compiled by a C
>  Hello, I am a language model,  so it must be well-trained. And how do you do that? It seems to me,


## how to detect the device

In [None]:
# pick the best available backend: CUDA first, then Apple's MPS, else the CPU
if torch.cuda.is_available():
  device = "cuda"
elif torch.backends.mps.is_available(): # Apple silicon faster than cpu
  device = "mps"
else:
  device = "cpu"
print(f"using device: {device}")

using device: cuda


In [None]:
model = GPT(GPTConfig())
# the parameters and the input must live on the same device, otherwise the
# forward pass fails with a device-mismatch error
model.to(device)
# forward returns (logits, loss); loss is None because no targets are passed
logits, _ = model(x.to(device))

## Creating the labels


In [None]:
# tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r') as f:
    text = f.read()
data = text[:1000] # first 1,000 characters
print(data[:100])

--2024-08-20 04:39:44--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-08-20 04:39:44 (18.5 MB/s) - ‘input.txt’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


Let's take an example.

In [None]:
# encode the text sample with the GPT-2 tokenizer and peek at the first 24 token ids
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(data)
print(tokens[:24])

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13]


In [None]:
# round trip: decoding the first 24 token ids gives back the start of the text
decode = enc.decode(tokens[:24])
decode

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.'

We want to create the pair (x, y) in which each entry of y is the label for the matching entry of x, i.e. the token that comes next in the text.

In [None]:
# 24 input positions need 25 tokens: the extra one is the target of the last input
buf = torch.tensor(tokens[:24 + 1]).to(device)
# inputs are tokens 0..23, targets are tokens 1..24, so y[i, t] is the token that follows x[i, t]
x = buf[:-1].view(4, 6)
y = buf[1:].view(4, 6)
print(x)
print(y)

tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]], device='cuda:0')
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]], device='cuda:0')


In [None]:
# same construction with the batch size B and the sequence length T as variables
B, T = 4, 6
# B*T inputs plus one extra token that serves as the target of the last input
buf = torch.tensor(tokens[:B*T + 1]).to(device)
# targets are the inputs shifted left by one token
x, y = buf[:-1].view(B, T), buf[1:].view(B, T)
print(x)
print(y)

tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]], device='cuda:0')
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]], device='cuda:0')


In [None]:
# fresh, randomly initialized model on the selected device
model = GPT(GPTConfig()).to(device)
# passing the targets makes forward return the cross-entropy loss as well
logits, loss = model(x, y)
# for reference: a uniform guess over the 50257 tokens would score ln(50257), about 10.8
print(loss)

tensor(11.0720, device='cuda:0', grad_fn=<NllLossBackward0>)


## Backward pass
Now that we have added the loss, we can do the backward pass. Let's create the optimizer!

In [None]:
# Take 50 AdamW steps on the same (x, y) batch; since the batch never changes,
# this only shows that the model can fit (memorize) it.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
num_steps = 50
for i in range(num_steps):
  # gradients accumulate across backward() calls, so clear them first
  optimizer.zero_grad()
  logits, loss = model(x, y)
  loss.backward()
  optimizer.step()
  print(f'step {i}, loss {loss.item()}')

step 0, loss 11.071961402893066
step 1, loss 4.125229835510254
step 2, loss 1.11372709274292
step 3, loss 0.33396539092063904
step 4, loss 0.14232298731803894
step 5, loss 0.07618648558855057
step 6, loss 0.04827012121677399
step 7, loss 0.03449244052171707
step 8, loss 0.026833551004529
step 9, loss 0.02213321067392826
step 10, loss 0.018978431820869446
step 11, loss 0.016679344698786736
step 12, loss 0.014878906309604645
step 13, loss 0.013386077247560024
step 14, loss 0.01210059691220522
step 15, loss 0.010972720570862293
step 16, loss 0.009979167021811008
step 17, loss 0.00910772755742073
step 18, loss 0.008348047733306885
step 19, loss 0.007689849939197302
step 20, loss 0.00712156854569912
step 21, loss 0.006631508935242891
step 22, loss 0.00620827404782176
step 23, loss 0.005841326434165239
step 24, loss 0.005521246697753668
step 25, loss 0.005240058526396751
step 26, loss 0.004991015885025263
step 27, loss 0.004768182057887316
step 28, loss 0.004567150492221117
step 29, loss 0.0