## Building a GPT

Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT.

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

torch.manual_seed(1337)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [2]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [5]:
# let's look at the first 1000 characters
# print()
# for i in text[:20]:
#     print(f"{i} encodes to {encode(i)}")
# print(encode(text[:20]))

### Full finished code, for reference

You may want to refer directly to the git repo instead though.

In [6]:
# optim hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
print(device)

# Set GPT config to be equivalent
from dataclasses import dataclass
@dataclass
class DemoConfig:
    block_size: int = 256             # what is the maximum context length for predictions?
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    dropout: float = 0.2
    bias: bool = True
    attention_type: str = "global"
    
config = DemoConfig()

cuda


In [7]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - config.block_size, (batch_size,))
    x = torch.stack([data[i:i+config.block_size] for i in ix])
    y = torch.stack([data[i+1:i+config.block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# start_pos_train = torch.where(train_data == 0)[0]
# start_pos_val = torch.where(val_data == 0)[0]
# def get_batch(split):
#     # Generate a small batch of data of inputs x and targets y, but ensure inputs x start at new paragraph.
#     data = train_data if split == 'train' else val_data
#     start_positions = start_pos_train if split == "train" else start_pos_val
#     ix = torch.randint(len(start_positions)-2, (batch_size,))
#     x = torch.stack([data[start_positions[i]:start_positions[i]+block_size] for i in ix])
#     y = torch.stack([data[start_positions[i]+1:start_positions[i]+block_size+1] for i in ix])
#     return x, y
    
@torch.no_grad()
def estimate_loss(eval_iters=200):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out




In [8]:
from CPRD.src.models.gpt_pico.transformer import GPTLanguageModel

model_base = GPTLanguageModel(config, vocab_size).to(device)

In [9]:
from CPRD.src.models.gpt_simple.transformer import GPTModel

config.learn_position_encoding = False
model1 = GPTModel(config, vocab_size).to(device)

config.learn_position_encoding = True
model2 = GPTModel(config, vocab_size).to(device)

In [10]:
models = [model_base, model1, model2]
loss_curves_train = [[] for _ in models]
loss_curves_val = [[] for _ in models]

In [11]:
for idx, model in enumerate(models):
    m = model.to(device)

    # print the number of parameters in the model
    print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

    # create a PyTorch optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for i in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
        if i % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss()
            print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch('train')
        # print(f"in: {xb.shape}, out {yb.shape}")

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        loss_curves_train[idx].append(losses["train"])
        loss_curves_val[idx].append(losses["val"])

    # generate from the model
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


10.788929 M parameters
step 0: train loss 4.2848, val loss 4.2826
step 500: train loss 1.9992, val loss 2.0904
step 1000: train loss 1.5945, val loss 1.7714
step 1500: train loss 1.4369, val loss 1.6495
step 2000: train loss 1.3363, val loss 1.5687
step 2500: train loss 1.2732, val loss 1.5222
step 3000: train loss 1.2228, val loss 1.5002
step 3500: train loss 1.1813, val loss 1.4874
step 4000: train loss 1.1429, val loss 1.4846
step 4500: train loss 1.1090, val loss 1.4781

Had you to proceed, had silence for perfume your apprellable day;
And acqualityth of you and the honour of honour
At you, trouble mempt; how you sing how,
My pripessed well, which, the whoa'd foul pride
And makes him; most unrestroy' waters,
Than in his face.

FISAL:
Good night!

SAMPSON:
A gentleman, which I have harrdlingly
Have one doned to myself towns; but they but not
Flattering, nothing at like the all. If I
I wake it; for forsake, anone I sue, then.
He loving Hastings Marcius, as yound Clarence
The valour, 

In [12]:
ts = [torch.cat([tmp.unsqueeze(0) for tmp in loss_curves_train[m_idx]], dim=0).numpy() for m_idx, _ in enumerate(models)]
vs = [torch.cat([tmp.unsqueeze(0) for tmp in loss_curves_val[m_idx]], dim=0).numpy() for m_idx, _ in enumerate(models)]

import matplotlib.pyplot as plt
import numpy as np
plt.figure()
cols = ["k", "r", "b"]
for m_idx, t in enumerate(ts):
    plt.plot(np.arange(t.shape[0]), t, label=f"{m_idx}-train", c=cols[m_idx], linestyle='dashed')
for m_idx, v in enumerate(vs):
    plt.plot(np.arange(v.shape[0]), v, label=f"{m_idx}-val", c=cols[m_idx])
plt.legend()
plt.savefig("figs/tinyshakespeare/loss_curves.png")

In [13]:
# prompt = "Hath thou no remorse?"
# context = torch.from_numpy(np.array(encode(prompt)).reshape((1,-1)))
# print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))