# Building a GPT language model... from scratch!

# Data

In [2]:
# Read the entire corpus into a single in-memory string.
with open("gpt/harry-potter.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [3]:
# Corpus size in characters.
len(text)

6250286

In [5]:
# Peek at the first 500 characters to sanity-check the load.
print(text[:500])

CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had



In [9]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Show the alphabet as one string, followed by its size on the next line.
print("".join(chars), vocab_size, sep="\n")


 !"&'()*,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{}~¦«»éü–—‘’“”•…
104


In [11]:
# Lookup tables between characters and integer token ids.
# A character's id is its position in the sorted `chars` list.
itos = dict(enumerate(chars))
stoi = {symbol: idx for idx, symbol in itos.items()}


def encode(s):
    """Return the list of token ids for the characters of string `s`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of token ids `l`."""
    return "".join(itos[i] for i in l)

In [13]:
# Round trip: encoding and then decoding should reproduce the input string.
print(encode("Good morning!"))
print(decode(encode("Good morning!")))

[36, 76, 76, 65, 2, 74, 76, 79, 75, 70, 75, 68, 3]
Good morning!


In [14]:
import torch
# Encode the whole corpus as a single 1-D tensor of token ids
# (torch infers the dtype from the Python ints), then show its shape and dtype.
data = torch.tensor(encode(text))
data.shape, data.dtype

  from .autonotebook import tqdm as notebook_tqdm


(torch.Size([6250286]), torch.int64)

In [15]:
# Token ids of the first 500 characters.
data[:500]

tensor([32, 37, 30, 45, 49, 34, 47,  2, 44, 43, 34,  0, 49, 37, 34,  2, 31, 44,
        54,  2, 52, 37, 44,  2, 41, 38, 51, 34, 33,  0, 42, 79, 12,  2, 62, 75,
        65,  2, 42, 79, 80, 12,  2, 33, 82, 79, 80, 73, 66, 86, 10,  2, 76, 67,
         2, 75, 82, 74, 63, 66, 79,  2, 67, 76, 82, 79, 10,  2, 45, 79, 70, 83,
        66, 81,  2, 33, 79, 70, 83, 66, 10,  2, 84, 66, 79, 66,  2, 77, 79, 76,
        82, 65,  2, 81, 76,  2, 80, 62, 86,  0, 81, 69, 62, 81,  2, 81, 69, 66,
        86,  2, 84, 66, 79, 66,  2, 77, 66, 79, 67, 66, 64, 81, 73, 86,  2, 75,
        76, 79, 74, 62, 73, 10,  2, 81, 69, 62, 75, 72,  2, 86, 76, 82,  2, 83,
        66, 79, 86,  2, 74, 82, 64, 69, 12,  2, 49, 69, 66, 86,  2, 84, 66, 79,
        66,  2, 81, 69, 66,  2, 73, 62, 80, 81,  0, 77, 66, 76, 77, 73, 66,  2,
        86, 76, 82,  6, 65,  2, 66, 85, 77, 66, 64, 81,  2, 81, 76,  2, 63, 66,
         2, 70, 75, 83, 76, 73, 83, 66, 65,  2, 70, 75,  2, 62, 75, 86, 81, 69,
        70, 75, 68,  2, 80, 81, 79, 62, 

In [16]:
# Sequential split: the first 90% of the token stream is for training,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, validation_data = data[:n], data[n:]

In [17]:
# Context length. The slice below takes block_size + 1 tokens: one more than
# the context, so that the last position also has a following token.
block_size = 8
train_data[:block_size+1]

tensor([32, 37, 30, 45, 49, 34, 47,  2, 44])

In [18]:
# One chunk of block_size + 1 tokens packs block_size training examples:
# y is x shifted by one, so y[t] is the token that follows the prompt x[:t + 1].
x, y = train_data[:block_size], train_data[1:block_size + 1]

for t in range(block_size):
    prompt, target = x[:t + 1], y[t]
    print(f"prompt = {prompt}, target = {target}")

prompt = tensor([32]), target = 37
prompt = tensor([32, 37]), target = 30
prompt = tensor([32, 37, 30]), target = 45
prompt = tensor([32, 37, 30, 45]), target = 49
prompt = tensor([32, 37, 30, 45, 49]), target = 34
prompt = tensor([32, 37, 30, 45, 49, 34]), target = 47
prompt = tensor([32, 37, 30, 45, 49, 34, 47]), target = 2
prompt = tensor([32, 37, 30, 45, 49, 34, 47,  2]), target = 44


In [19]:
torch.manual_seed(42)
batch_size = 4  # number of windows drawn per batch
block_size = 8  # length of each window (context length)


def get_batch(data):
    """Sample a random batch of input/target windows from a 1-D token tensor.

    Reads the module-level `batch_size` and `block_size`.

    Args:
        data: 1-D tensor of token ids holding more than `block_size` elements.

    Returns:
        A pair `(x, y)` of tensors, each of shape (batch_size, block_size).
        Row r of `y` is row r of `x` shifted one position to the right in
        `data`, i.e. `y[r, t]` is the token that follows `x[r, t]`.
    """
    # Start offsets lie in [0, len(data) - block_size), so the target window,
    # which ends one position later than the input window, still fits in `data`.
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one batch from the training split and print inputs (xb) followed by targets (yb).
xb, yb = get_batch(train_data)
print(xb)
print(yb)

tensor([[77, 79, 76, 65, 82, 64, 66, 65],
        [ 2, 73, 76, 76, 72, 70, 75,  6],
        [ 2, 33, 82, 74, 63, 73, 66, 11],
        [76, 81, 81, 80,  3,  4,  0,  4]])
tensor([[79, 76, 65, 82, 64, 66, 65,  2],
        [73, 76, 76, 72, 70, 75,  6,  2],
        [33, 82, 74, 63, 73, 66, 11, 65],
        [81, 81, 80,  3,  4,  0,  4, 30]])


In [20]:
# Spell out every (prompt, target) example packed into the batch:
# each row contributes one example per position.
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):
        prompt, target = row_inputs[:t + 1], row_targets[t]
        print(f"prompt = {prompt}, target = {target}")

prompt = tensor([77]), target = 79
prompt = tensor([77, 79]), target = 76
prompt = tensor([77, 79, 76]), target = 65
prompt = tensor([77, 79, 76, 65]), target = 82
prompt = tensor([77, 79, 76, 65, 82]), target = 64
prompt = tensor([77, 79, 76, 65, 82, 64]), target = 66
prompt = tensor([77, 79, 76, 65, 82, 64, 66]), target = 65
prompt = tensor([77, 79, 76, 65, 82, 64, 66, 65]), target = 2
prompt = tensor([2]), target = 73
prompt = tensor([ 2, 73]), target = 76
prompt = tensor([ 2, 73, 76]), target = 76
prompt = tensor([ 2, 73, 76, 76]), target = 72
prompt = tensor([ 2, 73, 76, 76, 72]), target = 70
prompt = tensor([ 2, 73, 76, 76, 72, 70]), target = 75
prompt = tensor([ 2, 73, 76, 76, 72, 70, 75]), target = 6
prompt = tensor([ 2, 73, 76, 76, 72, 70, 75,  6]), target = 2
prompt = tensor([2]), target = 33
prompt = tensor([ 2, 33]), target = 82
prompt = tensor([ 2, 33, 82]), target = 74
prompt = tensor([ 2, 33, 82, 74]), target = 63
prompt = tensor([ 2, 33, 82, 74, 63]), target = 73
prompt

# Bigram language model

In [64]:
import torch 
import torch.nn as nn 
from torch.nn import functional as F 
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The whole model is one (vocab_size x vocab_size) embedding table. Row i
    holds the unnormalised scores (logits) over the next token given that the
    current token has id i.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) logit lookup table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):  # (B, T)
        """Return next-token logits of shape (B, T, V) for token ids `x` of shape (B, T)."""
        logits = self.token_embedding_table(x)  # (B, T, V)
        return logits

    def generate(self, x, max_new_tokens):  # x is (B, T)
        """Sample `max_new_tokens` tokens autoregressively and append them to `x`.

        Args:
            x: integer tensor of token ids, shape (B, T); used as the prompt.
            max_new_tokens: number of tokens to sample per sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt
            followed by the sampled continuation.
        """
        # Sampling is inference only and the returned ids are integers, so no
        # gradient can flow through them. Disabling autograd avoids building
        # and discarding a graph at every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits = self(x)  # (B, T, V)
                logits = logits[:, -1, :]  # keep the last position only -> (B, V)
                probs = F.softmax(logits, dim=-1)
                x_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                x = torch.cat([x, x_next], dim=1)  # (B, T + 1)
        return x



In [65]:
# Build a freshly initialised bigram model and run the sample batch through it;
# display input and logits shapes side by side.
lm = BigramLanguageModel(vocab_size)
logits = lm(xb)
xb.shape, logits.shape

(torch.Size([4, 8]), torch.Size([4, 8, 104]))

In [49]:
# Inspect the raw logits tensor.
logits

tensor([[[-0.8146, -1.0212, -0.4949,  ...,  1.2461, -2.3065, -1.2869],
         [-1.3412,  0.3424,  0.1963,  ...,  1.0125, -0.7147,  0.3446],
         [ 0.5997, -0.3390,  0.1549,  ..., -0.2335, -0.7175, -2.2448],
         ...,
         [ 0.6960,  0.3888, -2.5335,  ..., -0.5347, -0.1319, -1.1636],
         [-0.8146, -1.0212, -0.4949,  ...,  1.2461, -2.3065, -1.2869],
         [ 0.1773,  0.9313, -1.1519,  ...,  0.6461,  1.0271,  0.9107]],

        [[ 0.5997, -0.3390,  0.1549,  ..., -0.2335, -0.7175, -2.2448],
         [ 1.6601, -0.5517, -0.3104,  ..., -0.7300, -1.4113,  0.3488],
         [-0.8146, -1.0212, -0.4949,  ...,  1.2461, -2.3065, -1.2869],
         ...,
         [-1.3938,  0.8466, -1.7191,  ..., -0.5860,  2.0284, -0.1151],
         [-0.8146, -1.0212, -0.4949,  ...,  1.2461, -2.3065, -1.2869],
         [ 0.5530,  1.2586,  0.2317,  ...,  0.6008,  0.7986, -1.3825]],

        [[ 1.6601, -0.5517, -0.3104,  ..., -0.7300, -1.4113,  0.3488],
         [-0.6870,  0.3154, -1.2174,  ...,  0

In [50]:
# Sample 100 new tokens from a prompt consisting of the single token id 0,
# then decode the (1, 101) result back to text.
print(decode(lm.generate(torch.zeros((1, 1), dtype=torch.int64), max_new_tokens=100).squeeze().tolist()))


eN"_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*Hw2z^aD_h‘xr*


In [51]:
# Adam over all model parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(lm.parameters(), lr=1e-3)

In [61]:
# Train for 1000 steps: random batch -> forward -> cross-entropy -> Adam update.
for step in range(1000):
    xb, yb = get_batch(train_data)
    optimizer.zero_grad()  # clear gradients left over from the previous step

    logits = lm(xb)  # (B, T, V)
    B, T, V = logits.shape
    # F.cross_entropy takes (N, V) logits with (N,) class ids,
    # so batch and time are folded into one axis.
    logits, targets = logits.view(B * T, V), yb.view(B * T)
    loss = F.cross_entropy(logits, targets)

    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.3217525482177734


In [63]:
# Sample 100 new tokens from a prompt consisting of the single token id 0 and decode them.
decode(lm.generate(torch.zeros((1, 1), dtype=torch.int64), max_new_tokens=100).squeeze().tolist())

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

# Self-attention

In [68]:
# Random toy input of shape (B, T, C) for the averaging experiments below.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [69]:
# Goal: xbow[b, t] = mean_{i <= t} x[b, i]
# i.e. each position holds the average of itself and all earlier positions.

# v1: explicit for loops
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        # x[b, :t + 1] has shape (t + 1, C): positions 0..t inclusive.
        xbow[b, t] = x[b, :t + 1].mean(dim=0)

In [70]:
# First sequence of the raw input, for comparison with its running average.
x[0]

tensor([[-0.5783,  1.4785],
        [-1.3545,  0.0231],
        [ 0.6528, -0.1086],
        [ 0.9647,  1.5781],
        [ 1.2073,  0.6106],
        [-0.0396, -0.6555],
        [ 1.0122, -0.4351],
        [-1.3594,  0.9402]])

In [71]:
# Running average of the first sequence: row t is the mean of x[0, :t + 1].
xbow[0]

tensor([[-0.5783,  1.4785],
        [-0.9664,  0.7508],
        [-0.4267,  0.4644],
        [-0.0788,  0.7428],
        [ 0.1784,  0.7163],
        [ 0.1421,  0.4877],
        [ 0.2664,  0.3559],
        [ 0.0632,  0.4289]])

In [73]:
# v2: matmul
# A row-normalised lower-triangular matrix computes every running average
# in a single matrix multiply.
lower = torch.tril(torch.ones(T, T))
wei = lower / lower.sum(dim=1, keepdim=True)  # row t: 1 / (t + 1) on columns 0..t
xbow2 = wei @ x  # (T, T) @ (B, T, C) -> (B, T, C), broadcast over the batch
torch.allclose(xbow, xbow2)

True

In [78]:
torch.manual_seed(42)
# Small 3x3 illustration of averaging by matrix multiplication.
ones_lower = torch.tril(torch.ones(3, 3))
a = ones_lower / ones_lower.sum(dim=1, keepdim=True)  # every row sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # row i of c is the mean of rows 0..i of b
print(f"a = \n{a}")
print(f"b = \n{b}")
print(f"c = a@b = \n{c}")

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c = a@b = 
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [83]:
# v3: softmax
# Same averaging weights as v2, obtained by masking future positions with -inf
# and letting softmax normalise each row.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)  # exp(-inf) = 0; the unmasked zeros share the mass equally
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [86]:
# v4: data-dependent weights from queries and keys, still aggregating the raw inputs
torch.manual_seed(42)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# single head
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
q, k = query(x), key(x)  # each (B, T, H)

# Affinity of every query with every key: (B, T, H) @ (B, H, T) -> (B, T, T)
wei = q @ k.transpose(-2, -1)

# Causal mask: a position may only attend to itself and earlier positions.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float("-inf")), dim=-1)
out = wei @ x  # (B, T, T) @ (B, T, C) -> (B, T, C)
out.shape

torch.Size([4, 8, 32])

In [87]:
# Attention weights of the first sequence: lower-triangular, each row sums to 1.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1905, 0.8095, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3742, 0.0568, 0.5690, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1288, 0.3380, 0.1376, 0.3956, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4311, 0.0841, 0.0582, 0.3049, 0.1217, 0.0000, 0.0000, 0.0000],
        [0.0537, 0.3205, 0.0694, 0.2404, 0.2568, 0.0592, 0.0000, 0.0000],
        [0.3396, 0.0149, 0.5165, 0.0180, 0.0658, 0.0080, 0.0373, 0.0000],
        [0.0165, 0.0375, 0.0144, 0.1120, 0.0332, 0.4069, 0.3136, 0.0660]],
       grad_fn=<SelectBackward0>)

In [89]:
# v5: data-dependent weights applied to value projections instead of the raw inputs
torch.manual_seed(42)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# single head
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
q, k, v = query(x), key(x), value(x)  # each (B, T, H)

# Query-key affinities: (B, T, H) @ (B, H, T) -> (B, T, T)
wei = q @ k.transpose(-2, -1)

# Causal mask followed by a row-wise softmax.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float("-inf")), dim=-1)
out = wei @ v  # (B, T, T) @ (B, T, H) -> (B, T, H)
out.shape

torch.Size([4, 8, 16])

In [96]:
# variance
# Unscaled dot products of standard-normal q and k: compare the variance of the
# scores with the variance of the inputs.
k, q = torch.randn(B, T, head_size), torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)  # (B, T, T)
k.var(), q.var(), wei.var()

(tensor(1.0346), tensor(0.9618), tensor(15.1129))

In [97]:
# Same scores, multiplied by head_size ** -0.5 (i.e. 1 / sqrt(head_size)).
scale = head_size ** -0.5
wei = (q @ k.transpose(-2, -1)) * scale
k.var(), q.var(), wei.var()

(tensor(1.0346), tensor(0.9618), tensor(0.9446))

In [98]:
# Softmax of small-magnitude logits: the probabilities stay fairly spread out.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [99]:
# The same logits multiplied by 8: softmax concentrates on the largest entry.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])