## GPT Development

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Mapping from characters to integers and vice versa
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

encode = lambda x: [char_to_int[c] for c in x] # x: str -> list[int]
decode = lambda x: ''.join([int_to_char[i] for i in x]) # x: list[int] -> str

print(encode('Hello World!'))
print(decode(encode('Hello World!')))


[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2]
Hello World!


In [3]:
# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
split = int(len(data) * 0.9) # 90% train, 10% val
train_data, val_data = data[:split], data[split:]

In [48]:
# There are a total of block_size training examples in each block
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    print(f'context: {x[:t+1].tolist()} -> target: {y[t]}')

context: [18] -> target: 47
context: [18, 47] -> target: 56
context: [18, 47, 56] -> target: 57
context: [18, 47, 56, 57] -> target: 58
context: [18, 47, 56, 57, 58] -> target: 1
context: [18, 47, 56, 57, 58, 1] -> target: 15
context: [18, 47, 56, 57, 58, 1, 15] -> target: 47
context: [18, 47, 56, 57, 58, 1, 15, 47] -> target: 58


Training with these different lengths of contexts from a size of 1 to block_size is important to ensure the transformer learns to deal with different context lengths. This is useful during inference because the model can generate text from as little as one character of context.

In [50]:
batch_size = 4 # Number of sequences to process in parallel
block_size = 8 # Maximum context length for predictions

def get_batch(split: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Generate a random batch of context and target sequences."""
    data = train_data if split == 'train' else val_data
    # Randomly sample batch_size number of starting indices
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# Get a batch of context and target sequences
xb, yb = get_batch('train')
print(f'xb: {xb}\nyb: {yb}')

# xb and yb are both tensors of shape (batch_size, block_size)

xb: tensor([[43, 43, 58,  1, 42, 47, 57, 41],
        [58, 53,  1, 44, 43, 52, 41, 43],
        [42, 39, 59, 45, 46, 58, 43, 56],
        [58, 43, 56,  6,  1, 47, 44,  0]])
yb: tensor([[43, 58,  1, 42, 47, 57, 41, 53],
        [53,  1, 44, 43, 52, 41, 43,  1],
        [39, 59, 45, 46, 58, 43, 56, 11],
        [43, 56,  6,  1, 47, 44,  0, 20]])


In [51]:
# Display the context and target sequences for each batch element
for b in range(batch_size):
    for t in range(block_size):
        print(f'context: {xb[b, :t+1].tolist()} -> target: {yb[b, t]}')

context: [43] -> target: 43
context: [43, 43] -> target: 58
context: [43, 43, 58] -> target: 1
context: [43, 43, 58, 1] -> target: 42
context: [43, 43, 58, 1, 42] -> target: 47
context: [43, 43, 58, 1, 42, 47] -> target: 57
context: [43, 43, 58, 1, 42, 47, 57] -> target: 41
context: [43, 43, 58, 1, 42, 47, 57, 41] -> target: 53
context: [58] -> target: 53
context: [58, 53] -> target: 1
context: [58, 53, 1] -> target: 44
context: [58, 53, 1, 44] -> target: 43
context: [58, 53, 1, 44, 43] -> target: 52
context: [58, 53, 1, 44, 43, 52] -> target: 41
context: [58, 53, 1, 44, 43, 52, 41] -> target: 43
context: [58, 53, 1, 44, 43, 52, 41, 43] -> target: 1
context: [42] -> target: 39
context: [42, 39] -> target: 59
context: [42, 39, 59] -> target: 45
context: [42, 39, 59, 45] -> target: 46
context: [42, 39, 59, 45, 46] -> target: 58
context: [42, 39, 59, 45, 46, 58] -> target: 43
context: [42, 39, 59, 45, 46, 58, 43] -> target: 56
context: [42, 39, 59, 45, 46, 58, 43, 56] -> target: 11
contex

### Bigram Model

In [6]:
# B - batch size, T - block size (time step), C - embedding dimension (vocab size)

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed_table = nn.Embedding(vocab_size, vocab_size) # (B,T) -> (B,T,C)

    def forward(self, x: torch.Tensor, y: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
        logits = self.token_embed_table(x)

        if y is None:
            loss = None
        else:
            B, T, C = logits.shape
            # Flatten batch and sequence dimensions to use F.cross_entropy
            logits = logits.view(B*T, C)
            y = y.view(B*T)
            loss = F.cross_entropy(logits, y)
        return logits, loss

    def generate(self, x: torch.Tensor, max_tokens: int) -> torch.Tensor:
        for _ in range(max_tokens):
            # Get the previous predictions
            logits, _ = self(x)
            # Keep only the last prediction
            logits = logits[:, -1, :] # (B,C)
            # Apply softmax to convert logits into probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            # Sample from the probability distribution
            x_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # Concatenate the new prediction to the previous context
            x = torch.cat([x, x_next], dim=1) # (B,T+1)
        return x
    
model = BigramLanguageModel()

# Generate
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_tokens=32)[0].tolist()))


,NKlig!yy',HVVodGvCrlbg3SWWImRlc


In [55]:
# Hyperparameters
batch_size = 32 # Sequences to process in parallel
max_iters = 2500 # Iterations to train the model
lr = 1e-2 # Learning rate

# Training the model
optimiser = torch.optim.AdamW(model.parameters(), lr=lr)

loss = torch.tensor(torch.inf)

for i in range(max_iters):

    if i % (max_iters // 10) == 0 or i == max_iters - 1:
        print(f'iteration {i}, loss: {loss.item()}')

    # Get a batch of context and target sequences
    xb, yb = get_batch('train')

    # Compute the gradients and update the weights
    _, loss = model(xb, yb) # Forward pass
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()

iteration 0, loss: inf
iteration 250, loss: 2.43190860748291
iteration 500, loss: 2.462327480316162
iteration 750, loss: 2.4756431579589844
iteration 1000, loss: 2.4330947399139404
iteration 1250, loss: 2.46685791015625
iteration 1500, loss: 2.453742265701294
iteration 1750, loss: 2.4380171298980713
iteration 2000, loss: 2.4536423683166504
iteration 2250, loss: 2.4501090049743652
iteration 2499, loss: 2.468660831451416


In [59]:
# Generate
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_tokens=32)[0].tolist()))


AR:
Kether.
RKEXEOfat lit,
I ute


### Self-Attention

In [35]:
B, T, C = 4, 8, 32
x = torch.randn(B, T ,C)

# Bag of words. Calculate x[b,t] = mean_{t'<=t} x[b,t']
xbow_1 = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow_1[b, t] = torch.mean(x[b, :t+1], 0)

# Version 2. Parallelised. W is a lower triangular matrix which can be used for weighted aggregation.
W = torch.tril(torch.ones(T, T))
W = W / W.sum(1, keepdim=True)
xbow_2 = W @ x

# Version 3. Parallelised. Uses softmax. W represents the same lower triangular matrix as before.
tril = torch.tril(torch.ones(T, T))
W = torch.zeros((T, T))
W = W.masked_fill(tril == 0, float('-inf'))
W = F.softmax(W, dim=-1)
xbow_3 = W @ x

# Check that the three methods are equivalent
torch.allclose(xbow_1, xbow_2) and torch.allclose(xbow_1, xbow_3)


True

In [37]:
# B - batch size, T - block size (time step), C - embedding dimension, H - head size

# Single head self-attention
head_size = 16
n_embed = 32
key = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
query = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
value = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
k = key(x)
q = query(x)

# Compute the scaled dot-product attention
W = q @ k.transpose(-2, -1) # (B,T,H) @ (B,H,T) -> (B,T,T)
tril = torch.tril(torch.ones(T, T))
W = W.masked_fill(tril == 0, float('-inf'))
W = F.softmax(W, dim=-1)
v = value(x)
out = W @ v # (B,T,T) @ (B,T,H) -> (B,T,H)

**Notes:**
- Attention is a communication mechanism. It can be viewed as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why tokens need to be positionally encoded.
- Each example across batch dimensions are treated independently and never interact with each other.
- In an encoder attention block just delete the single line that performs masking with `tril`, allowing all tokens to communicate with each other and not just the previous ones. The block implemented above is called a decoder attention block because it has triangular masking and is used in autoregressive settings like language modelling.
- 'Self-attention' just means that the keys and the values are produced from the same source as the queries (`x` in this case). In 'cross attention', the queries still get produced from `x`, but the keys and values come from a different source (such as an encoder module).
- 'Scaled' attention additionally divides `W` by $1/\sqrt{H}$. This ensures that when the input `Q` and `K` are of unit variance, `W` has unit variance as well and softmax will stay diffuse and not saturate (see below).

In [39]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
W = q @ k.transpose(-2, -1) * head_size**-0.5

k.var(), q.var(), W.var()

(tensor(0.9748), tensor(1.0178), tensor(1.0286))

In [42]:
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1))
# With larger values the probabilities become more concentrated, converges to a one-hot vector
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 10, dim=-1))

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
tensor([1.5851e-02, 7.8918e-04, 1.1713e-01, 7.8918e-04, 8.6545e-01])
