# GPT From Scratch

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed PyTorch's global RNG so that weight initialisation, random batch
# sampling, and sampled text in the cells below are reproducible.
torch.manual_seed(1337)

<torch._C.Generator at 0x12c7cec90>

## Setup and Data Cleaning

Here we build a basic character-level text encoder and decoder: every unique character in the corpus is mapped to an integer id, and each id maps back to its character.

In [3]:
# Read the whole training corpus into memory as one string
with open('tiny-shakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Preview the opening 100 characters as a quick sanity check
print(text[0:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
# Character-level vocabulary: every distinct character in the text, sorted
chars = sorted(set(text))
vocab_size = len(chars)

print(
    f"Number of unique characters: {vocab_size}",
    f"Unique characters: {''.join(chars)}",
    sep="\n",
)

Number of unique characters: 65
Unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Lookup tables between characters and integer token ids:
#   itos: id -> character
#   stoi: character -> id
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

def encode(s):
    """Convert a string into a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    token_ids = []
    for ch in s:
        token_ids.append(stoi[ch])
    return token_ids
def decode(l):
    """Convert an iterable of integer token ids back into the string they spell.

    Raises KeyError if any id is not a key of `itos`.
    """
    return ''.join(itos[token_id] for token_id in l)

# Round-trip example: encode a string to token ids, then decode it back
example_string = "hello"
encoded = encode(example_string)
decoded = decode(encoded)

print(f"Original string: {example_string}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

Original string: hello
Encoded: [46, 43, 50, 50, 53]
Decoded: hello


In [6]:
# Encode the full corpus as a 1-D tensor of integer (int64) token ids
data = torch.tensor(encode(text), dtype=torch.long)

# First 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [7]:
block_size = 8  # Number of tokens in each input sequence
# The targets are the inputs shifted one position to the right
x, y = train_data[:block_size], train_data[1:block_size + 1]

In [8]:
# Encoded example of context and target:
# every prefix of x is a context, and its target is the token that follows it
for i in range(block_size):
    context, target = x[:i + 1], y[i]
    print(f"Context: {context}, Target: {target}")

Context: tensor([18]), Target: 47
Context: tensor([18, 47]), Target: 56
Context: tensor([18, 47, 56]), Target: 57
Context: tensor([18, 47, 56, 57]), Target: 58
Context: tensor([18, 47, 56, 57, 58]), Target: 1
Context: tensor([18, 47, 56, 57, 58,  1]), Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15]), Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Target: 58


In [9]:
# Decoded example of context and target: the same prefix/next-token pairs,
# mapped from token ids back to characters
for i in range(block_size):
    context_ids = x[:i + 1].tolist()
    target_id = y[i].item()
    context_decoded = decode(context_ids)
    target_decoded = decode([target_id])
    print(f"Context: '{context_decoded}', Target: '{target_decoded}'")

Context: 'F', Target: 'i'
Context: 'Fi', Target: 'r'
Context: 'Fir', Target: 's'
Context: 'Firs', Target: 't'
Context: 'First', Target: ' '
Context: 'First ', Target: 'C'
Context: 'First C', Target: 'i'
Context: 'First Ci', Target: 't'


## Batching Our Data

In [10]:
# Module-level settings read by get_batch at call time
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sampled sequence

def get_batch(split):
    """Sample a random batch of input and target sequences from one data split.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors. y is the same window
        as x shifted one position forward, so y[b, t] is the token that
        follows x[b, t] in the source data.
    """
    source = val_data if split != 'train' else train_data
    # Random window starts, bounded so the one-step-shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Example usage of get_batch:
# each row of x_batch is a sequence of token ids, and y_batch holds the
# next token for every position of that sequence
x_batch, y_batch = get_batch('train')
print("Batch x:", x_batch, sep="\n")
print("Batch y:", y_batch, sep="\n")

# Spell out every (context, target) pair for the first two rows of the batch
for i in range(2):
    for j in range(block_size):
        context, target = x_batch[i, :j + 1], y_batch[i, j]
        # Print encoded context and target
        print(f"Batch {i}, Context: {context.tolist()}, Target: {target.item()}")

Batch x:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Batch y:
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
Batch 0, Context: [24], Target: 43
Batch 0, Context: [24, 43], Target: 58
Batch 0, Context: [24, 43, 58], Target: 5
Batch 0, Context: [24, 43, 58, 5], Target: 57
Batch 0, Context: [24, 43, 58, 5, 57], Target: 1
Batch 0, Context: [24, 43, 58, 5, 57, 1], Target: 46
Batch 0, Context: [24, 43, 58, 5, 57, 1, 46], Target: 43
Batch 0, Context: [24, 43, 58, 5, 57, 1, 46, 43], Target: 39
Batch 1, Context: [44], Target: 53
Batch 1, Context: [44, 53], Target: 56
Batch 1, Context: [44, 53, 56], Target: 1
Batch 1, Context: [44, 53, 56, 1], Target: 58
Batch 1, Context: [44, 53, 56, 1, 58], Target: 46
Batch 1, Context: [44, 53, 56, 1, 58, 46], Target:

## Bigram Model

In [11]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape (vocab_size, vocab_size): looking up token id ``i``
    returns the logits over the next token, so every prediction depends only
    on the current token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            A pair (logits, loss). Without targets, logits has shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C)
            and loss is the cross-entropy between logits and targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) class indices,
        # so fold the batch and time dimensions together.
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets, e.g. a transposed
        # tensor, are accepted instead of raising a RuntimeError.
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per token
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample new tokens and append them to ``idx``.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample for each row.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Forward pass without targets, so no loss is computed
            logits, _loss = self(idx)

            # Only the last time step predicts the next token
            logits = logits[:, -1, :]  # (B, C)

            # Turn logits into a probability distribution over the vocabulary
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # Sample one token id per row from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled ids to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [12]:
# Build the model and run one forward pass on the example batch
model = BigramLanguageModel(vocab_size)
logits, loss = model(x_batch, y_batch)
print("Logits shape:", logits.shape)
print("Loss:", loss.item())

# Sample 100 tokens from the still-untrained model, starting from a single
# sequence that contains only token id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

Logits shape: torch.Size([32, 65])
Loss: 5.036386013031006

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


### Training

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

batch_size = 32  # get_batch reads this global, so training batches hold 32 sequences
STEPS = 10000
for step in range(STEPS):
    # Clear gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    # Backpropagate and apply the parameter update
    loss.backward()
    optimizer.step()

    # Report the current batch loss every 100 steps
    if step % 100 == 0:
        print(f"Step {step}, Loss: {loss.item()}")

Step 0, Loss: 4.647705078125
Step 100, Loss: 4.563594341278076
Step 200, Loss: 4.487884998321533
Step 300, Loss: 4.341160297393799
Step 400, Loss: 4.198233604431152
Step 500, Loss: 4.104591369628906
Step 600, Loss: 3.951016902923584
Step 700, Loss: 3.9056990146636963
Step 800, Loss: 3.800628185272217
Step 900, Loss: 3.7908599376678467
Step 1000, Loss: 3.664973020553589
Step 1100, Loss: 3.601811647415161
Step 1200, Loss: 3.595593214035034
Step 1300, Loss: 3.5757346153259277
Step 1400, Loss: 3.463500499725342
Step 1500, Loss: 3.342952013015747
Step 1600, Loss: 3.28108811378479
Step 1700, Loss: 3.1663148403167725
Step 1800, Loss: 3.1322338581085205
Step 1900, Loss: 3.186391830444336
Step 2000, Loss: 3.3166022300720215
Step 2100, Loss: 3.0833427906036377
Step 2200, Loss: 3.0410451889038086
Step 2300, Loss: 3.0327343940734863
Step 2400, Loss: 2.9013376235961914
Step 2500, Loss: 2.9623847007751465
Step 2600, Loss: 2.801382303237915
Step 2700, Loss: 2.9442999362945557
Step 2800, Loss: 2.91966

In [14]:
# Visual check of the model's generation
# Sample 500 tokens from the trained model, starting from a single sequence
# that contains only token id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
sample_ids = model.generate(idx=start_idx, max_new_tokens=500)[0].tolist()
print(decode(sample_ids))


M:
IUSh t,
F th he d ke alved.
Thupld, cipbll t
I: ir w, l me sie hend lor ito'l an e

I:
Gochosen ea ar btamandd halind
Aust, plt t wadyotl
I bel qunganonoth he m he de avellis k'l, tond soran:

WI he toust are bot g e n t s d je hid t his IAces I my ig t
Ril'swoll e pupat inouleacends-athiqu heamer te
Wht s

MI wect!-lltherotheve t fe;
WAnd py;

PO t s ld tathat, ir V
IO thesecin teot tit ado ilorer.
Ply, d'stacoes, ld omat mealellly yererer EMEvesas ie IZEd pave mautoofareanerllleyomerer but?


## Transformer

In [None]:
# Hyperparameters
# NOTE(review): max_iters, eval_interval, learning_rate and eval_iters are not
# used in the cells shown so far; the meanings below are inferred from their
# names and should be confirmed against the training code that consumes them.
batch_size = 32  # sequences per batch (read by get_batch)
block_size = 8  # tokens per sequence (read by get_batch)
max_iters = 3000  # presumably the total number of training steps
eval_interval = 300  # presumably the number of steps between loss evaluations
learning_rate = 1e-2  # presumably the optimizer learning rate (the bigram run used lr=1e-3)
eval_iters = 200  # presumably the number of batches averaged per evaluation

# Pick the compute device in order of preference: CUDA, then Apple's Metal
# backend (MPS), then CPU.
# Older PyTorch builds have no torch.backends.mps attribute, so look it up
# defensively instead of letting the probe raise AttributeError.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: mps


In [None]:
# Mathematical Trick in Self Attention:
# Because we are building a decoder-only model, we can use a causal mask
# which ensures that the model can only attend to previous tokens and not future ones.

