<a href="https://colab.research.google.com/github/emaadqazi/LLM_from_scratch/blob/main/GPT_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2026-02-05 03:12:34--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2026-02-05 03:12:34 (21.4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [50]:
with open ('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [51]:
print('length of dataset in characters: ', len(text))

length of dataset in characters:  1115394


In [52]:
# All the unique characters that are from the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [53]:
# Create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) } # Creating lookup table from char to integer; ch -> i
itos = { i:ch for i,ch in enumerate(chars) } # Vice-versa
encode = lambda s: [stoi[c] for c in s] # encoder -> take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder -> take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [54]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # The 1000 chars from earlier looks like the following output to GPT

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [55]:
# Split data into train and validation sets
n = int(0.9*len(data)) # First 90% will be train; rest val
train_data = data[:n] # Index first 90%
val_data = data[n:] # Index last 10%
# This will help us understand to what extend the model is overfitting

In [56]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [57]:
# In the above example, there are 9 values but there are 8 training examples. GPT will essentially look at that and say that when
# there is [18, 47] -> next value is 56, etc. Look at output to understand that
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"When input is {context} the target: {target}")

When input is tensor([18]) the target: 47
When input is tensor([18, 47]) the target: 56
When input is tensor([18, 47, 56]) the target: 57
When input is tensor([18, 47, 56, 57]) the target: 58
When input is tensor([18, 47, 56, 57, 58]) the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [58]:
torch.manual_seed(1337) # Seed for random number generator
batch_size = 4 # How many independent sequences will we process in parallel?
block_size = 8 # What is the maximum context length for predictions

def get_batch(split):
  # Generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-----')

for b in range(batch_size): # Batch dimension
  for t in range(block_size): # Time dimension
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f"When input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
When input is [24] the target: 43
When input is [24, 43] the target: 58
When input is [24, 43, 58] the target: 5
When input is [24, 43, 58, 5] the target: 57
When input is [24, 43, 58, 5, 57] the target: 1
When input is [24, 43, 58, 5, 57, 1] the target: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
When input is [44] the target: 53
When input is [44, 53] the target: 56
When input is [44, 53, 56] the target: 1
When input is [44, 53, 56, 1] the target: 58
When input is [44, 53, 56, 1, 58] the target: 46
When input is [44, 5

In [59]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [60]:
'''
Notes:
torch.nn = neural network building blocks (layers, loss functons)
manual_seed => random numbers are reproducible so we get the same random result everytime
nn.Module is a template that automatically tracks all the learnable paramters, enables GPU support, handles saving/loading, and more
Bigram: pair of consecutive items -> a bigram model predicts the next character using only the current character (simplest possible language model as there is no memory of anything before)

nn.Embedding creates lookup table where there are n rows (one for each char) and n cols (one score for each possible next char) where n is a character in the vocabulary

BTC (Batch, Time, Channel/Classes)
B = how many sequences in parallel
T = length of each sequence
C = scores for each possible next character

(4, 8, 65) means: for 4 sequences each with 8 positions, I have 65 scores at each position predicting what comes next

PyTorch's 'cross_entropy' function is picky about input shape. It wants:
- Predictions: (N, C) - N examples, C classes
- Targets: (N) - N correct answers

We have:
logits: (4, 8, 65) - 4 batches, 8 positions, 65 scores
targets: (4, 8) - 4 batches, 8 correct answers
So, we flatten them and get:
logits: (4, 8, 65) -> 32, 65 # 32 predictions each with 65 scores
targets: (4, 8) -> 32 # 32 correct answers

Cross_entropy: measures how bad our predictions are
Loss of ~4.17 means model is getting randomly right now

Generate method:
(1) Get predictions for all positions
(2) Keep only the last position's prediction
(3) Converts logits -> probabilities via softmax: [4.2, 1.1, -2.0] -> [72%, 20%, 8%]
(4) Randomly samples based on probabilities: Pick "e" (72% likely)
(5) Append new characters, repeat: "h" -> "he" -> "hel" -> ...
We sample randomly (via multinomial) as it adds variety and allows model to pick 2nd/3rd most likely character to make output more interesting

Final output:
torch.zeros((1, 1)) - start with a single "0" token (newline char)
max_new_tokens = 100 - generate 100 chars
decode(...) - convert integers back to text

TLDR;
forward() -> 'Here is my prediction + how wrong I was'
generate() -> 'Let me write some text character by character'
Loss -> The signal we will use to improve the model
'''

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__() # Calls parent class; needed for nn.Module to work properly
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    # idx and targets are both (B, T) tensor of integers
    logits = self.token_embedding_table(idx) # (B, T, C): Batch Time Channel tensor

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [61]:
# Create PyTorch optimizer; algorithm used during neural network to adjust parameters (weights & biases) to minimize the loss function
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
'''
AdamW: algorithm that decides how to adjust the model's numbers
m.parameters() -> "Here are all the learnable numbers in my model" (the embedding table)
lr=1e-3 -> learning rate = 0.001; how big each adjustment is. Too big = chaotic. Too small = slow
'''


'\nAdamW: algorithm that decides how to adjust the model\'s numbers\nm.parameters() -> "Here are all the learnable numbers in my model" (the embedding table)\nlr=1e-3 -> learning rate = 0.001; how big each adjustment is. Too big = chaotic. Too small = slow\n'

In [62]:
batch_size = 32 # Increased from 4 to 32
for steps in range(10000):
  # Sample a batch of data
  xb, yb = get_batch('train')

  # Evaluate the loss
  logits, loss = m(xb, yb) # Forward pass; makes predictions and measures how wrong we are
  optimizer.zero_grad(set_to_none=True) # Zero gradients; clear out "adjustment" notes from the previous step (PyTorch accumulates gradients by default so we reset them)
  loss.backward() # PyTorch figures out: "To reduce the loss, which numbers in the embedding table should go up? Which should go down? By how much?" These adjustment instructions are called gradients
  optimizer.step() # Update embedding table based on the gradients

  print(loss.item())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2.515348196029663
2.474428176879883
2.6085002422332764
2.6852357387542725
2.440641403198242
2.5239131450653076
2.583158016204834
2.5586931705474854
2.5336263179779053
2.5082459449768066
2.563934087753296
2.4099056720733643
2.549560070037842
2.627998113632202
2.597146511077881
2.6242775917053223
2.485126256942749
2.5880191326141357
2.552927017211914
2.498333215713501
2.521122455596924
2.5981087684631348
2.56059193611145
2.47790789604187
2.60870361328125
2.561020612716675
2.542356014251709
2.6399805545806885
2.6982827186584473
2.541898250579834
2.5504093170166016
2.561579704284668
2.502772569656372
2.5175554752349854
2.681809663772583
2.603564977645874
2.5262415409088135
2.6207547187805176
2.538642406463623
2.5304956436157227
2.4993271827697754
2.671292304992676
2.6358695030212402
2.540466785430908
2.637118339538574
2.6086719036102295
2.5935816764831543
2.52583646774292
2.579291343688965
2.4875173568725586
2.552349328994751

In [63]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


In [64]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print('a==')
print(a)
print('--')
print('b==')
print(b)
print('--')
print('c=')
print(c)


a==
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b==
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [65]:
# Consider the following toy example:

torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

# We want to make the tokens talk to one another in a specific way; token in 5th location should not communicate with tokens beyond it. Only previous information should flow to current context
# Easiest way for tokens to communicate is by doing an average of all proceeding elements


torch.Size([4, 8, 2])

In [66]:
# We want x[b, t] = mean{i<=t} x[b, i]
xbow = torch.zeros((B, T, C)) # bow = bag of words; x_bow
for b in range(B):
  for t in range(T):
    xprev = x[b, :t+1] # (t, C)
    xbow[b, t] = torch.mean(xprev, 0)

In [67]:
# Version 2: using matrix multipy for a weighted aggregation
wei = torch.tril(torch.ones(T, T)) # wei; short for weights
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) --> (B, T, C)
torch.allclose(xbow, xbow2)

False

In [68]:
# version3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

False