<a href="https://colab.research.google.com/github/dominiksakic/zero_to_hero/blob/main/basics_06_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- source: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=126s

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# get data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-07-14 10:42:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-07-14 10:42:32 (17.9 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Peek at the first 100 characters as a sanity check.
preview = text[:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
# Character-level tokenizer: every distinct character in the corpus is one token.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))  # character -> integer id
itos = dict(enumerate(chars))               # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Round-trip the first 50 characters to check that decode inverts encode.
print(encode(text[:50]))
print(decode(encode(text[:50])))
print(f"Vocav size: {vocab_size}")

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56]
First Citizen:
Before we proceed any further, hear
Vocav size: 65


In [None]:
# Tokenize the whole corpus, then split it into train/val:
# the first 90% of tokens is for training, the remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)

n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [None]:
# Excursion: one chunk of block_size + 1 tokens contains block_size
# (context -> next token) training examples, one per position.
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]  # x shifted by one token
for t, target in enumerate(y):
    context = x[:t+1]  # everything up to and including position t
    print(f'timestep {t}: when input is {context} the target is {target}')

timestep 0: when input is tensor([18]) the target is 47
timestep 1: when input is tensor([18, 47]) the target is 56
timestep 2: when input is tensor([18, 47, 56]) the target is 57
timestep 3: when input is tensor([18, 47, 56, 57]) the target is 58
timestep 4: when input is tensor([18, 47, 56, 57, 58]) the target is 1
timestep 5: when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
timestep 6: when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
timestep 7: when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


  - As a result, the model learns to predict the next character from contexts of varying length,
  - from a single character up to `block_size` (8) characters.


In [None]:
# lets make the example more complex by introducing a batch dimension
torch.manual_seed(1337)
batch_size = 4

def get_batch(split):
    """Sample a random batch of input/target chunks from one data split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Returns `(x, y)`, each stacked from `batch_size` chunks of
    `block_size` tokens, where `y` is `x` shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random chunk start offsets; the upper bound leaves room for the
    # one-token shift needed by the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i + block_size])
        targets.append(source[i + 1:i + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')

# Spell out every (context -> target) example packed into the batch.
for b, (row_x, row_y) in enumerate(zip(xb, yb)):
  for t, target in enumerate(row_y):
    context = row_x[:t+1]
    print(f"Batch {b}: when context is {context}, target is {target}")
  print("\n")

Batch 0: when context is tensor([24]), target is 43
Batch 0: when context is tensor([24, 43]), target is 58
Batch 0: when context is tensor([24, 43, 58]), target is 5
Batch 0: when context is tensor([24, 43, 58,  5]), target is 57
Batch 0: when context is tensor([24, 43, 58,  5, 57]), target is 1
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1]), target is 46
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1, 46]), target is 43
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1, 46, 43]), target is 39


Batch 1: when context is tensor([44]), target is 53
Batch 1: when context is tensor([44, 53]), target is 56
Batch 1: when context is tensor([44, 53, 56]), target is 1
Batch 1: when context is tensor([44, 53, 56,  1]), target is 58
Batch 1: when context is tensor([44, 53, 56,  1, 58]), target is 46
Batch 1: when context is tensor([44, 53, 56,  1, 58, 46]), target is 39
Batch 1: when context is tensor([44, 53, 56,  1, 58, 46, 39]), target is 58
Batch 1: when context is 

In [None]:
# create a baseline model/bigram
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram baseline: next-token logits are a table lookup on the current token."""

  def __init__(self, vocab_size):
    super().__init__()
    # Row i of the table holds the logits for the token that follows token i.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: (B, T) tensor of token ids.
      targets: optional tensor of token ids with the same shape as `idx`.

    Returns:
      `(logits, loss)`. Without `targets`, logits are (B, T, C) and loss is
      None. With `targets`, logits are returned flattened to (B*T, C) and
      loss is the cross-entropy against the flattened targets.
    """
    logits = self.token_embedding_table(idx) # (B,T,C)

    if targets is None:
      return logits, None

    B, T, C = logits.shape
    # F.cross_entropy expects (N, C) logits and (N,) targets, so fold the
    # batch and time dimensions together.
    logits = logits.view(B*T, C)
    targets = targets.view(B*T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()  # sampling never backpropagates; skip building the autograd graph
  def generate(self, idx, max_new_tokens):
    """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

    Args:
      idx: (B, T) tensor of token ids used as the starting context.
      max_new_tokens: number of tokens to append to every row.

    Returns:
      (B, T + max_new_tokens) tensor: the input context followed by the
      sampled tokens.
    """
    for _ in range(max_new_tokens):
      # get the predictions
      logits, _ = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

# Instantiate the baseline and run one forward pass on the current batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.item())

# Sample 50 tokens from the untrained model, starting from a single token id 0.
seed_idx = torch.zeros((1, 1), dtype=torch.long)
sample = m.generate(idx=seed_idx, max_new_tokens=50)
print(decode(sample[0].tolist()))

torch.Size([256, 65])
4.648044586181641

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLER


In [None]:
# AdamW over all parameters of the bigram model `m`, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# train the baseline
batch_size = 32  # larger batches than the 4 used for the demo above

for s in range(100000):
  # draw a fresh random batch of training data
  xb, yb = get_batch('train')

  # clear the gradients of the previous step, then run the forward pass
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)

  # backward pass and parameter update
  loss.backward()
  optimizer.step()

# loss of the final training batch only
print(loss.item())

2.4081521034240723


In [None]:
# sample after training: 500 new tokens, starting from a single token id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=500)
print(decode(generated[0].tolist()))



MAUEdingn I
GBENoutl, Thasuriopro sorllll le.

Mat,
FI s ICKI ambe mithevis LLEShe ysste ar s blllyorswon
Clmbe t uruk
CLLarf w p ar lye twemen ulif hercefowive:
YBOUSTIOVO: gh ced s p ay anore iveatothe ierave yanccu wind s; oalllak omad ste?
h;

JUThow h llde iouge thes whe yomeathistlieis moma hit me o ind.

F hik, thite:
TRThe hal at w!
Whase t ma T:

Bareomast yethin athe stt geloupr msh f wh n
Yorinkeshave pan t,
NGAnthe or,
Wh tro joullieallisube:
Fin matthese V:
Aporn geng y ll yr mofor


In [None]:
# Loss on one random validation batch. Evaluation never calls backward(), so
# run it under no_grad to avoid building an autograd graph.
with torch.no_grad():
    xv, yv = get_batch("val")
    _, loss = m(xv, yv)

print(f"Validation loss: {loss}")

Validation loss: 2.419926404953003


# Transformers


In [29]:
# Genesis of the Transformer (weighted aggregation):
# multiplying by a row-normalized lower-triangular matrix averages each row
# of b with all the rows above it.
import torch

torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
print('Starting Tensor a')
print(a)  # print the tensor itself, not a one-element set wrapping it

# normalize every row so that it sums to 1
a = a / torch.sum(a, 1, keepdim=True)
print(f'\nShape of a: {a.shape}')
print('Divide values by the sum along the 1 axis: ')
print(a)

b = torch.randint(0,10,(3,2)).float()
print(f'\nShape of b: {b.shape}')
print('Starting tensor b is')
print(b)

# row i of c is the mean of rows 0..i of b
c = a @ b
print('\nresult of c is')
print(c)


Starting Tensor a
{tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])}

Shape of a: torch.Size([3, 3])
Divide values by the sum along the 1 axis: 
{tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])}

Shape of b: torch.Size([3, 2])
Starting tensor b is
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

result of c is
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [51]:
# Toy input: B sequences of T time steps, each carrying C channels.
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn((B, T, C))

# Show only the first sequence of the batch.
print('Batch 0')
print(x[0])

Batch 0
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


In [49]:
# version 1: explicit loops -- xbow[b, t] is the mean of x[b, 0..t].
# Every batch is aggregated, so xbow is the running mean for the whole input,
# not just for batch 0.
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
      xprev = x[b, :t+1]  # time steps 0..t of batch b
      xbow[b,t] = torch.mean(xprev, 0)

# Print only the first batch to keep the output short.
b = 0
print(f'Batch: {b}')
for t in range(T):
    print(f'time step {t}, after aggregation: {xbow[b,t]}')

Batch: 0
time step 0, after aggregation: tensor([ 0.1808, -0.0700])
time step 1, after aggregation: tensor([-0.0894, -0.4926])
time step 2, after aggregation: tensor([ 0.1490, -0.3199])
time step 3, after aggregation: tensor([ 0.3504, -0.2238])
time step 4, after aggregation: tensor([0.3525, 0.0545])
time step 5, after aggregation: tensor([ 0.0688, -0.0396])
time step 6, after aggregation: tensor([ 0.0927, -0.0682])
time step 7, after aggregation: tensor([-0.0341,  0.1332])


- For each time step, we want the mean of the current and all previous time steps.

In [57]:
# version 2: the same running mean, expressed as a single matrix multiply
wei = torch.ones(T, T).tril()
print(f'shape of wei: {wei.shape}')
print(f'wei is: {wei}')

# divide every row by its sum so that each row of weights adds up to 1
row_totals = wei.sum(dim=1, keepdim=True)
wei = wei / row_totals
print(f'\nwei after normalization: {wei}')

# wei is (T, T) and is broadcast over the batch: (T, T) @ (B, T, C) ----> (B, T, C)
xbow2 = torch.matmul(wei, x)
print(f'\nxbow2 batch 0 after multiplication: {xbow2[0]}')


shape of wei: torch.Size([8, 8])
wei is: tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

wei after normalization: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

xb

In [63]:
import torch.nn.functional as F
# version 3: use Softmax -- start from all-zero scores, mask out the future,
# and let softmax turn the surviving scores into averaging weights
tril = torch.ones(T, T).tril()
print(f'tril: {tril}')

wei = torch.zeros(T, T)
print(f'\nwei: {wei}')

# entries above the diagonal get -inf, which softmax maps to a weight of 0
above_diagonal = tril == 0
wei = wei.masked_fill(above_diagonal, float('-inf'))
print(f'\nwei after masking: {wei}')

# each row becomes a uniform distribution over its unmasked entries
wei = F.softmax(wei, dim=-1)
print(f'\nwei after softmax: {wei}')

xbow3 = torch.matmul(wei, x)
print(f'\nxbow3 batch 0: {xbow3[0]}')

tril: tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

wei: tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

wei after masking: tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -