<a href="https://colab.research.google.com/github/dominiksakic/zero_to_hero/blob/main/basics_06_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- source: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=126s

In [54]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [5]:
# get data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-07-14 10:42:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-07-14 10:42:32 (17.9 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [6]:
# Read the downloaded corpus into one string (text mode is open()'s default)
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

# Sanity check: show the first 100 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [69]:
# Build the character-level vocabulary and the encoder/decoder pair
chars = sorted(set(text))  # sorted() accepts any iterable, no intermediate list needed
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


print(encode(text[:50]))
print(decode(encode(text[:50])))
print(f"Vocav size: {vocab_size}")

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56]
First Citizen:
Before we proceed any further, hear
Vocav size: 65


In [57]:
# Tokenize the whole corpus into a 1-D tensor of ids, then split it into train/val
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the tokens are used for training, the rest for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [58]:
# Excursion: how a single chunk of text yields several next-token examples
block_size = 8

# y is x shifted one position to the right, so y[t] is the token that follows x[:t+1]
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # the context grows by one token per step
    context, target = x[:t + 1], y[t]
    print(f'timestep {t}: when input is {context} the target is {target}')

timestep 0: when input is tensor([18]) the target is 47
timestep 1: when input is tensor([18, 47]) the target is 56
timestep 2: when input is tensor([18, 47, 56]) the target is 57
timestep 3: when input is tensor([18, 47, 56, 57]) the target is 58
timestep 4: when input is tensor([18, 47, 56, 57, 58]) the target is 1
timestep 5: when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
timestep 6: when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
timestep 7: when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


  - Result: the model learns to predict the next token from contexts of varying length,
  - from a single character up to `block_size` (8) characters.


In [59]:
# Let's make the example more complex by introducing a batch dimension.
# Seed the global RNG so the randomly drawn batch below is the same on every run.
torch.manual_seed(1337)
# number of sequences stacked into one batch (see get_batch)
batch_size = 4

def get_batch(split):
    """Draw a random mini-batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        ``(x, y)``, each of shape (batch_size, block_size). ``y`` holds the same
        windows as ``x`` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')

# Walk every sequence in the batch and every prefix length within it
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"Batch {b}: when context is {context}, target is {target}")
    print("\n")

Batch 0: when context is tensor([24]), target is 43
Batch 0: when context is tensor([24, 43]), target is 58
Batch 0: when context is tensor([24, 43, 58]), target is 5
Batch 0: when context is tensor([24, 43, 58,  5]), target is 57
Batch 0: when context is tensor([24, 43, 58,  5, 57]), target is 1
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1]), target is 46
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1, 46]), target is 43
Batch 0: when context is tensor([24, 43, 58,  5, 57,  1, 46, 43]), target is 39


Batch 1: when context is tensor([44]), target is 53
Batch 1: when context is tensor([44, 53]), target is 56
Batch 1: when context is tensor([44, 53, 56]), target is 1
Batch 1: when context is tensor([44, 53, 56,  1]), target is 58
Batch 1: when context is tensor([44, 53, 56,  1, 58]), target is 46
Batch 1: when context is tensor([44, 53, 56,  1, 58, 46]), target is 39
Batch 1: when context is tensor([44, 53, 56,  1, 58, 46, 39]), target is 58
Batch 1: when context is 

In [68]:
# create a baseline model/bigram
# Re-seed the global RNG so the random embedding initialisation and the sampled
# text below are reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Baseline bigram language model.

    The only parameter is a ``vocab_size x vocab_size`` embedding table: looking up
    a token id returns that row, which is used directly as the logits over the
    next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape (B, T, C) and
            ``loss`` is None. With targets, ``logits`` is returned flattened to
            (B*T, C), the layout handed to ``F.cross_entropy``, and ``loss`` is
            the resulting cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy is given one row of class scores per prediction
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor holding the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions; the loss is None here because no targets are passed
            logits = self(idx)[0]
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Instantiate the untrained baseline and run one forward pass on the sample batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.item())

# Sample 100 tokens starting from a single token with id 0 and decode them to text
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0].tolist()
    )
)

torch.Size([32, 65])
4.878634929656982

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
# train the baseline

# Transformers


In [None]:
# Genesis of the Transformer
# Averaging past context with for loops
# Matrix multiplying as weighted aggregation
# Final version