# Assignment 3: Transformer is All You Need

Federico Giorgi (fg2617)

## Basics

In [1]:
# Import all the libraries
import os, math, torch
from dataclasses import dataclass
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [None]:
# ========= 0) Load Tiny Shakespeare =========
# If you already have it locally, set TINY_PATH to that file.
# Otherwise, download once from the classic URL.
TINY_PATH = "tiny_shakespeare.txt"
if not os.path.exists(TINY_PATH):
    import urllib.request

    # Download to a sibling ".part" file and rename it into place only after
    # the transfer has finished. Writing straight to TINY_PATH would let an
    # interrupted download leave a truncated file behind, which the existence
    # check above would then accept as the full corpus on the next run.
    partial_path = TINY_PATH + ".part"
    urllib.request.urlretrieve(
        "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
        partial_path,
    )
    os.replace(partial_path, TINY_PATH)

# Read the whole corpus into memory as a single string.
with open(TINY_PATH, "r", encoding="utf-8") as f:
    corpus_text = f.read()

# ========= 1) Train a subword tokenizer (BPE) with vocab <= 500 =========
# Reserved tokens are passed to the trainer alongside the vocabulary budget.
special_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=500,
    min_frequency=2,
    special_tokens=special_tokens,
)
# The corpus is supplied as a one-element iterable: the full text as one string.
tokenizer.train_from_iterator([corpus_text], trainer=trainer)

# Integer IDs the trained tokenizer reports for the reserved tokens.
pad_id, bos_id, eos_id = map(tokenizer.token_to_id, ("[PAD]", "[BOS]", "[EOS]"))
vocab_size = tokenizer.get_vocab_size()

# Turn the whole corpus into one flat list of integer token IDs.
ids = tokenizer.encode(corpus_text).ids

# ========= 2) Sequence formatting: overlapping fixed-length windows =========
# Next-token prediction setup: the input is N consecutive tokens and the
# target is that same span shifted forward by one token.
SEQ_LEN = 50  # N in your spec
def make_windows(token_ids, seq_len, stride=1):
    """Slice a token sequence into overlapping (input, target) training pairs.

    Each pair is cut from a chunk of ``seq_len + 1`` consecutive tokens: the
    input is the first ``seq_len`` tokens of the chunk and the target is the
    same chunk shifted one position to the right, so ``target[k]`` is the
    token that follows ``input[k]``.

    Args:
        token_ids: Sliceable sequence of token IDs (e.g. a list of ints).
        seq_len: Number of tokens in every input and every target (>= 1).
        stride: Distance between the start positions of consecutive windows.
            The default of 1 produces maximally overlapping windows.

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists of slices.
        Both lists are empty when ``token_ids`` has fewer than ``seq_len + 1``
        tokens.

    Raises:
        ValueError: If ``seq_len`` or ``stride`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    # One extra token per chunk so the target can be shifted by one position.
    chunk_len = seq_len + 1
    # Last start index that still leaves room for a complete chunk. When the
    # sequence is too short this is negative and the range below is empty.
    last_start = len(token_ids) - chunk_len

    inputs = []
    targets = []
    for start in range(0, last_start + 1, stride):
        chunk = token_ids[start : start + chunk_len]
        inputs.append(chunk[:-1])
        targets.append(chunk[1:])
    return inputs, targets

# Cut the encoded corpus into (input, target) window pairs.
inputs, targets = make_windows(ids, seq_len=SEQ_LEN)

# ========= 3) 80/20 split (on sequence-pairs) =========
# Training receives the truncated 80% share; validation takes the remainder,
# so the two sizes always add up to the full number of pairs.
dataset_size = len(inputs)
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size

class NextTokenDataset(Dataset):
    """Map-style dataset of aligned (input, target) token-ID sequences.

    Args:
        X: Iterable of input token-ID sequences.
        Y: Iterable of target token-ID sequences; ``Y[i]`` belongs to ``X[i]``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of
            sequences.
    """

    def __init__(self, X, Y):
        # Convert every sequence to a LongTensor once, up front, so that
        # __getitem__ is a plain list lookup.
        self.X = [torch.tensor(x, dtype=torch.long) for x in X]
        self.Y = [torch.tensor(y, dtype=torch.long) for y in Y]
        # Checked after conversion so X and Y may be any iterables, including
        # generators. Without this check a mismatch would only surface later
        # as an IndexError (or silently ignored samples) during iteration.
        if len(self.X) != len(self.Y):
            raise ValueError(
                f"X and Y must have the same number of sequences, "
                f"got {len(self.X)} and {len(self.Y)}"
            )

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``i``-th pair as a tuple ``(input_tensor, target_tensor)``."""
        return self.X[i], self.Y[i]

# Wrap every window pair in one dataset, then carve out train/val subsets.
full_ds = NextTokenDataset(inputs, targets)

# A generator seeded with a fixed value is handed to random_split.
# NOTE(review): the windows come from make_windows with stride 1, so
# neighbouring windows share almost all of their tokens. Splitting them at
# random therefore puts near-identical sequences on both sides of the split,
# and validation loss is likely to look better than true held-out performance.
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(full_ds, [train_size, val_size], generator=split_rng)

# ========= 4) DataLoaders =========
BATCH_SIZE = 128
def collate_batch(batch):
    """Combine a list of (input, target) tensor pairs into two batched tensors.

    Every sequence is expected to have the same length already, so the
    tensors are stacked directly without any padding.

    Args:
        batch: List of ``(input_tensor, target_tensor)`` pairs.

    Returns:
        A tuple ``(X, Y)`` where each tensor has shape (B, T).
    """
    stacked_inputs = torch.stack([pair[0] for pair in batch])   # (B, T)
    stacked_targets = torch.stack([pair[1] for pair in batch])  # (B, T)
    return stacked_inputs, stacked_targets

# Training loader: shuffling enabled, incomplete final batch dropped.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_batch,
)
# Validation loader: no shuffling, and the final partial batch is kept.
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_batch,
)

# ========= 5) Token embeddings + positional encodings =========
# Option A (learned positions): nn.Embedding for both tokens and positions
class TokenPosEmbedding(nn.Module):
    """Token embedding summed with a learned absolute-position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector.
        max_len: Number of rows in the position table, i.e. the longest
            sequence length ``forward`` accepts.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.tok = nn.Embedding(vocab_size, d_model)
        self.pos = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Embed a batch of token IDs and add the position embeddings.

        Args:
            x: Integer tensor of token IDs with shape (B, T).

        Returns:
            Tensor of shape (B, T, d_model).

        Raises:
            ValueError: If T is larger than the ``max_len`` given at
                construction time.
        """
        _, seq_len = x.size()  # unpacking also enforces a 2-D input
        max_len = self.pos.num_embeddings
        # Fail early with a readable message instead of letting the position
        # lookup hit an out-of-range index inside the embedding kernel.
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len} "
                f"of the position embedding"
            )
        positions = torch.arange(seq_len, device=x.device)  # (T,)
        # (T, d_model) broadcasts over the batch dimension of (B, T, d_model),
        # so the position indices need not be expanded to (B, T).
        return self.tok(x) + self.pos(positions)

# Option B (sinusoidal positions): classic transformer-style fixed encodings
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-learned) sine/cosine positional encoding.

    A table of shape (max_len, d_model) is precomputed: even columns hold
    ``sin(position * freq)`` and odd columns hold ``cos(position * freq)``,
    with ``freq = 10000 ** (-2k / d_model)`` for column pair ``k``. The table
    is registered as the buffer ``pe``, so it follows ``.to(device)`` calls.

    Args:
        d_model: Width of the embeddings the encoding is added to. Both even
            and odd values are supported.
        max_len: Number of positions to precompute, i.e. the longest sequence
            length ``forward`` accepts.
    """

    def __init__(self, d_model, max_len=10000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # the cosine half is trimmed to d_model // 2 columns; for even d_model
        # the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)  # (max_len, d_model)

    def forward(self, x):
        """Add the positional encoding to a batch of embeddings.

        Args:
            x: Tensor of token embeddings with shape (B, T, d_model).

        Returns:
            Tensor of shape (B, T, d_model).

        Raises:
            ValueError: If T is larger than the ``max_len`` given at
                construction time.
        """
        T = x.size(1)
        # Slicing past the end of the table would silently return fewer than
        # T rows, which then either fails to broadcast or (for a single-row
        # table) broadcasts position 0 over every time step.
        if T > self.pe.size(0):
            raise ValueError(
                f"sequence length {T} exceeds max_len {self.pe.size(0)} "
                f"of the positional encoding"
            )
        return x + self.pe[:T].unsqueeze(0)

# Example: push a single training batch through both embedding variants.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
d_model = 256
max_len = SEQ_LEN  # every window has exactly SEQ_LEN tokens

tokpos = TokenPosEmbedding(vocab_size, d_model, max_len).to(device)
sinpos = SinusoidalPositionalEncoding(d_model, max_len).to(device)

first_batch = next(iter(train_loader))
xb, yb = first_batch  # (B, T), (B, T)
xb = xb.to(device)

# Learned positions: token and position embeddings summed inside the module.
emb_tokpos = tokpos(xb)  # (B, T, d_model)
# Sinusoidal positions: reuse only the token table, then add the fixed encoding.
token_only = tokpos.tok(xb)
emb_sin = sinpos(token_only)  # (B, T, d_model)

print("Vocab size:", vocab_size)
print("Train batches:", len(train_loader), "Val batches:", len(val_loader))
print("Embedded shapes (learned / sinusoidal):", emb_tokpos.shape, emb_sin.shape)

# Either emb_tokpos or emb_sin can be fed to the model from here. For
# next-token prediction the usual loss is cross-entropy between logits of
# shape (B, T, vocab_size) and targets of shape (B, T).
