In [123]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
import pandas as pd

In [124]:
cities = pd.read_csv("cities_raw.csv")["city_en"].tolist()

split = int(0.9*len(cities))
train_data = cities[:split]
val_data = cities[split:]
print(train_data[:4])
print('train:', len(train_data))
print('val:', len(val_data))

['sapporo', 'chūō-ku', 'kita-ku', 'higashi-ku']
train: 1723
val: 192


In [125]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(cities))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0 # start and finish char
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

def encode(s: str):
  return [stoi[c] for c in s]

def decode(ints: list[int]):
  return ''.join(itos[i] for i in ints)

decode(encode(cities[3]))

{1: '-', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'w', 23: 'y', 24: 'z', 25: 'ō', 26: 'ū', 0: '.'}
27


'higashi-ku'

In [126]:
# What uniform distribution looks like

for _ in range(5):
    word = []
    while True:
        ix = torch.randint(vocab_size, (1,)).item()
        char = itos[ix]
        if char == '.':
            if len(word) > 0: # only break if we have some chars
                break
            continue
        word.append(char)
    print(''.join(word))

ahebwa
pizbūbōmbucjtlycōsigūdwr-iuhoppojmkjandybōo
klamatbzu-kkykwoūrf
-ūymtpōmūabūdaegarcaybfyū-ujtcycūawwōwowujōūbydichō
gōcwunjuerdtcsiha-kwnsōahcraigrbholprotag-imjwssyhpinsaōfdeūyzswmopt-pbdc-aūbzotjcnfkcetsoznwsūfkcndkzelas


In [127]:
from typing import Literal

class DatasetManager:
    def __init__(self, train_data, val_data, block_size, batch_size):
        self.block_size = block_size
        self.batch_size = batch_size
        self.train_dataset = self._build_dataset(train_data)
        self.val_dataset = self._build_dataset(val_data)

    def _build_dataset(self, data):
        X, Y = [], []
        for w in data:
            encoding = encode(w + '.')
            context = encode('.') * self.block_size
            for idx in encoding:
                X.append(context)
                Y.append(idx)
                context = context[1:] + [idx]
        return torch.tensor(X), torch.tensor(Y)

    def get_batch(self, split: Literal["train", "val"]):
        data = self.train_dataset if split == "train" else self.val_dataset
        ix = torch.randint(len(data[0]), (self.batch_size,))
        return data[0][ix], data[1][ix]

    def estimate_loss(self, model, eval_iters=200):
        out = {}
        model.eval()
        with torch.no_grad():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = self.get_batch(split)
                    logits, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
        model.train()
        return out

In [128]:
# Example usage:
db = DatasetManager(train_data, val_data, block_size=6, batch_size=8)
xbatch, ybatch = db.get_batch("train")

for x, y in list(zip(xbatch, ybatch)):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])

.....t --> a
otakad --> a
..unna --> n
..ibar --> a
...wak --> a
.....e --> t
..yama --> g
..yama --> t


In [129]:
from torch import nn
from torch.nn import functional as F

# Some terminology:
# Batch (32): Different training examples processed in parallel
#   - Each batch contains 32 different sequences we're training on

# Time (4): Sequence positions (your block_size)
#   - For city "Tokyo.", with block_size=4:
#   - "...." -> "T"
#   - "...T" -> "o"
#   - "..To" -> "k"
#   - ".Tok" -> "y"

# Classes (20): Your vocabulary size (possible characters)
#   - Each position outputs scores for all possible next characters
#   - If vocab is ['.','a','b','c',...], then each position predicts
#     probabilities for each character being next

# HYPERPARAMETERS
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class SimpleMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

simple_mlp = SimpleMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  6281


In [42]:
# Initial (completely random):
simple_mlp.generate(6)

cydi.
ueig-ōr.
wuruicydgaku.
ōūōwtffdpiksizljddiisa.
ōwkkciōfwa.
nmn.


In [43]:
# TRAIN Simple MLP
from dataclasses import dataclass

@dataclass
class LearningInterval():
    lr: int
    iters: int

def train_model(model, schedules, eval_interval=2000):
    for i, sch in enumerate(schedules):
        print(f"SCHEDULE {i+1}/{len(schedules)}: lr={sch.lr}, iters={sch.iters}")

        # create a PyTorch optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=sch.lr)

        for cur_iter in range(sch.iters):

            # every once in a while evaluate the loss on train and val sets
            if cur_iter % eval_interval == 0 or cur_iter == sch.iters - 1:
                losses = db.estimate_loss(model)
                print(f"iter {cur_iter + 1}/{sch.iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            # sample a batch of data
            xb, yb = db.get_batch('train')

            # evaluate the loss
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

# Example schedule
SAME_SCHEDULE = [
    LearningInterval(1e-2, 6_000),
    LearningInterval(1e-3, 10_000), 
    LearningInterval(1e-4, 14_000),
    LearningInterval(1e-5, 6_000),
]

train_model(simple_mlp, SAME_SCHEDULE)


SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2953, val loss 3.2979
iter 2001/6000: train loss 1.8181, val loss 1.9482
iter 4001/6000: train loss 1.7779, val loss 1.9859
iter 6000/6000: train loss 1.7836, val loss 1.9999
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.7885, val loss 1.9992
iter 2001/10000: train loss 1.6962, val loss 1.9438
iter 4001/10000: train loss 1.6829, val loss 1.9293
iter 6001/10000: train loss 1.6901, val loss 1.9114
iter 8001/10000: train loss 1.6906, val loss 1.9296
iter 10000/10000: train loss 1.6902, val loss 1.9211
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.6577, val loss 1.9431
iter 2001/14000: train loss 1.6649, val loss 1.9265
iter 4001/14000: train loss 1.6705, val loss 1.9122
iter 6001/14000: train loss 1.6821, val loss 1.9257
iter 8001/14000: train loss 1.6694, val loss 1.9578
iter 10001/14000: train loss 1.6364, val loss 1.9587
iter 12001/14000: train loss 1.6601, val loss 1.9525
iter 14000/14000: t

In [44]:
simple_mlp.generate(20)

hiradami.
mitsutai.
anooshi.
kininehitashi.
akura.
shi-ku.
toka.
okumakawa.
hinawa.
neimu.
urida.
mimahi-ku.
murutsu.
wakasawa.
shōtai-ku.
sukuika.
onohachō.
totagawa.
minamo.
kawako.


In [45]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 10
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 300  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class BigMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

big_mlp = BigMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  6281


In [46]:
train_model(big_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2970, val loss 3.2992
iter 2001/6000: train loss 1.8338, val loss 2.1074
iter 4001/6000: train loss 1.8030, val loss 1.9803
iter 6000/6000: train loss 1.8246, val loss 2.0572
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.7896, val loss 2.0966
iter 2001/10000: train loss 1.6565, val loss 1.9767
iter 4001/10000: train loss 1.6577, val loss 1.9882
iter 6001/10000: train loss 1.6411, val loss 1.9671
iter 8001/10000: train loss 1.6474, val loss 1.9716
iter 10000/10000: train loss 1.6599, val loss 1.9877
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.6539, val loss 1.9966
iter 2001/14000: train loss 1.6338, val loss 1.9584
iter 4001/14000: train loss 1.6404, val loss 1.9639
iter 6001/14000: train loss 1.6477, val loss 1.9672
iter 8001/14000: train loss 1.6324, val loss 1.9332
iter 10001/14000: train loss 1.6463, val loss 1.9564
iter 12001/14000: train loss 1.6453, val loss 1.9517
iter 14000/14000: t

In [47]:
big_mlp.generate(20)

nomefunza.
takishika.
mōfunue.
tasakagaō.
chiratauno.
abachō.
osankawa.
izumozuwa.
tariya.
kotsuyoi.
nomanoma.
menoka.
echiō.
minami-ku.
fukuto.
itashi-ku.
jimishinōman.
nurahira.
shiwa.
takamanaka.


In [48]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class NonLinearMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

nonlin_mlp = NonLinearMLP()
total_params = sum(p.numel() for p in nonlin_mlp.parameters())
print("Params: ", total_params)

Params:  43613


In [49]:
train_model(nonlin_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.3095, val loss 3.3080
iter 2001/6000: train loss 1.7771, val loss 1.9692
iter 4001/6000: train loss 1.7233, val loss 2.0189
iter 6000/6000: train loss 1.6491, val loss 1.9498
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.6925, val loss 1.9334
iter 2001/10000: train loss 1.4947, val loss 2.0130
iter 4001/10000: train loss 1.4474, val loss 2.1456
iter 6001/10000: train loss 1.3806, val loss 2.1756
iter 8001/10000: train loss 1.3380, val loss 2.3432
iter 10000/10000: train loss 1.2977, val loss 2.3887
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.3269, val loss 2.3864
iter 2001/14000: train loss 1.3001, val loss 2.4162
iter 4001/14000: train loss 1.2642, val loss 2.4679
iter 6001/14000: train loss 1.2730, val loss 2.5201
iter 8001/14000: train loss 1.2793, val loss 2.5331
iter 10001/14000: train loss 1.2660, val loss 2.5259
iter 12001/14000: train loss 1.2466, val loss 2.6467
iter 14000/14000: t

In [50]:
nonlin_mlp.generate(20)

uba.
toyonaka.
kizugawa.
nama-ku.
kita-ku.
igate.
hamagai.
waki.
okaoda.
hakuya.
katana.
yamagata.
yuofuto.
yogata.
wanjō.
date.
yūhonan.
tomato-ku.
kikube.
higashikuno.


In [51]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class BN_MLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

bnmlp = BN_MLP()
total_params = sum(p.numel() for p in bnmlp.parameters())
print("Params: ", total_params)

Params:  20629


In [52]:
train_model(bnmlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2749, val loss 3.2759
iter 2001/6000: train loss 1.6942, val loss 1.9078
iter 4001/6000: train loss 1.6410, val loss 1.9056
iter 6000/6000: train loss 1.5280, val loss 1.8565
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.5207, val loss 1.8551
iter 2001/10000: train loss 1.3921, val loss 1.9235
iter 4001/10000: train loss 1.3512, val loss 1.9315
iter 6001/10000: train loss 1.3413, val loss 1.9574
iter 8001/10000: train loss 1.3116, val loss 2.0164
iter 10000/10000: train loss 1.2807, val loss 2.0214
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.2552, val loss 1.9973
iter 2001/14000: train loss 1.2598, val loss 1.9348
iter 4001/14000: train loss 1.2490, val loss 1.9595
iter 6001/14000: train loss 1.2612, val loss 2.0260
iter 8001/14000: train loss 1.2526, val loss 2.0257
iter 10001/14000: train loss 1.2355, val loss 2.0557
iter 12001/14000: train loss 1.2528, val loss 2.0525
iter 14000/14000: t

In [53]:
bnmlp.eval()
bnmlp.generate(20)

nakaizaku.
hori.
higashikage.
shiroichi.
akaita.
sashinae.
chinakawa.
yunagi.
sakawa.
miyada.
ribu.
tagabu.
kariya.
asakinose.
shiriizu.
matsuka.
ubachi.
makkura.
yotsukawa.
mitaka.


In [95]:
from torch import nn

# HYPERPARAMETERS (same as original)
block_size = 20
batch_size = 40
n_embd = 64    # embedding dim (increased)
n_head = 4

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class TransformerNet(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        
        # Stack 6 transformer layers
        self.blocks = nn.Sequential(*[
            nn.TransformerDecoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4*n_embd,
                dropout=0.1,
                batch_first=True
            ) for _ in range(6)
        ])
        
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.ln_final = nn.LayerNorm(n_embd)

    def forward(self, x, targets=None):
        B, T = x.shape
        
        # Create embeddings (unchanged)
        tok_emb = self.token_embedding(x)
        pos_emb = self.position_embedding(torch.arange(T, device=x.device))
        x = tok_emb + pos_emb
        
        mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)
        
        # Pass through transformer layers (unchanged)
        memory = torch.zeros_like(x)
        for block in self.blocks:
            x = block(x, memory, tgt_mask=mask)
        
        # Use all positions instead of just last one
        logits = self.lm_head(x)  # Now shape is [batch_size, seq_len, vocab_size]
        
        if targets is None:
            loss = None
        else:
            # Reshape logits and targets for cross_entropy
            B, T, V = logits.shape  # V is vocab_size
            logits = logits.view(-1, V)  # flatten batch and time dims
            targets = targets.view(-1)    # flatten batch and time dims
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    def generate(self, number_of_cities):
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size

            while True:
                x = torch.tensor([context])
                logits, _ = self(x)
                # For generation, we only need last position
                last_logits = logits[:, -1, :]  # Add this line
                probs = F.softmax(last_logits, dim=-1)
                ix = torch.multinomial(probs, num_samples=1).item()
                context = context[1:] + [ix]
                out.append(ix)
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))

transformer = TransformerNet()
total_params = sum(p.numel() for p in transformer.parameters())
print("Params: ", total_params)

Params:  405403


In [96]:
transformer.generate(5) # untrained transformer

fsōbjuōjzlbshaojmtl.
ulczōrzjūōwmekjsrdljhooameōzōegzhōaaaiōyuru.
jmyrkjsūunzhbfpjaōjzogōgbh.
jegeimōwlyhhpjwwyiiiujūei.
ōblhjmewrwzjmypcuoūhhanhfmajūsjbmiwf-ewhhwzcōztucgiazrjmljjwūbezūmōmmgdwtz-wwgsfwnzjmōmsiwbyupdzckōbōnswōmrūoc.


In [97]:
from typing import Literal

class TransformerDatasetManager:
    def __init__(self, train_data, val_data, block_size, batch_size):
        self.block_size = block_size
        self.batch_size = batch_size
        self.train_dataset = self._build_dataset(train_data)
        self.val_dataset = self._build_dataset(val_data)

    def _build_dataset(self, data):
        X, Y = [], []
        for w in data:
            # OLD: Built sequences token by token
            # encoding = encode(w + '.')
            # context = encode('.') * self.block_size
            # for idx in encoding:
            #     X.append(context)
            #     Y.append(idx)
            #     context = context[1:] + [idx]

            # NEW: Build full sequences at once
            encoding = encode('.' + w + '.')  # Add start/end tokens
            if len(encoding) > self.block_size:
                print("Skipped :", w)
                continue  # Skip if too long
                
            # Pad sequence to block_size with start tokens
            padding = self.block_size - len(encoding)
            encoding = encode('.') * padding + encoding
            
            # OLD: X was context, Y was next token
            # NEW: X is all tokens except last, Y is all tokens except first
            # Example for "cat" with block_size=6:
            # encoding = [.,.,.,.,c,a,t,.]
            # X = [.,.,.,.,c,a,t]
            # Y = [.,.,.,c,a,t,.]
            X.append(encoding[:-1])
            Y.append(encoding[1:])
            
        # OLD: Each X,Y was single context->token prediction
        # NEW: Each X,Y is full sequence prediction
        return torch.tensor(X), torch.tensor(Y)

    def get_batch(self, split: Literal["train", "val"]):
        # Same as before, but now each batch contains full sequences
        # instead of single token predictions
        data = self.train_dataset if split == "train" else self.val_dataset
        ix = torch.randint(len(data[0]), (self.batch_size,))
        return data[0][ix], data[1][ix]

    def estimate_loss(self, model, eval_iters=200):
        # Same function, but now evaluating on full sequence predictions
        out = {}
        model.eval()
        with torch.no_grad():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = self.get_batch(split)
                    logits, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
        model.train()
        return out

# Example shapes for each approach:
# Original DatasetManager:
# X shape: [batch_size, block_size]       # Each X is one context
# Y shape: [batch_size]                   # Each Y is one target token

# TransformerDatasetManager:
# X shape: [batch_size, block_size-1]     # Each X is full sequence minus last token
# Y shape: [batch_size, block_size-1]     # Each Y is full sequence minus first token

db = TransformerDatasetManager(train_data, val_data, block_size, batch_size)

def train_transformer(model, schedules, eval_interval=2000):
    for i, sch in enumerate(schedules):
        print(f"SCHEDULE {i+1}/{len(schedules)}: lr={sch.lr}, iters={sch.iters}")

        optimizer = torch.optim.AdamW(model.parameters(), lr=sch.lr)

        for cur_iter in range(sch.iters):
            # Evaluate periodically
            if cur_iter % eval_interval == 0 or cur_iter == sch.iters - 1:
                losses = db.estimate_loss(model)
                print(f"iter {cur_iter + 1}/{sch.iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            # Get batch and train
            xb, yb = db.get_batch('train')
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

Skipped : higashisumiyoshi-ku


In [98]:
# Print first few examples from training data
print("Training data examples:")
for i in range(3):
    x, y = db.get_batch('train')
    print(f"\nExample {i+1}:")
    print(f"X: {''.join(itos[j.item()] for j in x[0])}")
    print(f"Y: {''.join(itos[j.item()] for j in y[0])}")


Training data examples:

Example 1:
X: ...........hamanaka
Y: ..........hamanaka.

Example 2:
X: ...........ichikawa
Y: ..........ichikawa.

Example 3:
X: ............chitose
Y: ...........chitose.


In [86]:
# Example schedule
TRANSFORMER_SCHEDULE = [
    LearningInterval(1e-3, 10),
    LearningInterval(1e-4, 10), 
    LearningInterval(1e-5, 10),
]

train_transformer(transformer, TRANSFORMER_SCHEDULE)

SCHEDULE 1/3: lr=0.001, iters=10
iter 1/10: train loss 0.1653, val loss 0.2545
iter 2/10: train loss 0.1654, val loss 0.2575
iter 3/10: train loss 0.1667, val loss 0.2560
iter 4/10: train loss 0.1672, val loss 0.2572
iter 5/10: train loss 0.1672, val loss 0.2592
iter 6/10: train loss 0.1653, val loss 0.2587
iter 7/10: train loss 0.1671, val loss 0.2574
iter 8/10: train loss 0.1691, val loss 0.2539
iter 9/10: train loss 0.1698, val loss 0.2533
iter 10/10: train loss 0.1679, val loss 0.2536
SCHEDULE 2/3: lr=0.0001, iters=10
iter 1/10: train loss 0.1666, val loss 0.2531
iter 2/10: train loss 0.1666, val loss 0.2488
iter 3/10: train loss 0.1660, val loss 0.2510
iter 4/10: train loss 0.1639, val loss 0.2488
iter 5/10: train loss 0.1650, val loss 0.2505
iter 6/10: train loss 0.1637, val loss 0.2539
iter 7/10: train loss 0.1645, val loss 0.2512
iter 8/10: train loss 0.1633, val loss 0.2545
iter 9/10: train loss 0.1630, val loss 0.2500
iter 10/10: train loss 0.1627, val loss 0.2523
SCHEDULE 3/

In [87]:
transformer.generate(20)

nakakantassa-chisaptasatshisasukachizatamhakakatshakagatshihi---kawashagasutttshishasasuratatsakakashimmatshicsuratshrachinatshatsuchisr--shinshir-ushimizashittttsffushisshitashitshinotō-kackasushi-kchachshakkahimizata-chitshizatashitzashashakakamim-chatsatshim-kuchyatsashchi--kachichishashichchichikuchidachinttsachichinshikata-shitshitatshikakashisashikas--katadanasatasa-a.
yonanathamanwamamachchitshisemirachiyatshitamruchatshitsurus.
.
kakamashakatanarasasashisakamusatsatakasasamatshingag--shisatashitsakuchi-chidatshatshimetshikadamichshishihitttyachiruntashamashatshijim-shichshitshitchōtshich-kaschikashikash.
.
kimisagatsarakashizsazatsakamizatorur--chimarusamihatamshatshichchichitsmizas-yatotshirmidadachichchintshikus.
.
akazasatasamzagammigatshamimayashi-chimma-chikatsatshimisurachichetshikchigakakatshizushsusathiferhimat--kakushasha-shimihigashintshijōtsō-shi-kkusharatamitsachichitshichintatshachiwatashichichetshichashichisj-ōfusakasatmishizatatatatsetsukuchimiūdetshikushatat-kas

In [205]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 8
batch_size = 32
n_embd = 10    # embedding dim
n_hidden = 100  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.LayerNorm(dim),
            nn.Dropout(0.5)
        )
    
    def forward(self, x):
        return x + self.net(x)  # Residual connection
        

class ResidualMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden), nn.ReLU(), nn.LayerNorm(n_hidden),
            nn.Linear(n_hidden, n_hidden * 2), nn.ReLU(),
            ResidualBlock(n_hidden * 2),
            ResidualBlock(n_hidden * 2),
            nn.Linear(n_hidden * 2, n_hidden), nn.ReLU(), nn.LayerNorm(n_hidden),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        self.net.eval()
        with torch.no_grad():
            # idx is (B, T) array of indices in the current context
            for _ in range(number_of_cities):
                out = []
                context = [0] * block_size 

                while True:
                    # forward pass the neural net
                    logits = self.net(torch.tensor([context]))
                    probs = F.softmax(logits, dim=1)
                    # sample from the distribution
                    ix = torch.multinomial(probs, num_samples=1).item()
                    # shift the context window and track the samples
                    context = context[1:] + [ix]
                    out.append(ix)
                    # if we sample the special '.' token, break
                    if ix == 0:
                        break

                print(''.join(itos[i] for i in out))  # decode and print the generated word
        self.net.train()

residual_mlp = ResidualMLP()
total_params = sum(p.numel() for p in residual_mlp.parameters())
print("Params: ", total_params)

Params:  132997


In [206]:
train_model(residual_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.3304, val loss 3.3305
iter 2001/6000: train loss 1.7014, val loss 1.9889
iter 4001/6000: train loss 1.5289, val loss 1.9528
iter 6000/6000: train loss 1.4297, val loss 2.0595
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.4062, val loss 2.0297
iter 2001/10000: train loss 1.1897, val loss 2.1425
iter 4001/10000: train loss 1.1350, val loss 2.2845
iter 6001/10000: train loss 1.0861, val loss 2.4574
iter 8001/10000: train loss 1.0418, val loss 2.5984
iter 10000/10000: train loss 1.0639, val loss 2.6312
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.0320, val loss 2.6588
iter 2001/14000: train loss 1.0219, val loss 2.6895
iter 4001/14000: train loss 1.0256, val loss 2.7816
iter 6001/14000: train loss 1.0163, val loss 2.7211
iter 8001/14000: train loss 0.9715, val loss 2.7710
iter 10001/14000: train loss 0.9627, val loss 2.7767
iter 12001/14000: train loss 0.9814, val loss 2.7740
iter 14000/14000: t

In [207]:
residual_mlp.generate(20)
# damn overfitting

iwade.
shinshinotsu.
mishima.
toyooka.
kamishihoro.
samamue.
kaisei.
ide.
yurihama.
nameya.
sekigahara.
takatsuyama.
okutama.
seki.
aoko.
ichinomiya.
iwada.
nan-ikashasu.
yotsukazu.
tochigi.


In [477]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 10
batch_size = 40
n_embd = 24    # embedding dim
n_hidden = 150  # hidden layer size


cities = pd.read_csv("cities_raw.csv")["city_en"].tolist()

# Randomly split into train/val
indices = torch.randperm(len(cities))
split = int(0.9*len(cities))
train_data = [cities[i] for i in indices[:split]]
val_data = [cities[i] for i in indices[split:]]

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class FinalMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

fmlp = FinalMLP()
total_params = sum(p.numel() for p in fmlp.parameters())
print("Params: ", total_params)

Params:  65307


In [478]:
schedule = [
    LearningInterval(1e-2, 10_000),
    LearningInterval(1e-3, 15_000), 
    LearningInterval(1e-4, 14_000),
    LearningInterval(1e-5, 25_000),
]


train_model(fmlp, schedule)

SCHEDULE 1/4: lr=0.01, iters=10000
iter 1/10000: train loss 3.3042, val loss 3.3050
iter 2001/10000: train loss 1.8143, val loss 1.9363
iter 4001/10000: train loss 1.7970, val loss 1.9617
iter 6001/10000: train loss 1.7888, val loss 1.9405
iter 8001/10000: train loss 1.8206, val loss 1.9744
iter 10000/10000: train loss 1.7505, val loss 1.9217
SCHEDULE 2/4: lr=0.001, iters=15000
iter 1/15000: train loss 1.7590, val loss 1.9129
iter 2001/15000: train loss 1.6140, val loss 1.8209
iter 4001/15000: train loss 1.5898, val loss 1.8312
iter 6001/15000: train loss 1.5866, val loss 1.8411
iter 8001/15000: train loss 1.5846, val loss 1.8374
iter 10001/15000: train loss 1.5910, val loss 1.8438
iter 12001/15000: train loss 1.5767, val loss 1.8437
iter 14001/15000: train loss 1.5802, val loss 1.8495
iter 15000/15000: train loss 1.5713, val loss 1.8663
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.5953, val loss 1.8747
iter 2001/14000: train loss 1.5903, val loss 1.8429
iter 4001/14

In [482]:
fmlp.generate(20)

akadai.
azumi.
ikaonari.
chikise.
ōfu.
kawakami-ku.
rakume.
hichinohiya.
saki.
kuchihodo.
uzawa.
kishi-ku.
inkoppppott.
kusokuri.
manda.
namae.
omasa.
kantō.
tsuka.
nosaki-ku.
