In [271]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
import pandas as pd

In [272]:
cities = pd.read_csv("cities_raw.csv")["city_en"].tolist()

split = int(0.9*len(cities))
train_data = cities[:split]
val_data = cities[split:]
print(train_data[:4])
print('train:', len(train_data))
print('val:', len(val_data))

['sapporo', 'chūō-ku', 'kita-ku', 'higashi-ku']
train: 1723
val: 192


In [273]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(cities))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0 # start and finish char
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

def encode(s: str):
  return [stoi[c] for c in s]

def decode(ints: list[int]):
  return ''.join(itos[i] for i in ints)

decode(encode(cities[3]))

{1: '-', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'w', 23: 'y', 24: 'z', 25: 'ō', 26: 'ū', 0: '.'}
27


'higashi-ku'

In [274]:
# What uniform distribution looks like

for _ in range(5):
    word = []
    while True:
        ix = torch.randint(vocab_size, (1,)).item()
        char = itos[ix]
        if char == '.':
            if len(word) > 0: # only break if we have some chars
                break
            continue
        word.append(char)
    print(''.join(word))

dbūizjōpkdoa
tetmōeeōt
n
jzojeczzulhmkitūghidnsljbhaūheuusz
iclb


In [280]:
from typing import Literal

class DatasetManager:
    def __init__(self, train_data, val_data, block_size, batch_size):
        self.block_size = block_size
        self.batch_size = batch_size
        self.train_dataset = self._build_dataset(train_data)
        self.val_dataset = self._build_dataset(val_data)

    def _build_dataset(self, data):
        X, Y = [], []
        for w in data:
            encoding = encode(w + '.')
            context = encode('.') * self.block_size
            for idx in encoding:
                X.append(context)
                Y.append(idx)
                context = context[1:] + [idx]
        return torch.tensor(X), torch.tensor(Y)

    def get_batch(self, split: Literal["train", "val"]):
        data = self.train_dataset if split == "train" else self.val_dataset
        ix = torch.randint(len(data[0]), (self.batch_size,))
        return data[0][ix], data[1][ix]

    def estimate_loss(self, model, eval_iters=200):
        out = {}
        model.eval()
        with torch.no_grad():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = self.get_batch(split)
                    logits, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
        model.train()
        return out

In [281]:
# Example usage:
db = DatasetManager(train_data, val_data, block_size=6, batch_size=8)
xbatch, ybatch = db.get_batch("train")

for x, y in list(zip(xbatch, ybatch)):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])

....ta --> k
...yam --> a
...... --> m
higash --> i
ogawa- --> k
hinhid --> a
....na --> k
matsud --> o


In [286]:
from torch import nn
from torch.nn import functional as F

# Some terminology:
# Batch (32): Different training examples processed in parallel
#   - Each batch contains 32 different sequences we're training on

# Time (4): Sequence positions (your block_size)
#   - For city "Tokyo.", with block_size=4:
#   - "...." -> "T"
#   - "...T" -> "o"
#   - "..To" -> "k"
#   - ".Tok" -> "y"

# Classes (20): Your vocabulary size (possible characters)
#   - Each position outputs scores for all possible next characters
#   - If vocab is ['.','a','b','c',...], then each position predicts
#     probabilities for each character being next

# HYPERPARAMETERS
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class SimpleMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

simple_mlp = SimpleMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  6281


In [287]:
# Initial (completely random):
simple_mlp.generate(6)

jkidbhzimcpanlhdiawmhlrūuujyrmūffu-ymwtwmlohrcmaūotigtlhnwruguūdwy.
czōūeunciōlpklōid.
ūlftnbokkhodzwmileō.
smnsotfzūri.
-gdatuweumyo.
ktilcfkn-mkdōtyrdirsōprwiesmcōcbtōjczwmaa.


In [288]:
# TRAIN Simple MLP
from dataclasses import dataclass

@dataclass
class LearningInterval():
    lr: int
    iters: int

def train_model(model, schedules, eval_interval=2000):
    for i, sch in enumerate(schedules):
        print(f"SCHEDULE {i+1}/{len(schedules)}: lr={sch.lr}, iters={sch.iters}")

        # create a PyTorch optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=sch.lr)

        for cur_iter in range(sch.iters):

            # every once in a while evaluate the loss on train and val sets
            if cur_iter % eval_interval == 0 or cur_iter == sch.iters - 1:
                losses = db.estimate_loss(model)
                print(f"iter {cur_iter + 1}/{sch.iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            # sample a batch of data
            xb, yb = db.get_batch('train')

            # evaluate the loss
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

# Example schedule
SAME_SCHEDULE = [
    LearningInterval(1e-2, 6_000),
    LearningInterval(1e-3, 10_000), 
    LearningInterval(1e-4, 14_000),
    LearningInterval(1e-5, 6_000),
]

train_model(simple_mlp, SAME_SCHEDULE)


SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2895, val loss 3.2906
iter 2001/6000: train loss 1.7939, val loss 1.9550
iter 4001/6000: train loss 1.7745, val loss 1.9605
iter 6000/6000: train loss 1.7985, val loss 2.0079
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.7900, val loss 2.0125
iter 2001/10000: train loss 1.6680, val loss 1.9265
iter 4001/10000: train loss 1.6917, val loss 1.9260
iter 6001/10000: train loss 1.6959, val loss 1.9228
iter 8001/10000: train loss 1.6653, val loss 1.9371
iter 10000/10000: train loss 1.7097, val loss 1.9396
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.6600, val loss 1.9234
iter 2001/14000: train loss 1.6710, val loss 1.8951
iter 4001/14000: train loss 1.6684, val loss 1.9274
iter 6001/14000: train loss 1.6642, val loss 1.9211
iter 8001/14000: train loss 1.6708, val loss 1.8969
iter 10001/14000: train loss 1.6606, val loss 1.9179
iter 12001/14000: train loss 1.6755, val loss 1.9399
iter 14000/14000: t

In [289]:
simple_mlp.generate(20)

koise.
asai.
haku.
miyara.
shinoryūkawa.
katsu.
hakubatsu.
minamaoya.
ijoma.
chikamayusa.
shimadatokata.
tōmishō.
osuda-ku.
suma.
touratsun.
mifu.
tanaipa-ku.
hagudo.
tokuyema.
ikeizahi.


In [291]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 10
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 300  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class BigMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

big_mlp = BigMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  38697


In [293]:
train_model(big_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.3006, val loss 3.3007
iter 2001/6000: train loss 1.8471, val loss 2.0904
iter 4001/6000: train loss 1.7770, val loss 2.0525
iter 6000/6000: train loss 1.7953, val loss 2.0570
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.8001, val loss 2.0818
iter 2001/10000: train loss 1.6640, val loss 1.9540
iter 4001/10000: train loss 1.6591, val loss 1.9953
iter 6001/10000: train loss 1.6959, val loss 1.9565
iter 8001/10000: train loss 1.6477, val loss 1.9575
iter 10000/10000: train loss 1.6489, val loss 1.9961
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.6373, val loss 1.9943
iter 2001/14000: train loss 1.6407, val loss 1.9952
iter 4001/14000: train loss 1.6523, val loss 1.9733
iter 6001/14000: train loss 1.6534, val loss 1.9675
iter 8001/14000: train loss 1.6273, val loss 1.9503
iter 10001/14000: train loss 1.6472, val loss 1.9709
iter 12001/14000: train loss 1.6223, val loss 1.9794
iter 14000/14000: t

In [297]:
big_mlp.generate(20)

rigeeru.
woji.
hoku.
ugawagatsu.
murashio-ku.
tokawa.
akama.
kiigaka.
miho.
naoroi.
akawasuke.
chibe.
nōha.
jō-ku.
yamagata-ku.
kugoe.
mizumu.
yōbana.
nianai.
shiru.


In [313]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class NonLinearMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

nonlin_mlp = NonLinearMLP()
total_params = sum(p.numel() for p in nonlin_mlp.parameters())
print("Params: ", total_params)

Params:  43613


In [307]:
train_model(nonlin_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2832, val loss 3.2831
iter 2001/6000: train loss 1.8471, val loss 1.9853
iter 4001/6000: train loss 1.7507, val loss 2.0014
iter 6000/6000: train loss 1.6920, val loss 2.0458
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.7166, val loss 2.0087
iter 2001/10000: train loss 1.5798, val loss 2.0170
iter 4001/10000: train loss 1.4859, val loss 2.0922
iter 6001/10000: train loss 1.4258, val loss 2.1627
iter 8001/10000: train loss 1.3776, val loss 2.3169
iter 10000/10000: train loss 1.3479, val loss 2.3819
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.3439, val loss 2.4564
iter 2001/14000: train loss 1.3284, val loss 2.4428
iter 4001/14000: train loss 1.2989, val loss 2.4178
iter 6001/14000: train loss 1.2993, val loss 2.4530
iter 8001/14000: train loss 1.3004, val loss 2.4668
iter 10001/14000: train loss 1.2793, val loss 2.6267
iter 12001/14000: train loss 1.3073, val loss 2.5383
iter 14000/14000: t

In [311]:
nonlin_mlp.generate(20)

shimoramo.
shimonuma-ku.
kinoku.
tohaka.
kibato-ku.
ōtsuki.
yara.
tobiishi-ku.
kamiibora-ku.
ubino.
okuiishi-ku.
kushū.
isamari.
nishichinon.
butsu.
bisuyama.
ugi.
ikeda.
sayotohira.
ebyūō.


In [315]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class BN_MLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

bnmlp = BN_MLP()
total_params = sum(p.numel() for p in nonlin_mlp.parameters())
print("Params: ", total_params)

Params:  43613


In [316]:
train_model(bnmlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2933, val loss 3.2930
iter 2001/6000: train loss 1.6703, val loss 1.8892
iter 4001/6000: train loss 1.5718, val loss 1.8832
iter 6000/6000: train loss 1.5141, val loss 1.9195
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.5283, val loss 1.9349
iter 2001/10000: train loss 1.3854, val loss 1.8822
iter 4001/10000: train loss 1.3589, val loss 1.9303
iter 6001/10000: train loss 1.3252, val loss 1.9409
iter 8001/10000: train loss 1.2784, val loss 2.0175
iter 10000/10000: train loss 1.2534, val loss 2.0485
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.2376, val loss 2.0479
iter 2001/14000: train loss 1.2391, val loss 2.0557
iter 4001/14000: train loss 1.2393, val loss 2.0645
iter 6001/14000: train loss 1.2420, val loss 2.0736
iter 8001/14000: train loss 1.2360, val loss 2.0821
iter 10001/14000: train loss 1.2438, val loss 2.0315
iter 12001/14000: train loss 1.2398, val loss 2.0577
iter 14000/14000: t

In [322]:
bnmlp.eval()
bnmlp.generate(20)

setana.
hirako.
gawama.
higashii.
tōyoda.
kamida.
higashi-ku.
shimizu.
itakyo.
mizumizanaki.
misato.
misato.
minamichi.
fukuasuki.
kami.
kashie.
watariizuminotpu.
fujino.
sumachi.
niijima.
