In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
import pandas as pd

In [11]:
torch.manual_seed(1234)
cities = pd.read_csv("../chinese_cities.csv")['name'].tolist()
split = int(0.9*len(cities))
indices = torch.randperm(len(cities)).tolist()
train_data = [cities[i] for i in indices[:split]]
val_data = [cities[i] for i in indices[split:]]
print(train_data[:8])
print('train:', len(train_data))
print('val:', len(val_data))

['chenzhou', 'hailin', "qin'an", 'suibin', 'changning', 'dongshan', 'shizuishan', 'nantou']
train: 2655
val: 296


In [13]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(cities))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0 # start and finish char
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

def encode(s: str):
  return [stoi[c] for c in s]

def decode(ints: list[int]):
  return ''.join(itos[i] for i in ints)

decode(encode(cities[11]))

{1: "'", 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


'fangshan'

In [14]:
# What uniform distribution looks like

for _ in range(5):
    word = []
    while True:
        ix = torch.randint(vocab_size, (1,)).item()
        char = itos[ix]
        if char == '.':
            if len(word) > 0: # only break if we have some chars
                break
            continue
        word.append(char)
    print(''.join(word))

zbwkfjrlolniyyfexeegsgxugmn
lxpezretjsht'sq
taqhpbbqx'hbumgejok'tu
flqbmljzhilscqyandfbu
sxitwxtl


In [17]:
from typing import Literal

class DatasetManager:
    def __init__(self, train_data, val_data, block_size, batch_size):
        self.block_size = block_size
        self.batch_size = batch_size
        self.train_dataset = self._build_dataset(train_data)
        self.val_dataset = self._build_dataset(val_data)

    def _build_dataset(self, data):
        X, Y = [], []
        for w in data:
            encoding = encode(w + '.')
            context = encode('.') * self.block_size
            for idx in encoding:
                X.append(context)
                Y.append(idx)
                context = context[1:] + [idx]
        return torch.tensor(X), torch.tensor(Y)

    def get_batch(self, split: Literal["train", "val"]):
        data = self.train_dataset if split == "train" else self.val_dataset
        ix = torch.randint(len(data[0]), (self.batch_size,))
        return data[0][ix], data[1][ix]

    def estimate_loss(self, model, eval_iters=200):
        out = {}
        model.eval()
        with torch.no_grad():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = self.get_batch(split)
                    logits, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
        model.train()
        return out

In [18]:
# Example usage:
db = DatasetManager(train_data, val_data, block_size=6, batch_size=8)
xbatch, ybatch = db.get_batch("train")

for x, y in list(zip(xbatch, ybatch)):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])

shunko --> u
maonan --> .
.....j --> u
...... --> x
.....j --> i
samzhu --> b
anning --> .
...... --> a


In [24]:
from torch import nn
from torch.nn import functional as F

# Some terminology:
# Batch (32): Different training examples processed in parallel
#   - Each batch contains 32 different sequences we're training on

# Time (4): Sequence positions (your block_size)
#   - For city "Tokyo.", with block_size=4:
#   - "...." -> "T"
#   - "...T" -> "o"
#   - "..To" -> "k"
#   - ".Tok" -> "y"

# Classes (20): Your vocabulary size (possible characters)
#   - Each position outputs scores for all possible next characters
#   - If vocab is ['.','a','b','c',...], then each position predicts
#     probabilities for each character being next

# HYPERPARAMETERS
block_size = 12
batch_size = 40
n_embd = 12    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class SimpleMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

simple_mlp = SimpleMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  12047


In [25]:
# Initial (completely random):
simple_mlp.generate(6)

pwrifcfqwkgzcibaahnqfazsdjurok'irunmdelmjwkkpedc'b'cnhsrcjbsllxgdnwjnepucqea'nikdwdjzjrwyciarwhy'ittfowox'hucuaxoezzdjjspzgnyaqlubdqqjsazketkslqlzkie.
clxhbkcbwmayqbcnjldwxdifcqh.
yszkypywzzynnxn'wdahkkrfrmqptqiboeazkdznwuknwhxjmkofrsgjzecumhz.
ocqualrlexs.
swfohjplliucqwfyoabebrjbleihguxfgzjzyf.
nnhlxqxnfaxamyyokolznc.


In [26]:
# TRAIN Simple MLP
from dataclasses import dataclass

@dataclass
class LearningInterval():
    lr: int
    iters: int

def train_model(model, schedules, eval_interval=2000):
    for i, sch in enumerate(schedules):
        print(f"SCHEDULE {i+1}/{len(schedules)}: lr={sch.lr}, iters={sch.iters}")

        # create a PyTorch optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=sch.lr)

        for cur_iter in range(sch.iters):

            # every once in a while evaluate the loss on train and val sets
            if cur_iter % eval_interval == 0 or cur_iter == sch.iters - 1:
                losses = db.estimate_loss(model)
                print(f"iter {cur_iter + 1}/{sch.iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            # sample a batch of data
            xb, yb = db.get_batch('train')

            # evaluate the loss
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

# Example schedule
SAME_SCHEDULE = [
    LearningInterval(1e-2, 6_000),
    LearningInterval(1e-3, 10_000), 
    LearningInterval(1e-4, 14_000),
    LearningInterval(1e-5, 6_000),
]


In [27]:
train_model(simple_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.3009, val loss 3.2987
iter 2001/6000: train loss 1.6459, val loss 1.6894
iter 4001/6000: train loss 1.6003, val loss 1.6975
iter 6000/6000: train loss 1.6329, val loss 1.6717
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.6518, val loss 1.7110
iter 2001/10000: train loss 1.5384, val loss 1.6000
iter 4001/10000: train loss 1.5557, val loss 1.6161
iter 6001/10000: train loss 1.5140, val loss 1.6068
iter 8001/10000: train loss 1.5069, val loss 1.6354
iter 10000/10000: train loss 1.5200, val loss 1.6547
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.5439, val loss 1.6566
iter 2001/14000: train loss 1.4881, val loss 1.6003
iter 4001/14000: train loss 1.4844, val loss 1.5982
iter 6001/14000: train loss 1.4802, val loss 1.6255
iter 8001/14000: train loss 1.5000, val loss 1.5954
iter 10001/14000: train loss 1.5045, val loss 1.6017
iter 12001/14000: train loss 1.5004, val loss 1.6142
iter 14000/14000: t

In [28]:
simple_mlp.generate(20)

luidong.
yangzhou.
wuzho.
cuyang.
luchangnai.
gangang.
houyian.
cheng.
tongnan.
runjiang.
chong.
boyunz.
xiangai.
bouaqi.
jiodong.
xiufe.
longluu.
yashun.
xiang.
juangmon.


In [32]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 14
batch_size = 40
n_embd = 20    # embedding dim
n_hidden = 600  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class BigMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd), # (B,T,n_embed)
            nn.Flatten(start_dim=1),          # (B, T*E) 
            nn.Linear(n_embd * block_size, n_hidden),  # (T*E, H)
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

big_mlp = BigMLP()
total_params = sum(p.numel() for p in simple_mlp.parameters())
print("Params: ", total_params)

Params:  12047


In [33]:
train_model(big_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000


iter 1/6000: train loss 3.2888, val loss 3.2886
iter 2001/6000: train loss 1.6789, val loss 1.7489
iter 4001/6000: train loss 1.6721, val loss 1.7469
iter 6000/6000: train loss 1.6864, val loss 1.7507
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.6466, val loss 1.7310
iter 2001/10000: train loss 1.5079, val loss 1.6271
iter 4001/10000: train loss 1.5124, val loss 1.6242
iter 6001/10000: train loss 1.5184, val loss 1.6440
iter 8001/10000: train loss 1.4839, val loss 1.6670
iter 10000/10000: train loss 1.4887, val loss 1.6477
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.4844, val loss 1.6695
iter 2001/14000: train loss 1.4693, val loss 1.6103
iter 4001/14000: train loss 1.4865, val loss 1.6293
iter 6001/14000: train loss 1.4590, val loss 1.6109
iter 8001/14000: train loss 1.4629, val loss 1.6341
iter 10001/14000: train loss 1.4808, val loss 1.5971
iter 12001/14000: train loss 1.4957, val loss 1.6322
iter 14000/14000: train loss 1.4722, val loss 1.6269


In [39]:
big_mlp.generate(20)

enghua.
chengdu.
bachuan.
xixian.
shuyuan.
xiangyan.
lingshu.
gon.
yudi.
gyangceng.
jimen.
tongrua.
jinzhongjiang.
uopinglu.
zeibe.
lengyuan.
lindi.
jiaoying.
baoxi.
shankou.


In [48]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 6
batch_size = 40
n_embd = 10    # embedding dim
n_hidden = 68  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class NonLinearMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden * 2), nn.ReLU(),
            nn.Linear(n_hidden * 2, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        # idx is (B, T) array of indices in the current context
        for _ in range(number_of_cities):
            out = []
            context = [0] * block_size 

            while True:
                # forward pass the neural net
                logits = self.net(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                # sample from the distribution
                ix = torch.multinomial(probs, num_samples=1).item()
                # shift the context window and track the samples
                context = context[1:] + [ix]
                out.append(ix)
                # if we sample the special '.' token, break
                if ix == 0:
                    break

            print(''.join(itos[i] for i in out))  # decode and print the generated word

nonlin_mlp = NonLinearMLP()
total_params = sum(p.numel() for p in nonlin_mlp.parameters())
print("Params: ", total_params)

Params:  43613


In [49]:
train_model(nonlin_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.3117, val loss 3.3111
iter 2001/6000: train loss 1.5670, val loss 1.6377
iter 4001/6000: train loss 1.5157, val loss 1.6126
iter 6000/6000: train loss 1.4875, val loss 1.5897
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.4835, val loss 1.5779
iter 2001/10000: train loss 1.3528, val loss 1.5244
iter 4001/10000: train loss 1.3408, val loss 1.5532
iter 6001/10000: train loss 1.3121, val loss 1.5663
iter 8001/10000: train loss 1.2979, val loss 1.6126
iter 10000/10000: train loss 1.2600, val loss 1.6244
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.2711, val loss 1.6162
iter 2001/14000: train loss 1.2599, val loss 1.6170
iter 4001/14000: train loss 1.2710, val loss 1.6578
iter 6001/14000: train loss 1.2637, val loss 1.6247
iter 8001/14000: train loss 1.2357, val loss 1.6348
iter 10001/14000: train loss 1.2578, val loss 1.6822
iter 12001/14000: train loss 1.2705, val loss 1.7072
iter 14000/14000: t

In [64]:
nonlin_mlp.generate(10) # ok this is pretty good, a lot of repeats though

dulufeng.
shuizhou.
donghu.
gurin.
jiancheng.
taohe.
mentian.
xinhui.
guigang.
xinjiang.


In [65]:
from torch import nn
from torch.nn import functional as F

# HYPERPARAMETERS (same as original)
block_size = 8
batch_size = 32
n_embd = 10    # embedding dim
n_hidden = 100  # hidden layer size

db = DatasetManager(train_data, val_data, batch_size=batch_size, block_size=block_size)

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.LayerNorm(dim),
            nn.Dropout(0.5)
        )
    
    def forward(self, x):
        return x + self.net(x)  # Residual connection
        

class ResidualMLP(nn.Module):

    def __init__(self):
        super().__init__()

        # input are (B,T) sequence of xs integers
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, n_embd),
            nn.Flatten(start_dim=1),          
            nn.Linear(n_embd * block_size, n_hidden), nn.ReLU(), nn.LayerNorm(n_hidden),
            nn.Linear(n_hidden, n_hidden * 2), nn.ReLU(),
            ResidualBlock(n_hidden * 2),
            ResidualBlock(n_hidden * 2),
            nn.Linear(n_hidden * 2, n_hidden), nn.ReLU(), nn.LayerNorm(n_hidden),
            nn.Linear(n_hidden, vocab_size)
        )

        with torch.no_grad():
            self.net[-1].weight *= 0.1  # last layer make less confident

    # idx and targets are both (B,T) tensor of integers
    def forward(self, x, targets=None):
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=32)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=20)
        logits = self.net(x)
        if targets is None:
            loss = None
        else:
            # Cross entropy expects shape (N, C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, number_of_cities):
        self.net.eval()
        with torch.no_grad():
            # idx is (B, T) array of indices in the current context
            for _ in range(number_of_cities):
                out = []
                context = [0] * block_size 

                while True:
                    # forward pass the neural net
                    logits = self.net(torch.tensor([context]))
                    probs = F.softmax(logits, dim=1)
                    # sample from the distribution
                    ix = torch.multinomial(probs, num_samples=1).item()
                    # shift the context window and track the samples
                    context = context[1:] + [ix]
                    out.append(ix)
                    # if we sample the special '.' token, break
                    if ix == 0:
                        break

                print(''.join(itos[i] for i in out))  # decode and print the generated word
        self.net.train()

residual_mlp = ResidualMLP()
total_params = sum(p.numel() for p in residual_mlp.parameters())
print("Params: ", total_params)

Params:  132997


In [67]:
train_model(residual_mlp, SAME_SCHEDULE)

SCHEDULE 1/4: lr=0.01, iters=6000
iter 1/6000: train loss 3.2956, val loss 3.2961
iter 2001/6000: train loss 1.5220, val loss 1.6201
iter 4001/6000: train loss 1.4253, val loss 1.5916
iter 6000/6000: train loss 1.4017, val loss 1.5674
SCHEDULE 2/4: lr=0.001, iters=10000
iter 1/10000: train loss 1.4222, val loss 1.5957
iter 2001/10000: train loss 1.3123, val loss 1.5594
iter 4001/10000: train loss 1.2877, val loss 1.5536
iter 6001/10000: train loss 1.2426, val loss 1.6033
iter 8001/10000: train loss 1.2344, val loss 1.6612
iter 10000/10000: train loss 1.1676, val loss 1.6616
SCHEDULE 3/4: lr=0.0001, iters=14000
iter 1/14000: train loss 1.1886, val loss 1.6904
iter 2001/14000: train loss 1.1884, val loss 1.6910
iter 4001/14000: train loss 1.1737, val loss 1.7464
iter 6001/14000: train loss 1.1529, val loss 1.7139
iter 8001/14000: train loss 1.1758, val loss 1.7160
iter 10001/14000: train loss 1.1221, val loss 1.7381
iter 12001/14000: train loss 1.1369, val loss 1.7580
iter 14000/14000: t

In [69]:
residual_mlp.generate(20) # too much overfitting

haibei.
ziliuwu.
jiangyou.
weicheng.
lushui.
wugyang.
xuyi.
loulin.
jiangzhou.
yanjin.
beihai.
garkant.
jinglong.
longchang.
xukou.
jincheng.
dangxing.
shengcheng.
yangcheng.
luiji.
