GPTs learn math by predicting the next valid solution step, exactly like predicting the next word.

In [3]:
import pandas as pd

# Read the cities table (the path is relative to the notebook) and preview the first rows.
cities_csv = "../dataset/Cities.csv"
df = pd.read_csv(cities_csv)
df.head()


Unnamed: 0.1,Unnamed: 0,city,country,population,latitude,longitude,city coordinates
0,0,Encamp,Andorra,11224,42.533333,1.583333,"[42.5333333, 1.5833333]"
1,1,Les Escaldes,Andorra,15854,42.5,1.533333,"[42.5000000, 1.5333333]"
2,2,Andorra la Vella,Andorra,20430,42.5,1.516667,"[42.5000000, 1.5166667]"
3,3,La Massana,Andorra,7211,42.55,1.516667,"[42.5500000, 1.5166667]"
4,4,Canillo,Andorra,3292,42.566667,1.6,"[42.5666667, 1.6000000]"


In [2]:
# Count the cities recorded for each country, then keep the ten largest counts.
per_country = df.groupby("country")["city"].count()
city_counts = per_country.sort_values(ascending=False).head(10)
print(city_counts)

country
Russia           4324
United States    4171
Philippines      3752
India            2995
Romania          2755
Brazil           2017
Mexico           1687
Germany          1478
Italy            1118
Greece           1084
Name: city, dtype: int64


In [6]:
import string


# Character vocabulary: every character that occurs in a lowercased city name,
# plus '.', which is appended to each name further down as the end-of-name marker.
# Adding '.' explicitly guarantees stoi['.'] exists even when no city name
# happens to contain a period; if one does, the set union changes nothing.
all_cities = df["city"].astype(str).str.lower()
vocab = sorted(set("".join(all_cities)) | {"."})
V = len(vocab)

# Lookup tables between characters and their integer ids (ids follow sorted order).
stoi = {char: idx for idx, char in enumerate(vocab)}
itos = {idx: char for char, idx in stoi.items()}

def encode(s):
    """Return the list of integer ids for the characters of ``s``, looked up in ``stoi``.

    Raises KeyError if a character is missing from ``stoi``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(ids):
    """Return the string spelled by the integer ids in ``ids``, looked up in ``itos``.

    Raises KeyError if an id is missing from ``itos``.
    """
    chars = (itos[i] for i in ids)
    return ''.join(chars)
    
# Append '.' to every city name as an end-of-name marker, then keep only
# the rows whose country is Russia as the training names.
df['city_with_ending'] = df['city'].astype(str) + '.'
is_russia = df['country'] == 'Russia'
names = df.loc[is_russia, 'city_with_ending']

In [9]:
import torch
import torch.nn as nn


class BigramMLP(nn.Module):
    """Next-character model conditioned on a single previous character.

    The input character id is embedded and passed through a three-layer MLP
    (hidden width 256, ReLU activations) that outputs one logit per
    vocabulary entry.

    Args:
        vocab_size: Number of distinct characters; sizes both the embedding
            table and the output layer.
        embed_dim: Width of the character embedding. Defaults to 128, the
            value that was previously hard-coded.
    """

    def __init__(self, vocab_size, embed_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, vocab_size)
        )

    def forward(self, x):
        """Map character ids of shape (B,) to logits of shape (B, vocab_size)."""
        x = self.embed(x)   # (B, embed_dim)
        return self.mlp(x)  # (B, V)


In [10]:
import torch.nn.functional as F

def generate(model, start_char, max_len=20):
    """Sample a name one character at a time from a bigram model.

    Args:
        model: Callable that maps a 1-element tensor of character ids to a
            2-D tensor of logits; only the last row is used.
        start_char: First character of the name; must be a key of ``stoi``.
        max_len: Maximum number of characters to sample after ``start_char``.

    Returns:
        ``start_char`` followed by the sampled characters. Sampling stops
        right after a '.' is drawn, or once ``max_len`` characters were drawn.

    Raises:
        KeyError: If ``start_char`` is not in ``stoi``.
    """
    idx = torch.tensor([stoi[start_char]])
    out = start_char

    # Inference only: no_grad avoids building an autograd graph on every
    # forward pass. The model's train/eval mode is left untouched.
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(idx)
            probs = F.softmax(logits[-1], dim=-1)
            # Draw the next character from the predicted distribution.
            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]
            out += next_char
            if next_char == '.':
                break
            # The sampled character becomes the context for the next step.
            idx = torch.tensor([next_idx])

    return out


# Sanity check: sample from a freshly constructed (untrained) model, starting at 'a'.
generate(BigramMLP(V), 'a')


'aâä.'

In [11]:

# Build bigram training pairs: for each lowercased name, every character is an
# input and the character that follows it is the target.
# (To train on every country instead of the filtered `names`, set
#  names = df['city_with_ending'] before running this cell.)
X, Y = [], []

for name in names:
    chars = name.lower()
    for prev_ch, next_ch in zip(chars, chars[1:]):
        X.append(stoi[prev_ch])
        Y.append(stoi[next_ch])

X = torch.tensor(X)  # (N,)
Y = torch.tensor(Y)  # (N,)

In [12]:
# Fresh bigram model together with its loss function and Adam optimizer.
model = BigramMLP(vocab_size=V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3)


In [13]:
# Mini-batch training: every step draws `batch_size` random example indices
# (independently, so repeats are possible) and takes one optimizer update
# on the cross-entropy loss of that batch.
batch_size = 256
steps = 4000

for step in range(steps):
    batch_idx = torch.randint(0, len(X), (batch_size,))

    logits = model(X[batch_idx])
    loss = loss_fn(logits, Y[batch_idx])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if not step % 500:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.1721
step 500 | loss 2.4007
step 1000 | loss 2.3299
step 1500 | loss 2.3393
step 2000 | loss 2.3477
step 2500 | loss 2.4110
step 3000 | loss 2.3692
step 3500 | loss 2.4215


In [34]:
# Sample one name from the trained model, starting from the character 's'.
generate(model, 's')

'skintogol.'

# Trigram

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TrigramMLP(nn.Module):
    """Next-character model conditioned on the two previous characters.

    Both context characters are embedded with a shared table, the two
    embeddings are concatenated, and a three-layer MLP (hidden width 256,
    ReLU activations) outputs one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct characters; sizes both the embedding
            table and the output layer.
        embed_dim: Width of each character embedding. Defaults to 128, the
            value that was previously hard-coded.
    """

    def __init__(self, vocab_size, embed_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim * 2, 256),  # two embeddings concatenated
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, vocab_size)
        )

    def forward(self, x):
        """Map context ids of shape (B, 2) to logits of shape (B, vocab_size)."""
        emb = self.embed(x)            # (B, 2, embed_dim)
        emb = emb.view(x.size(0), -1)  # (B, 2 * embed_dim)
        return self.mlp(emb)           # (B, V)


In [36]:
# Build trigram training examples: each pair of consecutive characters in a
# lowercased name is an input, and the character right after the pair is the target.
X, Y = [], []

for name in names:
    chars = name.lower()
    # Names with fewer than 3 characters yield an empty range and no examples.
    for i in range(len(chars) - 2):
        X.append([stoi[chars[i]], stoi[chars[i + 1]]])
        Y.append(stoi[chars[i + 2]])

X = torch.tensor(X)  # (N, 2)
Y = torch.tensor(Y)  # (N,)


In [37]:
# Fresh trigram model with its loss function and Adam optimizer.
model = TrigramMLP(vocab_size=V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3)

batch_size = 256
steps = 4000

# Each step draws a random mini-batch of example indices and takes one
# optimizer update on the cross-entropy loss of that batch.
for step in range(steps):
    batch_idx = torch.randint(0, len(X), (batch_size,))
    contexts = X[batch_idx]   # (B, 2)
    targets = Y[batch_idx]    # (B,)

    logits = model(contexts)
    loss = loss_fn(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if not step % 500:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.1820
step 500 | loss 2.0225
step 1000 | loss 1.9467
step 1500 | loss 1.9289
step 2000 | loss 1.9248
step 2500 | loss 1.8935
step 3000 | loss 1.8825
step 3500 | loss 1.8225


In [38]:
def generate(model, start_chars, max_len=20):
    """Sample a name one character at a time from a trigram model.

    Args:
        model: Callable that maps a (1, 2) tensor of character ids to a
            2-D tensor of logits; only the first row is used.
        start_chars: Exactly two characters that seed the context; each must
            be a key of ``stoi``.
        max_len: Maximum number of characters to sample after ``start_chars``.

    Returns:
        ``start_chars`` followed by the sampled characters. Sampling stops
        right after a '.' is drawn, or once ``max_len`` characters were drawn.

    Raises:
        AssertionError: If ``start_chars`` does not have length 2.
        KeyError: If a seed character is not in ``stoi``.
    """
    assert len(start_chars) == 2

    idx = [stoi[start_chars[0]], stoi[start_chars[1]]]
    out = start_chars

    # Inference only: no_grad avoids building an autograd graph on every
    # forward pass. The model's train/eval mode is left untouched.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor([idx])  # (1, 2)
            logits = model(x)
            probs = F.softmax(logits[0], dim=-1)

            # Draw the next character from the predicted distribution.
            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]

            out += next_char
            if next_char == '.':
                break

            idx = [idx[1], next_idx]  # slide window

    return out


In [49]:
# Sample one name from the trained model, seeded with the two characters 'ca'.
generate(model, 'ca')

'caltula.'

# 4-gram

In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FourGramMLP(nn.Module):
    """Next-character model conditioned on the four previous characters.

    The four context characters share one embedding table; their embeddings
    are concatenated and fed to a three-layer MLP (hidden width 256, ReLU
    activations) that outputs one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct characters; sizes both the embedding
            table and the output layer.
        embed_dim: Width of each character embedding.
    """

    def __init__(self, vocab_size, embed_dim=128):
        super().__init__()
        hidden = 256
        self.embed = nn.Embedding(vocab_size, embed_dim)

        layers = [
            nn.Linear(4 * embed_dim, hidden),  # four embeddings concatenated
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, vocab_size),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Map context ids of shape (B, 4) to logits of shape (B, vocab_size)."""
        batch = x.size(0)
        flat = self.embed(x).view(batch, -1)  # (B, 4, embed_dim) -> (B, 4 * embed_dim)
        return self.mlp(flat)                 # (B, V)


In [51]:
# Build 4-gram training examples: every window of four consecutive characters
# in a lowercased name is an input, and the character right after it is the target.
X, Y = [], []

for name in names:
    chars = name.lower()
    # Names shorter than 5 characters give an empty range, so they add nothing.
    for i in range(len(chars) - 4):
        X.append([stoi[ch] for ch in chars[i:i + 4]])
        Y.append(stoi[chars[i + 4]])

X = torch.tensor(X)  # (N, 4)
Y = torch.tensor(Y)  # (N,)


In [52]:
# Fresh 4-gram model with its loss function and Adam optimizer.
model = FourGramMLP(vocab_size=V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3)

batch_size = 256
steps = 4000

# Each step draws a random mini-batch of example indices and takes one
# optimizer update on the cross-entropy loss of that batch.
for step in range(steps):
    batch_idx = torch.randint(0, len(X), (batch_size,))
    contexts = X[batch_idx]   # (B, 4)
    targets = Y[batch_idx]    # (B,)

    logits = model(contexts)
    loss = loss_fn(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if not step % 500:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.2170
step 500 | loss 1.2249
step 1000 | loss 1.1516
step 1500 | loss 0.7939
step 2000 | loss 0.8417
step 2500 | loss 0.7926
step 3000 | loss 0.8238
step 3500 | loss 0.7599


In [54]:
def generate(model, start_text, max_len=50):
    """Sample a name one character at a time from a 4-gram model.

    Args:
        model: Callable that maps a (1, 4) tensor of character ids to a
            2-D tensor of logits; only the first row is used.
        start_text: Seed text of at least four characters; its last four
            characters form the initial context and must be keys of ``stoi``.
        max_len: Maximum number of characters to sample after ``start_text``.

    Returns:
        ``start_text`` followed by the sampled characters. Sampling stops
        right after a '.' is drawn, or once ``max_len`` characters were drawn.

    Raises:
        AssertionError: If ``start_text`` has fewer than four characters.
        KeyError: If a context character is not in ``stoi``.
    """
    assert len(start_text) >= 4

    context = [stoi[c] for c in start_text[-4:]]
    out = start_text

    # Inference only: no_grad avoids building an autograd graph on every
    # forward pass. The model's train/eval mode is left untouched.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor([context])  # (1, 4)
            logits = model(x)
            probs = F.softmax(logits[0], dim=-1)

            # Draw the next character from the predicted distribution.
            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]

            out += next_char
            if next_char == '.':
                break

            # Slide the window: drop the oldest id, append the new one.
            context = context[1:] + [next_idx]

    return out


In [61]:
# Sample one name from the trained model, seeded with the four characters 'casa'.
generate(model, 'casa')

'casalimbay.'