This notebook aims to teach:

How to build a simple autoregressive generative model that produces names resembling cities from around the world

How increasing context size (bigram → trigram → 4-gram) leads to lower model loss and improved predictions

How a bigram model can be implemented using a traditional, rule-based software approach

---------------------

1. Let's create a generative model that can generate new city names

- The cities dataset consists of existing city names from around the world - let's take a peek at the data

In [8]:
import pandas as pd
# Read the city/country table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../dataset/cities_latin_alphabet.csv")
df.head(5)

Unnamed: 0,city,country
0,Encamp,Andorra
1,Canillo,Andorra
2,Sharjah,United Arab Emirates
3,Dubai,United Arab Emirates
4,Asadabad,Afghanistan


In [None]:
# Count the cities recorded for each country and keep the three largest counts.
cities_per_country = df.groupby("country")["city"].count()
city_counts_by_country = cities_per_country.sort_values(ascending=False).head(3)

print(f'We have {len(df)} city names from {len(set(df["country"]))} countries')

print('City Names by Country:', city_counts_by_country, sep='\n')

We have 35320 city names from 185 countries
City Names by Country:
country
Russia           3768
Philippines      3161
United States    2929
Name: city, dtype: int64


We will generate new city names using a character-level model
- Let's generate a vocabulary consisting of the characters that appear in the city names

In [7]:
import string

# Character-level vocabulary: every distinct character that occurs in the
# lower-cased city names.
all_cities = df["city"].astype(str).str.lower()

# '.' is appended to every name as the end-of-name marker in a later cell and
# is looked up in `stoi`, so make sure it is part of the vocabulary explicitly
# instead of relying on some city name happening to contain a literal '.'.
vocab = sorted(set("".join(all_cities)) | {"."})
V = len(vocab)

print(f'we have a vocabulary of {V} characters')

we have a vocabulary of 66 characters


In [8]:
vocab

[' ',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ø',
 'ù',
 'ú',
 'û',
 'ü',
 'ý']

In [None]:
# Lookup tables between characters and their integer ids (position in `vocab`).
stoi = dict(zip(vocab, range(len(vocab))))
itos = dict(enumerate(vocab))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError if a character is not a key of `stoi`.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids


def decode(ids):
    """Return the string obtained by mapping each id in `ids` back through `itos`."""
    return ''.join(itos[i] for i in ids)
    
# Append '.' to every city name so the models can learn where a name ends.
df['city_with_ending'] = df['city'].astype(str) + '.'

# Only the rows whose country is 'Russia' are used as training names.
is_russia = df['country'] == 'Russia'
names = df.loc[is_russia, 'city_with_ending']

In [9]:
import torch
import torch.nn as nn


class BigramMLP(nn.Module):
    """MLP that maps a single character id to logits over the next character.

    Args:
        vocab_size: number of distinct characters; size of both the embedding
            table and the output layer.
        embed_dim: width of the character embedding (default 128).
        hidden_dim: width of the two hidden layers (default 256).
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        """Return next-character logits (B, vocab_size) for character ids `x` of shape (B,)."""
        x = self.embed(x)    # (B, embed_dim)
        return self.mlp(x)   # (B, vocab_size)

In [12]:
# Build a bigram model over the full vocabulary and show its layer structure.
model = BigramMLP(V)
print(model)

BigramMLP(
  (embed): Embedding(66, 128)
  (mlp): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=66, bias=True)
  )
)


In [None]:
import torch.nn.functional as F

def generate(model, start_char, max_len=20):
    """Sample a name one character at a time from a bigram model.

    Args:
        model: callable that takes a tensor holding one character id and
            returns logits; the last row of the result is used.
        start_char: first character of the name; must be a key of `stoi`.
        max_len: maximum number of characters to sample after `start_char`.

    Returns:
        `start_char` followed by the sampled characters. Sampling stops early
        once '.' (the end-of-name marker) has been emitted.
    """
    idx = torch.tensor([stoi[start_char]])
    out = start_char

    # Sampling is inference only: disable autograd so no graph is recorded
    # for every forward pass.
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(idx)
            probs = F.softmax(logits[-1], dim=-1)
            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]
            out += next_char
            if next_char == '.':
                break
            # The sampled character becomes the context for the next step.
            idx = torch.tensor([next_idx])

    return out

# Sample from a freshly constructed (not yet trained) bigram model, starting at 'a'.
generate(BigramMLP(V), 'a')

'aâä.'

In [None]:
# names = df['city_with_ending']
# Every pair of adjacent characters in a lower-cased name becomes one
# (input id, target id) training example.
pairs = [
    (stoi[prev_ch], stoi[next_ch])
    for name in names
    for lowered in (name.lower(),)
    for prev_ch, next_ch in zip(lowered, lowered[1:])
]

X = torch.tensor([prev_id for prev_id, _ in pairs])
Y = torch.tensor([next_id for _, next_id in pairs])

model = BigramMLP(V)

In [None]:
# Fresh bigram model with its loss and optimizer.
# NOTE: the same three statements are repeated verbatim in the next two cells.
model = BigramMLP(V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)


In [None]:
# Fresh bigram model with its loss and optimizer.
# NOTE: identical to the cell before it and the cell after it.
model = BigramMLP(V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)


In [12]:
# Fresh bigram model with its loss and optimizer; this is the instance the
# training loop in the next cell updates.
# NOTE: identical to the two cells before it.
model = BigramMLP(V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)


In [13]:
batch_size = 256
steps = 4000

# Mini-batch training: every step draws `batch_size` random example indices
# (with replacement) and takes one optimizer step on that batch.
for step in range(steps):
    idx = torch.randint(len(X), (batch_size,))
    xb, yb = X[idx], Y[idx]

    logits = model(xb)
    loss = loss_fn(logits, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if step % 500 == 0:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.1721
step 500 | loss 2.4007
step 1000 | loss 2.3299
step 1500 | loss 2.3393
step 2000 | loss 2.3477
step 2500 | loss 2.4110
step 3000 | loss 2.3692
step 3500 | loss 2.4215


In [34]:
# Sample one name from the bigram model trained above, starting at 's'.
generate(model, 's')

'skintogol.'

# Trigram

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TrigramMLP(nn.Module):
    """MLP that maps the two previous character ids to logits over the next one.

    Args:
        vocab_size: number of distinct characters; size of both the embedding
            table and the output layer.
        embed_dim: width of each character embedding (default 128).
        hidden_dim: width of the two hidden layers (default 256).
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim * 2, hidden_dim),  # two embeddings concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        """Return next-character logits (B, vocab_size) for context ids `x` of shape (B, 2)."""
        emb = self.embed(x)            # (B, 2, embed_dim)
        emb = emb.view(x.size(0), -1)  # (B, 2 * embed_dim): concatenate the two embeddings
        return self.mlp(emb)           # (B, vocab_size)


In [36]:
X, Y = [], []

for name in names:
    name = name.lower()
    # Slide a 3-character window over the name: 2 context ids -> 1 target id.
    for pos in range(len(name) - 2):
        X.append([stoi[name[pos]], stoi[name[pos + 1]]])
        Y.append(stoi[name[pos + 2]])

X = torch.tensor(X)  # (N, 2)
Y = torch.tensor(Y)  # (N,)


In [37]:
# Fresh trigram model with its loss and optimizer.
model = TrigramMLP(V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)

batch_size = 256
steps = 4000

# Mini-batch training on randomly drawn (with replacement) examples.
for step in range(steps):
    idx = torch.randint(len(X), (batch_size,))
    xb, yb = X[idx], Y[idx]   # (B, 2), (B,)

    logits = model(xb)
    loss = loss_fn(logits, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if step % 500 == 0:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.1820
step 500 | loss 2.0225
step 1000 | loss 1.9467
step 1500 | loss 1.9289
step 2000 | loss 1.9248
step 2500 | loss 1.8935
step 3000 | loss 1.8825
step 3500 | loss 1.8225


In [38]:
def generate(model, start_chars, max_len=20):
    """Sample a name one character at a time from a trigram model.

    Args:
        model: callable that takes a (1, 2) tensor of character ids and
            returns logits; row 0 of the result is used.
        start_chars: exactly two starting characters, each a key of `stoi`.
        max_len: maximum number of characters to sample after `start_chars`.

    Returns:
        `start_chars` followed by the sampled characters. Sampling stops early
        once '.' (the end-of-name marker) has been emitted.
    """
    assert len(start_chars) == 2

    idx = [stoi[start_chars[0]], stoi[start_chars[1]]]
    out = start_chars

    # Sampling is inference only: disable autograd so no graph is recorded
    # for every forward pass.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor([idx])  # (1, 2)
            logits = model(x)
            probs = F.softmax(logits[0], dim=-1)

            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]

            out += next_char
            if next_char == '.':
                break

            idx = [idx[1], next_idx]  # slide window

    return out


In [49]:
# Sample one name from the trigram model trained above, starting from 'ca'.
generate(model, 'ca')

'caltula.'

# 4-gram

In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FourGramMLP(nn.Module):
    """MLP that maps the four previous character ids to logits over the next one.

    Args:
        vocab_size: number of distinct characters; size of both the embedding
            table and the output layer.
        embed_dim: width of each character embedding (default 128).
        hidden_dim: width of the two hidden layers (default 256).
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim * 4, hidden_dim),  # four embeddings concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        """Return next-character logits (B, vocab_size) for context ids `x` of shape (B, 4)."""
        emb = self.embed(x)            # (B, 4, embed_dim)
        emb = emb.view(x.size(0), -1)  # (B, 4 * embed_dim): concatenate the four embeddings
        return self.mlp(emb)           # (B, vocab_size)


In [51]:
X, Y = [], []

for name in names:
    name = name.lower()
    # Slide a 5-character window over the name: 4 context ids -> 1 target id.
    # Names shorter than 5 characters yield no window and contribute nothing.
    for start in range(len(name) - 4):
        window = name[start:start + 5]
        X.append([stoi[ch] for ch in window[:4]])
        Y.append(stoi[window[4]])

X = torch.tensor(X)  # (N, 4)
Y = torch.tensor(Y)  # (N,)


In [52]:
# Fresh 4-gram model with its loss and optimizer.
model = FourGramMLP(V)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)

batch_size = 256
steps = 4000

# Mini-batch training on randomly drawn (with replacement) examples.
for step in range(steps):
    idx = torch.randint(len(X), (batch_size,))
    xb, yb = X[idx], Y[idx]   # (B, 4), (B,)

    logits = model(xb)
    loss = loss_fn(logits, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 500 steps.
    if step % 500 == 0:
        print(f"step {step} | loss {loss.item():.4f}")


step 0 | loss 4.2170
step 500 | loss 1.2249
step 1000 | loss 1.1516
step 1500 | loss 0.7939
step 2000 | loss 0.8417
step 2500 | loss 0.7926
step 3000 | loss 0.8238
step 3500 | loss 0.7599


In [54]:
def generate(model, start_text, max_len=50):
    """Sample a name one character at a time from a 4-gram model.

    Args:
        model: callable that takes a (1, 4) tensor of character ids and
            returns logits; row 0 of the result is used.
        start_text: seed text of at least four characters; its last four
            characters (each a key of `stoi`) form the initial context.
        max_len: maximum number of characters to sample after `start_text`.

    Returns:
        `start_text` followed by the sampled characters. Sampling stops early
        once '.' (the end-of-name marker) has been emitted.
    """
    assert len(start_text) >= 4

    context = [stoi[c] for c in start_text[-4:]]
    out = start_text

    # Sampling is inference only: disable autograd so no graph is recorded
    # for every forward pass.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor([context])  # (1, 4)
            logits = model(x)
            probs = F.softmax(logits[0], dim=-1)

            next_idx = torch.multinomial(probs, 1).item()
            next_char = itos[next_idx]

            out += next_char
            if next_char == '.':
                break

            # Drop the oldest id and append the new one to keep 4 ids of context.
            context = context[1:] + [next_idx]

    return out


In [61]:
# Sample one name from the 4-gram model trained above, seeded with 'casa'.
generate(model, 'casa')

'casalimbay.'

In [7]:
from collections import Counter

# Tally how often each pair of adjacent characters occurs across the names.
bigram_counts = Counter()

for name in names:
    name_lower = name.lower()
    # Each 2-character slice starting at position i is one bigram.
    bigram_counts.update(
        name_lower[i:i + 2] for i in range(len(name_lower) - 1)
    )

# Summarise the table: its size and the 20 most frequent bigrams.
print(f"Total unique bigrams: {len(bigram_counts)}")
print(f"\nTop 20 most common character bigrams:")
for bigram, count in bigram_counts.most_common(20):
    print(f"  '{bigram}': {count}")

# Plain-dict copy of the counts, used by the rule-based generator.
bigram_dict = dict(bigram_counts)

Total unique bigrams: 511

Top 20 most common character bigrams:
  'sk': 1324
  'a.': 1033
  'ov': 1009
  'ka': 800
  'y.': 763
  'ya': 761
  'no': 719
  'vo': 677
  'ko': 638
  'ye': 621
  'ki': 618
  'o.': 599
  'in': 548
  'ay': 515
  'k.': 512
  'iy': 497
  'sh': 471
  'oy': 466
  'ch': 463
  'an': 451


In [None]:
import random

def generate_from_bigram_dict(start_char, max_len=20):
    """
    Generate text using bigram dictionary counts.

    Starting from `start_char`, repeatedly picks the next character at random,
    weighted by how often each bigram beginning with the current character
    occurs in `bigram_dict`.

    Args:
        start_char: first character of the generated text. It is lower-cased
            before use, because the bigram counts are built from lower-cased
            names.
        max_len: maximum number of characters to append after `start_char`.

    Returns:
        The generated string, beginning with the lower-cased start character.
        Generation stops early after '.' (the end marker) is emitted, or when
        no bigram starts with the current character. If the lower-cased start
        character is not in `vocab`, an "Error: ..." message string is
        returned instead.
    """
    # Normalise case *before* validating; checking the raw input first would
    # reject every upper-case start character and make the lower-casing dead.
    current_char = start_char.lower()
    if current_char not in vocab:
        return f"Error: '{start_char}' not in vocabulary"

    # Group the successors of every character once per call instead of
    # rescanning the whole bigram dictionary on each generation step.
    # Dictionary order is preserved within each group.
    successors = {}
    for bigram, count in bigram_dict.items():
        next_chars, weights = successors.setdefault(bigram[0], ([], []))
        next_chars.append(bigram[1])
        weights.append(count)

    output = current_char

    for _ in range(max_len):
        if current_char not in successors:
            # No bigrams found starting with this character, stop
            break

        next_chars, weights = successors[current_char]

        # Weighted random selection based on bigram counts
        next_char = random.choices(next_chars, weights=weights, k=1)[0]
        output += next_char

        # Stop if we hit the end marker
        if next_char == '.':
            break

        current_char = next_char

    return output

# Try the count-based generator with a handful of starting letters.
print("Generating city names using bigram dictionary:")
for start in "msknv":
    generated = generate_from_bigram_dict(start)
    print(f"  '{start}' -> '{generated}'")
