In [None]:
%%capture
!wget https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
# Read one name per line. The context manager closes the file handle as soon as
# the contents are read instead of leaving it to the garbage collector.
with open('names.txt', encoding='utf-8') as names_file:
    names = names_file.read().splitlines()

# tokenizer: character-level vocabulary built from every character that occurs
# in `names`, plus '.' (always included, even if no name contains it).
vocab = sorted(set('.').union(''.join(names)))
vocab_size = len(vocab)
# stoi: character -> integer id (position in the sorted vocabulary)
stoi = {ch: idx for idx, ch in enumerate(vocab)}
# itos: integer id -> character (inverse of stoi)
itos = dict(enumerate(vocab))

def decode(seq: list[int]) -> str:
    """Turn a sequence of token ids back into the string they spell.

    Each id is looked up in the module-level ``itos`` table; an unknown id
    raises ``KeyError``.
    """
    chars = (itos[token] for token in seq)
    return ''.join(chars)

def encode(name: str) -> list[int]:
    """Convert a string into a list of token ids, one per character.

    Each character is looked up in the module-level ``stoi`` table; an unknown
    character raises ``KeyError``.
    """
    ids = []
    for ch in name:
        ids.append(stoi[ch])
    return ids

In [None]:
block_size = 3  # number of preceding characters used to predict the next one

# Visit the names in a seeded random order. The 80/20 split below is positional,
# so without shuffling the held-out set would simply be whatever names sit at
# the end of names.txt rather than a random sample of the data.
split_rng = torch.Generator().manual_seed(42)
name_order = torch.randperm(len(names), generator=split_rng).tolist()

# Contexts are left-padded with the '.' token; look its id up instead of
# assuming it is 0.
pad_ix = stoi['.']

X = []  # contexts: block_size token ids each
Y = []  # targets: the token id that follows the context

for name_idx in name_order:
    context = [pad_ix] * block_size
    # The trailing '.' makes the model learn where a name ends.
    for ch in names[name_idx] + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # print(f"{context} -> {ix}")
        context = context[1:] + [ix]  # slide the window one character forward

X = torch.tensor(X)
Y = torch.tensor(Y)

# First 80% of the (shuffled-by-name) examples for training, the rest for testing.
train_size = int(.8 * X.shape[0])
Xtr, Xts = X[:train_size], X[train_size:]
Ytr, Yts = Y[:train_size], Y[train_size:]

In [None]:
from torch.utils.data import Dataset, TensorDataset, DataLoader

# class NamesDataset(Dataset):
#     def __init__(self, X, Y):
#         self.X = X
#         self.Y = Y

#     def __getitem__(self, idx):
#         return self.X[idx], self.Y[idx]

#     def __len__(self):
#         return self.X.shape[0]

# Pair each split's inputs with its targets so indexing yields (x, y) tuples.
Dtr, Dts = TensorDataset(Xtr, Ytr), TensorDataset(Xts, Yts)

In [None]:
# Training loader: shuffled, and the incomplete final batch is dropped so every
# batch has exactly 32 samples.
DLtr = DataLoader(
    Dtr,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)
# Test loader: fixed order, every sample kept (the last batch may be smaller).
DLts = DataLoader(
    Dts,
    batch_size=32,
    shuffle=False,
    drop_last=False,
)

In [None]:
class NameGenerator(nn.Module):
    """Character-level MLP: embed a fixed-length context, concatenate, tanh hidden layer, logits.

    Args:
        vocab_size: number of distinct tokens (rows of the embedding table and
            width of the output layer).
        n_embd: embedding dimension per token.
        block_size: number of context tokens fed to the model at once.
        n_hidden: width of the hidden layer.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_hidden):
        super().__init__()
        flat_dim = block_size * n_embd  # size of the concatenated context embeddings

        self.E = nn.Embedding(vocab_size, n_embd)
        self.lin1 = nn.Linear(flat_dim, n_hidden)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(n_hidden, vocab_size)

        # Only the hidden layer's weight is re-initialised (Xavier uniform);
        # every other parameter keeps PyTorch's default initialisation.
        nn.init.xavier_uniform_(self.lin1.weight)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: integer token ids of shape (B, T), i.e. (batch_size, block_size).

        Returns:
            Tensor of shape (B, vocab_size) with unnormalised scores.
        """
        embedded = self.E(x)  # (B, T, C) = (batch_size, block_size, n_embd)
        batch, steps, channels = embedded.shape
        # Concatenate the T embeddings of each sample into a single vector.
        flat = embedded.view(batch, steps * channels)  # (B, T * C)
        hidden = self.tanh(self.lin1(flat))
        return self.out(hidden)

# Seed before construction so the initial weights (and therefore the training
# curves) are reproducible after a kernel restart.
torch.manual_seed(42)

n_embd = 2
# Pass the dataset's `block_size` instead of repeating the literal 3, so the
# first linear layer always matches the context length used to build X.
# `model` is the small network (2-d embeddings, 50 hidden units).
model = NameGenerator(vocab_size, n_embd, block_size=block_size, n_hidden=50).to(device)
# `model2` is the larger one (32-d embeddings, 150 hidden units).
model2 = NameGenerator(vocab_size, n_embd=32, block_size=block_size, n_hidden=150).to(device)
# One Adam optimizer per model, with default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())
optimizer2 = torch.optim.Adam(model2.parameters())

In [None]:
def plot_embedding(net=None):
    """Scatter-plot the first two embedding dimensions, labelling each point with its character.

    Args:
        net: module exposing its embedding table as ``net.E``. Defaults to the
            notebook-level ``model``.

    Raises:
        ValueError: if the embedding has fewer than two dimensions per token.
    """
    net = model if net is None else net

    w = net.E.weight.cpu().detach().numpy()
    if w.shape[1] < 2:
        raise ValueError(
            f"plot_embedding needs at least 2 embedding dimensions, got {w.shape[1]}"
        )

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(w[:, 0], w[:, 1], s=200)

    # Iterate over the rows that actually exist rather than a hard-coded 27, so
    # the plot keeps working if the vocabulary size changes.
    for i in range(w.shape[0]):
        ax.text(w[i, 0], w[i, 1], itos[i], ha='center', va='center', color='white')

    ax.set(title='Character embeddings', xlabel='embedding dim 0', ylabel='embedding dim 1')
    plt.show()

In [None]:
# model hyperparameters
n_embd = 2
n_hidden = 50

# Hand-rolled parameters mirroring the layer shapes of NameGenerator.
# W1 and b1 are scaled down by sqrt(fan_in), where fan_in = block_size * n_embd.
fan_in_scale = (block_size * n_embd) ** 0.5

E = torch.randn(vocab_size, n_embd)                      # embedding table
W1 = torch.randn(block_size * n_embd, n_hidden) / fan_in_scale
b1 = torch.randn(n_hidden) / fan_in_scale
W2 = torch.randn(n_hidden, vocab_size) * 0.01            # small output weights
b2 = torch.randn(vocab_size) * 0                         # output bias starts at zero

params = [E, W1, b1, W2, b2]

# Enable gradient tracking on every parameter tensor.
for param in params:
    param.requires_grad_(True)

In [None]:
from IPython.display import clear_output

In [None]:
@torch.no_grad()
def evaluate(model, loader=None):
    """Return the mean per-sample cross-entropy of ``model`` over a data loader.

    Losses are summed per batch and divided by the total number of samples, so
    a smaller final batch is weighted by its size instead of counting as much
    as a full batch (which is what averaging the per-batch means would do).

    Args:
        model: ``nn.Module`` mapping a batch of inputs to logits. It is switched
            to eval mode and left in that mode.
        loader: iterable of ``(x, y)`` batches. Defaults to the notebook-level
            test loader ``DLts``.

    Returns:
        float: average loss per sample.

    Raises:
        ValueError: if the loader yields no samples.
    """
    if loader is None:
        loader = DLts
    model.eval()

    # Move batches to the device the model actually lives on; only fall back to
    # the notebook-level `device` for modules without parameters.
    first_param = next(model.parameters(), None)
    target_device = device if first_param is None else first_param.device

    total_loss = 0.0
    n_samples = 0
    for x, y in loader:
        x, y = x.to(target_device), y.to(target_device)
        logits = model(x)
        # reduction='sum' so batches of different sizes can be pooled correctly.
        total_loss += F.cross_entropy(logits, y, reduction='sum').item()
        n_samples += y.shape[0]

    if n_samples == 0:
        raise ValueError("evaluate() received an empty data loader")

    return total_loss / n_samples

In [None]:
def train(model, optimizer, n_epoch = 10):
    """Train ``model`` on the notebook-level training loader and report progress.

    The global torch RNG is re-seeded with 42 on every call. After each epoch
    the mean training loss over that epoch's batches and the test loss from
    ``evaluate`` are printed.

    Args:
        model: network to optimise; it is called on each input batch.
        optimizer: optimizer already bound to ``model``'s parameters.
        n_epoch: number of passes over ``DLtr``.

    Returns:
        list[float]: the training loss of every individual batch, in order,
        across all epochs.
    """
    torch.manual_seed(42)
    lossi = []

    for epoch in range(n_epoch):
        model.train()
        running_loss = 0.0

        for xb, yb in DLtr:
            xb = xb.to(device)
            yb = yb.to(device)
            loss = F.cross_entropy(model(xb), yb)

            # Standard step: clear old gradients, backpropagate, update.
            # (Manual equivalent: set each param.grad to None, then
            #  param.data -= 0.01 * param.grad after the backward pass.)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for both the running total and the history.
            step_loss = loss.item()
            running_loss += step_loss
            lossi.append(step_loss)

        epoch_loss = running_loss / len(DLtr)
        eval_loss = evaluate(model)
        print(f"Epoch {epoch+1} | Train Loss: {epoch_loss:.3f} | Eval Loss: {eval_loss:.3f}")
        # Optional live view per epoch: plot_embedding() followed by clear_output(wait=True).

    return lossi

In [None]:
# Train the small model (n_embd=2, n_hidden=50) with its Adam optimizer for the
# default 10 epochs; `lossi` receives the per-batch training losses.
lossi = train(model, optimizer)

Epoch 1 | Train Loss: 2.386 | Eval Loss: 2.562
Epoch 2 | Train Loss: 2.348 | Eval Loss: 2.538
Epoch 3 | Train Loss: 2.322 | Eval Loss: 2.525
Epoch 4 | Train Loss: 2.302 | Eval Loss: 2.507
Epoch 5 | Train Loss: 2.287 | Eval Loss: 2.505
Epoch 6 | Train Loss: 2.276 | Eval Loss: 2.511
Epoch 7 | Train Loss: 2.267 | Eval Loss: 2.498
Epoch 8 | Train Loss: 2.260 | Eval Loss: 2.490
Epoch 9 | Train Loss: 2.255 | Eval Loss: 2.497
Epoch 10 | Train Loss: 2.250 | Eval Loss: 2.491


In [None]:
# Train the larger model (n_embd=32, n_hidden=150). Note: this rebinds `lossi`,
# so the loss history of the previous run is no longer reachable by that name.
lossi = train(model2, optimizer2)

Epoch 1 | Train Loss: 2.242 | Eval Loss: 2.423
