In [282]:
from __future__ import annotations
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random

%matplotlib inline

In [264]:
# Load one name per line. The context manager guarantees the file handle is
# closed as soon as the contents have been read.
with open("names.txt", "r") as f:
    words: list[str] = f.read().splitlines()

In [265]:
# Vocabulary: every distinct character that occurs in the words, sorted
chars: list[str] = sorted(set("".join(words)))

# Character -> integer id, numbered from 1; id 0 is assigned to "."
stoi: dict[str, int] = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0

# Integer id -> character (inverse of stoi)
itos: dict[int, str] = {idx: ch for ch, idx in stoi.items()}

In [266]:
# Build dataset
def build_dataset(
    words: list[str],
    print_examples: int = 0,
    block_size: int = 3,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with "." and scanned left to right. For every
    character, the ids of the ``block_size`` preceding characters (padded with
    id 0 at the start of the word) form one row of ``X`` and the character's
    own id is the matching entry of ``Y``. Ids are looked up in the
    module-level ``stoi`` mapping; ``itos`` is used only for printing.

    Args:
        words: Words to encode. Every character (and ".") must be a key of
            ``stoi``.
        print_examples: Print the context/target pairs of the first
            ``print_examples`` words.
        block_size: Context length, i.e. how many previous characters are
            used to predict the next one.

    Returns:
        ``(X, Y)`` as int64 tensors of shape ``(N, block_size)`` and ``(N,)``.
        For an empty ``words`` list N is 0 and the shapes/dtypes still hold.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
        KeyError: If a character is missing from ``stoi``.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    X: list[list[int]] = []
    Y: list[int] = []
    for word_idx, w in enumerate(words):
        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            if word_idx < print_examples:
                print(''.join(itos[t] for t in context), '---->', itos[ix])
            # Slide the window: drop the oldest id, append the current one.
            # This builds a new list, so rows already stored in X are unaffected.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed when no examples
    # were produced (torch.tensor([]) alone would be float32 with shape (0,)).
    return (
        torch.tensor(X, dtype=torch.long).reshape(len(X), block_size),
        torch.tensor(Y, dtype=torch.long),
    )

In [267]:
# Encode the full word list and echo the context/target pairs of the first 5 words
X, Y = build_dataset(words, print_examples=5)

... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
... ----> a
..a ----> v
.av ----> a
ava ----> .
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [268]:
# Sanity check: sizes and dtypes of the inputs and targets
X.size(), Y.size(), X.dtype, Y.dtype

(torch.Size([228146, 3]), torch.Size([228146]), torch.int64, torch.int64)

In [269]:
# Embedding lookup table: 27 rows of 2 values each, drawn from a standard normal
C = torch.randn(27, 2)

In [270]:
# Hidden-layer parameters: 6 input features -> 100 units
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [271]:
# Output-layer parameters: 100 hidden units -> 27 logits
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [272]:
# Gather every parameter tensor, then show the total number of scalars they hold
parameters = [C, W1, b1, W2, b2]
sum(p.numel() for p in parameters)

3481

In [273]:
# Forward pass, step 1: index the embedding table with the integer contexts
emb = C[X]
emb.size()

torch.Size([228146, 3, 2])

In [274]:
# Flatten each context's embeddings into 6 features, apply the hidden layer, squash with tanh
h = (emb.view(-1, 6) @ W1 + b1).tanh()
print(h)
h.size()

tensor([[ 0.9208,  0.9263,  0.9198,  ..., -0.4087, -0.9899, -0.9319],
        [-0.9982, -0.0732, -0.7086,  ...,  0.9984, -0.8796, -0.9861],
        [-1.0000,  0.9921, -0.1214,  ...,  0.9997, -0.9582, -0.9812],
        ...,
        [ 0.8116, -0.9484,  0.2638,  ..., -0.9804, -0.9935, -0.2724],
        [-0.2628, -0.1613,  0.2926,  ..., -0.9996, -0.9996,  0.9067],
        [ 0.9632, -0.9942,  0.4905,  ..., -0.8996, -0.7454, -0.3228]])


torch.Size([228146, 100])

In [275]:
# Output layer: one row of 27 logits per example
logits = torch.matmul(h, W2) + b2
logits.size()

torch.Size([228146, 27])

In [276]:
# Hand-written softmax: exponentiate the logits, then normalise each row to sum to 1
counts = torch.exp(logits)
prob = counts / counts.sum(dim=-1, keepdim=True)
prob.size()

torch.Size([228146, 27])

In [277]:
# Negative mean log-probability assigned to the correct next character
loss = -torch.log(prob[torch.arange(Y.shape[0]), Y]).mean()
loss

tensor(17.4237)

In [278]:
# The same loss computed by the built-in, directly from the logits
loss = F.cross_entropy(input=logits, target=Y)
loss

tensor(17.4237)

In [279]:
# Turn on gradient tracking for every parameter tensor
for p in parameters:
    p.requires_grad_(True)

In [280]:
# Candidate learning rates: 1000 exponents evenly spaced from -3 to 0, used as powers of 10
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = 10 ** lre

In [281]:
# Buffers for learning-rate / loss tracking. They stay empty unless the
# "Track stats" lines at the bottom of the loop are re-enabled.
lri = []
lossi = []

for i in range(10000):
    # Minibatch construct: 32 example indices sampled uniformly with replacement
    ix = torch.randint(0, X.shape[0], (32, ))

    # Forward pass
    emb = C[X[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])

    # Backward pass: clear stale gradients, then backpropagate this batch's loss
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update
    # lr = lrs[i]  # learning-rate sweep; lrs has 1000 entries, so this needs range(len(lrs)) above
    lr = 0.01
    # Update in place under no_grad instead of through `.data`, so the step is
    # not recorded by autograd while still going through the tensor itself.
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad

    # Track stats
    # lri.append(lr)
    # lossi.append(loss.item())

# Loss of the final minibatch only (not the loss over the full dataset)
print(loss.item())

2.7184276580810547


In [None]:
# Create the data sets
# NOTE: shuffles `words` in place, so re-running this cell reshuffles the
# already-shuffled list and produces a different split.
random.shuffle(words)

# Split boundaries: first 80% -> train, next 10% -> dev, last 10% -> test
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# Training split (80%)
Xtr, Ytr = build_dataset(words[:n1])

# Dev/validation split (10%)
Xdev, Ydev = build_dataset(words[n1:n2])

# Test split (10%)
Xte, Yte = build_dataset(words[n2:])