In [99]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
# read the word list, one entry per line; the context manager closes the
# file handle as soon as its contents have been read
with open("names.txt") as f:
    words = f.read().splitlines()

In [101]:
# number of entries read from names.txt (one per line)
len(words)

32033

In [102]:
# vocabulary: every distinct character that occurs in the words, in sorted order
chars = sorted(set("".join(words)))
# one extra slot on top of the characters is reserved for the '.' marker
vocab_size = 1 + len(chars)
# encoding and decoding chars: characters get ids 1..len(chars), '.' gets id 0
stoi = {char: idx for idx, char in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: char for char, idx in stoi.items()}

In [103]:
# build dataset
block_size = 3 # size of context for predictions
X, Y = [], []

# NOTE(review): only the first five words are turned into examples here
for word in words[:5]:
    # every word starts from a context window filled with index 0 (the '.' id)
    context = [0 for _ in range(block_size)]
    # the appended '.' makes the end of the word a prediction target too
    for ch in word + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # slide the window by building a NEW list; X still holds a reference
        # to the previous one, so it must not be mutated in place
        context = [*context[1:], ix]

# convert the collected python lists into tensors
X, Y = torch.tensor(X), torch.tensor(Y)

In [104]:
# first training example: the context is all zeros (the id assigned to '.')
# and the target is the id of the first character of the first word
X[0], Y[0]

(tensor([0, 0, 0]), tensor(5))

In [105]:
# hyperparameters
learning_rate = 0.1  # step size that scales each gradient in the parameter update
max_epochs = 100  # number of iterations of the training loop

In [106]:
# layer sizes (named so the shapes below are not magic numbers)
n_embd = 2  # dimensionality of each character embedding
n_hidden = 100  # number of units in the hidden layer

# seeded generator so the initial parameter values are reproducible
g = torch.Generator().manual_seed(42)

# embedding table: one row per vocabulary entry
C = torch.randn(vocab_size, n_embd, generator=g)

# first layer: its input is the block_size embeddings of one context laid
# side by side, hence block_size * n_embd input features
W1 = torch.randn(block_size * n_embd, n_hidden, generator=g)
b1 = torch.randn(n_hidden, generator=g)

# output layer: one logit per vocabulary entry
W2 = torch.randn(n_hidden, vocab_size, generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

In [107]:
# flag every parameter tensor so autograd records operations on it
for param in parameters:
    param.requires_grad = True

In [108]:
# full-batch gradient descent: every iteration uses all of X / Y
for i in range(max_epochs):
    # forward pass
    tok_emb = C[X] # (num_examples, block_size, embedding_dim)
    # flatten each context's embeddings into a single row; the width is taken
    # from W1 so the reshape always matches the first layer's input size
    h = torch.tanh(tok_emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward pass
    for p in parameters: # zero out the gradients
        p.grad = None
    loss.backward()

    # update parameters
    # in-place step under no_grad so the update itself is not recorded by
    # autograd (replaces writing through `.data`)
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

In [109]:
# training loss from the last forward pass (computed before the final parameter update)
loss.item()

0.28075531125068665