In [73]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [74]:
# Read the file into a list with one entry per line. The context manager closes
# the file handle as soon as the read finishes instead of leaving it open.
with open("names.txt") as names_file:
    words = names_file.read().splitlines()

In [75]:
# number of lines that were read into `words`
len(words)

32033

In [76]:
# Vocabulary: every distinct character that occurs in `words`, in sorted order.
chars = sorted(set(''.join(words)))
# One extra entry is reserved for the '.' marker, which is given index 0 below.
vocab_size = len(chars) + 1

# Encoder (char -> index): the sorted characters are numbered from 1, '.' is 0.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Decoder (index -> char): the inverse mapping of the encoder.
itos = {idx: ch for ch, idx in stoi.items()}

In [77]:
# build dataset
block_size = 3 # how many preceding characters are used to predict the next one
X, Y = [], []

# NOTE: only the first five entries of `words` are used, so the dataset is tiny.
for word in words[:5]:
    # every word starts from a window filled with index 0 (the '.' marker)
    context = [0] * block_size
    for ch in f"{word}.":
        ix = stoi[ch]
        X.append(context)  # input: the current window of indices
        Y.append(ix)       # target: index of the character that follows it
        # slide the window: drop the oldest index, append the newest
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

In [78]:
# first example: its context window of indices and the target index
X[0], Y[0]

(tensor([0, 0, 0]), tensor(5))

In [79]:
# lookup table with one randomly initialised 2-dimensional row per vocabulary index
C = torch.randn((vocab_size, 2))

In [80]:
# (vocab_size, 2): one row per vocabulary index
C.shape

torch.Size([27, 2])

In [81]:
# indexing C with the integer tensor X picks a row of C for every index in X;
# shown here for example 13: its indices next to the rows they select
X[13], C[X][13]

(tensor([0, 0, 1]),
 tensor([[-0.3490,  0.3206],
         [-0.3490,  0.3206],
         [-0.0375, -0.5325]]))

In [82]:
# rows of C selected by the indices of example 0
C[X][0]

tensor([[-0.3490,  0.3206],
        [-0.3490,  0.3206],
        [-0.3490,  0.3206]])

In [83]:
# look up a row of C for every index in X; the result adds a trailing
# dimension of size 2 (the row width of C) to the shape of X
tok_emb = C[X]
tok_emb.shape

torch.Size([32, 3, 2])

In [84]:
# first layer: weights taking 6 input values to 100 hidden units, plus a bias
W1 = torch.randn((6, 100))
b = torch.randn((100,))

In [85]:
# one way to reshape to a 32 x 6 matrix: split along dim 1 and concatenate the pieces
# (this is not the cell's last expression, so it is computed but not displayed)
torch.cat(torch.unbind(tok_emb,1), 1)
# a more efficient way: a single view call, without the unbind/cat round trip
tok_emb.view(-1, 6) # -1 means infer the size, which is 32 here

tensor([[-0.3490,  0.3206, -0.3490,  0.3206, -0.3490,  0.3206],
        [-0.3490,  0.3206, -0.3490,  0.3206, -0.7880, -0.0998],
        [-0.3490,  0.3206, -0.7880, -0.0998,  0.0224, -0.5022],
        [-0.7880, -0.0998,  0.0224, -0.5022,  0.0224, -0.5022],
        [ 0.0224, -0.5022,  0.0224, -0.5022, -0.0375, -0.5325],
        [-0.3490,  0.3206, -0.3490,  0.3206, -0.3490,  0.3206],
        [-0.3490,  0.3206, -0.3490,  0.3206,  0.4983, -2.7033],
        [-0.3490,  0.3206,  0.4983, -2.7033,  0.1693,  1.3699],
        [ 0.4983, -2.7033,  0.1693,  1.3699, -0.4941, -0.2430],
        [ 0.1693,  1.3699, -0.4941, -0.2430,  0.7316, -0.8155],
        [-0.4941, -0.2430,  0.7316, -0.8155, -0.4941, -0.2430],
        [ 0.7316, -0.8155, -0.4941, -0.2430, -0.0375, -0.5325],
        [-0.3490,  0.3206, -0.3490,  0.3206, -0.3490,  0.3206],
        [-0.3490,  0.3206, -0.3490,  0.3206, -0.0375, -0.5325],
        [-0.3490,  0.3206, -0.0375, -0.5325,  0.7316, -0.8155],
        [-0.0375, -0.5325,  0.7316, -0.8

In [86]:
# Hidden layer activations. Each example's embeddings are flattened into one row
# by pinning the batch dimension and inferring the width. With view(-1, 6) a
# width other than 6 could be silently reshaped into a different number of rows;
# here a width that does not match W1 fails loudly in the matmul instead.
h = torch.tanh(tok_emb.view(tok_emb.shape[0], -1) @ W1 + b)

In [87]:
# output layer: maps the 100 hidden units to one score per vocabulary index
W2 = torch.randn((100, vocab_size))
b2 = torch.randn((vocab_size,))

In [88]:
# calculate the logits and normalize to probabilities
logits = h @ W2 + b2
# Subtract each row's maximum before exponentiating. The normalized result is
# unchanged by a per-row shift, but the largest exponent is now exp(0) = 1, so
# exp() cannot overflow to inf (and prob cannot become nan) for large logits.
counts = (logits - logits.max(-1, keepdim=True).values).exp()
prob = counts / counts.sum(-1, keepdim=True) # each row sums to 1

In [89]:
# calculate the loss
# Manual equivalent (mean negative log likelihood of the target indices):
#loss = -prob[torch.arange(len(Y)), Y].log().mean()
loss = F.cross_entropy(input=logits, target=Y)
loss

tensor(15.9860)

In [94]:
# hyperparameters
max_epochs = 100      # number of iterations of the training loop
learning_rate = 1e-1  # factor applied to each gradient in the parameter update

In [95]:
# Seeded generator so every run starts from the same initial parameters.
g = torch.Generator().manual_seed(42)

# Layer sizes are named once here instead of being repeated as bare numbers.
n_embd = 2      # width of each row of the embedding table
n_hidden = 100  # number of units in the hidden layer

# embedding table: one n_embd-wide row per vocabulary index
C = torch.randn(vocab_size, n_embd, generator=g)

# first layer: its input is the block_size embeddings of a context laid side by
# side, so the input width is derived rather than hard-coded (3 * 2 = 6 here)
W1 = torch.randn(block_size * n_embd, n_hidden, generator=g)
b1 = torch.randn(n_hidden, generator=g)

# output layer
W2 = torch.randn(n_hidden, vocab_size, generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

In [96]:
# turn on gradient tracking for every parameter so backward() populates .grad
for param in parameters:
    param.requires_grad = True

In [97]:
# Gradient descent on the whole dataset: every iteration uses all of X and Y.
for i in range(max_epochs):
    # forward pass
    tok_emb = C[X] # (number of examples, block_size, embedding width)
    # pin the batch dimension and infer the flattened width, so the layer input
    # size is not hard-coded and a mismatch with W1 raises instead of reshaping
    h = torch.tanh(tok_emb.view(tok_emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward pass
    for p in parameters: # clear gradients left over from the previous step
        p.grad = None
    loss.backward()

    # update parameters: in-place step under no_grad so autograd does not record
    # it (used here instead of writing through the `.data` attribute)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

In [98]:
# loss from the last loop iteration, converted to a plain Python number
loss.item()

0.28075531125068665