In [28]:
import torch
import torch.nn.functional as F
import matplotlib as plt
%matplotlib inline

In [29]:
words = open('data/names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [30]:
len(words)

32033

In [31]:
# build vocabulary of characters and mappings to/from ints
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {s:i for i, s in stoi.items()}

In [107]:
# build the dataset
block_size = (
    3  # how many characters to use to predict the next (e.g., given 3, predict the 4th)
)
X, Y = [], []
for w in words:
    context = [0] * block_size
    for ch in w + ".":
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]  # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)


In [108]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [55]:
# map characters into two-dimensional space (2 features)
C = torch.randn((27, 2))

In [42]:
C[5]

tensor([ 0.1608, -0.4292])

In [43]:
# we won't use this, but for demo purposes
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([ 0.1608, -0.4292])

In [58]:
# TODO: deep dive on "vectorized" indexing

emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [60]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [69]:
# note this is the same as a view of 32, 6 on emb
# but this is inefficient and creates new memory
# torch.cat(torch.unbind(emb, 1), 1).shape

In [73]:
# note: -1 means pytorch should infer the size
# and we are using broadcasting because the shape is (32, 100) + (1, 100)
h = emb.view(-1, 6) @ W1 + b1
h

tensor([[-0.3756, -1.4700,  0.9593,  ..., -1.0189, -0.1805, -1.7467],
        [ 1.4458, -2.9249, -0.6455,  ...,  0.8396, -0.3092, -3.2581],
        [-2.0311,  3.3760,  4.9581,  ..., -2.4709,  3.5807,  0.5737],
        ...,
        [ 4.7768,  1.3318,  2.7927,  ...,  2.3503,  2.3656, -3.2482],
        [-3.8487, -1.7352,  0.4011,  ..., -5.1642,  2.3147,  0.1899],
        [-2.3546, -2.6379,  1.2796,  ...,  1.2726,  3.4596,  2.4552]])

In [72]:
h.shape

torch.Size([32, 100])

In [80]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [81]:
logits = h @ W2 + b2

In [82]:
logits.shape

torch.Size([32, 27])

In [83]:
counts = logits.exp()

In [88]:
prob = counts / counts.sum(1, keepdims=True)
prob.shape

torch.Size([32, 27])

In [93]:
# --- clean update

In [109]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [110]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [111]:
sum(p.nelement() for p in parameters)

3481

In [117]:
for p in parameters:
    p.requires_grad = True

for _ in range(100):
    # minibatch
    ix = torch.randint(0, X.shape[0], (32,))

    # forward pass
    emb = C[X[ix]]  # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2  # (32, 27)
    # below is example of cross_entropy
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdims=True)
    # loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -0.1 * p.grad

print(loss.item())


2.812567710876465


In [120]:
emb = C[X] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Y)
loss

tensor(2.7215, grad_fn=<NllLossBackward0>)