In [99]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
# Load the corpus: one name per line of names.txt.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until the garbage collector gets to it.
with open("names.txt") as f:
    words = f.read().splitlines()

In [101]:
# Number of names loaded (one list entry per line of names.txt).
len(words)

32033

In [102]:
# Vocabulary: every distinct character occurring in the names, in sorted order.
chars = sorted(set(''.join(words)))
# One extra id is reserved for '.', the token used to pad the start of a
# context and to mark the end of a name.
vocab_size = 1 + len(chars)

# stoi: character -> integer id. Real characters get ids 1..len(chars); '.' is 0.
stoi = dict(zip(chars, range(1, vocab_size)))
stoi['.'] = 0
# itos: integer id -> character (the inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

In [110]:
# Build the dataset: every example pairs the ids of the previous `block_size`
# characters (the context) with the id of the character that follows them.
block_size = 3  # number of context characters used for each prediction
X, Y = [], []

for word in words:
    # ids of the name's characters followed by the terminating '.'
    ids = [stoi[ch] for ch in word + '.']
    # left-pad with id 0 ('.') so the first character also has a full context
    padded = [0] * block_size + ids
    # slide a window of width block_size over the padded sequence; the target
    # at position `pos` is the id that sits right after the window
    for pos, ix in enumerate(ids):
        X.append(padded[pos:pos + block_size])
        Y.append(ix)

X = torch.tensor(X)  # one row of block_size context ids per example
Y = torch.tensor(Y)  # the target id for each row of X

In [104]:
# Inspect the first training example: the context row and its target id.
# The context is all zeros because every name starts from a '.'-padded context.
X[0], Y[0]

(tensor([0, 0, 0]), tensor(5))

In [186]:
# Training configuration.
learning_rate = 1e-2  # step size of the gradient-descent update
# Despite the name, this counts training-loop iterations; each iteration
# processes a single minibatch, not a full pass over the data.
max_epochs = 10_000
batch_size = 32       # number of examples sampled per iteration

In [178]:
# Model dimensions. Naming them keeps the layer shapes consistent with each
# other and with the dataset's block_size instead of relying on literals
# (the hidden layer's input width used to be a bare 6 == block_size * 2).
n_embd = 2      # length of each character's embedding vector
n_hidden = 100  # number of units in the tanh hidden layer

# Seeded generator so the initial parameter values are reproducible.
g = torch.Generator().manual_seed(42)

# first layer: embedding table C, then the hidden layer, which consumes the
# concatenated embeddings of all block_size context characters
C = torch.randn(vocab_size, n_embd, generator=g)
W1 = torch.randn(block_size * n_embd, n_hidden, generator=g)
b1 = torch.randn(n_hidden, generator=g)

# output layer: one logit per vocabulary entry
W2 = torch.randn(n_hidden, vocab_size, generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

In [179]:
# Switch on gradient tracking for every trainable tensor so that
# loss.backward() fills in their .grad attributes.
for p in parameters:
    p.requires_grad_(True)

In [180]:
# Candidate learning rates on a log scale: 1000 exponents evenly spaced over
# [-3, 0], i.e. rates running from 1e-3 up to 1.
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = torch.pow(10, lre)
lrs

tensor([0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012,
        0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0013, 0.0013, 0.0013,
        0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0014,
        0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014,
        0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015,
        0.0015, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016,
        0.0016, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017,
        0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0019,
        0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0020, 0.0020,
        0.0020, 0.0020, 0.0020, 0.0020, 0.0020, 0.0021, 0.0021, 0.0021, 0.0021,
        0.0021, 0.0021, 0.0021, 0.0022, 

In [189]:
# Minibatch gradient descent: each iteration samples batch_size random
# examples, evaluates the cross-entropy loss of the MLP on them,
# backpropagates, and takes one gradient step on every parameter.
for i in range(max_epochs):
    # construct minibatch: random row indices into the dataset, drawn from the
    # seeded generator `g` so the sequence of batches is reproducible
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    # forward pass
    tok_emb = C[X[ix]]  # (batch_size, block_size, embedding_dim)
    # Flatten each example's context embeddings into a single row; inferring
    # the width avoids hard-coding block_size * embedding_dim.
    h = torch.tanh(tok_emb.view(tok_emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    if i % 10 == 0:
        # This is the loss of the current minibatch only, so it is noisy.
        print(f'Loss: {loss.item():.4f}')

    # backward pass
    for p in parameters:  # zero out the gradients
        p.grad = None
    loss.backward()

    # update parameters
    lr = learning_rate
    # Update in place under no_grad rather than through `.data`: the step is
    # still excluded from the autograd graph, but autograd's in-place
    # correctness checks are not bypassed.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad


Loss: 2.2897
Loss: 2.7117
Loss: 2.0168
Loss: 2.4474
Loss: 2.5954
Loss: 2.5244
Loss: 2.3938
Loss: 2.2994
Loss: 2.2121
Loss: 2.6540
Loss: 2.0953
Loss: 2.2042
Loss: 2.2668
Loss: 2.0916
Loss: 2.3668
Loss: 2.3124
Loss: 2.1729
Loss: 2.4946
Loss: 1.8778
Loss: 2.2836
Loss: 2.3575
Loss: 2.3274
Loss: 2.3074
Loss: 2.3237
Loss: 2.7228
Loss: 2.5603
Loss: 2.0593
Loss: 2.3503
Loss: 2.0457
Loss: 2.1061
Loss: 2.0404
Loss: 2.7363
Loss: 1.9836
Loss: 2.2680
Loss: 2.2281
Loss: 2.2785
Loss: 2.6613
Loss: 2.4116
Loss: 2.1132
Loss: 2.4175
Loss: 2.1551
Loss: 2.1684
Loss: 1.9031
Loss: 2.2179
Loss: 2.4538
Loss: 2.4645
Loss: 1.9633
Loss: 2.1315
Loss: 2.1618
Loss: 2.1152
Loss: 2.2388
Loss: 2.0031
Loss: 2.3461
Loss: 2.1538
Loss: 2.3101
Loss: 2.5187
Loss: 2.1354
Loss: 2.2416
Loss: 2.1845
Loss: 2.0395
Loss: 2.6507
Loss: 2.5245
Loss: 2.0707
Loss: 2.0901
Loss: 2.4001
Loss: 2.2808
Loss: 2.3423
Loss: 2.4370
Loss: 2.4455
Loss: 2.3972
Loss: 2.0100
Loss: 2.0788
Loss: 2.1545
Loss: 2.2398
Loss: 2.3711
Loss: 2.5122
Loss: 2.5555

In [190]:
# Loss over the entire dataset. This is the same data the model was trained
# on, so it measures training fit rather than generalisation.
# Evaluation needs no gradients: no_grad avoids building an autograd graph
# across every example.
with torch.no_grad():
    tok_emb = C[X]  # (num_examples, block_size, embedding_dim)
    # flatten each example's context embeddings into one row (width inferred)
    h = torch.tanh(tok_emb.view(tok_emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
loss.item()

2.3191893100738525