In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the corpus: splitlines() yields one list entry per line of names.txt.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open("names.txt", encoding="utf-8") as f:
  words = f.read().splitlines()
words[:8]  # preview the first eight entries

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)  # number of lines (names) read from names.txt

32033

In [6]:
# Character vocabulary plus the two lookup tables (char -> id and id -> char).
chars = sorted(set(''.join(words)))  # every distinct character, in sorted order
stoi = dict(zip(chars, range(1, len(chars) + 1)))  # regular characters get ids 1..len(chars)
stoi['.'] = 0  # id 0 is reserved for '.', the marker appended to each word
itos = dict((i, s) for s, i in stoi.items())  # inverse table, same insertion order as stoi
print(f"{itos=}")

itos={1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [7]:
# Build (context -> next character) training pairs with a sliding window.

block_size = 3  # context length: how many previous characters are used to predict the next one
X, Y = [], []
for w in words[:5]:  # only the first five words are used here
  context = [0] * block_size  # window starts filled with id 0 ('.')
  for ch in w + '.':  # the trailing '.' produces the end-of-word target
    ix = stoi[ch]
    X.append(context)  # input: the current window of ids
    Y.append(ix)       # target: id of the character that follows it
    context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

In [8]:
# Sanity check: one row of block_size integer ids per example in X, one integer target per example in Y.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [9]:
# Embedding table: one randomly initialised 2-dimensional vector for each of the 27 character ids.
C = torch.randn(27, 2)

In [26]:
# Index C with the whole id tensor at once: every id in X selects its row of C,
# so the result has X's shape with an extra trailing dimension of size 2.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [29]:
# First-layer parameters: 6 inputs (3 context characters x 2 embedding dims) -> 100 hidden units.
W1 = torch.randn(6, 100)  # weights; the hidden width of 100 is a free hyperparameter
b1 = torch.randn((100,))  # one bias per hidden unit

In [39]:
# Hidden-layer pre-activation: flatten each example's 3x2 embedding into 6 features,
# then apply the affine map. Passing -1 lets view() infer the batch dimension, so the
# cell keeps working when the dataset no longer has exactly 32 examples.
h = emb.view(-1, 6) @ W1 + b1 # hidden layer (before the tanh non-linearity)
h.shape
# ^ 100 dimensional activations for each of the training examples

torch.Size([32, 100])

In [42]:
# Hidden layer: flatten the embeddings to 6 features per example, apply the affine map,
# then squash every activation into (-1, 1) with tanh.
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h

In [44]:
h.shape  # one row of 100 hidden activations per training example

torch.Size([32, 100])

In [45]:
b1.shape  # 1-D bias vector; compared against h's shape for the broadcasting check

torch.Size([100])

In [None]:
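# Broadcasting check: shapes are aligned from the right, so b1 (100,) is treated as (1, 100)
# and stretched across the rows. This is correct: the same 100-D bias vector is added to
# each of the 32 training examples.
# 32, 100   <- emb.view(-1, 6) @ W1
#  1, 100   <- b1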

In [46]:
# Output-layer parameters: 100 hidden activations in, 27 scores out (one per character).
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))  # one bias per output character

In [47]:
# Output layer: an affine map from the 100 hidden activations to 27 unnormalised scores per example.
logits = h @ W2 + b2


In [48]:
logits.shape  # one row of 27 scores per training example

torch.Size([32, 27])

In [50]:
# First half of a hand-written softmax: exponentiate the logits into positive "counts".
# NOTE: exp() can overflow to inf for large positive logits; the F.cross_entropy call
# used in the training loop further down does not go through this explicit exp().
counts = logits.exp()

In [53]:
# Second half of the softmax: normalise each row so it sums to 1.
# keepdim=True keeps the row sums as a column so they broadcast back across the row.
prob = counts / counts.sum(dim=1, keepdim=True)

In [61]:
prob.shape  # same shape as logits: normalisation does not change the layout

torch.Size([32, 27])

In [63]:
# Each row is a distribution over the 27 characters (it was divided by its own row sum).
prob # the probability of each character for each training example

tensor([[1.4813e-05, 4.9507e-12, 3.7101e-04, 4.4213e-05, 1.4031e-06, 4.0345e-01,
         3.6777e-08, 2.8899e-10, 2.3362e-11, 1.4314e-02, 1.3185e-13, 1.1037e-06,
         1.2677e-03, 5.4338e-03, 4.2434e-06, 3.3549e-06, 1.7512e-10, 2.3876e-06,
         5.7134e-01, 5.9527e-09, 2.6583e-07, 5.4498e-07, 7.7641e-07, 3.7423e-04,
         1.7179e-09, 3.3473e-03, 3.4533e-05],
        [9.8116e-11, 1.3091e-09, 9.7746e-12, 8.5157e-04, 2.4377e-12, 1.3418e-04,
         1.1531e-05, 9.2460e-09, 3.3055e-13, 9.9606e-09, 3.9576e-11, 1.7197e-02,
         1.2075e-08, 5.4115e-06, 3.8283e-06, 4.5141e-08, 8.6070e-03, 4.1409e-08,
         1.6570e-09, 1.0854e-09, 9.5931e-01, 2.2277e-12, 1.3808e-02, 6.1580e-07,
         8.9950e-13, 4.5421e-06, 7.1229e-05],
        [1.3723e-03, 1.6301e-10, 2.2812e-05, 3.6982e-06, 3.7219e-02, 4.0616e-04,
         1.1141e-01, 3.9589e-11, 3.8617e-02, 7.7586e-07, 5.0776e-14, 8.2956e-07,
         7.9574e-01, 1.7263e-10, 1.2285e-05, 1.8921e-06, 2.1999e-12, 1.7806e-05,
         9.4828e-

In [64]:
# For every example i, pick out prob[i, Y[i]]: the probability the model assigns to the correct label.
# The row indices are derived from Y, so the lookup is not tied to a dataset of exactly 32 examples.
prob[torch.arange(Y.shape[0]), Y]

tensor([4.0345e-01, 5.4115e-06, 1.7263e-10, 5.0775e-11, 9.1275e-09, 3.3549e-06,
        3.1971e-05, 2.0841e-04, 5.0416e-06, 2.4476e-12, 2.7518e-04, 1.6607e-12,
        4.9507e-12, 4.7417e-04, 3.2499e-15, 3.8346e-08, 1.4314e-02, 1.1149e-08,
        8.4075e-10, 2.1631e-12, 3.7346e-06, 2.0490e-08, 7.7135e-01, 2.4604e-07,
        2.1541e-08, 5.9527e-09, 9.8909e-07, 2.2079e-13, 9.4318e-01, 9.4595e-04,
        5.4572e-06, 4.9512e-13])

In [65]:
# Negative log-likelihood: the mean of -log(probability of the correct next character).
# Row indices come from Y's length rather than a hard-coded 32, so this stays valid
# for any dataset size.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(15.9552)

In [66]:
# CLEANED UP: the same network as above, rebuilt with a seeded generator and trained with F.cross_entropy

In [67]:
X.shape, Y.shape # dataset: inputs (examples x block_size) and one target per example


(torch.Size([32, 3]), torch.Size([32]))

In [82]:
# Re-create all parameters from one seeded generator so every run starts from identical weights.
g = torch.Generator()
g.manual_seed(2147483647)  # for reproducibility
# Draw order matters for reproducibility: C, W1, b1, W2, b2.
C, W1, b1, W2, b2 = (
  torch.randn(*shape, generator=g)
  for shape in [(27, 2), (6, 100), (100,), (100, 27), (27,)]
)
parameters = [C, W1, b1, W2, b2]  # everything the optimisation loop updates

In [83]:
# 27*2 + 6*100 + 100 + 100*27 + 27 = 3481
sum(p.nelement() for p in parameters) # number of parameters in total


3481

In [84]:
# Ask autograd to track every parameter so loss.backward() fills in its .grad.
for p in parameters:
  p.requires_grad_(True)


In [90]:
# Full-batch gradient descent: every step uses all examples in X / Y.
lr = 0.1  # learning rate (step size)
for _ in range(1000):
  # forward pass
  emb = C[X] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Y)
  print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None  # clear stale gradients so backward() does not accumulate into them
  loss.backward()

  # update parameters
  # The in-place step runs under no_grad() instead of going through the legacy `.data`
  # attribute: the update stays out of the graph, but autograd still sees the in-place write.
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

0.5532564520835876
0.5344884991645813
0.5171173214912415
0.5013316869735718
0.4872431755065918
0.47484081983566284
0.463998019695282
0.4545147716999054
0.44617143273353577
0.4387666583061218
0.4321335256099701
0.42613908648490906
0.42068007588386536
0.4156756103038788
0.41106167435646057
0.40678736567497253
0.40281081199645996
0.3990974426269531
0.3956182599067688
0.39234787225723267
0.3892655074596405
0.3863520920276642
0.3835917115211487
0.3809700906276703
0.3784741759300232
0.3760930299758911
0.37381646037101746
0.37163496017456055
0.3695409595966339
0.3675268888473511
0.3655855357646942
0.3637113571166992
0.36189839243888855
0.36014169454574585
0.35843634605407715
0.35677802562713623
0.35516270995140076
0.35358691215515137
0.3520469665527344
0.35053980350494385
0.3490622341632843
0.3476121425628662
0.34618648886680603
0.3447834849357605
0.34340086579322815
0.34203672409057617
0.3406897783279419
0.339358389377594
0.3380417823791504
0.3367387056350708
0.3354485034942627
0.33417096734

In [76]:
# NOTE(review): this cell's execution count (76) is lower than the training cell's (90), so the
# value displayed below was presumably computed before that training run; re-run the notebook
# top to bottom to see the loss for the current `logits`.
F.cross_entropy(logits, Y)

tensor(17.7697)

In [None]:
# Small hand-written logits vector built from integer literals; note that this rebinds
# `logits`, which previously held the network's output.
logits = torch.tensor([-2, -3, 0, 5])