In [1]:
# Load the training names, one per line. The context manager closes the
# file handle as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Vocabulary: every distinct character that appears in `words`, sorted.
letters = sorted(set(''.join(words)))

# Character -> integer index. Letters are numbered from 1 so that index 0
# stays free for the '.' token, which is assigned explicitly below.
stoi = dict(zip(letters, range(1, len(letters) + 1)))
stoi['.'] = 0

stoi.items()

dict_items([('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5), ('f', 6), ('g', 7), ('h', 8), ('i', 9), ('j', 10), ('k', 11), ('l', 12), ('m', 13), ('n', 14), ('o', 15), ('p', 16), ('q', 17), ('r', 18), ('s', 19), ('t', 20), ('u', 21), ('v', 22), ('w', 23), ('x', 24), ('y', 25), ('z', 26), ('.', 0)])

In [3]:
# Reverse lookup of `stoi`: integer index -> character.
itos = {index: char for char, index in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

# Build a single forward pass of the neural network

In [4]:
## Instantiate X

# NOTE(review): the name `input` shadows Python's builtin `input()` in this
# notebook's namespace; it is kept here so the existing variable name is unchanged.
input = '.emma'
# Encode every character of the input string as its integer index.
input_vec = [stoi[ch] for ch in input]
input_vec

[0, 5, 13, 13, 1]

In [None]:
## Initialise the W matrix

cl = 27  # number of classes; also used as both dimensions of W
# Seeded generator so the random weights are reproducible across runs.
g = torch.Generator()
g.manual_seed(2147483647)
# cl x cl weights drawn with torch.rand (uniform samples).
W = torch.rand((cl, cl), generator=g)

In [19]:
## Compute forward pass
X = torch.tensor(input_vec)
# Input to the neural net: one one-hot row per character, `cl` columns wide.
xenc = F.one_hot(X, num_classes=cl).float()
# Predicted log-counts; torch.matmul(xenc, W) is the same operation as `xenc @ W`.
y = torch.matmul(xenc, W)

#### Calculate probability matrix
# Exponentiate the log-counts to get positive "counts" (playing the role of a count matrix N).
cnt = y.exp()
# Normalise each row so it sums to 1: probabilities for the next character.
row_totals = cnt.sum(dim=1, keepdim=True)
P = cnt / row_totals
print(P.shape)


torch.Size([5, 27])


In [21]:
## Instantiate y

label = 'emma.'
# `ch` rather than `chr`, so the builtin chr() is not shadowed inside the comprehension.
ys = torch.tensor([stoi[ch] for ch in label])

## Calculate loss (negative log-likelihood) for y
# enumerate() supplies the row index into P, so no hand-maintained counter is
# needed and the `cnt` / `y` tensors computed in the forward pass are not
# overwritten by loop scalars.
loss = torch.tensor(0.0)
for row, target in enumerate(ys):
    p = P[row, target]      # probability the model gave to the correct next character
    logp = torch.log(p)
    nll = -logp
    loss += nll
    print('Probability:', round(p.item(), 2), 'Log probability:', round(logp.item(), 2), 'Negative log likelihood (i.e. loss):', round(nll.item(), 2))

# Average the summed negative log-likelihood over the number of examples.
mean_loss = loss / len(ys)
print('Mean loss:', round(mean_loss.item(), 2))

Probability: 0.02 Log probability: -3.78 Negative log likelihood (i.e. loss): 3.78
Probability: 0.03 Log probability: -3.66 Negative log likelihood (i.e. loss): 3.66
Probability: 0.03 Log probability: -3.38 Negative log likelihood (i.e. loss): 3.38
Probability: 0.03 Log probability: -3.38 Negative log likelihood (i.e. loss): 3.38
Probability: 0.04 Log probability: -3.21 Negative log likelihood (i.e. loss): 3.21
Mean loss: 3.48


In [17]:
# Andrej's code
# Walks through every (input, label) bigram, printing the probability the
# model assigned to the correct next character and its negative log-likelihood.
xs = torch.tensor([0, 5, 13, 13, 1])
ys = torch.tensor([5, 13, 13, 1, 0])

# Derive the number of bigrams from the data rather than hard-coding it,
# so editing xs/ys above cannot leave the loop bounds out of sync.
n_examples = len(xs)
nlls = torch.zeros(n_examples)
for i in range(n_examples):
    #i-th bigram:
    x = xs[i].item() #input character index
    y = ys[i].item() #label (next character) index
    print('------------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', P[i])
    print('label (actual next character:)', y)
    p = P[i, y]  # probability of the correct next character for example i
    print('probability assigned by the net to the correct character', p.item())
    logp = torch.log(p)
    print('log likelihood:', round(logp.item(), 2))
    nll = -logp
    print('negative log likelihood:', round(nll.item(), 2))
    nlls[i] = nll

print('===========')
print('average neg log likelihood, i.e. loss =', round(nlls.mean().item(), 2))

------------
bigram example 1: .e (indexes 0,5)
input to the neural net: 0
output probabilities from the neural net: tensor([0.0426, 0.0299, 0.0233, 0.0382, 0.0229, 0.0229, 0.0507, 0.0563, 0.0211,
        0.0280, 0.0440, 0.0313, 0.0497, 0.0439, 0.0308, 0.0261, 0.0424, 0.0563,
        0.0547, 0.0325, 0.0425, 0.0222, 0.0472, 0.0285, 0.0566, 0.0295, 0.0258])
label (actual next character:) 5
probability assigned by the net to the correct character 0.022935139015316963
log likelihood: -3.78
negative log likelihood: 3.78
------------
bigram example 2: em (indexes 5,13)
input to the neural net: 5
output probabilities from the neural net: tensor([0.0298, 0.0278, 0.0241, 0.0308, 0.0300, 0.0362, 0.0401, 0.0269, 0.0538,
        0.0445, 0.0560, 0.0239, 0.0358, 0.0258, 0.0370, 0.0542, 0.0275, 0.0513,
        0.0263, 0.0253, 0.0315, 0.0294, 0.0317, 0.0507, 0.0554, 0.0562, 0.0380])
label (actual next character:) 13
probability assigned by the net to the correct character 0.025837397202849388
log like

In [23]:
## Rewrite loss in vector form
# Index P with (row i, column ys[i]) for all examples at once. The row indices
# are derived from len(ys) instead of a hard-coded 5, so this keeps working
# when the number of examples changes.
logp = P[torch.arange(len(ys)), ys].log().mean()  # mean log-probability of the correct characters
loss = -logp  # mean negative log-likelihood

loss

tensor(3.4815)

## Put it all together 

In [25]:
# NOTE(review): `input` shadows Python's builtin `input()`; the name is kept as-is.
input = '.emma'
# Integer index of every character in the input string, as a tensor.
input_ids = [stoi[ch] for ch in input]
xs = torch.tensor(input_ids)

xs

tensor([ 0,  5, 13, 13,  1])

In [26]:
label = 'emma.'
# Integer index of every character in the label string, as a tensor.
label_ids = [stoi[ch] for ch in label]
ys = torch.tensor(label_ids)

ys

tensor([ 5, 13, 13,  1,  0])

In [32]:
## Initialise the W matrix
cl = 27  # number of classes; W is cl x cl
# Seeded generator so the initial weights are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
# Normally-distributed weights; requires_grad=True lets autograd track W.
W = torch.randn((cl, cl), generator=g, requires_grad=True)

In [36]:
## Compute forward pass
# Use `cl` (not a literal 27) so the one-hot width always matches W's shape.
xenc = F.one_hot(xs, num_classes=cl).float()
y = xenc @ W    # log-counts; torch.matmul(xenc, W) is equivalent

## Calculate probability matrix
cnt = torch.exp(y)  # exponentiate the log-counts to get positive counts
P = cnt / cnt.sum(dim=1, keepdim=True)  # row-normalise: probabilities for the next character

## Calculate vectorized loss
# Row indices come from len(ys) rather than a hard-coded 5, so the loss
# stays correct if the number of examples changes.
logp = P[torch.arange(len(ys)), ys].log().mean()
loss = -logp

print('Mean nll (i.e. loss):', round(loss.item(), 2))


Mean nll (i.e. loss): 3.77


In [37]:
## Compute backward pass
delta = 0.1  # step size applied to the gradient in the update below
W.grad = None  # clear any gradient left from a previous backward pass
loss.backward()  # autograd computes d(loss)/dW and stores it in W.grad

# Gradient-descent step. Updating in place under torch.no_grad() keeps the
# update itself out of the autograd graph without reaching through `W.data`,
# which bypasses autograd's tracking of in-place modifications.
with torch.no_grad():
    W -= delta * W.grad

In [38]:
# Display W after the gradient update applied in the previous cell.
W

tensor([[ 1.5661e+00, -2.3749e-01, -2.7631e-02, -1.1009e+00,  2.8554e-01,
         -9.8891e-03, -1.5471e+00,  6.0443e-01,  7.8862e-02,  9.0400e-01,
         -4.7141e-01,  7.8627e-01, -3.2862e-01, -4.3313e-01,  1.3719e+00,
          2.9286e+00,  1.5606e+00, -1.6261e+00,  6.7666e-01, -8.4050e-01,
          9.8420e-01, -1.4859e-01, -1.4796e+00,  4.4790e-01, -7.0966e-02,
          2.4937e+00,  2.4419e+00],
        [-6.5036e-01, -1.2201e+00,  3.0235e-01, -1.0727e+00,  7.2641e-01,
          5.0498e-02,  1.3073e+00, -8.0246e-01, -8.5067e-01, -1.8069e+00,
          1.2503e+00, -1.2258e+00,  1.2145e+00, -9.6500e-01, -2.3257e-01,
         -3.4803e-01,  3.3163e-01, -1.3264e+00,  1.1206e+00,  5.9535e-01,
          4.5753e-01,  5.3393e-02, -1.7401e+00,  1.1494e-01,  8.0189e-01,
          5.4007e-01, -1.1648e+00],
        [ 1.4756e-01, -1.0006e+00,  3.8012e-01,  4.7328e-01, -9.1027e-01,
         -7.8305e-01,  1.3506e-01, -2.1161e-01, -1.0406e+00, -1.5367e+00,
          9.3743e-01, -8.8303e-01,  1.74

In [None]:
## Sample from the model

# Seeded generator so the sampled names are reproducible.
g = torch.Generator().manual_seed(2147483647)

for i in range(10):
    ix = 0      # every sample starts from index 0, the '.' token
    res = []
    while True:
        #============= Before ==============
        # The count-based version read the distribution from a count matrix:
        #     p = N[ix].float(); p /= p.sum()
        # `N` is not defined in this notebook, so that lookup cannot run here.

        #============= Now =================
        # Run the current character through the neural net to get the
        # next-character distribution. no_grad(): sampling needs no gradients.
        with torch.no_grad():
            x_onehot = F.one_hot(torch.tensor([ix]), num_classes=cl).float()
            logits = x_onehot @ W                           # predicted log-counts
            counts = logits.exp()
            p = counts / counts.sum(dim=1, keepdim=True)    # shape (1, cl), row sums to 1

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        res.append(itos[ix])
        if ix == 0:
            # Index 0 ('.') ends the current sample.
            break
    print(''.join(res))