### Version 2, since this is a good checkpoint and it's good to summarize!

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read one name per line; the context manager closes the file handle as soon as we are done.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()
# Vocabulary: every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 so that index 0 stays reserved for the '.' marker.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {i: s for s, i in stoi.items()}

In [3]:
block_size = 3  # context length: how many preceding characters are used to predict the next one
inputs, targets = [], []
for name in words[:5]:
    # Every name starts from a window made entirely of index 0 (the '.' marker).
    window = [0] * block_size
    for symbol in name + '.':
        target_ix = stoi[symbol]
        inputs.append(window)
        targets.append(target_ix)
        # Slide the window: drop the oldest index, append the one just seen.
        window = window[1:] + [target_ix]

# X holds one row of block_size context indices per example; Y holds the index to predict.
X = torch.tensor(inputs)
Y = torch.tensor(targets)

In [4]:
# Seeded generator so that every run draws the same initial parameters.
g = torch.Generator().manual_seed(2147483647)
# A torch.Generator only has an effect when it is passed explicitly to each sampling call.
C = torch.randn((27, 2), generator=g)     # embedding table: a 2-dimensional row per index
W1 = torch.randn((6, 100), generator=g)   # hidden layer: 6 inputs = 3 context positions x 2 embedding dims
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)  # output layer: 100 hidden units -> 27 logits
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]
sum(p.nelement() for p in parameters)  # total number of scalar parameters

3481

In [5]:
emb = C[X]  # look up one embedding row for every context index in X
# Flatten each example's per-position embeddings into a single row before the hidden layer.
h = torch.tanh(emb.view(-1, emb.shape[1]*emb.shape[2]) @ W1 + b1)
logits = h @ W2 + b2
# Manual softmax: exponentiate, then normalise every row so it sums to 1.
counts = logits.exp()
prob = counts / counts.sum(dim=1, keepdim=True)
# Average negative log-probability of the correct index; the row index is sized from Y
# instead of a hard-coded 32 so the cell keeps working when the dataset size changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.5610)

Now introducing cross entropy as a way to make this process faster, and to save all of this code! As long as it is evaluated on the same parameters, it gives the same answer as the manual calculation above.

In [56]:
# Built-in replacement for the hand-written softmax + negative log-likelihood.
F.cross_entropy(input=logits, target=Y)

tensor(15.3348)

Using cross_entropy is great because (1) the "educational way" of calculating the loss is inefficient: many intermediate tensors need to be created along the way, whereas cross_entropy has dedicated kernels which can evaluate these tensor operations very efficiently without needing the intermediate tensors, and (2) it relies on this incredibly clever idea:

In [63]:
# suppose we have the logits
logits = torch.tensor([-10, -5, 4, 8])
counts = logits.exp()
prob = counts / counts.sum()
prob

tensor([1.4956e-08, 2.2197e-06, 1.7986e-02, 9.8201e-01])

This is all well and good, and everything is well behaved when we take the exponential of the logits because large negative numbers go to zero

In [70]:
# suppose we have the logits
logits = torch.tensor([-100, -100, 4, 8])
counts = logits.exp()
prob = counts / counts.sum()
prob

tensor([0.0000, 0.0000, 0.0180, 0.9820])

But we will get very bad answers if our logits are large positive numbers:

In [71]:
# suppose we have the logits
logits = torch.tensor([-100, -100, 4, 100])
counts = logits.exp()
prob = counts / counts.sum()
prob

tensor([0., 0., 0., nan])

and this is because:

In [73]:
counts # the last element is inf: exp(100) is about 2.7e43, which is beyond the largest float32 value (about 3.4e38), so it overflows

tensor([3.7835e-44, 3.7835e-44, 5.4598e+01,        inf])

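So what PyTorch does under the hood is find the maximum of the passed tensor and subtract it from every element. The probabilities stay exactly the same, because adding (or subtracting) any constant $c$ to all of the logits cancels out in the normalisation: $\frac{e^{z_i - c}}{\sum_j e^{z_j - c}} = \frac{e^{z_i}}{\sum_j e^{z_j}}$. After the shift the largest logit is 0, so the exponential can no longer overflow.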

In [75]:
# suppose we have the logits
logits = torch.tensor([-100, -100, 4, 100]) - 100
counts = logits.exp()
prob = counts / counts.sum()
prob # bam, well behaved!

tensor([0.0000e+00, 0.0000e+00, 2.0305e-42, 1.0000e+00])

### So with that said let's redefine how we can calculate our loss

In [6]:
# Re-create the parameters from the same seed so training starts from a known state.
g = torch.Generator().manual_seed(2147483647)
# A torch.Generator only has an effect when it is passed explicitly to each sampling call.
C = torch.randn((27, 2), generator=g)     # embedding table: a 2-dimensional row per index
W1 = torch.randn((6, 100), generator=g)   # hidden layer: 6 inputs = 3 context positions x 2 embedding dims
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)  # output layer: 100 hidden units -> 27 logits
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True  # have autograd track operations on this tensor
    p.grad = None           # start with no accumulated gradient

In [11]:
# Forward pass: embedding lookup -> tanh hidden layer -> logits -> cross-entropy loss.
emb = C[X]
flat_width = emb.shape[1] * emb.shape[2]   # per-example size once the embeddings are flattened
hidden_in = emb.view(-1, flat_width)
h = torch.tanh(hidden_in @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(11.3392, grad_fn=<NllLossBackward0>)

### Always remember to zero your gradients! Always remember to set requires_grad!

In [12]:
# Clear gradients left over from any earlier pass: backward() adds into .grad
# rather than overwriting it, so stale values would corrupt this step.
for p in parameters:
    p.grad = None
loss.backward()
# update
with torch.no_grad():  # the parameter update itself must not be recorded by autograd
    for p in parameters:
        p -= 0.1 * p.grad
loss  # still the value computed before the update; rerun the forward pass to see the new loss

tensor(11.3392, grad_fn=<NllLossBackward0>)