## Version 2, since now we're at a good point of re-factoring our code so that the BatchNorm layer is pretty clear

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import math
import random
%matplotlib inline

In [2]:
words = open('../names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)
vocab_size

27

In [3]:
block_size = 3

def build_dataset(words, block_size):
    X, Y = [], []
    
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    
    # split up dataset
    random.seed(42)
    random.shuffle(words)
    n1 = int(0.8*X.shape[0])
    n2 = int(0.9*X.shape[0])
    Xtr, Xdev, Xte = X.tensor_split((n1, n2), dim=0)
    Ytr, Ydev, Yte = Y.tensor_split((n1, n2), dim=0)
    
    return Xtr, Ytr, Xdev, Ydev, Xte, Yte 

Xtr, Ytr, Xdev, Ydev, Xte, Yte = build_dataset(words, block_size)
Xtr.shape, Ytr.shape, Xdev.shape, Ydev.shape, Xte.shape, Yte.shape

(torch.Size([182516, 3]),
 torch.Size([182516]),
 torch.Size([22815, 3]),
 torch.Size([22815]),
 torch.Size([22815, 3]),
 torch.Size([22815]))

In [4]:
n_emb = 10 
n_hidden = 200

In [5]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(vocab_size, n_emb, generator=g)
W1 = torch.randn((block_size*n_emb, n_hidden), generator=g) * (5/3) / (block_size*n_emb)**0.5
#b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0 # technically looking at this, we don't need it either

bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))

bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, W2, b2, bngain, bnbias]
print(f'total parameters: {sum(p.nelement() for p in parameters)}')
for p in parameters:
    p.requires_grad = True

total parameters: 12097


In [6]:
max_steps = 200_000
batch_size = 32
lossi = []
epsilon = 1e-5 # additional variable which allows for more well behaved std when variance is 0

for i in range(max_steps):
    
    ### mini-batch construction
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batches for X, Y
    
    ### forward pass
    emb = C[Xb] # embed all characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate all vectors
    
    # linear layer
    hpreact = embcat @ W1

    # BatchNorm layer
    # -------------------------------------------------------------------
    bnmeani = hpreact.mean(dim=0, keepdim=True)
    bnstdi = hpreact.std(dim=0, keepdim=True)    
    hpreact = bngain*(hpreact - bnmeani)/(bnstdi + epsilon) + bnbias
    with torch.no_grad():
        bnmean_running = 0.999*bnmean_running + 0.001*bnmeani
        bnstd_running = 0.999*bnstd_running + 0.001*bnstdi
    # -------------------------------------------------------------------
    # non-linearity to construct outputs
    h = torch.tanh(hpreact) # hidden layer construction
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function
        
    ### backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < int(max_steps/2) else 0.01
    for p in parameters:
        p.data += -lr * p.grad
    if i % 10_000 == 0:
        print(f'loss at step {i} of {max_steps}: {loss:.4f}')
    lossi.append(loss.log10().item())
    
print(f'-----\nfinal loss: {10**(lossi[-1])}')

loss at step 0 of 200000: 3.3127
loss at step 10000 of 200000: 2.4849
loss at step 20000 of 200000: 2.1814
loss at step 30000 of 200000: 1.9586
loss at step 40000 of 200000: 2.3361
loss at step 50000 of 200000: 1.9597
loss at step 60000 of 200000: 2.4565
loss at step 70000 of 200000: 1.8806
loss at step 80000 of 200000: 2.1031
loss at step 90000 of 200000: 2.2282
loss at step 100000 of 200000: 1.8709
loss at step 110000 of 200000: 2.0971
loss at step 120000 of 200000: 1.8410
loss at step 130000 of 200000: 2.0784
loss at step 140000 of 200000: 2.1063
loss at step 150000 of 200000: 1.8091
loss at step 160000 of 200000: 2.2664
loss at step 170000 of 200000: 1.8477
loss at step 180000 of 200000: 2.1799
loss at step 190000 of 200000: 2.1433
-----
final loss: 1.974827186526749


Number of features = number of neurons in our hidden layer

The momentum is the scalar value that we use to update the running quantities by the in-situ calculated value obtained while we're training

https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html

https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

The main point is that throughout our layer, we want Gaussian activations so that the behavior of the network is fairly stable regardless of our inputs. This will become critical when we begin to scale the number of layers in our MLP to do more complicated things

Over time, BatchNorm got phased out because of all of these correlations that are introduced when the examples are coupled to each other when we are normalizing all of the batches during training. It causes errors and bugs and Andrej wants to spare us from this pain. There is stuff like LayerNorm which, I'm guessing just normalizes over an entire layer, but we aren't covering that yet

In [22]:
# we're at an 1:20:00 roughly, and we're just starting to turn everything into modules so that it'll look just 
# like what we'd see when looking at PyTorch source code