In this Jupyter notebook we solve the exercises associated with the third lesson of the course.

# E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [7]:
import torch
import torch.nn.functional as F

In [10]:
# Read the corpus, one entry per line. The `with` block guarantees the file
# handle is closed as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()


In [11]:
# Character vocabulary and the two lookup tables between characters and integer ids.
# Every distinct character found in `words` gets an id starting at 1 (in sorted order);
# id 0 is assigned to the '.' symbol.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse table (id -> character), built in the same insertion order as `stoi`.
itos = dict(zip(stoi.values(), stoi.keys()))
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [12]:
# Dataset construction.
# Context length: the number of preceding characters used to predict the next one.
block_size = 4

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is extended with a trailing '.' and scanned left to right. For every
  character, the ids of the `block_size` preceding characters (padded with 0 at
  the start of the word) form one input row and the character's own id is the
  matching target. Characters are mapped to ids through the module-level `stoi`.

  Prints the shapes of the two resulting tensors and returns them as `(X, Y)`,
  where X holds one context per row and Y holds the targets.
  """
  contexts, targets = [], []
  for word in words:
    # Every word starts from an all-zero window.
    window = [0] * block_size
    for char in word + '.':
      target = stoi[char]
      contexts.append(window)
      targets.append(target)
      # Slide the window: drop the oldest id, add the one just seen. The slice
      # creates a new list, so the row stored above is not modified.
      window = window[1:]
      window.append(target)

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle the words with a fixed seed, then cut the list at 80% and 90%.
# NOTE: random.shuffle reorders `words` in place, so re-running this cell
# shuffles an already-shuffled list and yields a different split.
random.seed(42)
random.shuffle(words)
n_words = len(words)
n1, n2 = int(0.8*n_words), int(0.9*n_words)

Xtr, Ytr = build_dataset(words[:n1])      # first 80% of the shuffled words
Xdev, Ydev = build_dataset(words[n1:n2])  # next 10%
Xte, Yte = build_dataset(words[n2:])      # final 10%


torch.Size([182625, 4]) torch.Size([182625])
torch.Size([22655, 4]) torch.Size([22655])
torch.Size([22866, 4]) torch.Size([22866])


In [13]:
# Seeded generator: the initial weights are identical on every run.
# The tensors below must be drawn in this exact order to keep their values.
g = torch.Generator().manual_seed(2147483647)

# Embedding table (27 rows, 10 columns), scaled down by 0.01.
C = 0.01 * torch.randn(27, 10, generator=g)

# Hidden layer: 40 inputs -> 200 units. Weights are scaled down, the bias is not.
W1 = 0.01 * torch.randn(40, 200, generator=g)
b1 = torch.randn(200, generator=g)

# Output layer: 200 units -> 27 logits. Weights are scaled down, the bias is not.
W2 = 0.01 * torch.randn(200, 27, generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]

In [14]:
# Enable gradient tracking on every trainable tensor.
for p in parameters:
  p.requires_grad = True

# Regularization strength (lambda)
reg_lambda = 0.0001
# Number of examples per minibatch and total number of optimisation steps.
batch_size = 2048
max_steps = 200000

# learning loop:
for i in range(max_steps):

  # minibatch construct: draw the row indices from the seeded generator `g`
  # so that the sequence of minibatches is reproducible
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C[Xtr[ix]] # (batch_size, block_size, embedding_dim)
  # flatten each example's embeddings into one row; the width is inferred rather than hard-coded
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (batch_size, hidden_size)
  logits = h @ W2 + b2 # (batch_size, num_classes)
  ce_loss = F.cross_entropy(logits, Ytr[ix])

  # L2 regularization loss: sum of squared entries over every tensor in `parameters`
  reg_loss = 0
  for p in parameters:
    reg_loss += p.pow(2).sum()

  # Total loss = cross entropy loss + regularization term
  loss = ce_loss + reg_lambda * reg_loss

  # progress report (regularized training loss on the current minibatch)
  if i % 1000 == 0:
    print(f"{i}: {loss.item()}")

  # backward pass: clear stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
  # plain SGD step; no_grad keeps the in-place update out of the autograd graph
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad



0: 3.7640669345855713
1000: 2.816574811935425
2000: 2.673694133758545
3000: 2.5149476528167725
4000: 2.4476442337036133
5000: 2.433586359024048
6000: 2.4314026832580566
7000: 2.389500141143799
8000: 2.3416943550109863
9000: 2.337069272994995
10000: 2.322997570037842
11000: 2.3018946647644043
12000: 2.305741310119629
13000: 2.3033180236816406
14000: 2.28576922416687
15000: 2.2801334857940674
16000: 2.2362303733825684
17000: 2.256568670272827
18000: 2.21258544921875
19000: 2.251546859741211
20000: 2.2736141681671143
21000: 2.2070837020874023
22000: 2.2259745597839355
23000: 2.228837490081787
24000: 2.251105308532715
25000: 2.152637004852295
26000: 2.1736741065979004
27000: 2.1438486576080322
28000: 2.168102741241455
29000: 2.201573133468628
30000: 2.151139497756958
31000: 2.130990982055664
32000: 2.208843469619751
33000: 2.169929027557373
34000: 2.2003817558288574
35000: 2.1984996795654297
36000: 2.192793130874634
37000: 2.185087203979492
38000: 2.1658644676208496
39000: 2.16941213607788

In [15]:
# Evaluate the model on the whole dev split in a single forward pass.
# Gradient tracking is disabled because nothing is being trained here.
with torch.no_grad():
    n_dev = Xdev.shape[0]
    emb = C[Xdev]               # (num_dev_examples, block_size, embedding_dim)
    flat = emb.view(n_dev, -1)  # one flattened row per dev example
    h = torch.tanh(flat @ W1 + b1)  # (num_dev_examples, hidden_size)
    logits = h @ W2 + b2            # (num_dev_examples, num_classes)

    # Cross-entropy on the dev set, without the L2 penalty
    dev_loss = F.cross_entropy(logits, Ydev)
    print(f"Dev loss: {dev_loss.item()}")

Dev loss: 2.060102701187134


The hyperparameters that we have changed are the context size, which is now 4, and the batch size, which is now 2048. We have also added L2 regularization and initialised the weight matrices with a smaller variance.