In this Jupyter notebook we solve the exercises associated with the third lesson of the course.

# E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [1]:
import torch
import torch.nn.functional as F

In [2]:
# Read the word list, one entry per line. The context manager guarantees the
# file handle is closed once the contents have been read.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()


In [3]:
# Vocabulary: every distinct character appearing in `words`, in sorted order.
chars = sorted(set(''.join(words)))

# character -> integer index; real characters start at 1 ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... because index 0 is reserved for the '.' marker
stoi['.'] = 0

# integer index -> character (inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [4]:
# build the dataset
block_size = 4 # number of preceding characters used as context to predict the next one

def build_dataset(words):
  """Convert a list of words into (context, next-character) index tensors.

  Each word is terminated with '.', and its context starts out as
  `block_size` zeros (0 is the index of '.'). Every character of the
  terminated word produces one example: the `block_size` indices that
  precede it, paired with its own index.

  Returns:
    (X, Y): X holds one row of `block_size` context indices per example and
    Y holds the matching target indices. Both shapes are printed as a side
    effect.
  """
  contexts, targets = [], []
  for word in words:
    # zero padding, then the word's characters, then the terminating '.'
    padded = [0] * block_size + [stoi[ch] for ch in word + '.']
    # slide a window of block_size over the padded sequence
    for start in range(len(word) + 1):
      stop = start + block_size
      contexts.append(padded[start:stop])
      targets.append(padded[stop])

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle the words in place with a fixed seed so the split below is repeatable
# when this cell is run once on a freshly loaded word list.
random.seed(42)
random.shuffle(words)

# Split boundaries: first 80% / next 10% / final 10% of the shuffled words.
n1, n2 = int(0.8*len(words)), int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])      # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # dev (validation) split
Xte, Yte = build_dataset(words[n2:])      # test split


torch.Size([182625, 4]) torch.Size([182625])
torch.Size([22655, 4]) torch.Size([22655])
torch.Size([22866, 4]) torch.Size([22866])


In [18]:
# Seeded generator so the initial parameters are identical on every run.
g = torch.Generator().manual_seed(2147483647)

# Model sizes. Naming them keeps the layer shapes consistent with each other
# and ties the hidden layer's input width to block_size instead of a literal 40.
vocab_size = 27   # 26 letters plus the '.' marker (see the printed itos mapping)
n_embd = 10       # dimensionality of each character embedding
n_hidden = 200    # number of units in the tanh hidden layer

# Weights are scaled by 0.01; the biases are left as plain randn draws.
# The draw order (C, W1, b1, W2, b2) determines the values produced by `g`.
C = torch.randn((vocab_size, n_embd), generator=g) * 0.01               # embedding table
W1 = torch.randn((block_size * n_embd, n_hidden), generator=g) * 0.01   # hidden-layer weights
b1 = torch.randn(n_hidden, generator=g)                                 # hidden-layer bias
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01            # output-layer weights
b2 = torch.randn(vocab_size, generator=g)                               # output-layer bias
parameters = [C, W1, b1, W2, b2]

In [None]:
# Enable gradient tracking on every trainable tensor.
for p in parameters:
  p.requires_grad = True

# Regularization strength (lambda)
reg_lambda = 0.0001

max_steps = 1000000  # total number of SGD steps
batch_size = 2048    # examples sampled per minibatch

# learning loop:
for i in range(max_steps):

  # minibatch construct: sample row indices from the seeded generator `g`
  # so that the sequence of minibatches is reproducible
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C[Xtr[ix]] # (batch_size, block_size, embedding_dim)
  # flatten each example's context embeddings without hard-coding the width
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (batch_size, hidden_size)
  logits = h @ W2 + b2 # (batch_size, num_classes)
  ce_loss = F.cross_entropy(logits, Ytr[ix])

  # Compute L2 regularization loss (sum of squares over all parameters,
  # biases and embedding table included)
  reg_loss = 0
  for p in parameters:
    reg_loss += p.pow(2).sum()

  # Total loss = cross entropy loss + regularization term
  loss = ce_loss + reg_lambda * reg_loss

  # the printed value is the total loss (cross entropy + regularization)
  if i % 1000 == 0:
    print(f"{i}: {loss.item()}")

  # backward pass: clear stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: step-wise decaying learning rate
  if i < 5000:
    lr = 0.1
  elif i < 10000:
    lr = 0.01
  elif i < 20000:
    lr = 0.001
  elif i < 40000:
    lr = 0.0001
  else:
    lr = 0.000001

  # plain SGD step, done in place under no_grad so autograd does not track it
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad



0: 2.383408784866333
1000: 2.390038251876831
2000: 2.346309185028076
3000: 2.361440658569336
4000: 2.3024892807006836
5000: 2.318439483642578
6000: 2.2855751514434814
7000: 2.321019411087036
8000: 2.367689371109009
9000: 2.321833372116089
10000: 2.2576465606689453
11000: 2.300424814224243
12000: 2.2740671634674072
13000: 2.2913851737976074


In [17]:
# Evaluate the current parameters on the whole dev split; no gradients needed.
with torch.no_grad():
    emb = C[Xdev]                                # (num_dev_examples, block_size, embedding_dim)
    emb_flat = emb.view(Xdev.shape[0], -1)       # one flattened context per row
    h = torch.tanh(emb_flat @ W1 + b1)           # (num_dev_examples, hidden_size)
    logits = h @ W2 + b2                         # (num_dev_examples, num_classes)

    # Plain cross entropy on the dev set (no regularization term added here)
    dev_loss = F.cross_entropy(logits, Ydev)
    print(f"Dev loss: {dev_loss.item()}")

Dev loss: 2.417600393295288
