In [40]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

In [2]:
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-03-11 11:50:07--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-03-11 11:50:07 (13.1 MB/s) - ‘names.txt’ saved [228145/228145]



# Load data

In [41]:
file_path = 'names.txt'
words = open(file_path, 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
char_to_idx = {char: idx+1 for idx, char in enumerate(chars)}
char_to_idx['.'] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
vocab_size = len(idx_to_char)

print(f'{idx_to_char=}\n{vocab_size=}')

idx_to_char={1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
vocab_size=27


# Create dataloader

In [42]:
def build_dataset(words, char_to_idx, context_size=3):
  X, Y = [], []

  for w in words:
    context = [0] * context_size
    for char in w + '.':
      idx = char_to_idx[char]
      X.append(context)
      Y.append(idx)
      context = context[1:] + [idx]

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(f'{X.shape=}, {Y.shape=}')
  return X, Y

In [116]:
context_size = 4
train_size = 0.85
split = int(train_size * len(words))
X_train, y_train = build_dataset(words[:split], char_to_idx, context_size=context_size)
X_valid, y_valid = build_dataset(words[split:], char_to_idx, context_size=context_size)

X.shape=torch.Size([194078, 4]), Y.shape=torch.Size([194078])
X.shape=torch.Size([34068, 4]), Y.shape=torch.Size([34068])


# Create model

In [121]:
emb_size = 10
hidden_size = 200

generator = torch.Generator().manual_seed(999)
char_emb = torch.randn((vocab_size, emb_size), generator=generator)
# Layer 1
W1 = torch.randn((emb_size * context_size, hidden_size), generator=generator) * (5/3)/((emb_size * context_size)**0.5)
b1 = torch.randn(hidden_size, generator=generator) * 0.1
# Layer 2
W2 = torch.randn((hidden_size, hidden_size), generator=generator) * 0.1
b2 = torch.randn(hidden_size, generator=generator) * 0.1
# Layer 3
W3 = torch.randn((hidden_size, vocab_size), generator=generator) * 0.1
b3 = torch.randn(vocab_size, generator=generator) * 0.1
# Batch Norm params
batch_norm_scale_1 = torch.randn((1, hidden_size)) * 0.1 + 1.0
batch_norm_shift_1 = torch.randn((1, hidden_size)) * 0.1

batch_norm_scale_2 = torch.randn((1, hidden_size)) * 0.1 + 1.0
batch_norm_shift_2 = torch.randn((1, hidden_size)) * 0.1

parameters = [char_emb, W1, b1, W2, b2, W3, b3, batch_norm_scale_1, batch_norm_shift_1, batch_norm_scale_2, batch_norm_shift_2]
print(f'# of params: {sum(param.nelement() for param in parameters):,}')

# of params: 54,897


In [None]:
for param in parameters:
    param.requires_grad = True

max_steps = 200000
batch_size = 32
n = batch_size
lossi = []

with torch.no_grad():
  for step in range(max_steps):

    # mini batch embedding
    mini_batch = torch.randint(0, X_train.shape[0], (batch_size, ), generator=generator)
    X_batch, y_batch = X_train[mini_batch], y_train[mini_batch] # (32, 3) (32)

    # forward pass
    emb = char_emb[X_batch] # emb: (32, 3, 10)  char_emb: (27, 10)
    flatten_emb = emb.view(emb.shape[0], -1) # (32, 30)
    # Linear layer 1
    pre_batch_norm_1 = flatten_emb @ W1 + b1 # (32, 200) = (32, 30) @ (30, 200) + (200)
    # Batch Norm 1
    batch_norm_mean_1 = pre_batch_norm_1.mean(0, keepdim=True) # (1, 200)
    batch_norm_var_1 = pre_batch_norm_1.var(0, keepdim=True, unbiased=True) # (1, 200)
    batch_norm_var_inv_1 = (batch_norm_var_1 + 1e-5)**-0.5 # (1, 200)
    batch_norm_raw_1 = (pre_batch_norm_1 - batch_norm_mean_1) * batch_norm_var_inv_1 # (32, 200) = ((32, 200) - (1, 200)) * (1, 200)
    Z_1 = batch_norm_scale_1 * batch_norm_raw_1 + batch_norm_shift_1 # (32, 200)
    # Activation 1
    alpha = 0.01
    A_1 = np.maximum(alpha * Z_1, Z_1)
    # Linear layer 2
    pre_batch_norm_2 = A_1 @ W2 + b2
    # Batch Norm 2
    batch_norm_mean_2 = pre_batch_norm_2.mean(0, keepdim=True)
    batch_norm_var_2 = pre_batch_norm_2.var(0, keepdim=True, unbiased=True)
    batch_norm_var_inv_2 = (batch_norm_var_2 + 1e-5)**0.5
    batch_norm_raw_2 = (pre_batch_norm_2 - batch_norm_mean_2) * batch_norm_var_inv_2
    Z_2 = batch_norm_scale_2 * batch_norm_raw_2 + batch_norm_shift_2
    # Activation 2
    A_2 = np.maximum(alpha * Z_2, Z_2)
    logits = A_2 @ W3 + b3 # (32, 27) = (32, 200) @ (200, 27) + (27)

    # Loss
    loss = F.cross_entropy(logits, y_batch)

    # backward pass
    for param in parameters:
      param.grad = None
    d_logits = F.softmax(logits, 1)
    d_logits[range(n), y_batch] -= 1
    d_logits /= n # (32, 27)
    # Layer 3
    d_A_2 = d_logits @ W3.T # (32, 200) = (32, 27) @ (27, 200)
    d_W3 = A_2.T @ d_logits # (200, 27) = (200, 32) @ (32, 27)
    d_b3 = d_logits.sum(0) # (27)
    # LeakyRelu
    d_Z_2 = torch.ones_like(Z_2) # (32, 200)
    d_Z_2[Z_2 <= 0] = alpha
    d_Z_2 *= d_A_2
    # Batch norm
    d_batch_norm_scale_2 = (batch_norm_raw_2 * d_Z_2).sum(0, keepdim=True) # (1, 200)
    d_batch_norm_shift_2 = d_Z_2.sum(0, keepdim=True) # (1, 200)
    d_pre_batch_norm_2 = batch_norm_scale_2 * batch_norm_var_inv_2/n * (n*d_Z_2 - d_Z_2.sum(0) - n/(n-1)*batch_norm_raw_2*(d_Z_2*batch_norm_raw_2).sum(0)) # (32, 200)

    # Layer 2
    d_A = d_Z_2 @ W2.T # (32, 200) = (32, 200) @ (200, 200)
    d_W2 = A_1.T @ d_Z_2 # (200, 27) = (200, 32) @ (32, 27)
    d_b2 = d_Z_2.sum(0) # (27)
    # LeakyRelu
    d_Z = torch.ones_like(Z_1) # (32, 200)
    d_Z[Z_1 <= 0] = alpha
    d_Z *= d_A
    # Batch norm
    d_batch_norm_scale = (batch_norm_raw_1 * d_Z).sum(0, keepdim=True) # (1, 200)
    d_batch_norm_shift = d_Z.sum(0, keepdim=True) # (1, 200)
    d_pre_batch_norm = batch_norm_scale_1 * batch_norm_var_inv_1/n * (n*d_Z - d_Z.sum(0) - n/(n-1)*batch_norm_raw_1*(d_Z*batch_norm_raw_1).sum(0)) # (32, 200)
    # Layer 1
    d_flatten_emb = d_pre_batch_norm @ W1.T # (32, 30) = (32, 200) @ (200, 30)
    d_W1 = flatten_emb.T @ d_pre_batch_norm # (30, 200) = (30, 32) @ (32, 200)
    d_b1 = d_pre_batch_norm.sum(0) # (200)
    # mini batch embedding
    d_mini_batch = d_flatten_emb.view(emb.shape) # (32, 3, 10)
    d_char_emb = torch.zeros_like(char_emb) # (27, 10)
    for k in range(X_batch.shape[0]):
      for j in range(X_batch.shape[1]):
        idx = X_batch[k, j]
        d_char_emb[idx] += d_mini_batch[k, j]

    grads = [d_char_emb, d_W1, d_b1, d_W2, d_b2, d_W3, d_b3, d_batch_norm_scale, d_batch_norm_shift, d_batch_norm_scale_2, d_batch_norm_shift_2]

    lr = 0.0005
    for param, grad in zip(parameters, grads):
      param.data -= lr * grad

    if step % 10000 == 0:
      print(f'({step}/{max_steps}) {loss.item():.5f}')

    lossi.append(loss.log10().item())


(0/200000) 3.69194
(10000/200000) 2.60604
(20000/200000) 2.62095


In [112]:
# calibrate the batch norm at the end of training
with torch.no_grad():
  emb = char_emb[X_train]
  flatten_emb = emb.view(emb.shape[0], -1)
  Z_1 = flatten_emb @ W1 + b1
  batch_norm_mean_1 = Z_1.mean(0, keepdim=True)
  batch_norm_var_1 = Z_1.var(0, keepdim=True, unbiased=True)
  A_1 = np.maximum(alpha * Z_1, Z_1)
  Z_2 = A_1 @ W2 + b2
  batch_norm_mean_2 = Z_2.mean(0, keepdim=True)
  batch_norm_var_2 = Z_2.var(0, keepdim=True, unbiased=True)

In [113]:
@torch.no_grad()
def split_loss(split):
  X, y = {
      'train': (X_train, y_train),
      'valid': (X_valid, y_valid)
  }[split]
  emb = char_emb[X]
  flatten_emb = emb.view(emb.shape[0], -1)
  Z_1 = flatten_emb @ W1 + b1
  Z_1 = batch_norm_scale_1 * (Z_1 - batch_norm_mean_1) * (batch_norm_var_1 + 1e-5)**-0.5 + batch_norm_shift_1
  A_1 = np.maximum(alpha*Z_1, Z_1)
  Z_2 = A_1 @ W2 + b2
  Z_2 = batch_norm_scale_2 * (Z_2 - batch_norm_mean_2) * (batch_norm_var_2 + 1e-5)**-0.5 + batch_norm_shift_2
  A_2 = np.maximum(alpha*Z_2, Z_2)
  logits = A_2 @ W3 + b3
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss('train')
split_loss('valid')

train 2.393301010131836
valid 2.5881686210632324


In [115]:
generator = torch.Generator()

with torch.no_grad():
  for _ in range(20):

    out = []
    context = [0] * context_size
    while True:
      # forward pass
      emb = char_emb[torch.tensor([context])]
      flatten_emb = emb.view(emb.shape[0], -1)
      Z_1 = flatten_emb @ W1 + b1
      Z_1 = batch_norm_scale_1 * (Z_1 - batch_norm_mean_1) * (batch_norm_var_1 + 1e-5)**-0.5 + batch_norm_shift_1
      A_1 = np.maximum(alpha*Z_1, Z_1)
      Z_2 = A_1 @ W2 + b2
      Z_2 = batch_norm_scale_2 * (Z_2 - batch_norm_mean_2) * (batch_norm_var_2 + 1e-5)**-0.5 + batch_norm_shift_2
      A_2 = np.maximum(alpha*Z_2, Z_2)
      logits = A_2 @ W3 + b3

      # sample
      probs = F.softmax(logits, dim=1)
      idx = torch.multinomial(probs, num_samples=1, generator=generator).item()
      context = context[1:] + [idx]
      out.append(idx)
      if idx == 0:
        break

    print(''.join(idx_to_char[i] for i in out))

cy.
iaueliig.
hyneyea.
yue.
aaie.
taierayie.
maiey.
lica.
ira.
eloa.
ai.
ber.
lianoterai.
zaianib.
ay.
hivianey.
evyaaiyle.
moa.
kaliis.
aaivia.
