In [1]:
import torch
import torch.nn.functional as F
import importlib
import matplotlib.pyplot as plt
import random
mm = importlib.import_module("makemore-1")

In [2]:
# Display the character vocabulary exposed by the makemore-1 module.
mm.CHARS

'.abcdefghijklmnopqrstuvwxyz'

In [3]:
# Load the word corpus from names.txt via the makemore-1 helper.
# Later cells take len(), slice and iterate over `words`, so it is presumably a
# list of strings -- confirm against makemore-1.
words = mm.load_words_from_file('names.txt')

# what are we trying to do

Add more context to the training data. We don't want the input-output pairs to be just $(c_k, c_{k+1})$, but rather $((c_{k-B+1}, \ldots, c_k), c_{k+1})$ for context length $B$.

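From our set of words, build up a training set of character sequences $(c_{k-B}, \ldots, c_{k-1}) \mapsto c_k$: the $B$ characters preceding $c_k$ (padded with `.` at the start of a word) are the input, and $c_k$ is the target.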

In [4]:
# Index of the '.' token. It is used as left padding for the context window and,
# because every word gets '.' appended, as the end-of-word target.
start_idx = mm.CHAR_INDICES['.']
# Context length (block size): how many preceding character indices form one input row.
B = 3

def make_dataset_split(words):
  """Build (context, next-character) examples from a collection of words.

  Each word is terminated with '.', and a window holding the last B character
  indices (left-padded with start_idx) is slid across it. Every character of
  the terminated word yields one example: the window as it stood *before* the
  character is the input row, and the character's index is the target.

  Args:
    words: iterable of strings whose characters are keys of mm.CHAR_INDICES.

  Returns:
    (X, Y) where X is an integer tensor of shape (num_examples, B) and Y is an
    integer tensor of shape (num_examples,). The two shapes are also printed.
  """
  xs = []
  ys = []

  for word in words:
    # Every word starts from an all-padding context, e.g. [0, 0, 0] for B = 3.
    context = [start_idx] * B

    for ch in f'{word}.':
      ch_idx = mm.CHAR_INDICES[ch]
      # Emit the input and its target together so xs and ys can never drift out
      # of step, even if a word unexpectedly contains the '.' token itself.
      xs.append(context)
      ys.append(ch_idx)
      # Slide the window. This builds a new list, so the row just stored in xs
      # is not mutated by later iterations.
      context = context[1:] + [ch_idx]

  # Explicit dtype/shape keep the result well-formed when `words` is empty
  # (a bare torch.tensor([]) would be a float tensor of shape (0,)).
  X = torch.tensor(xs, dtype=torch.long).reshape(len(xs), B)
  Y = torch.tensor(ys, dtype=torch.long)
  print(X.shape, Y.shape)
  return (X, Y)

# 80/10/10 train/validation/test split over a seeded shuffle of the words.
M = len(words)

# Shuffle a copy rather than `words` itself. An in-place shuffle makes this cell
# non-idempotent: a second run would shuffle the already-shuffled list and
# silently move held-out words into the training split.
words_shuffled = list(words)
random.seed(12345)
random.shuffle(words_shuffled)

# Split boundaries, computed once so the three slices cannot disagree.
n_tr_end = int(0.8 * M)
n_val_end = int(0.9 * M)

X_tr, Y_tr = make_dataset_split(words_shuffled[:n_tr_end])
X_val, Y_val = make_dataset_split(words_shuffled[n_tr_end:n_val_end])
X_tst, Y_tst = make_dataset_split(words_shuffled[n_val_end:])
# Total number of examples across the three splits.
print(X_tr.shape[0] + X_val.shape[0] + X_tst.shape[0])

torch.Size([182512, 3]) torch.Size([182512])
torch.Size([22864, 3]) torch.Size([22864])
torch.Size([22770, 3]) torch.Size([22770])
228146


# Using an MLP w/ character embeddings

We build an n-gram probability model: a model that calculates the conditional probability of the next character $\textbf{c}_t$ given the $B$ previously seen characters $(\textbf{c}_{t-B}, \ldots, \textbf{c}_{t-1})$:

$$P(\textbf{c}_t | \textbf{c}_{t-1}, \textbf{c}_{t-2}, \ldots, \textbf{c}_{t-B})$$

using

1. a character embedding layer
2. a hidden layer with a non-linearity (tanh activation) 
3. a softmax layer

(NOTE: below, using "$e$" to be a variable, $\exp$ will be the exponential function. sorry not sorry.) the layers are defined by:

 - $c =$ the current sequence of characters
 - $e = C(c)$
 - $h = \text{tanh}((W^{(1)})^T e + b^{(1)})$
 - $l = (W^{(2)})^T h + b^{(2)}$
 - $p = \text{softmax}(l)$
 - $\text{loss} = \text{Cross-Entropy}(p, y)$, where $y$ is the index of the true next character
 
$$c \mapsto e \mapsto h \mapsto l \mapsto p$$

and we calculate the loss for each input's output probability vector $p$.
 
some notation:

 - block size $B$ ($B = 3$ here)
 - a single, shared character embedding $C: \text{Chars } \to R^d$
    - $R^d$ is the embedding space
    - here, $d = 15$
 - $K$ is the size of the hidden layer output (here $K = 200$)
 - $W^{(1)} \in \mathbb{R}^{(Bd) \times K}$, and $b^{(1)} \in \mathbb{R}^K$
 - $W^{(2)} \in \mathbb{R}^{K \times 27}$, and $b^{(2)} \in \mathbb{R}^{27}$

In [25]:
# Model hyperparameters.
d = 15   # embedding dimension per character
K = 200  # hidden layer width

# One seeded generator feeds every parameter so initialisation is reproducible.
# (27 is the number of characters in the vocabulary.)
g = torch.Generator().manual_seed(12345)
W1 = torch.randn((B*d, K), generator=g)
b1 = torch.randn((K,), generator=g)
W2 = torch.randn((K, 27), generator=g)
b2 = torch.randn((27,), generator=g)
# The embedding table must come from g too; drawing it from the unseeded global
# RNG gives a different starting point on every kernel restart. It is drawn
# last so that W1, b1, W2 and b2 receive the same values from g as they would
# if C were not drawn from g at all.
C = torch.randn((27, d), generator=g)

params = [C, W1, b1, W2, b2]

for prm in params:
  prm.requires_grad = True


print(f"number of parameters = {sum(p.nelement() for p in params)}")

number of parameters = 15032


In [26]:
num_iters = 200000
batch_size = 128


def get_learning_rate(it):
  """Step schedule: 0.1 for the first 100000 iterations, 0.01 afterwards."""
  return 0.1 if it < 100000 else 0.01


# Dedicated seeded generator so the sequence of minibatches (and therefore the
# whole training run) is reproducible.
batch_gen = torch.Generator().manual_seed(12345)

for it in range(num_iters):
  batch_idxs = torch.randint(0, X_tr.shape[0], (batch_size,), generator=batch_gen)
  X_batch = X_tr[batch_idxs]
  Y_batch = Y_tr[batch_idxs]

  # forward propagation, calculate outputs and loss
  embeddings = C[X_batch].view(-1, B*d)  # concatenate the B embeddings per example
  hiddens = torch.tanh(embeddings @ W1 + b1)
  logits = hiddens @ W2 + b2
  loss = F.cross_entropy(logits, Y_batch)

  if it % 2000 == 0:
    print(f"[{it}] loss = {loss.item()}")

  # backward propagate the error gradient
  for prm in params:
    prm.grad = None
  loss.backward()

  # SGD update. Done under no_grad instead of through `.data`, which bypasses
  # autograd's in-place checks.
  lr = get_learning_rate(it)
  with torch.no_grad():
    for prm in params:
      prm -= lr * prm.grad

# Note: this is the loss of the last minibatch, measured before the final update.
print(f"[{num_iters}] loss = {loss.item()}")

[0] loss = 26.658891677856445
[2000] loss = 3.1221680641174316
[4000] loss = 3.2022035121917725
[6000] loss = 2.4897313117980957
[8000] loss = 2.6937429904937744
[10000] loss = 2.49950909614563
[12000] loss = 2.3878071308135986
[14000] loss = 2.222501039505005
[16000] loss = 2.305237054824829
[18000] loss = 2.1167891025543213
[20000] loss = 2.2838659286499023
[22000] loss = 2.0340373516082764
[24000] loss = 2.178699016571045
[26000] loss = 2.402988910675049
[28000] loss = 1.9995990991592407
[30000] loss = 2.340970754623413
[32000] loss = 2.1475155353546143
[34000] loss = 2.2620625495910645
[36000] loss = 2.079510450363159
[38000] loss = 1.9154407978057861
[40000] loss = 2.277647018432617
[42000] loss = 2.3154256343841553
[44000] loss = 2.051833391189575
[46000] loss = 2.1735739707946777
[48000] loss = 2.0750677585601807
[50000] loss = 2.297797679901123
[52000] loss = 2.0699045658111572
[54000] loss = 2.08777117729187
[56000] loss = 2.177191972732544
[58000] loss = 2.1318113803863525
[6

In [27]:
# Loss over the full training split. This is evaluation only, so autograd is
# disabled; otherwise a graph covering every training example is recorded and
# kept alive for no purpose.
with torch.no_grad():
  embeddings = C[X_tr].view(-1, B*d)
  hiddens = torch.tanh(embeddings @ W1 + b1)
  logits = hiddens @ W2 + b2
  loss = F.cross_entropy(logits, Y_tr)
loss.item()

2.063495635986328

In [28]:
# Loss over the validation split. This is evaluation only, so autograd is
# disabled and no graph is recorded for the forward pass.
with torch.no_grad():
  embeddings = C[X_val].view(-1, B*d)
  hiddens = torch.tanh(embeddings @ W1 + b1)
  logits = hiddens @ W2 + b2
  loss = F.cross_entropy(logits, Y_val)
loss.item()

2.129589319229126

# tune learning rate

In [None]:
num_iters = 1000
# Learning rates swept exponentially from 10**-3 to 10**0.5, one per iteration.
lr_exps = torch.linspace(-3, 0.5, num_iters)
lrs = 10 ** lr_exps
losses = []
batch_size = 32

# Run the sweep on private copies of the parameters. The largest learning rates
# here are expected to make the loss diverge; updating `params` directly would
# overwrite whatever model state the notebook currently holds.
sweep_params = [prm.detach().clone().requires_grad_(True) for prm in params]
C_sw, W1_sw, b1_sw, W2_sw, b2_sw = sweep_params

# Seeded minibatch sampling so the loss-vs-learning-rate curve is reproducible.
sweep_gen = torch.Generator().manual_seed(12345)

for it in range(num_iters):
  batch_idxs = torch.randint(0, X_tr.shape[0], (batch_size,), generator=sweep_gen)
  X_batch = X_tr[batch_idxs]
  Y_batch = Y_tr[batch_idxs]

  # forward propagation, calculate outputs and loss
  embeddings = C_sw[X_batch].view(-1, B*d)
  hiddens = torch.tanh(embeddings @ W1_sw + b1_sw)
  logits = hiddens @ W2_sw + b2_sw
  loss = F.cross_entropy(logits, Y_batch)

  if it % 200 == 0:
    print(f"[{it}] loss = {loss.item()}")

  # One loss sample per learning rate, in the same order as lr_exps.
  losses.append(loss.item())

  # backward propagate the error gradient
  for prm in sweep_params:
    prm.grad = None
  loss.backward()

  # SGD step with this iteration's learning rate (no_grad instead of `.data`).
  with torch.no_grad():
    for prm in sweep_params:
      prm -= lrs[it] * prm.grad


print(f"final loss = {loss.item()}")

In [None]:
# Minibatch loss recorded at each sweep step, plotted against the base-10
# exponent of the learning rate used for that step.
fig, ax = plt.subplots()
ax.plot(lr_exps, losses)
ax.set_xlabel("log10(learning rate)")
ax.set_ylabel("minibatch loss")
ax.set_title("Learning-rate sweep")
plt.show()