In [1]:
import torch
import torch.nn.functional as F
import importlib
mm = importlib.import_module("makemore-1")

In [2]:
mm.CHARS

'.abcdefghijklmnopqrstuvwxyz'

In [3]:
words = mm.load_words_from_file('names.txt')

# what are we trying to do

add more context in training data. we don't want input-output pairs to just be $(c_k, c_{k+1})$, but rather $((c_{k-B+1}, \ldots, c_k), c_{k+1})$ for context length $B$.

From our set of words, build up a training set of character sequences $(c_{k-B+1}, \ldots, c_{k-1}) \mapsto c_k$.

In [5]:
start_idx = mm.CHAR_INDICES['.']
context_length = 3

xs = []
ys = []

for word in words:
  # initial context_array = [0, 0, 0]
  context_array = [start_idx] * context_length
  xs.append(context_array.copy())
  word = f'{word}.'

  # loop invariant:
  #   - xs has a sequence of inputs already processed (possibly empty),
  #     followed by the next input to be processed
  #   - xs = [x_1, ..., x_{k-1}, x_k]^T
  #   - ys = [y_1, ..., y_{k-1}]^T has a sequence of outputs, one for each
  #     of the inputs already processed.
  for ch in word:
    ch_idx = mm.CHAR_INDICES[ch]
    ys.append(ch_idx)
    # assuming training data contains no "."'s and has properly been filtered out,
    # then the only way ch_idx == 0 is if we're at the end
    if ch_idx != 0:
      context_array.pop(0)
      context_array.append(ch_idx)
      xs.append(context_array.copy())

X = torch.tensor(xs)
Y = torch.tensor(ys)
# number of samples
M = X.shape[0]

print(X.shape, X.dtype, Y.shape, Y.dtype)

torch.Size([228146, 3]) torch.int64 torch.Size([228146]) torch.int64


# Using an MLP w/ character embeddings

We build an n-gram probability model: a model that calculates the conditional probability of the next character $\textbf{c}_t$ given the previously seen characters $(\textbf{c}_{t-d+1}, \ldots, \textbf{c}_{t-1})$:

$$P(\textbf{c}_t | \textbf{c}_{t-1}, \textbf{c}_{t-2}, \ldots, \textbf{c}_{t-d+1})$$

using

1. a character embedding layer
2. a hidden layer with a non-linearity (tanh activation) 
3. a softmax layer

(NOTE: below, using "$e$" to be a variable, $\exp$ will be the exponential function. sorry not sorry.) the layers are defined by:

 - $c =$ the current sequence of characters
 - $e = C(c)$
 - $h = \text{tanh}((W^{(1)})^T e + b^{(1)})$
 - $l = (W^{(2)})^T x + b^{(1)}$
 - $p = \text{softmax}(l)$
 - $\text{loss} = \text{Cross-Entropy}(p)$
 
$$c \mapsto e \mapsto h \mapsto l \mapsto p$$

and we calculate the loss for each input's output probability vector $p$.
 
some notation:

 - block size $B$ ($B = 3$ here)
 - a single, shared character embedding $C: \text{Chars } \to R^d$
    - $R^d$ is the embedding space
    - here, $d = 2$
 - $K$ is the size of the hidden layer output (here $K = 100$)
 - $W^{(1)} \in \mathbb{R}^{(Bd) \times K}$, and $b^{(1)} \in \mathbb{R}^K$
 - $W^{(2)} \in \mathbb{R}^{K \times 27}$, and $b^{(2)} \in \mathbb{R}^{27}$

In [74]:
B = 3
d = 2
K = 100

C = torch.randn((27,d))

g = torch.Generator().manual_seed(12345)
W1 = torch.randn((B*d, K), generator=g)
b1 = torch.randn((K,), generator=g)
W2 = torch.randn((K, 27), generator=g)
b2 = torch.randn((27,), generator=g)

params = [W1, b1, W2, b2]

print(f"number of parameters = {sum([p.nelement() for p in params])}")

number of parameters = 3427


In [77]:
for prm in params:
  prm.requires_grad = True
  
num_iters = 6000
learning_rate = 0.1
batch_size = 32

for it in range(num_iters):
  batch_idxs = torch.randint(0, X.shape[0], (batch_size,))
  X_batch = X[batch_idxs]
  Y_batch = Y[batch_idxs]
  
  # forward propagation, calculate outputs and loss
  embeddings = C[X_batch].view(-1, B*d)
  hiddens = torch.tanh(embeddings @ W1 + b1)
  logits = hiddens @ W2 + b2
  loss = F.cross_entropy(logits, Y_batch)
  
  if it % 200 == 0:
    print(f"[{it}] loss = {loss.item()}")

  # backward propagate the error gradient
  for prm in params:
    prm.grad = None
  loss.backward()
  
  # update
  for prm in params:
    prm.data += -learning_rate * prm.grad
    

print(f"final loss = {loss.item()}")

[0] loss = 3.256247043609619
[200] loss = 2.4560561180114746
[400] loss = 2.759335994720459
[600] loss = 2.9781603813171387
[800] loss = 2.8154613971710205
[1000] loss = 2.553621768951416
[1200] loss = 2.6483516693115234
[1400] loss = 2.2547922134399414
[1600] loss = 2.677513599395752
[1800] loss = 2.326481819152832
[2000] loss = 2.6013107299804688
[2200] loss = 2.294363021850586
[2400] loss = 2.4159796237945557
[2600] loss = 2.6304101943969727
[2800] loss = 2.252729892730713
[3000] loss = 2.7548513412475586
[3200] loss = 2.7321689128875732
[3400] loss = 2.5576043128967285
[3600] loss = 2.6842174530029297
[3800] loss = 2.664486885070801
[4000] loss = 2.970008134841919
[4200] loss = 2.811039686203003
[4400] loss = 2.688290596008301
[4600] loss = 2.6198182106018066
[4800] loss = 2.3921103477478027
[5000] loss = 2.581737995147705
[5200] loss = 2.501214027404785
[5400] loss = 2.4199914932250977
[5600] loss = 2.3907744884490967
[5800] loss = 2.2113475799560547
final loss = 2.840384960174560

In [78]:
embeddings = C[X].view(-1, B*d)
hiddens = torch.tanh(embeddings @ W1 + b1)
logits = hiddens @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss.item()

2.4954025745391846