In [1]:
# jupyter notebook server
# http://dev-machinelearning:8888/?token=49f59c67c533c9bf3d23e6c0243fcb3dc79f6480ff106609

import os
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn.functional as F

In [2]:
print("current working directory", os.getcwd())
os.chdir("makemore")
print("current working directory", os.getcwd())
!ls -lh

current working directory /model-store
current working directory /model-store/makemore
total 264K
-rw-r--r-- 1 root root 1.1K Jun  6 12:52 LICENSE
-rw-r--r-- 1 root root 3.0K Jun  6 12:52 README.md
-rw-r--r-- 1 root root  29K Jun  6 12:52 makemore.py
-rw-r--r-- 1 root root 223K Jun  6 12:52 names.txt


In [3]:
words = open("names.txt").read().splitlines()
len(words), min(len(w) for w in words), max(len(w) for w in words)

(32033, 2, 15)

In [4]:
chars = sorted(list(set("".join(words))))
stoi = {c: i+1 for i, c in enumerate(chars)}
stoi["."] = 0
itos = {i:c for c, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(f"{vocab_size=}")
block_size = 3 # context length: how many characters do we take to predict the next one?
print(f"{block_size=}")

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
vocab_size=27
block_size=3


In [5]:
# traning split, dev/validation split, test split
# 80%, 10%, 10%

def build_dataset(words):
  X, Y = [], []
  for w in words:
    context = [0] * block_size
    for c in w+".":
      ix = stoi[c]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(f"{X.shape=}", f"{Y.shape=}")
  return X, Y

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
print(len(words), n1, n2)

import random
random.seed(42)
random.shuffle(words)
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte =build_dataset(words[n2:])

32033 25626 28829
X.shape=torch.Size([182625, 3]) Y.shape=torch.Size([182625])
X.shape=torch.Size([22655, 3]) Y.shape=torch.Size([22655])
X.shape=torch.Size([22866, 3]) Y.shape=torch.Size([22866])


In [40]:
n_emb = 10
n_hidden = 200

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.rand((vocab_size, n_emb), generator=g)
W1 = torch.rand((block_size * n_emb, n_hidden), generator=g) * ((5/3) / (block_size * n_emb)**0.5) #* 0.2
b1 = torch.rand(n_hidden, generator=g) * 0.01
W2 = torch.rand((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.rand(vocab_size, generator=g) * 0

bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))

# unit Gaussian distribution
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print("number of paramenters", sum([p.nelement() for p in parameters]))

for p in parameters:
  p.requires_grad = True

number of paramenters 12297


In [41]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
  # construct minibatch
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix]
  # forward pass
  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vector
  # Linear Layer
  # b1 is useless here since b1.grad is 0 or very close to 0
  hpreact = embcat @ W1 + b1
  bnmeani = hpreact.mean(dim=0, keepdim=True)
  bnstdi = hpreact.std(dim=0, keepdim=True)
  hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias

  # BatchNorm Layer
  with torch.no_grad():
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
    bnstd_running = 0.999 * bnstdi + 0.001 * bnstdi
  
  # Non-linearity Layer
  h = torch.tanh(hpreact) # hidden layer
  logits = h @ W2 + b2 # output layer
  loss = F.cross_entropy(logits, Yb) # loss function
  #print(loss.item())
  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()
  # update
  lr = 0.1 if i<100000 else 0.001 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad
  # trace loss
  if i%10000 == 0:
    print(f"{i:7d}/{max_steps:7d}: {loss.item():.4f}")
  lossi.append(loss.log10().item())
  #break

print(loss.log10().item())

      0/ 200000: 3.2856
  10000/ 200000: 2.1910
  20000/ 200000: 2.1619
  30000/ 200000: 2.4301
  40000/ 200000: 2.0716
  50000/ 200000: 2.0125
  60000/ 200000: 2.4279
  70000/ 200000: 2.1984
  80000/ 200000: 2.3772
  90000/ 200000: 2.1483
 100000/ 200000: 2.2567
 110000/ 200000: 2.1611
 120000/ 200000: 1.7684
 130000/ 200000: 2.3275
 140000/ 200000: 2.2118
 150000/ 200000: 2.0102
 160000/ 200000: 2.2136
 170000/ 200000: 2.1935
 180000/ 200000: 1.8776
 190000/ 200000: 2.0842
0.3272089660167694


In [None]:
plt.hist(h.view(-1).tolist(), 50);

In [None]:
plt.hist(hpreact.view(-1).tolist(), 50);

In [None]:
plt.clf()
plt.figure(figsize=(20,10))
plt.imshow(h.abs()>0.99, cmap="gray", interpolation="nearest")

In [None]:
plt.plot(lossi)

In [43]:
@torch.no_grad() # disable gradient tracking
def split_loss(split):
  x, y = {
    "train": (Xtr, Ytr),
    "val": (Xdev, Ydev),
    "test": (Xte, Yte),
  }[split]
  emb = C[x] # (N, block_size, n_emb)
  embcat = emb.view(emb.shape[0], -1) # (N, block_size*n_emb)
  hpreact = embcat @ W1 + b1
  #hpreact = bngain * (hpreact - hpreact.mean(dim=0, keepdim=True)) / hpreact.std(dim=0, keepdim=True) + bnbias
  # perform batch normalization, refer to *Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift*
  hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
  h = torch.tanh(hpreact) # (N, n_hidden)
  logits = h @ W2 + b2  # (N, vocab_size)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss("val")
split_loss("test")

val 2.1109306812286377
test 2.107935667037964


In [39]:
# sample from the model
g = torch.Generator().manual_seed(2147483674 * 10)
for _ in range(20):
  out = []
  context = [0] * block_size # initialize context with ...
  while True:
    emb = C[torch.tensor([context])] # (1, block_size, d)
    h = torch.tanh(emb.view(1, -1) @ W1 + b1)
    logits = h @ W2 + b2
    probs = F.softmax(logits, dim=1)
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    context = context[1:] + [ix]
    out.append(ix)
    if ix == 0:
      break
  print("".join([itos[o] for o in out]))

gydandinseys.
hmb.
yndyny.
ruunaspe.
nal.
ymyytim.
navysevynlyv.
llri.
lbytusil.
rrian.
llyynn.
vdhny.
ldohn.
lalr.
hnsahnnndynm.
ruddy.
hndohrorycwrlv.
hdok.
hmdhcys.
ssoin.
