In [None]:
import os
import math

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn.functional as F


In [None]:
print("current working directory", os.getcwd())
os.chdir("../makemore")
print("current working directory", os.getcwd())
!ls -lh

In [None]:
# Load the corpus: one entry per line of names.txt.
# The context manager closes the file handle as soon as the text is read
# instead of leaving it open until garbage collection.
with open("names.txt", encoding="utf-8") as names_file:
  words = names_file.read().splitlines()

In [None]:
# (number of words, shortest word length, longest word length)
len(words), min(map(len, words)), max(map(len, words))

In [None]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set("".join(words)))
# Characters are numbered from 1 so that index 0 is free for the "." marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
# Reverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Build the dataset: every (context, next character) pair from every word.
block_size = 3  # context length: how many characters are used to predict the next one
X, Y = [], []
for word in words:  # swap in words[:5] to inspect a handful of examples
  context = [0] * block_size  # each word starts from a context of index 0, i.e. "."
  for ch in word + ".":
    ix = stoi[ch]
    # debug: print("".join(itos[i] for i in context), "-->", ch)
    X.append(context)
    Y.append(ix)
    context = [*context[1:], ix]  # slide the window: drop the oldest, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)
print(f"{X.shape=}", f"{Y.shape=}")

In [None]:
# One forward pass of the MLP over the full dataset (X, Y), written out step by step.
# Every tensor is drawn from the same seeded generator so the resulting loss is
# reproducible, and from a zero-centred normal (randn) rather than U[0, 1) so the
# tanh pre-activations are not all pushed to the positive side.
g = torch.Generator().manual_seed(2147483674)
C = torch.randn((27, 2), generator=g)  # embedding table: one 2-d vector per character
emb = C[X]  # (N, block_size, 2): look up the embedding of every context character

# Equivalent ways to flatten emb to (N, block_size * 2) for the hidden layer:
# https://pytorch.org/docs/stable/generated/torch.cat.html#torch.cat
#torch.cat((emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]), dim=1).shape
# https://pytorch.org/docs/stable/generated/torch.unbind.html#torch.unbind
#torch.cat(torch.unbind(emb, dim=1), dim=1).shape
# https://pytorch.org/docs/stable/tensor_view.html
#emb.view((emb.shape[0], -1)).shape
#emb.view((emb.shape[0], -1)) == torch.cat(torch.unbind(emb, dim=1), dim=1)

# hidden layer: 6 = block_size (3) * embedding size (2) inputs, 100 tanh units
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
#h.shape

# output layer: one logit per character in the 27-symbol vocabulary
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
logits = h @ W2 + b2

# Manual softmax + negative log-likelihood, equivalent to F.cross_entropy below:
#counts = logits.exp()
#probs = counts / counts.sum(dim=1, keepdim=True)
#loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss = F.cross_entropy(logits, Y)

In [None]:
# training split, dev/validation split, test split
# 80%, 10%, 10%

def build_dataset(words, block_size=3, char_to_index=None):
  """Turn a list of words into (context, next-character) training tensors.

  Each word is terminated with "." and scanned left to right; for every
  character, the indices of the `block_size` preceding characters form one
  row of X and the character's own index is the matching entry of Y.
  Positions before the start of a word are filled with index 0.

  Args:
    words: iterable of strings.
    block_size: context length, i.e. how many characters are used to
      predict the next one. Must be at least 1.
    char_to_index: mapping from character to integer index, including ".".
      Defaults to the notebook-level `stoi` table.

  Returns:
    (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is the
    total number of characters including one "." per word. An empty `words`
    yields shapes (0, block_size) and (0,).

  Raises:
    ValueError: if block_size is smaller than 1.
    KeyError: if a character is missing from the lookup table.
  """
  if block_size < 1:
    raise ValueError("block_size must be at least 1")
  lookup = stoi if char_to_index is None else char_to_index
  X, Y = [], []
  for w in words:
    context = [0] * block_size
    for c in w + ".":
      ix = lookup[c]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append
  # Explicit dtype and shape keep an empty split usable as an index tensor
  # (torch.tensor([]) alone would be a float tensor of shape (0,)).
  X = torch.tensor(X, dtype=torch.long).view(len(Y), block_size)
  Y = torch.tensor(Y, dtype=torch.long)
  print(f"{X.shape=}", f"{Y.shape=}")
  return X, Y

import random

# Split boundaries: first 80% -> train, next 10% -> dev, last 10% -> test.
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
print(len(words), n1, n2)

# Shuffle a copy rather than `words` itself: shuffling in place would make
# every re-run of this cell start from the previously shuffled order and
# silently produce a different train/dev/test split.
shuffled_words = list(words)
random.seed(42)
random.shuffle(shuffled_words)

Xtr, Ytr = build_dataset(shuffled_words[:n1])
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])
Xte, Yte = build_dataset(shuffled_words[n2:])

In [None]:
# Model parameters, all drawn from one seeded generator for reproducibility.
# They are sampled from a zero-centred normal (randn): with uniform [0, 1)
# samples every product in `emb @ W1` is positive, the 30-term sums land far
# out on the flat part of tanh, and the hidden layer starts out saturated.
g = torch.Generator().manual_seed(2147483674)
C = torch.randn((27, 10), generator=g)    # embedding table: 27 characters x 10 dims
W1 = torch.randn((30, 200), generator=g)  # hidden layer: 30 = 3 context chars x 10 dims
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)  # output layer: one logit per character
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]
print("number of paramenters", sum([p.nelement() for p in parameters]))

# Track gradients on every parameter so loss.backward() populates p.grad.
for p in parameters:
  p.requires_grad = True

In [None]:
# Candidate learning rates for a sweep: 1000 exponents evenly spaced over
# [-3, 0], i.e. rates from 10**-3 up to 10**0 on a logarithmic scale.
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = 10 ** lre

In [None]:
# Training traces, filled in by the optimisation loop: learning-rate exponent,
# log10 loss and step number for each iteration.
lri, lossi, stepi = [], [], []

In [None]:

# Minibatch SGD on the training split. The cell can be re-run to keep training
# from the current parameters; the traces in lossi/stepi continue where they
# left off.
for i in range(50000):
  # construct minibatch: 32 row indices sampled (with replacement) from the
  # seeded generator so the sequence of batches is reproducible
  ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
  # forward pass
  emb = C[Xtr[ix]]
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())
  # backward pass: drop stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()
  # update
  # (for a learning-rate sweep use lr = lrs[i]; lrs has 1000 entries, so the
  #  loop must then be limited to range(1000))
  #lr = lrs[i]
  lr = 0.1
  #lr = 0.01
  #lr = 0.005
  # In-place update under no_grad instead of writing through p.data, so the
  # step is kept out of the autograd graph in the supported way.
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad
  # trace loss
  #lri.append(lre[i])
  # Use the running length, not i, so the step axis keeps increasing when the
  # cell is executed again.
  stepi.append(len(stepi))
  lossi.append(loss.log10().item())

# log10 of the last minibatch loss
print(loss.log10().item())

In [None]:
# lossi stores log10 of each minibatch loss, so the y axis is on a log scale.
plt.plot(stepi, lossi)
plt.xlabel("step")
plt.ylabel("log10(minibatch loss)");

In [None]:
# Loss on the full dev split. This is evaluation only, so gradient tracking is
# disabled to avoid building an autograd graph over the whole split.
with torch.no_grad():
  emb = C[Xdev]
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ydev)
print(loss.item())

In [None]:
# Scatter columns 0 and 1 of the embedding table and label each point with
# its character.
plt.clf()
plt.scatter(C[:, 0].data, C[:,1].data, s=200)
for i in range(C.shape[0]):
  plt.text(C[i, 0].item(), C[i, 1].item(), itos[i], ha="center", va="center", color="white")
plt.xlabel("C[:, 0]")
plt.ylabel("C[:, 1]")
# The first positional argument of grid() is `visible`, not `which`; passing
# "minor" only worked because a non-empty string is truthy. Say what is meant.
plt.grid(True)

In [None]:
# Sample 20 words from the model, one character at a time, with a freshly
# seeded generator so the samples are repeatable.
g = torch.Generator().manual_seed(2147483674)
for _ in range(20):
  out = []
  context = [0] * block_size  # start from a context made of index 0 (".")
  ix = None
  while ix != 0:  # index 0 is ".", which ends the word
    emb = C[torch.tensor([context])]  # (1, block_size, d)
    h = torch.tanh(emb.view(1, -1) @ W1 + b1)
    logits = h @ W2 + b2
    probs = F.softmax(logits, dim=1)
    # draw the next character index from the predicted distribution
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    context = context[1:] + [ix]
    out.append(ix)
  print("".join(itos[o] for o in out))
