To set up the environment, see [setup nanoGPT env](./setup_nanoGPT_env.md)

In [None]:
import os
import math

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn.functional as F

In [None]:
# Move the kernel into the sibling "makemore" directory so the relative path
# "names.txt" opened later resolves there; print the cwd before and after.
print("current working directory", os.getcwd())
os.chdir("../makemore")
print("current working directory", os.getcwd())
# IPython shell escape: list the directory contents with human-readable sizes.
!ls -lh

In [None]:
# Load the dataset: one name per line, without the trailing newlines.
# The context manager closes the file handle as soon as the contents are read.
with open("names.txt", encoding="utf-8") as f:
  words = f.read().splitlines()

In [None]:
# (number of names, shortest name length, longest name length)
len(words), min(map(len, words)), max(map(len, words))

In [None]:
# Count how often each (previous, next) token pair occurs, with explicit
# <S>/<E> markers wrapped around every word.
b = {}
for w in words:
  chs = ["<S>", *w, "<E>"]
  for bigram in zip(chs, chs[1:]):
    b[bigram] = b.get(bigram, 0) + 1

In [None]:
# most frequent bigrams first; ties keep their insertion order (stable sort)
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Vocabulary: every distinct character in the dataset gets an index starting at 1;
# index 0 is reserved for "." which marks both the start and the end of a word.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
# inverse mapping: index -> character
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Bigram count matrix: N[i, j] is how many times character j follows character i,
# with "." (index 0) padded on both sides of every word.
N = torch.zeros((27, 27), dtype=torch.int32)
for w in words:
  padded = "." + w + "."
  for prev_ch, next_ch in zip(padded, padded[1:]):
    N[stoi[prev_ch], stoi[next_ch]] += 1

In [None]:
# Heat map of the bigram count matrix: row i is the first character, column j the
# second. Each cell shows the bigram above its anchor point and the count below it.
plt.figure(figsize=(16,16))
plt.imshow(N, cmap="Blues")
for i in range(27):
  for j in range(27):
    chstr = itos[i] + itos[j]
    plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
    # imshow draws N[i, j] at x=j, y=i, so the label at (j, i) must read N[i, j]
    plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")
plt.axis("off")

In [None]:
# Toy example: draw 3 uniform random numbers from a seeded generator and
# rescale them so they sum to 1, i.e. form a probability distribution.
g = torch.Generator()
g.manual_seed(23254312)
p = torch.rand(3, generator=g)
p = p.div(p.sum())
p

In [None]:
# draw 100 indices (with replacement) distributed according to p
torch.multinomial(p, 100, replacement=True, generator=g)

In [None]:
# Add 1 to every count (smoothing, so no bigram ends up with probability 0),
# then turn each row of counts into a probability distribution.
P = (N + 1).to(torch.float32)
# keepdim=True keeps the row sums as a (27, 1) column so the in-place division
# broadcasts across each row; make sure the broadcasting rules are understood:
# https://pytorch.org/docs/stable/notes/broadcasting.html
P.div_(P.sum(1, keepdim=True))
# verify each row is normalized
P.sum(1, keepdim=True)

In [None]:
# Sample 50 names from the count-based bigram model.
g = torch.Generator().manual_seed(2147483647)
for _ in range(50):
  sampled = []
  ix = 0 # every name starts from the "." boundary token
  while True:
    # row ix of P is the distribution over the next character
    ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
    sampled.append(itos[ix])
    if ix == 0: # drew the "." token again: the name is finished
      break
  print("".join(sampled))

In [None]:
# Score text under the bigram model: accumulate log P(next | prev) over every
# bigram, then report the average negative log-likelihood (lower is better).
n = 0
log_likelihood = 0.0
# swap in the line below to score the whole dataset instead of a single word
#for w in words:
for w in ["yongbopx"]:
  chs = [".", *w, "."]
  for c1, c2 in zip(chs, chs[1:]):
    prob = P[stoi[c1], stoi[c2]]
    logprob = prob.log()
    log_likelihood += logprob
    n += 1
    print(f"{c1}{c2}: {prob:.4f} {logprob:.4f}")

print(f"{log_likelihood=}")
nll = -log_likelihood/n
print(f"{nll=}")

In [None]:
# Build bigram training pairs (x = current character index, y = next character
# index) from the first word only, to keep the example small.
xs, ys = [], []
for w in words[:1]:
  ids = [stoi[ch] for ch in "." + w + "."]
  xs.extend(ids[:-1]) # every position except the last is an input
  ys.extend(ids[1:])  # the character that follows it is the target

# lowercase torch.tensor is used on purpose (rather than torch.Tensor); see
# https://discuss.pytorch.org/t/difference-between-torch-tensor-and-torch-tensor/30786
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# log(a*b*c) = log(a) + log(b) + log(c)

# GOAL: maximize the likelihood of training data w.r.t model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

In [None]:
# One forward pass of the single-layer "network" with randomly initialized weights.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)

# https://pytorch.org/docs/stable/generated/torch.nn.functional.one_hot.html#torch.nn.functional.one_hot
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = torch.matmul(xenc, W) # predict log-counts
counts = torch.exp(logits)
# probabilities for next character; note this rebinds the name P
P = counts / counts.sum(dim=1, keepdim=True)
# btw: exponentiating and then normalizing each row is called a "softmax"

In [None]:
# Build the full bigram training set: x = index of the current character,
# y = index of the character that follows it.
xs, ys = [], []
# use the line below instead to debug on the first word only
#for w in words[:1]:
for w in words:
  ids = [stoi[ch] for ch in "." + w + "."]
  xs.extend(ids[:-1])
  ys.extend(ids[1:])

# lowercase torch.tensor is used on purpose (rather than torch.Tensor); see
# https://discuss.pytorch.org/t/difference-between-torch-tensor-and-torch-tensor/30786
xs = torch.tensor(xs)
ys = torch.tensor(ys)

# number of training examples (bigrams)
data_num = xs.numel()
print(f"{data_num=}")

xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding


In [None]:
# initialize the network: a single 27x27 weight matrix drawn from a seeded
# standard normal, tracked by autograd so it can be trained
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# gradient descent: 100 full-batch steps on the weight matrix W
for k in range(100):
  # forward pass (softmax over each row of logits)
  logits = torch.matmul(xenc, W) # predict log-counts
  counts = torch.exp(logits)
  probs = counts / counts.sum(dim=1, keepdim=True) # probabilities for next character
  loss = (
    -probs[torch.arange(data_num), ys].log().mean() # average negative log-likelihood of the targets
    + 0.1 * (W**2).mean() # L2 regularization on the weights
  )
  print(loss.item())

  # backward pass
  W.grad = None # reset the gradient so it does not accumulate across steps
  loss.backward()

  # update pass: step against the gradient with learning rate 50
  W.data -= 50 * W.grad


In [None]:
# finally, sample from the "neural network"
# Generates 10 names: start at the "." token (index 0), repeatedly sample the next
# character from the softmax of the network output, stop when "." is drawn again.
g = torch.Generator().manual_seed(2147483647)
# sampling needs no gradients, so skip building the autograd graph
with torch.no_grad():
  for _ in range(10):
    ix = 0
    outs = []
    while True:
      # --------
      # BEFORE:
      #p = P[ix]
      # --------
      # NOW:
      # use a dedicated name for the 1x27 one-hot input so the training set's
      # `xenc` is not overwritten and the training cell stays re-runnable
      xenc_ix = F.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc_ix @ W
      counts = logits.exp()
      p = counts / counts.sum(dim=1, keepdim=True) # probabilities for next character
      # --------
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      outs.append(itos[ix])
      if ix == 0:
        break
    print("".join(outs))