<a href="https://colab.research.google.com/github/cwscx/makmore/blob/main/build_makemore_yay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Load the dataset, one entry per line of names.txt. The context manager
# closes the file handle as soon as the text has been read instead of leaving
# it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [None]:
# Peek at the first ten entries to sanity-check the load.
words[:10]

In [None]:
# Number of entries (lines) read from names.txt.
len(words)

In [None]:
# Length of the shortest entry in the dataset.
min(map(len, words))

In [None]:
# Length of the longest entry in the dataset.
max(map(len, words))

In [None]:
# Count how often each pair of adjacent characters (bigram) occurs. Every
# word is wrapped in a '<S>' start marker and a '<W>' end marker so that the
# first and last characters also form bigrams.
b = {}
for w in words:
    chs = ['<S>', *w, '<W>']
    # zip of the sequence with itself shifted by one yields consecutive pairs.
    for bigram in zip(chs, chs[1:]):
        b[bigram] = b.get(bigram, 0) + 1

In [None]:
# Bigrams ordered from most to least frequent; the sort is stable, so
# bigrams with equal counts keep their insertion order.
sorted(b.items(), key=lambda kv: -kv[1])

In [None]:
import torch

In [None]:
# Lookup tables between characters and integer indices. '.' gets index 0 and
# the letters a-z get 1-26; itos is the exact inverse of stoi.
stoi = {ch: idx for idx, ch in enumerate('.abcdefghijklmnopqrstuvwxyz')}
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
# Bigram count matrix: N[i, j] is the number of times the character with
# index j directly follows the character with index i. Each word is padded
# with '.' (index 0) on both sides so word starts and ends are counted too.
N = torch.zeros((27, 27), dtype=torch.int32)
for w in words:
    chs = '.' + w + '.'
    for ch1, ch2 in zip(chs, chs[1:]):
        # Characters that are not in stoi fall back to index 0.
        N[stoi.get(ch1, 0), stoi.get(ch2, 0)] += 1


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the count matrix as a heat map and annotate every cell with its
# two-character bigram and the corresponding count.
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")
text_style = dict(ha="center", color="gray")
for i in range(27):
    for j in range(27):
        # imshow puts the column index on the x axis and the row on the y axis.
        plt.text(j, i, itos[i] + itos[j], va="bottom", **text_style)
        plt.text(j, i, N[i, j].item(), va="top", **text_style)
plt.axis('off')

In [None]:
# Row 0 of the count matrix: how often each character follows '.', i.e. how
# often each character starts a word.
N[0, :]

In [None]:
# Convert the first row of counts into a probability distribution.
# .float() produces a fresh float copy, so the in-place division below does
# not touch N.
p = N[0].float()
p /= p.sum()
p

In [None]:
# Draw a single index from the distribution p. The generator is seeded with
# a fixed value so the draw is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
sample = torch.multinomial(p, 1, replacement=True, generator=g)
index = sample.item()
print(index, itos[index], sep='\n')

In [None]:
# Toy example: three seeded uniform random numbers, shown before and after
# being normalised so that they sum to 1.
g = torch.Generator()
g.manual_seed(2147483647)
p = torch.rand(3, generator=g)
print(p)
p = torch.div(p, p.sum())
print(p)

In [None]:
# Draw 20 indices from the distribution p, with replacement, continuing from
# the current state of the generator g.
t = torch.multinomial(p,  num_samples=20, replacement=True, generator=g)

In [None]:
# Add-one smoothing: every bigram count is increased by 1 so that no
# transition ends up with zero probability (a smoother, less peaked model).
P = (N + 1).float()

# Normalise each row via broadcasting. P is [27, 27] and, because of
# keepdim=True, the row sums are [27, 1]; each row sum is stretched across
# its row and the division is applied element by element.
# sum(dim=1) adds up P[c][i] over all i for each fixed row c.
row_sums = P.sum(dim=1, keepdim=True)
P /= row_sums  # in place, so no second [27, 27] tensor is created for P


In [None]:
# Generate five samples from the bigram table P with a fixed seed.
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    index = 0  # start from the '.' boundary token
    keep_sampling = True
    while keep_sampling:
        # Distribution over the next character given the current one.
        # (Neural-net alternative: one-hot encode `index`, multiply by W,
        # exponentiate and normalise each row to obtain p.)
        p = P[index]
        drawn = torch.multinomial(p, 1, replacement=True, generator=g)
        index = drawn.item()
        out.append(itos[index])
        # Drawing index 0 ('.') ends the current sample.
        keep_sampling = index != 0

    print("".join(out))

In [None]:
# Small 2x3 float example: summing over dim 0 with keepdim=True gives the
# column totals while keeping the result two-dimensional (shape [1, 3]).
a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
a.sum(dim=0, keepdim=True)

In [None]:
# Add up the first element of every row, i.e. the total of column 0 of P.
# P was normalised along dim=1 (rows), so a column total need not equal 1.
P[:,0].sum()

In [None]:
# The likelihood of a word is the product of the probabilities of its
# bigrams, so its log likelihood is the sum of their logs. Here the
# probabilities come from the count-based table P; the plan is for a neural
# network to produce them instead.
negative_log_likelihood = 0  # 0 is the lowest possible value
n = 0
for w in ["andrejq"]:
    chs = '.' + w + '.'
    for ch1, ch2 in zip(chs, chs[1:]):
        prob = P[stoi.get(ch1, 0), stoi.get(ch2, 0)]
        logprob = prob.log()
        negative_log_likelihood -= logprob
        n += 1
        print(f'{ch1}-{ch2}: {prob:.4f} {logprob:.4f}')

# Total, then the per-bigram average.
print(f'{negative_log_likelihood=}')
print(f'{negative_log_likelihood / n}')

In [None]:
 # creating the training set of the bigrams

# Inputs (xs) are the index of the current character, labels (ys) the index
# of the character that follows it, over every bigram of every word.
xs, ys = [], []
for w in words:
    chs = '.' + w + '.'
    # All characters but the last are inputs; shifted by one they are labels.
    xs.extend(stoi.get(ch, 0) for ch in chs[:-1])
    ys.extend(stoi.get(ch, 0) for ch in chs[1:])

# Prefer torch.tensor over torch.Tensor: torch.tensor infers the dtype from
# the data automatically.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# Each example is a bare integer index, which cannot be fed into a neural
# network directly. One-hot encoding turns every index into a 27-element
# vector, converted to float so it can be multiplied with the weights.
from torch.nn import functional as F

x_encoded, y_encoded = (
    F.one_hot(indices, num_classes=27).float() for indices in (xs, ys)
)

In [None]:
# Visualise the one-hot matrix: one row per training example, 27 columns.
# NOTE(review): xs covers every bigram of every word, so this image is very
# tall; consider slicing (e.g. the first few rows) if it is hard to read.
plt.imshow(x_encoded)

In [None]:
# Check that the encoded inputs are floating point (the result of the
# explicit .float() conversion applied after one_hot).
x_encoded.dtype

In [None]:
# Weights of the single linear layer: a 27x27 matrix drawn from a standard
# normal distribution with a fixed seed. requires_grad=True makes autograd
# track operations on W so gradients can be computed for it.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# 1. Multiply the one-hot inputs by the weights to obtain the logits.
logits = torch.matmul(x_encoded, W)

# ------softmax start------
# 2. Exponentiate the logits so every entry is positive and can be read as
#    something like a count.
counts = torch.exp(logits)

# 3. Normalise each row to get probabilities. Matrix multiplication, exp and
#    normalisation are all differentiable, so the result is ready for
#    backpropagation.
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals
# ------softmax end--------

print(probs)

In [None]:
# One row of 27 probabilities for every row of x_encoded.
probs.shape

In [None]:
# Walk through the first five training examples and print, step by step, how
# the negative log likelihood of each label is derived from probs.
nlls = torch.zeros(5)
for i in range(5):
    x = int(xs[i])  # input character index
    y = int(ys[i])  # label: index of the character that follows

    print('-------------------------')
    print(f'bigram example {i + 1}: {itos[x]}{itos[y]} (indexes) {x}, {y}')
    print('input to neural net:', x)
    print('output probabilities from neural net:', probs[i])
    print('label:', y)

    # Probability the network assigned to the correct next character.
    p = probs[i, y]
    print('probabilby assigned by net to the label:', p.item())

    logp = p.log()
    print('log probablity assigned by net to the label:', logp.item())

    nll = torch.neg(logp)
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print("========")
print('average negative log likelihood', nlls.mean().item())

In [None]:
 # Probability assigned to one hand-picked class for each of the first five
 # examples. Presumably 5, 13, 13, 1, 0 are the labels ys[:5] — TODO confirm.
 probs[0, 5], probs[1,13], probs[2, 13], probs[3, 1], probs[4, 0]

In [None]:
# Gradient-descent training of the single-layer bigram network.
#
# The one-hot encoding of the inputs and the row indices used to pick out
# each example's label probability do not depend on W, so they are built once
# here instead of being rebuilt on every one of the 100 iterations.
x_encoded = F.one_hot(xs, num_classes=27).float()  # [num_bigrams, 27]
example_rows = torch.arange(len(xs))

for k in range(100):
    # Forward pass
    logits = x_encoded @ W  # [num_bigrams, 27]
    count = logits.exp()  # [num_bigrams, 27], all entries positive
    softmax_probs = count / count.sum(dim=1, keepdim=True)  # rows sum to 1
    # Average negative log likelihood of the correct next character.
    loss = -softmax_probs[example_rows, ys].log().mean()
    # Optional regularisation (smooths the model): add 0.01 * (W**2).mean()
    # to the loss above.

    print(loss.item())

    # Backward pass
    # Clear the previous gradient; backward() would otherwise accumulate
    # into it. None avoids allocating a fresh zeros tensor every step.
    W.grad = None
    loss.backward()
    # Plain gradient-descent step with learning rate 50, applied through
    # .data so the update itself is not tracked by autograd.
    W.data += -50 * W.grad