# Makemore Notebook

## 0. Initialization

In [1]:
import torch
import torch.nn.functional as F

In [2]:
# Read the training names, one per line. The context manager closes the file
# handle as soon as the read is done instead of leaving it to the garbage collector.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()

In [3]:
# Vocabulary: every distinct character occurring in the names, in sorted order
chars = sorted(set(''.join(words)))

# Lookup tables between characters and integer ids.
# The characters get ids 1..len(chars); id 0 is reserved for '.', the start/end marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

## 1. Testing and investigation

In [None]:
# Count how often each bigram (pair of consecutive symbols) occurs in the dataset
b = {}
for w in words:
  # Wrap the name in explicit start/end tokens so the boundary bigrams are counted too
  chs = ['<S>', *w, '<E>']
  for bigram in zip(chs, chs[1:]):
    b[bigram] = b.get(bigram, 0) + 1

In [None]:
# Allocate the 27x27 integer table of bigram counts, initially all zero
# (27 is presumably the 26 letters plus the '.' marker used in stoi)
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Fill the count table: N[i, j] is the number of times character j directly follows
# character i. Index 0 is '.', which marks both the start and the end of a name,
# so row 0 counts first letters and column 0 counts last letters.
for w in words:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs, chs[1:]):
    N[stoi[ch1], stoi[ch2]] += 1

In [None]:
# Plot the biagrams of the dataset
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
# Turn counts into per-row probabilities. The +1 (add-one smoothing) keeps every bigram's
# probability above zero. Summing over dim 1 with keepdim=True yields a (27, 1) column, so
# the division broadcasts across each row; without keepdim it would normalise the columns.
P = (N + 1).float()
P = P / P.sum(dim=1, keepdim=True)

In [None]:
# Generate names from the bigram table: start at '.', repeatedly draw the next character
# from the current character's row of P, and stop as soon as '.' is drawn again.
# The generator is seeded so the samples are reproducible.
g = torch.Generator().manual_seed(2147483647)
for i in range(5):  # five names
  out = []
  ix = 0  # index of '.', the start marker
  while True:
    ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:  # end marker drawn: the name is complete
      break
  print(''.join(out))

In [None]:
# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

In [None]:
# Score the bigram model on the training data.
# The likelihood of the dataset is the product of the probabilities the model assigns to
# every bigram in it; a good model makes this product large. Multiplying that many numbers
# below 1 underflows, so the log-probabilities are summed instead
# (log(a*b) = log(a) + log(b)).
# At the end we report the negative log likelihood and its per-bigram average; minimising
# that average is equivalent to maximising the likelihood.

log_likelihood = 0.0
n = 0
for w in words:  # swap in a list such as ["andrejq"] to score a single name
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs, chs[1:]):
    log_likelihood += torch.log(P[stoi[ch1], stoi[ch2]])
    n += 1
nll = -log_likelihood
print(f'{log_likelihood=}')
print(f'{nll=}')
print(f'{nll/n}')

## 2. Test NN with first word

In [None]:
# Rebuild the vocabulary so this section can run on its own:
# every distinct character in the names, sorted
chars = sorted(set(''.join(words)))

# Character <-> integer id tables; id 0 is the '.' start/end marker, the rest start at 1
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Build the training set of bigrams (x, y): x is the index of a character and y the index
# of the character that follows it. Only the first word is used, as a worked example.
xs, ys = [], []

for w in words[:1]:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1, ix2 = stoi[ch1], stoi[ch2]
    print(ch1, ch2)
    xs.append(ix1)
    ys.append(ix2)

# Convert the index lists to tensors for use with torch ops
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# One-hot encode the input indices: each row is a length-27 float vector with a single 1
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc

In [None]:
# Visualise the one-hot matrix: one row per input character, with a single
# non-zero cell per row at the column of that character's index
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(xenc)

In [None]:
# A single neuron: 27 random weights drawn from a standard normal, one per input dimension
W = torch.randn(27, 1)

# One-hot inputs times the weights: one activation per example
torch.matmul(xenc, W)

In [None]:
# Softmax by hand: exponentiate the logits, then normalise each row so it sums to 1
logits = torch.matmul(xenc, W)  # interpreted as log-counts
counts = torch.exp(logits)  # strictly positive, playing the role of the count table N
probs = counts / counts.sum(dim=1, keepdim=True)
probs

In [None]:
# 27 neurons with 27 inputs each: weights drawn from a standard normal, using a seeded
# generator so the initialisation is reproducible
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)

In [None]:
# Forward pass of the 27-neuron layer: next-character probabilities for every input
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)  # network input: one-hot rows
logits = torch.matmul(xenc, W)  # read as log-counts
# The two steps below, taken together, are a 'softmax'
counts = torch.exp(logits)  # counts, analogous to N
probs = counts / counts.sum(dim=1, keepdim=True)  # each row sums to 1

In [None]:
# Walk through every bigram of the example word and show, step by step, how the loss is
# built: the probability the net assigns to the true next character, its log, and the
# negated log. The number of examples is taken from xs rather than assumed to be 5, so
# this also works when the example word is not exactly four letters long.
nlls = torch.zeros(len(xs))
for i, (x, y) in enumerate(zip(xs.tolist(), ys.tolist())):
  # i-th bigram: x is the input character index, y the label character index
  print('--------')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y]  # probability of the correct next character
  print('probability assigned by the net to the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

In [None]:
# Same seeded 27x27 initialisation as before, but now tracked by autograd so that
# gradients of the loss with respect to W can be computed
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Forward pass
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# For each example pick the probability of its true next character, then average the
# negative logs. The row index is sized from ys (not a hard-coded 5) so the loss is
# correct for any number of examples.
loss = -probs[torch.arange(ys.nelement()), ys].log().mean()

In [None]:
# Show the current loss as a plain Python number
print(loss.item())

In [None]:
# Backward pass
# Discard any previously stored gradient first, so backward() does not accumulate
# on top of values from an earlier pass
W.grad = None
loss.backward() # computes d(loss)/dW and stores it in W.grad

In [None]:
# Update the weights: one gradient-descent step with learning rate 0.1.
# The in-place update runs under torch.no_grad() so autograd does not record it. This
# replaces writing through W.data, which bypasses autograd's change tracking.
with torch.no_grad():
  W += -0.1 * W.grad

## 3. --------- !!! OPTIMIZATION !!! yay, but this time actually --------------

In [None]:
# Build the full training set: one (input, target) index pair per bigram across all names
xs, ys = [], []
for w in words:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1, ix2 = stoi[ch1], stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()
print('number of examples: ', num)

# Initialise the 'network': a seeded 27x27 weight matrix tracked by autograd
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Gradient descent
# The one-hot inputs do not depend on W, so they are encoded once up front instead of
# being rebuilt on every iteration.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding

for k in range(1):

  # forward pass
  logits = xenc @ W # predict log-counts. With one-hot input this selects the row of W for each input character
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # Average negative log likelihood over all bigrams, plus an L2 regularisation term
  # that keeps the weights from growing too large
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # discard the previous gradient so it does not accumulate
  loss.backward()

  # update: step against the gradient with learning rate 50. Done under no_grad() so
  # autograd does not record the in-place change (replaces writing through W.data).
  with torch.no_grad():
    W += -50 * W.grad

In [None]:
# Finally, sample five names from the 'neural net' model. The procedure is the same as
# sampling from the count-based table, except that the next-character distribution now
# comes from a forward pass through W instead of a row lookup in P.
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
  out = []
  ix = 0  # begin at the start symbol
  while True:
    # Forward pass for the single current character
    xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    logits = torch.matmul(xenc, W)  # predicted log-counts
    counts = torch.exp(logits)  # counts, equivalent to N
    p = counts / counts.sum(dim=1, keepdim=True)  # next-character probabilities
    # Draw the next character from that distribution
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:  # end symbol reached
      break
  print(''.join(out))