In this section we will learn how to create the makemore module using two approaches, the counting way and the neural network way, to build a bigram model, which, to my understanding, is a model that predicts the next character based only on the previous character.

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Read the text dataset: every line of names.txt becomes one entry of `words`
# (line terminators are stripped by splitlines()).
# The encoding is pinned so the result does not depend on the platform's
# locale default.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()

In [None]:
# vocabulary: every distinct character occurring in the dataset, in sorted order
chars = sorted(set(''.join(words)))
# one slot more than the dataset characters (the encoder cell gives index 0 to '.')
num_chars = 1 + len(chars)

In [None]:
# encoder (character -> index) and decoder (index -> character)
# dataset characters get indices 1..len(chars); index 0 is assigned to '.'
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi.update({'.': 0})
itos = dict(zip(stoi.values(), stoi.keys()))

The first approach is counting.

In [None]:
# bigram count table: N[i, j] is the number of times character j follows character i
N = torch.zeros((num_chars, num_chars))

for w in words:
    # surround the word with '.' on both sides so its first and last
    # characters also take part in a bigram
    chs = ['.', *w, '.']
    for prev_ch, next_ch in zip(chs[:-1], chs[1:]):
        N[stoi[prev_ch], stoi[next_ch]] += 1

In [None]:
# Turn the counts into probabilities by dividing each row by its total:
# P[i, j] = N[i, j] / sum_j N[i, j], so every row with a non-zero total sums to 1.
#
# The division is deliberately done out of place. N is created with
# torch.zeros' default dtype (float32 unless the default was changed), in which
# case N.float() returns N itself rather than a copy, and an in-place `/=`
# would overwrite the count table with probabilities.
P = N.float()
P = P / P.sum(1, keepdim=True)

In [None]:
# loss of the counting model: average negative log likelihood of every
# bigram in the dataset under the probability table P
log_likelihood = 0
n = 0

for w in words:
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs[:-1], chs[1:]):
        # accumulate log P(ch2 | ch1) one bigram at a time
        log_likelihood += P[stoi[ch1], stoi[ch2]].log()
        n += 1

# negate the summed log likelihood, then average over the number of bigrams
nlls = -log_likelihood
loss = nlls / n

print(loss.item())

The second approach is a neural network.

In [None]:
# initialize the network: a single (num_chars x num_chars) weight matrix
# drawn from a standard normal with a fixed seed
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(num_chars, num_chars, generator=g, requires_grad=True)

In [None]:
# training set: for every bigram, xs holds the index of the first character
# and ys the index of the character that follows it
xs, ys = [], []

for w in words:
    chs = ['.', *w, '.']
    xs.extend(stoi[ch] for ch in chs[:-1])
    ys.extend(stoi[ch] for ch in chs[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
# number of training examples (bigrams)
num = xs.numel()

In [None]:
# gradient descent
max_iters = 1000
learning_rate = 50  # positive step size; the update below subtracts the gradient

# The one-hot encoded inputs never change, so build them once instead of
# re-encoding the whole training set on every iteration.
xenc = F.one_hot(xs, num_classes=num_chars).float()

for k in range(max_iters):

    # forward pass: one linear layer, then the mean negative log likelihood of
    # the target characters. F.cross_entropy applies log-softmax internally,
    # which avoids the overflow that exponentiating large logits by hand can hit.
    logits = xenc @ W
    loss = F.cross_entropy(logits, ys)

    # backward pass
    W.grad = None
    loss.backward()

    # update: step against the gradient, outside of autograd tracking
    with torch.no_grad():
        W -= learning_rate * W.grad

# loss of the last forward pass (i.e. measured before the final update)
print(loss.item())

In [None]:
# example: sample 5 names from one of the two models.
# Both branches used to run unconditionally, so the count-based distribution
# was always overwritten and never sampled from; the flag makes the choice explicit.
#   True  -> first approach  (row of the count-based table P)
#   False -> second approach (softmax over the network's logits)
use_counting_model = False

for i in range(5):
    out = []
    ix = 0  # index 0 is '.', so generation starts at the word boundary

    while True:
        if use_counting_model:
            # first approach
            p = P[ix]
        else:
            # second approach; no gradients are needed while sampling
            with torch.no_grad():
                xenc = F.one_hot(torch.tensor([ix]), num_classes=num_chars).float()
                logits = xenc @ W
                counts = logits.exp()
                p = counts / counts.sum(1, keepdim=True)

        # draw the next character index from the distribution
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])

        # '.' marks the end of a name
        if ix == 0:
            break

    print(''.join(out))