In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
with open('names.txt', 'r', encoding='utf-8') as f:
    # Drop empty lines (e.g. the one produced by a trailing newline); an empty "name"
    # would otherwise contribute a spurious '.' -> '.' bigram to the counts and training set.
    text = [name for name in f.read().split('\n') if name]

# Unique characters in the text. '.' is reserved as the word-boundary token at index 0,
# so it is removed from the data characters to keep every vocabulary entry unique.
chars = ['.'] + sorted(set(''.join(text)) - {'.'})
vocab_size = len(chars)

# Mapping from characters to integers and vice versa
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = dict(enumerate(chars))

### Bigram Count Approach

In [None]:
# Bigram counts: B[i, j] is the number of times character j directly follows character i,
# with '.' marking both the start and the end of every word.
B = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)
for word in text:
    # Use a dedicated name here: reusing `chars` would overwrite the vocabulary list.
    word_chars = ['.'] + list(word) + ['.']
    for char1, char2 in zip(word_chars, word_chars[1:]):
        B[char_to_int[char1], char_to_int[char2]] += 1

In [None]:
# Visualise the bigram tensor as a heat map; every cell is annotated with
# the bigram itself (anchored above the cell centre) and its count (anchored below it).
plt.figure(figsize=(16, 16))
plt.imshow(B, cmap='Blues')
n_rows, n_cols = B.size(0), B.size(1)
label_style = dict(ha='center', color='gray')
for row in range(n_rows):
    first_char = int_to_char[row]
    for col in range(n_cols):
        bigram = first_char + int_to_char[col]
        count = B[row, col].item()
        plt.text(col, row, bigram, va='bottom', **label_style)
        plt.text(col, row, count, va='top', **label_style)
plt.axis('off')

In [None]:
# Calculate the probabilities of the next character in the bigram by normalising across each row.
# Add-one smoothing: 1 is added to every bigram count so that no probability is zero, which
# removes the possibility of an average negative log likelihood of inf.
# The smoothed counts are divided by their *own* row sums (not the raw sums of B),
# so every row of P is a proper distribution that sums to 1.
smoothed_counts = (B + 1).float()
P = smoothed_counts / smoothed_counts.sum(1, keepdim=True)

In [None]:
# Sample from the model
# A dedicated, seeded generator makes the sampled words reproducible on every run of this cell.
count_sample_rng = torch.Generator().manual_seed(2147483647)
for _ in range(5):
    out = []
    ix = 0  # Every word starts from the '.' boundary token
    while True:
        # Sample the next character from the distribution for the current character index
        ix = torch.multinomial(
            P[ix], num_samples=1, replacement=True, generator=count_sample_rng
        ).item()
        out.append(int_to_char[ix])
        if ix == 0:
            break # End of word
    print(''.join(out))

In [None]:
# Loss function. Average negative log likelihood of the text under the bigram probabilities P
log_likelihood = 0
n = 0  # Number of bigrams scored
for word in text:
    # Use a dedicated name here: reusing `chars` would overwrite the vocabulary list.
    word_chars = ['.'] + list(word) + ['.']
    for char1, char2 in zip(word_chars, word_chars[1:]):
        prob = P[char_to_int[char1], char_to_int[char2]]
        log_likelihood += torch.log(prob)
        n += 1

print(f'Loss: {-log_likelihood / n}')
# Goal is to minimise the loss (average negative log likelihood) w.r.t. the parameters, i.e. bigram counts B

### Gradient-Based Approach

In [None]:
# Create the training set of bigrams (x, y): x is the index of a character and
# y is the index of the character that follows it.
xs, ys = [], []

for word in text:
    # Use a dedicated name here: reusing `chars` would overwrite the vocabulary list.
    word_chars = ['.'] + list(word) + ['.']
    for char1, char2 in zip(word_chars, word_chars[1:]):
        xs.append(char_to_int[char1])
        ys.append(char_to_int[char2])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# Input to network. One-hot encoding (cast to float so it can be matrix-multiplied with the weights)
xs_encoded = F.one_hot(xs, num_classes=vocab_size).float()

# Randomly initialise neurons' weights. Network has only one linear layer.
# The generator is seeded so that the initial weights, and therefore the whole run, are reproducible.
init_rng = torch.Generator().manual_seed(2147483647)
W = torch.randn(vocab_size, vocab_size, generator=init_rng, requires_grad=True)

max_iters = 100
lr = 50 # Learning rate
reg = 1e-8 # Regularisation strength
# Report roughly ten times per run; max() keeps the interval >= 1 so that
# max_iters < 10 does not cause a modulo-by-zero.
log_every = max(1, max_iters // 10)

# Gradient descent
for i in range(max_iters):

    # Forward pass
    logits = xs_encoded @ W
    # Perform softmax
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True) # Probabilities of the next character

    # Cross entropy loss with L2 regularisation on the weights
    loss = -probs[torch.arange(len(ys)), ys].log().mean() + reg * (W**2).sum()

    # Backward pass
    W.grad = None # Set the gradient to zero
    loss.backward()

    # Update the weights. The in-place step runs under no_grad so that autograd does not
    # record it (preferred over writing through the legacy `.data` attribute).
    with torch.no_grad():
        W -= lr * W.grad

    if i % log_every == 0 or i == max_iters - 1:
        print(f'Iteration {i:2d} | Loss: {loss.item():.4f}')

In [None]:
# Sample from the model
# A dedicated, seeded generator makes the sampled words reproducible on every run of this cell.
nn_sample_rng = torch.Generator().manual_seed(2147483647)
for _ in range(5):
    out = []
    # Start every word from the '.' boundary token (index 0) explicitly, instead of
    # relying on whatever value `ix` was left with by a previously executed cell.
    ix = 0

    while True:
        # Inference only, so no autograd graph is needed
        with torch.no_grad():
            # Forward pass: row ix of W is what a one-hot encoding of ix selects from W
            logits = W[ix].view(1, -1)
            # Perform softmax
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)

        # Sample the next character from the distribution for the current character index
        ix = torch.multinomial(
            probs, num_samples=1, replacement=True, generator=nn_sample_rng
        ).item()
        out.append(int_to_char[ix])
        if ix == 0:
            break # End of word
    print(''.join(out))