In [1]:
import torch
import random

# The MLP implemented in this notebook is based on the design from
# the paper 'A Neural Probabilistic Language Model' (Bengio et al. 2003)
# https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [2]:
words = open('names.txt', 'r').read().splitlines()

chars = ['.'] + sorted(list(set(''.join(words))))
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for i, s in enumerate(chars)}
num_chars = len(chars)

In [3]:
# Hyperparameters
block_size = 3 # Context length (characters).
embedding_dims = 10 # Number of dimensions for the embedding space.
batch_size = 32 # Number of examples to process at a time in training.
hidden_layer_size = 200 # Number of neurons in the hidden layer.
init_lr = 0.1 # Initial learning rate.
final_lr = 0.01 # Final learning rate.
max_steps = 100000

def build_dataset(words: list[str]) -> (torch.Tensor, torch.Tensor):

    X, Y = [], []

    for word in words:
        context = [0] * block_size # Padding the context with initial '.' characters.
        for char in word + '.':
            idx = stoi[char]
            X.append(context)
            Y.append(idx)
            context = context[1:] + [idx] # Update context.

    return torch.tensor(X), torch.tensor(Y)

random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

x_train, y_train = build_dataset(words[:n1])
x_val, y_val = build_dataset(words[n1:n2])
x_test, y_test = build_dataset(words[n2:])

# Embed all characters into a two dimensional space.
C = torch.randn((num_chars, embedding_dims), requires_grad=True)
# Hidden layer.
W1 = torch.randn((block_size * embedding_dims, hidden_layer_size), requires_grad=True)
b1 = torch.randn(hidden_layer_size, requires_grad=True)
# Output layer.
W2 = torch.randn((hidden_layer_size, num_chars), requires_grad=True)
b2 = torch.randn(num_chars, requires_grad=True)
params = [C, W1, b1, W2, b2]

print(f'Number of parameters: {sum(param.nelement() for param in params)}')

Number of parameters: 11897


In [4]:
# Gradient descent.
for i in range(max_steps):

    # Constructing batches.
    idx = torch.randint(0, x_train.shape[0], (batch_size, ))

    # Forward pass.
    embedding = C[x_train[idx]]
    h = torch.tanh(embedding.view(embedding.size(0), block_size * embedding_dims) @ W1 + b1)
    logits = h @ W2 + b2
    # Calculate the cross entropy loss.
    loss = torch.nn.functional.cross_entropy(logits, y_train[idx])

    # Backward pass.
    for param in params:
        param.grad = None # Set the gradient to zero.
    loss.backward()

    # Stochastic gradient descent update.
    lr = init_lr if i < (max_steps / 2) else final_lr
    for param in params:
        param.data -= lr * param.grad

In [5]:
# Evaluation on the training set.
embedding = C[x_train]
h = torch.tanh(embedding.view(embedding.size(0), block_size * embedding_dims) @ W1 + b1)
logits = h @ W2 + b2
loss = torch.nn.functional.cross_entropy(logits, y_train)
print(loss)

tensor(2.1997, grad_fn=<NllLossBackward0>)


In [6]:
# Evaluation of the validation set.
embedding = C[x_val]
h = torch.tanh(embedding.view(embedding.size(0), block_size * embedding_dims) @ W1 + b1)
logits = h @ W2 + b2
loss = torch.nn.functional.cross_entropy(logits, y_val)
print(loss)

tensor(2.2411, grad_fn=<NllLossBackward0>)


In [7]:
# Sample from the model.
for _ in range(5):
    out = []
    context = [0] * block_size

    while True:
        # Forward pass.
        embedding = C[torch.tensor([context])]
        h = torch.tanh(embedding.view(embedding.size(0), block_size * embedding_dims) @ W1 + b1)
        logits = h @ W2 + b2
        probs = torch.nn.functional.softmax(logits, dim=1)

        # Sample from the distribution.
        idx = torch.multinomial(probs, num_samples=1, replacement=True).item()
        context = context[1:] + [idx]
        out.append(itos[idx])
        if idx == 0:
            break
    print(''.join(out))

grariustlyn.
kaylyn.
ilyn.
aryai.
fia.
