# Create A Multi-Layer Perceptron (MLP) For Predicting The Next Character In A Sequence

- [Andrej Karpathy YouTube Tutorial](https://www.youtube.com/watch?v=TCH_1BHY58I&t=541s&ab_channel=AndrejKarpathy)
- [A Neural Probabilistic Language Model (Paper)](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console with three named styles ("info", "warning", "error") for coloured output.
custom_theme = Theme(dict(info="#76FF7B", warning="#FBDDFE", error="#FF0000"))
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
# Raise the display limits so long/wide frames and long strings are shown in full.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# NOTE(review): this suppresses every warning for the rest of the session;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [4]:
def load_data(file_path: str) -> list[str]:
    """Load a text file and return its lines.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        list[str]: The lines of the file in order, without line terminators.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Pin the encoding: without it `open` falls back to the platform's locale
    # encoding, so the same file could decode differently across machines.
    with open(file_path, "r", encoding="utf-8") as f:
        # `splitlines` strips the terminators and does not produce a trailing
        # empty entry when the file ends with a newline.
        data: list[str] = f.read().splitlines()

    return data

In [5]:
# Load Data
# The path is relative, so it resolves against the notebook's working directory.
fp: str = "../../../data/names.txt"
names: list[str] = load_data(file_path=fp)

# Preview the first five entries.
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

### Build Vocabulary Of Characters And Mappings

In [6]:
# Distinct characters that occur anywhere in the names.
set("".join(names))

{'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [7]:
# Vocabulary: the special token comes first, followed by every character that
# occurs in the names, in sorted order.
special_token: str = "."
characters: list[str] = [special_token, *sorted(set("".join(names)))]
n_chars: int = len(characters)

# Character -> index lookup.
text_to_num: dict[str, int] = dict(zip(characters, range(n_chars)))
# Index -> character lookup (the inverse of `text_to_num`).
num_to_text: dict[int, str] = {i: ch for ch, i in text_to_num.items()}


console.print(text_to_num, num_to_text)

In [8]:
def build_dataset(
    names: list[str],
    special_token: str = ".",
    block_size: int = 3,
    print_info: bool = False,
) -> tuple[Tensor, Tensor]:
    """
    Builds (context, next-character) training pairs from a list of names.

    Each name is terminated with `special_token`, and a window of `block_size`
    character indices slides over it. The window starts out filled with the
    index of `special_token`, so the first characters of a name are predicted
    from padding only. Characters are converted through the notebook-level
    `text_to_num` / `num_to_text` mappings.

    Args:
        names (list[str]): A list of names to build the dataset from.
        special_token (str, optional): Token used to pad the initial context and appended to the end of each name. Defaults to ".".
        block_size (int, optional): The size of the context window for each input sequence. Defaults to 3.
        print_info (bool, optional): Whether to print each name and its `context ---> target` pairs. The final tensor shapes are printed regardless. Defaults to False.

    Returns:
        tuple[Tensor, Tensor]: The input contexts X with shape (N, block_size) and the
        target indices Y with shape (N,), both int64, where N is the number of pairs.

    Raises:
        ValueError: If `block_size` is smaller than 1, or if `special_token` or any
            character of a name is missing from `text_to_num`.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if special_token not in text_to_num:
        raise ValueError(f"special_token {special_token!r} is not in the vocabulary")

    # Pad with the special token's own index rather than assuming it is 0.
    pad_ix: int = text_to_num[special_token]

    contexts: list[list[int]] = []
    targets: list[int] = []

    for w in names:
        if print_info:
            print(w)
        context: list[int] = [pad_ix] * block_size

        for ch in w + special_token:
            ix = text_to_num.get(ch)
            if ix is None:
                # Fail here with a clear message instead of letting a None
                # reach `torch.tensor` and raise an opaque error.
                raise ValueError(
                    f"character {ch!r} in name {w!r} is not in the vocabulary"
                )
            contexts.append(context)
            targets.append(ix)

            if print_info:
                print(
                    f"{''.join([num_to_text.get(i) for i in context])} ---> {num_to_text.get(ix)}"
                )

            # Crop and append, like a rolling window. This builds a new list,
            # so the rows already stored in `contexts` are not mutated.
            context = context[1:] + [ix]

    # An explicit dtype and shape keep the result well-formed for empty input,
    # where `torch.tensor([])` would otherwise be a float tensor of shape (0,).
    X: Tensor = torch.tensor(contexts, dtype=torch.long).reshape(
        len(contexts), block_size
    )
    Y: Tensor = torch.tensor(targets, dtype=torch.long)
    print(f"\n{X.shape=}, {Y.shape=}")
    return (X, Y)

In [9]:
# Build a small demo dataset from the first five names, printing every pair.
X, y = build_dataset(names=names[:5], block_size=3, print_info=True)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .

X.shape=torch.Size([32, 3]), Y.shape=torch.Size([32])


In [10]:
# Shapes of the input contexts and of the targets.
X.shape, y.shape

(torch.Size([32, 3]), torch.Size([32]))

<img src="../images/mlp 1.png" width="600">

In [11]:
# Step 1: Build an embedding lookup table
# C has one `emb_dim`-sized row per vocabulary entry, drawn from a standard normal.
emb_dim: int = 2  # embedding dimension
C: Tensor = torch.randn((n_chars, emb_dim))

print(f"{C.shape = }")
C

C.shape = torch.Size([27, 2])


tensor([[ 0.3686, -0.5562],
        [ 1.2637, -0.5048],
        [-1.4296, -0.3214],
        [-0.6920,  0.1593],
        [ 0.1617, -0.0502],
        [ 0.8174,  0.4805],
        [-1.3794,  0.1881],
        [-1.2635, -0.3200],
        [ 1.3065,  0.3617],
        [ 0.6837, -0.6760],
        [-2.2507, -0.0455],
        [ 1.1029, -0.2650],
        [-1.6356,  0.4328],
        [ 1.3866, -1.3432],
        [-0.1160, -1.3923],
        [ 1.7940, -1.0061],
        [-1.0558,  0.4111],
        [-0.3326, -1.3829],
        [ 1.3733, -1.3214],
        [-0.1643,  0.7629],
        [-0.2303, -0.4835],
        [-2.6855, -1.3601],
        [ 1.3371,  0.6430],
        [-0.2928,  2.2157],
        [ 1.7291, -1.1297],
        [-0.7569, -1.0828],
        [-0.7454, -1.3146]])

In [12]:
# Embed the inputs
# Method 1: index the lookup table directly to get the row for character index 5.
C[5]

tensor([0.8174, 0.4805])

In [13]:
# Method 2: multiply the one-hot vector for index 5 by C, which selects the same row.
F.one_hot(torch.tensor(5), num_classes=n_chars).float() @ C

tensor([0.8174, 0.4805])

In [29]:
# Embed the entire input
# Indexing C with the integer tensor X looks up one embedding row per entry of X,
# so the result has X's shape with an extra trailing `emb_dim` axis.
emb: Tensor = C[X]
print(f"{emb.shape=}")
emb

emb.shape=torch.Size([32, 3, 2])


tensor([[[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.3686, -0.5562]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.8174,  0.4805]],

        [[ 0.3686, -0.5562],
         [ 0.8174,  0.4805],
         [ 1.3866, -1.3432]],

        [[ 0.8174,  0.4805],
         [ 1.3866, -1.3432],
         [ 1.3866, -1.3432]],

        [[ 1.3866, -1.3432],
         [ 1.3866, -1.3432],
         [ 1.2637, -0.5048]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.3686, -0.5562]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 1.7940, -1.0061]],

        [[ 0.3686, -0.5562],
         [ 1.7940, -1.0061],
         [-1.6356,  0.4328]],

        [[ 1.7940, -1.0061],
         [-1.6356,  0.4328],
         [ 0.6837, -0.6760]],

        [[-1.6356,  0.4328],
         [ 0.6837, -0.6760],
         [ 1.3371,  0.6430]],

        [[ 0.6837, -0.6760],
         [ 1.3371,  0.6430],
         [ 0.6837, -0.6760]],

        [[ 1.3371,  0

In [30]:
# OR
# Method 2: one-hot encode every entry of X and matrix-multiply by C.
emb1: Tensor = F.one_hot(X, num_classes=n_chars).float() @ C
print(f"{emb1.shape=}")
emb1

emb1.shape=torch.Size([32, 3, 2])


tensor([[[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.3686, -0.5562]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.8174,  0.4805]],

        [[ 0.3686, -0.5562],
         [ 0.8174,  0.4805],
         [ 1.3866, -1.3432]],

        [[ 0.8174,  0.4805],
         [ 1.3866, -1.3432],
         [ 1.3866, -1.3432]],

        [[ 1.3866, -1.3432],
         [ 1.3866, -1.3432],
         [ 1.2637, -0.5048]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.3686, -0.5562]],

        [[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 1.7940, -1.0061]],

        [[ 0.3686, -0.5562],
         [ 1.7940, -1.0061],
         [-1.6356,  0.4328]],

        [[ 1.7940, -1.0061],
         [-1.6356,  0.4328],
         [ 0.6837, -0.6760]],

        [[-1.6356,  0.4328],
         [ 0.6837, -0.6760],
         [ 1.3371,  0.6430]],

        [[ 0.6837, -0.6760],
         [ 1.3371,  0.6430],
         [ 0.6837, -0.6760]],

        [[ 1.3371,  0

In [31]:
# Check that the indexing and one-hot approaches give the same tensor.
emb.equal(emb1)

True

In [32]:
# (number of examples, context length, embedding dimension)
emb.shape

torch.Size([32, 3, 2])

In [33]:
# Embeddings of all context positions of example 1, and their shape.
emb[1], emb[1].shape

(tensor([[ 0.3686, -0.5562],
         [ 0.3686, -0.5562],
         [ 0.8174,  0.4805]]),
 torch.Size([3, 2]))

In [34]:
# Embedding at context position 2 of example 13.
emb[13, 2]

tensor([ 1.2637, -0.5048])

In [38]:
# Step 2: Build The Hidden Layer
# Each example's context embeddings are flattened into one feature vector, so
# the layer's input width is (context length * embedding dimension). Deriving
# it from `emb` instead of hard-coding 6 keeps this cell valid if either changes.
W1: Tensor = torch.randn((emb.shape[1] * emb.shape[2], 100))
b1: Tensor = torch.randn(100)

# Forward pass
# Flatten to (number of examples, features) before the matrix multiply.
h: Tensor = emb.view(emb.shape[0], -1) @ W1 + b1
# Apply a non-linearity
h = torch.tanh(h)
h.shape

torch.Size([32, 100])

In [39]:
# Step 3: Output Layer
# Maps the hidden activations to one score (logit) per vocabulary entry.
# NOTE(review): only W2 and b2 are created with requires_grad=True here; C, W1
# and b1 from the cells above are not, so they would receive no gradients.
W2: Tensor = torch.randn(h.shape[-1], n_chars, requires_grad=True)  # (100, 27)
b2: Tensor = torch.zeros(n_chars, requires_grad=True)  # (27,)

logits: Tensor = torch.matmul(h, W2) + b2  # h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [45]:
# Convert logits to `fake` counts
# Subtract each row's maximum before exponentiating: softmax is unchanged by a
# per-row shift, and this keeps `exp` from overflowing to inf for large logits.
# The shift is detached because it is only a numerical constant.
counts: Tensor = (logits - logits.max(dim=-1, keepdim=True).values.detach()).exp()
# Normalize: Apply Softmax
probs: Tensor = counts / counts.sum(dim=-1, keepdim=True)

# Every row of probabilities should sum to 1.
probs[0].sum(), probs[15].sum()

(tensor(1., grad_fn=<SumBackward0>), tensor(1., grad_fn=<SumBackward0>))

In [46]:
# Negative log likelihood loss
# For each row, pick the probability assigned to its target index in `y`,
# then average the log-probabilities and flip the sign.
loss: Tensor = -1 * probs[torch.arange(logits.shape[0]), y].log().mean()
loss

tensor(14.3851, grad_fn=<MulBackward0>)

#### Combine The Previous Cells

In [73]:
# Rebuild the dataset from all names (no per-pair printing).
X, y = build_dataset(names=names, block_size=3, print_info=False)


X.shape=torch.Size([228152, 3]), Y.shape=torch.Size([228152])


In [74]:
# Draw 32 random row indices in [0, number of examples), with replacement.
torch.randint(0, X.shape[0], size=(32,))

tensor([ 63473,  63199,   2996, 138850, 144712, 196523,  68133, 123124, 140752,
         11876,  48565,  55557, 193809, 134738, 142102,  71058,  52869,  56823,
        109774,  43877, 138688, 223131, 156774, 144497, 214777, 147728,   2755,
         43798,  97593,  56453,  69142, 166838])

In [75]:
# Seeded generator so the parameter initialisation is reproducible.
g = torch.Generator().manual_seed(5)

# Parameters
emb_dim: int = 2  # embedding dimension
n_nodes: int = 100  # number of hidden nodes
learning_rate: float = 0.1  # learning rate
batch_size: int = 32  # batch size
epochs: int = 30  # number of training iterations (the loop runs one mini-batch per iteration)
C: Tensor = torch.randn((n_chars, emb_dim), generator=g)
# Hidden-layer input width = context length * embedding dimension, derived
# from the data instead of being hard-coded.
W1: Tensor = torch.randn((X.shape[1] * emb_dim, n_nodes), generator=g)
b1: Tensor = torch.randn(n_nodes, generator=g)
W2: Tensor = torch.randn(n_nodes, n_chars, generator=g)  # (n_nodes, n_chars)
b2: Tensor = torch.randn(n_chars, generator=g)  # (n_chars,)
print(f"{C.shape = }")
parameters: list[Tensor] = [C, W1, b1, W2, b2]
n_parameters: int = sum(p.nelement() for p in parameters)
print(f"{n_parameters = :,}")

C.shape = torch.Size([27, 2])
n_parameters = 3,481


In [76]:
# Enable gradient tracking on every parameter so that `loss.backward()`
# populates `p.grad` for the update step.
for p in parameters:
    p.requires_grad = True

In [78]:
# Build a grid of candidate learning rates (presumably for a learning-rate
# search; nothing is optimised in this cell).
# 1000 exponents evenly spaced between -3 and 0 ...
lr: Tensor = torch.linspace(-3, 0, 1000)
lr = 10**lr  # ... give 1000 log-spaced rates from 10^-3 up to 10^0
lr

tensor([0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012,
        0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0013, 0.0013, 0.0013,
        0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0014,
        0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014,
        0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015,
        0.0015, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016,
        0.0016, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017,
        0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0019,
        0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0020, 0.0020,
        0.0020, 0.0020, 0.0020, 0.0020, 0.0020, 0.0021, 0.0021, 0.0021, 0.0021,
        0.0021, 0.0021, 0.0021, 0.0022, 

In [77]:
# Per-iteration history: the learning rate used and the mini-batch loss.
lr_all: list[float] = []
losses_all: list[float] = []

# NOTE: each "epoch" below is a single mini-batch update, not a full pass
# over the dataset.
for epoch in range(epochs):

    # Sample a mini-batch of row indices. Drawing from the seeded generator
    # `g` keeps the whole run reproducible, not just the initialisation.
    idx: Tensor = torch.randint(0, X.shape[0], size=(batch_size,), generator=g)

    # Step 1: Embed the input
    # Indexing C gives the same result as the one-hot matmul without
    # materialising the one-hot tensor.
    emb: Tensor = C[X[idx]]

    # Forward pass
    # Step 2: Build The Hidden Layer
    # Flatten each example's context embeddings into one feature vector
    h: Tensor = emb.view(emb.shape[0], -1) @ W1 + b1
    # Apply a non-linearity
    h = torch.tanh(h)

    # Step 3: Output Layer
    logits: Tensor = torch.matmul(h, W2) + b2  # h @ W2 + b2

    # Calculate the loss: Negative log likelihood loss
    loss: Tensor = F.cross_entropy(logits, y[idx])

    # Backward pass
    # Reset gradients
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update the parameters
    for p in parameters:
        p.data -= learning_rate * p.grad

    # Record the history declared above
    lr_all.append(learning_rate)
    losses_all.append(loss.item())

    print(f"Epoch: {epoch+1}/{epochs} | Loss: {loss.item():.4f}")

Epoch: 1/30 | Loss: 16.4787
Epoch: 2/30 | Loss: 14.6044
Epoch: 3/30 | Loss: 12.8515
Epoch: 4/30 | Loss: 12.8096
Epoch: 5/30 | Loss: 12.0617
Epoch: 6/30 | Loss: 10.9320
Epoch: 7/30 | Loss: 11.5245
Epoch: 8/30 | Loss: 9.9370
Epoch: 9/30 | Loss: 9.8197
Epoch: 10/30 | Loss: 9.3507
Epoch: 11/30 | Loss: 7.6521
Epoch: 12/30 | Loss: 8.3687
Epoch: 13/30 | Loss: 9.6624
Epoch: 14/30 | Loss: 7.0304
Epoch: 15/30 | Loss: 5.6843
Epoch: 16/30 | Loss: 7.0956
Epoch: 17/30 | Loss: 7.3071
Epoch: 18/30 | Loss: 7.2287
Epoch: 19/30 | Loss: 7.8393
Epoch: 20/30 | Loss: 7.2934
Epoch: 21/30 | Loss: 7.7770
Epoch: 22/30 | Loss: 6.1947
Epoch: 23/30 | Loss: 5.7256
Epoch: 24/30 | Loss: 6.0679
Epoch: 25/30 | Loss: 6.6677
Epoch: 26/30 | Loss: 5.1784
Epoch: 27/30 | Loss: 5.1976
Epoch: 28/30 | Loss: 5.7453
Epoch: 29/30 | Loss: 5.4178
Epoch: 30/30 | Loss: 6.5563
