# Create A Multi-Layer Perceptron (MLP) For Predicting The Next Character In A Sequence

- [Andrej Karpathy YouTube Tutorial](https://www.youtube.com/watch?v=TCH_1BHY58I&t=541s&ab_channel=AndrejKarpathy)
- [A Neural Probabilistic Language Model (Paper)](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named styles ("info", "warning", "error") available to everything printed
# through `console`.
custom_theme = Theme(
    dict(
        info="#76FF7B",
        warning="#FBDDFE",
        error="#FF0000",
    )
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: up to 1,000 rows and columns, 600 characters per cell
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress all warnings for the rest of the session
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [4]:
def load_data(file_path: str) -> list[str]:
    """Load a text file and return its lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The lines of the file, in order, without their line terminators.
        An empty file yields an empty list.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data: list[str] = f.read().splitlines()

    return data

In [5]:
# Read the names dataset (one name per line)
fp: str = "../../../data/names.txt"
names: list[str] = load_data(fp)

# Preview the first five entries
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

### Build Vocabulary Of Characters And Mappings

In [6]:
# Every distinct character that occurs in any of the names
{ch for name in names for ch in name}

{'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [10]:
special_token: str = "."
# Vocabulary: the special token first (id 0), followed by every distinct
# character of the names in sorted order.
characters: list[str] = [special_token, *sorted(set("".join(names)))]
n_chars: int = len(characters)

# Character -> integer id
text_to_num: dict[str, int] = dict(zip(characters, range(n_chars)))
# Integer id -> character (inverse of the mapping above)
num_to_text: dict[int, str] = {num: char for char, num in text_to_num.items()}


console.print(text_to_num, num_to_text)

In [11]:
# Context length: num of chars required to predict the next char
block_size: int = 3
X, y = [], []

# Read N words at a time
for w in names[:5]:
    print(w)
    # Add special character(s) between words
    word: list[str] = [special_token] * (block_size) + list(w) + [special_token]

    for i in range(len(word) - block_size + 1):
        # Get indices of the characters in the word.
        # This will be used to create the n-gram tensor.
        idxs: list[int] = [text_to_num.get(word[j]) for j in range(i, i + block_size)]
        chars: list[str] = [word[j] for j in range(i, i + block_size)]
        print(
            f"{''.join(chars)} ---> {word[i+block_size] if i < len(word)-block_size else ''}"
        )
        X.append(idxs[0])
        y.append(idxs[1])
    print()

X: Tensor = torch.tensor(X)
y: Tensor = torch.tensor(y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
ma. ---> 

olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ia. ---> 

ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
va. ---> 

isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
la. ---> 

sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .
ia. ---> 



In [12]:
# Inspect the dimensions of the input and label tensors
X.shape, y.shape

(torch.Size([37]), torch.Size([37]))

In [13]:
# Embedding lookup table: one randomly initialised row per vocabulary entry
emb_dim: int = 2  # length of each embedding vector
C: Tensor = torch.randn(n_chars, emb_dim)

print(f"{C.shape = }")
C

C.shape = torch.Size([27, 2])


tensor([[ 1.9163,  0.3247],
        [-0.7992,  1.5627],
        [ 0.2732, -0.0048],
        [ 2.3680, -1.7333],
        [-0.0935,  0.0136],
        [-0.0941, -1.5134],
        [ 0.5186,  0.2429],
        [ 1.3886,  1.9865],
        [-0.1567,  0.3870],
        [-0.4536,  0.5081],
        [-1.1571, -0.9593],
        [ 0.6917, -0.4346],
        [ 1.0448, -2.2214],
        [ 1.6801,  0.6266],
        [-1.0635, -0.8294],
        [-0.4315, -0.8258],
        [ 1.4252,  1.0683],
        [ 0.7842, -0.0208],
        [ 0.6248, -0.1475],
        [ 1.4614,  0.3518],
        [ 0.5166, -0.3145],
        [ 1.4578,  0.6152],
        [-0.7944,  0.5030],
        [ 1.5362, -0.5275],
        [ 0.1844, -0.6385],
        [ 0.2334, -0.0110],
        [-0.3823, -0.0169]])

In [14]:
# Embed the inputs
# Method 1: index the lookup table directly; row 5 of C is the embedding
# of the character whose id is 5
C[5]

tensor([-0.0941, -1.5134])

In [16]:
# Method 2
# Multiplying a one-hot row vector by C selects the matching row of C, so
# this yields the same values as C[5] (with an extra leading dimension).
# one_hot returns integers, hence the .float() before the matmul with C.
F.one_hot(torch.tensor([5]), num_classes=n_chars).float() @ C

tensor([[-0.0941, -1.5134]])

In [17]:
# Embed the entire input: indexing C with the tensor X looks up one row of C
# for every id stored in X
C[X]

tensor([[ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.0941, -1.5134],
        [ 1.6801,  0.6266],
        [ 1.6801,  0.6266],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.4315, -0.8258],
        [ 1.0448, -2.2214],
        [-0.4536,  0.5081],
        [-0.7944,  0.5030],
        [-0.4536,  0.5081],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.7992,  1.5627],
        [-0.7944,  0.5030],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.4536,  0.5081],
        [ 1.4614,  0.3518],
        [-0.7992,  1.5627],
        [ 0.2732, -0.0048],
        [-0.0941, -1.5134],
        [ 1.0448, -2.2214],
        [ 1.0448, -2.2214],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.4614,  0.3518],
        [-0.4315, -0.8258],
        [ 1.4252,  1.0683],
        [-0.1567,  0.3870],
        [-0.4536,  0.5081]])

In [18]:
# OR
# Method 2: one-hot encode every id in X and multiply by C; each one-hot
# vector selects its row of C, so the result matches C[X]
F.one_hot(X, num_classes=n_chars).float() @ C

tensor([[ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.0941, -1.5134],
        [ 1.6801,  0.6266],
        [ 1.6801,  0.6266],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.4315, -0.8258],
        [ 1.0448, -2.2214],
        [-0.4536,  0.5081],
        [-0.7944,  0.5030],
        [-0.4536,  0.5081],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.7992,  1.5627],
        [-0.7944,  0.5030],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [-0.4536,  0.5081],
        [ 1.4614,  0.3518],
        [-0.7992,  1.5627],
        [ 0.2732, -0.0048],
        [-0.0941, -1.5134],
        [ 1.0448, -2.2214],
        [ 1.0448, -2.2214],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.9163,  0.3247],
        [ 1.4614,  0.3518],
        [-0.4315, -0.8258],
        [ 1.4252,  1.0683],
        [-0.1567,  0.3870],
        [-0.4536,  0.5081]])

In [20]:
# Embedding looked up for the entry at position 1 of X
C[X][1]

tensor([1.9163, 0.3247])

In [None]:
# Same lookup as the previous cell: the embedding for position 1 of X
C[X][1]