# Makemore: Wavenet

- [Andrej Karpathy YouTube](https://www.youtube.com/watch?v=t3YJ5hKiMQ0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=6&ab_channel=AndrejKarpathy)

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [4]:
def load_data(file_path: str) -> list[str]:
    """Load text data from a file and return as a list of strings."""

    with open(file_path, "r") as f:
        # Read all the lines as a list
        data: list[str] = f.read().splitlines()

    return data

In [5]:
# Load Data
fp: str = "../../../data/names.txt"
names: list[str] = load_data(file_path=fp)

names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

### Build Vocabulary Of Characters And Mappings

In [6]:
special_token: str = "."
characters: list[str] = sorted(set("".join(names)))
# Add the special token to the beginning of the list.
characters.insert(0, special_token)
n_chars: int = len(characters)

# Convert text to numbers.
text_to_num: dict[str, int] = {text: idx for idx, text in enumerate(characters)}
# Convert numbers to text
num_to_text: dict[int, str] = {idx: text for text, idx in text_to_num.items()}

In [7]:
from torch.utils.data import random_split, TensorDataset, Dataset
from sklearn.model_selection import train_test_split


def build_dataset(
    names: list[str],
    special_token: str = ".",
    block_size: int = 3,
    print_info: bool = False,
) -> tuple[Tensor, Tensor]:
    """
    Builds a dataset of name sequences and their corresponding character indices.

    Args:
        names (list[str]): A list of names to build the dataset from.
        special_token (str, optional): A special token to append to the end of each name. Defaults to ".".
        block_size (int, optional): The size of the context window for each input sequence. Defaults to 3.
        print_info (bool, optional): Whether to print information about the dataset generation. Defaults to False.

    Returns:
        tuple[Tensor, Tensor]: A tuple containing the input sequences (X) and their corresponding target indices (Y).
    """
    X, Y = [], []

    for w in names:
        if print_info:
            print(w)
        context: list[str] = [0] * block_size

        for ch in w + special_token:
            ix: int = text_to_num.get(ch)
            X.append(context)
            Y.append(ix)

            if print_info:
                print(
                    f"{''.join([num_to_text.get(i) for i in context])} ---> {num_to_text.get(ix)}"
                )

            # Crop and append, like a rolling window
            context = context[1:] + [ix]

    X: Tensor = torch.tensor(X)
    Y: Tensor = torch.tensor(Y)
    print(f"\n{X.shape=}, {Y.shape=}")
    return (X, Y)


def split_data_into_train_dev_test(
    data: Tensor | Dataset, test_size: float = 0.05, dev_size: float = 0.1, seed=42
) -> tuple[Tensor, ...]:
    """
    Splits a given PyTorch tensor `data` into training, development, and test sets.

    Params:
    -------
        data (torch.Tensor): The input tensor to be split.
        test_size (float, optional): The fraction of the data to use for the test set. Defaults to 0.2.
        dev_size (float, optional): The fraction of the data to use for the development set. Defaults to 0.1.
        seed (int, optional): The random seed to use for reproducibility. Defaults to 42.

    Returns:
    --------
        tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The training, development, and test sets as PyTorch tensors.
    """
    if isinstance(data, Tensor):
        X_train, X_test = train_test_split(data, test_size=test_size, random_state=seed)
        X_train, X_dev = train_test_split(
            X_train, test_size=dev_size, random_state=seed
        )
        result: tuple[Tensor, ...] = (X_train, X_dev, X_test)
    if isinstance(data, Dataset):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data,
            data.targets,
            test_size=test_size,
            random_state=seed,
            stratify=data.targets,
        )
        X_train, X_dev, y_train, y_dev = train_test_split(
            X_train, y_train, test_size=dev_size, random_state=seed, stratify=y_train
        )
        result: tuple[Tensor, ...] = (X_train, X_dev, X_test, y_train, y_dev, y_test)

    print(f"{X_train.shape=}; {X_dev.shape=}; {X_test.shape=}")

    return result


class MyDataset(Dataset):
    def __init__(self, data: Tensor, targets: Tensor) -> None:
        self.data = data
        self.targets = targets

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(data.shape={self.data.shape}, "
            f"target.shape={self.targets.shape=})"
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.targets[idx]
        return x, y

In [17]:
from abc import ABC, abstractmethod


class CustomModule(ABC):
    @abstractmethod
    def __init__(self) -> None:
        raise NotImplementedError

    @abstractmethod
    def __call__(self, x: Tensor) -> Tensor:
        raise NotImplementedError

    @abstractmethod
    def parameters(self) -> list[Tensor]:
        raise NotImplementedError


class Linear(CustomModule):
    """
    A linear layer implementation.

    This class implements a linear layer, which performs a linear transformation on the input tensor. It takes in the number
    of input features and output features, and optionally a bias term. The weights and biases are initialized randomly.

    The `__call__` method applies the linear transformation to the input tensor and returns the output tensor.

    The `parameters` method returns a list of the learnable parameters (weights and biases) of the layer.
    """

    def __init__(
        self, in_features: int, out_features: int, bias: bool = True, seed: int = 42
    ) -> None:
        self.in_features = in_features
        self.out_features = out_features
        # Kaiming's initialization
        self.weight = (
            torch.randn(in_features, out_features, generator=torch.manual_seed(seed))
            / in_features**0.5
        )
        self.bias = (
            torch.randn(out_features, generator=torch.manual_seed(seed))
            if bias
            else None
        )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"

    def __call__(self, x: Tensor) -> Tensor:
        self.output: Tensor = torch.matmul(x, self.weight)
        if self.bias is not None:
            self.output += self.bias
        return self.output

    def parameters(self) -> list[Tensor]:
        return [self.weight] + ([self.bias] if self.bias is not None else [])


class BatchNorm1d(CustomModule):
    def __init__(self, dim: int, eps: float = 1e-5, momentum: float = 0.1) -> None:
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # Parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # Buffers (trained with running `momentum update`)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x: Tensor) -> Tensor:
        if self.training:
            # Calculate the batch mean and variance
            x_mean: Tensor = x.mean(dim=0, keepdim=True)
            x_var: Tensor = x.var(dim=0, keepdim=True)
        else:
            x_mean = self.running_mean
            x_var = self.running_var

        # Normalize the input
        x_hat: Tensor = (x - x_mean) / (x_var + self.eps).sqrt()
        self.output: Tensor = (self.gamma * x_hat) + self.beta

        if self.training:
            with torch.no_grad():
                # Update running mean and variance
                self.running_mean = (
                    1 - self.momentum
                ) * self.running_mean + self.momentum * x_mean
                self.running_var = (
                    1 - self.momentum
                ) * self.running_var + self.momentum * x_var

        return self.output

    def parameters(self) -> list[Tensor]:
        return [self.gamma, self.beta]


class Tanh(CustomModule):
    """A custom module that applies the hyperbolic tangent activation function to the input tensor."""

    def __init__(self) -> None:
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"

    def __call__(self, x: Tensor) -> Tensor:
        self.output = torch.tanh(x)
        return self.output

    def parameters(self) -> list[Tensor]:
        return []


class Embedding(CustomModule):
    """A custom module that creates an embedding lookup table from a given
    vocabulary size and embedding dimension."""

    def __init__(self, vocab_size: int, embedding_dim: int, seed: int = 42) -> None:
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.weight = torch.randn(
            (vocab_size, embedding_dim), generator=torch.manual_seed(seed)
        )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.vocab_size}, {self.embedding_dim})"

    def __call__(self, idx: int) -> Tensor:
        self.output = self.weight[idx]
        return self.output

    def parameters(self) -> list[Tensor]:
        """Get all the parameters."""
        return [self.weight]


class Flatten(CustomModule):

    def __init__(self) -> None:
        pass

    def __repr__(self) -> str:
        return f"{__class__.__name__}()"

    def __call__(self, x: Tensor) -> Tensor:
        self.output = x.view(x.shape[0], -1)
        return self.output

    def parameters(self):
        """Get all the parameters."""
        return []


class FlattenConsecutive(CustomModule):
    """A custom module that flattens consecutive elements in the input tensor along
    the second dimension."""

    def __init__(self, n_dim: int) -> None:
        self.n_dim = n_dim

    def __repr__(self) -> str:
        return f"{__class__.__name__}({self.n_dim})"

    def __call__(self, x: Tensor) -> Tensor:
        B, T, C = x.shape
        x = x.view(B, T // self.n_dim, C * self.n_dim)
        if x.shape[1] == 1:
            x = x.squeeze(1)

        self.output = x
        return self.output

    def parameters(self):
        """Get all the parameters."""
        return []


class Sequential(CustomModule):
    """A custom module that applies a sequence of other custom modules to the input tensor."""

    def __init__(self, layers: list[CustomModule]) -> None:
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
        """Get parameters of all layers and stretch them out into one list."""
        return [p for layer in self.layers for p in layer.parameters()]

In [9]:
X, y = build_dataset(names=names[:5], block_size=3, print_info=True)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .

X.shape=torch.Size([32, 3]), Y.shape=torch.Size([32])


In [10]:
X, y = build_dataset(names=names, block_size=3, print_info=False)
data: Dataset = MyDataset(X, y)

X_train, X_dev, X_test, y_train, y_dev, y_test = split_data_into_train_dev_test(
    data=data
)


X.shape=torch.Size([228152, 3]), Y.shape=torch.Size([228152])
X_train.shape=torch.Size([195069, 3]); X_dev.shape=torch.Size([21675, 3]); X_test.shape=torch.Size([11408, 3])


### Experiment!

- 1

In [46]:
# Parameters
g = torch.Generator().manual_seed(5)

emb_dim: int = 10  # embedding dimension
block_size: int = 3  # size of the context window for each input sequence
M: int = emb_dim * block_size  # number of inputs
n_nodes: int = 300  # number of hidden nodes
learning_rate: float = 0.1  # learning rate
batch_size: int = 32  # batch size
epochs: int = 140_000  # number of epochs

layers: list[Any] = [
    Embedding(vocab_size=n_chars, embedding_dim=emb_dim, seed=42),
    Flatten(),
    Linear(in_features=M, out_features=n_nodes, bias=False),
    BatchNorm1d(dim=n_nodes),
    Tanh(),
    Linear(in_features=n_nodes, out_features=n_chars, bias=False),
    BatchNorm1d(dim=n_chars),
]

with torch.no_grad():
    # Make last layer less confident
    # layers[-1].weight *= 0.1  # Default: w/o BatchNorm
    layers[-1].gamma *= 0.1  # with BatchNorm

    # Apply gain to the other layers
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            # Scale the weights of all other linear layers by a gain factor.
            # layer.weight *= 5 / 3 # Default: w/o BatchNorm
            layer.weight *= 1.0  # with BatchNorm: No scaling

# Parameters Collection
parameters: list[Tensor] = [p for layer in layers for p in layer.parameters()]
print(f"Total params: {sum(p.numel() for p in parameters):,}")

for p in parameters:
    p.requires_grad = True

Total params: 18,024


In [47]:
# Use The optimal learning rate to train the model
losses_all: list[float] = []

for epoch in range(epochs):

    # Add mini-batches
    idx: Tensor = torch.randint(0, X_train.shape[0], size=(batch_size,))
    # X, y batch
    Xb, yb = X_train[idx], y_train[idx]

    # Forward pass
    x: Tensor = Xb
    for layer in layers:
        x: Tensor = layer(x)  # Logits: Apply the linear layer
    loss: Tensor = F.cross_entropy(x, yb)

    # Backward pass
    for p in parameters:
        # Reset gradients
        p.grad = None
    loss.backward()

    # learning rate decay
    learning_rate: float = (
        0.1 if epoch < 70_000 else (0.01 if epoch < 85_000 else 0.001)
    )

    # Update the parameters
    for p in parameters:
        p.data -= learning_rate * p.grad

    # Record the loss
    losses_all.append(loss.item())

    if (epoch) % 10_000 == 0:
        print(f"Epoch: {epoch}/{epochs} | Loss: {loss.item():.4f}")

    if epoch > 100:
        break

Epoch: 0/140000 | Loss: 3.3159


In [48]:
layers[0](2)

tensor([-0.7592,  1.0786,  0.8008,  1.6781,  1.2758,  1.2908,  0.6107,  1.3340,
        -0.2326,  0.0402], grad_fn=<SelectBackward0>)

In [49]:
# for layer in layers:
#     layer.training = False

In [59]:
# sample from the model
g = torch.Generator().manual_seed(5)
n_names: int = 10

for _ in range(n_names):

    out: list[str] = []
    context: list[int] = [0] * block_size  # initialize with all ...
    while True:
        # forward pass the neural net
        x: Tensor = torch.tensor([context])

        for layer in layers:
            x = layer(x)
            # break

        logits: Tensor = x
        probs: Tensor = F.softmax(logits, dim=1)
        # sample from the distribution
        idx: int = torch.multinomial(probs, num_samples=1, generator=g).item()
        # Shift the context window and track the samples
        context = context[1:] + [idx]
        out.append(idx)
        # If we sample the special '.' token, break
        if idx == 0:
            break

    # Decode and print the generated word
    print("".join(num_to_text.get(i) for i in out))

kafyuw.
.
parales.
zfgfixo.
erihrbvblxf.


In [None]:
aa

### Experiment!

- 2

In [None]:
# Parameters
g = torch.Generator().manual_seed(5)

emb_dim: int = 10  # embedding dimension
block_size: int = 3  # size of the context window for each input sequence
M: int = emb_dim * block_size  # number of inputs
n_nodes: int = 300  # number of hidden nodes
learning_rate: float = 0.1  # learning rate
batch_size: int = 32  # batch size
epochs: int = 140_000  # number of epochs

# NEW!
model: Sequential = Sequential(
    layers=[
        Embedding(vocab_size=n_chars, embedding_dim=emb_dim, seed=42),
        FlattenConsecutive(n_dim=2),
        Linear(in_features=M, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_chars, bias=False),
        BatchNorm1d(dim=n_chars),
    ]
)

# with torch.no_grad():
# Make last layer less confident
# model[-1].weight *= 0.1  # Default: w/o BatchNorm

# Parameters Collection
parameters: list[Tensor] = model.parameters()  # NEW!
print(f"Total params: {sum(p.numel() for p in parameters):,}")

for p in parameters:
    p.requires_grad = True

In [None]:
# Use The optimal learning rate to train the model
losses_all: list[float] = []

for epoch in range(epochs):

    # Add mini-batches
    idx: Tensor = torch.randint(0, X_train.shape[0], size=(batch_size,))
    # X, y batch
    Xb, yb = X_train[idx], y_train[idx]

    # Forward pass
    logits: Tensor = model(Xb)  # NEW!
    loss: Tensor = F.cross_entropy(x, yb)

    # Backward pass
    for p in parameters:
        # Reset gradients
        p.grad = None
    loss.backward()

    # learning rate decay
    learning_rate: float = (
        0.1 if epoch < 70_000 else (0.01 if epoch < 85_000 else 0.001)
    )

    # Update the parameters
    for p in parameters:
        p.data -= learning_rate * p.grad

    # Record the loss
    losses_all.append(loss.item())

    if (epoch) % 10_000 == 0:
        print(f"Epoch: {epoch}/{epochs} | Loss: {loss.item():.4f}")

    if epoch > 100:
        break

In [None]:
# Parameters
g = torch.Generator().manual_seed(5)

emb_dim: int = 10  # embedding dimension
block_size: int = 3  # size of the context window for each input sequence
M: int = emb_dim * block_size  # number of inputs
n_nodes: int = 300  # number of hidden nodes
learning_rate: float = 0.1  # learning rate
batch_size: int = 32  # batch size
epochs: int = 140_000  # number of epochs
# C: Tensor = torch.randn((n_chars, emb_dim), generator=g) * 0.01  # Lookup table
W1: Tensor = torch.randn((M, n_nodes), generator=g) * 0.01
b1: Tensor = torch.randn(n_nodes, generator=g) * 0.01
# Initialize the weights and biases with very small random values.
W2: Tensor = torch.randn(n_nodes, n_chars, generator=g) * 0.01  # (100, 27)
b2: Tensor = torch.randn(n_chars, generator=g) * 0  # (27,)

model: Any = Sequential(
    layers=[
        Embedding(vocab_size=n_chars, embedding_dim=emb_dim, seed=42),
        Linear(in_features=M, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_nodes, bias=False),
        BatchNorm1d(dim=n_nodes),
        Tanh(),
        Linear(in_features=n_nodes, out_features=n_chars, bias=False),
        BatchNorm1d(dim=n_chars),
    ]
)

with torch.no_grad():
    # Make last layer less confident
    # layers[-1].weight *= 0.1  # Default: w/o BatchNorm
    layers[-1].gamma *= 0.1  # with BatchNorm

    # Apply gain to the other layers
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            # Scale the weights of all other linear layers by a gain factor.
            # layer.weight *= 5 / 3 # Default: w/o BatchNorm
            layer.weight *= 1.0  # with BatchNorm: No scaling

# Parameters Collection
parameters: list[Tensor] = [C] + [p for layer in layers for p in layer.parameters()]
print(f"Total params: {sum(p.numel() for p in parameters):,}")

for p in parameters:
    p.requires_grad = True