# Shakespeare


```{contents}
:local:
```


In [1]:
from __future__ import annotations

import os
import time
from pathlib import Path
from typing import Any, Dict, List, Literal, Tuple, Union

import numpy as np
from numpy.typing import ArrayLike, NDArray
import requests
import tiktoken
import torch
from rich.pretty import pprint
from torch import nn
from torch.utils.data import DataLoader, Dataset
import os
import random
import warnings

import numpy as np
import torch
import torch.backends.cudnn

from pydantic import BaseModel, Field, model_validator, computed_field


## Composing the Configurations

In [2]:
class Composer(BaseModel):
    """Central configuration for the notebook (data location, tokenizer, batching, device).

    Three fields are *derived* by the validators below and are overwritten on every
    construction, even if a value is passed explicitly:

    - ``train_path`` / ``valid_path`` are always ``<data_folder>/train.txt`` and
      ``<data_folder>/valid.txt``.
    - ``device`` is always ``torch.device(device_type)``.

    When ``debug`` is true, ``batch_size`` and ``block_size`` are shrunk to 2 and 8 so
    that the worked examples stay small enough to print.
    """

    # pydantic v2 configuration. This replaces the class-based `Config`, which v2 deprecates.
    # - extra="forbid": unknown keyword arguments raise instead of being silently dropped.
    # - arbitrary_types_allowed: required because `torch.device` is not a pydantic-native type.
    model_config = {"extra": "forbid", "arbitrary_types_allowed": True}

    seed: int = 2024
    debug: bool = False

    url: str = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    dataset_name: str = "tinyshakespeare"
    data_folder: str = Field(default="./data/tinyshakespeare", description="Path to the data folder")

    # Derived in `set_train_valid_paths`; `None` is only the pre-validation placeholder.
    train_path: Union[Path, None] = Field(default=None, description="Path to the train file")
    valid_path: Union[Path, None] = Field(default=None, description="Path to the valid file")

    encoding_name: Literal['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base'] = "gpt2"

    batch_size: int = Field(default=64, description="Batch size")
    # NOTE: because of the alias (and extra="forbid"), the constructor keyword for this
    # field is `context_length=...`; the attribute is still read as `composer.block_size`.
    block_size: int = Field(default=128, description="Block size, an alias for max length/context window size.", alias="context_length")
    device_type: Literal["cpu", "cuda"] = "cpu"
    # Derived in `set_device`; `None` is only the pre-validation placeholder.
    device: Union[torch.device, None] = Field(default=None, description="Device to use")

    @model_validator(mode="after")
    def set_train_valid_paths(self) -> Composer:
        """Derive the train/valid file paths from ``data_folder``."""
        self.train_path = Path(self.data_folder) / "train.txt"
        self.valid_path = Path(self.data_folder) / "valid.txt"
        return self

    @model_validator(mode="after")
    def set_device(self) -> Composer:
        """Build the ``torch.device`` object that corresponds to ``device_type``."""
        self.device = torch.device(self.device_type)
        return self

    @model_validator(mode="after")
    def set_debug_fields(self) -> Composer:
        """In debug mode, use a tiny batch size and context length for readable examples."""
        if self.debug:
            self.batch_size = 2
            self.block_size = 8
        return self

# Instantiate the configuration in debug mode: the `set_debug_fields` validator then
# shrinks `batch_size` to 2 and `block_size` to 8, which keeps the examples below small.
composer = Composer(debug=True)
pprint(composer)


## Reproducibility

In [3]:
def configure_deterministic_mode() -> None:
    """
    Switch PyTorch into deterministic mode and warn the caller about the cost.

    Enables deterministic algorithms (warning instead of raising when an op has no
    deterministic implementation), turns off cuDNN benchmarking, requests
    deterministic cuDNN kernels, disables cuDNN, and sets the
    ``CUBLAS_WORKSPACE_CONFIG`` environment variable.

    See https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
    and https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    """
    cudnn = torch.backends.cudnn

    torch.use_deterministic_algorithms(True, warn_only=True)
    cudnn.benchmark = False
    cudnn.deterministic = True
    cudnn.enabled = False

    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    # stacklevel=2 attributes the warning to the caller rather than to this helper.
    message = "Deterministic mode is activated. This will negatively impact performance and may cause increase in CUDA memory footprint."
    warnings.warn(message, category=UserWarning, stacklevel=2)


def seed_all(
    seed: int = 1992,
    seed_torch: bool = True,
    set_torch_deterministic: bool = True,
) -> int:
    """
    Seed all random number generators.

    Parameters
    ----------
    seed : int
        Seed number to be used, by default 1992.
    seed_torch : bool
        Whether to seed PyTorch or not, by default True.
    set_torch_deterministic : bool
        Only consulted when ``seed_torch`` is True. If True, additionally calls
        ``configure_deterministic_mode()``, by default True.

    Returns
    -------
    seed: int
        The seed number.
    """
    # fmt: off
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects processes
    # launched after this point (e.g. subprocesses), not the already-running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Seed NumPy's *global* RNG. `np.random.default_rng(seed)` would merely build a new
    # Generator object and throw it away, leaving `np.random.*` calls unseeded.
    np.random.seed(seed)
    random.seed(seed)                              # python's built-in pseudo-random generator

    if seed_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)           # pytorch (both CPU and CUDA)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.enabled = False

        if set_torch_deterministic:
            configure_deterministic_mode()
    # fmt: on
    return seed

In [4]:
# Seed the random number generators from the configured seed. The extra
# deterministic-algorithms switch is left off here (set_torch_deterministic=False).
seed_all(composer.seed, seed_torch=True, set_torch_deterministic=False)

2024

## Tokenization and Vocabulary

- https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
- https://github.com/openai/tiktoken
- https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens). Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable properties[^1]:

- It's reversible and lossless, so you can convert tokens back into the original text
- It works on arbitrary text, even text that is not in the tokeniser's training data
- It compresses the text: the token sequence is shorter than the bytes corresponding to the original text. On average, in practice, each token corresponds to about 4 bytes.
- It attempts to let the model see common subwords. For instance, "ing" is a common subword in English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing" (instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and again in different contexts, it helps models generalise and better understand grammar.

In [5]:
def am_i_in_jupyter() -> bool:
    """Return True when running under an IPython kernel (e.g. Jupyter), else False.

    Returns False when IPython is not installed, when there is no active IPython
    shell, or when the active shell's config has no ``IPKernelApp`` entry.
    """
    try:
        from IPython import get_ipython
    except ImportError:
        return False

    shell = get_ipython()
    # `get_ipython()` returns None in a plain Python interpreter; without this guard
    # the `.config` access below would raise AttributeError.
    if shell is None:
        return False
    return "IPKernelApp" in shell.config


IN_JUPYTER = am_i_in_jupyter()

In [6]:
def download(
    url: str,
    dataset_name: str,
    dest_folder: Path | str,
    timeout: float = 30.0,
) -> Tuple[Path, str]:
    """Download a text corpus, save it to disk, and return both the path and the text.

    Parameters
    ----------
    url : str
        Location of the raw text file.
    dataset_name : str
        Used as the file stem; the corpus is written to ``<dest_folder>/<dataset_name>.txt``.
    dest_folder : Path | str
        Destination directory; created (with parents) if it does not exist.
    timeout : float
        Seconds to wait for the server before giving up, by default 30.

    Returns
    -------
    filepath : Path
        Path of the file that was written.
    corpus : str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    dest_folder_path = Path(dest_folder)
    dest_folder_path.mkdir(parents=True, exist_ok=True)
    filepath = dest_folder_path / f"{dataset_name}.txt"

    response = requests.get(url, timeout=timeout)
    # Check the status *before* touching the body so an error page is never
    # written to disk or returned as the corpus.
    response.raise_for_status()

    # The whole body is needed in memory anyway (it is returned as `corpus`),
    # so write it in one go instead of streaming chunks.
    filepath.write_bytes(response.content)

    return filepath, response.text

In [7]:
# Fetch the corpus: `download` returns the path of the saved file and the text itself.
filepath, corpus = download(composer.url, composer.dataset_name, composer.data_folder)
pprint(filepath)
print(corpus[:100])  # peek at the first 100 characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [8]:
# Hold out the last 10% of the corpus (split by character position) for validation.
N = len(corpus)
train_data = corpus[: int(N * 0.9)]
valid_data = corpus[int(N * 0.9) :]

# Build the tiktoken BPE tokenizer named in the config ("gpt2" by default),
# then display the encodings tiktoken knows about.
tokenizer = tiktoken.get_encoding(composer.encoding_name)
tiktoken.list_encoding_names()

['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base']

In [9]:
# Tokenize both text splits into token ids and report how many tokens each contains.
train_ids = tokenizer.encode_ordinary(train_data)
valid_ids = tokenizer.encode_ordinary(valid_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(valid_ids):,} tokens")

train has 301,966 tokens
val has 36,059 tokens


In [10]:
# Sanity check: decode the first 2 token ids, then the first 100, back into text.
print(tokenizer.decode(train_ids[:2]))
print("-" * 80)
print(tokenizer.decode(train_ids[:100]))


First Citizen
--------------------------------------------------------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we


In [11]:
# Export the token ids as raw uint16 binaries at the configured train/valid paths.
# uint16 can only represent ids up to 65,535, so fail loudly if the chosen encoding
# does not fit (e.g. `cl100k_base`, which the config allows, has a larger vocabulary).
assert tokenizer.max_token_value <= np.iinfo(np.uint16).max, (
    f"token ids of encoding {composer.encoding_name!r} do not fit in uint16"
)

train_ids = np.array(train_ids, dtype=np.uint16)
valid_ids = np.array(valid_ids, dtype=np.uint16)

train_ids.tofile(composer.train_path)
valid_ids.tofile(composer.valid_path)

# train file has 301,966 tokens
# valid file has 36,059 tokens

## Dataset and Dataloading (Poor Man's Dataloader)

As Karpathy puts it, he implemented a poor man's
[dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).
We will start by dissecting the code and understanding how it works and finally,
show that everything can be done with PyTorch's `Dataset` and `DataLoader`.

### Memory Mapping

Firstly, Karpathy uses `numpy`'s
[memory mapping](https://numpy.org/doc/stable/reference/generated/numpy.memmap.html)
(`numpy.memmap`) to load the data. Memory mapping is used to create a
memory-mapped array from a binary file. This involves mapping the contents of a
file directly into the virtual memory space of the calling process. This allows
applications to access the file data as if it were loaded in memory, using
pointer operations or array indexing, without the need for explicit read or
write operations.

This essentially means that you can access small segments of a large file
without having to load the entire file into memory. The concept draws
similarities to the use of [generators](https://wiki.python.org/moin/Generators)
in Python, where you can iterate over a large dataset without having to load the
entire dataset into memory.

In [12]:
# Open the exported training tokens as a read-only memory map rather than reading the file into memory.
# NOTE: this rebinds `train_data`, which until now held the raw training text.
train_data = np.memmap(composer.train_path, dtype=np.uint16, mode="r")
train_data_dtype = train_data.dtype
train_data_shape = train_data.shape

print(f"data_dtype: {train_data_dtype}, data_shape: {train_data_shape}")

data_dtype: uint16, data_shape: (301966,)


We see that the shape of train data is `(301966,)`, which means that it is a 1D (flattened) array 
with $301966$ elements - this is basically the length of the entire train corpus, in terms of
tokens.

### Notation, Context Length, Shuffling and Batching

However, we are not going to pass the entire training corpus as is to the model.
Instead, we are going to pass a **batch** of sequences (each sequence of length
`context_length`) to the model at a time.

#### Notation

More formally, let's consider a sequence $\mathcal{S}$ of tokens
$(t_1, t_2, \ldots, t_L)$, where $L$ is the length of the sequence. We are going
to pass a batch of sequences $\mathcal{B}$ to the model at a time. Consequently,
the input sequence to the model can be more concisely defined as a matrix $\mathbf{X}$
residing in the space $\mathbb{R}^{\mathcal{B} \times L}$, where $\mathcal{B}$ is the batch size and
$L$ is the length of the sequence.

$$
\mathbf{X} = \begin{bmatrix}
t_{1,1} & t_{1,2} & \ldots & t_{1,L} \\
t_{2,1} & t_{2,2} & \ldots & t_{2,L} \\
\vdots & \vdots & \ddots & \vdots \\
t_{\mathcal{B},1} & t_{\mathcal{B},2} & \ldots & t_{\mathcal{B},L} \\
\end{bmatrix} \in \mathbb{R}^{\mathcal{B} \times L}
$$

where $t_{i,j}$ is the $j$-th token in the $i$-th sequence.

#### Context Length / Block Size

$L$ is often referred to as the sequence length, or in the context of GPT, it is
the `block_size` or `context_length` or `max_seq_len`.

It is the length of the sequence that the model will be trained on and is also
the context length/context window that we often hear about.

For example,
[Gemini 1.5](https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024)
was announced to have a standard $128,000$ token context window, up to a maximum
of $1$ million max length.

Typically, I think that if your model is trained on a certain context length, it
is not trivial to change it. For example, if you train a model on a context
length of $128$, you cannot simply change it to $256$ without retraining the
model. But it seems that it is increasingly possible to do so.

Let's look at an example, if we define our $L$ to be $32$, then we would expect each
sequence to be of length $32$.

In [13]:
# Take the first L = 32 tokens of the training data ...
first_sequence = train_data[0:0+32]
pprint(first_sequence)

# ... and decode them back into text.
first_sequence_decoded = tokenizer.decode(first_sequence)
print(first_sequence_decoded)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are


The example is just extracting $1$ such sequence $\mathcal{S}$ from the train
corpus. To leverage the prowess of linear algebra operations in CUDA, we would
typically pass a batch of sequences $\mathcal{B}$ to the model at a time.

Furthermore, we would require some level of randomness in the sequences that we
pass to the model to enable generalisation. You really do not want the model to
overfit to an ordered sequence of tokens in the training corpus.

To this end, let's see how Karpathy implements batching and shuffling of the
sequences.

#### Shuffling and Discrete Uniform Sampling

To enable shuffling, Karpathy generates a tensor of random integers (essentially a list of
random integers), which serve as indices. These indices are used to select
random sequences from the training (and validation) data.

For simplicity, let's look at the case where batch size is reduced to $\mathcal{B} = 1$.
This means we only need to sample $1$ sequence from the training data - and consequently
we only need $1$ random index.

We can easily achieve this via `torch.randint` which generates random integers
from a discrete uniform distribution over the half-open interval $[l, h)$,
and since we only want to sample $1$ sequence, we set `size=(1,)`.

In [24]:
# Draw a single start index from the half-open interval [low, high).
# NOTE(review): the generator is created without an explicit `manual_seed` call.
generator = torch.Generator(device=composer.device)
low, high = 0, len(train_data) - composer.block_size
size = (1,)  # one index -> one sequence
indices: torch.Tensor = torch.randint(low=low, high=high, size=size, generator=generator)
pprint(indices)
pprint(indices.shape)

The mathematical operation performed by `torch.randint(low, high, size, generator)` can be described as drawing samples from a uniform discrete distribution. Each element of the resulting tensor is an independent and identically distributed {cite}`radford2019language` (i.i.d.) random variable $X_i$ with the following probability mass function (PMF):

$$
\mathbb{P}(X_i = k) = \frac{1}{h - l} \quad \text{for} \, k = l, \ldots, h-1 
$$

This PMF implies that each integer in the range $[l, h-1]$ has an equal probability of being selected.

In our example, we randomly sampled an index $136,016$ from the training data. We can then
construct a sequence $\mathcal{S}$ by selecting the tokens at the index $136,016$ and the
next $L$ tokens defined by the block size. For simplicity, we will set $L = 8$, which
we have conveniently defined in our `composer` configuration when `debug` is set to `True`.

In [26]:
# `indices` holds exactly one start position here; turn it into a plain int so the
# slice does not rely on implicit tensor-to-index conversion.
start = int(indices.item())
random_sequence = train_data[start : start + composer.block_size]
pprint(random_sequence)
pprint(random_sequence.shape)

# Decode once and let the cell display the result as its last expression.
random_sequence_decoded = tokenizer.decode(random_sequence)
random_sequence_decoded

' written there, and to them say,'

One might wonder why the upper bound of the random integers is
`len(train_data) - block_size`. This guarantees that every sampled start index
leaves room for a full sequence: as we shall soon see, we use these `indices` to
slice a sequence of length `block_size` from the data, starting at `index` and
ending at `index + block_size`.

#### Batching

Now that we understand how to sample a single sequence from the training data,
let's look at how we can sample a batch of sequences.
PyTorch made it easy for you, as we can just simply change the `size` parameter
to `(batch_size,)`.

In our case, if we set $\mathcal{B} = 2$, we would expect to sample $2$ sequences
from the training data - and consequently we would need $2$ random indices.

In [27]:
# Same sampling as before, but draw one start index per sequence in the batch.
# NOTE(review): the generator is created without an explicit `manual_seed` call.
generator = torch.Generator(device=composer.device)
low, high = 0, len(train_data) - composer.block_size
size = (composer.batch_size,)  # one index per sequence
indices: torch.Tensor = torch.randint(low=low, high=high, size=size, generator=generator)
pprint(indices)
pprint(indices.shape)

We then construct a batch of sequences $\mathcal{B}$ by selecting the tokens at the
indices $136,016$ and $197,976$ and the next $L$ tokens via a for loop - and using `torch.stack`
to stack the sequences into a tensor of shape $\mathbb{R}^{\mathcal{B} \times L}$.

In [29]:
# For every sampled start index: slice `block_size` tokens, cast uint16 -> int64,
# wrap as a tensor, and stack the per-index tensors into a single batch tensor.
x = torch.stack([torch.from_numpy((train_data[index : index + composer.block_size]).astype(np.int64)) for index in indices])
pprint(x)
pprint(x.shape)

It is worth reconciling the fact that the slicing uses `[index:index + block_size]`, which
completes the reasoning behind the `len(train_data) - block_size` upper bound in
the `torch.randint` function call. Consider that if we do not subtract `block_size` from
the length of the training data, we might sample a start index close to the end of the
training data, and `index + block_size` would then run past the end of the array. NumPy
does not raise an error for such a slice - it silently returns fewer than `block_size`
tokens - so the sequences would have unequal lengths and `torch.stack` would fail.

SyntaxError: invalid syntax (3936093827.py, line 1)

**TODO:** check the makemore notes and fill in this section.

In [19]:
def get_batch(
    composer: Composer,
    *,
    split: Literal["train", "valid"],
    batch_size: int,
    block_size: int,
    device: torch.device,
    device_type: Literal["cpu", "cuda"] = "cpu",
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of (input, target) token sequences from one data split.

    Parameters
    ----------
    composer : Composer
        Configuration; only ``train_path`` and ``valid_path`` are read.
    split : {"train", "valid"}
        ``"train"`` reads ``composer.train_path``; any other value reads
        ``composer.valid_path``.
    batch_size : int
        Number of sequences to sample.
    block_size : int
        Number of tokens per sequence.
    device : torch.device
        Device the returned tensors are moved to.
    device_type : {"cpu", "cuda"}
        When ``"cuda"``, the tensors are pinned and transferred with
        ``non_blocking=True``; otherwise a plain ``.to(device)`` is used.

    Returns
    -------
    x, y : Tuple[torch.Tensor, torch.Tensor]
        Two ``int64`` tensors of shape ``(batch_size, block_size)``. ``y`` holds the
        same windows as ``x`` shifted one token to the right (the next-token targets).

    Raises
    ------
    ValueError
        If the split has no room for a window of ``block_size`` tokens plus its
        shifted target.
    """
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    data_path = composer.train_path if split == "train" else composer.valid_path
    data = np.memmap(data_path, dtype=np.uint16, mode="r")

    # Valid start positions are [0, len(data) - block_size): the target window ends at
    # start + 1 + block_size, which must not exceed len(data).
    num_starts = len(data) - block_size
    if num_starts <= 0:
        raise ValueError(
            f"block_size={block_size} is too large for the {split!r} split, "
            f"which has only {len(data)} tokens"
        )

    # Plain Python ints make the slice bounds explicit.
    starts = torch.randint(num_starts, (batch_size,)).tolist()

    # Tokens are stored as uint16; cast each window to int64 before stacking.
    x = torch.stack([torch.from_numpy(data[i : i + block_size].astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy(data[i + 1 : i + 1 + block_size].astype(np.int64)) for i in starts])

    if device_type == "cuda":
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

# Re-seed right before sampling, then draw a single training sequence of 8 tokens
# and unpack it into inputs `x` and targets `y`.
seed_all(composer.seed, seed_torch=True, set_torch_deterministic=False)
train_batch = get_batch(composer, split="train", batch_size=1, block_size=8, device=composer.device)
x, y = train_batch
pprint(x)
pprint(y)

It is relatively simple to understand, and since there is no need to
[collate](https://pytorch.org/docs/stable/data.html#dataloader-collate-fn) the
data, things are a bit easier.

In [20]:
# Inspect the shapes of the inputs and targets returned by `get_batch` above.
pprint(x.shape)
pprint(y.shape)

[^1]: [OpenAI tiktoken](https://github.com/openai/tiktoken)