# Data Preparation For LLMs

- Preparing the input text for an LLM involves:
  - tokenizing the text
  - converting the tokens into integers (IDs)
  - converting the integers into vector embeddings.

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [4]:
## Load the data
fp: str = "../../data/the-verdict.txt"

with open(fp, "r", encoding="utf-8") as f:
    data = f.read()

print(f"Total number of characers: {len(data):,}\n\n")
print(f"The first 100 characters: {'====' * 10}\n{data[:100]}")

Total number of characers: 20,479


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [5]:
# Split the text on white spaces and punctuation. The words are intentionally NOT normalized.
# This is because it enables the LLM to differentiate between proper and regular nouns, etc.
text: str = data[:100]
pattern: str = r'([,.?_!"()\']|--|\s)'
re.split(pattern=pattern, string=text)

['I',
 ' ',
 'HAD',
 ' ',
 'always',
 ' ',
 'thought',
 ' ',
 'Jack',
 ' ',
 'Gisburn',
 ' ',
 'rather',
 ' ',
 'a',
 ' ',
 'cheap',
 ' ',
 'genius',
 '--',
 'though',
 ' ',
 'a',
 ' ',
 'good',
 ' ',
 'fellow',
 ' ',
 'enough',
 '--',
 'so',
 ' ',
 'it',
 ' ',
 'was',
 ' ',
 'no',
 ' ',
 'g']

In [6]:
# Remove whitespaces
preprocessed: list[str] = re.split(pattern=pattern, string=text)
preprocessed = [ch for ch in preprocessed if ch.strip()]
preprocessed

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'g']

In [7]:
# The entire data
# Remove whitespaces
preprocessed: list[str] = re.split(pattern=pattern, string=data)
preprocessed = [ch for ch in preprocessed if ch.strip()]
len(preprocessed), len(data)

(4649, 20479)

In [8]:
# Create vocabulary. i.e. a dict containing all the distinct words mapped to unique ineger values. (bag of words)
unk_token: str = "<|unk|>"
end_of_text: str = "<|endoftext|>"
vocab: dict[str, any] = {
    ch: idx for idx, ch in enumerate(sorted(set(preprocessed)), start=0)
}
vocab[unk_token] = len(vocab) + 1
vocab[end_of_text] = len(vocab) + 1

In [9]:
# Convert tokens to IDs (encode)
text: str = (
    "Because of the scale of many ML systems, they consume a massive amount of data - ('Neidu, 2024)"
)
tok_text: list[str] = re.split(pattern=pattern, string=text)
tok_text = [ch for ch in tok_text if ch.strip()]
tok_IDs: list[int] = [
    vocab.get(ch) if ch in vocab else vocab.get(unk_token) for ch in tok_text
]

", ".join([str(ch) for ch in tok_IDs])

'1160, 738, 1013, 1160, 738, 1160, 1160, 1160, 5, 1019, 1160, 119, 1160, 1160, 738, 1160, 1160, 3, 2, 1160, 5, 1160, 4'

In [10]:
# Convert token IDs back to tokens
idx_to_text: dict[int, str] = {idx: ch for ch, idx in vocab.items()}

res: list[str] = [idx_to_text.get(idx) for idx in tok_IDs]

# Remove the whitespaces after punctuation
pattern_1: str = r'\s+([,.?!"()\'])'
res: str = " ".join(res)
res = re.sub(pattern=pattern_1, repl=r"\1", string=res)
res

"<|unk|> of the <|unk|> of <|unk|> <|unk|> <|unk|>, they <|unk|> a <|unk|> <|unk|> of <|unk|> <|unk|>(' <|unk|>, <|unk|>)"

In [11]:
class SimpleTokenizerV1:
    """
    A simple tokenizer that splits text into tokens based on a predefined vocabulary.

    The `SimpleTokenizerV1` class provides methods to encode text into a list of token IDs and decode a list
    of token IDs back into text. It uses a predefined vocabulary to map between tokens and their corresponding IDs.

    Args:
        vocab (dict[str, int]): A dictionary mapping tokens to their corresponding IDs.

    Methods:
        encode(text: str) -> list[int]:
            Tokenize a string into a list of token IDs.
        decode(tok_IDs: list[int]) -> str:
            Convert a list of token IDs back into a string.
    """

    def __init__(self, vocab: dict[str, int]):
        self.vocab = vocab
        self.pattern_1: str = r'([,.?_!"()\']|--|\s)'
        self.pattern_2: str = r'\s+([,.?!"()\'])'
        self.idx_to_text: dict[int, str] = {idx: ch for ch, idx in self.vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Tokenize a string into a list of tokens."""
        unk_token: str = "<|unk|>"
        tok_text: list[str] = re.split(pattern=self.pattern_1, string=text)
        tok_text = [ch for ch in tok_text if ch.strip()]
        tok_IDs: list[int] = [
            vocab.get(ch) if ch in vocab else vocab.get(unk_token) for ch in tok_text
        ]
        return tok_IDs

    def decode(self, tok_IDs: list[int]) -> str:
        """Convert a list of tokens into a string."""
        text: str = " ".join([self.idx_to_text.get(idx) for idx in tok_IDs])
        # Clean up the spaces around punctuation
        text = re.sub(pattern=self.pattern_2, repl=r"\1", string=text)
        return text

In [12]:
text: str = "Who is the greatest striker in the world?"
tokenizer: SimpleTokenizerV1 = SimpleTokenizerV1(vocab=vocab)
tok_IDs: list[int] = tokenizer.encode(text)
tok_IDs

[1160, 595, 1013, 517, 1160, 579, 1013, 1160, 10]

In [13]:
tokenizer.decode(tok_IDs=tok_IDs)

'<|unk|> is the greatest <|unk|> in the <|unk|>?'

In [14]:
text1: str = "Hello, do you like tea?"
text2: str = "In the sunlit terraces of the palace."
text: str = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [15]:
tokenizer = SimpleTokenizerV1(vocab)
print(tokenizer.encode(text))

[1160, 5, 362, 1155, 642, 1000, 10, 1161, 57, 1013, 981, 1009, 738, 1013, 1160, 7]


In [16]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair Encoding

```sh
pip install tiktoken
```

- It encodes unknown words properly.

In [17]:
import tiktoken


text: str = "Who is the greatest striker in the world?"
tokenizer = tiktoken.get_encoding("gpt2")
tok_IDs: list[int] = tokenizer.encode(text)
tok_IDs

[8241, 318, 262, 6000, 19099, 287, 262, 995, 30]

In [18]:
tokenizer.decode(tok_IDs)

'Who is the greatest striker in the world?'

In [19]:
end_of_text: str = "<|endoftext|>"

text: str = f"Who is the greatest striker in the world? {end_of_text} AI is booming!"
tok_IDs: list[int] = tokenizer.encode(text, allowed_special={end_of_text})
tok_IDs

[8241,
 318,
 262,
 6000,
 19099,
 287,
 262,
 995,
 30,
 220,
 50256,
 9552,
 318,
 32017,
 0]

In [20]:
tokenizer.decode(tok_IDs)

'Who is the greatest striker in the world? <|endoftext|> AI is booming!'

In [21]:
# How does BPE handle unknown workds/tokens??
text: str = "ChineiduTheGreat"
tokenizer = tiktoken.get_encoding("gpt2")
tok_IDs: list[int] = tokenizer.encode(text, allowed_special={end_of_text})
tok_IDs

[1925, 500, 312, 84, 464, 13681]

In [22]:
# BPE breaks down unkowned tokens into subwords and individual characters. This prevents BPE from replacing
# unknown tokens with a special token sunch as <|unk|>
(
    tokenizer.decode([1925]),  # Ch
    tokenizer.decode([500]),  # ine
    tokenizer.decode([312]),  # id
    tokenizer.decode([84]),  # u
    tokenizer.decode([464]),  # The
    tokenizer.decode([13681]),  # Great
)

('Ch', 'ine', 'id', 'u', 'The', 'Great')

### Data Sampling With A Sliding Window

In [23]:
# Tokenize the entire data using BPE

tokenizer = tiktoken.get_encoding("gpt2")
tok_data: list[int] = tokenizer.encode(data)

tok_data[:5], len(tok_data)

([40, 367, 2885, 1464, 1807], 5145)

In [24]:
# Create input-target pairs for the next-word prediction
enc_sample: list[int] = tok_data[:50]
context_size: int = 4
x: list[int] = enc_sample[:context_size]
y: list[int] = enc_sample[1 : context_size + 1]

print(f"{x = }")
print(f"{y = }")

x = [40, 367, 2885, 1464]
y = [367, 2885, 1464, 1807]


In [25]:
for idx in range(1, context_size + 1):
    print(f"{enc_sample[:idx]} ---> {enc_sample[idx]}")

[40] ---> 367
[40, 367] ---> 2885
[40, 367, 2885] ---> 1464
[40, 367, 2885, 1464] ---> 1807


In [26]:
for idx in range(1, context_size + 1):
    print(
        f"{tokenizer.decode(enc_sample[:idx])} ---> {tokenizer.decode([enc_sample[idx]])}"
    )

I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought


#### Data Loader Implementation

In [27]:
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    def __init__(self, text: str, tokenizer: Any, max_length: int, stride: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        print(f"{len(token_ids) = :,}")

        for idx in range(0, len(token_ids) - max_length, stride):
            input_chunk: list[int] = token_ids[idx : (idx + max_length)]
            target_chunk: list[int] = token_ids[idx + 1 : (idx + max_length + 1)]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self) -> int:
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, ...]:
        x = self.input_ids[idx]
        y = self.target_ids[idx]
        return (x, y)


def create_dataloader(
    text: str,
    batch_size: int = 4,
    max_length: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
) -> DataLoader:
    """Create a dataloader for the given text data."""
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset: Dataset = GPTDataset(
        text=text, tokenizer=tokenizer, max_length=max_length, stride=stride
    )
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last
    )
    return dataloader

In [28]:
tokenizer = tiktoken.get_encoding("gpt2")
dataset_ = GPTDataset(text=data[:30], tokenizer=tokenizer, max_length=4, stride=1)
print(f"{dataset_.input_ids=}, \n{dataset_.target_ids=}")

len(token_ids)=9
dataset_.input_ids=[tensor([  40,  367, 2885, 1464]), tensor([ 367, 2885, 1464, 1807]), tensor([2885, 1464, 1807, 3619]), tensor([1464, 1807, 3619,  402]), tensor([1807, 3619,  402,  271])], 
dataset_.target_ids=[tensor([ 367, 2885, 1464, 1807]), tensor([2885, 1464, 1807, 3619]), tensor([1464, 1807, 3619,  402]), tensor([1807, 3619,  402,  271]), tensor([3619,  402,  271,   65])]


In [29]:
dataset_[0]

(tensor([  40,  367, 2885, 1464]), tensor([ 367, 2885, 1464, 1807]))

In [30]:
dataset_ = GPTDataset(text=data[:30], tokenizer=tokenizer, max_length=4, stride=2)
print(f"{dataset_.input_ids=}, \n{dataset_.target_ids=}")

len(token_ids)=9
dataset_.input_ids=[tensor([  40,  367, 2885, 1464]), tensor([2885, 1464, 1807, 3619]), tensor([1807, 3619,  402,  271])], 
dataset_.target_ids=[tensor([ 367, 2885, 1464, 1807]), tensor([1464, 1807, 3619,  402]), tensor([3619,  402,  271,   65])]


In [31]:
dataloader = create_dataloader(
    text=data, batch_size=4, max_length=4, stride=1, shuffle=False, drop_last=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)

first_batch

len(token_ids)=5,145


[tensor([[  40,  367, 2885, 1464],
         [ 367, 2885, 1464, 1807],
         [2885, 1464, 1807, 3619],
         [1464, 1807, 3619,  402]]),
 tensor([[ 367, 2885, 1464, 1807],
         [2885, 1464, 1807, 3619],
         [1464, 1807, 3619,  402],
         [1807, 3619,  402,  271]])]

In [32]:
second_batch = next(data_iter)
second_batch

[tensor([[ 1807,  3619,   402,   271],
         [ 3619,   402,   271, 10899],
         [  402,   271, 10899,  2138],
         [  271, 10899,  2138,   257]]),
 tensor([[ 3619,   402,   271, 10899],
         [  402,   271, 10899,  2138],
         [  271, 10899,  2138,   257],
         [10899,  2138,   257,  7026]])]

In [33]:
# Using stride=2
stride: int = 2

dataloader = create_dataloader(
    text=data, batch_size=8, max_length=4, stride=stride, shuffle=False, drop_last=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

console.print(f"{inputs=}, \n{targets=}")

# e.g. slide by 2 index positions
# [   40,   367,  2885,  1464],
# [ 2885,  1464,  1807,  3619]

len(token_ids)=5,145


### Token Embeddings

- Preparing the input text for an LLM involves:
  - tokenizing the text
  - converting the tokens into integers (IDs)
  - converting the integers into vector embeddings.

In [34]:
# Initialize the embedding weights with random values which will be optimized during training.
input_ids: Tensor = torch.tensor([2, 3, 5, 1])

torch.manual_seed(42)
vocab_size: int = 6
output_dim: int = 3
embedding_layer = nn.Embedding(vocab_size, output_dim)  # lookup table
print(f"{embedding_layer.weight.shape=}\n")
print(embedding_layer.weight)

embedding_layer.weight.shape=torch.Size([6, 3])

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)


In [35]:
# Create it manually!
torch.manual_seed(42)
torch.randn(vocab_size, output_dim, requires_grad=True)

tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)

In [36]:
# Embed the input
print(f"{embedding_layer(torch.tensor([3]))}")

# OR (using matrix multiplication)
res: Tensor = (
    F.one_hot(torch.tensor([3]), num_classes=6).float() @ embedding_layer.weight
)
print(f"OR\n{res}")

tensor([[-0.6866,  0.6105,  1.3347]], grad_fn=<EmbeddingBackward0>)
OR
tensor([[-0.6866,  0.6105,  1.3347]], grad_fn=<MmBackward0>)


In [37]:
# Embed the entire input
embedding_layer(input_ids)

tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

<br>

### Encoding Word Positions

- The transformer architecture, unlike recurrent neural networks (RNNs), processes all words in a sentence simultaneously.
- This parallel processing is efficient but lacks a built-in mechanism to understand the order of words, which is crucial for language understanding.
- In summary, `positional encodings` are essential in the transformer architecture to provide information about the `order of words` in a sequence, enabling the model to `understand` and process natural language effectively.

#### Absolute Positional Encoding

- Absolute positional encoding is a technique used in transformer architectures to `encode the positions of tokens in a sequence`.
- It provides each position in the sequence with a unique representation, which is added to the corresponding word embeddings to inform the model about the position of each word.
- This is necessary because transformers process the entire input sequence simultaneously and, without positional encoding, would lack the ability to understand the order of words.

#### Relative Positional Encoding

- Relative positional encodings are an alternative to absolute positional encodings used in transformer models to `incorporate information about the relative positions of tokens in a sequence`, rather than their absolute positions.
- This approach can be more flexible and efficient, particularly for tasks where the relationships between tokens are more important than their fixed positions.

In [38]:
tokenizer = tiktoken.get_encoding("gpt2")
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_core_bpe',
 '_encode_bytes',
 '_encode_only_native_bpe',
 '_encode_single_piece',
 '_mergeable_ranks',
 '_pat_str',
 '_special_tokens',
 'decode',
 'decode_batch',
 'decode_bytes',
 'decode_bytes_batch',
 'decode_single_token_bytes',
 'decode_tokens_bytes',
 'decode_with_offsets',
 'encode',
 'encode_batch',
 'encode_ordinary',
 'encode_ordinary_batch',
 'encode_single_token',
 'encode_with_unstable',
 'eot_token',
 'max_token_value',
 'n_vocab',
 'name',
 'special_tokens_set',
 'token_byte_values']

In [39]:
torch.manual_seed(42)

tokenizer = tiktoken.get_encoding("gpt2")
vocab_size: int = tokenizer.n_vocab
output_dim: int = 256
token_embedding_layer = nn.Embedding(vocab_size, output_dim)  # lookup table
print(f"{token_embedding_layer.weight.shape=}\n")  # (vocab_size, output_dim)

token_embedding_layer.weight.shape=torch.Size([50257, 256])



In [40]:
batch_size: int = 8
max_length: int = 4
stride: int = 4  # max_length and stride are equal to prevent overlapping.

dataloader = create_dataloader(
    text=data,
    batch_size=batch_size,
    max_length=max_length,
    stride=stride,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"{inputs.shape = }")  # (batch_size, max_length)
print(f"{targets.shape = }")  # (batch_size, max_length)

len(token_ids)=5,145
inputs.shape = torch.Size([8, 4])
targets.shape = torch.Size([8, 4])


In [41]:
# Each token ID is embedded as an `output_dim` dimensional tensor output
token_embeddings = token_embedding_layer(inputs)  # (batch_size, max_length, output_dim)
print(f"{token_embeddings.shape = }")

token_embeddings.shape = torch.Size([8, 4, 256])


In [42]:
# Create the positional embedding
context_length: int = max_length
pos_embedding_layer = nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length)) 
print(f"{pos_embeddings.shape = }")  # (context_length, output_dim)

pos_embeddings.shape = torch.Size([4, 256])


In [43]:
token_embeddings[0].shape

torch.Size([4, 256])

In [44]:
token_embeddings.shape, pos_embeddings.shape

(torch.Size([8, 4, 256]), torch.Size([4, 256]))

In [45]:
# Compute the input embeddings. i.e. add positional embeddings to the token embeddings to enable the LLM to learn word positions.
input_embeddings = token_embeddings + pos_embeddings
print(f"{input_embeddings.shape = }")

input_embeddings.shape = torch.Size([8, 4, 256])
