# Data Preparation For LLMs

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console with three named styles ("info", "warning", "error"),
# each bound to a fixed hex colour.
custom_theme = Theme(
    dict(
        info="#76FF7B",
        warning="#FBDDFE",
        error="#FF0000",
    )
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas display settings: show up to 1,000 rows/columns and wide cell contents
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Silence every warning category for the rest of the session
warnings.filterwarnings(action="ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [4]:
# Read the whole raw text corpus into memory as one string
fp: str = "../../data/the-verdict.txt"

with open(file=fp, mode="r", encoding="utf-8") as f:
    data = f.read()

# Report the corpus size, then preview the first 100 characters under a separator
print(f"Total number of characers: {len(data):,}\n\n")
print(f"The first 100 characters: {'====' * 10}\n{data[:100]}")

Total number of characers: 20,479


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [5]:
# Tokenize a short sample by splitting on punctuation, double dashes and whitespace.
# The capturing group makes re.split keep the delimiters in the result.
# Case is intentionally NOT normalized, so the LLM can tell proper nouns
# from regular nouns, etc.
text: str = data[:100]
pattern: str = r'([,.?_!"()\']|--|\s)'
re.split(pattern, text)

['I',
 ' ',
 'HAD',
 ' ',
 'always',
 ' ',
 'thought',
 ' ',
 'Jack',
 ' ',
 'Gisburn',
 ' ',
 'rather',
 ' ',
 'a',
 ' ',
 'cheap',
 ' ',
 'genius',
 '--',
 'though',
 ' ',
 'a',
 ' ',
 'good',
 ' ',
 'fellow',
 ' ',
 'enough',
 '--',
 'so',
 ' ',
 'it',
 ' ',
 'was',
 ' ',
 'no',
 ' ',
 'g']

In [6]:
# Split the sample, then drop the empty and whitespace-only pieces
preprocessed: list[str] = [
    tok for tok in re.split(pattern=pattern, string=text) if tok.strip()
]
preprocessed

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'g']

In [7]:
# Same tokenization as above, now applied to the entire corpus:
# split, then drop the empty and whitespace-only pieces.
preprocessed: list[str] = [
    tok for tok in re.split(pattern=pattern, string=data) if tok.strip()
]
len(preprocessed), len(data)

(4649, 20479)

In [8]:
# Create the vocabulary, i.e. a dict mapping every distinct token to a unique integer ID.
# Real tokens are numbered from 1 in sorted order; ID 0 is reserved for the unknown token.
unk_token: str = "<|unk|>"
vocab: dict[str, int] = {
    token: idx for idx, token in enumerate(sorted(set(preprocessed)), start=1)
}
vocab[unk_token] = 0

In [9]:
# Encode: tokenize a sentence and look up each token's ID in the vocabulary.
# Tokens that are not in the vocabulary map to 0.
text: str = (
    "Because of the scale of many ML systems, they consume a massive amount of data - ('Neidu, 2024)"
)
tok_text: list[str] = [
    tok for tok in re.split(pattern=pattern, string=text) if tok.strip()
]
tok_IDs: list[int] = [vocab.get(tok, 0) for tok in tok_text]

", ".join(map(str, tok_IDs))

'0, 739, 1014, 0, 739, 0, 0, 0, 6, 1020, 0, 120, 0, 0, 739, 0, 0, 4, 3, 0, 6, 0, 5'

In [10]:
# Convert token IDs back to tokens (decode) using the inverted vocabulary
idx_to_text: dict[int, str] = {idx: ch for ch, idx in vocab.items()}

# Every ID in tok_IDs was produced from `vocab`, so direct indexing is safe
# and keeps the element type a plain `str` (no None values).
decoded_tokens: list[str] = [idx_to_text[idx] for idx in tok_IDs]

# Joining with spaces puts a space in front of every punctuation token;
# remove the whitespace that appears BEFORE punctuation.
pattern_1: str = r'\s+([,.?!"()\'])'
res: str = re.sub(pattern=pattern_1, repl=r"\1", string=" ".join(decoded_tokens))
res

"<|unk|> of the <|unk|> of <|unk|> <|unk|> <|unk|>, they <|unk|> a <|unk|> <|unk|> of <|unk|> <|unk|>(' <|unk|>, <|unk|>)"

In [17]:
class SimpleTokenizerV1:
    """
    A simple tokenizer that splits text into tokens based on a predefined vocabulary.

    The `SimpleTokenizerV1` class provides methods to encode text into a list of token IDs and decode a list
    of token IDs back into text. It uses a predefined vocabulary to map between tokens and their corresponding IDs.
    Tokens (or IDs) that are not in the vocabulary are mapped to the unknown token.

    Args:
        vocab (dict[str, int]): A dictionary mapping tokens to their corresponding IDs.
        unk_token (str): The token that stands in for out-of-vocabulary items.
            Defaults to "<|unk|>". If it is missing from `vocab`, ID 0 is used.

    Methods:
        encode(text: str) -> list[int]:
            Tokenize a string into a list of token IDs.
        decode(tok_IDs: list[int]) -> str:
            Convert a list of token IDs back into a string.
    """

    def __init__(self, vocab: dict[str, int], unk_token: str = "<|unk|>"):
        self.vocab = vocab
        self.unk_token: str = unk_token
        # ID assigned to out-of-vocabulary tokens; 0 when vocab has no unk entry
        self.unk_id: int = vocab.get(unk_token, 0)
        # Split on punctuation, double dashes and whitespace, keeping the delimiters
        self.pattern_1: str = r'([,.?_!"()\']|--|\s)'
        # Whitespace that precedes a punctuation mark (introduced by " ".join)
        self.pattern_2: str = r'\s+([,.?!"()\'])'
        self.idx_to_text: dict[int, str] = {idx: ch for ch, idx in self.vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Tokenize a string into a list of token IDs.

        Empty and whitespace-only pieces are dropped. Tokens missing from
        the vocabulary are mapped to the unknown-token ID.
        """
        pieces: list[str] = re.split(pattern=self.pattern_1, string=text)
        # Use the instance's own vocabulary, not a module-level global
        return [self.vocab.get(tok, self.unk_id) for tok in pieces if tok.strip()]

    def decode(self, tok_IDs: list[int]) -> str:
        """Convert a list of token IDs back into a string.

        IDs missing from the vocabulary are rendered as the unknown token.
        """
        text: str = " ".join(
            self.idx_to_text.get(idx, self.unk_token) for idx in tok_IDs
        )
        # Drop the space that the join inserted before each punctuation mark
        return re.sub(pattern=self.pattern_2, repl=r"\1", string=text)

In [18]:
# Encode a new sentence with the class-based tokenizer built on the corpus vocabulary
text: str = "Who is the greatest striker in the world?"
tokenizer: SimpleTokenizerV1 = SimpleTokenizerV1(vocab)
tok_IDs: list[int] = tokenizer.encode(text=text)
tok_IDs

[0, 596, 1014, 518, 0, 580, 1014, 0, 11]

In [19]:
# Decode the token IDs produced by the encode step (not the string tokens in `tok_text`)
tokenizer.decode(tok_IDs=tok_IDs)

'<|unk|> is the greatest <|unk|> in the <|unk|>?'

## Byte Pair Encoding

```sh
pip install tiktoken
```

In [23]:
import tiktoken

# Load tiktoken's "gpt2" encoding and encode the same sentence as above.
# NOTE: this rebinds `tokenizer` and `tok_IDs`, which previously held the
# SimpleTokenizerV1 instance and its output.
tokenizer = tiktoken.get_encoding("gpt2")
tok_IDs: list[int] = tokenizer.encode(text)
tok_IDs

[8241, 318, 262, 6000, 19099, 287, 262, 995, 30]

In [24]:
# Decode the tiktoken IDs back to text
tokenizer.decode(tok_IDs)

'Who is the greatest striker in the world?'