In [2]:
import os
import pickle

from tqdm import tqdm

In [None]:
# loading a sample from our TinyStories dataset
# get_data_loader comes from the project-local `tools` module
from tools import get_data_loader
data_loader = get_data_loader(batch_size=5_000, split='train')

# pull just the first batch off the loader; the rest of the notebook is fitted on this one sample
batch = next(iter(data_loader))
# sanity check: how many items the batch holds, plus its first item
print(len(batch), batch[0])

In [None]:
# turn it into one string instead of a list of strings
combined_string = '\n\n'.join(batch)

# this is the largest set of characters i found earlier from a batch size of 1_000_000, total 95 characters
KNOWN_CHARS = ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']

# find the unique characters in this batch and union them with the known set.
# a small batch can miss some of the rarer characters, so the known set fills the gaps.
# taking the union (instead of swapping in the known list outright) means a character that
# shows up in this batch but not in the known list still gets an id, rather than raising a
# KeyError later when the text is encoded through stoi.
# when the batch only contains known characters the result is exactly the known list.
chars = sorted(set(combined_string) | set(KNOWN_CHARS))
v = len(chars)
print('\n', chars, v)

In [5]:
# build the character -> integer lookup table; a character's id is its position in `chars`
stoi = dict(zip(chars, range(len(chars))))

def char_encode(s):
    """Encoder: take a string, output the list of integer ids of its characters."""
    return [stoi[c] for c in s]

# Regex
This is a pre-processing stage where we set the rules for which kinds of characters are allowed to be merged together.

In [6]:
import regex as re

In [7]:
# don't ask me the specifics of how this plays out, i just know it's what they used for GPT4
# reading the alternatives left to right, a chunk is one of:
#   '(?i:[sdmt]|ll|ve|re)         an apostrophe followed by s/d/m/t/ll/ve/re, case-insensitive
#   [^\r\n\p{L}\p{N}]?+\p{L}+     a run of letters, optionally led by one character that is not a letter, number, \r or \n
#   \p{N}{1,3}                    one to three numeric characters
#    ?[^\s\p{L}\p{N}]++[\r\n]*    an optional space, a run of characters that are not whitespace/letters/numbers, then any trailing \r or \n
#   \s*[\r\n]                     whitespace ending in a \r or \n
#   \s+(?!\S)                     whitespace that is not followed by a non-whitespace character
#   \s+                           any remaining whitespace
# NOTE: the \p{L} / \p{N} classes are why this goes through the `regex` package rather than the stdlib `re`
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
# if you want to mess around with building your own tokenizer, then ^this string is one of the things to mess around with

In [None]:
# compile the split pattern once so the same pattern object can be reused below
# (`re` here is the `regex` package, imported under the name `re`)
compiled_pattern = re.compile(GPT4_SPLIT_PATTERN)
print(compiled_pattern)

In [None]:
# break the combined text into chunks using the compiled split pattern
text_chunks = compiled_pattern.findall(combined_string)
# total number of characters next to the number of chunks they were split into
print(len(combined_string), len(text_chunks))

In [None]:
# eyeball the split: the first 100 characters of the raw text, then the first 100 chunks
print(combined_string[:100])
print(text_chunks[:100])

In [None]:
# input text preprocessing: turn every regex chunk into its own list of character ids
# (to do actual bytez instead of characters, use list(ch.encode("utf-8")) per chunk instead of char_encode)
ids = list(map(char_encode, text_chunks))
# saving this for later just to see how much compression we get
ids_backup = ids
print(len(ids), ids[:100])

So this regex splits the text into chunks, and merges are only ever allowed inside a chunk, never across chunk boundaries. That means the regex output we saw above is an upper limit on the tokens we could end up with given a large enough vocabulary, rather than a starting point. ngl, the reason I didn't use regex in bpe_v1 is that I thought it was a starting point and didn't understand how subwords were supposed to work -_-

# CPE tokenization
CPE = character-pair encoding instead of byte-pair encoding. Honestly we could've done BPE, and the only difference would've been a negligibly larger embedding tensor in our model (specifically 256-95=161 extra rows), but we might as well use characters since we know we're only going to be using this one dataset.

In [12]:
vocab_size = 16384 # the desired final vocabulary size
# each merge adds exactly one new token on top of the v base characters,
# so this is how many merges it takes to reach vocab_size
num_merges = vocab_size - v

In [None]:
# most models work off bytes, but we'll be simplifying to just the index of each unique character
# char_encode only iterates over its argument, so handing it the list of characters works the same as a string
base_indices = char_encode(chars)
print(base_indices)

In [14]:
def get_stats(ids, counts=None):
    """
    Tally how often each pair of neighbouring integers occurs in ids.

    ids:    sequence of integers to scan
    counts: optional existing tally dict; when given it is updated in place
            and returned, otherwise a fresh dict is created

    Returns the dict mapping (left, right) -> number of occurrences.
    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    """
    tally = counts if counts is not None else {}
    # walk every position that still has a right-hand neighbour
    for pos in range(len(ids) - 1):
        key = (ids[pos], ids[pos + 1])
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1
    return tally

def merge(ids, pair, idx):
    """
    Return a new list in which each occurrence of pair in ids is collapsed
    into the single token idx; the input list is left untouched.

    The scan runs left to right and does not overlap matches, so
    ids=[1, 1, 1], pair=(1, 1), idx=9 gives [9, 1].
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    merged = []
    final = len(ids) - 1  # index of the last element
    pos = 0
    while pos <= final:
        # a match needs a right-hand neighbour, so the last position can never start one
        starts_pair = pos < final and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        if starts_pair:
            merged.append(idx)
            pos += 2  # skip past both halves of the pair
            continue
        merged.append(ids[pos])
        pos += 1
    return merged

In [None]:
# now let's actually do it
merges = {} # (int, int) -> int
for i in tqdm(range(num_merges)):
    # count the number of times every consecutive pair appears
    stats = {}
    for chunk_ids in ids:
        # passing in stats will update it in place, adding up counts
        get_stats(chunk_ids, stats)
    # if every chunk has already collapsed to a single token there are no pairs left;
    # stop here instead of letting max() raise ValueError on an empty dict
    if not stats:
        print(f"no mergeable pairs left after {i} merges; stopping early")
        break
    # find the pair with the highest count
    pair = max(stats, key=stats.get)
    # mint a new token: assign it the next available id
    idx = v + i
    # replace all occurrences of pair in ids with idx
    # (merging chunk by chunk means a merge never crosses a regex chunk boundary)
    ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
    # save the merge
    merges[pair] = idx
    #print(f"merge {i+1}/{num_merges}: {pair} -> {idx} had {stats[pair]} occurrences")

In [None]:
# total number of tokens across all chunks, before and after the merges
og = sum(len(chunk) for chunk in ids_backup)
new = sum(len(chunk) for chunk in ids)
print("original length:", og) # ids_backup holds the un-merged character ids
print("ids length:", new) # ids holds the tokens left after the merges
print(f"compression ratio: {og / new:.2f}X")

In [31]:
# Ensure the models directory exists; exist_ok makes this a no-op when it is
# already there, so no separate existence check is needed
os.makedirs('./models', exist_ok=True)

# Prepare the tokenizer data to be saved
tokenizer_data = {
    'stoi': stoi,  # Character to integer mapping
    'merges': merges  # Merges dictionary: (int, int) pair -> merged token id
}

# Save the tokenizer data using pickle; the file is named after the current vocab_size
with open(f'./models/{vocab_size}.model', 'wb') as f:
    pickle.dump(tokenizer_data, f)

In [30]:
# taking a pre-existing tokenizer and trimming it down to a smaller size
# i basically ran this cell and then one above it multiple times until i got to the smallest possible size (128)
vocab_size //= 2 # len(chars) for character-wise tokenization
# keep only the merges whose token id still fits inside the halved vocabulary
merges = {pair: token_id for pair, token_id in merges.items() if token_id < vocab_size}

In [32]:
# load a saved tokenizer back from disk and wrap it in the tokenizer class
# (load_tokenizer_data and BPE_Tokenizer come from the project-local `tokenizer` module)
from tokenizer import load_tokenizer_data, BPE_Tokenizer
vocab_size = 16384 # picks which saved file gets loaded, via the file name below
tokenizer_data = load_tokenizer_data(f'models/{vocab_size}.model')
tokenizer = BPE_Tokenizer(tokenizer_data['stoi'], tokenizer_data['merges'])

In [None]:
# try the tokenizer on a short hand-written prompt
prompt = 'Once upon a time there was a boy named Tim.'
# NOTE(review): display() presumably renders how the text gets split into tokens — confirm in tokenizer.py
print(tokenizer.display(prompt))

In [None]:
# same check on the first item of the loaded batch
print(tokenizer.display(batch[0]))

In [None]:
# decode every token id on its own to list what each id stands for
# (this prints one line per id, i.e. vocab_size lines in total)
for i in range(vocab_size):
    print(f"{i}: '{tokenizer.decode([i])}'")