In [None]:
from pathlib import Path

# uv sync --extra tiktoken
import tiktoken

from atomiclm.tokenizer import BasicTokenizer

In [None]:
# --- Config ---

# Training corpus: a UTF-8 text file (not in repo).
DATA_PATH = '../data/the-verdict.txt'

# Destination for the trained tokenizer; save() appends .json automatically.
OUTPUT_PATH = '../out/vocab_tiktoken'

# Target vocabulary size (base 256 + merges).
VOCAB_SIZE = 4096

# Special-token text -> token ID.
# 100257 matches tiktoken's GPT-4 ID for <|endoftext|> — required for a compatible export.
SPECIAL_TOKENS = {
    '<|endoftext|>': 100257,
}

## Load
Read the raw text corpus used to learn merge rules.

In [None]:
# Read the entire corpus into memory as a single string; later cells use it as `text`.
with open(DATA_PATH, encoding='utf-8') as corpus_file:  # read-only text mode is open()'s default
    text = corpus_file.read()

print(f'Corpus size: {len(text):,} chars')

## Train
Run BPE to learn merge rules. Special tokens are registered after training — they are never split by the BPE algorithm.

In [None]:
# Start from a fresh tokenizer and train it on the corpus with VOCAB_SIZE as the target size.
tokenizer = BasicTokenizer()
tokenizer.train(text, VOCAB_SIZE)
# Special tokens are registered only once training has finished.
tokenizer.register_special_tokens(SPECIAL_TOKENS)

# Report the number of merge rules and the resulting vocabulary size.
print(f'Learned {len(tokenizer.merges)} merge rules, vocab size: {len(tokenizer.vocab)}')

## Save
Persist the trained tokenizer so it can be reloaded without retraining.

In [None]:
# Make sure the output directory exists so this cell also runs on a fresh
# checkout; whether save() creates missing parent directories is not visible
# from this notebook, and mkdir(exist_ok=True) is a no-op when it already exists.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# save() receives the extension-less path; the message shows the final .json filename.
tokenizer.save(OUTPUT_PATH)
print(f'Saved to {OUTPUT_PATH}.json')

## Reload
Load the tokenizer from disk and confirm the vocabulary size.

In [None]:
# Rebind `tokenizer` to a brand-new instance and load the saved file into it,
# so the cells below work with the tokenizer as read back from disk.
tokenizer = BasicTokenizer()
tokenizer.load(f'{OUTPUT_PATH}.json')  # full filename, including the .json extension

# Printed so it can be compared with the vocab size reported after training.
print(f'Vocab size: {len(tokenizer.vocab)}')

## Inspect
`export_mergeable_ranks()` yields `(bytes, rank)` pairs in merge-priority order; wrapping them in `dict()` gives the `bytes → rank` mapping that tiktoken expects.
The regex `pattern` and `special_tokens` are passed through unchanged.

In [None]:
# Collect the exported ranks into a dict; this is the object later handed to tiktoken.Encoding.
mergeable_ranks = dict(tokenizer.export_mergeable_ranks())

# Preview the three inputs of the tiktoken export: split pattern, ranks, special tokens.
print(f'pattern        : {tokenizer.pattern}')
print(f'mergeable_ranks: {list(mergeable_ranks.items())[-3:]}')  # last 3 as a sample
print(f'special_tokens : {tokenizer.special_tokens}')

## Export to tiktoken
Wrap the tokenizer's merge rules inside a `tiktoken.Encoding` object.
After this, `enc` can be used like any other `tiktoken.Encoding` (for example `enc.encode(...)`), backed by the vocabulary trained above.

In [None]:
# Build a tiktoken Encoding from the pieces held by the reloaded tokenizer.
enc = tiktoken.Encoding(
    name='my_tokenizer',  # label given to this encoding
    pat_str=tokenizer.pattern,  # regex split pattern, passed through unchanged
    mergeable_ranks=mergeable_ranks,  # dict built from export_mergeable_ranks()
    special_tokens=tokenizer.special_tokens,  # special tokens as stored on the tokenizer
)

## Compare
Encode the same string with both implementations. The merge tables are identical and both
algorithms apply merges in rank order (lowest rank first), so the output should match.
tiktoken uses a compiled Rust backend, so it is significantly faster at inference.

In [None]:
# Sample input; it contains no special-token text, so special-token handling is not exercised here.
test_text = 'hello world picture'

ids_py  = tokenizer.encode(test_text)  # Python BPE implementation
ids_tkt = enc.encode(test_text)        # tiktoken Rust backend

# Show both ID sequences, then stop the notebook if they differ.
print('BasicTokenizer :', ids_py)
print('tiktoken       :', ids_tkt)
assert ids_py == ids_tkt, 'Encoding mismatch'