In [None]:
from atomiclm.tokenizer import BasicTokenizer

In [None]:
# --- Config ---
# Single home for every path and hyperparameter used by the cells below.
# Edit these values before running the notebook.

# UTF-8 text file to train on (not in repo).
DATA_PATH = '../data/the-verdict.txt'

# Output path stem; save() appends .json automatically.
OUTPUT_PATH = '../out/vocab'

# Target vocabulary size = base byte tokens + number of merges to learn.
_BASE_TOKENS = 256
_NUM_MERGES = 50
VOCAB_SIZE = _BASE_TOKENS + _NUM_MERGES

# Special tokens mapped to fixed IDs; '<|endoftext|>' is given ID VOCAB_SIZE.
SPECIAL_TOKENS = {'<|endoftext|>': VOCAB_SIZE}

## Load
Read the raw text corpus. BPE operates on the full string — larger corpora produce more representative merge rules.

In [None]:
# Read the whole training corpus into memory as a single string.
# The file is decoded as UTF-8 explicitly rather than relying on the platform default.
with open(DATA_PATH, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Report the corpus length in characters (with thousands separators).
print(f'Corpus size: {len(text):,} chars')

## Train
BPE starts with the 256 single-byte tokens and greedily merges the most frequent adjacent pair, adding one new token per merge, until the vocabulary reaches `VOCAB_SIZE` tokens (here 256 + 50 = 306).
Each merge is stored in `tokenizer.merges` and used during encoding.

In [None]:
# Build a fresh tokenizer; all learned state lives on this object.
tokenizer = BasicTokenizer()

# Special tokens bypass BPE — they are matched literally before any merge is applied.
# NOTE(review): this describes BasicTokenizer's behavior, which is not visible in this
# notebook — confirm against atomiclm.tokenizer. Registration happens before train().
tokenizer.register_special_tokens(SPECIAL_TOKENS)

# Runs the heap-based O(n log n) BPE training algorithm.
# NOTE(review): the complexity claim comes from the library, not from this notebook — confirm.
# Trains on the full corpus string up to VOCAB_SIZE tokens; verbose=True requests progress output.
tokenizer.train(text, VOCAB_SIZE, verbose=True)

# Sanity check: report how many merge rules were learned.
print(f'Learned {len(tokenizer.merges)} merge rules')

## Encode / Decode
Encoding applies merges in rank order (highest-priority first) to produce token IDs.
Decoding maps each ID back to its byte sequence and joins them.

In [None]:
# Short sample used to check that encode -> decode reproduces the input exactly.
test_text = 'hello world'

# Encode to token IDs, then decode those IDs straight back to text.
token_ids = tokenizer.encode(test_text)
roundtrip = tokenizer.decode(token_ids)

# Show both results before checking, so they are visible even if the check fails.
for label, value in (('token IDs :', token_ids), ('decoded   :', roundtrip)):
    print(label, value)
assert roundtrip == test_text, 'Roundtrip failed'

## Save
Persists merges and special tokens to a JSON file. The file stores each merge as a `[left, right, id]` triple.

In [None]:
# Ensure the destination directory exists before saving. OUTPUT_PATH points into
# an output folder that may not exist on a fresh checkout, and save() is not
# shown here, so do not rely on it creating parent directories itself.
# exist_ok=True makes this a no-op when the directory is already present.
import os

os.makedirs(os.path.dirname(OUTPUT_PATH) or '.', exist_ok=True)

# Persist the trained tokenizer; the file is written at OUTPUT_PATH + '.json'.
tokenizer.save(OUTPUT_PATH)
print(f'Saved to {OUTPUT_PATH}.json')

## Verify
Load the saved file into a fresh tokenizer and confirm the merge rules are identical.

In [None]:
# Load the saved file into a brand-new tokenizer so nothing is shared with the trained one.
saved_file = f'{OUTPUT_PATH}.json'
reloaded = BasicTokenizer()
reloaded.load(saved_file)

# The reloaded merge rules must equal the ones learned during training.
merges_match = reloaded.merges == tokenizer.merges
assert merges_match, 'Merge rules do not match after reload'
print('OK — merge rules match')