In [None]:
import regex as re

from atomiclm.tokenizer import BasicTokenizer

In [None]:
# --- Config ---
# Training corpus: a UTF-8 text file that is not checked into the repo.
DATA_PATH = '../data/the-verdict.txt'

# Deliberately small vocabulary so the printed output stays readable.
VOCAB_SIZE = 276

# Special tokens are registered after training; '<|endoftext|>' is given the
# next ID after the vocabulary, i.e. VOCAB_SIZE itself.
SPECIAL_TOKENS = {'<|endoftext|>': VOCAB_SIZE}

## Train
Train a small BPE tokenizer so the merge steps are easy to inspect.

In [None]:
# Read the whole training corpus into memory as a single string.
with open(DATA_PATH, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Train a fresh tokenizer on the corpus with a target vocabulary of VOCAB_SIZE.
# verbose=True is requested so the training steps can be inspected.
tokenizer = BasicTokenizer()
tokenizer.train(text, VOCAB_SIZE, verbose=True)

## Register special tokens
Special tokens are literal strings that bypass BPE entirely.
The encoder splits the input on these strings first, encodes the surrounding text
with BPE, and emits the fixed ID for each special token match.

In [None]:
# Hand the special-token -> ID mapping from the config cell to the tokenizer.
tokenizer.register_special_tokens(SPECIAL_TOKENS)

## How the split works
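Before encoding, the text is split using a regex built from the special token strings.
Each token string is passed through `re.escape` so characters such as `|` match literally, and the whole alternation is wrapped in a capturing group so that `re.split` keeps the matched special tokens in the result instead of discarding them.
Each special token becomes its own chunk; everything else is encoded with BPE.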

In [None]:
# Two "documents" joined by the special end-of-text marker.
sample = 'hello world<|endoftext|>second document'

# This is what the encoder does internally before BPE:
# escape every special-token string so it matches literally, join the
# alternatives with '|', and wrap them in one capturing group so that
# re.split returns the matched tokens alongside the surrounding text.
pattern = '({})'.format('|'.join(map(re.escape, SPECIAL_TOKENS)))
chunks = re.split(pattern, sample)

print('split pattern:', pattern)
print('chunks       :', chunks)

## Encode
Pass `allowed_special="all"` to permit special tokens in the input.
By default they are rejected to prevent accidental injection.

In [None]:
# Encode with special tokens explicitly allowed, then decode straight back.
ids = tokenizer.encode(sample, allowed_special='all')
decoded = tokenizer.decode(ids)

print('token IDs:', ids)
print('decoded  :', decoded)

# The decoded text must reproduce the input exactly, special token included.
roundtrip_ok = decoded == sample
assert roundtrip_ok, 'Roundtrip failed'