In [1]:
import re
import os
import json

import torch

from modules.training import train_streamed_lm
from modules.transformer import TransformerConfig, TransformerLanguageModel
from modules.eval import (
    iter_hf_texts, evaluate_token_nll,
    word_and_char_perplexity, whitespace_oov_stats
 )
from modules.benchmark import (
    measure_throughput, measure_generation_latency, 
    measure_training_step_time
)
from modules.tokenizers import (
    build_pretrained_tokenizer, WhitespaceTokenizer, 
    build_ws_vocab, SentencePieceTokenizer, 
    train_sentencepiece, tokenizer_throughput, 
    avg_tokens_per_word, percent_words_encoded_directly
)
from modules.device import best_device

1. Build tokenizers

In [2]:
# Pretrained tokenizer (GPT-2)
# Its vocab size (vocab_N) is reused below as the requested vocab size for the
# whitespace and SentencePiece tokenizers.
pretrained_name = 'gpt2'
tok_pre = build_pretrained_tokenizer(pretrained_name)
vocab_N = tok_pre.vocab_size
# Bare expression: shown as the cell output.
vocab_N

50257

In [3]:
# Whitespace tokenizer: build vocab from HF StackOverflow train stream (small sample)
def head_texts(n=2000):
    """Yield at most ``n`` texts from the start of the train split.

    Args:
        n: Maximum number of texts to yield. Values <= 0 yield nothing
           (the previous loop yielded one text before checking the budget).

    Yields:
        Items produced by ``iter_hf_texts(split='train')``, in iteration order.
    """
    if n <= 0:
        return

    for taken, ex in enumerate(iter_hf_texts(split='train'), start=1):
        yield ex
        if taken >= n:
            # Stop right after the n-th item so no extra example is pulled
            # from the underlying iterator.
            break

# Build the whitespace vocab from the first 5000 train texts, requesting the
# same vocab size as GPT-2 (vocab_N).
# NOTE: in the recorded run the resulting vocab has 26428 entries, so the
# `whitespace_{vocab_N}` name reflects the requested size, not the actual one.
ws_vocab = build_ws_vocab(head_texts(5000), vocab_size=vocab_N) # type: ignore
tok_ws = WhitespaceTokenizer(ws_vocab, name_or_path=f'whitespace_{vocab_N}')
# Bare expression: shown as the cell output.
tok_ws.vocab_size

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

26428

In [4]:
# SentencePiece tokenizer: dump a sampled corpus to disk, train on it, load it
sp_dir = os.path.join('outputs', 'sentencepiece')
os.makedirs(sp_dir, exist_ok=True)
sp_input = os.path.join(sp_dir, 'sp_corpus.txt')

# The corpus is drawn from the train split (evaluation uses the test split).
# One text per line: newlines inside a text are flattened to spaces first.
# NOTE: the budget is tracked with len(t), i.e. characters of the raw text,
# so the "~1MB" cut-off is approximate for non-ASCII content. It is left as
# is because a different corpus would produce a different tokenizer.
with open(sp_input, 'w', encoding='utf-8') as f:
    bytes_written = 0
    for t in iter_hf_texts(split='train'):
        f.write(t.replace('\n', ' '))
        f.write('\n')
        bytes_written += len(t)
        if bytes_written > 1_200_000:
            break

# Train a BPE model with the same requested vocab size as GPT-2.
sp_model = train_sentencepiece(  # type: ignore
    sp_input,
    model_prefix=os.path.join(sp_dir, 'spm'),
    vocab_size=vocab_N,
    model_type='bpe',
)
tok_sp = SentencePieceTokenizer(sp_model, name_or_path=f'spm_{vocab_N}')
tok_sp.vocab_size

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

50257

2. Train three identical models

In [5]:
def build_model(vocab_size: int, pad_id: int|None):
    cfg = TransformerConfig(vocab_size=vocab_size, 
                            pad_token_id=pad_id or 0, 
                            max_seq_len=256, 
                            emb_dim=384, 
                            n_layers=4, 
                            n_heads=6, 
                            ff_dim=1536)
    return TransformerLanguageModel(cfg), cfg.__dict__

# Name -> tokenizer object. The per-tokenizer loops in this notebook iterate
# over this dict, so its insertion order is the processing order.
tokenizers = {
    'pretrained': tok_pre,
    'whitespace': tok_ws,
    'sentencepiece': tok_sp,
}


def _pad_id_or_zero(tokenizer):
    """Return the tokenizer's ``pad_token_id``, or 0 when it is absent or None."""
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    return 0 if pad_id is None else pad_id


def _vocab_size_of(tokenizer):
    """Read ``vocab_size`` when present, else fall back to ``get_vocab_size()``."""
    if hasattr(tokenizer, 'vocab_size'):
        return tokenizer.vocab_size
    return tokenizer.get_vocab_size()


# Padding id handed to the model config for each tokenizer.
tokenizer_pad = {k: _pad_id_or_zero(t) for k, t in tokenizers.items()}

# Vocabulary size handed to the model config for each tokenizer.
tokenizer_vocab = {k: _vocab_size_of(t) for k, t in tokenizers.items()}
tokenizer_vocab

{'pretrained': 50257, 'whitespace': 26428, 'sentencepiece': 50257}

In [6]:
# Train one model per tokenizer, all with the same architecture and schedule
save_root = 'outputs'
os.makedirs(save_root, exist_ok=True)

train_configs = {}
final_weights = {}

for name, tok in tokenizers.items():
    print(f'Training model for tokenizer: {name}')

    # Fresh model for this tokenizer; only vocab size and pad id differ.
    model, cfg = build_model(tokenizer_vocab[name], tokenizer_pad[name])

    # Everything for this tokenizer lives under outputs/<name>/, with
    # checkpoints in a sub-directory.
    final_dir = os.path.join(save_root, name)
    ckpt_dir = os.path.join(final_dir, 'checkpoints')

    out = train_streamed_lm(
        model,
        tok,
        cfg,
        ckpt_dir=ckpt_dir,
        final_dir=final_dir,
        batch_size=16,
        max_length=256,
        steps_per_epoch=1000,
        num_epochs=5,
        save_every=1,
        lr=3e-4,
        warmup_steps=200,
        grad_clip=1.0,
    )

    # `out` is whatever train_streamed_lm returns; in the recorded run it is
    # the path of the saved final weights, which later cells pass to torch.load.
    train_configs[name] = cfg
    final_weights[name] = out

final_weights

Training model for tokenizer: pretrained


  self.setter(val)


Training on device: cuda
Resumed from outputs\pretrained\checkpoints\last.pt at epoch 5, step 5000
Training complete. Saved to: outputs\pretrained\final.pt
Training model for tokenizer: whitespace


  scaler = torch.cuda.amp.GradScaler(enabled=use_cuda_amp)


Training on device: cuda
Resumed from outputs\whitespace\checkpoints\last.pt at epoch 5, step 5000
Training complete. Saved to: outputs\whitespace\final.pt
Training model for tokenizer: sentencepiece
Training on device: cuda
Resumed from outputs\sentencepiece\checkpoints\last.pt at epoch 5, step 5000
Training complete. Saved to: outputs\sentencepiece\final.pt


{'pretrained': 'outputs\\pretrained\\final.pt',
 'whitespace': 'outputs\\whitespace\\final.pt',
 'sentencepiece': 'outputs\\sentencepiece\\final.pt'}

3. Evaluation: word- and character-level perplexity

In [7]:
# Evaluate models: token-level NLL, then word- and character-level perplexity
results = {}
eval_texts = iter_hf_texts(split='test')

# The same texts are reused for tokenization stats later, so materialize the
# stream into a list, capped at 2000 texts.
eval_texts_list = [t for _, t in zip(range(2000), eval_texts)]

# Fail early with a clear message instead of producing NaN/ZeroDivision
# further down when the test split yields nothing.
if not eval_texts_list:
    raise RuntimeError("No evaluation texts were read from the 'test' split")

# NOTE(review): the recorded run evaluated only ~400 tokens per model, so the
# test split appears to yield very little text -- confirm this is intended.

dev = best_device()
print('Using device for evaluation:', dev)

for name, tok in tokenizers.items():
    print(f'Evaluating tokenizer: {name}')
    # Rebuild the architecture, then fill it with the trained weights.
    model, cfg = build_model(tokenizer_vocab[name], tokenizer_pad[name])

    # load final weights
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids executing arbitrary pickled
    # code from the checkpoint file.
    state = torch.load(final_weights[name], map_location='cpu',
                       weights_only=True)
    model.load_state_dict(state)
    model.to(dev)
    model.eval()

    avg_nll, n_tokens = evaluate_token_nll(
        model, tok, eval_texts_list, max_examples=len(eval_texts_list))
    word_ppl, char_ppl = word_and_char_perplexity(
        avg_nll, tok, eval_texts_list, max_examples=len(eval_texts_list))

    # Cast to builtin types so the dict can be serialized with json later.
    results[name] = {
        'avg_token_nll': float(avg_nll),
        'word_ppl': float(word_ppl),
        'char_ppl': float(char_ppl),
        'eval_tokens': int(n_tokens),
    }

results

Using device for evaluation: cuda
Evaluating tokenizer: pretrained
Evaluating tokenizer: whitespace
Evaluating tokenizer: sentencepiece


{'pretrained': {'avg_token_nll': 7.091267203816032,
  'word_ppl': 1861.6299426343217,
  'char_ppl': 4.025724676578652,
  'eval_tokens': 407},
 'whitespace': {'avg_token_nll': 6.418873063535515,
  'word_ppl': 613.3115613681346,
  'char_ppl': 3.2782911709636395,
  'eval_tokens': 381},
 'sentencepiece': {'avg_token_nll': 6.786788562736889,
  'word_ppl': 1283.7788507666364,
  'char_ppl': 3.758276634644433,
  'eval_tokens': 404}}

4. OOV and efficiency metrics

In [8]:
# OOV for whitespace
# Turn the vocab's `itos` entries into a set for fast membership tests
# (presumably the index-to-string list -- confirm in modules.tokenizers).
ws_vocab_set = set(tok_ws.vocab.itos)
# Unpacked as (OOV word count, total word count, OOV percentage), computed on
# the same texts used for the perplexity evaluation.
oov_count, total_words, pct_oov = whitespace_oov_stats(
    ws_vocab_set, eval_texts_list, max_examples=len(eval_texts_list))
oov_stats = {'oov': oov_count, 'total': total_words, 'percent': pct_oov}
oov_stats

{'oov': 14, 'total': 421, 'percent': 3.32541567695962}

In [9]:
# Efficiency: tokenizer throughput, average tokens per word, and share of
# words encoded directly, all measured on the evaluation texts
efficiency = {}

for name, tok in tokenizers.items():
    tps, total = tokenizer_throughput(tok, eval_texts_list, iters=2)
    atpw = avg_tokens_per_word(
        tok, eval_texts_list, max_examples=len(eval_texts_list))
    # NOTE(review): this metric is limited to max_examples=200 while the one
    # above uses every evaluation text -- confirm the difference is intended.
    pct_direct = percent_words_encoded_directly(
        tok, eval_texts_list, max_examples=200)

    efficiency[name] = {
        'tok_per_sec': tps,
        'tokens_total': total,
        'avg_tokens_per_word': atpw,
        'pct_words_direct': pct_direct,
        'vocab_size': tokenizer_vocab[name],
    }

efficiency

{'pretrained': {'tok_per_sec': 214723.0015461261,
  'tokens_total': 894,
  'avg_tokens_per_word': 1.0617577197149644,
  'pct_words_direct': 79.09738717339667,
  'vocab_size': 50257},
 'whitespace': {'tok_per_sec': 251771.86625793113,
  'tokens_total': 842,
  'avg_tokens_per_word': 1.0,
  'pct_words_direct': 96.67458432304038,
  'vocab_size': 26428},
 'sentencepiece': {'tok_per_sec': 208646.123578511,
  'tokens_total': 888,
  'avg_tokens_per_word': 1.0546318289786223,
  'pct_words_direct': 95.01187648456056,
  'vocab_size': 50257}}

6. Save results

In [10]:
# Persist the metrics gathered so far as one JSON document.
# NOTE: cells further down rebuild `efficiency` and measure `speed` after this
# write, so those later values are not part of the file written here.
save_json = {
    'ppl': results,
    'oov_ws': oov_stats,
    'efficiency': efficiency,
}

res_path = os.path.join('outputs', 'summary.json')

with open(res_path, 'w', encoding='utf-8') as f:
    json.dump(save_json, f, indent=2)

In [11]:
# Build ~1MB evaluation buffer (distinct from SP training corpus)
def collect_texts_approx_bytes(split='test', target_bytes=1_048_576):
    """Collect texts from ``split`` until about ``target_bytes`` are gathered.

    Size is measured in UTF-8 encoded bytes. The text that crosses the
    threshold is included, so the result can overshoot by up to one text.
    If the split runs out first, everything it yielded is returned.

    Args:
        split: Split name forwarded to ``iter_hf_texts``.
        target_bytes: Stop once the collected texts reach this many bytes.

    Returns:
        List of collected texts, in iteration order.
    """
    buf = []
    total = 0

    for t in iter_hf_texts(split=split):
        buf.append(t)
        # Count encoded bytes, not len(t): len() counts code points and
        # under-reports non-ASCII text. 'surrogatepass' keeps the size count
        # from raising on stray surrogates.
        total += len(t.encode('utf-8', errors='surrogatepass'))
        if total >= target_bytes:
            break

    return buf

# The 1_100_000 target is only reached if the test split holds that much text.
# NOTE: the recorded run collected 2276 characters in total, so the split was
# exhausted far below the target and this buffer is not actually ~1MB.
stats_texts_1mb = collect_texts_approx_bytes('test', 1_100_000)
# Total number of characters collected (shown as the cell output).
sum(len(x) for x in stats_texts_1mb)

2276

In [12]:
# Override efficiency metrics using ~1MB test buffer
# NOTE: this rebinds `efficiency` to a new dict. `save_json`, already written
# to outputs/summary.json by the save cell, still references the earlier dict,
# so the values computed below are not persisted unless they are saved again.
efficiency = {}

def direct_word_counts(tokenizer, texts):
    """Count words that the tokenizer encodes as exactly one non-UNK token.

    Each text is split into runs of word characters and single punctuation
    characters; every piece is tokenized on its own.

    Args:
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors when called with
            ``return_tensors='pt'``. May expose ``unk_token_id``.
        texts: Iterable of strings.

    Returns:
        ``(direct, total, percent)``: words encoded as a single non-UNK token,
        words seen, and ``direct / total * 100`` (0.0 when there are no words).
    """
    # The pattern never yields empty or whitespace-only matches, so no extra
    # blank-word filtering is needed.
    word_re = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    # Loop-invariant: look the UNK id up once. None means "no UNK id known".
    unk_id = getattr(tokenizer, 'unk_token_id', None)
    total = 0
    direct = 0

    for t in texts:
        for w in word_re.findall(t):
            # request tensors so we can safely use .sum() and .tolist()
            enc = tokenizer(w, return_tensors='pt', padding=True, truncation=True)
            # Real (non-padding) token count comes from the attention mask.
            length = int((enc['attention_mask'][0]).sum().item())
            total += 1
            if length != 1:
                continue
            first_id = enc['input_ids'][0].tolist()[0]
            if unk_id is None or first_id != unk_id:
                direct += 1

    return direct, total, (direct/total*100.0 if total else 0.0)

# Recompute the efficiency metrics for every tokenizer on stats_texts_1mb and
# record the raw direct/total word counts next to the percentage.
for name, tok in tokenizers.items():
    tps, total = tokenizer_throughput(tok, stats_texts_1mb, iters=2)
    # max_examples=None presumably means "use every text" -- confirm in
    # modules.tokenizers.
    atpw = avg_tokens_per_word(tok, stats_texts_1mb, max_examples=None)
    direct_cnt, direct_total, direct_pct = direct_word_counts(
        tok, stats_texts_1mb)

    efficiency[name] = {
        'tok_per_sec': tps,
        'tokens_total': total,
        'avg_tokens_per_word': atpw,
        'pct_words_direct': direct_pct,
        'direct_words': direct_cnt,
        'total_words': direct_total,
        'vocab_size': tokenizer_vocab[name],
    }

efficiency

{'pretrained': {'tok_per_sec': 272725.8546803404,
  'tokens_total': 894,
  'avg_tokens_per_word': 1.0617577197149644,
  'pct_words_direct': 79.09738717339667,
  'direct_words': 333,
  'total_words': 421,
  'vocab_size': 50257},
 'whitespace': {'tok_per_sec': 304133.9965552876,
  'tokens_total': 842,
  'avg_tokens_per_word': 1.0,
  'pct_words_direct': 96.67458432304038,
  'direct_words': 407,
  'total_words': 421,
  'vocab_size': 26428},
 'sentencepiece': {'tok_per_sec': 183447.86248337684,
  'tokens_total': 888,
  'avg_tokens_per_word': 1.0546318289786223,
  'pct_words_direct': 95.01187648456056,
  'direct_words': 400,
  'total_words': 421,
  'vocab_size': 50257}}

In [13]:
# Qualitative: use longer, roughly 30-word samples (33, 32 and 28 words) and
# show tokens (not just ids)
long_samples = [
    # Sample 1: programming prose
    " ".join([
        "In", "Python,", "list", "comprehensions", "provide", "a",
        "concise", "way", "to", "create", "lists.", "They", "are",
        "often", "faster", "than", "using", "loops,", "and", "they",
        "express", "intent", "clearly", "when", "mapping", "and",
        "filtering", "collections", "in", "everyday", "data",
        "processing", "tasks.",
    ]),
    # Sample 2: everyday English
    " ".join([
        "The", "quick", "brown", "fox", "jumps", "over", "the", "lazy",
        "dog,", "while", "the", "curious", "cat", "watches", "from",
        "the", "windowsill,", "pondering", "why", "humans", "keep",
        "typing", "this", "sentence", "to", "test", "keyboards", "and",
        "fonts", "across", "different", "systems.",
    ]),
    # Sample 3: ML/NLP terminology
    " ".join([
        "When", "training", "language", "models,", "tokenization",
        "choices", "can", "greatly", "affect", "performance,", "memory",
        "usage,", "and", "generalization;", "understanding", "subword",
        "algorithms", "and", "OOV", "behavior", "helps", "practitioners",
        "make", "informed", "trade-offs", "for", "their", "applications.",
    ]),
]

def show_token_lists(text):
    """Tokenize ``text`` with all three tokenizers and return token strings.

    Args:
        text: The string to tokenize.

    Returns:
        Dict with the original ``'text'`` and one token list per tokenizer
        (``'pretrained_tokens'``, ``'whitespace_tokens'``,
        ``'sentencepiece_tokens'``).
    """
    gpt_tok = tokenizers['pretrained']
    # The pretrained tokenizer returns ids; map them back to token strings.
    ids = gpt_tok(text).get('input_ids')

    token_view = {'text': text}
    token_view['pretrained_tokens'] = gpt_tok.convert_ids_to_tokens(ids)
    token_view['whitespace_tokens'] = tok_ws.tokens(text)
    token_view['sentencepiece_tokens'] = tok_sp.tokens(text)
    return token_view

In [14]:
# Tokenize every qualitative sample with all three tokenizers.
qual_tokens = list(map(show_token_lists, long_samples))

# Print one block per sample: the text, then each tokenizer's token list,
# each followed by a blank line, with a dashed rule between samples.
for item in qual_tokens:
    print(f"Text: {item['text']}\n")
    print(f"Pretrained Tokens: {item['pretrained_tokens']}\n")
    print(f"Whitespace Tokens: {item['whitespace_tokens']}\n")
    print(f"SentencePiece Tokens: {item['sentencepiece_tokens']}\n")
    print("-" * 80 + "\n")

Text: In Python, list comprehensions provide a concise way to create lists. They are often faster than using loops, and they express intent clearly when mapping and filtering collections in everyday data processing tasks.

Pretrained Tokens: ['In', 'ĠPython', ',', 'Ġlist', 'Ġcomprehens', 'ions', 'Ġprovide', 'Ġa', 'Ġconcise', 'Ġway', 'Ġto', 'Ġcreate', 'Ġlists', '.', 'ĠThey', 'Ġare', 'Ġoften', 'Ġfaster', 'Ġthan', 'Ġusing', 'Ġloops', ',', 'Ġand', 'Ġthey', 'Ġexpress', 'Ġintent', 'Ġclearly', 'Ġwhen', 'Ġmapping', 'Ġand', 'Ġfiltering', 'Ġcollections', 'Ġin', 'Ġeveryday', 'Ġdata', 'Ġprocessing', 'Ġtasks', '.']

Whitespace Tokens: ['In', 'Python', ',', 'list', 'comprehensions', 'provide', 'a', 'concise', 'way', 'to', 'create', 'lists', '.', 'They', 'are', 'often', 'faster', 'than', 'using', 'loops', ',', 'and', 'they', 'express', 'intent', 'clearly', 'when', 'mapping', 'and', 'filtering', 'collections', 'in', 'everyday', 'data', 'processing', 'tasks', '.']

SentencePiece Tokens: ['▁In', '▁Pytho

7. Model efficiency metrics (speed)

In [15]:
# Model speed per tokenizer: forward throughput, generation latency and
# training-step time, all measured on the trained weights
speed = {}

dev = best_device()
print('Using device for speed measurements:', dev)

for name, tok in tokenizers.items():
    model, cfg = build_model(tokenizer_vocab[name], tokenizer_pad[name])
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids executing arbitrary pickled
    # code from the checkpoint file.
    state = torch.load(final_weights[name], map_location='cpu',
                       weights_only=True)

    model.load_state_dict(state)
    model.to(dev)
    model.eval()

    thr = measure_throughput(model, cfg['vocab_size'],
                             seq_len=256, batch_size=8, iters=20)
    ms_gen = measure_generation_latency(model, cfg['vocab_size'],
                                        prompt_len=16, new_tokens=64, iters=5)
    # NOTE(review): the model is still in eval() mode here; unless
    # measure_training_step_time switches to train() itself, this times a
    # step without dropout -- confirm in modules.benchmark.
    ms_step = measure_training_step_time(model, cfg['vocab_size'],
                                         seq_len=256, batch_size=8, iters=10)

    speed[name] = {'forward_tokens_per_s': thr,
                   'gen_ms_per_token': ms_gen,
                   'train_ms_per_step': ms_step}

speed

Using device for speed measurements: cuda


{'pretrained': {'forward_tokens_per_s': 28137.579700956503,
  'gen_ms_per_token': 7.418825477361679,
  'train_ms_per_step': 215.43309688568115},
 'whitespace': {'forward_tokens_per_s': 50395.25724804013,
  'gen_ms_per_token': 6.905669718980789,
  'train_ms_per_step': 142.32418537139893},
 'sentencepiece': {'forward_tokens_per_s': 27669.564236477687,
  'gen_ms_per_token': 7.586944103240967,
  'train_ms_per_step': 243.5544729232788}}