# Week 3 Exercise: Tokenizer + Prompt Budget Analyzer

This notebook compares token counts across Hugging Face tokenizers and shows how much of a model's context window a prompt consumes.
It also includes a simple prompt-trimming helper that fits a prompt to a token budget.


In [None]:
# If needed, install dependencies
# !pip -q install transformers sentencepiece


In [None]:
# Imports
import re
from transformers import AutoTokenizer


In [None]:
# Models to compare (all public)
# Each entry has:
#   'name'    - model id handed to AutoTokenizer.from_pretrained via get_tokenizer
#   'context' - context window size in tokens; used as the denominator when
#               budget_report computes pct_of_context
# Context windows are approximate common defaults
MODELS = [
    {'name': 'gpt2', 'context': 1024},
    {'name': 'distilbert-base-uncased', 'context': 512},
    {'name': 'bert-base-uncased', 'context': 512},
    {'name': 'google/flan-t5-small', 'context': 512},
]


In [None]:
# Sample prompt (replace with your own)
# This is the text passed to budget_report and trim_to_budget in the cells below.
# The triple-quoted literal begins and ends with a newline, so those newlines
# are part of the text handed to each tokenizer.
PROMPT = '''
You are a helpful assistant.
Summarize the following text and list 3 action items.

Meeting transcript:
We discussed the Q2 launch plan, timelines, and dependencies.
Engineering will finalize the API integration by next Friday.
Marketing will prepare the announcement draft by Monday.
Support needs a short FAQ for common issues and escalation steps.
Risks include vendor delays and limited QA bandwidth.

Please write a concise summary and three action items.
'''


In [None]:
# Module-level cache mapping model name -> loaded tokenizer
_TOKENIZERS = {}

def get_tokenizer(model_name: str):
    """Return the tokenizer for `model_name`, loading it at most once.

    The first request for a name creates the tokenizer with
    `AutoTokenizer.from_pretrained(model_name, use_fast=True)` and stores it
    in `_TOKENIZERS`; later requests for the same name return the stored
    object without loading again.
    """
    # Cache hit: hand back the tokenizer loaded earlier
    if model_name in _TOKENIZERS:
        return _TOKENIZERS[model_name]

    # Cache miss: load, remember, and return
    loaded = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    _TOKENIZERS[model_name] = loaded
    return loaded

def count_tokens(model_name: str, text: str) -> int:
    """Return how many tokens `text` encodes to for `model_name`.

    The text is encoded with the tokenizer from `get_tokenizer(model_name)`
    using `add_special_tokens=False`, so only tokens produced from the text
    itself are counted.
    """
    token_ids = get_tokenizer(model_name).encode(text, add_special_tokens=False)
    return len(token_ids)

def budget_report(text: str):
    """Build one summary row per entry in `MODELS` for `text`.

    Returns a list of dicts, in `MODELS` order, each with the keys:
      'model'          - the entry's 'name'
      'tokens'         - token count from `count_tokens`
      'context'        - the entry's 'context' value
      'pct_of_context' - tokens / context * 100, rounded to 2 decimals
    """
    def _row(spec):
        # Count first, then read the context size, then assemble the row
        token_count = count_tokens(spec['name'], text)
        window = spec['context']
        share = round((token_count / window) * 100, 2)
        return {
            'model': spec['name'],
            'tokens': token_count,
            'context': window,
            'pct_of_context': share,
        }

    return [_row(spec) for spec in MODELS]


In [None]:
# Show token budget report
# budget_report returns one dict per entry in MODELS (model name, token count,
# context size, and percent of the context used); print them one per line.
report = budget_report(PROMPT)
for row in report:
    print(row)


In [None]:
# Prompt trimming: fit text into a token budget
def trim_to_budget(model_name: str, text: str, max_tokens: int) -> str:
    """Trim `text` so it fits within `max_tokens` tokens for `model_name`.

    Args:
        model_name: Model id whose tokenizer (via `get_tokenizer`) is used.
        text: The prompt to fit into the budget.
        max_tokens: Maximum number of tokens allowed; must be >= 0.

    Returns:
        `text` unchanged when it already fits. Otherwise the decoded leading
        tokens followed by the marker '...' and a newline, where the marker's
        own tokens are counted against the budget. If the budget is too small
        to hold any text plus the marker, the text is hard-cut to the first
        `max_tokens` tokens and no marker is appended.

    Raises:
        ValueError: If `max_tokens` is negative.

    NOTE(review): the budget is enforced on token ids of the original text and
    of the marker taken separately; re-encoding the returned string is
    presumed to stay within `max_tokens`, but a tokenizer whose decode/encode
    does not round-trip exactly could differ slightly. Confirm if a hard
    limit is required.
    """
    if max_tokens < 0:
        # A negative slice bound would silently drop tokens from the end instead
        raise ValueError(f'max_tokens must be non-negative, got {max_tokens}')

    suffix = '...\n'
    tok = get_tokenizer(model_name)
    tokens = tok.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text

    # Reserve room for the truncation marker so the result does not overshoot
    suffix_len = len(tok.encode(suffix, add_special_tokens=False))
    keep = max_tokens - suffix_len
    if keep < 1:
        # No room for text plus marker: hard-cut to the budget without a marker
        return tok.decode(tokens[:max_tokens], skip_special_tokens=True)

    trimmed_text = tok.decode(tokens[:keep], skip_special_tokens=True)
    return trimmed_text.rstrip() + suffix

# Example: trim to 80 tokens for each model
# Prints a '--- <model name> ---' header (preceded by a blank line) and then
# the trimmed prompt for every entry in MODELS.
for m in MODELS:
    trimmed = trim_to_budget(m['name'], PROMPT, max_tokens=80)
    # The newline is written as an escape; a raw line break inside a
    # single-quoted literal is a syntax error
    print('\n---', m['name'], '---')
    print(trimmed)
