# Week 1 Day 4 - Tokenization, Cost Estimation, and Chunking (Community Contribution)

This notebook demonstrates:
- Tokenization using `tiktoken`
- Token counting per model
- Simple cost estimation
- Chunking long text by tokens and by sentences


In [1]:
# Imports and setup
import tiktoken
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv(override=True)
openai = OpenAI()

print("Setup complete")


Setup complete


In [4]:
# Tokenization per model
models = ["gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo"]

encodings = {}
for m in models:
    try:
        encodings[m] = tiktoken.encoding_for_model(m)
        print(f"✅ {m}: {encodings[m].name}")
    except Exception as e:
        print(f"❌ {m}: {e}")

text = "Hi my name is Ed and I like banoffee pie. This is a test of tokenization!"
print(f"\nText length: {len(text)} chars")

for m, enc in encodings.items():
    toks = enc.encode(text)
    print(f"\n{m}: {len(toks)} tokens -> {toks}")


✅ gpt-4o-mini: o200k_base
✅ gpt-4o: o200k_base
✅ gpt-3.5-turbo: cl100k_base

Text length: 73 chars

gpt-4o-mini: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]

gpt-4o: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]

gpt-3.5-turbo: 20 tokens -> [13347, 856, 836, 374, 3279, 323, 358, 1093, 9120, 21869, 4447, 13, 1115, 374, 264, 1296, 315, 4037, 2065, 0]


In [5]:
# Token counting and simple cost estimation
PRICING = {
    "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
    "gpt-4o": {"input": 0.005, "output": 0.015},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
}

def count_tokens(text, model="gpt-4o-mini"):
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(text))

def estimate_cost(tokens, model="gpt-4o-mini", kind="input"):
    if model not in PRICING:
        return 0.0
    return (tokens / 1000) * PRICING[model][kind]

samples = [
    "Hello world!",
    "This is a longer text that will have more tokens and cost more money to process.",
]

for s in samples:
    print(f"\nText: {s}")
    for m in PRICING.keys():
        t = count_tokens(s, m)
        c = estimate_cost(t, m, "input")
        print(f"  {m}: {t} tokens, est input cost ${c:.6f}")



Text: Hello world!
  gpt-4o-mini: 3 tokens, est input cost $0.000000
  gpt-4o: 3 tokens, est input cost $0.000015
  gpt-3.5-turbo: 3 tokens, est input cost $0.000002

Text: This is a longer text that will have more tokens and cost more money to process.
  gpt-4o-mini: 17 tokens, est input cost $0.000003
  gpt-4o: 17 tokens, est input cost $0.000085
  gpt-3.5-turbo: 17 tokens, est input cost $0.000009


In [6]:
# Chunking helpers
import re

def chunk_by_tokens(text, model="gpt-4o-mini", max_tokens=300, overlap=30):
    enc = tiktoken.encoding_for_model(model)
    toks = enc.encode(text)
    chunks = []
    start = 0
    while start < len(toks):
        end = min(start + max_tokens, len(toks))
        chunk_text = enc.decode(toks[start:end])
        chunks.append(chunk_text)
        if end == len(toks):
            break
        start = max(0, end - overlap)
    return chunks

def chunk_by_sentences(text, model="gpt-4o-mini", max_tokens=300):
    enc = tiktoken.encoding_for_model(model)
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for s in sentences:
        candidate = (current + " " + s).strip() if current else s
        if len(enc.encode(candidate)) <= max_tokens:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = s
    if current:
        chunks.append(current)
    return chunks

# Try with a long text
long_text = (
    "Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. "
    "It enables machines to perform tasks that typically require human intelligence. "
    "Machine learning, a subset of AI, allows systems to learn from data. "
    "Deep learning uses neural networks with multiple layers. "
    "AI powers recommendations, autonomous vehicles, and medical diagnostics. "
) * 10

print("Token-based chunks:")
for i, ch in enumerate(chunk_by_tokens(long_text, max_tokens=120)):
    print(f"  Chunk {i+1}: {len(ch)} chars")

print("\nSentence-based chunks:")
for i, ch in enumerate(chunk_by_sentences(long_text, max_tokens=120)):
    print(f"  Chunk {i+1}: {len(ch)} chars")


Token-based chunks:
  Chunk 1: 677 chars
  Chunk 2: 690 chars
  Chunk 3: 700 chars
  Chunk 4: 670 chars
  Chunk 5: 688 chars
  Chunk 6: 711 chars
  Chunk 7: 670 chars
  Chunk 8: 238 chars

Sentence-based chunks:
  Chunk 1: 637 chars
  Chunk 2: 698 chars
  Chunk 3: 582 chars
  Chunk 4: 637 chars
  Chunk 5: 698 chars
  Chunk 6: 582 chars
