In [2]:
# Sample sentence that the tokenization cells below operate on.
s = "To be or not to be that is the question"

In [3]:
import nltk


In [8]:
import tiktoken

# Look up the tokenizer for gpt-4o and round-trip the sample sentence through it.
enc = tiktoken.encoding_for_model("gpt-4o")
tokens = enc.encode(s)
decoded = enc.decode(tokens)

# One call, two output lines: the token ids first, then the reconstructed text.
print(tokens, decoded, sep="\n")

[1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]
To be or not to be that is the question


In [12]:
# tiktoken ships a ready-made BPE implementation; this cell tours its core calls.

print("=== tiktoken BPE Functions ===", f"String: '{s}'", "", sep="\n")

# 1. Round trip: text -> token ids -> text (the same calls as the earlier cell)
tokens = enc.encode(s)
print(f"encode(): {tokens}", f"decode(): '{enc.decode(tokens)}'", "", sep="\n")

# 2. Show the text behind each token id by decoding the ids one at a time
print("=== Individual tokens ===")
for position, token_id in enumerate(tokens, start=1):
    # Decoding a one-element list yields the subword / byte sequence for that id
    token_text = enc.decode([token_id])
    print(f"Token {position}: {token_id} → '{token_text}'")
print()

# 3. Peek into the BPE vocabulary: its size plus a handful of sample entries.
# NOTE: nothing in this section restricts which tokens may be used; that kind
# of control goes through encode()'s allowed_special / disallowed_special
# arguments.
print("=== BPE with token control ===")
print(f"Vocabulary size: {enc.n_vocab}")
print("Some example tokens from the BPE vocabulary:")
for token_id in [100, 200, 300, 400, 500]:
    try:
        # A token is a raw byte sequence and is not always valid UTF-8 on its
        # own; errors="replace" renders such bytes as U+FFFD instead of raising.
        token_text = enc.decode_single_token_bytes(token_id).decode("utf-8", errors="replace")
    except KeyError:
        # Only an id that is missing from this encoding is reported as invalid;
        # any other error is allowed to surface.
        print(f"  Token {token_id}: <invalid>")
    else:
        print(f"  Token {token_id}: '{token_text}'")
print()

# 4. Longer words come back as several subword pieces; tiktoken does the merging
print("=== BPE Subword Splitting Examples ===")
examples = ["unhappiness", "preprocessing", "tokenization"]
for word in examples:
    word_tokens = enc.encode(word)
    # Decode each id separately so the individual pieces are visible
    pieces = [enc.decode([t]) for t in word_tokens]
    print(f"'{word}' → BPE splits into: {pieces}")

=== tiktoken BPE Functions ===
String: 'To be or not to be that is the question'

encode(): [1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]
decode(): 'To be or not to be that is the question'

=== Individual tokens ===
Token 1: 1385 → 'To'
Token 2: 413 → ' be'
Token 3: 503 → ' or'
Token 4: 625 → ' not'
Token 5: 316 → ' to'
Token 6: 413 → ' be'
Token 7: 484 → ' that'
Token 8: 382 → ' is'
Token 9: 290 → ' the'
Token 10: 4928 → ' question'

=== BPE with token control ===
Vocabulary size: 200019
Some example tokens from the BPE vocabulary:
  Token 100: '�'
  Token 200: ''
  Token 300: ' �'
  Token 400: 'ce'
  Token 500: 'м'

=== BPE Subword Splitting Examples ===
'unhappiness' → BPE splits into: ['un', 'h', 'appiness']
'preprocessing' → BPE splits into: ['pre', 'processing']
'tokenization' → BPE splits into: ['token', 'ization']


In [11]:
# More tiktoken BPE features

print("=== Additional tiktoken BPE features ===")

# 1. Encode the same text with the tokenizer of several models and compare
#    the resulting token ids.
print("Comparing different BPE models:")
models = ["gpt-4o", "gpt-3.5-turbo", "text-davinci-003"]
test_text = "artificial intelligence"

for model in models:
    try:
        model_enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for a model name it does not
        # recognise. Any other failure is left to surface rather than being
        # reported as "Not available".
        print(f"{model:15}: Not available")
        continue
    model_tokens = model_enc.encode(test_text)
    print(f"{model:15}: {len(model_tokens)} tokens → {model_tokens}")
print()

# 2. Special tokens defined by the encoding
print("=== Special tokens in BPE ===")
special_tokens = enc.special_tokens_set
print(f"Number of special tokens: {len(special_tokens)}")
if special_tokens:
    # Sets are unordered; sort so the displayed sample is the same on every run.
    print("Some special tokens:", sorted(special_tokens)[:5])
print()

# 3. Encode with different options
print("=== BPE encoding options ===")
# Default call: encode() raises ValueError if the text contains the string
# form of a special token (e.g. "<|endoftext|>").
basic_tokens = enc.encode(s)
print(f"Basic encoding: {basic_tokens}")

# disallowed_special=() switches that check off, so special-token strings in
# the input are tokenized as ordinary text instead of raising. For text
# without special-token strings, such as `s`, both calls give the same ids.
safe_tokens = enc.encode(s, disallowed_special=())
print(f"Safe encoding:  {safe_tokens}")

print()
print("✅ tiktoken handles all BPE complexity internally!")
print("✅ You just need: encode() and decode() functions")
print("✅ The tokenization you see IS BPE working!")

=== Additional tiktoken BPE features ===
Comparing different BPE models:
gpt-4o         : 3 tokens → [497, 20454, 22990]
gpt-3.5-turbo  : 3 tokens → [472, 16895, 11478]
text-davinci-003: 3 tokens → [433, 9542, 4430]

=== Special tokens in BPE ===
Number of special tokens: 2
Some special tokens: ['<|endofprompt|>', '<|endoftext|>']

=== BPE encoding options ===
Basic encoding: [1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]
Safe encoding:  [1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]

✅ tiktoken handles all BPE complexity internally!
✅ You just need: encode() and decode() functions
✅ The tokenization you see IS BPE working!
text-davinci-003: 3 tokens → [433, 9542, 4430]

=== Special tokens in BPE ===
Number of special tokens: 2
Some special tokens: ['<|endofprompt|>', '<|endoftext|>']

=== BPE encoding options ===
Basic encoding: [1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]
Safe encoding:  [1385, 413, 503, 625, 316, 413, 484, 382, 290, 4928]

✅ tiktoken handles all 