In [1]:
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tiktoken

In [3]:
# Model to Prefix Encoding from tiktoken github
# https://github.com/openai/tiktoken/blob/main/tiktoken/model.py

MODEL_PREFIX_TO_ENCODING = {
    "o1-": "o200k_base",
    # chat
    "chatgpt-4o-": "o200k_base",
    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    # fine-tuned
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}

In [4]:
# Load the tokenizer for a specific model (e.g., 'gpt-3.5-turbo')
tokenizer = tiktoken.encoding_for_model("gpt-4-")

In [5]:
tokenizer._special_tokens

{'<|endoftext|>': 100257,
 '<|fim_prefix|>': 100258,
 '<|fim_middle|>': 100259,
 '<|fim_suffix|>': 100260,
 '<|endofprompt|>': 100276}

In [6]:
tokenizer.special_tokens_set

{'<|endofprompt|>',
 '<|endoftext|>',
 '<|fim_middle|>',
 '<|fim_prefix|>',
 '<|fim_suffix|>'}

In [7]:
# Load the tokenizer for a specific model (e.g., 'gpt-3.5-turbo')
for k in MODEL_PREFIX_TO_ENCODING.keys():
    try:        
        tokenizer = tiktoken.encoding_for_model(k)
        print(f'------------{k}------------')
        print(tokenizer._special_tokens)
    except Exception as e:
        print(f"Failure for {k} with error: {e}")

Failure for o1- with error: 'Could not automatically map o1- to a tokeniser. Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect.'
Failure for chatgpt-4o- with error: 'Could not automatically map chatgpt-4o- to a tokeniser. Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect.'
Failure for gpt-4o- with error: 'Could not automatically map gpt-4o- to a tokeniser. Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect.'
------------gpt-4-------------
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276}
------------gpt-3.5-turbo-------------
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276}
------------gpt-35-turbo-------------
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276}
----