
Commit 62ba557

refactor: move len_tokens and related code into gptme.util.tokens (#809)

* refactor: move len_tokens and related code into gptme.util.tokens
* fix: fixes to review comments

1 parent cdf548e · commit 62ba557

File tree

5 files changed: 92 additions & 64 deletions

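For orientation: after this commit, token counting lives in `gptme.util.tokens` rather than `gptme.message` and `gptme.util`. A minimal sketch of the new call sites (the example string is illustrative, not from the diff; requires tiktoken at runtime):

```python
# New home for the token utilities after this refactor.
from gptme.util.tokens import get_tokenizer, len_tokens

# Count tokens in a plain string.
n = len_tokens("Hello, world!", model="gpt-4")

# Or grab the underlying tiktoken encoding directly;
# len_tokens encodes with disallowed_special=[] internally.
enc = get_tokenizer("gpt-4")
assert n == len(enc.encode("Hello, world!", disallowed_special=[]))
```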

gptme/logmanager.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -44,6 +44,9 @@ def __getitem__(self, key):
     def __len__(self) -> int:
         return len(self.messages)
 
+    def len_tokens(self, model: str) -> int:
+        return len_tokens(self.messages, model)
+
     def __iter__(self) -> Generator[Message, None, None]:
         yield from self.messages
 
```
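A hedged sketch of how the new convenience method might be called. The surrounding class is not shown in this hunk, so the `Log` name and list-of-messages constructor below are assumptions:

```python
from gptme.logmanager import Log   # assumed class name; not visible in this hunk
from gptme.message import Message

log = Log([Message("user", "hello"), Message("assistant", "hi!")])
print(len(log))                  # number of messages, via __len__
print(log.len_tokens("gpt-4"))   # total tokens across messages, via the new method
```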

gptme/message.py

Lines changed: 5 additions & 42 deletions
```diff
@@ -1,5 +1,4 @@
 import dataclasses
-import hashlib
 import logging
 import shutil
 import sys
@@ -19,8 +18,9 @@
 
 from .codeblock import Codeblock
 from .constants import ROLE_COLOR
-from .util import console, get_tokenizer
+from .util import console
 from .util.prompt import rich_to_str
+from .util.tokens import len_tokens
 
 logger = logging.getLogger(__name__)
 
@@ -68,6 +68,9 @@ def __eq__(self, other):
             and self.timestamp == other.timestamp
         )
 
+    def len_tokens(self, model: str) -> int:
+        return len_tokens(self, model=model)
+
     def replace(self, **kwargs) -> Self:
         """Replace attributes of the message."""
         return dataclasses.replace(self, **kwargs)
@@ -326,43 +329,3 @@ def toml_to_msgs(toml: str) -> list[Message]:
 def msgs2dicts(msgs: list[Message]) -> list[dict]:
     """Convert a list of Message objects to a list of dicts ready to pass to an LLM."""
     return [msg.to_dict(keys=["role", "content", "files", "call_id"]) for msg in msgs]
-
-
-# Global cache mapping hashes to token counts
-_token_cache: dict[tuple[str, str], int] = {}
-
-
-def _hash_content(content: str) -> str:
-    """Create a hash of the content"""
-    return hashlib.sha256(content.encode()).hexdigest()
-
-
-def len_tokens(content: str | Message | list[Message], model: str) -> int:
-    """Get the number of tokens in a string, message, or list of messages.
-
-    Uses efficient caching with content hashing to minimize memory usage while
-    maintaining fast repeated calculations, which is especially important for
-    conversations with many messages.
-    """
-    if isinstance(content, list):
-        return sum(len_tokens(msg, model) for msg in content)
-    if isinstance(content, Message):
-        content = content.content
-
-    assert isinstance(content, str), content
-    # Check cache using hash
-    content_hash = _hash_content(content)
-    cache_key = (content_hash, model)
-    if cache_key in _token_cache:
-        return _token_cache[cache_key]
-
-    # Calculate and cache
-    count = len(get_tokenizer(model).encode(content, disallowed_special=[]))
-    _token_cache[cache_key] = count
-
-    # Limit cache size by removing oldest entries if needed
-    if len(_token_cache) > 1000:
-        # Remove first item (oldest in insertion order)
-        _token_cache.pop(next(iter(_token_cache)))
-
-    return count
```
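With `len_tokens` removed from `gptme.message`, callers use either the new `Message.len_tokens` method or the relocated module-level helper. A small sketch (Message is assumed to take role and content positionally):

```python
from gptme.message import Message
from gptme.util.tokens import len_tokens

msg = Message("user", "How many tokens is this?")

# Method added to Message in this commit:
print(msg.len_tokens("gpt-4"))

# Equivalent module-level call from the new location:
print(len_tokens(msg, model="gpt-4"))
print(len_tokens([msg, msg], model="gpt-4"))  # lists sum per-message counts
```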

gptme/tools/shell.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -27,9 +27,10 @@
 import bashlex
 
 from ..message import Message
-from ..util import get_installed_programs, get_tokenizer
+from ..util import get_installed_programs
 from ..util.ask_execute import execute_with_confirmation
 from ..util.output_storage import save_large_output
+from ..util.tokens import get_tokenizer
 from .base import (
     ConfirmFunc,
     Parameter,
```

gptme/util/__init__.py

Lines changed: 0 additions & 21 deletions
```diff
@@ -19,27 +19,6 @@
 logger = logging.getLogger(__name__)
 console = Console(log_path=False)
 
-_warned_models = set()
-
-
-@lru_cache
-def get_tokenizer(model: str):
-    import tiktoken  # fmt: skip
-
-    if "gpt-4o" in model:
-        return tiktoken.get_encoding("o200k_base")
-
-    try:
-        return tiktoken.encoding_for_model(model)
-    except KeyError:
-        global _warned_models
-        if model not in _warned_models:
-            logger.debug(
-                f"No tokenizer for '{model}'. Using tiktoken cl100k_base. Use results only as estimates."
-            )
-            _warned_models |= {model}
-        return tiktoken.get_encoding("cl100k_base")
-
 
 def epoch_to_age(epoch, incl_date=False):
     # takes epoch and returns "x minutes ago", "3 hours ago", "yesterday", etc.
```

gptme/util/tokens.py

Lines changed: 82 additions & 0 deletions
```diff
@@ -0,0 +1,82 @@
+import hashlib
+import logging
+import typing
+from functools import lru_cache
+
+if typing.TYPE_CHECKING:
+    import tiktoken  # fmt: skip
+
+    from ..message import Message  # fmt: skip
+
+
+# Global cache mapping hashes to token counts
+_token_cache: dict[tuple[str, str], int] = {}
+
+_warned_models = set()
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache
+def get_tokenizer(model: str) -> "tiktoken.Encoding":
+    """Get the tokenizer for a given model, with caching and fallbacks."""
+    import tiktoken  # fmt: skip
+
+    if "gpt-4o" in model:
+        return tiktoken.get_encoding("o200k_base")
+
+    try:
+        return tiktoken.encoding_for_model(model)
+    except KeyError:
+        global _warned_models
+        if model not in _warned_models:
+            logger.debug(
+                f"No tokenizer for '{model}'. Using tiktoken cl100k_base. Use results only as estimates."
+            )
+            _warned_models |= {model}
+        return tiktoken.get_encoding("cl100k_base")
+
+
+# perf trick: start background thread that pre-loads the gpt-4 and gpt-5 tokenizers
+# needs logic to wait for the tokenizer to be ready if requested before loaded
+# threading.Thread(target=get_tokenizer, args=("gpt-4",), daemon=True).start()
+# threading.Thread(target=get_tokenizer, args=("gpt-5",), daemon=True).start()
+
+
+def _hash_content(content: str) -> str:
+    """Create a hash of the content"""
+    return hashlib.sha256(content.encode()).hexdigest()
+
+
+def len_tokens(content: "str | Message | list[Message]", model: str) -> int:
+    """Get the number of tokens in a string, message, or list of messages.
+
+    Uses efficient caching with content hashing to minimize memory usage while
+    maintaining fast repeated calculations, which is especially important for
+    conversations with many messages.
+    """
+    from ..message import Message  # fmt: skip
+
+    if isinstance(content, list):
+        return sum(len_tokens(msg, model) for msg in content)
+    if isinstance(content, Message):
+        content = content.content
+
+    assert isinstance(content, str), content
+    # Check cache using hash
+    content_hash = _hash_content(content)
+    cache_key = (content_hash, model)
+    if cache_key in _token_cache:
+        return _token_cache[cache_key]
+
+    # Calculate and cache
+    tokenizer = get_tokenizer(model)
+    count = len(tokenizer.encode(content, disallowed_special=[]))
+    _token_cache[cache_key] = count
+
+    # Limit cache size by removing oldest entries if needed
+    if len(_token_cache) > 1000:
+        # Remove first item (oldest in insertion order)
+        _token_cache.pop(next(iter(_token_cache)))
+
+    return count
```
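To illustrate the hash-keyed cache in the new module, a quick sketch (it touches the private `_token_cache` purely for demonstration; requires tiktoken):

```python
from gptme.util.tokens import _token_cache, len_tokens

text = "The quick brown fox jumps over the lazy dog."
first = len_tokens(text, model="gpt-4")   # computed via tiktoken, then cached
second = len_tokens(text, model="gpt-4")  # same (sha256(text), model) key: cache hit
assert first == second

# The cache is bounded: past 1000 entries, the oldest insertion is
# evicted, so memory stays flat even for long conversations.
print(len(_token_cache))
```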
