Skip to content

Commit fae65ca

Browse files
authored
fix: better prompt caching & less debug logging (#323)
1 parent 283e8b0 commit fae65ca

File tree

13 files changed

+42
-29
lines changed

13 files changed

+42
-29
lines changed

gptme/cli.py

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -188,11 +188,10 @@ def main(
188188
config = get_config()
189189

190190
tool_format = tool_format or config.get_env("TOOL_FORMAT") or "markdown"
191-
192191
set_tool_format(tool_format)
193192

194193
# early init tools to generate system prompt
195-
init_tools(tool_allowlist)
194+
init_tools(frozenset(tool_allowlist) if tool_allowlist else None)
196195

197196
# get initial system prompt
198197
initial_msgs = [

gptme/init.py

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -30,7 +30,6 @@ def init(model: str | None, interactive: bool, tool_allowlist: list[str] | None)
3030
_init_done = True
3131

3232
# init
33-
logger.debug("Started")
3433
load_dotenv()
3534

3635
# fixes issues with transformers parallelism
@@ -72,20 +71,21 @@ def init(model: str | None, interactive: bool, tool_allowlist: list[str] | None)
7271
# for some reason it bugs out shell tests in CI
7372
register_tabcomplete()
7473

75-
init_tools(tool_allowlist)
74+
init_tools(frozenset(tool_allowlist) if tool_allowlist else None)
7675

7776

7877
def init_logging(verbose):
79-
# log init
80-
handler = RichHandler()
78+
handler = RichHandler() # show_time=False
8179
logging.basicConfig(
8280
level=logging.DEBUG if verbose else logging.INFO,
8381
format="%(message)s",
8482
datefmt="[%X]",
8583
handlers=[handler],
8684
)
85+
8786
# anthropic spams debug logs for every request
8887
logging.getLogger("anthropic").setLevel(logging.INFO)
88+
logging.getLogger("openai").setLevel(logging.INFO)
8989
# set httpx logging to WARNING
9090
logging.getLogger("httpx").setLevel(logging.WARNING)
9191
logging.getLogger("httpcore").setLevel(logging.WARNING)

gptme/llm/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1,6 +1,7 @@
11
import logging
22
import shutil
33
import sys
4+
import time
45
from collections.abc import Iterator
56
from functools import lru_cache
67
from typing import cast
@@ -95,11 +96,14 @@ def print_clear():
9596
print(" " * shutil.get_terminal_size().columns, end="\r")
9697

9798
output = ""
99+
start_time = time.time()
100+
first_token_time = None
98101
try:
99102
for char in (
100103
char for chunk in _stream(messages, model, tools) for char in chunk
101104
):
102105
if not output: # first character
106+
first_token_time = time.time()
103107
print_clear()
104108
print(f"{PROMPT_ASSISTANT}: ", end="")
105109
print(char, end="")
@@ -126,6 +130,15 @@ def print_clear():
126130
return Message("assistant", output + "... ^C Interrupted")
127131
finally:
128132
print_clear()
133+
if first_token_time:
134+
end_time = time.time()
135+
logger.debug(
136+
f"Generation interrupted after {end_time - start_time:.1f}s "
137+
f"(ttft: {first_token_time - start_time:.2f}s, "
138+
f"gen: {end_time - first_token_time:.2f}s, "
139+
f"tok/s: {len_tokens(output)/(end_time - first_token_time):.1f})"
140+
)
141+
129142
return Message("assistant", output)
130143

131144

gptme/llm/llm_anthropic.py

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -349,6 +349,9 @@ def _prepare_messages_for_api(
349349

350350
messages_dicts_new.append({"role": msg["role"], "content": content_parts})
351351

352+
# set for the first system message (static between sessions)
353+
system_messages[0]["cache_control"] = {"type": "ephemeral"}
354+
352355
# set cache points at the two last user messages, as suggested in Anthropic docs:
353356
# > The conversation history (previous messages) is included in the messages array.
354357
# > The final turn is marked with cache-control, for continuing in followups.

gptme/logmanager.py

Lines changed: 3 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -103,7 +103,7 @@ def __init__(
103103
# Try to acquire an exclusive lock
104104
try:
105105
fcntl.flock(self._lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
106-
logger.debug(f"Acquired lock on {self.logdir}")
106+
# logger.debug(f"Acquired lock on {self.logdir}")
107107
except BlockingIOError:
108108
self._lock_fd.close()
109109
self._lock_fd = None
@@ -132,7 +132,7 @@ def __del__(self):
132132
try:
133133
fcntl.flock(self._lock_fd, fcntl.LOCK_UN)
134134
self._lock_fd.close()
135-
logger.debug(f"Released lock on {self.logdir}")
135+
# logger.debug(f"Released lock on {self.logdir}")
136136
except Exception as e:
137137
logger.warning(f"Error releasing lock: {e}")
138138

@@ -256,7 +256,7 @@ def load(
256256

257257
if not Path(logfile).exists():
258258
if create:
259-
logger.debug(f"Creating new logfile {logfile}")
259+
# logger.debug(f"Creating new logfile {logfile}")
260260
Path(logfile).parent.mkdir(parents=True, exist_ok=True)
261261
Log([]).write_jsonl(logfile)
262262
else:

gptme/prompts.py

Lines changed: 3 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -8,8 +8,8 @@
88
import glob
99
import logging
1010
import platform
11-
import subprocess
1211
from collections.abc import Generator, Iterable
12+
from datetime import datetime
1313
from pathlib import Path
1414
from typing import Literal
1515

@@ -242,8 +242,8 @@ def prompt_systeminfo() -> Generator[Message, None, None]:
242242

243243
def prompt_timeinfo() -> Generator[Message, None, None]:
244244
"""Generate the current time prompt."""
245-
# TODO: this should be updated when time changes significantly (such as when resuming a session)
246-
prompt = f"## Current Time\n\n**UTC:** {subprocess.run(['date', '-u'], capture_output=True, text=True).stdout.strip()}"
245+
# we only set the date in order for prompt caching and such to work
246+
prompt = f"## Current Date\n\n**UTC:** {datetime.utcnow().strftime('%Y-%m-%d')}"
247247
yield Message("system", prompt)
248248

249249

gptme/tools/__init__.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@
7272
]
7373

7474

75-
def init_tools(allowlist=None) -> None:
75+
@lru_cache
76+
def init_tools(allowlist: frozenset[str] | None = None) -> None:
7677
"""Runs initialization logic for tools."""
7778
# init python tool last
7879
tools = list(
@@ -93,22 +94,20 @@ def init_tools(allowlist=None) -> None:
9394
if tool.name in tools_default_disabled:
9495
if not allowlist or tool.name not in allowlist:
9596
continue
96-
load_tool(tool)
97+
_load_tool(tool)
9798

9899
for tool_name in allowlist or []:
99100
if not has_tool(tool_name):
100101
raise ValueError(f"Tool '{tool_name}' not found")
101102

102103

103-
def load_tool(tool: ToolSpec) -> None:
104+
def _load_tool(tool: ToolSpec) -> None:
104105
"""Loads a tool."""
105-
# FIXME: when are tools first initialized?
106106
if tool in loaded_tools:
107107
logger.warning(f"Tool '{tool.name}' already loaded")
108108
return
109109

110-
if tool.init:
111-
tool.init()
110+
# tool init happens in init_tools to check that spec is available
112111
if tool.functions:
113112
for func in tool.functions:
114113
register_function(func)

gptme/tools/python.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -120,7 +120,7 @@ def execute_python(
120120

121121

122122
@functools.lru_cache
123-
def get_installed_python_libraries() -> set[str]:
123+
def get_installed_python_libraries() -> list[str]:
124124
"""Check if a select list of Python libraries are installed."""
125125
candidates = [
126126
"numpy",
@@ -137,7 +137,7 @@ def get_installed_python_libraries() -> set[str]:
137137
if importlib.util.find_spec(candidate):
138138
installed.add(candidate)
139139

140-
return installed
140+
return list(sorted(installed))
141141

142142

143143
def get_functions():

gptme/tools/shell.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -44,7 +44,7 @@
4444

4545

4646
shell_programs_str = "\n".join(
47-
f"- {prog}" for prog in get_installed_programs(candidates)
47+
f"- {prog}" for prog in sorted(get_installed_programs(candidates))
4848
)
4949
is_macos = sys.platform == "darwin"
5050

gptme/util/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -37,7 +37,7 @@ def get_tokenizer(model: str):
3737
except KeyError:
3838
global _warned_models
3939
if model not in _warned_models:
40-
logger.warning(
40+
logger.info(
4141
f"No tokenizer for '{model}'. Using tiktoken cl100k_base. Use results only as estimates."
4242
)
4343
_warned_models |= {model}

0 commit comments

Comments (0)