Skip to content

Commit 20435f3

Browse files
feat(context): implement hooks-based context compression architecture (#844)
* feat(autocompact): add strategic removal with reasoning stripping Implements Option B from PR discussion - extends autocompact with: 1. Reasoning stripping for older messages: - Strips <think> and <thinking> tags from messages beyond threshold - Configurable age threshold (default: 5 messages from end) - Reduces token usage while preserving recent reasoning 2. Strategic removal enhancements: - Age-based prioritization (older messages processed first) - Phase-aware (keeps recent context intact) - Preserves existing massive tool result removal 3. Comprehensive test coverage: - Tests for strip_reasoning() helper function - Tests for age-based reasoning stripping - Tests for threshold variations (0, 5) - All existing tests continue passing Token savings tracked separately for tool results vs reasoning, providing visibility into both compaction strategies. Addresses Erik's feedback on PR #844. * refactor(context): create compress module for reusable compression utilities - Create gptme/context/compress.py with strip_reasoning() function - Expose function via gptme/context/__init__.py - Update autocompact.py to import from new module - Enables reuse via hooks, shell tool integration, etc. Addresses feedback from @ErikBjare in PR #844 * fix(tests): correct import path for strip_reasoning Changed imports from 'gptme.tools.autocompact' to 'gptme.context' as the function was moved to the new compress module. Fixes test failures identified by Greptile review. * fix: fixed gptme.context.__init__ --------- Co-authored-by: Erik Bjäreholt <erik@bjareho.lt>
1 parent b898c85 commit 20435f3

File tree

4 files changed

+189
-14
lines changed

4 files changed

+189
-14
lines changed

gptme/context/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
"""Context management for gptme.
1+
"""Context management utilities.
22
33
This module provides:
44
- Unified context configuration (context.config)
55
- Context selection strategies (context.selector)
6+
- Context compression utilities (context.compress)
67
"""
78

9+
from .compress import strip_reasoning
810
from .config import ContextConfig
911
from .selector import ContextSelectorConfig
1012

11-
__all__ = [
12-
"ContextConfig",
13-
"ContextSelectorConfig",
14-
]
13+
__all__ = ["ContextConfig", "ContextSelectorConfig", "strip_reasoning"]

gptme/context/compress.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Context compression utilities.
2+
3+
Provides core compression utilities that can be used via hooks,
4+
shell tool integration, or direct invocation.
5+
"""
6+
7+
import re
8+
9+
from ..util.tokens import len_tokens
10+
11+
12+
def strip_reasoning(content: str, model: str = "gpt-4") -> tuple[str, int]:
    """
    Remove reasoning blocks from message content.

    Deletes any <think>...</think> and <thinking>...</thinking> sections
    (spanning newlines), collapses the blank-line runs left behind, and
    reports how many tokens the removal saved.

    Args:
        content: Message content that may contain reasoning tags
        model: Model name used for token counting

    Returns:
        Tuple of (stripped_content, tokens_saved)
    """
    tokens_before = len_tokens(content, model)

    result = content
    # Drop each kind of reasoning block; DOTALL lets '.' cross newlines.
    for pattern in (r"<think>.*?</think>", r"<thinking>.*?</thinking>"):
        result = re.sub(pattern, "", result, flags=re.DOTALL)

    # Collapse runs of 3+ newlines down to a single blank line, trim edges.
    result = re.sub(r"\n\n\n+", "\n\n", result).strip()

    return result, tokens_before - len_tokens(result, model)

gptme/tools/autocompact.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
"""
77

88
import logging
9+
import re
910
from collections.abc import Generator
1011
from pathlib import Path
1112
from typing import TYPE_CHECKING
1213

14+
from ..context import strip_reasoning
1315
from ..hooks import StopPropagation
1416
from ..message import Message, len_tokens
1517
from ..util.output_storage import create_tool_result_summary
@@ -28,18 +30,22 @@ def auto_compact_log(
2830
log: list[Message],
2931
limit: int | None = None,
3032
max_tool_result_tokens: int = 2000,
33+
reasoning_strip_age_threshold: int = 5,
3134
logdir: Path | None = None,
3235
) -> Generator[Message, None, None]:
3336
"""
3437
Auto-compact log for conversations with massive tool results.
3538
36-
More aggressive than reduce_log - completely removes massive tool results
37-
instead of just truncating them, to allow conversation resumption.
39+
More aggressive than reduce_log - implements strategic removal:
40+
1. Strips reasoning tags from older messages (age-based)
41+
2. Removes massive tool results (existing behavior)
42+
3. Prioritizes keeping recent context (phase-aware)
3843
3944
Args:
4045
log: List of messages to compact
4146
limit: Token limit (defaults to 80% of model context)
4247
max_tool_result_tokens: Maximum tokens allowed in a tool result before removal
48+
reasoning_strip_age_threshold: Strip reasoning from messages >N positions back
4349
logdir: Path to conversation directory for saving removed outputs
4450
"""
4551
from ..llm.models import get_default_model, get_model
@@ -53,26 +59,60 @@ def auto_compact_log(
5359
tokens = len_tokens(log, model=model.model)
5460
close_to_limit = tokens >= int(0.7 * model.context)
5561

56-
# Only return early if we're not close to limit and don't have massive tool results
57-
if tokens <= limit and not close_to_limit:
62+
# Calculate message positions from end (for age-based reasoning stripping)
63+
log_length = len(log)
64+
65+
# Check if any reasoning stripping is needed
66+
needs_reasoning_strip = any(
67+
(log_length - idx - 1) >= reasoning_strip_age_threshold
68+
and ("<think>" in msg.content or "<thinking>" in msg.content)
69+
for idx, msg in enumerate(log)
70+
)
71+
needs_compacting = tokens > limit or close_to_limit
72+
73+
# Only return early if nothing needs processing
74+
if not needs_reasoning_strip and not needs_compacting:
5875
yield from log
5976
return
6077

61-
logger.info(f"Auto-compacting log: {tokens} tokens exceeds limit of {limit}")
78+
if needs_compacting:
79+
logger.info(f"Auto-compacting log: {tokens} tokens exceeds limit of {limit}")
80+
if needs_reasoning_strip:
81+
logger.info(
82+
f"Stripping reasoning from messages beyond threshold {reasoning_strip_age_threshold}"
83+
)
6284

6385
# Process messages and remove massive tool results
6486
compacted_log = []
6587
tokens_saved = 0
88+
reasoning_tokens_saved = 0
6689

67-
for msg in log:
90+
for idx, msg in enumerate(log):
6891
# Skip processing pinned messages
6992
if msg.pinned:
7093
compacted_log.append(msg)
7194
continue
7295

96+
# Calculate distance from end (for age-based processing)
97+
distance_from_end = log_length - idx - 1
98+
99+
# Phase 1: Strategic reasoning stripping for older messages
100+
# Strip reasoning from messages beyond the threshold
101+
if distance_from_end >= reasoning_strip_age_threshold:
102+
stripped_content, reasoning_saved = strip_reasoning(
103+
msg.content, model.model
104+
)
105+
if reasoning_saved > 0:
106+
msg = msg.replace(content=stripped_content)
107+
reasoning_tokens_saved += reasoning_saved
108+
logger.info(
109+
f"Stripped reasoning from message {idx}: "
110+
f"saved {reasoning_saved} tokens (distance from end: {distance_from_end})"
111+
)
112+
73113
msg_tokens = len_tokens(msg.content, model.model)
74114

75-
# Check if this is a massive tool result (system message with huge content)
115+
# Phase 2: Check if this is a massive tool result (system message with huge content)
76116
# Use same logic as should_auto_compact: over limit OR close to limit with massive tool result
77117
close_to_limit = tokens >= int(0.8 * model.context)
78118
if (
@@ -99,9 +139,12 @@ def auto_compact_log(
99139

100140
# Check if we're now within limits
101141
final_tokens = len_tokens(compacted_log, model.model)
142+
total_saved = tokens_saved + reasoning_tokens_saved
102143
if final_tokens <= limit:
103144
logger.info(
104-
f"Auto-compacting successful: {tokens} -> {final_tokens} tokens (saved {tokens_saved})"
145+
f"Auto-compacting successful: {tokens} -> {final_tokens} tokens "
146+
f"(saved {total_saved}: {tokens_saved} from tool results, "
147+
f"{reasoning_tokens_saved} from reasoning)"
105148
)
106149
yield from compacted_log
107150
return
@@ -299,7 +342,6 @@ def _get_compacted_name(conversation_name: str) -> str:
299342
if not conversation_name:
300343
raise ValueError("conversation name cannot be empty")
301344

302-
import re
303345
from datetime import datetime
304346

305347
# Strip any existing compacted suffixes: -compacted-YYYYMMDDHHMM

tests/test_auto_compact.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,101 @@ def test_get_compacted_name_empty_string():
254254
_get_compacted_name("")
255255

256256

257+
def test_strip_reasoning_removes_think_tags():
    """strip_reasoning should drop <think> blocks and keep surrounding text."""
    from gptme.context import strip_reasoning

    stripped, tokens_saved = strip_reasoning(
        "Before <think>This is reasoning</think> After", "gpt-4"
    )

    for removed_tag in ("<think>", "</think>"):
        assert removed_tag not in stripped
    for kept_text in ("Before", "After"):
        assert kept_text in stripped
    assert tokens_saved > 0
269+
270+
271+
def test_strip_reasoning_removes_thinking_tags():
    """strip_reasoning should drop <thinking> blocks and keep surrounding text."""
    from gptme.context import strip_reasoning

    stripped, tokens_saved = strip_reasoning(
        "Before <thinking>This is reasoning</thinking> After", "gpt-4"
    )

    for removed_tag in ("<thinking>", "</thinking>"):
        assert removed_tag not in stripped
    for kept_text in ("Before", "After"):
        assert kept_text in stripped
    assert tokens_saved > 0
283+
284+
285+
def test_strip_reasoning_handles_multiple_blocks():
    """strip_reasoning should remove every reasoning block, not just the first."""
    from gptme.context import strip_reasoning

    stripped, tokens_saved = strip_reasoning(
        "<think>First</think> Middle <thinking>Second</thinking> End", "gpt-4"
    )

    assert "<think>" not in stripped and "<thinking>" not in stripped
    assert "Middle" in stripped and "End" in stripped
    assert tokens_saved > 0
297+
298+
299+
def test_strip_reasoning_preserves_content_without_tags():
    """Content carrying no reasoning tags must pass through unchanged."""
    from gptme.context import strip_reasoning

    original = "This is normal content without reasoning"
    stripped, tokens_saved = strip_reasoning(original, "gpt-4")

    assert (stripped, tokens_saved) == (original, 0)
308+
309+
310+
def test_auto_compact_strips_reasoning_from_older_messages():
    """Only messages beyond the age threshold lose their reasoning tags."""
    contents = [
        "First <think>old reasoning</think>",
        "Second <think>old reasoning</think>",
        "Third <think>old reasoning</think>",
        "Fourth <think>old reasoning</think>",
        "Fifth <think>old reasoning</think>",
        "Recent <think>recent reasoning</think>",
        "Most recent <think>recent reasoning</think>",
    ]
    roles = ["user", "assistant"]
    messages = [
        Message(roles[i % 2], text, datetime.now())
        for i, text in enumerate(contents)
    ]

    compacted = list(auto_compact_log(messages, reasoning_strip_age_threshold=5))

    # Messages at distance >= 5 from the end should be stripped.
    for i in (0, 1):
        assert "<think>" not in compacted[i].content

    # The trailing five messages keep whatever reasoning they had.
    for i in range(-5, 0):
        if "<think>" in messages[i].content:
            assert "<think>" in compacted[i].content
334+
335+
336+
def test_auto_compact_reasoning_strip_threshold_zero():
    """A zero age threshold means every message gets its reasoning stripped."""
    messages = [
        Message(role, f"Message {n} <think>reasoning {n}</think>", datetime.now())
        for n, role in enumerate(["user", "assistant", "user"], start=1)
    ]

    compacted = list(auto_compact_log(messages, reasoning_strip_age_threshold=0))

    # No message should retain a reasoning block.
    assert all("<think>" not in msg.content for msg in compacted)
350+
351+
257352
if __name__ == "__main__":
258353
# Allow running the test directly
259354
pytest.main([__file__])

0 commit comments

Comments
 (0)