From 114be2465789aa29f3d6ee0e56da471a1fffeaa4 Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:06:56 +0800 Subject: [PATCH 1/3] fix(history): atomic pair-aware truncation for tool_call blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace naive `conversation[-ctx_size:]` slicing with a walker that treats assistant.tool_calls and its matching role="tool" messages as one atomic block. Naive slicing can leave an orphan role="tool" at the head when the cut lands mid-pair — OpenAI rejects this with "No tool call found for function call output" (issue #446). New helper `services/history_window.truncate_by_message_count` walks tail backward, identifies blocks, and includes blocks whole or not at all. Orphan tools are silently dropped regardless of budget. Replaces head-only pop guard in: - app/api/websocket.py (web chat) - app/api/feishu.py (feishu channel) Leaves app/services/llm/caller.py:626 untouched (call_agent_llm short-reply path's hardcoded [-10:] is intentional). Tests: 15 covering empty input, budget edges, parallel tool_calls, multi-pair, head/mid orphans, realistic 60-message invariant. Addresses #446 failure mode. Co-Authored-By: Claude Opus 4.7 --- backend/app/api/feishu.py | 9 +- backend/app/api/websocket.py | 11 +- backend/app/services/history_window.py | 166 ++++++++++++++ backend/tests/test_history_window.py | 288 +++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 backend/app/services/history_window.py create mode 100644 backend/tests/test_history_window.py diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index 00fe32737..ca4cef7c7 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,6 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service +from app.services.history_window import truncate_by_message_count router = APIRouter(tags=["feishu"]) @@ -1634,7 +1635,13 @@ async def _call_agent_llm( from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE if history: - messages.extend(_normalize_history_messages(history)[-ctx_size:]) + # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool + # pairs intact. Today _normalize_history_messages drops DB role="tool_call" + # rows, so this path has no tool messages and the helper acts as plain count + # truncation; the safety kicks in once a feishu reorganization helper exists. 
+ messages.extend( + truncate_by_message_count(_normalize_history_messages(history), ctx_size) + ) messages.append({"role": "user", "content": user_text}) # Use actual user_id so the system prompt knows who it's chatting with diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 82b297916..6f17ad1e5 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,6 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session +from app.services.history_window import truncate_by_message_count from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -662,10 +663,12 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # To prevent tool call message pairs(assistant + tool) from being broken down. - _truncated = conversation[-ctx_size:] - while _truncated and _truncated[0].get("role") == "tool": - _truncated.pop(0) + # Pair-aware truncation: keep the last `ctx_size` messages while + # preserving assistant.tool_calls ↔ role=tool blocks atomically. + # Naive [-ctx_size:] slicing can leave orphan tool messages at the + # head when the cut lands mid-pair, which OpenAI rejects with + # "No tool call found for function call output" (issue #446). + _truncated = truncate_by_message_count(conversation, ctx_size) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py new file mode 100644 index 000000000..54c0ce8a6 --- /dev/null +++ b/backend/app/services/history_window.py @@ -0,0 +1,166 @@ +"""Pair-aware conversation history truncation. + +Replaces naive ``conversation[-N:]`` slicing with a walker that keeps +``assistant.tool_calls`` and their matching ``role="tool"`` messages as an +atomic block — never half a pair, never orphan tool messages. + +Why: OpenAI Responses API and Chat Completions both reject input where a +``function_call_output`` / ``role="tool"`` message has no matching +``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive +``[-N:]`` slicing can leave such orphans at the head when the cut lands +between an assistant message and its tool results. This is the failure mode +reported in issue #446. + +Orphan detection is by ``tool_call_id`` matching, not by adjacency — a +tool message inserted between a valid pair and other messages (from +malformed persistence or upstream truncation) is dropped, not folded +into an adjacent block. This makes the helper robust against orphans +at any position, not just at the slice head. + +Input is expected to be in OpenAI chat-completion format (post-reorganization +from DB ``role="tool_call"`` rows). +""" + +from __future__ import annotations + +from typing import Any + + +def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: + """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has + no matching ``assistant.tool_calls`` earlier in the conversation. + + OpenAI rejects the request the moment a ``function_call_output`` is + sent without its matching ``function_call``, regardless of whether + that tool message is at the head, middle, or end. So orphan detection + is by ID matching, not by position. 
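+
+    A doctest-style sketch (hypothetical three-message history): index 0 is
+    an orphan because no assistant emits call id ``X`` before it, while the
+    ``Y`` pair matches.
+
+    >>> _identify_orphans([
+    ...     {"role": "tool", "tool_call_id": "X", "content": "?"},
+    ...     {"role": "assistant", "tool_calls": [{"id": "Y"}]},
+    ...     {"role": "tool", "tool_call_id": "Y", "content": "ok"},
+    ... ])
+    {0}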
+ """ + orphans: set[int] = set() + for i, msg in enumerate(messages): + if msg.get("role") != "tool": + continue + tcid = msg.get("tool_call_id") + if not tcid: + orphans.add(i) + continue + # Search backward for an assistant whose tool_calls contains this id. + # Walks past intervening user / system / other-assistant messages. + found = False + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + found = True + break + j -= 1 + if not found: + orphans.add(i) + return orphans + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool, non-tool-calling message (user / system / assistant text), or + - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` + message (identified by ``tool_call_id``, not adjacency). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages — those whose ``tool_call_id`` has no matching assistant — are + silently dropped regardless of budget. Sending them to OpenAI causes the + #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + + orphans = _identify_orphans(messages) + n = len(messages) + consumed: set[int] = set(orphans) # orphans drop unconditionally + blocks: list[set[int]] = [] # tail-to-head order + + for i in range(n - 1, -1, -1): + if i in consumed: + continue + msg = messages[i] + role = msg.get("role") + + if role == "tool": + # Find this tool's owning assistant by matching tool_call_id + tcid = msg.get("tool_call_id") + asst_idx = -1 + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + asst_idx = j + break + j -= 1 + if asst_idx < 0: + # Defensive — orphan detection should have caught this + consumed.add(i) + continue + # Block = assistant + ALL of its matching tool messages (siblings) + asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]} + block = {asst_idx} + for k in range(asst_idx + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + elif role == "assistant" and msg.get("tool_calls"): + # Encountered the assistant before any of its tools (e.g. tools + # were truncated upstream or are still in flight). Group with + # whatever matching tools follow it. + asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]} + block = {i} + for k in range(i + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + else: + consumed.add(i) + blocks.append({i}) + + # Walk blocks tail-to-head, taking until budget exhausted. 
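+    # Worked example (hypothetical history): [u1, a(tool_calls=[X]), t(X), u2]
+    # with max_messages=3 yields blocks [{3}, {1, 2}, {0}] tail-to-head; {3}
+    # fits (budget 2 left), pair {1, 2} fits (budget 0 left), {0} does not,
+    # so the result is [a, t(X), u2].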
+ keep: set[int] = set() + budget = max_messages + for block in blocks: + size = len(block) + if size <= budget: + keep |= block + budget -= size + else: + # Block doesn't fit — stop. Do NOT partial-include (would split pair). + break + + return [messages[k] for k in sorted(keep)] diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py new file mode 100644 index 000000000..934216758 --- /dev/null +++ b/backend/tests/test_history_window.py @@ -0,0 +1,288 @@ +"""Unit tests for pair-aware conversation history truncation. + +Validates that ``truncate_by_message_count`` preserves +``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces +orphan tool messages that would trigger the OpenAI #446 failure mode. +""" + +from app.services.history_window import truncate_by_message_count + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _u(text: str) -> dict: + return {"role": "user", "content": text} + + +def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict: + msg: dict = {"role": "assistant", "content": text} + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + +def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict: + return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}} + + +def _t(call_id: str, content: str = "ok") -> dict: + return {"role": "tool", "tool_call_id": call_id, "content": content} + + +def _roles(msgs: list[dict]) -> list[str]: + return [m.get("role", "?") for m in msgs] + + +# ── Edge cases ────────────────────────────────────────────────────────── + + +def test_empty_input_returns_empty(): + assert truncate_by_message_count([], 10) == [] + + +def test_zero_or_negative_budget_returns_empty(): + msgs = [_u("hi"), _u("there")] + assert truncate_by_message_count(msgs, 0) == [] + assert truncate_by_message_count(msgs, -5) == [] + + +def test_within_budget_returns_all(): + msgs = [_u("a"), _a("b"), _u("c")] + out = truncate_by_message_count(msgs, 10) + assert out == msgs + assert out is not msgs # new list + + +def test_input_not_mutated(): + msgs = [_u("a"), _a("b"), _u("c"), _u("d")] + snapshot = list(msgs) + truncate_by_message_count(msgs, 2) + assert msgs == snapshot + + +# ── Core pair-preservation behavior ───────────────────────────────────── + + +def test_keeps_assistant_tool_pair_intact(): + """Slicing must not split assistant.tool_calls from its tool result.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["assistant", "tool", "user"] + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[1]["tool_call_id"] == "X" + + +def test_drops_pair_entirely_when_budget_too_small(): + """If budget can't fit the whole pair, drop it — never half.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 2 — can't fit pair (needs 2) + final user, must drop pair + out = truncate_by_message_count(msgs, 2) + # Only the trailing user fits as a single block; pair (size 2) doesn't fit + # in remaining budget=1 after taking user. + assert _roles(out) == ["user"] + assert out[0]["content"] == "done?" 
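+
+
+def test_tool_message_missing_tool_call_id_is_orphan():
+    """Sketch pinning an edge the tests above don't cover: a role="tool"
+    entry with no tool_call_id at all can never match an assistant, so it
+    must be dropped like any other orphan."""
+    msgs = [_u("hi"), {"role": "tool", "content": "no id"}, _a("ok")]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]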
+
+
+def test_drops_orphan_tool_at_head():
+    """A role=tool with no preceding assistant.tool_calls is dropped."""
+    msgs = [
+        _t("X"),  # orphan — no assistant before
+        _u("hi"),
+        _a("ok"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_drops_orphan_tool_at_head_after_slicing():
+    """Slicing produces an orphan tool at head — must be dropped (the
+    classic #446 failure mode)."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),  # ← naive slice [-3:] would start here as orphan
+        _u("u2"),
+        _a("final"),
+    ]
+    # Budget 3: take from end. _a("final") block (budget → 2), _u("u2") block
+    # (budget → 1). t(X) resolves to its assistant, so (a+tc, t) is one atomic
+    # block of size 2; it exceeds the remaining budget of 1 and is dropped
+    # whole, never split. Result: [u("u2"), a("final")].
+    out = truncate_by_message_count(msgs, 3)
+    assert "tool" not in _roles(out)
+    # No orphan tool_call_id reaches output
+    for m in out:
+        if m.get("role") == "tool":
+            raise AssertionError(f"Orphan tool leaked: {m}")
+
+
+def test_multiple_parallel_tool_calls_in_one_assistant():
+    """Assistant with N tool_calls followed by N tools is one atomic block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
+    # Verify the pair came through whole
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[3]["tool_call_id"] == "Z"
+
+
+def test_parallel_tool_pair_dropped_if_too_big():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u].
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["user"]
+
+
+def test_multiple_pairs_some_drop():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("A")]),
+        _t("A"),
+        _u("u2"),
+        _a(None, tool_calls=[_tc("B")]),
+        _t("B"),
+        _u("u3"),
+    ]
+    # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take
+    # u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    assert out[1]["tool_calls"][0]["id"] == "B"
+    assert out[2]["tool_call_id"] == "B"
+
+
+def test_no_partial_pair_when_budget_exactly_one_short():
+    """Exactly one short of fitting a pair → drop the pair, don't include
+    just the assistant."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+    ]
+    # Budget 2: walk from end. t(X) resolves to a(tc=X), forming pair block
+    # {1, 2} of size 2; it fits exactly (budget → 0), and u1 (size 1) then
+    # does not. Output: [a+tc, t].
+    out = truncate_by_message_count(msgs, 2)
+    assert _roles(out) == ["assistant", "tool"]
+    # With budget 1 the pair (size 2) is tried first, doesn't fit, and the
+    # walk BREAKS; u1 never gets a chance. Output: [].
+    out2 = truncate_by_message_count(msgs, 1)
+    assert out2 == []
+
+
+def test_mid_orphan_tool_dropped():
+    """A tool whose tool_call_id has no matching assistant anywhere — defensive
+    drop. 
(Shouldn't happen with current persistence, but be robust.)""" + msgs = [ + _u("u1"), + _t("ORPHAN_X"), # malformed — no preceding assistant.tool_calls + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + # Orphan dropped + assert "tool" not in _roles(out) + assert _roles(out) == ["user", "user"] + + +def test_orphan_adjacent_to_valid_pair_still_dropped(): + """Orphan tool message inserted right after a legitimate tool-call pair + must be dropped — adjacency to a valid pair does not legitimize it. + + This is the bug class that triggers OpenAI #446 even when slice cut + boundaries would otherwise be safe: any orphan reaching the wire, + regardless of position, makes the request invalid.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("VALID")]), + _t("VALID", "real result"), + _t("ORPHAN_id", "ghost result"), # no assistant emits ORPHAN_id + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + + # The orphan must NOT survive — even though it's adjacent to a valid pair + orphan_present = any( + m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id" + for m in out + ) + assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped" + + # The valid pair survives intact + valid_assistant = any( + m.get("role") == "assistant" + and m.get("tool_calls") + and any(tc["id"] == "VALID" for tc in m["tool_calls"]) + for m in out + ) + valid_tool = any( + m.get("role") == "tool" and m.get("tool_call_id") == "VALID" + for m in out + ) + assert valid_assistant and valid_tool + + +def test_system_message_treated_as_normal_block(): + msgs = [ + {"role": "system", "content": "you are an agent"}, + _u("hi"), + _a("hello"), + ] + out = truncate_by_message_count(msgs, 2) + # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped. + assert _roles(out) == ["user", "assistant"] + + +def test_realistic_long_conversation_truncation(): + """End-to-end: simulate a long chat with many tool-call turns and ensure + the output never has orphan tools.""" + msgs: list[dict] = [_u("start")] + for k in range(20): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result {k}")) + msgs.append(_u(f"next {k}")) + msgs.append(_a("final answer")) + + # Truncate to 30 messages + out = truncate_by_message_count(msgs, 30) + + # Sanity: budget respected + assert len(out) <= 30 + + # Critical invariant: no orphan tool messages anywhere + seen_tool_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + seen_tool_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + tcid = m.get("tool_call_id") + assert tcid in seen_tool_call_ids, ( + f"Orphan tool {tcid!r} in output without matching assistant.tool_calls" + ) From 2a0040fef35e69706030afde273f7f43f80f5adc Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:32:48 +0800 Subject: [PATCH 2/3] feat(agent): token-aware history truncation alongside message cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `agents.context_window_tokens` field (default 50000) and a new `truncate_by_token_budget` helper that bounds in-context history by both estimated token cost (primary) and message count (safety cap), preserving assistant.tool_calls ↔ role=tool pairs intact via the same walker as truncate_by_message_count. 
Why: message-count alone is a wildly variable proxy for token cost — one 50KB tool result eats more context than 100 short user messages. Token budget gives predictable behavior across heterogeneous traffic; message cap remains as a safety net against pathological tiny-message floods. Changes: - models/agent.py: + context_window_tokens (Integer, default=50000) + DEFAULT_CONTEXT_WINDOW_TOKENS constant - schemas/schemas.py: AgentOut, AgentUpdate (1000 <= tokens <= 500000) - alembic: add_context_window_tokens.py (idempotent IF NOT EXISTS) - services/history_window.py: + truncate_by_token_budget, refactored common walker, JSON-serialized char->token estimate via existing estimate_tokens_from_chars (chars/3 — overestimates safely) - api/websocket.py: pass tok_budget to helper, raise DB load to max(ctx_size, 500) so helper has room to choose - api/feishu.py: same pattern at 2 sites (web chat + IM channel paths) - frontend: AgentDetail Settings slider + i18n + types 10 new tests covering token-budget mode (huge-message dropped, both-bounds interaction, atomic pair preservation, orphan defense). 25/25 pass. Other channels (dingtalk/discord/slack/teams/wecom/whatsapp) still use DB-level message-count limit only — they don't get token awareness in this PR but won't crash. Migrating them is follow-up scope. Co-Authored-By: Claude Opus 4.7 --- .../versions/add_context_window_tokens.py | 65 ++++++ backend/app/api/feishu.py | 23 ++- backend/app/api/websocket.py | 41 +++- backend/app/models/agent.py | 23 +++ backend/app/schemas/schemas.py | 2 + backend/app/services/history_window.py | 186 ++++++++++++++---- backend/tests/test_history_window.py | 140 ++++++++++++- frontend/src/i18n/en.json | 4 +- frontend/src/i18n/zh.json | 4 +- frontend/src/pages/AgentDetail.tsx | 18 ++ frontend/src/types/index.ts | 1 + 11 files changed, 442 insertions(+), 65 deletions(-) create mode 100644 backend/alembic/versions/add_context_window_tokens.py diff --git a/backend/alembic/versions/add_context_window_tokens.py b/backend/alembic/versions/add_context_window_tokens.py new file mode 100644 index 000000000..bf385417e --- /dev/null +++ b/backend/alembic/versions/add_context_window_tokens.py @@ -0,0 +1,65 @@ +"""add agents.context_window_tokens for token-aware history truncation + +Revision ID: add_context_window_tokens +Revises: rm_agent_credential_secrets +Create Date: 2026-04-27 +""" + +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = "add_context_window_tokens" +down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add context_window_tokens with a DDL default of 50000. + + The four-step pattern is required because earlier in the migration chain, + ``alembic/versions/0000_initial_schema.py`` calls + ``Base.metadata.create_all(checkfirst=True)``, which creates ``agents`` + from the *current* model state — including any new columns. SQLAlchemy's + Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause, + so the column ends up ``NOT NULL`` with no default, and a naive + ``ADD COLUMN IF NOT EXISTS ... DEFAULT 50000`` short-circuits and never + sets the default. 
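+
+    Concretely (a sketch of the hazard, assuming the create_all bootstrap ran
+    first): the column already exists, so ``ADD COLUMN IF NOT EXISTS ...`` is
+    a no-op and the DDL default is never applied; any later raw-SQL INSERT
+    that omits the column then fails with a NOT NULL violation.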
+ + This four-step approach is idempotent regardless of pre-existing state: + - column missing → created (nullable, no default initially) + - column present without default → default set + - any rows with NULL → backfilled to 50000 + - column made NOT NULL + + Re-runnable: ALTER SET DEFAULT to the same value is a no-op; UPDATE + affecting 0 rows is a no-op; ALTER SET NOT NULL on an already-NOT-NULL + column is a no-op. + """ + # 1. Add the column if missing — do NOT specify NOT NULL or DEFAULT here, + # so existing rows (if any from create_all) aren't blocked. + op.execute( + "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER" + ) + # 2. Ensure the DDL default is set so future inserts that omit this + # column (raw SQL, restored backups, manual migrations) get 50000. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000" + ) + # 3. Backfill any rows that were created before the default landed. + op.execute( + "UPDATE agents SET context_window_tokens = 50000 " + "WHERE context_window_tokens IS NULL" + ) + # 4. Now safe to enforce NOT NULL. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL" + ) + + +def downgrade() -> None: + # Downgrade omitted — dropping the column would lose per-tenant tuning. + pass diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index ca4cef7c7..02e21f5bf 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,7 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget router = APIRouter(tags=["feishu"]) @@ -657,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession ) _pre_sess = _pre_sess_r.scalar_one_or_none() _history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id + # Load extra raw material so app-level token-aware helper has room to choose history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) history_msgs = history_result.scalars().all() history = _build_llm_history_from_chat_messages(list(reversed(history_msgs))) @@ -1374,11 +1375,12 @@ async def _handle_feishu_file( # Load conversation history for LLM context from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE + # Load extra raw material so app-level token-aware helper has room to choose _hist_r = await db.execute( _select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) _history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all()))) @@ -1632,15 +1634,18 @@ async def _call_agent_llm( # Build conversation messages (without system prompt — call_llm adds it) messages: list[dict] = [] - from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE + from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS ctx_size = agent.context_window_size or 
DEFAULT_CONTEXT_WINDOW_SIZE + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS if history: - # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool - # pairs intact. Today _normalize_history_messages drops DB role="tool_call" - # rows, so this path has no tool messages and the helper acts as plain count - # truncation; the safety kicks in once a feishu reorganization helper exists. + # Pair-aware truncation: token budget primary, message count as safety cap. + # Today _normalize_history_messages drops DB role="tool_call" rows, so this + # path has no tool messages and the pair guard is a no-op; the safety kicks + # in once a feishu reorganization helper exists. messages.extend( - truncate_by_message_count(_normalize_history_messages(history), ctx_size) + truncate_by_token_budget( + _normalize_history_messages(history), tok_budget, message_cap=ctx_size, + ) ) messages.append({"role": "user", "content": user_text}) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 6f17ad1e5..8d79deeff 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,7 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -214,7 +214,9 @@ async def websocket_chat( role_description = agent.role_description or "" welcome_message = agent.welcome_message or "" ctx_size = agent.context_window_size or 100 - logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}") + from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS + logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok") # Load the agent's primary model if agent.primary_model_id: @@ -300,11 +302,14 @@ async def websocket_chat( logger.info(f"[WS] Selected primary session {conv_id}") try: + # Load extra raw material so the app-level token-aware helper + # (truncate_by_token_budget below) has room to choose from. + _db_load_cap = max(ctx_size, 500) history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(_db_load_cap) ) history_messages = list(reversed(history_result.scalars().all())) logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}") @@ -663,12 +668,30 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # Pair-aware truncation: keep the last `ctx_size` messages while - # preserving assistant.tool_calls ↔ role=tool blocks atomically. - # Naive [-ctx_size:] slicing can leave orphan tool messages at the - # head when the cut lands mid-pair, which OpenAI rejects with - # "No tool call found for function call output" (issue #446). - _truncated = truncate_by_message_count(conversation, ctx_size) + # Pair-aware truncation with a token budget plus a message-count + # safety cap. 
Either bound stops the walk; pairs (assistant.tool_calls + # ↔ role=tool) are kept atomic. Token budget protects against + # one-tool-result-eats-the-window scenarios; message cap protects + # against pathological tiny-message floods. The pair guard fixes + # the orphan-tool failure mode reported in #446. + # + # The current user message (just appended at line ~416) is excluded + # from truncation and re-appended after — otherwise a single huge + # input (large paste, base64 image_data) could push past the budget + # and cause the helper to drop the very message we're answering. + # If the input itself exceeds the model's context, the provider will + # surface a clear error rather than silently dropping it here. + _current = ( + conversation[-1] + if conversation and conversation[-1].get("role") == "user" + else None + ) + _history = conversation[:-1] if _current is not None else conversation + _truncated = truncate_by_token_budget( + _history, tok_budget, message_cap=ctx_size, + ) + if _current is not None: + _truncated.append(_current) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/models/agent.py b/backend/app/models/agent.py index 5c48d7bf9..2493aaedd 100644 --- a/backend/app/models/agent.py +++ b/backend/app/models/agent.py @@ -15,6 +15,11 @@ # (see: https://github.com/dataelement/Clawith/issues/238). DEFAULT_CONTEXT_WINDOW_SIZE = 100 +# Default token budget for in-context history. Conservative for 128K-context +# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent +# override via Agent.context_window_tokens. +DEFAULT_CONTEXT_WINDOW_TOKENS = 50000 + class Agent(Base): """Digital employee (Agent) instance. @@ -81,6 +86,24 @@ class Agent(Base): last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) tokens_used_total: Mapped[int] = mapped_column(Integer, default=0) context_window_size: Mapped[int] = mapped_column(Integer, default=100) + # Token-aware secondary bound on history sent to the LLM. Truncation uses + # the smaller of context_window_size (message count) and this token budget, + # preserving assistant.tool_calls ↔ role=tool pairs intact. + # + # ``server_default`` matters: alembic/versions/0000_initial_schema.py uses + # ``Base.metadata.create_all`` which reads model state at runtime. Without + # a server_default, fresh-DB bootstrap would create this column NOT NULL + # without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration + # later in the chain would short-circuit, leaving direct-SQL inserts + # broken. ``server_default="50000"`` ensures the DDL has the default + # whether the column was created by create_all or by the explicit + # migration. 
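+    #
+    # Resulting DDL either way (sketch, PostgreSQL syntax):
+    #   context_window_tokens INTEGER DEFAULT 50000 NOT NULL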
+ context_window_tokens: Mapped[int] = mapped_column( + Integer, + default=DEFAULT_CONTEXT_WINDOW_TOKENS, + server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS), + nullable=False, + ) max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50) # Trigger limits (per-agent, configurable from Settings UI) diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py index 0ae5d8e34..eb475d9c2 100644 --- a/backend/app/schemas/schemas.py +++ b/backend/app/schemas/schemas.py @@ -251,6 +251,7 @@ class AgentOut(BaseModel): max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None context_window_size: int = 100 + context_window_tokens: int = 50000 max_tool_rounds: int = 50 max_triggers: int = 20 min_poll_interval_min: int = 5 @@ -286,6 +287,7 @@ class AgentUpdate(BaseModel): primary_model_id: uuid.UUID | None = None fallback_model_id: uuid.UUID | None = None context_window_size: int | None = Field(default=None, ge=1, le=500) + context_window_tokens: int | None = Field(default=None, ge=1000, le=500000) max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None max_tool_rounds: int | None = None diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py index 54c0ce8a6..a3d63817c 100644 --- a/backend/app/services/history_window.py +++ b/backend/app/services/history_window.py @@ -11,19 +11,26 @@ between an assistant message and its tool results. This is the failure mode reported in issue #446. -Orphan detection is by ``tool_call_id`` matching, not by adjacency — a -tool message inserted between a valid pair and other messages (from -malformed persistence or upstream truncation) is dropped, not folded -into an adjacent block. This makes the helper robust against orphans -at any position, not just at the slice head. +Two public entry points: + - ``truncate_by_message_count`` — bound by message count + - ``truncate_by_token_budget`` — bound by estimated token cost (and an + optional message-count safety cap); preferred for production paths + where one tool result can dwarf 50 short messages. Input is expected to be in OpenAI chat-completion format (post-reorganization -from DB ``role="tool_call"`` rows). +from DB ``role="tool_call"`` rows). Helper is tolerant of malformed input — +unmatched tool messages at the head are silently dropped. """ from __future__ import annotations -from typing import Any +import json +from typing import Any, Callable + +from app.services.token_tracker import estimate_tokens_from_chars + + +# ── Block detection (shared between truncators) ───────────────────────── def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: @@ -60,38 +67,24 @@ def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: return orphans -def truncate_by_message_count( - messages: list[dict[str, Any]], - max_messages: int, -) -> list[dict[str, Any]]: - """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. - - A "block" is either: - - a single non-tool, non-tool-calling message (user / system / assistant text), or - - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` - message (identified by ``tool_call_id``, not adjacency). - - Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` - messages — those whose ``tool_call_id`` has no matching assistant — are - silently dropped regardless of budget. Sending them to OpenAI causes the - #446 error. 
+def _identify_blocks(messages: list[dict[str, Any]]) -> list[set[int]]: + """Group conversation entries into atomic blocks. - Args: - messages: Conversation list in OpenAI format. Empty list is fine. - max_messages: Soft upper bound on the number of returned entries. - Values ``<= 0`` return ``[]``. + A block is a set of indices that must be kept (or dropped) together: + - ``{i}`` for a single non-tool, non-tool-calling message + - ``{asst_idx, tool_idx_1, tool_idx_2, ...}`` for an assistant that + emitted N tool_calls plus its matching tool result messages, + identified by ``tool_call_id`` (not by adjacency — orphan tools + inserted between are dropped, not folded into the block). - Returns: - A new list (input is never mutated) of at most ``max_messages`` entries - from the tail of ``messages``, with all tool-call pairs intact. + Returned tail-to-head: most recent block first. Orphan tool messages + (those whose tool_call_id has no matching assistant.tool_calls) are + silently dropped — never appear in any block. """ - if max_messages <= 0 or not messages: - return [] - orphans = _identify_orphans(messages) n = len(messages) + blocks: list[set[int]] = [] consumed: set[int] = set(orphans) # orphans drop unconditionally - blocks: list[set[int]] = [] # tail-to-head order for i in range(n - 1, -1, -1): if i in consumed: @@ -151,16 +144,127 @@ def truncate_by_message_count( consumed.add(i) blocks.append({i}) - # Walk blocks tail-to-head, taking until budget exhausted. + return blocks + + +def _walk_blocks( + messages: list[dict[str, Any]], + budgets_ok: Callable[[int, int], bool], + consume: Callable[[int, int], None], +) -> list[dict[str, Any]]: + """Common walker used by both truncators. + + ``budgets_ok(block_msg_count, block_token_cost)`` returns True if the + block fits. ``consume`` updates remaining budget when a block is taken. + Stops on first non-fitting block (atomic — never partial-include). + """ + blocks = _identify_blocks(messages) keep: set[int] = set() - budget = max_messages for block in blocks: size = len(block) - if size <= budget: - keep |= block - budget -= size - else: - # Block doesn't fit — stop. Do NOT partial-include (would split pair). + token_cost = sum(_estimate_msg_tokens(messages[k]) for k in block) + if not budgets_ok(size, token_cost): break - + keep |= block + consume(size, token_cost) return [messages[k] for k in sorted(keep)] + + +def _estimate_msg_tokens(msg: dict[str, Any]) -> int: + """Estimate token cost for one message via JSON-serialized char count. + + Slight overestimate (JSON keys/quotes inflate vs the tokenizer's view of + the structured payload), which is the safe direction — better to truncate + a bit early than send too much and OOM the model. + """ + try: + serialized = json.dumps(msg, ensure_ascii=False, default=str) + except (TypeError, ValueError): + # Fallback for unserializable payloads (shouldn't happen in practice) + serialized = str(msg) + return estimate_tokens_from_chars(len(serialized)) + + +# ── Public API ────────────────────────────────────────────────────────── + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool message (``user``/``system``/``assistant`` text), or + - an ``assistant`` with ``tool_calls`` plus every immediately-following + ``role="tool"`` message (the assistant's tool results). 
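+      (In the implementation, "following" is resolved by ``tool_call_id``
+      via ``_identify_blocks``; adjacency is the common case, not a
+      requirement.)
+
+    Example: with ``max_messages=3``, ``[u1, a(tool_calls=[X]), tool(X), u2]``
+    keeps the pair plus ``u2``; with ``max_messages=2`` the walk takes ``u2``
+    first, the two-message pair no longer fits, and it is dropped whole, so
+    only ``u2`` survives.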
+ + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages with no matching assistant are always dropped, regardless of + budget — sending them to OpenAI causes the #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + remaining = [max_messages] + + def budgets_ok(size: int, _tok: int) -> bool: + return size <= remaining[0] + + def consume(size: int, _tok: int) -> None: + remaining[0] -= size + + return _walk_blocks(messages, budgets_ok, consume) + + +def truncate_by_token_budget( + messages: list[dict[str, Any]], + token_budget: int, + *, + message_cap: int | None = None, +) -> list[dict[str, Any]]: + """Keep tail messages within both bounds, preserving tool-call pairs. + + The two bounds work together: a block is included only if both the + remaining token budget and (when set) remaining message cap can absorb + its full size. The first bound to be exhausted stops the walk. + + Token cost per message is an overestimate based on JSON-serialized char + count divided by ~3 (see ``_estimate_msg_tokens``). This is intentional: + for budget enforcement, overestimating is safe. + + Args: + messages: Conversation list in OpenAI format. + token_budget: Soft upper bound on cumulative estimated tokens. + Values ``<= 0`` return ``[]``. + message_cap: Optional secondary bound on entry count. When set, the + walk stops as soon as either bound is exhausted. + + Returns: + A new list of recent messages within the budget(s), with all + tool-call pairs intact. + """ + if token_budget <= 0 or not messages: + return [] + if message_cap is not None and message_cap <= 0: + return [] + + tok_remaining = [token_budget] + msg_remaining = [message_cap if message_cap is not None else len(messages) + 1] + + def budgets_ok(size: int, tok_cost: int) -> bool: + return size <= msg_remaining[0] and tok_cost <= tok_remaining[0] + + def consume(size: int, tok_cost: int) -> None: + msg_remaining[0] -= size + tok_remaining[0] -= tok_cost + + return _walk_blocks(messages, budgets_ok, consume) diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py index 934216758..c437b5b7d 100644 --- a/backend/tests/test_history_window.py +++ b/backend/tests/test_history_window.py @@ -1,11 +1,14 @@ """Unit tests for pair-aware conversation history truncation. -Validates that ``truncate_by_message_count`` preserves -``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces -orphan tool messages that would trigger the OpenAI #446 failure mode. +Validates that ``truncate_by_message_count`` and ``truncate_by_token_budget`` +preserve ``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never +produces orphan tool messages that would trigger the OpenAI #446 failure mode. 
""" -from app.services.history_window import truncate_by_message_count +from app.services.history_window import ( + truncate_by_message_count, + truncate_by_token_budget, +) # ── Helpers ───────────────────────────────────────────────────────────── @@ -258,6 +261,135 @@ def test_system_message_treated_as_normal_block(): assert _roles(out) == ["user", "assistant"] +# ── Token-budget mode ─────────────────────────────────────────────────── + + +def test_token_budget_empty_or_zero(): + assert truncate_by_token_budget([], 1000) == [] + assert truncate_by_token_budget([_u("hi")], 0) == [] + assert truncate_by_token_budget([_u("hi")], -10) == [] + + +def test_token_budget_short_messages_within_budget(): + msgs = [_u("hi"), _a("hello"), _u("ok")] + out = truncate_by_token_budget(msgs, 10000) + assert out == msgs + + +def test_token_budget_huge_message_dropped(): + """One enormous tool result should not push other messages out of context; + the huge block just doesn't fit and is dropped.""" + huge_payload = "x" * 60000 # ~20K tokens via chars/3 + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge_payload), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + # u2 fits (small). Pair is huge → doesn't fit → break. u1 not visited (after pair). + assert _roles(out) == ["user"] + assert out[0]["content"] == "u2" + + +def test_token_budget_preserves_pair_when_both_fit(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content="small result"), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + assert _roles(out) == ["user", "assistant", "tool", "user"] + + +def test_token_budget_drops_huge_tail_message_caller_must_protect(): + """The helper walks tail-to-head. If the LAST message (often the current + user input that just arrived) alone exceeds the budget, the walker can't + fit it and breaks — leaving NOTHING. Callers that pass the current user + message INTO truncation must hold it OUT and re-append afterward (see + api/websocket.py for the pattern). This test pins the contract so call + sites don't silently regress. + """ + msgs = [ + _u("history msg 1"), + _a("history msg 2"), + _u("x" * 60000), # current input — alone >> 5000 token budget + ] + out = truncate_by_token_budget(msgs, token_budget=5000) + # Walker visits the huge user msg first (tail), can't fit → BREAK. + # Nothing else gets a chance because the loop short-circuits on first miss. + assert out == [] + + +def test_token_budget_with_message_cap_message_wins(): + """100 small messages fit token budget but message_cap=10 binds first.""" + msgs = [_u(f"m{k}") for k in range(100)] + out = truncate_by_token_budget(msgs, 100000, message_cap=10) + assert len(out) == 10 + # Last 10 messages, in order + assert [m["content"] for m in out] == [f"m{k}" for k in range(90, 100)] + + +def test_token_budget_with_message_cap_token_wins(): + """20 fat messages, message_cap loose, token budget binds first.""" + msgs = [_u("x" * 1500) for _ in range(20)] # JSON-serialized ~1527 chars → ~509 tokens + out = truncate_by_token_budget(msgs, 2000, message_cap=100) + # Token budget binds first (well below message_cap=100). Exact count depends + # on JSON overhead but is small enough that several messages don't fit. 
+ assert 0 < len(out) < 10 + assert len(out) < len(msgs) + + +def test_token_budget_orphan_tool_dropped(): + msgs = [ + _t("ORPHAN_X", content="ghost"), + _u("u1"), + _a("ok"), + ] + out = truncate_by_token_budget(msgs, 100000) + # Orphan dropped regardless of budget + assert "tool" not in _roles(out) + + +def test_token_budget_drops_large_pair_atomically(): + """Pair size 2 with first half tiny but second half huge — drop both.""" + huge = "x" * 9000 # ~3000 tokens + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 1000) # fits u2 (~5 tokens), not pair + assert _roles(out) == ["user"] + + +def test_token_budget_zero_message_cap(): + msgs = [_u("hi")] + assert truncate_by_token_budget(msgs, 1000, message_cap=0) == [] + + +def test_token_budget_invariant_no_orphan_in_realistic_long_chat(): + """End-to-end with mixed message sizes — verify no orphan tool leaks.""" + msgs: list[dict] = [_u("start")] + for k in range(15): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result of call {k}")) + msgs.append(_u(f"followup {k}")) + out = truncate_by_token_budget(msgs, 1500, message_cap=20) + + # No orphan tool + assistant_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + assistant_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + assert m["tool_call_id"] in assistant_call_ids + + def test_realistic_long_conversation_truncation(): """End-to-end: simulate a long chat with many tool-call turns and ensure the output never has orphan tools.""" diff --git a/frontend/src/i18n/en.json b/frontend/src/i18n/en.json index f47375aa6..44972143a 100644 --- a/frontend/src/i18n/en.json +++ b/frontend/src/i18n/en.json @@ -421,7 +421,9 @@ "noFallback": "No fallback", "conversationContext": "Context Window", "maxRounds": "Context Window Size", - "roundsDesc": "Number of recent messages included as context for each LLM request", + "roundsDesc": "Maximum number of recent messages included as context for each LLM request", + "maxTokens": "Context Token Budget", + "tokensDesc": "Maximum tokens of history per LLM request. Used together with the message-count cap; whichever bound hits first wins. 
Protects against a single large tool result blowing out context.", "tokenLimits": "Token Limits", "dailyLimit": "Daily Limit", "monthlyLimit": "Monthly Limit", diff --git a/frontend/src/i18n/zh.json b/frontend/src/i18n/zh.json index 77446d99d..cc861e462 100644 --- a/frontend/src/i18n/zh.json +++ b/frontend/src/i18n/zh.json @@ -428,7 +428,9 @@ "noFallback": "无备选", "conversationContext": "上下文窗口", "maxRounds": "上下文窗口大小", - "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量", + "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量(条数上限)", + "maxTokens": "上下文 token 预算", + "tokensDesc": "每次 LLM 请求时携带的历史 token 上限。与上下文窗口大小(条数)取较小值,防止单次大工具结果撑爆 context", "tokenLimits": "Token 用量限制", "dailyLimit": "每日上限", "monthlyLimit": "每月上限", diff --git a/frontend/src/pages/AgentDetail.tsx b/frontend/src/pages/AgentDetail.tsx index 7e8dd59c0..2d16b403f 100644 --- a/frontend/src/pages/AgentDetail.tsx +++ b/frontend/src/pages/AgentDetail.tsx @@ -2214,6 +2214,7 @@ function AgentDetailInner() { primary_model_id: '', fallback_model_id: '', context_window_size: 100, + context_window_tokens: 50000, max_tool_rounds: 50, max_tokens_per_day: '' as string | number, max_tokens_per_month: '' as string | number, @@ -2233,6 +2234,7 @@ function AgentDetailInner() { primary_model_id: agent.primary_model_id || '', fallback_model_id: agent.fallback_model_id || '', context_window_size: agent.context_window_size ?? 100, + context_window_tokens: (agent as any).context_window_tokens ?? 50000, max_tool_rounds: (agent as any).max_tool_rounds ?? 50, max_tokens_per_day: agent.max_tokens_per_day || '', max_tokens_per_month: agent.max_tokens_per_month || '', @@ -5741,6 +5743,7 @@ function AgentDetailInner() { settingsForm.primary_model_id !== (agent?.primary_model_id || '') || settingsForm.fallback_model_id !== (agent?.fallback_model_id || '') || settingsForm.context_window_size !== (agent?.context_window_size ?? 100) || + settingsForm.context_window_tokens !== ((agent as any)?.context_window_tokens ?? 50000) || settingsForm.max_tool_rounds !== ((agent as any)?.max_tool_rounds ?? 50) || String(settingsForm.max_tokens_per_day) !== String(agent?.max_tokens_per_day || '') || String(settingsForm.max_tokens_per_month) !== String(agent?.max_tokens_per_month || '') || @@ -5757,6 +5760,7 @@ function AgentDetailInner() { primary_model_id: settingsForm.primary_model_id || null, fallback_model_id: settingsForm.fallback_model_id || null, context_window_size: settingsForm.context_window_size, + context_window_tokens: settingsForm.context_window_tokens, max_tool_rounds: settingsForm.max_tool_rounds, max_tokens_per_day: settingsForm.max_tokens_per_day ? Number(settingsForm.max_tokens_per_day) : null, max_tokens_per_month: settingsForm.max_tokens_per_month ? Number(settingsForm.max_tokens_per_month) : null, @@ -5885,6 +5889,20 @@ function AgentDetailInner() { />
                 <div>{t('agent.settings.roundsDesc')}</div>
+
+              <div>
+                <label>{t('agent.settings.maxTokens')}</label>
+                <input
+                  type="number"
+                  min={1000}
+                  max={500000}
+                  value={settingsForm.context_window_tokens}
+                  onChange={e => setSettingsForm(f => ({ ...f, context_window_tokens: Math.max(1000, Math.min(500000, parseInt(e.target.value) || 50000)) }))}
+                  style={{ width: '120px' }}
+                />
+                <div>{t('agent.settings.tokensDesc')}</div>
+              </div>
+
                 {/* Max Tool Call Rounds */}
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index e7138f051..e29f740c8 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -36,6 +36,7 @@ export interface Agent {
   last_heartbeat_at?: string;
   timezone?: string;
   context_window_size?: number;
+  context_window_tokens?: number;
   agent_type?: 'native' | 'openclaw';
   openclaw_last_seen?: string;
   unread_count?: number;

From b01c0f123b3ef44125119c7c968a4ebe2fef34d1 Mon Sep 17 00:00:00 2001
From: zhongyua
Date: Mon, 27 Apr 2026 13:55:37 +0800
Subject: [PATCH 3/3] feat(tools): truncate large tool results with workspace spill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tool calls returning ~50 KB of search hits or PDF extracts used to enter
the in-context history verbatim, burning tokens for content the model
samples one slice of. New `services/tool_result_truncation.py` spills
oversized payloads to `agent_data/<agent_id>/_tool_results/<call_id>.txt`
and replaces the in-context body with a head excerpt + a marker pointing
the model at `read_file` for retrieval.

Threshold: 4000 estimated tokens (~12 KB at chars/3) — soft-start to
give the model time to learn the read_file follow-up; can tighten to
~2000 once telemetry confirms reliable use of the marker.

Smart head:
- JSON-shape responses with a known array key (results/items/data/
  entries/hits/documents) keep metadata + first 5 items + a synthetic
  _truncated_<key>_count field, so search/list responses stay useful.
- Otherwise plain head-cut (preferring a line boundary when one is
  near the cut point).
- Vision-injected list payloads (image_data markers) pass through.

Sandbox: spill files live under each agent's existing AGENT_DATA_DIR
sandbox; cross-agent leak is prevented by the existing read_file
boundary.

Applied at three sites to prevent drift:
- llm/caller.py _process_tool_call (call_llm path)
- llm/caller.py call_agent_llm_with_tools inner loop
- services/heartbeat.py tool loop

agent_context.py rule 3 augmented with a paragraph teaching the model
to recognize the marker and use read_file for full content rather than
fabricating from prior knowledge.

16 new unit tests covering pass-through, spill, utf-8 round-trip,
threshold boundary, JSON-shape preservation, write-failure fallback,
and a realistic 50-result jina_search end-to-end. 41/41 total pass.

Note: 3 tool loops still exist (caller.py:432, caller.py:763,
heartbeat.py:335). Drift prevention is partial here — PR 3 consolidates
them into a single execute_tool_calls helper.

Co-Authored-By: Claude Opus 4.7
---
 backend/app/services/agent_context.py        |  12 +
 backend/app/services/heartbeat.py            |  12 +-
 backend/app/services/llm/caller.py           |  36 +-
 .../app/services/tool_result_truncation.py   | 216 ++++++++++++
 backend/tests/test_tool_result_truncation.py | 311 ++++++++++++++++++
 5 files changed, 580 insertions(+), 7 deletions(-)
 create mode 100644 backend/app/services/tool_result_truncation.py
 create mode 100644 backend/tests/test_tool_result_truncation.py

diff --git a/backend/app/services/agent_context.py b/backend/app/services/agent_context.py
index 74edb28df..1da003f59 100644
--- a/backend/app/services/agent_context.py
+++ b/backend/app/services/agent_context.py
@@ -438,6 +438,18 @@ async def build_agent_context(agent_id: uuid.UUID, agent_name: str, role_descrip
 3. **NEVER fabricate file contents or tool results from memory.** Even if you saw a file before, you MUST call the tool again to get current data. 
+
+   **Handling truncated tool results:** When a tool returns a large payload, the system
+   may truncate the in-context view and save the full output to your workspace. You will
+   see a marker like:
+
+   `[truncated. Full output (12453 tokens) saved to _tool_results/<call_id>.txt under your
+   workspace — use the read_file tool to retrieve specific sections]`
+
+   When you see this marker, the head excerpt above it is enough for an overview. If you
+   need the full content (specific search hit, page from a PDF, item N of a list), call
+   `read_file` with the path shown in the marker. Do NOT fabricate the missing content
+   from your prior knowledge — the saved file is the ground truth.
+
 4. **Use `write_file` to update memory/memory.md with important information.**
 
 5. **Use `write_file` to update focus.md with your current focus items.**
diff --git a/backend/app/services/heartbeat.py b/backend/app/services/heartbeat.py
index f3331ab4f..b7784e84f 100644
--- a/backend/app/services/heartbeat.py
+++ b/backend/app/services/heartbeat.py
@@ -370,10 +370,20 @@ async def _execute_heartbeat(agent_id: uuid.UUID):
                     else:
                         tool_result = await execute_tool(tool_name, args, agent_id, agent_creator_id)
 
+                    # Spill oversized tool results to disk; keep in-context bounded
+                    _hb_content: str | list = str(tool_result)
+                    if agent_id:
+                        from pathlib import Path as _HbPath
+                        from app.config import get_settings as _hb_get_settings
+                        from app.services.tool_result_truncation import maybe_truncate_tool_result as _hb_trunc
+                        _hb_ws = _HbPath(_hb_get_settings().AGENT_DATA_DIR) / str(agent_id)
+                        _hb_content = _hb_trunc(
+                            _hb_content, call_id=tc["id"], agent_workspace=_hb_ws,
+                        )
                     llm_messages.append(LLMMessage(
                         role="tool",
                         tool_call_id=tc["id"],
-                        content=str(tool_result),
+                        content=_hb_content,
                     ))
                 else:
                     reply = response.content or ""
diff --git a/backend/app/services/llm/caller.py b/backend/app/services/llm/caller.py
index f9cfbc93c..9edddfadc 100644
--- a/backend/app/services/llm/caller.py
+++ b/backend/app/services/llm/caller.py
@@ -241,14 +241,20 @@ async def _process_tool_call(
     )
     logger.debug(f"[LLM] Tool result: {result[:100]}")
 
+    # Resolve agent workspace path once — needed by vision injection and
+    # tool-result truncation below. Path resolution is sandbox-relevant, so
+    # keep it tied to AGENT_DATA_DIR / str(agent_id).
+    ws_path: Path | None = None
+    if agent_id:
+        from app.config import get_settings
+        settings = get_settings()
+        ws_path = Path(settings.AGENT_DATA_DIR) / str(agent_id)
+
     # ── Vision injection for screenshot tools ──
     tool_content: str | list = str(result)
-    if supports_vision and agent_id:
+    if supports_vision and ws_path is not None:
         try:
             from app.services.vision_inject import try_inject_screenshot_vision
-            from app.config import get_settings
-            settings = get_settings()
-            ws_path = Path(settings.AGENT_DATA_DIR) / str(agent_id)
             vision_content = try_inject_screenshot_vision(tool_name, str(result), ws_path)
             if vision_content:
                 tool_content = vision_content
@@ -256,6 +262,15 @@ async def _process_tool_call(
         except Exception as e:
             logger.warning(f"[LLM] Vision injection failed for {tool_name}: {e}")
 
+    # ── Tool-result truncation: spill oversized payloads to disk ──
+    # Vision-injected list payloads pass through unchanged; large strings get
+    # head-excerpted with a marker pointing the model at _tool_results/<call_id>.txt.
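+    # Sketch of the resulting in-context shape (marker wording as taught in
+    # agent_context.py; the token count is illustrative):
+    #   "<head excerpt>\n\n[truncated. Full output (16667 tokens) saved to
+    #   _tool_results/<call_id>.txt under your workspace — use the read_file
+    #   tool to retrieve specific sections]"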
+    if ws_path is not None:
+        from app.services.tool_result_truncation import maybe_truncate_tool_result
+        tool_content = maybe_truncate_tool_result(
+            tool_content, call_id=tc["id"], agent_workspace=ws_path,
+        )
+
     # Notify client about tool call result
     if on_tool_call:
         try:
@@ -269,7 +284,7 @@
             })
         except Exception:
             pass
-
+
     api_messages.append(LLMMessage(
         role="tool",
         tool_call_id=tc["id"],
@@ -771,10 +786,19 @@ async def _try_model(model: LLMModel) -> tuple[str, bool, bool]:
                 user_id=agent.creator_id,
                 session_id=session_id,
             )
+            # Spill oversized tool results to disk; keep in-context bounded
+            _tool_content: str | list = str(result)
+            if agent_id:
+                from app.config import get_settings
+                from app.services.tool_result_truncation import maybe_truncate_tool_result
+                _ws = Path(get_settings().AGENT_DATA_DIR) / str(agent_id)
+                _tool_content = maybe_truncate_tool_result(
+                    _tool_content, call_id=tc["id"], agent_workspace=_ws,
+                )
             api_messages.append(LLMMessage(
                 role="tool",
                 tool_call_id=tc["id"],
-                content=str(result),
+                content=_tool_content,
             ))

     if agent_id and _accumulated_tokens > 0:
diff --git a/backend/app/services/tool_result_truncation.py b/backend/app/services/tool_result_truncation.py
new file mode 100644
index 000000000..52b679237
--- /dev/null
+++ b/backend/app/services/tool_result_truncation.py
@@ -0,0 +1,216 @@
+"""Truncate large tool results to disk with an in-context pointer.
+
+When a tool returns ~50 KB of jina_search hits or a 30-page PDF extract,
+sending it verbatim into the LLM history burns tokens for content the model
+will only sample one slice of. This module spills oversized payloads to the
+agent's workspace and replaces the in-context body with a head excerpt plus
+a ``[truncated. Full output saved to ...]`` marker. The model is taught (in
+the system prompt) to follow up with ``read_file`` for specific sections.
+
+Why a "smart" head:
+  - Naive ``content[:N]`` works for prose but ruins JSON-shape responses
+    where the head is metadata (``query``, ``total_results``) and the
+    payload is in a ``results``/``items``/``data`` array.
+  - We try ``json.loads`` first; if the content is a dict with one of the
+    well-known array keys, we keep the metadata and the first few items.
+    Otherwise fall back to head-cut.
+
+What's intentionally not handled:
+  - Vision injection payloads (``list`` content) pass through unchanged.
+    Truncating image_data markers would corrupt the multimodal block.
+  - DB persistence already truncates to 500 chars (websocket.py +
+    agent_tools.py + trigger_daemon.py). This module operates on the
+    in-flight ``api_messages`` list, not the DB form.
+  - Sandbox boundary: ``_tool_results/<call_id>.txt`` lives under the
+    agent's ``AGENT_DATA_DIR / str(agent_id)`` workspace — read_file
+    already enforces this boundary, so cross-agent leaks are prevented by
+    the existing tool-level sandbox.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+from loguru import logger
+
+from app.services.token_tracker import estimate_tokens_from_chars
+
+# Conservative initial threshold: 4000 estimated tokens (~12 KB at chars/3).
+# Soft-start value to give the model time to learn the read_file follow-up
+# pattern; can be tightened to ~2000 once telemetry confirms the model uses
+# the marker reliably.
+TOOL_RESULT_TOKEN_THRESHOLD = 4000
+
+# Allowed characters for a call_id used as a filename. Anthropic returns
+# ``toolu_<alnum>``; OpenAI returns ``call_<alnum>``; clawith synthesizes
+# ``call_<uuid>``. All three fit this set. Anything else (e.g. a
+# prompt-injected ``../../etc/passwd``) gets slugged before reaching the
+# filesystem.
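+#
+# Illustrative slugging behavior (the hostile inputs are hypothetical;
+# outputs follow from _safe_call_id below):
+#   "toolu_01ABC123def456GHI789jkl"  →  unchanged (matches the regex)
+#   "../../../etc/passwd"            →  "_________etc_passwd"
+#   ""                               →  "unknown"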
+_SAFE_CALL_ID_RE = re.compile(r"^[A-Za-z0-9_-]{1,128}$")
+
+
+def _safe_call_id(call_id: str) -> str:
+    """Return a filesystem-safe version of ``call_id``.
+
+    Provider-issued call_ids already match ``_SAFE_CALL_ID_RE`` and pass
+    through unchanged. Hostile or malformed inputs (path separators, unicode,
+    arbitrarily long) are replaced char-by-char and length-capped — defense
+    in depth against a prompt-injection scenario where the model is coaxed
+    into emitting a path-traversing ``tool_call_id``.
+    """
+    if not call_id:
+        return "unknown"
+    if _SAFE_CALL_ID_RE.match(call_id):
+        return call_id
+    slugged = re.sub(r"[^A-Za-z0-9_-]", "_", str(call_id))[:128]
+    return slugged or "unknown"
+
+
+# Hard char limit for the in-context head excerpt. Keeps the in-context
+# payload small even for unusual content (e.g. one giant JSON string).
+_HEAD_MAX_CHARS = TOOL_RESULT_TOKEN_THRESHOLD * 3  # ~12 KB
+
+# JSON keys that conventionally hold the array payload (the "results" of a
+# search, the "items" of a list endpoint, etc). Order matters — first hit wins.
+_KNOWN_ARRAY_KEYS = ("results", "items", "data", "entries", "hits", "documents")
+
+# How many array elements to keep when smart-heading a JSON-shape response.
+_KEEP_ARRAY_ITEMS = 5
+
+
+def maybe_truncate_tool_result(
+    tool_content: str | list,
+    *,
+    call_id: str,
+    agent_workspace: Path,
+) -> str | list:
+    """Return ``tool_content`` shortened with a marker, or unchanged.
+
+    Args:
+        tool_content: The raw tool result. ``str`` is the common case;
+            ``list`` is multimodal vision-injected content (passed through).
+        call_id: Tool call ID — used as the spill filename.
+        agent_workspace: Path to the agent's workspace dir
+            (``AGENT_DATA_DIR / str(agent_id)``). Spill goes under
+            ``<workspace>/_tool_results/<call_id>.txt``.
+
+    Returns:
+        The original content if under threshold, or a head excerpt + marker
+        string. List payloads always pass through.
+    """
+    # Multimodal content: image_data markers must stay structurally intact.
+    if isinstance(tool_content, list):
+        return tool_content
+
+    if not isinstance(tool_content, str):
+        # Defensive: unknown payload type — coerce and continue.
+        tool_content = str(tool_content)
+
+    est_tokens = estimate_tokens_from_chars(len(tool_content))
+    if est_tokens <= TOOL_RESULT_TOKEN_THRESHOLD:
+        return tool_content
+
+    # Spill full content to disk. Sanitize the call_id and assert containment
+    # so a hostile/malformed call_id can never write outside _tool_results/.
+    safe_id = _safe_call_id(call_id)
+    spill_root = (agent_workspace / "_tool_results").resolve()
+    full_path = (spill_root / f"{safe_id}.txt").resolve()
+    if not _is_within(full_path, spill_root):
+        # Should be unreachable after slugging — defense in depth.
+        logger.error(
+            f"[tool-truncation] Refusing to spill {call_id!r} → {full_path} "
+            f"(outside {spill_root}); falling back to inline head-cut."
+        )
+        head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+        return (
+            head
+            + f"\n\n[truncated. Full output ({est_tokens} tokens) "
+            f"could not be spilled to disk — only this excerpt is available]"
+        )
+
+    try:
+        spill_root.mkdir(parents=True, exist_ok=True)
+        full_path.write_text(tool_content, encoding="utf-8")
+    except OSError as e:
+        logger.warning(
+            f"[tool-truncation] Failed to spill {safe_id} to {full_path}: {e}; "
+            "falling back to inline head-cut without spill."
+        )
+        head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+        return (
+            head
+            + f"\n\n[truncated. Full output ({est_tokens} tokens) "
+            f"could not be spilled to disk — only this excerpt is available]"
+        )
+
+    head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+    return (
+        head
+        + f"\n\n[truncated. Full output ({est_tokens} tokens) saved to "
+        f"_tool_results/{safe_id}.txt under your workspace — use the read_file "
+        f"tool to retrieve specific sections]"
+    )
+
+
+def _is_within(path: Path, root: Path) -> bool:
+    """True iff ``path`` is the same as ``root`` or nested below it.
+
+    Both paths are expected to already be ``.resolve()``-d by the caller.
+    """
+    try:
+        path.relative_to(root)
+        return True
+    except ValueError:
+        return False
+
+
+def _smart_head(content: str, max_chars: int) -> str:
+    """Best-effort excerpt that preserves structure when possible.
+
+    JSON-shape: if the content parses as a dict with a known array key
+    (``results``/``items``/``data``/...), keep the metadata + first 5 items.
+    Otherwise fall back to a plain head cut.
+    """
+    if len(content) <= max_chars:
+        return content
+
+    # Try JSON-shape preservation
+    stripped = content.lstrip()
+    if stripped.startswith("{"):
+        try:
+            data: dict[str, Any] = json.loads(content)
+        except (json.JSONDecodeError, ValueError):
+            pass
+        else:
+            if isinstance(data, dict):
+                truncated = _truncate_json_dict(data)
+                if truncated is not None:
+                    rendered = json.dumps(truncated, ensure_ascii=False, indent=2)
+                    if len(rendered) <= max_chars:
+                        return rendered
+                    # Even truncated JSON exceeds head budget → fall through.
+
+    # Plain head cut, preferring a line boundary when one is near the cut point
+    cut = content[:max_chars]
+    last_newline = cut.rfind("\n")
+    if last_newline > max_chars * 0.8:
+        cut = cut[:last_newline]
+    return cut
+
+
+def _truncate_json_dict(data: dict[str, Any]) -> dict[str, Any] | None:
+    """If ``data`` has a known array key, return a dict with the metadata
+    preserved and that array trimmed to ``_KEEP_ARRAY_ITEMS`` entries.
+    Returns None if no known array shape is detected.
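+
+    Illustration (values chosen for brevity; ``_KEEP_ARRAY_ITEMS`` is 5):
+
+    >>> _truncate_json_dict({"query": "q", "results": list(range(8))})
+    {'query': 'q', 'results': [0, 1, 2, 3, 4], '_truncated_results_count': 3}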
+ """ + for key in _KNOWN_ARRAY_KEYS: + value = data.get(key) + if isinstance(value, list) and len(value) > _KEEP_ARRAY_ITEMS: + trimmed = dict(data) + kept = value[:_KEEP_ARRAY_ITEMS] + trimmed[key] = kept + trimmed[f"_truncated_{key}_count"] = len(value) - _KEEP_ARRAY_ITEMS + return trimmed + return None diff --git a/backend/tests/test_tool_result_truncation.py b/backend/tests/test_tool_result_truncation.py new file mode 100644 index 000000000..e60e08180 --- /dev/null +++ b/backend/tests/test_tool_result_truncation.py @@ -0,0 +1,311 @@ +"""Unit tests for tool result truncation + workspace spill.""" + +import json + +from app.services.tool_result_truncation import ( + TOOL_RESULT_TOKEN_THRESHOLD, + _safe_call_id, + _smart_head, + _truncate_json_dict, + maybe_truncate_tool_result, +) + + +# ── Pass-through cases ────────────────────────────────────────────────── + + +def test_short_string_unchanged(tmp_path): + content = "tiny result" + out = maybe_truncate_tool_result(content, call_id="c1", agent_workspace=tmp_path) + assert out == content + # No spill file created + assert not (tmp_path / "_tool_results").exists() + + +def test_list_payload_passes_through(tmp_path): + """Vision-injected multimodal content must not be touched.""" + payload = [ + {"type": "text", "text": "see image:"}, + {"type": "image", "source": {"type": "base64", "data": "..."}}, + ] + out = maybe_truncate_tool_result(payload, call_id="c1", agent_workspace=tmp_path) + assert out is payload # same object + + +def test_non_string_non_list_coerced(tmp_path): + """Defensive: dict payload (shouldn't happen) is coerced to str.""" + out = maybe_truncate_tool_result( + {"key": "value"}, # type: ignore[arg-type] + call_id="c1", + agent_workspace=tmp_path, + ) + # Coerced to str then passed through (small enough) + assert isinstance(out, str) + + +# ── Spill behavior ────────────────────────────────────────────────────── + + +def _huge_payload(chars: int = 60000) -> str: + return "x" * chars + + +def test_huge_payload_spilled_and_truncated(tmp_path): + payload = _huge_payload(60000) + out = maybe_truncate_tool_result( + payload, call_id="abc-123", agent_workspace=tmp_path + ) + # In-context is now much shorter than original + assert isinstance(out, str) + assert len(out) < len(payload) + # Marker present + assert "[truncated. 
Full output" in out + assert "_tool_results/abc-123.txt" in out + assert "use the read_file tool" in out + # Spill file written with full payload + spill = tmp_path / "_tool_results" / "abc-123.txt" + assert spill.exists() + assert spill.read_text(encoding="utf-8") == payload + + +def test_spill_path_uses_utf8(tmp_path): + """Mixed CJK + emoji payload should round-trip through utf-8.""" + payload = ("中文测试 🎉 " + "x" * 30000) + out = maybe_truncate_tool_result( + payload, call_id="cjk", agent_workspace=tmp_path + ) + spill = tmp_path / "_tool_results" / "cjk.txt" + assert spill.exists() + assert spill.read_text(encoding="utf-8") == payload + # Marker still present + assert "[truncated" in out + + +def test_spill_failure_returns_inline_marker(tmp_path): + """If we can't write to disk, still truncate but mark as unrecoverable.""" + # Use a workspace path that can't be written (a file, not a dir) + blocker = tmp_path / "blocker" + blocker.write_text("this is a file, not a dir") + payload = _huge_payload(40000) + out = maybe_truncate_tool_result( + payload, call_id="fail", agent_workspace=blocker + ) + assert isinstance(out, str) + assert "could not be spilled to disk" in out + + +def test_at_threshold_boundary(tmp_path): + """Just below threshold passes through; just above triggers spill.""" + chars_at_threshold = TOOL_RESULT_TOKEN_THRESHOLD * 3 # chars/3 ratio + # Below threshold (slightly) + below = "y" * (chars_at_threshold - 100) + out = maybe_truncate_tool_result(below, call_id="b", agent_workspace=tmp_path) + assert out == below + + # Above threshold + above = "y" * (chars_at_threshold + 1000) + out2 = maybe_truncate_tool_result(above, call_id="a", agent_workspace=tmp_path) + assert "[truncated" in out2 + + +# ── Smart head: JSON-shape preservation ───────────────────────────────── + + +def test_smart_head_preserves_results_array(): + """jina_search-style response: keep metadata + first 5 items.""" + # Each result is bulky enough that full payload exceeds max_chars. + bulky_snippet = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
" * 5 + payload = json.dumps({ + "query": "what is rust", + "total_results": 50, + "results": [ + {"url": f"https://ex.com/{i}", "title": f"Result {i}", "snippet": bulky_snippet} + for i in range(50) + ], + }) + assert len(payload) > 5000 # sanity: max_chars below forces truncation + out = _smart_head(payload, max_chars=5000) + parsed = json.loads(out) + assert parsed["query"] == "what is rust" + assert parsed["total_results"] == 50 + assert len(parsed["results"]) == 5 + assert parsed["_truncated_results_count"] == 45 + + +def test_smart_head_recognizes_items_key(): + bulky_item = "x" * 200 + payload = json.dumps({ + "items": [f"{bulky_item}-{i}" for i in range(30)], + "next_cursor": "abc", + }) + assert len(payload) > 2000 # sanity + out = _smart_head(payload, max_chars=2000) + parsed = json.loads(out) + assert len(parsed["items"]) == 5 + assert parsed["next_cursor"] == "abc" + + +def test_smart_head_falls_back_for_non_array_json(): + """JSON without a known array key falls back to head-cut.""" + payload = json.dumps({"big_field": "x" * 10000, "other": "y"}) + out = _smart_head(payload, max_chars=2000) + # Either trimmed JSON dict or plain head-cut, but length ≤ 2000 + assert len(out) <= 2000 + + +def test_smart_head_plain_text_head_cut(): + payload = "line1\n" + ("plain text content " * 1000) + out = _smart_head(payload, max_chars=500) + assert len(out) <= 500 + # Head-cut should prefer line boundary if available + assert out.startswith("line1") + + +def test_truncate_json_dict_no_array_key_returns_none(): + """Helper returns None when no recognizable array shape.""" + assert _truncate_json_dict({"foo": "bar", "baz": 123}) is None + + +def test_truncate_json_dict_short_array_returns_none(): + """Don't bother truncating a 3-item array.""" + assert _truncate_json_dict({"results": [1, 2, 3]}) is None + + +def test_truncate_json_dict_first_known_key_wins(): + """If multiple known keys present, first match wins (results > items).""" + data = {"results": list(range(20)), "items": list(range(30))} + out = _truncate_json_dict(data) + assert out is not None + assert "_truncated_results_count" in out + assert "_truncated_items_count" not in out # items left untouched + + +# ── End-to-end integration ────────────────────────────────────────────── + + +def test_realistic_jina_search_response_truncated_to_useful_excerpt(tmp_path): + """A 50-result jina_search dump fits into the in-context head with + enough metadata + samples for the model to decide whether to read_file.""" + full_response = json.dumps({ + "query": "best practices for python async", + "total_results": 50, + "search_time_ms": 142, + "results": [ + { + "url": f"https://example.com/article-{i}", + "title": f"Article {i} about async", + "snippet": "Lorem ipsum dolor sit amet " * 30, + } + for i in range(50) + ], + }) + + out = maybe_truncate_tool_result( + full_response, call_id="search-1", agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # Full content preserved on disk + spill = tmp_path / "_tool_results" / "search-1.txt" + assert spill.read_text(encoding="utf-8") == full_response + + # Model can see metadata + sample + assert "best practices for python async" in out + assert "total_results" in out + assert "search-1.txt" in out # marker tells model where the rest is + + +def test_marker_includes_token_count(tmp_path): + payload = "z" * 30000 # ~10000 tokens + out = maybe_truncate_tool_result( + payload, call_id="big", agent_workspace=tmp_path + ) + # Marker should mention the original token count + assert "tokens" in out 
+ # The number itself appears (10000) + import re + m = re.search(r"\((\d+)\s+tokens\)", out) + assert m is not None + assert int(m.group(1)) > 5000 + + +# ── Sandbox safety: call_id sanitization ──────────────────────────────── + + +def test_safe_call_id_passes_provider_format(): + """Real Anthropic / OpenAI / synthetic IDs all pass through unchanged.""" + for cid in ( + "toolu_01ABC123def456GHI789jkl", + "call_f6db199d-c470-4bb3-8188-1c9eeb43cc60", + "call_msg_uuid_v4", + "abc-123_XYZ", + ): + assert _safe_call_id(cid) == cid, f"Should pass through: {cid!r}" + + +def test_safe_call_id_slugs_path_traversal(): + """A prompt-injected ../-style call_id gets neutralized.""" + assert _safe_call_id("../../../etc/passwd") != "../../../etc/passwd" + # No path separator survives + out = _safe_call_id("../../../etc/passwd") + assert "/" not in out + assert "\\" not in out + assert ".." not in out or out.replace(".", "_") == out + + +def test_safe_call_id_handles_empty_and_unicode(): + assert _safe_call_id("") == "unknown" + assert _safe_call_id(None) == "unknown" # type: ignore[arg-type] + out = _safe_call_id("中文 with spaces!") + # Unicode and special chars get replaced with underscore + assert all(c.isalnum() or c in "_-" for c in out) + + +def test_safe_call_id_caps_length(): + """Pathological 10K-char call_id is capped at 128.""" + huge = "a" * 10_000 + out = _safe_call_id(huge) + assert len(out) <= 128 + + +def test_path_traversal_call_id_writes_inside_sandbox(tmp_path): + """Hostile call_id like '../../foo' must NOT escape _tool_results/.""" + payload = "x" * 60000 # forces spill + hostile_id = "../../../../../etc/passwd" + + out = maybe_truncate_tool_result( + payload, call_id=hostile_id, agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # Truncation marker still appears with the SLUGGED name (not the hostile one) + assert "[truncated" in out + assert "etc/passwd" not in out # original hostile path doesn't surface + + # Walk the workspace and verify NO file got written outside _tool_results/ + spill_root = tmp_path / "_tool_results" + for path in tmp_path.rglob("*"): + if path.is_file(): + assert spill_root in path.resolve().parents or path.resolve() == spill_root, ( + f"File leaked outside _tool_results/: {path}" + ) + + +def test_call_id_with_separator_in_filename(tmp_path): + """Path separator in call_id should not create subdirectories under + _tool_results/.""" + payload = "y" * 60000 + out = maybe_truncate_tool_result( + payload, call_id="evil/sub/path", agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # No nested directory should appear under _tool_results/ + spill_root = tmp_path / "_tool_results" + nested_dirs = [p for p in spill_root.rglob("*") if p.is_dir()] + assert nested_dirs == [], f"Unexpected subdirectories: {nested_dirs}" + + # Exactly one .txt file at the top level of _tool_results/ + txt_files = list(spill_root.glob("*.txt")) + assert len(txt_files) == 1 + assert txt_files[0].read_text(encoding="utf-8") == payload
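
---

Usage note (illustrative, not part of the patch): all three call sites apply
the same pattern: resolve the agent workspace from AGENT_DATA_DIR, then pass
the raw tool result through maybe_truncate_tool_result before appending the
role="tool" message. A minimal sketch, assuming the module paths above and a
hypothetical tool-call dict `tc`:

    from pathlib import Path

    from app.config import get_settings
    from app.services.tool_result_truncation import maybe_truncate_tool_result

    def bounded_tool_content(result, tc: dict, agent_id) -> str | list:
        """Sketch: spill oversized results so in-context history stays bounded."""
        content: str | list = str(result)
        if agent_id:
            workspace = Path(get_settings().AGENT_DATA_DIR) / str(agent_id)
            content = maybe_truncate_tool_result(
                content, call_id=tc["id"], agent_workspace=workspace,
            )
        return content

PR 3's execute_tool_calls helper would collapse the three copies of this
pattern into one place.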