From 114be2465789aa29f3d6ee0e56da471a1fffeaa4 Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:06:56 +0800 Subject: [PATCH 1/3] fix(history): atomic pair-aware truncation for tool_call blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace naive `conversation[-ctx_size:]` slicing with a walker that treats assistant.tool_calls and its matching role="tool" messages as one atomic block. Naive slicing can leave an orphan role="tool" at the head when the cut lands mid-pair — OpenAI rejects this with "No tool call found for function call output" (issue #446). New helper `services/history_window.truncate_by_message_count` walks tail backward, identifies blocks, and includes blocks whole or not at all. Orphan tools are silently dropped regardless of budget. Replaces head-only pop guard in: - app/api/websocket.py (web chat) - app/api/feishu.py (feishu channel) Leaves app/services/llm/caller.py:626 untouched (call_agent_llm short-reply path's hardcoded [-10:] is intentional). Tests: 15 covering empty input, budget edges, parallel tool_calls, multi-pair, head/mid orphans, realistic 60-message invariant. Addresses #446 failure mode. Co-Authored-By: Claude Opus 4.7 --- backend/app/api/feishu.py | 9 +- backend/app/api/websocket.py | 11 +- backend/app/services/history_window.py | 166 ++++++++++++++ backend/tests/test_history_window.py | 288 +++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 backend/app/services/history_window.py create mode 100644 backend/tests/test_history_window.py diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index 00fe32737..ca4cef7c7 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,6 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service +from app.services.history_window import truncate_by_message_count router = APIRouter(tags=["feishu"]) @@ -1634,7 +1635,13 @@ async def _call_agent_llm( from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE if history: - messages.extend(_normalize_history_messages(history)[-ctx_size:]) + # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool + # pairs intact. Today _normalize_history_messages drops DB role="tool_call" + # rows, so this path has no tool messages and the helper acts as plain count + # truncation; the safety kicks in once a feishu reorganization helper exists. 
+ messages.extend( + truncate_by_message_count(_normalize_history_messages(history), ctx_size) + ) messages.append({"role": "user", "content": user_text}) # Use actual user_id so the system prompt knows who it's chatting with diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 82b297916..6f17ad1e5 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,6 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session +from app.services.history_window import truncate_by_message_count from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -662,10 +663,12 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # To prevent tool call message pairs(assistant + tool) from being broken down. - _truncated = conversation[-ctx_size:] - while _truncated and _truncated[0].get("role") == "tool": - _truncated.pop(0) + # Pair-aware truncation: keep the last `ctx_size` messages while + # preserving assistant.tool_calls ↔ role=tool blocks atomically. + # Naive [-ctx_size:] slicing can leave orphan tool messages at the + # head when the cut lands mid-pair, which OpenAI rejects with + # "No tool call found for function call output" (issue #446). + _truncated = truncate_by_message_count(conversation, ctx_size) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py new file mode 100644 index 000000000..54c0ce8a6 --- /dev/null +++ b/backend/app/services/history_window.py @@ -0,0 +1,166 @@ +"""Pair-aware conversation history truncation. + +Replaces naive ``conversation[-N:]`` slicing with a walker that keeps +``assistant.tool_calls`` and their matching ``role="tool"`` messages as an +atomic block — never half a pair, never orphan tool messages. + +Why: OpenAI Responses API and Chat Completions both reject input where a +``function_call_output`` / ``role="tool"`` message has no matching +``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive +``[-N:]`` slicing can leave such orphans at the head when the cut lands +between an assistant message and its tool results. This is the failure mode +reported in issue #446. + +Orphan detection is by ``tool_call_id`` matching, not by adjacency — a +tool message inserted between a valid pair and other messages (from +malformed persistence or upstream truncation) is dropped, not folded +into an adjacent block. This makes the helper robust against orphans +at any position, not just at the slice head. + +Input is expected to be in OpenAI chat-completion format (post-reorganization +from DB ``role="tool_call"`` rows). +""" + +from __future__ import annotations + +from typing import Any + + +def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: + """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has + no matching ``assistant.tool_calls`` earlier in the conversation. + + OpenAI rejects the request the moment a ``function_call_output`` is + sent without its matching ``function_call``, regardless of whether + that tool message is at the head, middle, or end. So orphan detection + is by ID matching, not by position. 
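+
+    A doctest-style sketch (hypothetical three-message history): index 0 is
+    an orphan because no assistant emits call id ``X`` before it, while the
+    ``Y`` pair matches.
+
+    >>> _identify_orphans([
+    ...     {"role": "tool", "tool_call_id": "X", "content": "?"},
+    ...     {"role": "assistant", "tool_calls": [{"id": "Y"}]},
+    ...     {"role": "tool", "tool_call_id": "Y", "content": "ok"},
+    ... ])
+    {0}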
+ """ + orphans: set[int] = set() + for i, msg in enumerate(messages): + if msg.get("role") != "tool": + continue + tcid = msg.get("tool_call_id") + if not tcid: + orphans.add(i) + continue + # Search backward for an assistant whose tool_calls contains this id. + # Walks past intervening user / system / other-assistant messages. + found = False + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + found = True + break + j -= 1 + if not found: + orphans.add(i) + return orphans + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool, non-tool-calling message (user / system / assistant text), or + - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` + message (identified by ``tool_call_id``, not adjacency). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages — those whose ``tool_call_id`` has no matching assistant — are + silently dropped regardless of budget. Sending them to OpenAI causes the + #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + + orphans = _identify_orphans(messages) + n = len(messages) + consumed: set[int] = set(orphans) # orphans drop unconditionally + blocks: list[set[int]] = [] # tail-to-head order + + for i in range(n - 1, -1, -1): + if i in consumed: + continue + msg = messages[i] + role = msg.get("role") + + if role == "tool": + # Find this tool's owning assistant by matching tool_call_id + tcid = msg.get("tool_call_id") + asst_idx = -1 + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + asst_idx = j + break + j -= 1 + if asst_idx < 0: + # Defensive — orphan detection should have caught this + consumed.add(i) + continue + # Block = assistant + ALL of its matching tool messages (siblings) + asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]} + block = {asst_idx} + for k in range(asst_idx + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + elif role == "assistant" and msg.get("tool_calls"): + # Encountered the assistant before any of its tools (e.g. tools + # were truncated upstream or are still in flight). Group with + # whatever matching tools follow it. + asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]} + block = {i} + for k in range(i + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + else: + consumed.add(i) + blocks.append({i}) + + # Walk blocks tail-to-head, taking until budget exhausted. 
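+    # Worked example (hypothetical history): [u1, a(tool_calls=[X]), t(X), u2]
+    # with max_messages=3 yields blocks [{3}, {1, 2}, {0}] tail-to-head; {3}
+    # fits (budget 2 left), pair {1, 2} fits (budget 0 left), {0} does not,
+    # so the result is [a, t(X), u2].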
+ keep: set[int] = set() + budget = max_messages + for block in blocks: + size = len(block) + if size <= budget: + keep |= block + budget -= size + else: + # Block doesn't fit — stop. Do NOT partial-include (would split pair). + break + + return [messages[k] for k in sorted(keep)] diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py new file mode 100644 index 000000000..934216758 --- /dev/null +++ b/backend/tests/test_history_window.py @@ -0,0 +1,288 @@ +"""Unit tests for pair-aware conversation history truncation. + +Validates that ``truncate_by_message_count`` preserves +``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces +orphan tool messages that would trigger the OpenAI #446 failure mode. +""" + +from app.services.history_window import truncate_by_message_count + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _u(text: str) -> dict: + return {"role": "user", "content": text} + + +def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict: + msg: dict = {"role": "assistant", "content": text} + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + +def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict: + return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}} + + +def _t(call_id: str, content: str = "ok") -> dict: + return {"role": "tool", "tool_call_id": call_id, "content": content} + + +def _roles(msgs: list[dict]) -> list[str]: + return [m.get("role", "?") for m in msgs] + + +# ── Edge cases ────────────────────────────────────────────────────────── + + +def test_empty_input_returns_empty(): + assert truncate_by_message_count([], 10) == [] + + +def test_zero_or_negative_budget_returns_empty(): + msgs = [_u("hi"), _u("there")] + assert truncate_by_message_count(msgs, 0) == [] + assert truncate_by_message_count(msgs, -5) == [] + + +def test_within_budget_returns_all(): + msgs = [_u("a"), _a("b"), _u("c")] + out = truncate_by_message_count(msgs, 10) + assert out == msgs + assert out is not msgs # new list + + +def test_input_not_mutated(): + msgs = [_u("a"), _a("b"), _u("c"), _u("d")] + snapshot = list(msgs) + truncate_by_message_count(msgs, 2) + assert msgs == snapshot + + +# ── Core pair-preservation behavior ───────────────────────────────────── + + +def test_keeps_assistant_tool_pair_intact(): + """Slicing must not split assistant.tool_calls from its tool result.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["assistant", "tool", "user"] + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[1]["tool_call_id"] == "X" + + +def test_drops_pair_entirely_when_budget_too_small(): + """If budget can't fit the whole pair, drop it — never half.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 2 — can't fit pair (needs 2) + final user, must drop pair + out = truncate_by_message_count(msgs, 2) + # Only the trailing user fits as a single block; pair (size 2) doesn't fit + # in remaining budget=1 after taking user. + assert _roles(out) == ["user"] + assert out[0]["content"] == "done?" 
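+
+
+def test_tool_message_missing_tool_call_id_is_orphan():
+    """Sketch pinning an edge the tests above don't cover: a role="tool"
+    entry with no tool_call_id at all can never match an assistant, so it
+    must be dropped like any other orphan."""
+    msgs = [_u("hi"), {"role": "tool", "content": "no id"}, _a("ok")]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]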
+
+
+def test_drops_orphan_tool_at_head():
+    """A role=tool with no preceding assistant.tool_calls is dropped."""
+    msgs = [
+        _t("X"),  # orphan — no assistant before
+        _u("hi"),
+        _a("ok"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_drops_orphan_tool_at_head_after_slicing():
+    """Slicing produces an orphan tool at head — must be dropped (the
+    classic #446 failure mode)."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),  # ← naive slice [-3:] would start here as orphan
+        _u("u2"),
+        _a("final"),
+    ]
+    # Budget 3: take from end. _a("final") block (budget → 2), _u("u2") block
+    # (budget → 1). t(X) resolves to its assistant, so (a+tc, t) is one atomic
+    # block of size 2; it exceeds the remaining budget of 1 and is dropped
+    # whole, never split. Result: [u("u2"), a("final")].
+    out = truncate_by_message_count(msgs, 3)
+    assert "tool" not in _roles(out)
+    # No orphan tool_call_id reaches output
+    for m in out:
+        if m.get("role") == "tool":
+            raise AssertionError(f"Orphan tool leaked: {m}")
+
+
+def test_multiple_parallel_tool_calls_in_one_assistant():
+    """Assistant with N tool_calls followed by N tools is one atomic block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
+    # Verify the pair came through whole
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[3]["tool_call_id"] == "Z"
+
+
+def test_parallel_tool_pair_dropped_if_too_big():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u].
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["user"]
+
+
+def test_multiple_pairs_some_drop():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("A")]),
+        _t("A"),
+        _u("u2"),
+        _a(None, tool_calls=[_tc("B")]),
+        _t("B"),
+        _u("u3"),
+    ]
+    # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take
+    # u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    assert out[1]["tool_calls"][0]["id"] == "B"
+    assert out[2]["tool_call_id"] == "B"
+
+
+def test_no_partial_pair_when_budget_exactly_one_short():
+    """Exactly one short of fitting a pair → drop the pair, don't include
+    just the assistant."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+    ]
+    # Budget 2: walk from end. t(X) resolves to a(tc=X), forming pair block
+    # {1, 2} of size 2; it fits exactly (budget → 0), and u1 (size 1) then
+    # does not. Output: [a+tc, t].
+    out = truncate_by_message_count(msgs, 2)
+    assert _roles(out) == ["assistant", "tool"]
+    # With budget 1 the pair (size 2) is tried first, doesn't fit, and the
+    # walk BREAKS; u1 never gets a chance. Output: [].
+    out2 = truncate_by_message_count(msgs, 1)
+    assert out2 == []
+
+
+def test_mid_orphan_tool_dropped():
+    """A tool whose tool_call_id has no matching assistant anywhere — defensive
+    drop. 
(Shouldn't happen with current persistence, but be robust.)""" + msgs = [ + _u("u1"), + _t("ORPHAN_X"), # malformed — no preceding assistant.tool_calls + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + # Orphan dropped + assert "tool" not in _roles(out) + assert _roles(out) == ["user", "user"] + + +def test_orphan_adjacent_to_valid_pair_still_dropped(): + """Orphan tool message inserted right after a legitimate tool-call pair + must be dropped — adjacency to a valid pair does not legitimize it. + + This is the bug class that triggers OpenAI #446 even when slice cut + boundaries would otherwise be safe: any orphan reaching the wire, + regardless of position, makes the request invalid.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("VALID")]), + _t("VALID", "real result"), + _t("ORPHAN_id", "ghost result"), # no assistant emits ORPHAN_id + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + + # The orphan must NOT survive — even though it's adjacent to a valid pair + orphan_present = any( + m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id" + for m in out + ) + assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped" + + # The valid pair survives intact + valid_assistant = any( + m.get("role") == "assistant" + and m.get("tool_calls") + and any(tc["id"] == "VALID" for tc in m["tool_calls"]) + for m in out + ) + valid_tool = any( + m.get("role") == "tool" and m.get("tool_call_id") == "VALID" + for m in out + ) + assert valid_assistant and valid_tool + + +def test_system_message_treated_as_normal_block(): + msgs = [ + {"role": "system", "content": "you are an agent"}, + _u("hi"), + _a("hello"), + ] + out = truncate_by_message_count(msgs, 2) + # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped. + assert _roles(out) == ["user", "assistant"] + + +def test_realistic_long_conversation_truncation(): + """End-to-end: simulate a long chat with many tool-call turns and ensure + the output never has orphan tools.""" + msgs: list[dict] = [_u("start")] + for k in range(20): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result {k}")) + msgs.append(_u(f"next {k}")) + msgs.append(_a("final answer")) + + # Truncate to 30 messages + out = truncate_by_message_count(msgs, 30) + + # Sanity: budget respected + assert len(out) <= 30 + + # Critical invariant: no orphan tool messages anywhere + seen_tool_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + seen_tool_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + tcid = m.get("tool_call_id") + assert tcid in seen_tool_call_ids, ( + f"Orphan tool {tcid!r} in output without matching assistant.tool_calls" + ) From 2a0040fef35e69706030afde273f7f43f80f5adc Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:32:48 +0800 Subject: [PATCH 2/3] feat(agent): token-aware history truncation alongside message cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `agents.context_window_tokens` field (default 50000) and a new `truncate_by_token_budget` helper that bounds in-context history by both estimated token cost (primary) and message count (safety cap), preserving assistant.tool_calls ↔ role=tool pairs intact via the same walker as truncate_by_message_count. 
Why: message-count alone is a wildly variable proxy for token cost — one 50KB tool result eats more context than 100 short user messages. Token budget gives predictable behavior across heterogeneous traffic; message cap remains as a safety net against pathological tiny-message floods. Changes: - models/agent.py: + context_window_tokens (Integer, default=50000) + DEFAULT_CONTEXT_WINDOW_TOKENS constant - schemas/schemas.py: AgentOut, AgentUpdate (1000 <= tokens <= 500000) - alembic: add_context_window_tokens.py (idempotent IF NOT EXISTS) - services/history_window.py: + truncate_by_token_budget, refactored common walker, JSON-serialized char->token estimate via existing estimate_tokens_from_chars (chars/3 — overestimates safely) - api/websocket.py: pass tok_budget to helper, raise DB load to max(ctx_size, 500) so helper has room to choose - api/feishu.py: same pattern at 2 sites (web chat + IM channel paths) - frontend: AgentDetail Settings slider + i18n + types 10 new tests covering token-budget mode (huge-message dropped, both-bounds interaction, atomic pair preservation, orphan defense). 25/25 pass. Other channels (dingtalk/discord/slack/teams/wecom/whatsapp) still use DB-level message-count limit only — they don't get token awareness in this PR but won't crash. Migrating them is follow-up scope. Co-Authored-By: Claude Opus 4.7 --- .../versions/add_context_window_tokens.py | 65 ++++++ backend/app/api/feishu.py | 23 ++- backend/app/api/websocket.py | 41 +++- backend/app/models/agent.py | 23 +++ backend/app/schemas/schemas.py | 2 + backend/app/services/history_window.py | 186 ++++++++++++++---- backend/tests/test_history_window.py | 140 ++++++++++++- frontend/src/i18n/en.json | 4 +- frontend/src/i18n/zh.json | 4 +- frontend/src/pages/AgentDetail.tsx | 18 ++ frontend/src/types/index.ts | 1 + 11 files changed, 442 insertions(+), 65 deletions(-) create mode 100644 backend/alembic/versions/add_context_window_tokens.py diff --git a/backend/alembic/versions/add_context_window_tokens.py b/backend/alembic/versions/add_context_window_tokens.py new file mode 100644 index 000000000..bf385417e --- /dev/null +++ b/backend/alembic/versions/add_context_window_tokens.py @@ -0,0 +1,65 @@ +"""add agents.context_window_tokens for token-aware history truncation + +Revision ID: add_context_window_tokens +Revises: rm_agent_credential_secrets +Create Date: 2026-04-27 +""" + +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = "add_context_window_tokens" +down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add context_window_tokens with a DDL default of 50000. + + The four-step pattern is required because earlier in the migration chain, + ``alembic/versions/0000_initial_schema.py`` calls + ``Base.metadata.create_all(checkfirst=True)``, which creates ``agents`` + from the *current* model state — including any new columns. SQLAlchemy's + Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause, + so the column ends up ``NOT NULL`` with no default, and a naive + ``ADD COLUMN IF NOT EXISTS ... DEFAULT 50000`` short-circuits and never + sets the default. 
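+
+    Concretely (a sketch of the hazard, assuming the create_all bootstrap ran
+    first): the column already exists, so ``ADD COLUMN IF NOT EXISTS ...`` is
+    a no-op and the DDL default is never applied; any later raw-SQL INSERT
+    that omits the column then fails with a NOT NULL violation.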
+ + This four-step approach is idempotent regardless of pre-existing state: + - column missing → created (nullable, no default initially) + - column present without default → default set + - any rows with NULL → backfilled to 50000 + - column made NOT NULL + + Re-runnable: ALTER SET DEFAULT to the same value is a no-op; UPDATE + affecting 0 rows is a no-op; ALTER SET NOT NULL on an already-NOT-NULL + column is a no-op. + """ + # 1. Add the column if missing — do NOT specify NOT NULL or DEFAULT here, + # so existing rows (if any from create_all) aren't blocked. + op.execute( + "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER" + ) + # 2. Ensure the DDL default is set so future inserts that omit this + # column (raw SQL, restored backups, manual migrations) get 50000. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000" + ) + # 3. Backfill any rows that were created before the default landed. + op.execute( + "UPDATE agents SET context_window_tokens = 50000 " + "WHERE context_window_tokens IS NULL" + ) + # 4. Now safe to enforce NOT NULL. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL" + ) + + +def downgrade() -> None: + # Downgrade omitted — dropping the column would lose per-tenant tuning. + pass diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index ca4cef7c7..02e21f5bf 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,7 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget router = APIRouter(tags=["feishu"]) @@ -657,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession ) _pre_sess = _pre_sess_r.scalar_one_or_none() _history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id + # Load extra raw material so app-level token-aware helper has room to choose history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) history_msgs = history_result.scalars().all() history = _build_llm_history_from_chat_messages(list(reversed(history_msgs))) @@ -1374,11 +1375,12 @@ async def _handle_feishu_file( # Load conversation history for LLM context from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE + # Load extra raw material so app-level token-aware helper has room to choose _hist_r = await db.execute( _select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) _history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all()))) @@ -1632,15 +1634,18 @@ async def _call_agent_llm( # Build conversation messages (without system prompt — call_llm adds it) messages: list[dict] = [] - from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE + from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS ctx_size = agent.context_window_size or 
DEFAULT_CONTEXT_WINDOW_SIZE + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS if history: - # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool - # pairs intact. Today _normalize_history_messages drops DB role="tool_call" - # rows, so this path has no tool messages and the helper acts as plain count - # truncation; the safety kicks in once a feishu reorganization helper exists. + # Pair-aware truncation: token budget primary, message count as safety cap. + # Today _normalize_history_messages drops DB role="tool_call" rows, so this + # path has no tool messages and the pair guard is a no-op; the safety kicks + # in once a feishu reorganization helper exists. messages.extend( - truncate_by_message_count(_normalize_history_messages(history), ctx_size) + truncate_by_token_budget( + _normalize_history_messages(history), tok_budget, message_cap=ctx_size, + ) ) messages.append({"role": "user", "content": user_text}) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 6f17ad1e5..8d79deeff 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,7 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -214,7 +214,9 @@ async def websocket_chat( role_description = agent.role_description or "" welcome_message = agent.welcome_message or "" ctx_size = agent.context_window_size or 100 - logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}") + from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS + logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok") # Load the agent's primary model if agent.primary_model_id: @@ -300,11 +302,14 @@ async def websocket_chat( logger.info(f"[WS] Selected primary session {conv_id}") try: + # Load extra raw material so the app-level token-aware helper + # (truncate_by_token_budget below) has room to choose from. + _db_load_cap = max(ctx_size, 500) history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(_db_load_cap) ) history_messages = list(reversed(history_result.scalars().all())) logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}") @@ -663,12 +668,30 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # Pair-aware truncation: keep the last `ctx_size` messages while - # preserving assistant.tool_calls ↔ role=tool blocks atomically. - # Naive [-ctx_size:] slicing can leave orphan tool messages at the - # head when the cut lands mid-pair, which OpenAI rejects with - # "No tool call found for function call output" (issue #446). - _truncated = truncate_by_message_count(conversation, ctx_size) + # Pair-aware truncation with a token budget plus a message-count + # safety cap. 
Either bound stops the walk; pairs (assistant.tool_calls + # ↔ role=tool) are kept atomic. Token budget protects against + # one-tool-result-eats-the-window scenarios; message cap protects + # against pathological tiny-message floods. The pair guard fixes + # the orphan-tool failure mode reported in #446. + # + # The current user message (just appended at line ~416) is excluded + # from truncation and re-appended after — otherwise a single huge + # input (large paste, base64 image_data) could push past the budget + # and cause the helper to drop the very message we're answering. + # If the input itself exceeds the model's context, the provider will + # surface a clear error rather than silently dropping it here. + _current = ( + conversation[-1] + if conversation and conversation[-1].get("role") == "user" + else None + ) + _history = conversation[:-1] if _current is not None else conversation + _truncated = truncate_by_token_budget( + _history, tok_budget, message_cap=ctx_size, + ) + if _current is not None: + _truncated.append(_current) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/models/agent.py b/backend/app/models/agent.py index 5c48d7bf9..2493aaedd 100644 --- a/backend/app/models/agent.py +++ b/backend/app/models/agent.py @@ -15,6 +15,11 @@ # (see: https://github.com/dataelement/Clawith/issues/238). DEFAULT_CONTEXT_WINDOW_SIZE = 100 +# Default token budget for in-context history. Conservative for 128K-context +# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent +# override via Agent.context_window_tokens. +DEFAULT_CONTEXT_WINDOW_TOKENS = 50000 + class Agent(Base): """Digital employee (Agent) instance. @@ -81,6 +86,24 @@ class Agent(Base): last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) tokens_used_total: Mapped[int] = mapped_column(Integer, default=0) context_window_size: Mapped[int] = mapped_column(Integer, default=100) + # Token-aware secondary bound on history sent to the LLM. Truncation uses + # the smaller of context_window_size (message count) and this token budget, + # preserving assistant.tool_calls ↔ role=tool pairs intact. + # + # ``server_default`` matters: alembic/versions/0000_initial_schema.py uses + # ``Base.metadata.create_all`` which reads model state at runtime. Without + # a server_default, fresh-DB bootstrap would create this column NOT NULL + # without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration + # later in the chain would short-circuit, leaving direct-SQL inserts + # broken. ``server_default="50000"`` ensures the DDL has the default + # whether the column was created by create_all or by the explicit + # migration. 
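+    #
+    # Resulting DDL either way (sketch, PostgreSQL syntax):
+    #   context_window_tokens INTEGER DEFAULT 50000 NOT NULL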
+ context_window_tokens: Mapped[int] = mapped_column( + Integer, + default=DEFAULT_CONTEXT_WINDOW_TOKENS, + server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS), + nullable=False, + ) max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50) # Trigger limits (per-agent, configurable from Settings UI) diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py index 0ae5d8e34..eb475d9c2 100644 --- a/backend/app/schemas/schemas.py +++ b/backend/app/schemas/schemas.py @@ -251,6 +251,7 @@ class AgentOut(BaseModel): max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None context_window_size: int = 100 + context_window_tokens: int = 50000 max_tool_rounds: int = 50 max_triggers: int = 20 min_poll_interval_min: int = 5 @@ -286,6 +287,7 @@ class AgentUpdate(BaseModel): primary_model_id: uuid.UUID | None = None fallback_model_id: uuid.UUID | None = None context_window_size: int | None = Field(default=None, ge=1, le=500) + context_window_tokens: int | None = Field(default=None, ge=1000, le=500000) max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None max_tool_rounds: int | None = None diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py index 54c0ce8a6..a3d63817c 100644 --- a/backend/app/services/history_window.py +++ b/backend/app/services/history_window.py @@ -11,19 +11,26 @@ between an assistant message and its tool results. This is the failure mode reported in issue #446. -Orphan detection is by ``tool_call_id`` matching, not by adjacency — a -tool message inserted between a valid pair and other messages (from -malformed persistence or upstream truncation) is dropped, not folded -into an adjacent block. This makes the helper robust against orphans -at any position, not just at the slice head. +Two public entry points: + - ``truncate_by_message_count`` — bound by message count + - ``truncate_by_token_budget`` — bound by estimated token cost (and an + optional message-count safety cap); preferred for production paths + where one tool result can dwarf 50 short messages. Input is expected to be in OpenAI chat-completion format (post-reorganization -from DB ``role="tool_call"`` rows). +from DB ``role="tool_call"`` rows). Helper is tolerant of malformed input — +unmatched tool messages at the head are silently dropped. """ from __future__ import annotations -from typing import Any +import json +from typing import Any, Callable + +from app.services.token_tracker import estimate_tokens_from_chars + + +# ── Block detection (shared between truncators) ───────────────────────── def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: @@ -60,38 +67,24 @@ def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: return orphans -def truncate_by_message_count( - messages: list[dict[str, Any]], - max_messages: int, -) -> list[dict[str, Any]]: - """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. - - A "block" is either: - - a single non-tool, non-tool-calling message (user / system / assistant text), or - - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` - message (identified by ``tool_call_id``, not adjacency). - - Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` - messages — those whose ``tool_call_id`` has no matching assistant — are - silently dropped regardless of budget. Sending them to OpenAI causes the - #446 error. 
+def _identify_blocks(messages: list[dict[str, Any]]) -> list[set[int]]: + """Group conversation entries into atomic blocks. - Args: - messages: Conversation list in OpenAI format. Empty list is fine. - max_messages: Soft upper bound on the number of returned entries. - Values ``<= 0`` return ``[]``. + A block is a set of indices that must be kept (or dropped) together: + - ``{i}`` for a single non-tool, non-tool-calling message + - ``{asst_idx, tool_idx_1, tool_idx_2, ...}`` for an assistant that + emitted N tool_calls plus its matching tool result messages, + identified by ``tool_call_id`` (not by adjacency — orphan tools + inserted between are dropped, not folded into the block). - Returns: - A new list (input is never mutated) of at most ``max_messages`` entries - from the tail of ``messages``, with all tool-call pairs intact. + Returned tail-to-head: most recent block first. Orphan tool messages + (those whose tool_call_id has no matching assistant.tool_calls) are + silently dropped — never appear in any block. """ - if max_messages <= 0 or not messages: - return [] - orphans = _identify_orphans(messages) n = len(messages) + blocks: list[set[int]] = [] consumed: set[int] = set(orphans) # orphans drop unconditionally - blocks: list[set[int]] = [] # tail-to-head order for i in range(n - 1, -1, -1): if i in consumed: @@ -151,16 +144,127 @@ def truncate_by_message_count( consumed.add(i) blocks.append({i}) - # Walk blocks tail-to-head, taking until budget exhausted. + return blocks + + +def _walk_blocks( + messages: list[dict[str, Any]], + budgets_ok: Callable[[int, int], bool], + consume: Callable[[int, int], None], +) -> list[dict[str, Any]]: + """Common walker used by both truncators. + + ``budgets_ok(block_msg_count, block_token_cost)`` returns True if the + block fits. ``consume`` updates remaining budget when a block is taken. + Stops on first non-fitting block (atomic — never partial-include). + """ + blocks = _identify_blocks(messages) keep: set[int] = set() - budget = max_messages for block in blocks: size = len(block) - if size <= budget: - keep |= block - budget -= size - else: - # Block doesn't fit — stop. Do NOT partial-include (would split pair). + token_cost = sum(_estimate_msg_tokens(messages[k]) for k in block) + if not budgets_ok(size, token_cost): break - + keep |= block + consume(size, token_cost) return [messages[k] for k in sorted(keep)] + + +def _estimate_msg_tokens(msg: dict[str, Any]) -> int: + """Estimate token cost for one message via JSON-serialized char count. + + Slight overestimate (JSON keys/quotes inflate vs the tokenizer's view of + the structured payload), which is the safe direction — better to truncate + a bit early than send too much and OOM the model. + """ + try: + serialized = json.dumps(msg, ensure_ascii=False, default=str) + except (TypeError, ValueError): + # Fallback for unserializable payloads (shouldn't happen in practice) + serialized = str(msg) + return estimate_tokens_from_chars(len(serialized)) + + +# ── Public API ────────────────────────────────────────────────────────── + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool message (``user``/``system``/``assistant`` text), or + - an ``assistant`` with ``tool_calls`` plus every immediately-following + ``role="tool"`` message (the assistant's tool results). 
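+      (In the implementation, "following" is resolved by ``tool_call_id``
+      via ``_identify_blocks``; adjacency is the common case, not a
+      requirement.)
+
+    Example: with ``max_messages=3``, ``[u1, a(tool_calls=[X]), tool(X), u2]``
+    keeps the pair plus ``u2``; with ``max_messages=2`` the walk takes ``u2``
+    first, the two-message pair no longer fits, and it is dropped whole, so
+    only ``u2`` survives.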
+ + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages with no matching assistant are always dropped, regardless of + budget — sending them to OpenAI causes the #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + remaining = [max_messages] + + def budgets_ok(size: int, _tok: int) -> bool: + return size <= remaining[0] + + def consume(size: int, _tok: int) -> None: + remaining[0] -= size + + return _walk_blocks(messages, budgets_ok, consume) + + +def truncate_by_token_budget( + messages: list[dict[str, Any]], + token_budget: int, + *, + message_cap: int | None = None, +) -> list[dict[str, Any]]: + """Keep tail messages within both bounds, preserving tool-call pairs. + + The two bounds work together: a block is included only if both the + remaining token budget and (when set) remaining message cap can absorb + its full size. The first bound to be exhausted stops the walk. + + Token cost per message is an overestimate based on JSON-serialized char + count divided by ~3 (see ``_estimate_msg_tokens``). This is intentional: + for budget enforcement, overestimating is safe. + + Args: + messages: Conversation list in OpenAI format. + token_budget: Soft upper bound on cumulative estimated tokens. + Values ``<= 0`` return ``[]``. + message_cap: Optional secondary bound on entry count. When set, the + walk stops as soon as either bound is exhausted. + + Returns: + A new list of recent messages within the budget(s), with all + tool-call pairs intact. + """ + if token_budget <= 0 or not messages: + return [] + if message_cap is not None and message_cap <= 0: + return [] + + tok_remaining = [token_budget] + msg_remaining = [message_cap if message_cap is not None else len(messages) + 1] + + def budgets_ok(size: int, tok_cost: int) -> bool: + return size <= msg_remaining[0] and tok_cost <= tok_remaining[0] + + def consume(size: int, tok_cost: int) -> None: + msg_remaining[0] -= size + tok_remaining[0] -= tok_cost + + return _walk_blocks(messages, budgets_ok, consume) diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py index 934216758..c437b5b7d 100644 --- a/backend/tests/test_history_window.py +++ b/backend/tests/test_history_window.py @@ -1,11 +1,14 @@ """Unit tests for pair-aware conversation history truncation. -Validates that ``truncate_by_message_count`` preserves -``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces -orphan tool messages that would trigger the OpenAI #446 failure mode. +Validates that ``truncate_by_message_count`` and ``truncate_by_token_budget`` +preserve ``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never +produces orphan tool messages that would trigger the OpenAI #446 failure mode. 
""" -from app.services.history_window import truncate_by_message_count +from app.services.history_window import ( + truncate_by_message_count, + truncate_by_token_budget, +) # ── Helpers ───────────────────────────────────────────────────────────── @@ -258,6 +261,135 @@ def test_system_message_treated_as_normal_block(): assert _roles(out) == ["user", "assistant"] +# ── Token-budget mode ─────────────────────────────────────────────────── + + +def test_token_budget_empty_or_zero(): + assert truncate_by_token_budget([], 1000) == [] + assert truncate_by_token_budget([_u("hi")], 0) == [] + assert truncate_by_token_budget([_u("hi")], -10) == [] + + +def test_token_budget_short_messages_within_budget(): + msgs = [_u("hi"), _a("hello"), _u("ok")] + out = truncate_by_token_budget(msgs, 10000) + assert out == msgs + + +def test_token_budget_huge_message_dropped(): + """One enormous tool result should not push other messages out of context; + the huge block just doesn't fit and is dropped.""" + huge_payload = "x" * 60000 # ~20K tokens via chars/3 + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge_payload), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + # u2 fits (small). Pair is huge → doesn't fit → break. u1 not visited (after pair). + assert _roles(out) == ["user"] + assert out[0]["content"] == "u2" + + +def test_token_budget_preserves_pair_when_both_fit(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content="small result"), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + assert _roles(out) == ["user", "assistant", "tool", "user"] + + +def test_token_budget_drops_huge_tail_message_caller_must_protect(): + """The helper walks tail-to-head. If the LAST message (often the current + user input that just arrived) alone exceeds the budget, the walker can't + fit it and breaks — leaving NOTHING. Callers that pass the current user + message INTO truncation must hold it OUT and re-append afterward (see + api/websocket.py for the pattern). This test pins the contract so call + sites don't silently regress. + """ + msgs = [ + _u("history msg 1"), + _a("history msg 2"), + _u("x" * 60000), # current input — alone >> 5000 token budget + ] + out = truncate_by_token_budget(msgs, token_budget=5000) + # Walker visits the huge user msg first (tail), can't fit → BREAK. + # Nothing else gets a chance because the loop short-circuits on first miss. + assert out == [] + + +def test_token_budget_with_message_cap_message_wins(): + """100 small messages fit token budget but message_cap=10 binds first.""" + msgs = [_u(f"m{k}") for k in range(100)] + out = truncate_by_token_budget(msgs, 100000, message_cap=10) + assert len(out) == 10 + # Last 10 messages, in order + assert [m["content"] for m in out] == [f"m{k}" for k in range(90, 100)] + + +def test_token_budget_with_message_cap_token_wins(): + """20 fat messages, message_cap loose, token budget binds first.""" + msgs = [_u("x" * 1500) for _ in range(20)] # JSON-serialized ~1527 chars → ~509 tokens + out = truncate_by_token_budget(msgs, 2000, message_cap=100) + # Token budget binds first (well below message_cap=100). Exact count depends + # on JSON overhead but is small enough that several messages don't fit. 
+ assert 0 < len(out) < 10 + assert len(out) < len(msgs) + + +def test_token_budget_orphan_tool_dropped(): + msgs = [ + _t("ORPHAN_X", content="ghost"), + _u("u1"), + _a("ok"), + ] + out = truncate_by_token_budget(msgs, 100000) + # Orphan dropped regardless of budget + assert "tool" not in _roles(out) + + +def test_token_budget_drops_large_pair_atomically(): + """Pair size 2 with first half tiny but second half huge — drop both.""" + huge = "x" * 9000 # ~3000 tokens + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 1000) # fits u2 (~5 tokens), not pair + assert _roles(out) == ["user"] + + +def test_token_budget_zero_message_cap(): + msgs = [_u("hi")] + assert truncate_by_token_budget(msgs, 1000, message_cap=0) == [] + + +def test_token_budget_invariant_no_orphan_in_realistic_long_chat(): + """End-to-end with mixed message sizes — verify no orphan tool leaks.""" + msgs: list[dict] = [_u("start")] + for k in range(15): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result of call {k}")) + msgs.append(_u(f"followup {k}")) + out = truncate_by_token_budget(msgs, 1500, message_cap=20) + + # No orphan tool + assistant_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + assistant_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + assert m["tool_call_id"] in assistant_call_ids + + def test_realistic_long_conversation_truncation(): """End-to-end: simulate a long chat with many tool-call turns and ensure the output never has orphan tools.""" diff --git a/frontend/src/i18n/en.json b/frontend/src/i18n/en.json index f47375aa6..44972143a 100644 --- a/frontend/src/i18n/en.json +++ b/frontend/src/i18n/en.json @@ -421,7 +421,9 @@ "noFallback": "No fallback", "conversationContext": "Context Window", "maxRounds": "Context Window Size", - "roundsDesc": "Number of recent messages included as context for each LLM request", + "roundsDesc": "Maximum number of recent messages included as context for each LLM request", + "maxTokens": "Context Token Budget", + "tokensDesc": "Maximum tokens of history per LLM request. Used together with the message-count cap; whichever bound hits first wins. 
Protects against a single large tool result blowing out context.", "tokenLimits": "Token Limits", "dailyLimit": "Daily Limit", "monthlyLimit": "Monthly Limit", diff --git a/frontend/src/i18n/zh.json b/frontend/src/i18n/zh.json index 77446d99d..cc861e462 100644 --- a/frontend/src/i18n/zh.json +++ b/frontend/src/i18n/zh.json @@ -428,7 +428,9 @@ "noFallback": "无备选", "conversationContext": "上下文窗口", "maxRounds": "上下文窗口大小", - "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量", + "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量(条数上限)", + "maxTokens": "上下文 token 预算", + "tokensDesc": "每次 LLM 请求时携带的历史 token 上限。与上下文窗口大小(条数)取较小值,防止单次大工具结果撑爆 context", "tokenLimits": "Token 用量限制", "dailyLimit": "每日上限", "monthlyLimit": "每月上限", diff --git a/frontend/src/pages/AgentDetail.tsx b/frontend/src/pages/AgentDetail.tsx index 7e8dd59c0..2d16b403f 100644 --- a/frontend/src/pages/AgentDetail.tsx +++ b/frontend/src/pages/AgentDetail.tsx @@ -2214,6 +2214,7 @@ function AgentDetailInner() { primary_model_id: '', fallback_model_id: '', context_window_size: 100, + context_window_tokens: 50000, max_tool_rounds: 50, max_tokens_per_day: '' as string | number, max_tokens_per_month: '' as string | number, @@ -2233,6 +2234,7 @@ function AgentDetailInner() { primary_model_id: agent.primary_model_id || '', fallback_model_id: agent.fallback_model_id || '', context_window_size: agent.context_window_size ?? 100, + context_window_tokens: (agent as any).context_window_tokens ?? 50000, max_tool_rounds: (agent as any).max_tool_rounds ?? 50, max_tokens_per_day: agent.max_tokens_per_day || '', max_tokens_per_month: agent.max_tokens_per_month || '', @@ -5741,6 +5743,7 @@ function AgentDetailInner() { settingsForm.primary_model_id !== (agent?.primary_model_id || '') || settingsForm.fallback_model_id !== (agent?.fallback_model_id || '') || settingsForm.context_window_size !== (agent?.context_window_size ?? 100) || + settingsForm.context_window_tokens !== ((agent as any)?.context_window_tokens ?? 50000) || settingsForm.max_tool_rounds !== ((agent as any)?.max_tool_rounds ?? 50) || String(settingsForm.max_tokens_per_day) !== String(agent?.max_tokens_per_day || '') || String(settingsForm.max_tokens_per_month) !== String(agent?.max_tokens_per_month || '') || @@ -5757,6 +5760,7 @@ function AgentDetailInner() { primary_model_id: settingsForm.primary_model_id || null, fallback_model_id: settingsForm.fallback_model_id || null, context_window_size: settingsForm.context_window_size, + context_window_tokens: settingsForm.context_window_tokens, max_tool_rounds: settingsForm.max_tool_rounds, max_tokens_per_day: settingsForm.max_tokens_per_day ? Number(settingsForm.max_tokens_per_day) : null, max_tokens_per_month: settingsForm.max_tokens_per_month ? Number(settingsForm.max_tokens_per_month) : null, @@ -5885,6 +5889,20 @@ function AgentDetailInner() { />
                 <div>{t('agent.settings.roundsDesc')}</div>
+
+              <div>
+                <label>{t('agent.settings.maxTokens')}</label>
+                <input
+                  type="number"
+                  min={1000}
+                  max={500000}
+                  value={settingsForm.context_window_tokens}
+                  onChange={e => setSettingsForm(f => ({ ...f, context_window_tokens: Math.max(1000, Math.min(500000, parseInt(e.target.value) || 50000)) }))}
+                  style={{ width: '120px' }}
+                />
+                <div>{t('agent.settings.tokensDesc')}</div>
+              </div>
+
                 {/* Max Tool Call Rounds */}
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index e7138f051..e29f740c8 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -36,6 +36,7 @@ export interface Agent {
   last_heartbeat_at?: string;
   timezone?: string;
   context_window_size?: number;
+  context_window_tokens?: number;
   agent_type?: 'native' | 'openclaw';
   openclaw_last_seen?: string;
   unread_count?: number;

From b01c0f123b3ef44125119c7c968a4ebe2fef34d1 Mon Sep 17 00:00:00 2001
From: zhongyua
Date: Mon, 27 Apr 2026 13:55:37 +0800
Subject: [PATCH 3/3] feat(tools): truncate large tool results with workspace spill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tool calls returning ~50 KB of search hits or PDF extracts used to enter
the in-context history verbatim, burning tokens for content the model
samples one slice of. New `services/tool_result_truncation.py` spills
oversized payloads to `agent_data/<agent_id>/_tool_results/<call_id>.txt`
and replaces the in-context body with a head excerpt + a marker pointing
the model at `read_file` for retrieval.

Threshold: 4000 estimated tokens (~12 KB at chars/3) — soft-start to
give the model time to learn the read_file follow-up; can tighten to
~2000 once telemetry confirms reliable use of the marker.

Smart head:
- JSON-shape responses with a known array key (results/items/data/
  entries/hits/documents) keep metadata + first 5 items + a synthetic
  _truncated_<key>_count field, so search/list responses stay useful.
- Otherwise plain head-cut (preferring a line boundary when one is
  near the cut point).
- Vision-injected list payloads (image_data markers) pass through.

Sandbox: spill files live under each agent's existing AGENT_DATA_DIR
sandbox; cross-agent leak is prevented by the existing read_file
boundary.

Applied at three sites to prevent drift:
- llm/caller.py _process_tool_call (call_llm path)
- llm/caller.py call_agent_llm_with_tools inner loop
- services/heartbeat.py tool loop

agent_context.py rule 3 augmented with a paragraph teaching the model
to recognize the marker and use read_file for full content rather than
fabricating from prior knowledge.

16 new unit tests covering pass-through, spill, utf-8 round-trip,
threshold boundary, JSON-shape preservation, write-failure fallback,
and a realistic 50-result jina_search end-to-end. 41/41 total pass.

Note: 3 tool loops still exist (caller.py:432, caller.py:763,
heartbeat.py:335). Drift prevention is partial here — PR 3 consolidates
them into a single execute_tool_calls helper.

Co-Authored-By: Claude Opus 4.7
---
 backend/app/services/agent_context.py        |  12 +
 backend/app/services/heartbeat.py            |  12 +-
 backend/app/services/llm/caller.py           |  36 +-
 .../app/services/tool_result_truncation.py   | 216 ++++++++++++
 backend/tests/test_tool_result_truncation.py | 311 ++++++++++++++++++
 5 files changed, 580 insertions(+), 7 deletions(-)
 create mode 100644 backend/app/services/tool_result_truncation.py
 create mode 100644 backend/tests/test_tool_result_truncation.py

diff --git a/backend/app/services/agent_context.py b/backend/app/services/agent_context.py
index 74edb28df..1da003f59 100644
--- a/backend/app/services/agent_context.py
+++ b/backend/app/services/agent_context.py
@@ -438,6 +438,18 @@ async def build_agent_context(agent_id: uuid.UUID, agent_name: str, role_descrip
 3. **NEVER fabricate file contents or tool results from memory.** Even if you saw a file before, you MUST call the tool again to get current data. 
+
+   **Handling truncated tool results:** When a tool returns a large payload, the system
+   may truncate the in-context view and save the full output to your workspace. You will
+   see a marker like:
+
+   `[truncated. Full output (12453 tokens) saved to _tool_results/<call_id>.txt under your
+   workspace — use the read_file tool to retrieve specific sections]`
+
+   When you see this marker, the head excerpt above it is enough for an overview. If you
+   need the full content (specific search hit, page from a PDF, item N of a list), call
+   `read_file` with the path shown in the marker. Do NOT fabricate the missing content
+   from your prior knowledge — the saved file is the ground truth.
+
 4. **Use `write_file` to update memory/memory.md with important information.**
 
 5. **Use `write_file` to update focus.md with your current focus items.**
diff --git a/backend/app/services/heartbeat.py b/backend/app/services/heartbeat.py
index f3331ab4f..b7784e84f 100644
--- a/backend/app/services/heartbeat.py
+++ b/backend/app/services/heartbeat.py
@@ -370,10 +370,20 @@ async def _execute_heartbeat(agent_id: uuid.UUID):
                     else:
                         tool_result = await execute_tool(tool_name, args, agent_id, agent_creator_id)
 
+                    # Spill oversized tool results to disk; keep in-context bounded
+                    _hb_content: str | list = str(tool_result)
+                    if agent_id:
+                        from pathlib import Path as _HbPath
+                        from app.config import get_settings as _hb_get_settings
+                        from app.services.tool_result_truncation import maybe_truncate_tool_result as _hb_trunc
+                        _hb_ws = _HbPath(_hb_get_settings().AGENT_DATA_DIR) / str(agent_id)
+                        _hb_content = _hb_trunc(
+                            _hb_content, call_id=tc["id"], agent_workspace=_hb_ws,
+                        )
                     llm_messages.append(LLMMessage(
                         role="tool",
                         tool_call_id=tc["id"],
-                        content=str(tool_result),
+                        content=_hb_content,
                     ))
                 else:
                     reply = response.content or ""
diff --git a/backend/app/services/llm/caller.py b/backend/app/services/llm/caller.py
index f9cfbc93c..9edddfadc 100644
--- a/backend/app/services/llm/caller.py
+++ b/backend/app/services/llm/caller.py
@@ -241,14 +241,20 @@ async def _process_tool_call(
     )
     logger.debug(f"[LLM] Tool result: {result[:100]}")
 
+    # Resolve agent workspace path once — needed by vision injection and
+    # tool-result truncation below. Path resolution is sandbox-relevant, so
+    # keep it tied to AGENT_DATA_DIR / str(agent_id).
+    ws_path: Path | None = None
+    if agent_id:
+        from app.config import get_settings
+        settings = get_settings()
+        ws_path = Path(settings.AGENT_DATA_DIR) / str(agent_id)
+
     # ── Vision injection for screenshot tools ──
     tool_content: str | list = str(result)
-    if supports_vision and agent_id:
+    if supports_vision and ws_path is not None:
         try:
             from app.services.vision_inject import try_inject_screenshot_vision
-            from app.config import get_settings
-            settings = get_settings()
-            ws_path = Path(settings.AGENT_DATA_DIR) / str(agent_id)
             vision_content = try_inject_screenshot_vision(tool_name, str(result), ws_path)
             if vision_content:
                 tool_content = vision_content
@@ -256,6 +262,15 @@ async def _process_tool_call(
         except Exception as e:
             logger.warning(f"[LLM] Vision injection failed for {tool_name}: {e}")
 
+    # ── Tool-result truncation: spill oversized payloads to disk ──
+    # Vision-injected list payloads pass through unchanged; large strings get
+    # head-excerpted with a marker pointing the model at _tool_results/<call_id>.txt.
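+    # Sketch of the resulting in-context shape (marker wording as taught in
+    # agent_context.py; the token count is illustrative):
+    #   "<head excerpt>\n\n[truncated. Full output (16667 tokens) saved to
+    #   _tool_results/<call_id>.txt under your workspace — use the read_file
+    #   tool to retrieve specific sections]"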
+    if ws_path is not None:
+        from app.services.tool_result_truncation import maybe_truncate_tool_result
+        tool_content = maybe_truncate_tool_result(
+            tool_content, call_id=tc["id"], agent_workspace=ws_path,
+        )
+
     # Notify client about tool call result
     if on_tool_call:
         try:
@@ -269,7 +284,7 @@
             })
         except Exception:
             pass
-
+
     api_messages.append(LLMMessage(
         role="tool",
         tool_call_id=tc["id"],
@@ -771,10 +786,19 @@ async def _try_model(model: LLMModel) -> tuple[str, bool, bool]:
                 user_id=agent.creator_id,
                 session_id=session_id,
             )
+            # Spill oversized tool results to disk; keep in-context bounded
+            _tool_content: str | list = str(result)
+            if agent_id:
+                from app.config import get_settings
+                from app.services.tool_result_truncation import maybe_truncate_tool_result
+                _ws = Path(get_settings().AGENT_DATA_DIR) / str(agent_id)
+                _tool_content = maybe_truncate_tool_result(
+                    _tool_content, call_id=tc["id"], agent_workspace=_ws,
+                )
             api_messages.append(LLMMessage(
                 role="tool",
                 tool_call_id=tc["id"],
-                content=str(result),
+                content=_tool_content,
             ))

     if agent_id and _accumulated_tokens > 0:
diff --git a/backend/app/services/tool_result_truncation.py b/backend/app/services/tool_result_truncation.py
new file mode 100644
index 000000000..52b679237
--- /dev/null
+++ b/backend/app/services/tool_result_truncation.py
@@ -0,0 +1,216 @@
+"""Truncate large tool results to disk with an in-context pointer.
+
+When a tool returns ~50 KB of jina_search hits or a 30-page PDF extract,
+sending it verbatim into the LLM history burns tokens for content the model
+will only sample one slice of. This module spills oversized payloads to the
+agent's workspace and replaces the in-context body with a head excerpt plus
+a ``[truncated. Full output saved to ...]`` marker. The model is taught (in
+the system prompt) to follow up with ``read_file`` for specific sections.
+
+Why a "smart" head:
+  - Naive ``content[:N]`` works for prose but ruins JSON-shape responses
+    where the head is metadata (``query``, ``total_results``) and the
+    payload is in a ``results``/``items``/``data`` array.
+  - We try ``json.loads`` first; if the content is a dict with one of the
+    well-known array keys, we keep the metadata and the first few items.
+    Otherwise fall back to head-cut.
+
+What's intentionally not handled:
+  - Vision injection payloads (``list`` content) pass through unchanged.
+    Truncating image_data markers would corrupt the multimodal block.
+  - DB persistence already truncates to 500 chars (websocket.py +
+    agent_tools.py + trigger_daemon.py). This module operates on the
+    in-flight ``api_messages`` list, not the DB form.
+  - Sandbox boundary: ``_tool_results/<call_id>.txt`` lives under the
+    agent's ``AGENT_DATA_DIR / str(agent_id)`` workspace — read_file
+    already enforces this boundary, so cross-agent leaks are prevented by
+    the existing tool-level sandbox.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+from loguru import logger
+
+from app.services.token_tracker import estimate_tokens_from_chars
+
+# Conservative initial threshold: 4000 estimated tokens (~12 KB at chars/3).
+# Soft-start value to give the model time to learn the read_file follow-up
+# pattern; can be tightened to ~2000 once telemetry confirms the model uses
+# the marker reliably.
+TOOL_RESULT_TOKEN_THRESHOLD = 4000
+
+# Allowed characters for a call_id used as a filename. Anthropic returns
+# ``toolu_<alnum>``; OpenAI returns ``call_<alnum>``; clawith synthesizes
+# ``call_<uuid>``. All three fit this set. Anything else (e.g. a
+# prompt-injected ``../../etc/passwd``) gets slugged before reaching the
+# filesystem.
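+#
+# Illustrative slugging behavior (the hostile inputs are hypothetical;
+# outputs follow from _safe_call_id below):
+#   "toolu_01ABC123def456GHI789jkl"  →  unchanged (matches the regex)
+#   "../../../etc/passwd"            →  "_________etc_passwd"
+#   ""                               →  "unknown"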
+_SAFE_CALL_ID_RE = re.compile(r"^[A-Za-z0-9_-]{1,128}$")
+
+
+def _safe_call_id(call_id: str) -> str:
+    """Return a filesystem-safe version of ``call_id``.
+
+    Provider-issued call_ids already match ``_SAFE_CALL_ID_RE`` and pass
+    through unchanged. Hostile or malformed inputs (path separators, unicode,
+    arbitrarily long) are replaced char-by-char and length-capped — defense
+    in depth against a prompt-injection scenario where the model is coaxed
+    into emitting a path-traversing ``tool_call_id``.
+    """
+    if not call_id:
+        return "unknown"
+    if _SAFE_CALL_ID_RE.match(call_id):
+        return call_id
+    slugged = re.sub(r"[^A-Za-z0-9_-]", "_", str(call_id))[:128]
+    return slugged or "unknown"
+
+
+# Hard char limit for the in-context head excerpt. Keeps the in-context
+# payload small even for unusual content (e.g. one giant JSON string).
+_HEAD_MAX_CHARS = TOOL_RESULT_TOKEN_THRESHOLD * 3  # ~12 KB
+
+# JSON keys that conventionally hold the array payload (the "results" of a
+# search, the "items" of a list endpoint, etc). Order matters — first hit wins.
+_KNOWN_ARRAY_KEYS = ("results", "items", "data", "entries", "hits", "documents")
+
+# How many array elements to keep when smart-heading a JSON-shape response.
+_KEEP_ARRAY_ITEMS = 5
+
+
+def maybe_truncate_tool_result(
+    tool_content: str | list,
+    *,
+    call_id: str,
+    agent_workspace: Path,
+) -> str | list:
+    """Return ``tool_content`` shortened with a marker, or unchanged.
+
+    Args:
+        tool_content: The raw tool result. ``str`` is the common case;
+            ``list`` is multimodal vision-injected content (passed through).
+        call_id: Tool call ID — used as the spill filename.
+        agent_workspace: Path to the agent's workspace dir
+            (``AGENT_DATA_DIR / str(agent_id)``). Spill goes under
+            ``<workspace>/_tool_results/<call_id>.txt``.
+
+    Returns:
+        The original content if under threshold, or a head excerpt + marker
+        string. List payloads always pass through.
+    """
+    # Multimodal content: image_data markers must stay structurally intact.
+    if isinstance(tool_content, list):
+        return tool_content
+
+    if not isinstance(tool_content, str):
+        # Defensive: unknown payload type — coerce and continue.
+        tool_content = str(tool_content)
+
+    est_tokens = estimate_tokens_from_chars(len(tool_content))
+    if est_tokens <= TOOL_RESULT_TOKEN_THRESHOLD:
+        return tool_content
+
+    # Spill full content to disk. Sanitize the call_id and assert containment
+    # so a hostile/malformed call_id can never write outside _tool_results/.
+    safe_id = _safe_call_id(call_id)
+    spill_root = (agent_workspace / "_tool_results").resolve()
+    full_path = (spill_root / f"{safe_id}.txt").resolve()
+    if not _is_within(full_path, spill_root):
+        # Should be unreachable after slugging — defense in depth.
+        logger.error(
+            f"[tool-truncation] Refusing to spill {call_id!r} → {full_path} "
+            f"(outside {spill_root}); falling back to inline head-cut."
+        )
+        head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+        return (
+            head
+            + f"\n\n[truncated. Full output ({est_tokens} tokens) "
+            f"could not be spilled to disk — only this excerpt is available]"
+        )
+
+    try:
+        spill_root.mkdir(parents=True, exist_ok=True)
+        full_path.write_text(tool_content, encoding="utf-8")
+    except OSError as e:
+        logger.warning(
+            f"[tool-truncation] Failed to spill {safe_id} to {full_path}: {e}; "
+            "falling back to inline head-cut without spill."
+        )
+        head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+        return (
+            head
+            + f"\n\n[truncated. Full output ({est_tokens} tokens) "
+            f"could not be spilled to disk — only this excerpt is available]"
+        )
+
+    head = _smart_head(tool_content, _HEAD_MAX_CHARS)
+    return (
+        head
+        + f"\n\n[truncated. Full output ({est_tokens} tokens) saved to "
+        f"_tool_results/{safe_id}.txt under your workspace — use the read_file "
+        f"tool to retrieve specific sections]"
+    )
+
+
+def _is_within(path: Path, root: Path) -> bool:
+    """True iff ``path`` is the same as ``root`` or nested below it.
+
+    Both paths are expected to already be ``.resolve()``-d by the caller.
+    """
+    try:
+        path.relative_to(root)
+        return True
+    except ValueError:
+        return False
+
+
+def _smart_head(content: str, max_chars: int) -> str:
+    """Best-effort excerpt that preserves structure when possible.
+
+    JSON-shape: if the content parses as a dict with a known array key
+    (``results``/``items``/``data``/...), keep the metadata + first 5 items.
+    Otherwise fall back to a plain head cut.
+    """
+    if len(content) <= max_chars:
+        return content
+
+    # Try JSON-shape preservation
+    stripped = content.lstrip()
+    if stripped.startswith("{"):
+        try:
+            data: dict[str, Any] = json.loads(content)
+        except (json.JSONDecodeError, ValueError):
+            pass
+        else:
+            if isinstance(data, dict):
+                truncated = _truncate_json_dict(data)
+                if truncated is not None:
+                    rendered = json.dumps(truncated, ensure_ascii=False, indent=2)
+                    if len(rendered) <= max_chars:
+                        return rendered
+                    # Even truncated JSON exceeds head budget → fall through.
+
+    # Plain head cut, preferring a line boundary when one is near the cut point
+    cut = content[:max_chars]
+    last_newline = cut.rfind("\n")
+    if last_newline > max_chars * 0.8:
+        cut = cut[:last_newline]
+    return cut
+
+
+def _truncate_json_dict(data: dict[str, Any]) -> dict[str, Any] | None:
+    """If ``data`` has a known array key, return a dict with the metadata
+    preserved and that array trimmed to ``_KEEP_ARRAY_ITEMS`` entries.
+    Returns None if no known array shape is detected.
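+
+    Illustration (values chosen for brevity; ``_KEEP_ARRAY_ITEMS`` is 5):
+
+    >>> _truncate_json_dict({"query": "q", "results": list(range(8))})
+    {'query': 'q', 'results': [0, 1, 2, 3, 4], '_truncated_results_count': 3}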
+ """ + for key in _KNOWN_ARRAY_KEYS: + value = data.get(key) + if isinstance(value, list) and len(value) > _KEEP_ARRAY_ITEMS: + trimmed = dict(data) + kept = value[:_KEEP_ARRAY_ITEMS] + trimmed[key] = kept + trimmed[f"_truncated_{key}_count"] = len(value) - _KEEP_ARRAY_ITEMS + return trimmed + return None diff --git a/backend/tests/test_tool_result_truncation.py b/backend/tests/test_tool_result_truncation.py new file mode 100644 index 000000000..e60e08180 --- /dev/null +++ b/backend/tests/test_tool_result_truncation.py @@ -0,0 +1,311 @@ +"""Unit tests for tool result truncation + workspace spill.""" + +import json + +from app.services.tool_result_truncation import ( + TOOL_RESULT_TOKEN_THRESHOLD, + _safe_call_id, + _smart_head, + _truncate_json_dict, + maybe_truncate_tool_result, +) + + +# ── Pass-through cases ────────────────────────────────────────────────── + + +def test_short_string_unchanged(tmp_path): + content = "tiny result" + out = maybe_truncate_tool_result(content, call_id="c1", agent_workspace=tmp_path) + assert out == content + # No spill file created + assert not (tmp_path / "_tool_results").exists() + + +def test_list_payload_passes_through(tmp_path): + """Vision-injected multimodal content must not be touched.""" + payload = [ + {"type": "text", "text": "see image:"}, + {"type": "image", "source": {"type": "base64", "data": "..."}}, + ] + out = maybe_truncate_tool_result(payload, call_id="c1", agent_workspace=tmp_path) + assert out is payload # same object + + +def test_non_string_non_list_coerced(tmp_path): + """Defensive: dict payload (shouldn't happen) is coerced to str.""" + out = maybe_truncate_tool_result( + {"key": "value"}, # type: ignore[arg-type] + call_id="c1", + agent_workspace=tmp_path, + ) + # Coerced to str then passed through (small enough) + assert isinstance(out, str) + + +# ── Spill behavior ────────────────────────────────────────────────────── + + +def _huge_payload(chars: int = 60000) -> str: + return "x" * chars + + +def test_huge_payload_spilled_and_truncated(tmp_path): + payload = _huge_payload(60000) + out = maybe_truncate_tool_result( + payload, call_id="abc-123", agent_workspace=tmp_path + ) + # In-context is now much shorter than original + assert isinstance(out, str) + assert len(out) < len(payload) + # Marker present + assert "[truncated. 
Full output" in out + assert "_tool_results/abc-123.txt" in out + assert "use the read_file tool" in out + # Spill file written with full payload + spill = tmp_path / "_tool_results" / "abc-123.txt" + assert spill.exists() + assert spill.read_text(encoding="utf-8") == payload + + +def test_spill_path_uses_utf8(tmp_path): + """Mixed CJK + emoji payload should round-trip through utf-8.""" + payload = ("中文测试 🎉 " + "x" * 30000) + out = maybe_truncate_tool_result( + payload, call_id="cjk", agent_workspace=tmp_path + ) + spill = tmp_path / "_tool_results" / "cjk.txt" + assert spill.exists() + assert spill.read_text(encoding="utf-8") == payload + # Marker still present + assert "[truncated" in out + + +def test_spill_failure_returns_inline_marker(tmp_path): + """If we can't write to disk, still truncate but mark as unrecoverable.""" + # Use a workspace path that can't be written (a file, not a dir) + blocker = tmp_path / "blocker" + blocker.write_text("this is a file, not a dir") + payload = _huge_payload(40000) + out = maybe_truncate_tool_result( + payload, call_id="fail", agent_workspace=blocker + ) + assert isinstance(out, str) + assert "could not be spilled to disk" in out + + +def test_at_threshold_boundary(tmp_path): + """Just below threshold passes through; just above triggers spill.""" + chars_at_threshold = TOOL_RESULT_TOKEN_THRESHOLD * 3 # chars/3 ratio + # Below threshold (slightly) + below = "y" * (chars_at_threshold - 100) + out = maybe_truncate_tool_result(below, call_id="b", agent_workspace=tmp_path) + assert out == below + + # Above threshold + above = "y" * (chars_at_threshold + 1000) + out2 = maybe_truncate_tool_result(above, call_id="a", agent_workspace=tmp_path) + assert "[truncated" in out2 + + +# ── Smart head: JSON-shape preservation ───────────────────────────────── + + +def test_smart_head_preserves_results_array(): + """jina_search-style response: keep metadata + first 5 items.""" + # Each result is bulky enough that full payload exceeds max_chars. + bulky_snippet = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
" * 5 + payload = json.dumps({ + "query": "what is rust", + "total_results": 50, + "results": [ + {"url": f"https://ex.com/{i}", "title": f"Result {i}", "snippet": bulky_snippet} + for i in range(50) + ], + }) + assert len(payload) > 5000 # sanity: max_chars below forces truncation + out = _smart_head(payload, max_chars=5000) + parsed = json.loads(out) + assert parsed["query"] == "what is rust" + assert parsed["total_results"] == 50 + assert len(parsed["results"]) == 5 + assert parsed["_truncated_results_count"] == 45 + + +def test_smart_head_recognizes_items_key(): + bulky_item = "x" * 200 + payload = json.dumps({ + "items": [f"{bulky_item}-{i}" for i in range(30)], + "next_cursor": "abc", + }) + assert len(payload) > 2000 # sanity + out = _smart_head(payload, max_chars=2000) + parsed = json.loads(out) + assert len(parsed["items"]) == 5 + assert parsed["next_cursor"] == "abc" + + +def test_smart_head_falls_back_for_non_array_json(): + """JSON without a known array key falls back to head-cut.""" + payload = json.dumps({"big_field": "x" * 10000, "other": "y"}) + out = _smart_head(payload, max_chars=2000) + # Either trimmed JSON dict or plain head-cut, but length ≤ 2000 + assert len(out) <= 2000 + + +def test_smart_head_plain_text_head_cut(): + payload = "line1\n" + ("plain text content " * 1000) + out = _smart_head(payload, max_chars=500) + assert len(out) <= 500 + # Head-cut should prefer line boundary if available + assert out.startswith("line1") + + +def test_truncate_json_dict_no_array_key_returns_none(): + """Helper returns None when no recognizable array shape.""" + assert _truncate_json_dict({"foo": "bar", "baz": 123}) is None + + +def test_truncate_json_dict_short_array_returns_none(): + """Don't bother truncating a 3-item array.""" + assert _truncate_json_dict({"results": [1, 2, 3]}) is None + + +def test_truncate_json_dict_first_known_key_wins(): + """If multiple known keys present, first match wins (results > items).""" + data = {"results": list(range(20)), "items": list(range(30))} + out = _truncate_json_dict(data) + assert out is not None + assert "_truncated_results_count" in out + assert "_truncated_items_count" not in out # items left untouched + + +# ── End-to-end integration ────────────────────────────────────────────── + + +def test_realistic_jina_search_response_truncated_to_useful_excerpt(tmp_path): + """A 50-result jina_search dump fits into the in-context head with + enough metadata + samples for the model to decide whether to read_file.""" + full_response = json.dumps({ + "query": "best practices for python async", + "total_results": 50, + "search_time_ms": 142, + "results": [ + { + "url": f"https://example.com/article-{i}", + "title": f"Article {i} about async", + "snippet": "Lorem ipsum dolor sit amet " * 30, + } + for i in range(50) + ], + }) + + out = maybe_truncate_tool_result( + full_response, call_id="search-1", agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # Full content preserved on disk + spill = tmp_path / "_tool_results" / "search-1.txt" + assert spill.read_text(encoding="utf-8") == full_response + + # Model can see metadata + sample + assert "best practices for python async" in out + assert "total_results" in out + assert "search-1.txt" in out # marker tells model where the rest is + + +def test_marker_includes_token_count(tmp_path): + payload = "z" * 30000 # ~10000 tokens + out = maybe_truncate_tool_result( + payload, call_id="big", agent_workspace=tmp_path + ) + # Marker should mention the original token count + assert "tokens" in out 
+ # The number itself appears (10000) + import re + m = re.search(r"\((\d+)\s+tokens\)", out) + assert m is not None + assert int(m.group(1)) > 5000 + + +# ── Sandbox safety: call_id sanitization ──────────────────────────────── + + +def test_safe_call_id_passes_provider_format(): + """Real Anthropic / OpenAI / synthetic IDs all pass through unchanged.""" + for cid in ( + "toolu_01ABC123def456GHI789jkl", + "call_f6db199d-c470-4bb3-8188-1c9eeb43cc60", + "call_msg_uuid_v4", + "abc-123_XYZ", + ): + assert _safe_call_id(cid) == cid, f"Should pass through: {cid!r}" + + +def test_safe_call_id_slugs_path_traversal(): + """A prompt-injected ../-style call_id gets neutralized.""" + assert _safe_call_id("../../../etc/passwd") != "../../../etc/passwd" + # No path separator survives + out = _safe_call_id("../../../etc/passwd") + assert "/" not in out + assert "\\" not in out + assert ".." not in out or out.replace(".", "_") == out + + +def test_safe_call_id_handles_empty_and_unicode(): + assert _safe_call_id("") == "unknown" + assert _safe_call_id(None) == "unknown" # type: ignore[arg-type] + out = _safe_call_id("中文 with spaces!") + # Unicode and special chars get replaced with underscore + assert all(c.isalnum() or c in "_-" for c in out) + + +def test_safe_call_id_caps_length(): + """Pathological 10K-char call_id is capped at 128.""" + huge = "a" * 10_000 + out = _safe_call_id(huge) + assert len(out) <= 128 + + +def test_path_traversal_call_id_writes_inside_sandbox(tmp_path): + """Hostile call_id like '../../foo' must NOT escape _tool_results/.""" + payload = "x" * 60000 # forces spill + hostile_id = "../../../../../etc/passwd" + + out = maybe_truncate_tool_result( + payload, call_id=hostile_id, agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # Truncation marker still appears with the SLUGGED name (not the hostile one) + assert "[truncated" in out + assert "etc/passwd" not in out # original hostile path doesn't surface + + # Walk the workspace and verify NO file got written outside _tool_results/ + spill_root = tmp_path / "_tool_results" + for path in tmp_path.rglob("*"): + if path.is_file(): + assert spill_root in path.resolve().parents or path.resolve() == spill_root, ( + f"File leaked outside _tool_results/: {path}" + ) + + +def test_call_id_with_separator_in_filename(tmp_path): + """Path separator in call_id should not create subdirectories under + _tool_results/.""" + payload = "y" * 60000 + out = maybe_truncate_tool_result( + payload, call_id="evil/sub/path", agent_workspace=tmp_path + ) + assert isinstance(out, str) + + # No nested directory should appear under _tool_results/ + spill_root = tmp_path / "_tool_results" + nested_dirs = [p for p in spill_root.rglob("*") if p.is_dir()] + assert nested_dirs == [], f"Unexpected subdirectories: {nested_dirs}" + + # Exactly one .txt file at the top level of _tool_results/ + txt_files = list(spill_root.glob("*.txt")) + assert len(txt_files) == 1 + assert txt_files[0].read_text(encoding="utf-8") == payload
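
---

Usage note (illustrative, not part of the patch): all three call sites apply
the same pattern: resolve the agent workspace from AGENT_DATA_DIR, then pass
the raw tool result through maybe_truncate_tool_result before appending the
role="tool" message. A minimal sketch, assuming the module paths above and a
hypothetical tool-call dict `tc`:

    from pathlib import Path

    from app.config import get_settings
    from app.services.tool_result_truncation import maybe_truncate_tool_result

    def bounded_tool_content(result, tc: dict, agent_id) -> str | list:
        """Sketch: spill oversized results so in-context history stays bounded."""
        content: str | list = str(result)
        if agent_id:
            workspace = Path(get_settings().AGENT_DATA_DIR) / str(agent_id)
            content = maybe_truncate_tool_result(
                content, call_id=tc["id"], agent_workspace=workspace,
            )
        return content

PR 3's execute_tool_calls helper would collapse the three copies of this
pattern into one place.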