From 114be2465789aa29f3d6ee0e56da471a1fffeaa4 Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:06:56 +0800 Subject: [PATCH 1/2] fix(history): atomic pair-aware truncation for tool_call blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace naive `conversation[-ctx_size:]` slicing with a walker that treats assistant.tool_calls and its matching role="tool" messages as one atomic block. Naive slicing can leave an orphan role="tool" at the head when the cut lands mid-pair — OpenAI rejects this with "No tool call found for function call output" (issue #446). New helper `services/history_window.truncate_by_message_count` walks tail backward, identifies blocks, and includes blocks whole or not at all. Orphan tools are silently dropped regardless of budget. Replaces head-only pop guard in: - app/api/websocket.py (web chat) - app/api/feishu.py (feishu channel) Leaves app/services/llm/caller.py:626 untouched (call_agent_llm short-reply path's hardcoded [-10:] is intentional). Tests: 15 covering empty input, budget edges, parallel tool_calls, multi-pair, head/mid orphans, realistic 60-message invariant. Addresses #446 failure mode. 
Co-Authored-By: Claude Opus 4.7 --- backend/app/api/feishu.py | 9 +- backend/app/api/websocket.py | 11 +- backend/app/services/history_window.py | 166 ++++++++++++++ backend/tests/test_history_window.py | 288 +++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 backend/app/services/history_window.py create mode 100644 backend/tests/test_history_window.py diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index 00fe32737..ca4cef7c7 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,6 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service +from app.services.history_window import truncate_by_message_count router = APIRouter(tags=["feishu"]) @@ -1634,7 +1635,13 @@ async def _call_agent_llm( from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE if history: - messages.extend(_normalize_history_messages(history)[-ctx_size:]) + # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool + # pairs intact. Today _normalize_history_messages drops DB role="tool_call" + # rows, so this path has no tool messages and the helper acts as plain count + # truncation; the safety kicks in once a feishu reorganization helper exists. 
+ messages.extend( + truncate_by_message_count(_normalize_history_messages(history), ctx_size) + ) messages.append({"role": "user", "content": user_text}) # Use actual user_id so the system prompt knows who it's chatting with diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 82b297916..6f17ad1e5 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,6 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session +from app.services.history_window import truncate_by_message_count from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -662,10 +663,12 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # To prevent tool call message pairs(assistant + tool) from being broken down. - _truncated = conversation[-ctx_size:] - while _truncated and _truncated[0].get("role") == "tool": - _truncated.pop(0) + # Pair-aware truncation: keep the last `ctx_size` messages while + # preserving assistant.tool_calls ↔ role=tool blocks atomically. + # Naive [-ctx_size:] slicing can leave orphan tool messages at the + # head when the cut lands mid-pair, which OpenAI rejects with + # "No tool call found for function call output" (issue #446). + _truncated = truncate_by_message_count(conversation, ctx_size) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py new file mode 100644 index 000000000..54c0ce8a6 --- /dev/null +++ b/backend/app/services/history_window.py @@ -0,0 +1,166 @@ +"""Pair-aware conversation history truncation. 
+ +Replaces naive ``conversation[-N:]`` slicing with a walker that keeps +``assistant.tool_calls`` and their matching ``role="tool"`` messages as an +atomic block — never half a pair, never orphan tool messages. + +Why: OpenAI Responses API and Chat Completions both reject input where a +``function_call_output`` / ``role="tool"`` message has no matching +``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive +``[-N:]`` slicing can leave such orphans at the head when the cut lands +between an assistant message and its tool results. This is the failure mode +reported in issue #446. + +Orphan detection is by ``tool_call_id`` matching, not by adjacency — a +tool message inserted between a valid pair and other messages (from +malformed persistence or upstream truncation) is dropped, not folded +into an adjacent block. This makes the helper robust against orphans +at any position, not just at the slice head. + +Input is expected to be in OpenAI chat-completion format (post-reorganization +from DB ``role="tool_call"`` rows). +""" + +from __future__ import annotations + +from typing import Any + + +def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: + """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has + no matching ``assistant.tool_calls`` earlier in the conversation. + + OpenAI rejects the request the moment a ``function_call_output`` is + sent without its matching ``function_call``, regardless of whether + that tool message is at the head, middle, or end. So orphan detection + is by ID matching, not by position. + """ + orphans: set[int] = set() + for i, msg in enumerate(messages): + if msg.get("role") != "tool": + continue + tcid = msg.get("tool_call_id") + if not tcid: + orphans.add(i) + continue + # Search backward for an assistant whose tool_calls contains this id. + # Walks past intervening user / system / other-assistant messages. 
+ found = False + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + found = True + break + j -= 1 + if not found: + orphans.add(i) + return orphans + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool, non-tool-calling message (user / system / assistant text), or + - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` + message (identified by ``tool_call_id``, not adjacency). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages — those whose ``tool_call_id`` has no matching assistant — are + silently dropped regardless of budget. Sending them to OpenAI causes the + #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. 
+ """ + if max_messages <= 0 or not messages: + return [] + + orphans = _identify_orphans(messages) + n = len(messages) + consumed: set[int] = set(orphans) # orphans drop unconditionally + blocks: list[set[int]] = [] # tail-to-head order + + for i in range(n - 1, -1, -1): + if i in consumed: + continue + msg = messages[i] + role = msg.get("role") + + if role == "tool": + # Find this tool's owning assistant by matching tool_call_id + tcid = msg.get("tool_call_id") + asst_idx = -1 + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + asst_idx = j + break + j -= 1 + if asst_idx < 0: + # Defensive — orphan detection should have caught this + consumed.add(i) + continue + # Block = assistant + ALL of its matching tool messages (siblings) + asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]} + block = {asst_idx} + for k in range(asst_idx + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + elif role == "assistant" and msg.get("tool_calls"): + # Encountered the assistant before any of its tools (e.g. tools + # were truncated upstream or are still in flight). Group with + # whatever matching tools follow it. + asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]} + block = {i} + for k in range(i + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + else: + consumed.add(i) + blocks.append({i}) + + # Walk blocks tail-to-head, taking until budget exhausted. + keep: set[int] = set() + budget = max_messages + for block in blocks: + size = len(block) + if size <= budget: + keep |= block + budget -= size + else: + # Block doesn't fit — stop. 
Do NOT partial-include (would split pair). + break + + return [messages[k] for k in sorted(keep)] diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py new file mode 100644 index 000000000..934216758 --- /dev/null +++ b/backend/tests/test_history_window.py @@ -0,0 +1,288 @@ +"""Unit tests for pair-aware conversation history truncation. + +Validates that ``truncate_by_message_count`` preserves +``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces +orphan tool messages that would trigger the OpenAI #446 failure mode. +""" + +from app.services.history_window import truncate_by_message_count + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _u(text: str) -> dict: + return {"role": "user", "content": text} + + +def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict: + msg: dict = {"role": "assistant", "content": text} + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + +def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict: + return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}} + + +def _t(call_id: str, content: str = "ok") -> dict: + return {"role": "tool", "tool_call_id": call_id, "content": content} + + +def _roles(msgs: list[dict]) -> list[str]: + return [m.get("role", "?") for m in msgs] + + +# ── Edge cases ────────────────────────────────────────────────────────── + + +def test_empty_input_returns_empty(): + assert truncate_by_message_count([], 10) == [] + + +def test_zero_or_negative_budget_returns_empty(): + msgs = [_u("hi"), _u("there")] + assert truncate_by_message_count(msgs, 0) == [] + assert truncate_by_message_count(msgs, -5) == [] + + +def test_within_budget_returns_all(): + msgs = [_u("a"), _a("b"), _u("c")] + out = truncate_by_message_count(msgs, 10) + assert out == msgs + assert out is not msgs # new list + + +def test_input_not_mutated(): + msgs = [_u("a"), _a("b"), 
_u("c"), _u("d")] + snapshot = list(msgs) + truncate_by_message_count(msgs, 2) + assert msgs == snapshot + + +# ── Core pair-preservation behavior ───────────────────────────────────── + + +def test_keeps_assistant_tool_pair_intact(): + """Slicing must not split assistant.tool_calls from its tool result.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["assistant", "tool", "user"] + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[1]["tool_call_id"] == "X" + + +def test_drops_pair_entirely_when_budget_too_small(): + """If budget can't fit the whole pair, drop it — never half.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 2 — can't fit pair (needs 2) + final user, must drop pair + out = truncate_by_message_count(msgs, 2) + # Only the trailing user fits as a single block; pair (size 2) doesn't fit + # in remaining budget=1 after taking user. + assert _roles(out) == ["user"] + assert out[0]["content"] == "done?" + + +def test_drops_orphan_tool_at_head(): + """A role=tool with no preceding assistant.tool_calls is dropped.""" + msgs = [ + _t("X"), # orphan — no assistant before + _u("hi"), + _a("ok"), + ] + out = truncate_by_message_count(msgs, 10) + assert _roles(out) == ["user", "assistant"] + + +def test_drops_orphan_tool_at_head_after_slicing(): + """Slicing produces an orphan tool at head — must be dropped (the + classic #446 failure mode).""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), # ← naive slice [-3:] would start here as orphan + _u("u2"), + _a("final"), + ] + # Budget 3: take from end. _a("final") block. _u("u2") block. Then t(X) + # alone — orphan, dropped. Pair (a+tc, t) doesn't get full chance because + # we'd need budget 5 to include from start. Result: [u("u2"), a("final")]. 
+ out = truncate_by_message_count(msgs, 3) + assert "tool" not in _roles(out) + # No orphan tool_call_id reaches output + for m in out: + if m.get("role") == "tool": + raise AssertionError(f"Orphan tool leaked: {m}") + + +def test_multiple_parallel_tool_calls_in_one_assistant(): + """Assistant with N tool_calls followed by N tools is one atomic block.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]), + _t("X"), + _t("Y"), + _t("Z"), + _u("u2"), + ] + # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0 + out = truncate_by_message_count(msgs, 5) + assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"] + # Verify the pair came through whole + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[3]["tool_call_id"] == "Z" + + +def test_parallel_tool_pair_dropped_if_too_big(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]), + _t("X"), + _t("Y"), + _t("Z"), + _u("u2"), + ] + # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u]. + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["user"] + + +def test_multiple_pairs_some_drop(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("A")]), + _t("A"), + _u("u2"), + _a(None, tool_calls=[_tc("B")]), + _t("B"), + _u("u3"), + ] + # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3. + out = truncate_by_message_count(msgs, 5) + assert _roles(out) == ["user", "assistant", "tool", "user"] + assert out[1]["tool_calls"][0]["id"] == "B" + assert out[2]["tool_call_id"] == "B" + + +def test_no_partial_pair_when_budget_exactly_one_short(): + """Exactly one short of fitting a pair → drop the pair, don't include + just the assistant.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + ] + # Budget 2: pair size 2, fits → [a+tc, t]. (u dropped to fit pair? 
No — walk + # from end: t(X) goes back to a(tc=X) → pair block (1,2) size 2. Then u (0,0) + # size 1. Take pair first, budget=0. Stop. Output: [a+tc, t] + out = truncate_by_message_count(msgs, 2) + assert _roles(out) == ["assistant", "tool"] + # If only budget 1: pair size 2 doesn't fit. Then look at u (size 1, fits). + # But blocks order is [(1,2), (0,0)] from walk. We try pair first, doesn't + # fit, BREAK. Output: []. + out2 = truncate_by_message_count(msgs, 1) + assert out2 == [] + + +def test_mid_orphan_tool_dropped(): + """A tool whose tool_call_id has no matching assistant nearby — defensive + drop. (Shouldn't happen with current persistence, but be robust.)""" + msgs = [ + _u("u1"), + _t("ORPHAN_X"), # malformed — no preceding assistant.tool_calls + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + # Orphan dropped + assert "tool" not in _roles(out) + assert _roles(out) == ["user", "user"] + + +def test_orphan_adjacent_to_valid_pair_still_dropped(): + """Orphan tool message inserted right after a legitimate tool-call pair + must be dropped — adjacency to a valid pair does not legitimize it. 
+ + This is the bug class that triggers OpenAI #446 even when slice cut + boundaries would otherwise be safe: any orphan reaching the wire, + regardless of position, makes the request invalid.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("VALID")]), + _t("VALID", "real result"), + _t("ORPHAN_id", "ghost result"), # no assistant emits ORPHAN_id + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + + # The orphan must NOT survive — even though it's adjacent to a valid pair + orphan_present = any( + m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id" + for m in out + ) + assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped" + + # The valid pair survives intact + valid_assistant = any( + m.get("role") == "assistant" + and m.get("tool_calls") + and any(tc["id"] == "VALID" for tc in m["tool_calls"]) + for m in out + ) + valid_tool = any( + m.get("role") == "tool" and m.get("tool_call_id") == "VALID" + for m in out + ) + assert valid_assistant and valid_tool + + +def test_system_message_treated_as_normal_block(): + msgs = [ + {"role": "system", "content": "you are an agent"}, + _u("hi"), + _a("hello"), + ] + out = truncate_by_message_count(msgs, 2) + # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped. 
+ assert _roles(out) == ["user", "assistant"] + + +def test_realistic_long_conversation_truncation(): + """End-to-end: simulate a long chat with many tool-call turns and ensure + the output never has orphan tools.""" + msgs: list[dict] = [_u("start")] + for k in range(20): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result {k}")) + msgs.append(_u(f"next {k}")) + msgs.append(_a("final answer")) + + # Truncate to 30 messages + out = truncate_by_message_count(msgs, 30) + + # Sanity: budget respected + assert len(out) <= 30 + + # Critical invariant: no orphan tool messages anywhere + seen_tool_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + seen_tool_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + tcid = m.get("tool_call_id") + assert tcid in seen_tool_call_ids, ( + f"Orphan tool {tcid!r} in output without matching assistant.tool_calls" + ) From 2a0040fef35e69706030afde273f7f43f80f5adc Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:32:48 +0800 Subject: [PATCH 2/2] feat(agent): token-aware history truncation alongside message cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `agents.context_window_tokens` field (default 50000) and a new `truncate_by_token_budget` helper that bounds in-context history by both estimated token cost (primary) and message count (safety cap), preserving assistant.tool_calls ↔ role=tool pairs intact via the same walker as truncate_by_message_count. Why: message-count alone is a wildly variable proxy for token cost — one 50KB tool result eats more context than 100 short user messages. Token budget gives predictable behavior across heterogeneous traffic; message cap remains as a safety net against pathological tiny-message floods. 
Changes: - models/agent.py: + context_window_tokens (Integer, default=50000) + DEFAULT_CONTEXT_WINDOW_TOKENS constant - schemas/schemas.py: AgentOut, AgentUpdate (1000 <= tokens <= 500000) - alembic: add_context_window_tokens.py (idempotent IF NOT EXISTS) - services/history_window.py: + truncate_by_token_budget, refactored common walker, JSON-serialized char->token estimate via existing estimate_tokens_from_chars (chars/3 — overestimates safely) - api/websocket.py: pass tok_budget to helper, raise DB load to max(ctx_size, 500) so helper has room to choose - api/feishu.py: same pattern at 2 sites (web chat + IM channel paths) - frontend: AgentDetail Settings slider + i18n + types 10 new tests covering token-budget mode (huge-message dropped, both-bounds interaction, atomic pair preservation, orphan defense). 25/25 pass. Other channels (dingtalk/discord/slack/teams/wecom/whatsapp) still use DB-level message-count limit only — they don't get token awareness in this PR but won't crash. Migrating them is follow-up scope. 
Co-Authored-By: Claude Opus 4.7 --- .../versions/add_context_window_tokens.py | 65 ++++++ backend/app/api/feishu.py | 23 ++- backend/app/api/websocket.py | 41 +++- backend/app/models/agent.py | 23 +++ backend/app/schemas/schemas.py | 2 + backend/app/services/history_window.py | 186 ++++++++++++++---- backend/tests/test_history_window.py | 140 ++++++++++++- frontend/src/i18n/en.json | 4 +- frontend/src/i18n/zh.json | 4 +- frontend/src/pages/AgentDetail.tsx | 18 ++ frontend/src/types/index.ts | 1 + 11 files changed, 442 insertions(+), 65 deletions(-) create mode 100644 backend/alembic/versions/add_context_window_tokens.py diff --git a/backend/alembic/versions/add_context_window_tokens.py b/backend/alembic/versions/add_context_window_tokens.py new file mode 100644 index 000000000..bf385417e --- /dev/null +++ b/backend/alembic/versions/add_context_window_tokens.py @@ -0,0 +1,65 @@ +"""add agents.context_window_tokens for token-aware history truncation + +Revision ID: add_context_window_tokens +Revises: rm_agent_credential_secrets +Create Date: 2026-04-27 +""" + +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = "add_context_window_tokens" +down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add context_window_tokens with a DDL default of 50000. + + The four-step pattern is required because earlier in the migration chain, + ``alembic/versions/0000_initial_schema.py`` calls + ``Base.metadata.create_all(checkfirst=True)``, which creates ``agents`` + from the *current* model state — including any new columns. SQLAlchemy's + Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause, + so the column ends up ``NOT NULL`` with no default, and a naive + ``ADD COLUMN IF NOT EXISTS ... 
DEFAULT 50000`` short-circuits and never + sets the default. + + This four-step approach is idempotent regardless of pre-existing state: + - column missing → created (nullable, no default initially) + - column present without default → default set + - any rows with NULL → backfilled to 50000 + - column made NOT NULL + + Re-runnable: ALTER SET DEFAULT to the same value is a no-op; UPDATE + affecting 0 rows is a no-op; ALTER SET NOT NULL on an already-NOT-NULL + column is a no-op. + """ + # 1. Add the column if missing — do NOT specify NOT NULL or DEFAULT here, + # so existing rows (if any from create_all) aren't blocked. + op.execute( + "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER" + ) + # 2. Ensure the DDL default is set so future inserts that omit this + # column (raw SQL, restored backups, manual migrations) get 50000. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000" + ) + # 3. Backfill any rows that were created before the default landed. + op.execute( + "UPDATE agents SET context_window_tokens = 50000 " + "WHERE context_window_tokens IS NULL" + ) + # 4. Now safe to enforce NOT NULL. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL" + ) + + +def downgrade() -> None: + # Downgrade omitted — dropping the column would lose per-tenant tuning. 
+ pass diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index ca4cef7c7..02e21f5bf 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,7 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget router = APIRouter(tags=["feishu"]) @@ -657,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession ) _pre_sess = _pre_sess_r.scalar_one_or_none() _history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id + # Load extra raw material so app-level token-aware helper has room to choose history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) history_msgs = history_result.scalars().all() history = _build_llm_history_from_chat_messages(list(reversed(history_msgs))) @@ -1374,11 +1375,12 @@ async def _handle_feishu_file( # Load conversation history for LLM context from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE + # Load extra raw material so app-level token-aware helper has room to choose _hist_r = await db.execute( _select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) _history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all()))) @@ -1632,15 +1634,18 @@ async def _call_agent_llm( # Build conversation messages (without system prompt — call_llm 
adds it) messages: list[dict] = [] - from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE + from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS if history: - # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool - # pairs intact. Today _normalize_history_messages drops DB role="tool_call" - # rows, so this path has no tool messages and the helper acts as plain count - # truncation; the safety kicks in once a feishu reorganization helper exists. + # Pair-aware truncation: token budget primary, message count as safety cap. + # Today _normalize_history_messages drops DB role="tool_call" rows, so this + # path has no tool messages and the pair guard is a no-op; the safety kicks + # in once a feishu reorganization helper exists. messages.extend( - truncate_by_message_count(_normalize_history_messages(history), ctx_size) + truncate_by_token_budget( + _normalize_history_messages(history), tok_budget, message_cap=ctx_size, + ) ) messages.append({"role": "user", "content": user_text}) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 6f17ad1e5..8d79deeff 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,7 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -214,7 +214,9 @@ async def websocket_chat( role_description = agent.role_description or "" welcome_message = agent.welcome_message or "" ctx_size = agent.context_window_size or 100 - logger.info(f"[WS] 
Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}") + from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS + logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok") # Load the agent's primary model if agent.primary_model_id: @@ -300,11 +302,14 @@ async def websocket_chat( logger.info(f"[WS] Selected primary session {conv_id}") try: + # Load extra raw material so the app-level token-aware helper + # (truncate_by_token_budget below) has room to choose from. + _db_load_cap = max(ctx_size, 500) history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(_db_load_cap) ) history_messages = list(reversed(history_result.scalars().all())) logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}") @@ -663,12 +668,30 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # Pair-aware truncation: keep the last `ctx_size` messages while - # preserving assistant.tool_calls ↔ role=tool blocks atomically. - # Naive [-ctx_size:] slicing can leave orphan tool messages at the - # head when the cut lands mid-pair, which OpenAI rejects with - # "No tool call found for function call output" (issue #446). - _truncated = truncate_by_message_count(conversation, ctx_size) + # Pair-aware truncation with a token budget plus a message-count + # safety cap. Either bound stops the walk; pairs (assistant.tool_calls + # ↔ role=tool) are kept atomic. Token budget protects against + # one-tool-result-eats-the-window scenarios; message cap protects + # against pathological tiny-message floods. 
The pair guard fixes + # the orphan-tool failure mode reported in #446. + # + # The current user message (just appended at line ~416) is excluded + # from truncation and re-appended after — otherwise a single huge + # input (large paste, base64 image_data) could push past the budget + # and cause the helper to drop the very message we're answering. + # If the input itself exceeds the model's context, the provider will + # surface a clear error rather than silently dropping it here. + _current = ( + conversation[-1] + if conversation and conversation[-1].get("role") == "user" + else None + ) + _history = conversation[:-1] if _current is not None else conversation + _truncated = truncate_by_token_budget( + _history, tok_budget, message_cap=ctx_size, + ) + if _current is not None: + _truncated.append(_current) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/models/agent.py b/backend/app/models/agent.py index 5c48d7bf9..2493aaedd 100644 --- a/backend/app/models/agent.py +++ b/backend/app/models/agent.py @@ -15,6 +15,11 @@ # (see: https://github.com/dataelement/Clawith/issues/238). DEFAULT_CONTEXT_WINDOW_SIZE = 100 +# Default token budget for in-context history. Conservative for 128K-context +# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent +# override via Agent.context_window_tokens. +DEFAULT_CONTEXT_WINDOW_TOKENS = 50000 + class Agent(Base): """Digital employee (Agent) instance. @@ -81,6 +86,24 @@ class Agent(Base): last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) tokens_used_total: Mapped[int] = mapped_column(Integer, default=0) context_window_size: Mapped[int] = mapped_column(Integer, default=100) + # Token-aware secondary bound on history sent to the LLM. Truncation uses + # the smaller of context_window_size (message count) and this token budget, + # preserving assistant.tool_calls ↔ role=tool pairs intact. 
+ # + # ``server_default`` matters: alembic/versions/0000_initial_schema.py uses + # ``Base.metadata.create_all`` which reads model state at runtime. Without + # a server_default, fresh-DB bootstrap would create this column NOT NULL + # without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration + # later in the chain would short-circuit, leaving direct-SQL inserts + # broken. ``server_default="50000"`` ensures the DDL has the default + # whether the column was created by create_all or by the explicit + # migration. + context_window_tokens: Mapped[int] = mapped_column( + Integer, + default=DEFAULT_CONTEXT_WINDOW_TOKENS, + server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS), + nullable=False, + ) max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50) # Trigger limits (per-agent, configurable from Settings UI) diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py index 0ae5d8e34..eb475d9c2 100644 --- a/backend/app/schemas/schemas.py +++ b/backend/app/schemas/schemas.py @@ -251,6 +251,7 @@ class AgentOut(BaseModel): max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None context_window_size: int = 100 + context_window_tokens: int = 50000 max_tool_rounds: int = 50 max_triggers: int = 20 min_poll_interval_min: int = 5 @@ -286,6 +287,7 @@ class AgentUpdate(BaseModel): primary_model_id: uuid.UUID | None = None fallback_model_id: uuid.UUID | None = None context_window_size: int | None = Field(default=None, ge=1, le=500) + context_window_tokens: int | None = Field(default=None, ge=1000, le=500000) max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None max_tool_rounds: int | None = None diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py index 54c0ce8a6..a3d63817c 100644 --- a/backend/app/services/history_window.py +++ b/backend/app/services/history_window.py @@ -11,19 +11,26 @@ between an assistant message and its tool results. 
This is the failure mode reported in issue #446. -Orphan detection is by ``tool_call_id`` matching, not by adjacency — a -tool message inserted between a valid pair and other messages (from -malformed persistence or upstream truncation) is dropped, not folded -into an adjacent block. This makes the helper robust against orphans -at any position, not just at the slice head. +Two public entry points: + - ``truncate_by_message_count`` — bound by message count + - ``truncate_by_token_budget`` — bound by estimated token cost (and an + optional message-count safety cap); preferred for production paths + where one tool result can dwarf 50 short messages. Input is expected to be in OpenAI chat-completion format (post-reorganization -from DB ``role="tool_call"`` rows). +from DB ``role="tool_call"`` rows). Helper is tolerant of malformed input — +unmatched tool messages at any position are silently dropped. """ from __future__ import annotations -from typing import Any +import json +from typing import Any, Callable + +from app.services.token_tracker import estimate_tokens_from_chars + + +# ── Block detection (shared between truncators) ───────────────────────── def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: @@ -60,38 +67,24 @@ def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: return orphans -def truncate_by_message_count( - messages: list[dict[str, Any]], - max_messages: int, -) -> list[dict[str, Any]]: - """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. - - A "block" is either: - - a single non-tool, non-tool-calling message (user / system / assistant text), or - - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` - message (identified by ``tool_call_id``, not adjacency). - - Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` - messages — those whose ``tool_call_id`` has no matching assistant — are - silently dropped regardless of budget.
Sending them to OpenAI causes the - #446 error. +def _identify_blocks(messages: list[dict[str, Any]]) -> list[set[int]]: + """Group conversation entries into atomic blocks. - Args: - messages: Conversation list in OpenAI format. Empty list is fine. - max_messages: Soft upper bound on the number of returned entries. - Values ``<= 0`` return ``[]``. + A block is a set of indices that must be kept (or dropped) together: + - ``{i}`` for a single non-tool, non-tool-calling message + - ``{asst_idx, tool_idx_1, tool_idx_2, ...}`` for an assistant that + emitted N tool_calls plus its matching tool result messages, + identified by ``tool_call_id`` (not by adjacency — orphan tools + inserted between are dropped, not folded into the block). - Returns: - A new list (input is never mutated) of at most ``max_messages`` entries - from the tail of ``messages``, with all tool-call pairs intact. + Returned tail-to-head: most recent block first. Orphan tool messages + (those whose tool_call_id has no matching assistant.tool_calls) are + silently dropped — never appear in any block. """ - if max_messages <= 0 or not messages: - return [] - orphans = _identify_orphans(messages) n = len(messages) + blocks: list[set[int]] = [] consumed: set[int] = set(orphans) # orphans drop unconditionally - blocks: list[set[int]] = [] # tail-to-head order for i in range(n - 1, -1, -1): if i in consumed: @@ -151,16 +144,127 @@ def truncate_by_message_count( consumed.add(i) blocks.append({i}) - # Walk blocks tail-to-head, taking until budget exhausted. + return blocks + + +def _walk_blocks( + messages: list[dict[str, Any]], + budgets_ok: Callable[[int, int], bool], + consume: Callable[[int, int], None], +) -> list[dict[str, Any]]: + """Common walker used by both truncators. + + ``budgets_ok(block_msg_count, block_token_cost)`` returns True if the + block fits. ``consume`` updates remaining budget when a block is taken. + Stops on first non-fitting block (atomic — never partial-include). 
+ """ + blocks = _identify_blocks(messages) keep: set[int] = set() - budget = max_messages for block in blocks: size = len(block) - if size <= budget: - keep |= block - budget -= size - else: - # Block doesn't fit — stop. Do NOT partial-include (would split pair). + token_cost = sum(_estimate_msg_tokens(messages[k]) for k in block) + if not budgets_ok(size, token_cost): break - + keep |= block + consume(size, token_cost) return [messages[k] for k in sorted(keep)] + + +def _estimate_msg_tokens(msg: dict[str, Any]) -> int: + """Estimate token cost for one message via JSON-serialized char count. + + Slight overestimate (JSON keys/quotes inflate vs the tokenizer's view of + the structured payload), which is the safe direction — better to truncate + a bit early than send too much and OOM the model. + """ + try: + serialized = json.dumps(msg, ensure_ascii=False, default=str) + except (TypeError, ValueError): + # Fallback for unserializable payloads (shouldn't happen in practice) + serialized = str(msg) + return estimate_tokens_from_chars(len(serialized)) + + +# ── Public API ────────────────────────────────────────────────────────── + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool message (``user``/``system``/``assistant`` text), or + - an ``assistant`` with ``tool_calls`` plus every immediately-following + ``role="tool"`` message (the assistant's tool results). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages with no matching assistant are always dropped, regardless of + budget — sending them to OpenAI causes the #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. 
+ + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + remaining = [max_messages] + + def budgets_ok(size: int, _tok: int) -> bool: + return size <= remaining[0] + + def consume(size: int, _tok: int) -> None: + remaining[0] -= size + + return _walk_blocks(messages, budgets_ok, consume) + + +def truncate_by_token_budget( + messages: list[dict[str, Any]], + token_budget: int, + *, + message_cap: int | None = None, +) -> list[dict[str, Any]]: + """Keep tail messages within both bounds, preserving tool-call pairs. + + The two bounds work together: a block is included only if both the + remaining token budget and (when set) remaining message cap can absorb + its full size. The first bound to be exhausted stops the walk. + + Token cost per message is an overestimate based on JSON-serialized char + count divided by ~3 (see ``_estimate_msg_tokens``). This is intentional: + for budget enforcement, overestimating is safe. + + Args: + messages: Conversation list in OpenAI format. + token_budget: Soft upper bound on cumulative estimated tokens. + Values ``<= 0`` return ``[]``. + message_cap: Optional secondary bound on entry count. When set, the + walk stops as soon as either bound is exhausted. + + Returns: + A new list of recent messages within the budget(s), with all + tool-call pairs intact. 
+ """ + if token_budget <= 0 or not messages: + return [] + if message_cap is not None and message_cap <= 0: + return [] + + tok_remaining = [token_budget] + msg_remaining = [message_cap if message_cap is not None else len(messages) + 1] + + def budgets_ok(size: int, tok_cost: int) -> bool: + return size <= msg_remaining[0] and tok_cost <= tok_remaining[0] + + def consume(size: int, tok_cost: int) -> None: + msg_remaining[0] -= size + tok_remaining[0] -= tok_cost + + return _walk_blocks(messages, budgets_ok, consume) diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py index 934216758..c437b5b7d 100644 --- a/backend/tests/test_history_window.py +++ b/backend/tests/test_history_window.py @@ -1,11 +1,14 @@ """Unit tests for pair-aware conversation history truncation. -Validates that ``truncate_by_message_count`` preserves -``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces -orphan tool messages that would trigger the OpenAI #446 failure mode. +Validates that ``truncate_by_message_count`` and ``truncate_by_token_budget`` +preserve ``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never +produces orphan tool messages that would trigger the OpenAI #446 failure mode. 
""" -from app.services.history_window import truncate_by_message_count +from app.services.history_window import ( + truncate_by_message_count, + truncate_by_token_budget, +) # ── Helpers ───────────────────────────────────────────────────────────── @@ -258,6 +261,135 @@ def test_system_message_treated_as_normal_block(): assert _roles(out) == ["user", "assistant"] +# ── Token-budget mode ─────────────────────────────────────────────────── + + +def test_token_budget_empty_or_zero(): + assert truncate_by_token_budget([], 1000) == [] + assert truncate_by_token_budget([_u("hi")], 0) == [] + assert truncate_by_token_budget([_u("hi")], -10) == [] + + +def test_token_budget_short_messages_within_budget(): + msgs = [_u("hi"), _a("hello"), _u("ok")] + out = truncate_by_token_budget(msgs, 10000) + assert out == msgs + + +def test_token_budget_huge_message_dropped(): + """One enormous tool result should not push other messages out of context; + the huge block just doesn't fit and is dropped.""" + huge_payload = "x" * 60000 # ~20K tokens via chars/3 + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge_payload), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + # u2 fits (small). Pair is huge → doesn't fit → break. u1 not visited (after pair). + assert _roles(out) == ["user"] + assert out[0]["content"] == "u2" + + +def test_token_budget_preserves_pair_when_both_fit(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content="small result"), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + assert _roles(out) == ["user", "assistant", "tool", "user"] + + +def test_token_budget_drops_huge_tail_message_caller_must_protect(): + """The helper walks tail-to-head. If the LAST message (often the current + user input that just arrived) alone exceeds the budget, the walker can't + fit it and breaks — leaving NOTHING. 
Callers that pass the current user + message INTO truncation must hold it OUT and re-append afterward (see + api/websocket.py for the pattern). This test pins the contract so call + sites don't silently regress. + """ + msgs = [ + _u("history msg 1"), + _a("history msg 2"), + _u("x" * 60000), # current input — alone >> 5000 token budget + ] + out = truncate_by_token_budget(msgs, token_budget=5000) + # Walker visits the huge user msg first (tail), can't fit → BREAK. + # Nothing else gets a chance because the loop short-circuits on first miss. + assert out == [] + + +def test_token_budget_with_message_cap_message_wins(): + """100 small messages fit token budget but message_cap=10 binds first.""" + msgs = [_u(f"m{k}") for k in range(100)] + out = truncate_by_token_budget(msgs, 100000, message_cap=10) + assert len(out) == 10 + # Last 10 messages, in order + assert [m["content"] for m in out] == [f"m{k}" for k in range(90, 100)] + + +def test_token_budget_with_message_cap_token_wins(): + """20 fat messages, message_cap loose, token budget binds first.""" + msgs = [_u("x" * 1500) for _ in range(20)] # JSON-serialized ~1527 chars → ~509 tokens + out = truncate_by_token_budget(msgs, 2000, message_cap=100) + # Token budget binds first (well below message_cap=100). Exact count depends + # on JSON overhead but is small enough that several messages don't fit. 
+ assert 0 < len(out) < 10 + assert len(out) < len(msgs) + + +def test_token_budget_orphan_tool_dropped(): + msgs = [ + _t("ORPHAN_X", content="ghost"), + _u("u1"), + _a("ok"), + ] + out = truncate_by_token_budget(msgs, 100000) + # Orphan dropped regardless of budget + assert "tool" not in _roles(out) + + +def test_token_budget_drops_large_pair_atomically(): + """Pair size 2 with first half tiny but second half huge — drop both.""" + huge = "x" * 9000 # ~3000 tokens + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 1000) # fits u2 (~5 tokens), not pair + assert _roles(out) == ["user"] + + +def test_token_budget_zero_message_cap(): + msgs = [_u("hi")] + assert truncate_by_token_budget(msgs, 1000, message_cap=0) == [] + + +def test_token_budget_invariant_no_orphan_in_realistic_long_chat(): + """End-to-end with mixed message sizes — verify no orphan tool leaks.""" + msgs: list[dict] = [_u("start")] + for k in range(15): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result of call {k}")) + msgs.append(_u(f"followup {k}")) + out = truncate_by_token_budget(msgs, 1500, message_cap=20) + + # No orphan tool + assistant_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + assistant_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + assert m["tool_call_id"] in assistant_call_ids + + def test_realistic_long_conversation_truncation(): """End-to-end: simulate a long chat with many tool-call turns and ensure the output never has orphan tools.""" diff --git a/frontend/src/i18n/en.json b/frontend/src/i18n/en.json index f47375aa6..44972143a 100644 --- a/frontend/src/i18n/en.json +++ b/frontend/src/i18n/en.json @@ -421,7 +421,9 @@ "noFallback": "No fallback", "conversationContext": "Context Window", "maxRounds": "Context Window Size", - "roundsDesc": 
"Number of recent messages included as context for each LLM request", + "roundsDesc": "Maximum number of recent messages included as context for each LLM request", + "maxTokens": "Context Token Budget", + "tokensDesc": "Maximum tokens of history per LLM request. Used together with the message-count cap; whichever bound hits first wins. Protects against a single large tool result blowing out context.", "tokenLimits": "Token Limits", "dailyLimit": "Daily Limit", "monthlyLimit": "Monthly Limit", diff --git a/frontend/src/i18n/zh.json b/frontend/src/i18n/zh.json index 77446d99d..cc861e462 100644 --- a/frontend/src/i18n/zh.json +++ b/frontend/src/i18n/zh.json @@ -428,7 +428,9 @@ "noFallback": "无备选", "conversationContext": "上下文窗口", "maxRounds": "上下文窗口大小", - "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量", + "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量(条数上限)", + "maxTokens": "上下文 token 预算", + "tokensDesc": "每次 LLM 请求时携带的历史 token 上限。与上下文窗口大小(条数)取较小值,防止单次大工具结果撑爆 context", "tokenLimits": "Token 用量限制", "dailyLimit": "每日上限", "monthlyLimit": "每月上限", diff --git a/frontend/src/pages/AgentDetail.tsx b/frontend/src/pages/AgentDetail.tsx index 7e8dd59c0..2d16b403f 100644 --- a/frontend/src/pages/AgentDetail.tsx +++ b/frontend/src/pages/AgentDetail.tsx @@ -2214,6 +2214,7 @@ function AgentDetailInner() { primary_model_id: '', fallback_model_id: '', context_window_size: 100, + context_window_tokens: 50000, max_tool_rounds: 50, max_tokens_per_day: '' as string | number, max_tokens_per_month: '' as string | number, @@ -2233,6 +2234,7 @@ function AgentDetailInner() { primary_model_id: agent.primary_model_id || '', fallback_model_id: agent.fallback_model_id || '', context_window_size: agent.context_window_size ?? 100, + context_window_tokens: (agent as any).context_window_tokens ?? 50000, max_tool_rounds: (agent as any).max_tool_rounds ?? 
50, max_tokens_per_day: agent.max_tokens_per_day || '', max_tokens_per_month: agent.max_tokens_per_month || '', @@ -5741,6 +5743,7 @@ function AgentDetailInner() { settingsForm.primary_model_id !== (agent?.primary_model_id || '') || settingsForm.fallback_model_id !== (agent?.fallback_model_id || '') || settingsForm.context_window_size !== (agent?.context_window_size ?? 100) || + settingsForm.context_window_tokens !== ((agent as any)?.context_window_tokens ?? 50000) || settingsForm.max_tool_rounds !== ((agent as any)?.max_tool_rounds ?? 50) || String(settingsForm.max_tokens_per_day) !== String(agent?.max_tokens_per_day || '') || String(settingsForm.max_tokens_per_month) !== String(agent?.max_tokens_per_month || '') || @@ -5757,6 +5760,7 @@ function AgentDetailInner() { primary_model_id: settingsForm.primary_model_id || null, fallback_model_id: settingsForm.fallback_model_id || null, context_window_size: settingsForm.context_window_size, + context_window_tokens: settingsForm.context_window_tokens, max_tool_rounds: settingsForm.max_tool_rounds, max_tokens_per_day: settingsForm.max_tokens_per_day ? Number(settingsForm.max_tokens_per_day) : null, max_tokens_per_month: settingsForm.max_tokens_per_month ? Number(settingsForm.max_tokens_per_month) : null, @@ -5885,6 +5889,20 @@ function AgentDetailInner() { />
{t('agent.settings.roundsDesc')}
+
+ + setSettingsForm(f => ({ ...f, context_window_tokens: Math.max(1000, Math.min(500000, parseInt(e.target.value) || 50000)) }))} + style={{ width: '120px' }} + /> +
{t('agent.settings.tokensDesc')}
+
{/* Max Tool Call Rounds */} diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index e7138f051..e29f740c8 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -36,6 +36,7 @@ export interface Agent { last_heartbeat_at?: string; timezone?: string; context_window_size?: number; + context_window_tokens?: number; agent_type?: 'native' | 'openclaw'; openclaw_last_seen?: string; unread_count?: number;