From 114be2465789aa29f3d6ee0e56da471a1fffeaa4 Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:06:56 +0800 Subject: [PATCH 1/2] fix(history): atomic pair-aware truncation for tool_call blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace naive `conversation[-ctx_size:]` slicing with a walker that treats assistant.tool_calls and its matching role="tool" messages as one atomic block. Naive slicing can leave an orphan role="tool" at the head when the cut lands mid-pair — OpenAI rejects this with "No tool call found for function call output" (issue #446). New helper `services/history_window.truncate_by_message_count` walks tail backward, identifies blocks, and includes blocks whole or not at all. Orphan tools are silently dropped regardless of budget. Replaces head-only pop guard in: - app/api/websocket.py (web chat) - app/api/feishu.py (feishu channel) Leaves app/services/llm/caller.py:626 untouched (call_agent_llm short-reply path's hardcoded [-10:] is intentional). Tests: 15 covering empty input, budget edges, parallel tool_calls, multi-pair, head/mid orphans, realistic 60-message invariant. Addresses #446 failure mode. 
Co-Authored-By: Claude Opus 4.7 --- backend/app/api/feishu.py | 9 +- backend/app/api/websocket.py | 11 +- backend/app/services/history_window.py | 166 ++++++++++++++ backend/tests/test_history_window.py | 288 +++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 backend/app/services/history_window.py create mode 100644 backend/tests/test_history_window.py diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index 00fe32737..ca4cef7c7 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,6 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service +from app.services.history_window import truncate_by_message_count router = APIRouter(tags=["feishu"]) @@ -1634,7 +1635,13 @@ async def _call_agent_llm( from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE if history: - messages.extend(_normalize_history_messages(history)[-ctx_size:]) + # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool + # pairs intact. Today _normalize_history_messages drops DB role="tool_call" + # rows, so this path has no tool messages and the helper acts as plain count + # truncation; the safety kicks in once a feishu reorganization helper exists. 
+ messages.extend( + truncate_by_message_count(_normalize_history_messages(history), ctx_size) + ) messages.append({"role": "user", "content": user_text}) # Use actual user_id so the system prompt knows who it's chatting with diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 82b297916..6f17ad1e5 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,6 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session +from app.services.history_window import truncate_by_message_count from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -662,10 +663,12 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # To prevent tool call message pairs(assistant + tool) from being broken down. - _truncated = conversation[-ctx_size:] - while _truncated and _truncated[0].get("role") == "tool": - _truncated.pop(0) + # Pair-aware truncation: keep the last `ctx_size` messages while + # preserving assistant.tool_calls ↔ role=tool blocks atomically. + # Naive [-ctx_size:] slicing can leave orphan tool messages at the + # head when the cut lands mid-pair, which OpenAI rejects with + # "No tool call found for function call output" (issue #446). + _truncated = truncate_by_message_count(conversation, ctx_size) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py new file mode 100644 index 000000000..54c0ce8a6 --- /dev/null +++ b/backend/app/services/history_window.py @@ -0,0 +1,166 @@ +"""Pair-aware conversation history truncation. 
+ +Replaces naive ``conversation[-N:]`` slicing with a walker that keeps +``assistant.tool_calls`` and their matching ``role="tool"`` messages as an +atomic block — never half a pair, never orphan tool messages. + +Why: OpenAI Responses API and Chat Completions both reject input where a +``function_call_output`` / ``role="tool"`` message has no matching +``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive +``[-N:]`` slicing can leave such orphans at the head when the cut lands +between an assistant message and its tool results. This is the failure mode +reported in issue #446. + +Orphan detection is by ``tool_call_id`` matching, not by adjacency — a +tool message inserted between a valid pair and other messages (from +malformed persistence or upstream truncation) is dropped, not folded +into an adjacent block. This makes the helper robust against orphans +at any position, not just at the slice head. + +Input is expected to be in OpenAI chat-completion format (post-reorganization +from DB ``role="tool_call"`` rows). +""" + +from __future__ import annotations + +from typing import Any + + +def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: + """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has + no matching ``assistant.tool_calls`` earlier in the conversation. + + OpenAI rejects the request the moment a ``function_call_output`` is + sent without its matching ``function_call``, regardless of whether + that tool message is at the head, middle, or end. So orphan detection + is by ID matching, not by position. + """ + orphans: set[int] = set() + for i, msg in enumerate(messages): + if msg.get("role") != "tool": + continue + tcid = msg.get("tool_call_id") + if not tcid: + orphans.add(i) + continue + # Search backward for an assistant whose tool_calls contains this id. + # Walks past intervening user / system / other-assistant messages. 
+ found = False + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + found = True + break + j -= 1 + if not found: + orphans.add(i) + return orphans + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool, non-tool-calling message (user / system / assistant text), or + - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` + message (identified by ``tool_call_id``, not adjacency). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages — those whose ``tool_call_id`` has no matching assistant — are + silently dropped regardless of budget. Sending them to OpenAI causes the + #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. + + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. 
+ """ + if max_messages <= 0 or not messages: + return [] + + orphans = _identify_orphans(messages) + n = len(messages) + consumed: set[int] = set(orphans) # orphans drop unconditionally + blocks: list[set[int]] = [] # tail-to-head order + + for i in range(n - 1, -1, -1): + if i in consumed: + continue + msg = messages[i] + role = msg.get("role") + + if role == "tool": + # Find this tool's owning assistant by matching tool_call_id + tcid = msg.get("tool_call_id") + asst_idx = -1 + j = i - 1 + while j >= 0: + m = messages[j] + if m.get("role") == "assistant" and m.get("tool_calls"): + ids = {tc.get("id") for tc in m["tool_calls"]} + if tcid in ids: + asst_idx = j + break + j -= 1 + if asst_idx < 0: + # Defensive — orphan detection should have caught this + consumed.add(i) + continue + # Block = assistant + ALL of its matching tool messages (siblings) + asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]} + block = {asst_idx} + for k in range(asst_idx + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + elif role == "assistant" and msg.get("tool_calls"): + # Encountered the assistant before any of its tools (e.g. tools + # were truncated upstream or are still in flight). Group with + # whatever matching tools follow it. + asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]} + block = {i} + for k in range(i + 1, n): + if k in consumed: + continue + m = messages[k] + if ( + m.get("role") == "tool" + and m.get("tool_call_id") in asst_tc_ids + ): + block.add(k) + consumed |= block + blocks.append(block) + else: + consumed.add(i) + blocks.append({i}) + + # Walk blocks tail-to-head, taking until budget exhausted. + keep: set[int] = set() + budget = max_messages + for block in blocks: + size = len(block) + if size <= budget: + keep |= block + budget -= size + else: + # Block doesn't fit — stop. 
Do NOT partial-include (would split pair). + break + + return [messages[k] for k in sorted(keep)] diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py new file mode 100644 index 000000000..934216758 --- /dev/null +++ b/backend/tests/test_history_window.py @@ -0,0 +1,288 @@ +"""Unit tests for pair-aware conversation history truncation. + +Validates that ``truncate_by_message_count`` preserves +``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces +orphan tool messages that would trigger the OpenAI #446 failure mode. +""" + +from app.services.history_window import truncate_by_message_count + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _u(text: str) -> dict: + return {"role": "user", "content": text} + + +def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict: + msg: dict = {"role": "assistant", "content": text} + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + +def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict: + return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}} + + +def _t(call_id: str, content: str = "ok") -> dict: + return {"role": "tool", "tool_call_id": call_id, "content": content} + + +def _roles(msgs: list[dict]) -> list[str]: + return [m.get("role", "?") for m in msgs] + + +# ── Edge cases ────────────────────────────────────────────────────────── + + +def test_empty_input_returns_empty(): + assert truncate_by_message_count([], 10) == [] + + +def test_zero_or_negative_budget_returns_empty(): + msgs = [_u("hi"), _u("there")] + assert truncate_by_message_count(msgs, 0) == [] + assert truncate_by_message_count(msgs, -5) == [] + + +def test_within_budget_returns_all(): + msgs = [_u("a"), _a("b"), _u("c")] + out = truncate_by_message_count(msgs, 10) + assert out == msgs + assert out is not msgs # new list + + +def test_input_not_mutated(): + msgs = [_u("a"), _a("b"), 
_u("c"), _u("d")] + snapshot = list(msgs) + truncate_by_message_count(msgs, 2) + assert msgs == snapshot + + +# ── Core pair-preservation behavior ───────────────────────────────────── + + +def test_keeps_assistant_tool_pair_intact(): + """Slicing must not split assistant.tool_calls from its tool result.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["assistant", "tool", "user"] + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[1]["tool_call_id"] == "X" + + +def test_drops_pair_entirely_when_budget_too_small(): + """If budget can't fit the whole pair, drop it — never half.""" + msgs = [ + _u("hi"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + _u("done?"), + ] + # Budget 2 — can't fit pair (needs 2) + final user, must drop pair + out = truncate_by_message_count(msgs, 2) + # Only the trailing user fits as a single block; pair (size 2) doesn't fit + # in remaining budget=1 after taking user. + assert _roles(out) == ["user"] + assert out[0]["content"] == "done?" + + +def test_drops_orphan_tool_at_head(): + """A role=tool with no preceding assistant.tool_calls is dropped.""" + msgs = [ + _t("X"), # orphan — no assistant before + _u("hi"), + _a("ok"), + ] + out = truncate_by_message_count(msgs, 10) + assert _roles(out) == ["user", "assistant"] + + +def test_drops_orphan_tool_at_head_after_slicing(): + """Slicing produces an orphan tool at head — must be dropped (the + classic #446 failure mode).""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), # ← naive slice [-3:] would start here as orphan + _u("u2"), + _a("final"), + ] + # Budget 3: take from end. _a("final") block. _u("u2") block. Then t(X) + # alone — orphan, dropped. Pair (a+tc, t) doesn't get full chance because + # we'd need budget 5 to include from start. Result: [u("u2"), a("final")]. 
+ out = truncate_by_message_count(msgs, 3) + assert "tool" not in _roles(out) + # No orphan tool_call_id reaches output + for m in out: + if m.get("role") == "tool": + raise AssertionError(f"Orphan tool leaked: {m}") + + +def test_multiple_parallel_tool_calls_in_one_assistant(): + """Assistant with N tool_calls followed by N tools is one atomic block.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]), + _t("X"), + _t("Y"), + _t("Z"), + _u("u2"), + ] + # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0 + out = truncate_by_message_count(msgs, 5) + assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"] + # Verify the pair came through whole + assert out[0]["tool_calls"][0]["id"] == "X" + assert out[3]["tool_call_id"] == "Z" + + +def test_parallel_tool_pair_dropped_if_too_big(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]), + _t("X"), + _t("Y"), + _t("Z"), + _u("u2"), + ] + # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u]. + out = truncate_by_message_count(msgs, 3) + assert _roles(out) == ["user"] + + +def test_multiple_pairs_some_drop(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("A")]), + _t("A"), + _u("u2"), + _a(None, tool_calls=[_tc("B")]), + _t("B"), + _u("u3"), + ] + # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3. + out = truncate_by_message_count(msgs, 5) + assert _roles(out) == ["user", "assistant", "tool", "user"] + assert out[1]["tool_calls"][0]["id"] == "B" + assert out[2]["tool_call_id"] == "B" + + +def test_no_partial_pair_when_budget_exactly_one_short(): + """Exactly one short of fitting a pair → drop the pair, don't include + just the assistant.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X"), + ] + # Budget 2: pair size 2, fits → [a+tc, t]. (u dropped to fit pair? 
No — walk + # from end: t(X) goes back to a(tc=X) → pair block (1,2) size 2. Then u (0,0) + # size 1. Take pair first, budget=0. Stop. Output: [a+tc, t] + out = truncate_by_message_count(msgs, 2) + assert _roles(out) == ["assistant", "tool"] + # If only budget 1: pair size 2 doesn't fit. Then look at u (size 1, fits). + # But blocks order is [(1,2), (0,0)] from walk. We try pair first, doesn't + # fit, BREAK. Output: []. + out2 = truncate_by_message_count(msgs, 1) + assert out2 == [] + + +def test_mid_orphan_tool_dropped(): + """A tool whose tool_call_id has no matching assistant nearby — defensive + drop. (Shouldn't happen with current persistence, but be robust.)""" + msgs = [ + _u("u1"), + _t("ORPHAN_X"), # malformed — no preceding assistant.tool_calls + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + # Orphan dropped + assert "tool" not in _roles(out) + assert _roles(out) == ["user", "user"] + + +def test_orphan_adjacent_to_valid_pair_still_dropped(): + """Orphan tool message inserted right after a legitimate tool-call pair + must be dropped — adjacency to a valid pair does not legitimize it. 
+ + This is the bug class that triggers OpenAI #446 even when slice cut + boundaries would otherwise be safe: any orphan reaching the wire, + regardless of position, makes the request invalid.""" + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("VALID")]), + _t("VALID", "real result"), + _t("ORPHAN_id", "ghost result"), # no assistant emits ORPHAN_id + _u("u2"), + ] + out = truncate_by_message_count(msgs, 10) + + # The orphan must NOT survive — even though it's adjacent to a valid pair + orphan_present = any( + m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id" + for m in out + ) + assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped" + + # The valid pair survives intact + valid_assistant = any( + m.get("role") == "assistant" + and m.get("tool_calls") + and any(tc["id"] == "VALID" for tc in m["tool_calls"]) + for m in out + ) + valid_tool = any( + m.get("role") == "tool" and m.get("tool_call_id") == "VALID" + for m in out + ) + assert valid_assistant and valid_tool + + +def test_system_message_treated_as_normal_block(): + msgs = [ + {"role": "system", "content": "you are an agent"}, + _u("hi"), + _a("hello"), + ] + out = truncate_by_message_count(msgs, 2) + # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped. 
+ assert _roles(out) == ["user", "assistant"] + + +def test_realistic_long_conversation_truncation(): + """End-to-end: simulate a long chat with many tool-call turns and ensure + the output never has orphan tools.""" + msgs: list[dict] = [_u("start")] + for k in range(20): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result {k}")) + msgs.append(_u(f"next {k}")) + msgs.append(_a("final answer")) + + # Truncate to 30 messages + out = truncate_by_message_count(msgs, 30) + + # Sanity: budget respected + assert len(out) <= 30 + + # Critical invariant: no orphan tool messages anywhere + seen_tool_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + seen_tool_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + tcid = m.get("tool_call_id") + assert tcid in seen_tool_call_ids, ( + f"Orphan tool {tcid!r} in output without matching assistant.tool_calls" + ) From 2a0040fef35e69706030afde273f7f43f80f5adc Mon Sep 17 00:00:00 2001 From: zhongyua Date: Mon, 27 Apr 2026 13:32:48 +0800 Subject: [PATCH 2/2] feat(agent): token-aware history truncation alongside message cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `agents.context_window_tokens` field (default 50000) and a new `truncate_by_token_budget` helper that bounds in-context history by both estimated token cost (primary) and message count (safety cap), preserving assistant.tool_calls ↔ role=tool pairs intact via the same walker as truncate_by_message_count. Why: message-count alone is a wildly variable proxy for token cost — one 50KB tool result eats more context than 100 short user messages. Token budget gives predictable behavior across heterogeneous traffic; message cap remains as a safety net against pathological tiny-message floods. 
Changes: - models/agent.py: + context_window_tokens (Integer, default=50000) + DEFAULT_CONTEXT_WINDOW_TOKENS constant - schemas/schemas.py: AgentOut, AgentUpdate (1000 <= tokens <= 500000) - alembic: add_context_window_tokens.py (idempotent IF NOT EXISTS) - services/history_window.py: + truncate_by_token_budget, refactored common walker, JSON-serialized char->token estimate via existing estimate_tokens_from_chars (chars/3 — overestimates safely) - api/websocket.py: pass tok_budget to helper, raise DB load to max(ctx_size, 500) so helper has room to choose - api/feishu.py: same pattern at 2 sites (web chat + IM channel paths) - frontend: AgentDetail Settings slider + i18n + types 10 new tests covering token-budget mode (huge-message dropped, both-bounds interaction, atomic pair preservation, orphan defense). 25/25 pass. Other channels (dingtalk/discord/slack/teams/wecom/whatsapp) still use DB-level message-count limit only — they don't get token awareness in this PR but won't crash. Migrating them is follow-up scope. 
Co-Authored-By: Claude Opus 4.7 --- .../versions/add_context_window_tokens.py | 65 ++++++ backend/app/api/feishu.py | 23 ++- backend/app/api/websocket.py | 41 +++- backend/app/models/agent.py | 23 +++ backend/app/schemas/schemas.py | 2 + backend/app/services/history_window.py | 186 ++++++++++++++---- backend/tests/test_history_window.py | 140 ++++++++++++- frontend/src/i18n/en.json | 4 +- frontend/src/i18n/zh.json | 4 +- frontend/src/pages/AgentDetail.tsx | 18 ++ frontend/src/types/index.ts | 1 + 11 files changed, 442 insertions(+), 65 deletions(-) create mode 100644 backend/alembic/versions/add_context_window_tokens.py diff --git a/backend/alembic/versions/add_context_window_tokens.py b/backend/alembic/versions/add_context_window_tokens.py new file mode 100644 index 000000000..bf385417e --- /dev/null +++ b/backend/alembic/versions/add_context_window_tokens.py @@ -0,0 +1,65 @@ +"""add agents.context_window_tokens for token-aware history truncation + +Revision ID: add_context_window_tokens +Revises: rm_agent_credential_secrets +Create Date: 2026-04-27 +""" + +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = "add_context_window_tokens" +down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add context_window_tokens with a DDL default of 50000. + + The four-step pattern is required because earlier in the migration chain, + ``alembic/versions/0000_initial_schema.py`` calls + ``Base.metadata.create_all(checkfirst=True)``, which creates ``agents`` + from the *current* model state — including any new columns. SQLAlchemy's + Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause, + so the column ends up ``NOT NULL`` with no default, and a naive + ``ADD COLUMN IF NOT EXISTS ... 
DEFAULT 50000`` short-circuits and never + sets the default. + + This four-step approach is idempotent regardless of pre-existing state: + - column missing → created (nullable, no default initially) + - column present without default → default set + - any rows with NULL → backfilled to 50000 + - column made NOT NULL + + Re-runnable: ALTER SET DEFAULT to the same value is a no-op; UPDATE + affecting 0 rows is a no-op; ALTER SET NOT NULL on an already-NOT-NULL + column is a no-op. + """ + # 1. Add the column if missing — do NOT specify NOT NULL or DEFAULT here, + # so existing rows (if any from create_all) aren't blocked. + op.execute( + "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER" + ) + # 2. Ensure the DDL default is set so future inserts that omit this + # column (raw SQL, restored backups, manual migrations) get 50000. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000" + ) + # 3. Backfill any rows that were created before the default landed. + op.execute( + "UPDATE agents SET context_window_tokens = 50000 " + "WHERE context_window_tokens IS NULL" + ) + # 4. Now safe to enforce NOT NULL. + op.execute( + "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL" + ) + + +def downgrade() -> None: + # Downgrade omitted — dropping the column would lose per-tenant tuning. 
+ pass diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index ca4cef7c7..02e21f5bf 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -18,7 +18,7 @@ from app.models.identity import IdentityProvider from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget router = APIRouter(tags=["feishu"]) @@ -657,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession ) _pre_sess = _pre_sess_r.scalar_one_or_none() _history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id + # Load extra raw material so app-level token-aware helper has room to choose history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) history_msgs = history_result.scalars().all() history = _build_llm_history_from_chat_messages(list(reversed(history_msgs))) @@ -1374,11 +1375,12 @@ async def _handle_feishu_file( # Load conversation history for LLM context from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE + # Load extra raw material so app-level token-aware helper has room to choose _hist_r = await db.execute( _select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(max(ctx_size, 500)) ) _history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all()))) @@ -1632,15 +1634,18 @@ async def _call_agent_llm( # Build conversation messages (without system prompt — call_llm 
adds it) messages: list[dict] = [] - from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE + from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS if history: - # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool - # pairs intact. Today _normalize_history_messages drops DB role="tool_call" - # rows, so this path has no tool messages and the helper acts as plain count - # truncation; the safety kicks in once a feishu reorganization helper exists. + # Pair-aware truncation: token budget primary, message count as safety cap. + # Today _normalize_history_messages drops DB role="tool_call" rows, so this + # path has no tool messages and the pair guard is a no-op; the safety kicks + # in once a feishu reorganization helper exists. messages.extend( - truncate_by_message_count(_normalize_history_messages(history), ctx_size) + truncate_by_token_budget( + _normalize_history_messages(history), tok_budget, message_cap=ctx_size, + ) ) messages.append({"role": "user", "content": user_text}) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 6f17ad1e5..8d79deeff 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,7 +19,7 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session -from app.services.history_window import truncate_by_message_count +from app.services.history_window import truncate_by_token_budget from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -214,7 +214,9 @@ async def websocket_chat( role_description = agent.role_description or "" welcome_message = agent.welcome_message or "" ctx_size = agent.context_window_size or 100 - logger.info(f"[WS] 
Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}") + from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS + tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS + logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok") # Load the agent's primary model if agent.primary_model_id: @@ -300,11 +302,14 @@ async def websocket_chat( logger.info(f"[WS] Selected primary session {conv_id}") try: + # Load extra raw material so the app-level token-aware helper + # (truncate_by_token_budget below) has room to choose from. + _db_load_cap = max(ctx_size, 500) history_result = await db.execute( select(ChatMessage) .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id) .order_by(ChatMessage.created_at.desc()) - .limit(ctx_size) + .limit(_db_load_cap) ) history_messages = list(reversed(history_result.scalars().all())) logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}") @@ -663,12 +668,30 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) - # Pair-aware truncation: keep the last `ctx_size` messages while - # preserving assistant.tool_calls ↔ role=tool blocks atomically. - # Naive [-ctx_size:] slicing can leave orphan tool messages at the - # head when the cut lands mid-pair, which OpenAI rejects with - # "No tool call found for function call output" (issue #446). - _truncated = truncate_by_message_count(conversation, ctx_size) + # Pair-aware truncation with a token budget plus a message-count + # safety cap. Either bound stops the walk; pairs (assistant.tool_calls + # ↔ role=tool) are kept atomic. Token budget protects against + # one-tool-result-eats-the-window scenarios; message cap protects + # against pathological tiny-message floods. 
The pair guard fixes + # the orphan-tool failure mode reported in #446. + # + # The current user message (just appended at line ~416) is excluded + # from truncation and re-appended after — otherwise a single huge + # input (large paste, base64 image_data) could push past the budget + # and cause the helper to drop the very message we're answering. + # If the input itself exceeds the model's context, the provider will + # surface a clear error rather than silently dropping it here. + _current = ( + conversation[-1] + if conversation and conversation[-1].get("role") == "user" + else None + ) + _history = conversation[:-1] if _current is not None else conversation + _truncated = truncate_by_token_budget( + _history, tok_budget, message_cap=ctx_size, + ) + if _current is not None: + _truncated.append(_current) return await call_llm_with_failover( primary_model=llm_model, diff --git a/backend/app/models/agent.py b/backend/app/models/agent.py index 5c48d7bf9..2493aaedd 100644 --- a/backend/app/models/agent.py +++ b/backend/app/models/agent.py @@ -15,6 +15,11 @@ # (see: https://github.com/dataelement/Clawith/issues/238). DEFAULT_CONTEXT_WINDOW_SIZE = 100 +# Default token budget for in-context history. Conservative for 128K-context +# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent +# override via Agent.context_window_tokens. +DEFAULT_CONTEXT_WINDOW_TOKENS = 50000 + class Agent(Base): """Digital employee (Agent) instance. @@ -81,6 +86,24 @@ class Agent(Base): last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) tokens_used_total: Mapped[int] = mapped_column(Integer, default=0) context_window_size: Mapped[int] = mapped_column(Integer, default=100) + # Token-aware secondary bound on history sent to the LLM. Truncation uses + # the smaller of context_window_size (message count) and this token budget, + # preserving assistant.tool_calls ↔ role=tool pairs intact. 
+ # + # ``server_default`` matters: alembic/versions/0000_initial_schema.py uses + # ``Base.metadata.create_all`` which reads model state at runtime. Without + # a server_default, fresh-DB bootstrap would create this column NOT NULL + # without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration + # later in the chain would short-circuit, leaving direct-SQL inserts + # broken. ``server_default="50000"`` ensures the DDL has the default + # whether the column was created by create_all or by the explicit + # migration. + context_window_tokens: Mapped[int] = mapped_column( + Integer, + default=DEFAULT_CONTEXT_WINDOW_TOKENS, + server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS), + nullable=False, + ) max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50) # Trigger limits (per-agent, configurable from Settings UI) diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py index 0ae5d8e34..eb475d9c2 100644 --- a/backend/app/schemas/schemas.py +++ b/backend/app/schemas/schemas.py @@ -251,6 +251,7 @@ class AgentOut(BaseModel): max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None context_window_size: int = 100 + context_window_tokens: int = 50000 max_tool_rounds: int = 50 max_triggers: int = 20 min_poll_interval_min: int = 5 @@ -286,6 +287,7 @@ class AgentUpdate(BaseModel): primary_model_id: uuid.UUID | None = None fallback_model_id: uuid.UUID | None = None context_window_size: int | None = Field(default=None, ge=1, le=500) + context_window_tokens: int | None = Field(default=None, ge=1000, le=500000) max_tokens_per_day: int | None = None max_tokens_per_month: int | None = None max_tool_rounds: int | None = None diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py index 54c0ce8a6..a3d63817c 100644 --- a/backend/app/services/history_window.py +++ b/backend/app/services/history_window.py @@ -11,19 +11,26 @@ between an assistant message and its tool results. 
This is the failure mode reported in issue #446. -Orphan detection is by ``tool_call_id`` matching, not by adjacency — a -tool message inserted between a valid pair and other messages (from -malformed persistence or upstream truncation) is dropped, not folded -into an adjacent block. This makes the helper robust against orphans -at any position, not just at the slice head. +Two public entry points: + - ``truncate_by_message_count`` — bound by message count + - ``truncate_by_token_budget`` — bound by estimated token cost (and an + optional message-count safety cap); preferred for production paths + where one tool result can dwarf 50 short messages. Input is expected to be in OpenAI chat-completion format (post-reorganization -from DB ``role="tool_call"`` rows). +from DB ``role="tool_call"`` rows). Helper is tolerant of malformed input — +unmatched tool messages at any position are silently dropped. """ from __future__ import annotations -from typing import Any +import json +from typing import Any, Callable + +from app.services.token_tracker import estimate_tokens_from_chars + + +# ── Block detection (shared between truncators) ───────────────────────── def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: @@ -60,38 +67,24 @@ def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]: return orphans -def truncate_by_message_count( - messages: list[dict[str, Any]], - max_messages: int, -) -> list[dict[str, Any]]: - """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. - - A "block" is either: - - a single non-tool, non-tool-calling message (user / system / assistant text), or - - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"`` - message (identified by ``tool_call_id``, not adjacency). - - Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` - messages — those whose ``tool_call_id`` has no matching assistant — are - silently dropped regardless of budget.
Sending them to OpenAI causes the - #446 error. +def _identify_blocks(messages: list[dict[str, Any]]) -> list[set[int]]: + """Group conversation entries into atomic blocks. - Args: - messages: Conversation list in OpenAI format. Empty list is fine. - max_messages: Soft upper bound on the number of returned entries. - Values ``<= 0`` return ``[]``. + A block is a set of indices that must be kept (or dropped) together: + - ``{i}`` for a single non-tool, non-tool-calling message + - ``{asst_idx, tool_idx_1, tool_idx_2, ...}`` for an assistant that + emitted N tool_calls plus its matching tool result messages, + identified by ``tool_call_id`` (not by adjacency — orphan tools + inserted between are dropped, not folded into the block). - Returns: - A new list (input is never mutated) of at most ``max_messages`` entries - from the tail of ``messages``, with all tool-call pairs intact. + Returned tail-to-head: most recent block first. Orphan tool messages + (those whose tool_call_id has no matching assistant.tool_calls) are + silently dropped — never appear in any block. """ - if max_messages <= 0 or not messages: - return [] - orphans = _identify_orphans(messages) n = len(messages) + blocks: list[set[int]] = [] consumed: set[int] = set(orphans) # orphans drop unconditionally - blocks: list[set[int]] = [] # tail-to-head order for i in range(n - 1, -1, -1): if i in consumed: @@ -151,16 +144,127 @@ def truncate_by_message_count( consumed.add(i) blocks.append({i}) - # Walk blocks tail-to-head, taking until budget exhausted. + return blocks + + +def _walk_blocks( + messages: list[dict[str, Any]], + budgets_ok: Callable[[int, int], bool], + consume: Callable[[int, int], None], +) -> list[dict[str, Any]]: + """Common walker used by both truncators. + + ``budgets_ok(block_msg_count, block_token_cost)`` returns True if the + block fits. ``consume`` updates remaining budget when a block is taken. + Stops on first non-fitting block (atomic — never partial-include). 
+ """ + blocks = _identify_blocks(messages) keep: set[int] = set() - budget = max_messages for block in blocks: size = len(block) - if size <= budget: - keep |= block - budget -= size - else: - # Block doesn't fit — stop. Do NOT partial-include (would split pair). + token_cost = sum(_estimate_msg_tokens(messages[k]) for k in block) + if not budgets_ok(size, token_cost): break - + keep |= block + consume(size, token_cost) return [messages[k] for k in sorted(keep)] + + +def _estimate_msg_tokens(msg: dict[str, Any]) -> int: + """Estimate token cost for one message via JSON-serialized char count. + + Slight overestimate (JSON keys/quotes inflate vs the tokenizer's view of + the structured payload), which is the safe direction — better to truncate + a bit early than send too much and OOM the model. + """ + try: + serialized = json.dumps(msg, ensure_ascii=False, default=str) + except (TypeError, ValueError): + # Fallback for unserializable payloads (shouldn't happen in practice) + serialized = str(msg) + return estimate_tokens_from_chars(len(serialized)) + + +# ── Public API ────────────────────────────────────────────────────────── + + +def truncate_by_message_count( + messages: list[dict[str, Any]], + max_messages: int, +) -> list[dict[str, Any]]: + """Keep at most ``max_messages`` recent messages, preserving tool-call pairs. + + A "block" is either: + - a single non-tool message (``user``/``system``/``assistant`` text), or + - an ``assistant`` with ``tool_calls`` plus every immediately-following + ``role="tool"`` message (the assistant's tool results). + + Blocks are atomic: included whole or not at all. Orphan ``role="tool"`` + messages with no matching assistant are always dropped, regardless of + budget — sending them to OpenAI causes the #446 error. + + Args: + messages: Conversation list in OpenAI format. Empty list is fine. + max_messages: Soft upper bound on the number of returned entries. + Values ``<= 0`` return ``[]``. 
+ + Returns: + A new list (input is never mutated) of at most ``max_messages`` entries + from the tail of ``messages``, with all tool-call pairs intact. + """ + if max_messages <= 0 or not messages: + return [] + remaining = [max_messages] + + def budgets_ok(size: int, _tok: int) -> bool: + return size <= remaining[0] + + def consume(size: int, _tok: int) -> None: + remaining[0] -= size + + return _walk_blocks(messages, budgets_ok, consume) + + +def truncate_by_token_budget( + messages: list[dict[str, Any]], + token_budget: int, + *, + message_cap: int | None = None, +) -> list[dict[str, Any]]: + """Keep tail messages within both bounds, preserving tool-call pairs. + + The two bounds work together: a block is included only if both the + remaining token budget and (when set) remaining message cap can absorb + its full size. The first bound to be exhausted stops the walk. + + Token cost per message is an overestimate based on JSON-serialized char + count divided by ~3 (see ``_estimate_msg_tokens``). This is intentional: + for budget enforcement, overestimating is safe. + + Args: + messages: Conversation list in OpenAI format. + token_budget: Soft upper bound on cumulative estimated tokens. + Values ``<= 0`` return ``[]``. + message_cap: Optional secondary bound on entry count. When set, the + walk stops as soon as either bound is exhausted. + + Returns: + A new list of recent messages within the budget(s), with all + tool-call pairs intact. 
+ """ + if token_budget <= 0 or not messages: + return [] + if message_cap is not None and message_cap <= 0: + return [] + + tok_remaining = [token_budget] + msg_remaining = [message_cap if message_cap is not None else len(messages) + 1] + + def budgets_ok(size: int, tok_cost: int) -> bool: + return size <= msg_remaining[0] and tok_cost <= tok_remaining[0] + + def consume(size: int, tok_cost: int) -> None: + msg_remaining[0] -= size + tok_remaining[0] -= tok_cost + + return _walk_blocks(messages, budgets_ok, consume) diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py index 934216758..c437b5b7d 100644 --- a/backend/tests/test_history_window.py +++ b/backend/tests/test_history_window.py @@ -1,11 +1,14 @@ """Unit tests for pair-aware conversation history truncation. -Validates that ``truncate_by_message_count`` preserves -``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces -orphan tool messages that would trigger the OpenAI #446 failure mode. +Validates that ``truncate_by_message_count`` and ``truncate_by_token_budget`` +preserve ``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never +produces orphan tool messages that would trigger the OpenAI #446 failure mode. 
""" -from app.services.history_window import truncate_by_message_count +from app.services.history_window import ( + truncate_by_message_count, + truncate_by_token_budget, +) # ── Helpers ───────────────────────────────────────────────────────────── @@ -258,6 +261,135 @@ def test_system_message_treated_as_normal_block(): assert _roles(out) == ["user", "assistant"] +# ── Token-budget mode ─────────────────────────────────────────────────── + + +def test_token_budget_empty_or_zero(): + assert truncate_by_token_budget([], 1000) == [] + assert truncate_by_token_budget([_u("hi")], 0) == [] + assert truncate_by_token_budget([_u("hi")], -10) == [] + + +def test_token_budget_short_messages_within_budget(): + msgs = [_u("hi"), _a("hello"), _u("ok")] + out = truncate_by_token_budget(msgs, 10000) + assert out == msgs + + +def test_token_budget_huge_message_dropped(): + """One enormous tool result should not push other messages out of context; + the huge block just doesn't fit and is dropped.""" + huge_payload = "x" * 60000 # ~20K tokens via chars/3 + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge_payload), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + # u2 fits (small). Pair is huge → doesn't fit → break. u1 not visited (after pair). + assert _roles(out) == ["user"] + assert out[0]["content"] == "u2" + + +def test_token_budget_preserves_pair_when_both_fit(): + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content="small result"), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 5000) + assert _roles(out) == ["user", "assistant", "tool", "user"] + + +def test_token_budget_drops_huge_tail_message_caller_must_protect(): + """The helper walks tail-to-head. If the LAST message (often the current + user input that just arrived) alone exceeds the budget, the walker can't + fit it and breaks — leaving NOTHING. 
Callers that pass the current user + message INTO truncation must hold it OUT and re-append afterward (see + api/websocket.py for the pattern). This test pins the contract so call + sites don't silently regress. + """ + msgs = [ + _u("history msg 1"), + _a("history msg 2"), + _u("x" * 60000), # current input — alone >> 5000 token budget + ] + out = truncate_by_token_budget(msgs, token_budget=5000) + # Walker visits the huge user msg first (tail), can't fit → BREAK. + # Nothing else gets a chance because the loop short-circuits on first miss. + assert out == [] + + +def test_token_budget_with_message_cap_message_wins(): + """100 small messages fit token budget but message_cap=10 binds first.""" + msgs = [_u(f"m{k}") for k in range(100)] + out = truncate_by_token_budget(msgs, 100000, message_cap=10) + assert len(out) == 10 + # Last 10 messages, in order + assert [m["content"] for m in out] == [f"m{k}" for k in range(90, 100)] + + +def test_token_budget_with_message_cap_token_wins(): + """20 fat messages, message_cap loose, token budget binds first.""" + msgs = [_u("x" * 1500) for _ in range(20)] # JSON-serialized ~1527 chars → ~509 tokens + out = truncate_by_token_budget(msgs, 2000, message_cap=100) + # Token budget binds first (well below message_cap=100). Exact count depends + # on JSON overhead but is small enough that several messages don't fit. 
+ assert 0 < len(out) < 10 + assert len(out) < len(msgs) + + +def test_token_budget_orphan_tool_dropped(): + msgs = [ + _t("ORPHAN_X", content="ghost"), + _u("u1"), + _a("ok"), + ] + out = truncate_by_token_budget(msgs, 100000) + # Orphan dropped regardless of budget + assert "tool" not in _roles(out) + + +def test_token_budget_drops_large_pair_atomically(): + """Pair size 2 with first half tiny but second half huge — drop both.""" + huge = "x" * 9000 # ~3000 tokens + msgs = [ + _u("u1"), + _a(None, tool_calls=[_tc("X")]), + _t("X", content=huge), + _u("u2"), + ] + out = truncate_by_token_budget(msgs, 1000) # fits u2 (~5 tokens), not pair + assert _roles(out) == ["user"] + + +def test_token_budget_zero_message_cap(): + msgs = [_u("hi")] + assert truncate_by_token_budget(msgs, 1000, message_cap=0) == [] + + +def test_token_budget_invariant_no_orphan_in_realistic_long_chat(): + """End-to-end with mixed message sizes — verify no orphan tool leaks.""" + msgs: list[dict] = [_u("start")] + for k in range(15): + msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")])) + msgs.append(_t(f"call_{k}", content=f"result of call {k}")) + msgs.append(_u(f"followup {k}")) + out = truncate_by_token_budget(msgs, 1500, message_cap=20) + + # No orphan tool + assistant_call_ids: set[str] = set() + for m in out: + if m.get("role") == "assistant" and m.get("tool_calls"): + for tc in m["tool_calls"]: + assistant_call_ids.add(tc["id"]) + for m in out: + if m.get("role") == "tool": + assert m["tool_call_id"] in assistant_call_ids + + def test_realistic_long_conversation_truncation(): """End-to-end: simulate a long chat with many tool-call turns and ensure the output never has orphan tools.""" diff --git a/frontend/src/i18n/en.json b/frontend/src/i18n/en.json index f47375aa6..44972143a 100644 --- a/frontend/src/i18n/en.json +++ b/frontend/src/i18n/en.json @@ -421,7 +421,9 @@ "noFallback": "No fallback", "conversationContext": "Context Window", "maxRounds": "Context Window Size", - "roundsDesc": 
"Number of recent messages included as context for each LLM request", + "roundsDesc": "Maximum number of recent messages included as context for each LLM request", + "maxTokens": "Context Token Budget", + "tokensDesc": "Maximum tokens of history per LLM request. Used together with the message-count cap; whichever bound hits first wins. Protects against a single large tool result blowing out context.", "tokenLimits": "Token Limits", "dailyLimit": "Daily Limit", "monthlyLimit": "Monthly Limit", diff --git a/frontend/src/i18n/zh.json b/frontend/src/i18n/zh.json index 77446d99d..cc861e462 100644 --- a/frontend/src/i18n/zh.json +++ b/frontend/src/i18n/zh.json @@ -428,7 +428,9 @@ "noFallback": "无备选", "conversationContext": "上下文窗口", "maxRounds": "上下文窗口大小", - "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量", + "roundsDesc": "每次 LLM 请求时携带的近期历史消息数量(条数上限)", + "maxTokens": "上下文 token 预算", + "tokensDesc": "每次 LLM 请求时携带的历史 token 上限。与上下文窗口大小(条数)取较小值,防止单次大工具结果撑爆 context", "tokenLimits": "Token 用量限制", "dailyLimit": "每日上限", "monthlyLimit": "每月上限", diff --git a/frontend/src/pages/AgentDetail.tsx b/frontend/src/pages/AgentDetail.tsx index 7e8dd59c0..2d16b403f 100644 --- a/frontend/src/pages/AgentDetail.tsx +++ b/frontend/src/pages/AgentDetail.tsx @@ -2214,6 +2214,7 @@ function AgentDetailInner() { primary_model_id: '', fallback_model_id: '', context_window_size: 100, + context_window_tokens: 50000, max_tool_rounds: 50, max_tokens_per_day: '' as string | number, max_tokens_per_month: '' as string | number, @@ -2233,6 +2234,7 @@ function AgentDetailInner() { primary_model_id: agent.primary_model_id || '', fallback_model_id: agent.fallback_model_id || '', context_window_size: agent.context_window_size ?? 100, + context_window_tokens: (agent as any).context_window_tokens ?? 50000, max_tool_rounds: (agent as any).max_tool_rounds ?? 
50, max_tokens_per_day: agent.max_tokens_per_day || '', max_tokens_per_month: agent.max_tokens_per_month || '', @@ -5741,6 +5743,7 @@ function AgentDetailInner() { settingsForm.primary_model_id !== (agent?.primary_model_id || '') || settingsForm.fallback_model_id !== (agent?.fallback_model_id || '') || settingsForm.context_window_size !== (agent?.context_window_size ?? 100) || + settingsForm.context_window_tokens !== ((agent as any)?.context_window_tokens ?? 50000) || settingsForm.max_tool_rounds !== ((agent as any)?.max_tool_rounds ?? 50) || String(settingsForm.max_tokens_per_day) !== String(agent?.max_tokens_per_day || '') || String(settingsForm.max_tokens_per_month) !== String(agent?.max_tokens_per_month || '') || @@ -5757,6 +5760,7 @@ function AgentDetailInner() { primary_model_id: settingsForm.primary_model_id || null, fallback_model_id: settingsForm.fallback_model_id || null, context_window_size: settingsForm.context_window_size, + context_window_tokens: settingsForm.context_window_tokens, max_tool_rounds: settingsForm.max_tool_rounds, max_tokens_per_day: settingsForm.max_tokens_per_day ? Number(settingsForm.max_tokens_per_day) : null, max_tokens_per_month: settingsForm.max_tokens_per_month ? Number(settingsForm.max_tokens_per_month) : null, @@ -5885,6 +5889,20 @@ function AgentDetailInner() { />
{t('agent.settings.roundsDesc')}
+
+ + setSettingsForm(f => ({ ...f, context_window_tokens: Math.max(1000, Math.min(500000, parseInt(e.target.value) || 50000)) }))} + style={{ width: '120px' }} + /> +
{t('agent.settings.tokensDesc')}
+
{/* Max Tool Call Rounds */} diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index e7138f051..e29f740c8 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -36,6 +36,7 @@ export interface Agent { last_heartbeat_at?: string; timezone?: string; context_window_size?: number; + context_window_tokens?: number; agent_type?: 'native' | 'openclaw'; openclaw_last_seen?: string; unread_count?: number;