diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py
index 00fe32737..ca4cef7c7 100644
--- a/backend/app/api/feishu.py
+++ b/backend/app/api/feishu.py
@@ -18,6 +18,7 @@
 from app.models.identity import IdentityProvider
 from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
 from app.services.feishu_service import feishu_service
+from app.services.history_window import truncate_by_message_count
 
 router = APIRouter(tags=["feishu"])
 
@@ -1634,7 +1635,13 @@
     from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
     ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
     if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
+        # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool
+        # pairs intact. Today _normalize_history_messages drops DB role="tool_call"
+        # rows, so this path has no tool messages and the helper acts as plain count
+        # truncation; the safety kicks in once a feishu reorganization helper exists.
+        messages.extend(
+            truncate_by_message_count(_normalize_history_messages(history), ctx_size)
+        )
     messages.append({"role": "user", "content": user_text})
 
     # Use actual user_id so the system prompt knows who it's chatting with
diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
index 82b297916..6f17ad1e5 100644
--- a/backend/app/api/websocket.py
+++ b/backend/app/api/websocket.py
@@ -19,6 +19,7 @@
 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
+from app.services.history_window import truncate_by_message_count
 from app.services.llm import call_llm, call_llm_with_failover
 
 router = APIRouter(tags=["websocket"])
@@ -662,10 +663,12 @@ async def _call_with_failover():
         async def _on_failover(reason: str):
             await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
 
-        # To prevent tool call message pairs(assistant + tool) from being broken down.
-        _truncated = conversation[-ctx_size:]
-        while _truncated and _truncated[0].get("role") == "tool":
-            _truncated.pop(0)
+        # Pair-aware truncation: keep the last `ctx_size` messages while
+        # preserving assistant.tool_calls ↔ role=tool blocks atomically.
+        # Naive [-ctx_size:] slicing can leave orphan tool messages at the
+        # head when the cut lands mid-pair, which OpenAI rejects with
+        # "No tool call found for function call output" (issue #446).
+        _truncated = truncate_by_message_count(conversation, ctx_size)
 
         return await call_llm_with_failover(
             primary_model=llm_model,
diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py
new file mode 100644
index 000000000..54c0ce8a6
--- /dev/null
+++ b/backend/app/services/history_window.py
@@ -0,0 +1,166 @@
+"""Pair-aware conversation history truncation.
+
+Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
+``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
+atomic block — never half a pair, never orphan tool messages.
+
+Why: OpenAI Responses API and Chat Completions both reject input where a
+``function_call_output`` / ``role="tool"`` message has no matching
+``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
+``[-N:]`` slicing can leave such orphans at the head when the cut lands
+between an assistant message and its tool results. This is the failure mode
+reported in issue #446.
+
+Orphan detection is by ``tool_call_id`` matching, not by adjacency — a
+tool message inserted between a valid pair and other messages (from
+malformed persistence or upstream truncation) is dropped, not folded
+into an adjacent block. This makes the helper robust against orphans
+at any position, not just at the slice head.
+
+Input is expected to be in OpenAI chat-completion format (post-reorganization
+from DB ``role="tool_call"`` rows).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]:
+    """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has
+    no matching ``assistant.tool_calls`` earlier in the conversation.
+
+    OpenAI rejects the request the moment a ``function_call_output`` is
+    sent without its matching ``function_call``, regardless of whether
+    that tool message is at the head, middle, or end. So orphan detection
+    is by ID matching, not by position.
+    """
+    orphans: set[int] = set()
+    for i, msg in enumerate(messages):
+        if msg.get("role") != "tool":
+            continue
+        tcid = msg.get("tool_call_id")
+        if not tcid:
+            orphans.add(i)
+            continue
+        # Search backward for an assistant whose tool_calls contains this id.
+        # Walks past intervening user / system / other-assistant messages.
+        found = False
+        j = i - 1
+        while j >= 0:
+            m = messages[j]
+            if m.get("role") == "assistant" and m.get("tool_calls"):
+                ids = {tc.get("id") for tc in m["tool_calls"]}
+                if tcid in ids:
+                    found = True
+                    break
+            j -= 1
+        if not found:
+            orphans.add(i)
+    return orphans
+
+
+def truncate_by_message_count(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+) -> list[dict[str, Any]]:
+    """Keep at most ``max_messages`` recent messages, preserving tool-call pairs.
+
+    A "block" is either:
+    - a single non-tool, non-tool-calling message (user / system / assistant text), or
+    - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"``
+      message (identified by ``tool_call_id``, not adjacency).
+
+    Blocks are atomic: included whole or not at all. Orphan ``role="tool"``
+    messages — those whose ``tool_call_id`` has no matching assistant — are
+    silently dropped regardless of budget. Sending them to OpenAI causes the
+    #446 error.
+
+    Args:
+        messages: Conversation list in OpenAI format. Empty list is fine.
+        max_messages: Hard upper bound on the number of returned entries.
+            Values ``<= 0`` return ``[]``.
+
+    Returns:
+        A new list (input is never mutated) of at most ``max_messages`` entries
+        from the tail of ``messages``, with all tool-call pairs intact.
+    """
+    if max_messages <= 0 or not messages:
+        return []
+
+    orphans = _identify_orphans(messages)
+    n = len(messages)
+    consumed: set[int] = set(orphans)  # orphans drop unconditionally
+    blocks: list[set[int]] = []  # tail-to-head order
+
+    for i in range(n - 1, -1, -1):
+        if i in consumed:
+            continue
+        msg = messages[i]
+        role = msg.get("role")
+
+        if role == "tool":
+            # Find this tool's owning assistant by matching tool_call_id
+            tcid = msg.get("tool_call_id")
+            asst_idx = -1
+            j = i - 1
+            while j >= 0:
+                m = messages[j]
+                if m.get("role") == "assistant" and m.get("tool_calls"):
+                    ids = {tc.get("id") for tc in m["tool_calls"]}
+                    if tcid in ids:
+                        asst_idx = j
+                        break
+                j -= 1
+            if asst_idx < 0:
+                # Defensive — orphan detection should have caught this
+                consumed.add(i)
+                continue
+            # Block = assistant + ALL of its matching tool messages (siblings)
+            asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]}
+            block = {asst_idx}
+            for k in range(asst_idx + 1, n):
+                if k in consumed:
+                    continue
+                m = messages[k]
+                if (
+                    m.get("role") == "tool"
+                    and m.get("tool_call_id") in asst_tc_ids
+                ):
+                    block.add(k)
+            consumed |= block
+            blocks.append(block)
+        elif role == "assistant" and msg.get("tool_calls"):
+            # Encountered the assistant before any of its tools (e.g. tools
+            # were truncated upstream or are still in flight). Group with
+            # whatever matching tools follow it.
+            asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]}
+            block = {i}
+            for k in range(i + 1, n):
+                if k in consumed:
+                    continue
+                m = messages[k]
+                if (
+                    m.get("role") == "tool"
+                    and m.get("tool_call_id") in asst_tc_ids
+                ):
+                    block.add(k)
+            consumed |= block
+            blocks.append(block)
+        else:
+            consumed.add(i)
+            blocks.append({i})
+
+    # Walk blocks tail-to-head, taking until budget exhausted.
+    keep: set[int] = set()
+    budget = max_messages
+    for block in blocks:
+        size = len(block)
+        if size <= budget:
+            keep |= block
+            budget -= size
+        else:
+            # Block doesn't fit — stop. Do NOT partial-include (would split pair).
+            break
+
+    return [messages[k] for k in sorted(keep)]
diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py
new file mode 100644
index 000000000..934216758
--- /dev/null
+++ b/backend/tests/test_history_window.py
@@ -0,0 +1,288 @@
+"""Unit tests for pair-aware conversation history truncation.
+
+Validates that ``truncate_by_message_count`` preserves
+``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces
+orphan tool messages that would trigger the OpenAI rejection from issue #446.
+"""
+
+from app.services.history_window import truncate_by_message_count
+
+
+# ── Helpers ─────────────────────────────────────────────────────────────
+
+
+def _u(text: str) -> dict:
+    return {"role": "user", "content": text}
+
+
+def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict:
+    msg: dict = {"role": "assistant", "content": text}
+    if tool_calls:
+        msg["tool_calls"] = tool_calls
+    return msg
+
+
+def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict:
+    return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}}
+
+
+def _t(call_id: str, content: str = "ok") -> dict:
+    return {"role": "tool", "tool_call_id": call_id, "content": content}
+
+
+def _roles(msgs: list[dict]) -> list[str]:
+    return [m.get("role", "?") for m in msgs]
+
+
+# ── Edge cases ──────────────────────────────────────────────────────────
+
+
+def test_empty_input_returns_empty():
+    assert truncate_by_message_count([], 10) == []
+
+
+def test_zero_or_negative_budget_returns_empty():
+    msgs = [_u("hi"), _u("there")]
+    assert truncate_by_message_count(msgs, 0) == []
+    assert truncate_by_message_count(msgs, -5) == []
+
+
+def test_within_budget_returns_all():
+    msgs = [_u("a"), _a("b"), _u("c")]
+    out = truncate_by_message_count(msgs, 10)
+    assert out == msgs
+    assert out is not msgs  # new list
+
+
+def test_input_not_mutated():
+    msgs = [_u("a"), _a("b"), _u("c"), _u("d")]
+    snapshot = list(msgs)
+    truncate_by_message_count(msgs, 2)
+    assert msgs == snapshot
+
+
+# ── Core pair-preservation behavior ─────────────────────────────────────
+
+
+def test_keeps_assistant_tool_pair_intact():
+    """Slicing must not split assistant.tool_calls from its tool result."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["assistant", "tool", "user"]
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[1]["tool_call_id"] == "X"
+
+
+def test_drops_pair_entirely_when_budget_too_small():
+    """If budget can't fit the whole pair, drop it — never half."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 2 — can't fit pair (needs 2) + final user, must drop pair
+    out = truncate_by_message_count(msgs, 2)
+    # Only the trailing user fits as a single block; pair (size 2) doesn't fit
+    # in remaining budget=1 after taking user.
+    assert _roles(out) == ["user"]
+    assert out[0]["content"] == "done?"
+
+
+def test_drops_orphan_tool_at_head():
+    """A role=tool with no preceding assistant.tool_calls is dropped."""
+    msgs = [
+        _t("X"),  # orphan — no assistant before
+        _u("hi"),
+        _a("ok"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_drops_orphan_tool_at_head_after_slicing():
+    """Slicing produces an orphan tool at head — must be dropped (the
+    classic #446 failure mode)."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),  # ← naive slice [-3:] would start here as orphan
+        _u("u2"),
+        _a("final"),
+    ]
+    # Budget 3: take from the end — _a("final") block (1), _u("u2") block (1).
+    # The (a+tc, t) pair is a single block of size 2 and doesn't fit the
+    # remaining budget of 1, so the walk stops. Result: [u("u2"), a("final")].
+    out = truncate_by_message_count(msgs, 3)
+    assert "tool" not in _roles(out)
+    # No orphan tool_call_id reaches output
+    for m in out:
+        if m.get("role") == "tool":
+            raise AssertionError(f"Orphan tool leaked: {m}")
+
+
+def test_multiple_parallel_tool_calls_in_one_assistant():
+    """Assistant with N tool_calls followed by N tools is one atomic block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
+    # Verify the pair came through whole
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[3]["tool_call_id"] == "Z"
+
+
+def test_parallel_tool_pair_dropped_if_too_big():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u].
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["user"]
+
+
+def test_multiple_pairs_some_drop():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("A")]),
+        _t("A"),
+        _u("u2"),
+        _a(None, tool_calls=[_tc("B")]),
+        _t("B"),
+        _u("u3"),
+    ]
+    # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    assert out[1]["tool_calls"][0]["id"] == "B"
+    assert out[2]["tool_call_id"] == "B"
+
+
+def test_no_partial_pair_when_budget_exactly_one_short():
+    """Exactly one short of fitting a pair → drop the pair, don't include
+    just the assistant."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+    ]
+    # Budget 2: walking from the end, t(X) resolves to its assistant → pair
+    # block {1, 2} of size 2, which fits exactly (budget → 0). The walk then
+    # stops, so u is dropped. Output: [a+tc, t].
+    out = truncate_by_message_count(msgs, 2)
+    assert _roles(out) == ["assistant", "tool"]
+    # With budget 1 the pair (size 2) doesn't fit. Although u alone would fit,
+    # the walk breaks at the first non-fitting block to keep the kept window
+    # contiguous from the tail. Output: [].
+    out2 = truncate_by_message_count(msgs, 1)
+    assert out2 == []
+
+
+def test_mid_orphan_tool_dropped():
+    """A tool whose tool_call_id has no matching assistant nearby — defensive
+    drop. (Shouldn't happen with current persistence, but be robust.)"""
+    msgs = [
+        _u("u1"),
+        _t("ORPHAN_X"),  # malformed — no preceding assistant.tool_calls
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    # Orphan dropped
+    assert "tool" not in _roles(out)
+    assert _roles(out) == ["user", "user"]
+
+
+def test_orphan_adjacent_to_valid_pair_still_dropped():
+    """Orphan tool message inserted right after a legitimate tool-call pair
+    must be dropped — adjacency to a valid pair does not legitimize it.
+
+    This is the bug class behind issue #446, and it bites even when slice
+    cut boundaries would otherwise be safe: any orphan reaching the wire,
+    regardless of position, makes the request invalid."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("VALID")]),
+        _t("VALID", "real result"),
+        _t("ORPHAN_id", "ghost result"),  # no assistant emits ORPHAN_id
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+
+    # The orphan must NOT survive — even though it's adjacent to a valid pair
+    orphan_present = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id"
+        for m in out
+    )
+    assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped"
+
+    # The valid pair survives intact
+    valid_assistant = any(
+        m.get("role") == "assistant"
+        and m.get("tool_calls")
+        and any(tc["id"] == "VALID" for tc in m["tool_calls"])
+        for m in out
+    )
+    valid_tool = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "VALID"
+        for m in out
+    )
+    assert valid_assistant and valid_tool
+
+
+def test_system_message_treated_as_normal_block():
+    msgs = [
+        {"role": "system", "content": "you are an agent"},
+        _u("hi"),
+        _a("hello"),
+    ]
+    out = truncate_by_message_count(msgs, 2)
+    # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped.
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_realistic_long_conversation_truncation():
+    """End-to-end: simulate a long chat with many tool-call turns and ensure
+    the output never has orphan tools."""
+    msgs: list[dict] = [_u("start")]
+    for k in range(20):
+        msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")]))
+        msgs.append(_t(f"call_{k}", content=f"result {k}"))
+        msgs.append(_u(f"next {k}"))
+    msgs.append(_a("final answer"))
+
+    # Truncate to 30 messages
+    out = truncate_by_message_count(msgs, 30)
+
+    # Sanity: budget respected
+    assert len(out) <= 30
+
+    # Critical invariant: no orphan tool messages anywhere
+    seen_tool_call_ids: set[str] = set()
+    for m in out:
+        if m.get("role") == "assistant" and m.get("tool_calls"):
+            for tc in m["tool_calls"]:
+                seen_tool_call_ids.add(tc["id"])
+    for m in out:
+        if m.get("role") == "tool":
+            tcid = m.get("tool_call_id")
+            assert tcid in seen_tool_call_ids, (
+                f"Orphan tool {tcid!r} in output without matching assistant.tool_calls"
+            )