"""add llm context_window_tokens

Adds the ``context_window_tokens`` integer column to the ``llm_models`` table.

Revision ID: add_llm_context_window_tokens
Revises: add_user_tenant_onboarding
Create Date: 2026-05-12 00:00:00.000000
"""

from alembic import op


# Alembic revision identifiers: this revision and the one it builds on.
revision = "add_llm_context_window_tokens"
down_revision = "add_user_tenant_onboarding"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the ``context_window_tokens`` INTEGER column to ``llm_models``.

    The column is declared without NOT NULL or a default, so existing rows
    get NULL.
    """
    # Raw SQL with IF NOT EXISTS makes the statement a no-op when the column
    # is already present.
    # NOTE(review): ADD COLUMN IF NOT EXISTS is not portable to every
    # database backend - confirm the deployment target supports it.
    op.execute("ALTER TABLE llm_models ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER")


def downgrade() -> None:
    """Remove the ``context_window_tokens`` column from ``llm_models``."""
    # IF EXISTS keeps the downgrade safe to run when the column is absent.
    op.execute("ALTER TABLE llm_models DROP COLUMN IF EXISTS context_window_tokens")
app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut from app.services.feishu_service import feishu_service +from app.services.history_window import ( + token_budget_from_context_window, + truncate_by_message_count, + truncate_by_token_budget, +) router = APIRouter(tags=["feishu"]) @@ -1633,9 +1638,23 @@ async def _call_agent_llm( messages: list[dict] = [] from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE + user_message = {"role": "user", "content": user_text} if history: - messages.extend(_normalize_history_messages(history)[-ctx_size:]) - messages.append({"role": "user", "content": user_text}) + _normalized_history = _normalize_history_messages(history) + _conversation = [*_normalized_history, user_message] + _token_budget = token_budget_from_context_window( + getattr(model, "context_window_tokens", None) + ) + if _token_budget: + messages.extend( + truncate_by_token_budget(_conversation, ctx_size, _token_budget) + ) + else: + messages.extend( + truncate_by_message_count(_conversation, ctx_size) + ) + else: + messages.append(user_message) # Use actual user_id so the system prompt knows who it's chatting with effective_user_id = user_id or agent_id diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 0b9e37ff9..f629354a3 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -19,6 +19,11 @@ from app.models.llm import LLMModel from app.models.user import User from app.services.chat_session_service import ensure_primary_platform_session +from app.services.history_window import ( + token_budget_from_context_window, + truncate_by_message_count, + truncate_by_token_budget, +) from app.services.llm import call_llm, call_llm_with_failover router = APIRouter(tags=["websocket"]) @@ -775,10 +780,22 @@ async def _call_with_failover(): async def _on_failover(reason: str): await websocket.send_json({"type": "info", 
"content": f"Primary model error, {reason}"}) - # To prevent tool call message pairs(assistant + tool) from being broken down. - _truncated = conversation[-ctx_size:] - while _truncated and _truncated[0].get("role") == "tool": - _truncated.pop(0) + # Pair-aware truncation: keep the last `ctx_size` messages while + # preserving assistant.tool_calls ↔ role=tool blocks atomically. + # Naive [-ctx_size:] slicing can leave orphan tool messages at the + # head when the cut lands mid-pair, which OpenAI rejects with + # "No tool call found for function call output" (issue #446). + _token_budget = token_budget_from_context_window( + getattr(effective_llm_model, "context_window_tokens", None) + ) + if _token_budget: + _truncated = truncate_by_token_budget( + conversation, + ctx_size, + _token_budget, + ) + else: + _truncated = truncate_by_message_count(conversation, ctx_size) # Per-(user, agent) onboarding. With no row, prepend the # greeting prompt and mark the pair as "greeted" once it diff --git a/backend/app/models/llm.py b/backend/app/models/llm.py index cbe5fddd2..0d791bb8d 100644 --- a/backend/app/models/llm.py +++ b/backend/app/models/llm.py @@ -28,6 +28,8 @@ class LLMModel(Base): temperature: Mapped[float | None] = mapped_column(Float, nullable=True) request_timeout: Mapped[int | None] = mapped_column(Integer, nullable=True) # Request timeout in seconds, default 120 max_output_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True) # Per-model output token limit override + # Model input context window used for local history fallback. 
+ context_window_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True) created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now()) updated_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), onupdate=func.now() diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py index 46e0b3023..a5d00218d 100644 --- a/backend/app/schemas/schemas.py +++ b/backend/app/schemas/schemas.py @@ -399,6 +399,7 @@ class LLMModelCreate(BaseModel): enabled: bool = True supports_vision: bool = False max_output_tokens: int | None = None + context_window_tokens: int | None = Field(None, ge=1) request_timeout: int | None = None class LLMModelUpdate(BaseModel): @@ -412,6 +413,7 @@ class LLMModelUpdate(BaseModel): enabled: bool | None = None supports_vision: bool | None = None max_output_tokens: int | None = None + context_window_tokens: int | None = Field(None, ge=1) request_timeout: int | None = None @@ -427,6 +429,7 @@ class LLMModelOut(BaseModel): enabled: bool supports_vision: bool = False max_output_tokens: int | None = None + context_window_tokens: int | None = None request_timeout: int | None = None created_at: datetime diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py new file mode 100644 index 000000000..37acf4a04 --- /dev/null +++ b/backend/app/services/history_window.py @@ -0,0 +1,223 @@ +"""Tool-block-safe conversation history truncation. + +Replaces naive ``conversation[-N:]`` slicing with a walker that keeps +``assistant.tool_calls`` and their matching ``role="tool"`` messages as an +atomic block — never half a pair, never orphan tool messages. + +Why: OpenAI Responses API and Chat Completions both reject input where a +``function_call_output`` / ``role="tool"`` message has no matching +``function_call`` / ``assistant.tool_calls`` earlier in the input. 
import json
from typing import Any

# Share of a model's declared context window that the truncated history may
# occupy (the budget is ``int(window * ratio)``).
# NOTE(review): presumably the remainder is headroom for the system prompt and
# the model's output - confirm with the callers.
TOKEN_BUDGET_CONTEXT_RATIO = 0.8


def _assistant_tool_call_ids(message: dict[str, Any]) -> list[str]:
    """Return the non-empty string tool-call ids declared by an assistant message.

    Returns ``[]`` for non-assistant messages, for assistants without a
    ``tool_calls`` list, and skips entries that are not dicts or whose ``id``
    is missing, empty, or not a string.
    """
    if message.get("role") != "assistant":
        return []
    tool_calls = message.get("tool_calls")
    if not isinstance(tool_calls, list):
        return []
    return [
        call["id"]
        for call in tool_calls
        if isinstance(call, dict) and isinstance(call.get("id"), str) and call["id"]
    ]


def _has_unidentified_tool_calls(message: dict[str, Any]) -> bool:
    """Return True when an assistant declares a tool call without a usable id.

    Such a call can never be matched to a ``role="tool"`` result, so the
    assistant message cannot be part of a valid tool block.
    """
    if message.get("role") != "assistant":
        return False
    tool_calls = message.get("tool_calls")
    if not isinstance(tool_calls, list):
        return False
    return len(_assistant_tool_call_ids(message)) != len(tool_calls)


def _safe_history_blocks(messages: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
    """Build API-safe message blocks in original order.

    A block is either a single message that declares no tool calls, or an
    assistant message with tool calls followed by the contiguous
    ``role="tool"`` results that answer every declared ``tool_call_id``.

    Dropped (never emitted in any block):
      - ``role="tool"`` messages that do not directly follow their owning
        assistant (orphans, delayed results);
      - non-matching or duplicate results inside a tool-result run;
      - assistant tool-call blocks with a missing result;
      - assistant messages declaring a tool call without a usable string id,
        together with the tool-result run that follows them. Passing such a
        message through would send ``tool_calls`` with no results.

    Args:
        messages: Conversation in OpenAI chat-completion format.

    Returns:
        A new list of blocks; the input list and its dicts are not mutated.
    """
    blocks: list[list[dict[str, Any]]] = []
    total = len(messages)
    index = 0

    while index < total:
        message = messages[index]

        if message.get("role") == "tool":
            # A tool result reached outside a run owned by a tool-calling
            # assistant is an orphan: skip it.
            index += 1
            continue

        tool_call_ids = _assistant_tool_call_ids(message)
        malformed = _has_unidentified_tool_calls(message)
        if not tool_call_ids and not malformed:
            # Plain message (user / system / assistant text): its own block.
            blocks.append([message])
            index += 1
            continue

        required = set(tool_call_ids)
        seen: set[str] = set()
        block = [message]
        cursor = index + 1

        # Consume the whole contiguous tool-result run so that rejected
        # results cannot resurface later as orphans.
        while cursor < total and messages[cursor].get("role") == "tool":
            result = messages[cursor]
            result_id = result.get("tool_call_id")
            if isinstance(result_id, str) and result_id in required and result_id not in seen:
                seen.add(result_id)
                block.append(result)
            cursor += 1

        # Keep the block only when every declared call is identifiable and
        # answered; otherwise drop it instead of inventing synthetic results.
        if not malformed and seen == required:
            blocks.append(block)
        index = cursor

    return blocks


def token_budget_from_context_window(
    context_window_tokens: int | None,
    ratio: float = TOKEN_BUDGET_CONTEXT_RATIO,
) -> int | None:
    """Return the fallback history token budget from an explicit model window.

    Args:
        context_window_tokens: The model's configured context window, or
            ``None`` when it is not configured.
        ratio: Fraction of the window to use; clamped to ``[0.0, 1.0]``.

    Returns:
        ``None`` when the window is missing, zero, or negative; otherwise
        ``int(context_window_tokens * ratio)``, but never less than 1.
    """
    if context_window_tokens is None or context_window_tokens <= 0:
        return None
    clamped = min(max(ratio, 0.0), 1.0)
    return max(int(context_window_tokens * clamped), 1)


def _estimate_message_tokens(message: dict[str, Any]) -> int:
    """Roughly estimate one OpenAI-format message's token footprint.

    The message is serialized to compact JSON (``str()`` when it is not JSON
    serializable) and the character count is handed to
    ``estimate_tokens_from_chars``.
    """
    # Imported here rather than at module level so the block-walking helpers
    # can be imported without the token tracker being available.
    from app.services.token_tracker import estimate_tokens_from_chars

    try:
        serialized = json.dumps(message, ensure_ascii=False, separators=(",", ":"))
    except (TypeError, ValueError):
        serialized = str(message)
    return estimate_tokens_from_chars(len(serialized))


def _estimate_block_tokens(block: list[dict[str, Any]]) -> int:
    """Return the summed token estimate of every message in ``block``."""
    return sum(_estimate_message_tokens(message) for message in block)


def truncate_by_message_count(
    messages: list[dict[str, Any]],
    max_messages: int,
) -> list[dict[str, Any]]:
    """Keep at most ``max_messages`` recent messages, preserving tool-call pairs.

    A "block" is either:
      - a single non-tool, non-tool-calling message (user / system / assistant text), or
      - an ``assistant`` with ``tool_calls`` plus every matching contiguous
        ``role="tool"`` message.

    Blocks are atomic: included whole or not at all. Orphan ``role="tool"``
    messages and invalid assistant tool-call blocks are dropped regardless of
    budget. Selection walks backwards from the newest block and stops at the
    first block that does not fit, so older blocks are never pulled in past a
    skipped one.

    Args:
        messages: Conversation list in OpenAI format. Empty list is fine.
        max_messages: Upper bound on the number of returned entries.
            Values ``<= 0`` return ``[]``.

    Returns:
        A new list (input is never mutated) of at most ``max_messages`` entries
        from the tail of ``messages``, with all tool-call pairs intact. The
        result is empty when the newest block alone exceeds ``max_messages``.
    """
    if max_messages <= 0 or not messages:
        return []

    selected: list[list[dict[str, Any]]] = []
    remaining = max_messages
    for block in reversed(_safe_history_blocks(messages)):
        if len(block) > remaining:
            # Block doesn't fit: stop rather than split a pair.
            break
        selected.append(block)
        remaining -= len(block)

    # ``selected`` is newest-first; restore chronological order.
    return [message for block in reversed(selected) for message in block]


def truncate_by_token_budget(
    messages: list[dict[str, Any]],
    max_messages: int,
    max_tokens: int,
    *,
    keep_latest_block: bool = True,
) -> list[dict[str, Any]]:
    """Keep recent API-safe blocks within message and token budgets.

    This is a provider-safe fallback, not summary compaction: old blocks are
    dropped, never summarized, and tool-call blocks remain atomic.

    Args:
        messages: Conversation list in OpenAI format.
        max_messages: Message-count budget. Values ``<= 0`` return ``[]``.
        max_tokens: Estimated-token budget. Values ``<= 0`` return ``[]``.
        keep_latest_block: When true (default), the newest block is retained
            even if it alone exceeds either budget, so the current turn is
            not removed; both budgets are then floored at zero.

    Returns:
        A new list of messages in chronological order; the input is not
        mutated.
    """
    if max_messages <= 0 or max_tokens <= 0 or not messages:
        return []

    selected: list[list[dict[str, Any]]] = []
    messages_left = max_messages
    tokens_left = max_tokens

    for block in reversed(_safe_history_blocks(messages)):
        size = len(block)
        tokens = _estimate_block_tokens(block)

        if size <= messages_left and tokens <= tokens_left:
            selected.append(block)
            messages_left -= size
            tokens_left -= tokens
        elif keep_latest_block and not selected:
            # Oversized newest block: keep it anyway and exhaust the budgets.
            selected.append(block)
            messages_left = max(messages_left - size, 0)
            tokens_left = max(tokens_left - tokens, 0)
        else:
            break

    # ``selected`` is newest-first; restore chronological order.
    return [message for block in reversed(selected) for message in block]
"""Unit tests for pair-aware conversation history truncation.

Validates that ``truncate_by_message_count`` preserves
``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces
orphan tool messages that would trigger the OpenAI #446 failure mode.

Also covers ``token_budget_from_context_window`` and the token-budget variant
``truncate_by_token_budget``.
"""

from app.services.history_window import (
    token_budget_from_context_window,
    truncate_by_message_count,
    truncate_by_token_budget,
)


# ── Helpers ─────────────────────────────────────────────────────────────


def _u(text: str) -> dict:
    """Build a user message."""
    return {"role": "user", "content": text}


def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict:
    """Build an assistant message; ``tool_calls`` is only set when non-empty."""
    msg: dict = {"role": "assistant", "content": text}
    if tool_calls:
        msg["tool_calls"] = tool_calls
    return msg


def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict:
    """Build one function tool-call entry for an assistant's ``tool_calls``."""
    return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}}


def _t(call_id: str, content: str = "ok") -> dict:
    """Build a ``role="tool"`` result answering ``call_id``."""
    return {"role": "tool", "tool_call_id": call_id, "content": content}


def _roles(msgs: list[dict]) -> list[str]:
    """Return the role of each message (``"?"`` when missing)."""
    return [m.get("role", "?") for m in msgs]


def _content(msgs: list[dict]) -> list[str | None]:
    """Return the ``content`` of each message."""
    return [m.get("content") for m in msgs]


# ── Edge cases ──────────────────────────────────────────────────────────


def test_empty_input_returns_empty():
    """An empty conversation yields an empty result."""
    assert truncate_by_message_count([], 10) == []


def test_zero_or_negative_budget_returns_empty():
    """A non-positive message budget yields an empty result."""
    msgs = [_u("hi"), _u("there")]
    assert truncate_by_message_count(msgs, 0) == []
    assert truncate_by_message_count(msgs, -5) == []


def test_within_budget_returns_all():
    """A conversation that fits is returned whole, as a new list object."""
    msgs = [_u("a"), _a("b"), _u("c")]
    out = truncate_by_message_count(msgs, 10)
    assert out == msgs
    assert out is not msgs  # new list


def test_input_not_mutated():
    """Truncation must leave the caller's list unchanged."""
    msgs = [_u("a"), _a("b"), _u("c"), _u("d")]
    snapshot = list(msgs)
    truncate_by_message_count(msgs, 2)
    assert msgs == snapshot


# ── Core pair-preservation behavior ─────────────────────────────────────


def test_keeps_assistant_tool_pair_intact():
    """Slicing must not split assistant.tool_calls from its tool result."""
    msgs = [
        _u("hi"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X"),
        _u("done?"),
    ]
    # Budget 3: the trailing user (1) plus the whole pair (2) fit exactly;
    # the leading user is dropped.
    out = truncate_by_message_count(msgs, 3)
    assert _roles(out) == ["assistant", "tool", "user"]
    assert out[0]["tool_calls"][0]["id"] == "X"
    assert out[1]["tool_call_id"] == "X"


def test_drops_pair_entirely_when_budget_too_small():
    """If budget can't fit the whole pair, drop it — never half."""
    msgs = [
        _u("hi"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X"),
        _u("done?"),
    ]
    # Budget 2: the trailing user takes 1, leaving 1 — too small for the
    # 2-message pair, so selection stops there.
    out = truncate_by_message_count(msgs, 2)
    assert _roles(out) == ["user"]
    assert out[0]["content"] == "done?"


def test_drops_orphan_tool_at_head():
    """A role=tool with no preceding assistant.tool_calls is dropped."""
    msgs = [
        _t("X"),  # orphan — no assistant before
        _u("hi"),
        _a("ok"),
    ]
    out = truncate_by_message_count(msgs, 10)
    assert _roles(out) == ["user", "assistant"]


def test_drops_orphan_tool_at_head_after_slicing():
    """A cut that would land mid-pair must not leave an orphan tool at the
    head (the classic #446 failure mode)."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X"),  # ← naive slice [-3:] would start here as orphan
        _u("u2"),
        _a("final"),
    ]
    # Budget 3, walking blocks from the end: a("final") (1), u("u2") (1),
    # leaving 1 — the 2-message pair does not fit, so selection stops.
    # Result: [u("u2"), a("final")].
    out = truncate_by_message_count(msgs, 3)
    assert "tool" not in _roles(out)
    # No orphan tool_call_id reaches output
    for m in out:
        if m.get("role") == "tool":
            raise AssertionError(f"Orphan tool leaked: {m}")


def test_multiple_parallel_tool_calls_in_one_assistant():
    """Assistant with N tool_calls followed by N tools is one atomic block."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
        _t("X"),
        _t("Y"),
        _t("Z"),
        _u("u2"),
    ]
    # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0
    out = truncate_by_message_count(msgs, 5)
    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
    # Verify the pair came through whole
    assert out[0]["tool_calls"][0]["id"] == "X"
    assert out[3]["tool_call_id"] == "Z"


def test_parallel_tool_pair_dropped_if_too_big():
    """A multi-result block that exceeds the remaining budget is dropped whole."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
        _t("X"),
        _t("Y"),
        _t("Z"),
        _u("u2"),
    ]
    # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u].
    out = truncate_by_message_count(msgs, 3)
    assert _roles(out) == ["user"]


def test_incomplete_parallel_tool_block_dropped_even_with_budget():
    """Assistant declares multiple tool calls; missing any result invalidates
    the whole block."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X"), _tc("Y")]),
        _t("X"),
        _u("u2"),
    ]
    out = truncate_by_message_count(msgs, 10)
    assert _roles(out) == ["user", "user"]
    assert all(m.get("role") != "tool" for m in out)
    assert all(not m.get("tool_calls") for m in out)


def test_delayed_tool_result_after_user_does_not_complete_block():
    """A tool result must be in the contiguous tool-result run after its
    assistant. Delayed results are invalid and dropped as orphans."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X")]),
        _u("intervening user"),
        _t("X"),
        _a("final"),
    ]
    out = truncate_by_message_count(msgs, 10)
    assert _roles(out) == ["user", "user", "assistant"]
    assert "tool" not in _roles(out)
    assert all(not m.get("tool_calls") for m in out)


def test_duplicate_tool_result_dropped_without_invalidating_complete_block():
    """A repeated result for an already-answered call is dropped; the first
    result and its assistant survive."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X", "first"),
        _t("X", "duplicate"),
        _u("u2"),
    ]
    out = truncate_by_message_count(msgs, 10)
    assert _roles(out) == ["user", "assistant", "tool", "user"]
    tool_results = [m for m in out if m.get("role") == "tool"]
    assert len(tool_results) == 1
    assert tool_results[0]["content"] == "first"


def test_multiple_pairs_some_drop():
    """With two pairs and a tight budget, the newer pair is kept and the
    older one dropped whole."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("A")]),
        _t("A"),
        _u("u2"),
        _a(None, tool_calls=[_tc("B")]),
        _t("B"),
        _u("u3"),
    ]
    # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2,
    # take u("u2") (1) → budget=1, pair A (2) doesn't fit.
    # Output: u2, a+B, t(B), u3.
    out = truncate_by_message_count(msgs, 5)
    assert _roles(out) == ["user", "assistant", "tool", "user"]
    assert out[1]["tool_calls"][0]["id"] == "B"
    assert out[2]["tool_call_id"] == "B"


def test_no_partial_pair_when_budget_exactly_one_short():
    """Exactly one short of fitting a pair → drop the pair, don't include
    just the assistant."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X"),
    ]
    # Budget 2: the newest block is the pair (size 2), which fits exactly and
    # uses the whole budget, so u("u1") is dropped. Output: [a+tc, t].
    out = truncate_by_message_count(msgs, 2)
    assert _roles(out) == ["assistant", "tool"]
    # Budget 1: the pair (size 2) is tried first and does not fit, so
    # selection stops immediately — u("u1") is never considered even though
    # it would fit. Output: [].
    out2 = truncate_by_message_count(msgs, 1)
    assert out2 == []


def test_mid_orphan_tool_dropped():
    """A tool whose tool_call_id has no matching assistant nearby — defensive
    drop. (Shouldn't happen with current persistence, but be robust.)"""
    msgs = [
        _u("u1"),
        _t("ORPHAN_X"),  # malformed — no preceding assistant.tool_calls
        _u("u2"),
    ]
    out = truncate_by_message_count(msgs, 10)
    # Orphan dropped
    assert "tool" not in _roles(out)
    assert _roles(out) == ["user", "user"]


def test_orphan_adjacent_to_valid_pair_still_dropped():
    """Orphan tool message inserted right after a legitimate tool-call pair
    must be dropped — adjacency to a valid pair does not legitimize it.

    This is the bug class that triggers OpenAI #446 even when slice cut
    boundaries would otherwise be safe: any orphan reaching the wire,
    regardless of position, makes the request invalid."""
    msgs = [
        _u("u1"),
        _a(None, tool_calls=[_tc("VALID")]),
        _t("VALID", "real result"),
        _t("ORPHAN_id", "ghost result"),  # no assistant emits ORPHAN_id
        _u("u2"),
    ]
    out = truncate_by_message_count(msgs, 10)

    # The orphan must NOT survive — even though it's adjacent to a valid pair
    orphan_present = any(
        m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id"
        for m in out
    )
    assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped"

    # The valid pair survives intact
    valid_assistant = any(
        m.get("role") == "assistant"
        and m.get("tool_calls")
        and any(tc["id"] == "VALID" for tc in m["tool_calls"])
        for m in out
    )
    valid_tool = any(
        m.get("role") == "tool" and m.get("tool_call_id") == "VALID"
        for m in out
    )
    assert valid_assistant and valid_tool


def test_system_message_treated_as_normal_block():
    """System messages get no special protection: they are ordinary
    single-message blocks and can be truncated away."""
    msgs = [
        {"role": "system", "content": "you are an agent"},
        _u("hi"),
        _a("hello"),
    ]
    out = truncate_by_message_count(msgs, 2)
    # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped.
    assert _roles(out) == ["user", "assistant"]


def test_realistic_long_conversation_truncation():
    """End-to-end: simulate a long chat with many tool-call turns and ensure
    the output never has orphan tools."""
    msgs: list[dict] = [_u("start")]
    for k in range(20):
        msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")]))
        msgs.append(_t(f"call_{k}", content=f"result {k}"))
        msgs.append(_u(f"next {k}"))
    msgs.append(_a("final answer"))

    # Truncate to 30 messages
    out = truncate_by_message_count(msgs, 30)

    # Sanity: budget respected
    assert len(out) <= 30

    # Critical invariant: no orphan tool messages anywhere
    seen_tool_call_ids: set[str] = set()
    for m in out:
        if m.get("role") == "assistant" and m.get("tool_calls"):
            for tc in m["tool_calls"]:
                seen_tool_call_ids.add(tc["id"])
    for m in out:
        if m.get("role") == "tool":
            tcid = m.get("tool_call_id")
            assert tcid in seen_tool_call_ids, (
                f"Orphan tool {tcid!r} in output without matching assistant.tool_calls"
            )


# ── Token-budget fallback behavior ──────────────────────────────────────
# NOTE(review): the numeric budgets below rely on the per-message estimate
# produced by ``estimate_tokens_from_chars``, which is defined outside this
# file — confirm the thresholds if that estimator changes.


def test_token_budget_from_context_window_uses_eighty_percent():
    """The default budget is 80% of the configured context window."""
    assert token_budget_from_context_window(128_000) == 102_400
    assert token_budget_from_context_window(200_000) == 160_000


def test_token_budget_from_context_window_requires_explicit_positive_window():
    """Missing, zero, or negative windows yield no token budget."""
    assert token_budget_from_context_window(None) is None
    assert token_budget_from_context_window(0) is None
    assert token_budget_from_context_window(-1) is None


def test_token_budget_truncation_drops_old_large_blocks():
    """An old oversized message is dropped while newer small ones are kept."""
    msgs = [
        _u("old-" + ("x" * 900)),
        _a("middle"),
        _u("latest"),
    ]
    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=80)
    assert _content(out) == ["middle", "latest"]


def test_token_budget_truncation_keeps_tool_block_atomic():
    """A tool block that fits the token budget is kept whole."""
    msgs = [
        _u("old-" + ("x" * 900)),
        _a(None, tool_calls=[_tc("X")]),
        _t("X", "result"),
        _u("latest"),
    ]
    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=120)
    assert _roles(out) == ["assistant", "tool", "user"]
    assert out[0]["tool_calls"][0]["id"] == "X"
    assert out[1]["tool_call_id"] == "X"


def test_token_budget_truncation_drops_whole_tool_block_if_it_does_not_fit():
    """A tool block exceeding the token budget is dropped whole, and the
    walk stops there, so the older user message is not returned either."""
    msgs = [
        _u("old"),
        _a(None, tool_calls=[_tc("X")]),
        _t("X", "result-" + ("x" * 900)),
        _u("latest"),
    ]
    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=80)
    assert _roles(out) == ["user"]
    assert out[0]["content"] == "latest"


def test_token_budget_truncation_keeps_latest_block_when_over_budget():
    """The newest block is kept by default even when it alone exceeds the
    token budget."""
    msgs = [
        _u("old"),
        _u("latest-" + ("x" * 900)),
    ]
    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=10)
    assert _roles(out) == ["user"]
    assert out[0]["content"].startswith("latest-")


def test_token_budget_truncation_still_drops_invalid_blocks():
    """Block validation applies under token budgeting too: an unanswered
    tool-calling assistant and a delayed tool result are both dropped."""
    msgs = [
        _u("old"),
        _a(None, tool_calls=[_tc("X")]),
        _u("latest"),
        _t("X", "delayed orphan"),
    ]
    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=1000)
    assert _roles(out) == ["user", "user"]
    assert "tool" not in _roles(out)