diff --git a/backend/alembic/versions/add_llm_context_window_tokens.py b/backend/alembic/versions/add_llm_context_window_tokens.py
new file mode 100644
index 000000000..d55c47208
--- /dev/null
+++ b/backend/alembic/versions/add_llm_context_window_tokens.py
@@ -0,0 +1,22 @@
+"""add llm context_window_tokens
+
+Revision ID: add_llm_context_window_tokens
+Revises: add_user_tenant_onboarding
+Create Date: 2026-05-12 00:00:00.000000
+"""
+
+from alembic import op
+
+
+revision = "add_llm_context_window_tokens"
+down_revision = "add_user_tenant_onboarding"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:  # adds a nullable INTEGER column; IF NOT EXISTS makes a re-run a no-op
+    op.execute("ALTER TABLE llm_models ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER")
+
+
+def downgrade() -> None:  # drops the column again; IF EXISTS tolerates it already being gone
+    op.execute("ALTER TABLE llm_models DROP COLUMN IF EXISTS context_window_tokens")
diff --git a/backend/app/api/enterprise.py b/backend/app/api/enterprise.py
index 715adf25f..e0799b53f 100644
--- a/backend/app/api/enterprise.py
+++ b/backend/app/api/enterprise.py
@@ -177,6 +177,7 @@ async def add_llm_model(
         enabled=data.enabled,
         supports_vision=data.supports_vision,
         max_output_tokens=data.max_output_tokens,
+        context_window_tokens=data.context_window_tokens,
         request_timeout=data.request_timeout,
         tenant_id=uuid.UUID(tid) if tid else None,
     )
@@ -321,6 +322,8 @@ async def update_llm_model(
             model.supports_vision = data.supports_vision
         if hasattr(data, 'max_output_tokens') and data.max_output_tokens is not None:
             model.max_output_tokens = data.max_output_tokens
+        if 'context_window_tokens' in data.model_fields_set:
+            model.context_window_tokens = data.context_window_tokens
         if hasattr(data, 'request_timeout') and data.request_timeout is not None:
             model.request_timeout = data.request_timeout
 
diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py
index 00fe32737..0c6f7fe64 100644
--- a/backend/app/api/feishu.py
+++ b/backend/app/api/feishu.py
@@ -18,6 +18,11 @@
 from app.models.identity import IdentityProvider
 from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
 from app.services.feishu_service import feishu_service
+from app.services.history_window import (
+    token_budget_from_context_window,
+    truncate_by_message_count,
+    truncate_by_token_budget,
+)
 
 router = APIRouter(tags=["feishu"])
 
@@ -1633,9 +1638,23 @@ async def _call_agent_llm(
     messages: list[dict] = []
     from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
     ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
+    user_message = {"role": "user", "content": user_text}
     if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
-    messages.append({"role": "user", "content": user_text})
+        _normalized_history = _normalize_history_messages(history)
+        _conversation = [*_normalized_history, user_message]
+        _token_budget = token_budget_from_context_window(
+            getattr(model, "context_window_tokens", None)
+        )
+        if _token_budget:
+            messages.extend(
+                truncate_by_token_budget(_conversation, ctx_size, _token_budget)
+            )
+        else:
+            messages.extend(
+                truncate_by_message_count(_conversation, ctx_size)
+            )
+    else:
+        messages.append(user_message)
 
     # Use actual user_id so the system prompt knows who it's chatting with
     effective_user_id = user_id or agent_id
diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
index 0b9e37ff9..f629354a3 100644
--- a/backend/app/api/websocket.py
+++ b/backend/app/api/websocket.py
@@ -19,6 +19,11 @@
 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
+from app.services.history_window import (
+    token_budget_from_context_window,
+    truncate_by_message_count,
+    truncate_by_token_budget,
+)
 from app.services.llm import call_llm, call_llm_with_failover
 
 router = APIRouter(tags=["websocket"])
@@ -775,10 +780,22 @@ async def _call_with_failover():
                         async def _on_failover(reason: str):
                             await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
 
-                        # To prevent tool call message pairs(assistant + tool) from being broken down.
-                        _truncated = conversation[-ctx_size:]
-                        while _truncated and _truncated[0].get("role") == "tool":
-                            _truncated.pop(0)
+                        # Pair-aware truncation: keep at most the last `ctx_size` messages
+                        # (and, when the model declares a context window, a token budget)
+                        # with assistant.tool_calls ↔ role=tool blocks kept atomic. Naive
+                        # [-ctx_size:] slicing can orphan tool messages at the head, which
+                        # OpenAI rejects: "No tool call found for function call output" (#446).
+                        _token_budget = token_budget_from_context_window(
+                            getattr(effective_llm_model, "context_window_tokens", None)
+                        )
+                        if _token_budget:
+                            _truncated = truncate_by_token_budget(
+                                conversation,
+                                ctx_size,
+                                _token_budget,
+                            )
+                        else:
+                            _truncated = truncate_by_message_count(conversation, ctx_size)
 
                         # Per-(user, agent) onboarding. With no row, prepend the
                         # greeting prompt and mark the pair as "greeted" once it
diff --git a/backend/app/models/llm.py b/backend/app/models/llm.py
index cbe5fddd2..0d791bb8d 100644
--- a/backend/app/models/llm.py
+++ b/backend/app/models/llm.py
@@ -28,6 +28,8 @@ class LLMModel(Base):
     temperature: Mapped[float | None] = mapped_column(Float, nullable=True)
     request_timeout: Mapped[int | None] = mapped_column(Integer, nullable=True)  # Request timeout in seconds, default 120
     max_output_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)  # Per-model output token limit override
+    # Model input context window used for local history fallback.
+    context_window_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
     updated_at: Mapped[datetime] = mapped_column(
         DateTime(timezone=True), server_default=func.now(), onupdate=func.now()
diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py
index 46e0b3023..a5d00218d 100644
--- a/backend/app/schemas/schemas.py
+++ b/backend/app/schemas/schemas.py
@@ -399,6 +399,7 @@ class LLMModelCreate(BaseModel):
     enabled: bool = True
     supports_vision: bool = False
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = Field(None, ge=1)
     request_timeout: int | None = None
 
 class LLMModelUpdate(BaseModel):
@@ -412,6 +413,7 @@ class LLMModelUpdate(BaseModel):
     enabled: bool | None = None
     supports_vision: bool | None = None
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = Field(None, ge=1)
     request_timeout: int | None = None
 
 
@@ -427,6 +429,7 @@ class LLMModelOut(BaseModel):
     enabled: bool
     supports_vision: bool = False
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = None
     request_timeout: int | None = None
     created_at: datetime
 
diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py
new file mode 100644
index 000000000..37acf4a04
--- /dev/null
+++ b/backend/app/services/history_window.py
@@ -0,0 +1,223 @@
+"""Tool-block-safe conversation history truncation.
+
+Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
+``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
+atomic block — never half a pair, never orphan tool messages.
+
+Why: OpenAI Responses API and Chat Completions both reject input where a
+``function_call_output`` / ``role="tool"`` message has no matching
+``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
+``[-N:]`` slicing can leave such orphans at the head when the cut lands
+between an assistant message and its tool results. This is the failure mode
+reported in issue #446.
+
+Tool results must be in the contiguous tool-result run immediately after
+their owning assistant. A tool message inserted elsewhere (from malformed
+persistence or upstream truncation) is dropped, not folded into an adjacent
+block. This makes the helper robust against orphans at any position, not just
+at the slice head.
+
+Incomplete assistant tool-call blocks are also dropped. If an assistant
+declares multiple tool calls, every declared ``tool_call_id`` must have a
+matching ``role="tool"`` result before the next non-tool message. This mirrors
+the API contract enforced by OpenAI-compatible providers and avoids sending
+synthetic/fake tool results into weaker models' context.
+
+Input is expected to be in OpenAI chat-completion format (post-reorganization
+from DB ``role="tool_call"`` rows).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from app.services.token_tracker import estimate_tokens_from_chars
+
+
+TOKEN_BUDGET_CONTEXT_RATIO = 0.8
+
+
+def _assistant_tool_call_ids(message: dict[str, Any]) -> list[str]:
+    """List the usable tool-call ids that an assistant message declares.
+
+    Non-assistant roles and non-list ``tool_calls`` give ``[]``; entries that
+    are not dicts or lack a non-empty string ``id`` are skipped, order kept.
+    """
+    declared = message.get("tool_calls")
+    if message.get("role") != "assistant" or not isinstance(declared, list):
+        return []
+    candidate_ids = (
+        entry.get("id") for entry in declared if isinstance(entry, dict)
+    )
+    return [
+        call_id for call_id in candidate_ids if isinstance(call_id, str) and call_id
+    ]
+
+
+def _safe_history_blocks(messages: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
+    """Group ``messages`` into API-safe blocks, preserving original order.
+
+    A tool block is an assistant with id-bearing ``tool_calls`` plus the run of
+    ``role="tool"`` results directly after it. One missing result drops the
+    whole block; orphan, unmatched and duplicate results are always discarded.
+    """
+    blocks: list[list[dict[str, Any]]] = []
+    i = 0  # index of the message currently being classified
+    n = len(messages)
+
+    while i < n:
+        message = messages[i]
+        role = message.get("role")
+
+        if role == "tool":
+            # A tool result reached outside an assistant's result run: orphaned or
+            # delayed. Without its owning assistant right before it, it is dropped.
+            i += 1
+            continue
+
+        tool_call_ids = _assistant_tool_call_ids(message)
+        if not tool_call_ids:  # not an assistant, or no usable tool-call ids
+            blocks.append([message])  # NOTE(review): an assistant whose tool_calls carry no usable id passes through unchanged — confirm intended
+            i += 1
+            continue
+
+        required = set(tool_call_ids)  # every declared id must be answered
+        seen: set[str] = set()
+        block = [message]
+        j = i + 1  # scan cursor over the tool-result run that follows
+
+        while j < n and messages[j].get("role") == "tool":
+            tool_message = messages[j]
+            tool_call_id = tool_message.get("tool_call_id")
+            if (
+                isinstance(tool_call_id, str)
+                and tool_call_id in required
+                and tool_call_id not in seen
+            ):
+                seen.add(tool_call_id)
+                block.append(tool_message)
+            # Advance past every contiguous tool message, matched or not, so that
+            # unmatched and duplicate results are swallowed with this block and
+            # cannot resurface as orphans on a later iteration of the outer loop.
+            j += 1
+
+        if seen == required:  # complete only if every declared id got a result
+            blocks.append(block)
+        # An incomplete block is dropped entirely: the assistant and whatever
+        # partial results were collected. Broken history is discarded rather
+        # than patched up with synthetic tool results.
+        i = j  # resume after the consumed tool-result run, kept or not
+
+    return blocks
+
+
+def token_budget_from_context_window(
+    context_window_tokens: int | None,
+    ratio: float = TOKEN_BUDGET_CONTEXT_RATIO,
+) -> int | None:
+    """Derive the history token budget: ``ratio`` (clamped to 0..1) of the window, never below 1."""
+    if context_window_tokens and context_window_tokens > 0:
+        bounded_ratio = min(max(ratio, 0.0), 1.0)
+        return max(int(context_window_tokens * bounded_ratio), 1)
+    return None  # no explicit positive window configured, so there is no budget to derive
+
+
+def _estimate_message_tokens(message: dict[str, Any]) -> int:
+    """Estimate a message's tokens from the character length of its compact JSON form."""
+    try:
+        char_count = len(json.dumps(message, ensure_ascii=False, separators=(",", ":")))
+    except (TypeError, ValueError):  # not JSON-serializable: size the str() form instead
+        char_count = len(str(message))
+    return estimate_tokens_from_chars(char_count)
+
+
+def _estimate_block_tokens(block: list[dict[str, Any]]) -> int:  # block total = sum of its per-message estimates
+    return sum(map(_estimate_message_tokens, block))
+
+
+def truncate_by_message_count(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+) -> list[dict[str, Any]]:
+    """Return the newest messages that fit ``max_messages``, never splitting a tool block.
+
+    The history is grouped by ``_safe_history_blocks`` into blocks, each being:
+      - one message that is neither a tool result nor a tool-calling assistant, or
+      - a tool-calling ``assistant`` plus the contiguous ``role="tool"`` results
+        that answer every id it declared.
+
+    Blocks are taken newest-first, each kept whole or not at all. Stray
+    ``role="tool"`` messages and assistants missing a tool result are dropped
+    regardless of budget; sending either to OpenAI causes the #446 class of errors.
+
+    Args:
+        messages: Conversation in OpenAI chat format. May be empty.
+        max_messages: Upper bound on returned entries; ``<= 0`` yields ``[]``.
+            The walk stops at the first block that does not fit, so the
+            result may hold fewer entries than the bound, or none.
+
+    Returns:
+        A new list, in original order, of at most ``max_messages`` entries from
+        the tail of ``messages``. The input list itself is not mutated.
+    """
+    if max_messages <= 0 or not messages:
+        return []
+
+    kept: list[list[dict[str, Any]]] = []
+    remaining = max_messages
+    for block in reversed(_safe_history_blocks(messages)):
+        if len(block) > remaining:
+            # The newest block that no longer fits ends the walk; it is never
+            # split, and nothing older than it is considered.
+            break
+        kept.append(block)
+        remaining -= len(block)
+
+    # Blocks were gathered newest-first; flip them back to chronological order.
+    kept.reverse()
+    return [message for block in kept for message in block]
+
+
+def truncate_by_token_budget(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+    max_tokens: int,
+    *,
+    keep_latest_block: bool = True,
+) -> list[dict[str, Any]]:
+    """Return the newest API-safe blocks that fit both a message cap and a token cap.
+
+    Older blocks are simply discarded once either cap is hit (nothing is
+    summarized) and tool-call blocks are never split. Unless
+    ``keep_latest_block`` is false, the newest block is returned even when it
+    alone is over the caps, so the local estimate cannot cut the current turn.
+    """
+    if max_messages <= 0 or max_tokens <= 0 or not messages:
+        return []
+
+    kept: list[list[dict[str, Any]]] = []
+    messages_left, tokens_left = max_messages, max_tokens
+
+    # Walk newest-first so the most recent turns get first claim on both caps.
+    for block in reversed(_safe_history_blocks(messages)):
+        count = len(block)
+        cost = _estimate_block_tokens(block)
+
+        fits = count <= messages_left and cost <= tokens_left
+        # Only the very first (newest) block may be forced in over the caps.
+        forced = keep_latest_block and not kept
+        if not fits and not forced:
+            # The first block that cannot be taken ends the walk; older blocks
+            # are not considered once a newer one has been left out.
+            break
+
+        kept.append(block)
+        # Floor at zero: a forced block can exceed what was left, while a block
+        # that fits never drives either counter negative.
+        messages_left = max(messages_left - count, 0)
+        tokens_left = max(tokens_left - cost, 0)
+
+    # Blocks were collected newest-first; restore chronological order.
+    kept.reverse()
+    return [message for block in kept for message in block]
diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py
new file mode 100644
index 000000000..711537ec6
--- /dev/null
+++ b/backend/tests/test_history_window.py
@@ -0,0 +1,413 @@
+"""Unit tests for pair-aware conversation history truncation.
+
+Validates that ``truncate_by_message_count`` and ``truncate_by_token_budget``
+keep ``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomic — never producing
+orphan tool messages that would trigger the OpenAI #446 failure mode.
+"""
+
+from app.services.history_window import (
+    token_budget_from_context_window,
+    truncate_by_message_count,
+    truncate_by_token_budget,
+)
+
+
+# ── Helpers ─────────────────────────────────────────────────────────────
+
+
+def _u(text: str) -> dict:  # user-role message
+    return dict(role="user", content=text)
+
+
+def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict:
+    # Assistant message; the "tool_calls" key is only present when calls are given.
+    base = {"role": "assistant", "content": text}
+    extra = {"tool_calls": tool_calls} if tool_calls else {}
+    return {**base, **extra}
+
+
+def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict:  # one function-type tool-call entry
+    return dict(id=call_id, type="function", function=dict(name=name, arguments=args))
+
+
+def _t(call_id: str, content: str = "ok") -> dict:  # tool result answering ``call_id``
+    return dict(role="tool", tool_call_id=call_id, content=content)
+
+
+def _roles(msgs: list[dict]) -> list[str]:  # roles in order; "?" marks a message without one
+    return [entry["role"] if "role" in entry else "?" for entry in msgs]
+
+
+def _content(msgs: list[dict]) -> list[str | None]:  # contents in order; None when absent
+    return [entry["content"] if "content" in entry else None for entry in msgs]
+
+
+# ── Edge cases ──────────────────────────────────────────────────────────
+
+
+def test_empty_input_returns_empty():
+    assert truncate_by_message_count([], 10) == []  # nothing to keep, whatever the budget
+
+
+def test_zero_or_negative_budget_returns_empty():
+    msgs = [_u("hi"), _u("there")]
+    assert truncate_by_message_count(msgs, 0) == []  # a zero budget keeps nothing
+    assert truncate_by_message_count(msgs, -5) == []  # negative budgets behave like zero
+
+
+def test_within_budget_returns_all():
+    msgs = [_u("a"), _a("b"), _u("c")]
+    out = truncate_by_message_count(msgs, 10)  # budget exceeds the 3 messages
+    assert out == msgs
+    assert out is not msgs  # a fresh list is returned, not the caller's own
+
+
+def test_input_not_mutated():  # neither the input list nor its message dicts may change
+    msgs = [_u("a"), _a("b"), _u("c"), _u("d")]
+    snapshot = [dict(m) for m in msgs]  # copy each dict too: list(msgs) would alias them and hide in-place edits
+    truncate_by_message_count(msgs, 2)
+    assert msgs == snapshot
+
+
+# ── Core pair-preservation behavior ─────────────────────────────────────
+
+
+def test_keeps_assistant_tool_pair_intact():
+    """Slicing must not split assistant.tool_calls from its tool result."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 3: the trailing user (1) plus the whole pair (2) fit exactly; u("hi") is cut.
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["assistant", "tool", "user"]
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[1]["tool_call_id"] == "X"
+
+
+def test_drops_pair_entirely_when_budget_too_small():
+    """If budget can't fit the whole pair, drop it — never half."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 2: the final user takes 1, leaving 1 — too little for the pair (needs 2).
+    out = truncate_by_message_count(msgs, 2)
+    # The pair is dropped whole rather than emitting the assistant or the tool
+    # result on its own; only the trailing user message survives.
+    assert _roles(out) == ["user"]
+    assert out[0]["content"] == "done?"
+
+
+def test_drops_orphan_tool_at_head():
+    """A role=tool with no preceding assistant.tool_calls is dropped."""
+    msgs = [
+        _t("X"),  # orphan — no assistant before
+        _u("hi"),
+        _a("ok"),
+    ]
+    out = truncate_by_message_count(msgs, 10)  # ample budget: the drop is not about size
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_drops_orphan_tool_at_head_after_slicing():
+    """Slicing produces an orphan tool at head — must be dropped (the
+    classic #446 failure mode)."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),       # ← naive slice [-3:] would start here as orphan
+        _u("u2"),
+        _a("final"),
+    ]
+    # Budget 3, walking blocks newest-first: a("final") fits (2 left), u("u2")
+    # fits (1 left), then the pair [a+tc(X), t(X)] needs 2 and does not fit, so
+    # the walk stops. t(X) is never emitted without its assistant.
+    out = truncate_by_message_count(msgs, 3)
+    assert "tool" not in _roles(out)
+    # Pin the exact survivors so that a result which is merely tool-free (for
+    # example an empty list) cannot pass.
+    assert _roles(out) == ["user", "assistant"]
+    assert _content(out) == ["u2", "final"]
+
+
+def test_multiple_parallel_tool_calls_in_one_assistant():
+    """Assistant with N tool_calls followed by N tools is one atomic block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 5: u("u2") takes 1, the 4-entry block (a + 3 tools) takes the rest; u("u1") is cut.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
+    # Spot-check both ends of the block: the first declared call and the last result
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[3]["tool_call_id"] == "Z"
+
+
+def test_parallel_tool_pair_dropped_if_too_big():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 3: u("u2") takes 1; the block needs 4 but only 2 remain, so the walk stops. Output [u].
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["user"]
+
+
+def test_incomplete_parallel_tool_block_dropped_even_with_budget():
+    """Assistant declares multiple tool calls; missing any result invalidates
+    the whole block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y")]),
+        _t("X"),  # the result for Y never arrives
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)  # ample budget: dropped for incompleteness, not size
+    assert _roles(out) == ["user", "user"]
+    assert all(m.get("role") != "tool" for m in out)
+    assert all(not m.get("tool_calls") for m in out)
+
+
+def test_delayed_tool_result_after_user_does_not_complete_block():
+    """A tool result must be in the contiguous tool-result run after its
+    assistant. Delayed results are invalid and dropped as orphans."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _u("intervening user"),  # breaks the result run before X is answered
+        _t("X"),  # right id, wrong place: no longer directly after its assistant
+        _a("final"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "user", "assistant"]
+    assert "tool" not in _roles(out)
+    assert all(not m.get("tool_calls") for m in out)
+
+
+def test_duplicate_tool_result_dropped_without_invalidating_complete_block():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X", "first"),
+        _t("X", "duplicate"),  # same id again: ignored, and the block stays valid
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    tool_results = [m for m in out if m.get("role") == "tool"]
+    assert len(tool_results) == 1
+    assert tool_results[0]["content"] == "first"  # the first result wins
+
+
+def test_multiple_pairs_some_drop():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("A")]),
+        _t("A"),
+        _u("u2"),
+        _a(None, tool_calls=[_tc("B")]),
+        _t("B"),
+        _u("u3"),
+    ]
+    # 7 entries, budget 5: u3 (1) + pair B (2) + u2 (1) leave 1, too little for pair A (2). Output: u2, a+B, t(B), u3.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    assert out[1]["tool_calls"][0]["id"] == "B"
+    assert out[2]["tool_call_id"] == "B"
+
+
+def test_no_partial_pair_when_budget_exactly_one_short():
+    """Exactly one short of fitting a pair → drop the pair, don't include
+    just the assistant."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+    ]
+    # Budget 2: the blocks are [u1] and the pair [a+tc(X), t(X)]. Walking
+    # newest-first, the pair (size 2) fits and uses the whole budget, so u1
+    # (size 1) no longer fits and the walk stops. Output: [a+tc, t].
+    out = truncate_by_message_count(msgs, 2)
+    assert _roles(out) == ["assistant", "tool"]
+    # Budget 1: the newest block is the pair (size 2), which does not fit, so
+    # the walk breaks immediately. The older u1 would fit but is never reached,
+    # because blocks are only taken contiguously from the tail. Output: [].
+    out2 = truncate_by_message_count(msgs, 1)
+    assert out2 == []
+
+
+def test_mid_orphan_tool_dropped():
+    """A tool whose tool_call_id has no matching assistant nearby — defensive
+    drop. (Shouldn't happen with current persistence, but be robust.)"""
+    msgs = [
+        _u("u1"),
+        _t("ORPHAN_X"),  # malformed — no preceding assistant.tool_calls
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    # The orphan is removed from the middle; both neighbours are kept
+    assert "tool" not in _roles(out)
+    assert _roles(out) == ["user", "user"]
+
+
+def test_orphan_adjacent_to_valid_pair_still_dropped():
+    """Orphan tool message inserted right after a legitimate tool-call pair
+    must be dropped — adjacency to a valid pair does not legitimize it.
+
+    This is the bug class that triggers OpenAI #446 even when slice cut
+    boundaries would otherwise be safe: any orphan reaching the wire,
+    regardless of position, makes the request invalid."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("VALID")]),
+        _t("VALID", "real result"),
+        _t("ORPHAN_id", "ghost result"),  # no assistant emits ORPHAN_id
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)  # ample budget: nothing is cut for size
+
+    # The orphan must NOT survive — even though it's adjacent to a valid pair
+    orphan_present = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id"
+        for m in out
+    )
+    assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped"
+
+    # The valid pair survives: both its assistant and its tool result are present
+    valid_assistant = any(
+        m.get("role") == "assistant"
+        and m.get("tool_calls")
+        and any(tc["id"] == "VALID" for tc in m["tool_calls"])
+        for m in out
+    )
+    valid_tool = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "VALID"
+        for m in out
+    )
+    assert valid_assistant and valid_tool
+
+
+def test_system_message_treated_as_normal_block():
+    msgs = [
+        {"role": "system", "content": "you are an agent"},
+        _u("hi"),
+        _a("hello"),
+    ]
+    out = truncate_by_message_count(msgs, 2)
+    # Walk from end: a (1) and u (1) use the budget of 2; the system message gets no special protection and is cut.
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_realistic_long_conversation_truncation():
+    """End-to-end: simulate a long chat with many tool-call turns and ensure
+    the output never has orphan tools."""
+    msgs: list[dict] = [_u("start")]
+    for k in range(20):
+        msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")]))
+        msgs.append(_t(f"call_{k}", content=f"result {k}"))
+        msgs.append(_u(f"next {k}"))
+    msgs.append(_a("final answer"))
+
+    # 62 messages in total, truncated to a budget of 30
+    out = truncate_by_message_count(msgs, 30)
+
+    # Sanity: budget respected
+    assert len(out) <= 30
+
+    # Critical invariant: every tool result's id is declared by some assistant in the output (order is not checked here)
+    seen_tool_call_ids: set[str] = set()
+    for m in out:
+        if m.get("role") == "assistant" and m.get("tool_calls"):
+            for tc in m["tool_calls"]:
+                seen_tool_call_ids.add(tc["id"])
+    for m in out:
+        if m.get("role") == "tool":
+            tcid = m.get("tool_call_id")
+            assert tcid in seen_tool_call_ids, (
+                f"Orphan tool {tcid!r} in output without matching assistant.tool_calls"
+            )
+
+
+# ── Token-budget fallback behavior ──────────────────────────────────────
+
+
+def test_token_budget_from_context_window_uses_eighty_percent():
+    assert token_budget_from_context_window(128_000) == 102_400  # 0.8 * 128_000
+    assert token_budget_from_context_window(200_000) == 160_000  # 0.8 * 200_000
+
+
+def test_token_budget_from_context_window_requires_explicit_positive_window():
+    assert token_budget_from_context_window(None) is None  # window not configured
+    assert token_budget_from_context_window(0) is None  # zero is not a usable window
+    assert token_budget_from_context_window(-1) is None  # negative values are rejected too
+
+
+def test_token_budget_truncation_drops_old_large_blocks():
+    msgs = [
+        _u("old-" + ("x" * 900)),  # ~900 characters: meant to exceed what is left of the token budget
+        _a("middle"),
+        _u("latest"),
+    ]
+    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=80)  # NOTE(review): 80 assumes the estimator sizes the two short messages under it and the long one over — confirm against estimate_tokens_from_chars
+    assert _content(out) == ["middle", "latest"]
+
+
+def test_token_budget_truncation_keeps_tool_block_atomic():
+    msgs = [
+        _u("old-" + ("x" * 900)),  # oversized oldest message, expected to be cut
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X", "result"),
+        _u("latest"),
+    ]
+    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=120)  # NOTE(review): 120 assumes the pair plus the latest user fit the estimate — confirm against estimate_tokens_from_chars
+    assert _roles(out) == ["assistant", "tool", "user"]
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[1]["tool_call_id"] == "X"
+
+
+def test_token_budget_truncation_drops_whole_tool_block_if_it_does_not_fit():
+    msgs = [
+        _u("old"),  # small, but never reached: the walk stops at the oversized block after it
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X", "result-" + ("x" * 900)),  # the large result makes the whole block too expensive
+        _u("latest"),
+    ]
+    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=80)
+    assert _roles(out) == ["user"]
+    assert out[0]["content"] == "latest"
+
+
+def test_token_budget_truncation_keeps_latest_block_when_over_budget():
+    msgs = [
+        _u("old"),
+        _u("latest-" + ("x" * 900)),  # the newest message alone is meant to exceed the budget
+    ]
+    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=10)  # keep_latest_block defaults to True
+    assert _roles(out) == ["user"]
+    assert out[0]["content"].startswith("latest-")
+
+
+def test_token_budget_truncation_still_drops_invalid_blocks():
+    msgs = [
+        _u("old"),
+        _a(None, tool_calls=[_tc("X")]),  # never answered within its own result run
+        _u("latest"),
+        _t("X", "delayed orphan"),  # right id, but separated from its assistant
+    ]
+    out = truncate_by_token_budget(msgs, max_messages=10, max_tokens=1000)  # generous caps: drops are about validity only
+    assert _roles(out) == ["user", "user"]
+    assert "tool" not in _roles(out)