diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py
index 00fe32737..ca4cef7c7 100644
--- a/backend/app/api/feishu.py
+++ b/backend/app/api/feishu.py
@@ -18,6 +18,7 @@
 from app.models.identity import IdentityProvider
 from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
 from app.services.feishu_service import feishu_service
+from app.services.history_window import truncate_by_message_count
 
 router = APIRouter(tags=["feishu"])
 
@@ -1634,7 +1635,13 @@
     from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
     ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
     if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
+        # Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool
+        # pairs intact. Today _normalize_history_messages drops DB role="tool_call"
+        # rows, so this path has no tool messages and the helper acts as plain count
+        # truncation; the safety kicks in once a feishu reorganization helper exists.
+        messages.extend(
+            truncate_by_message_count(_normalize_history_messages(history), ctx_size)
+        )
     messages.append({"role": "user", "content": user_text})
 
     # Use actual user_id so the system prompt knows who it's chatting with
diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
index 82b297916..6f17ad1e5 100644
--- a/backend/app/api/websocket.py
+++ b/backend/app/api/websocket.py
@@ -19,6 +19,7 @@
 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
+from app.services.history_window import truncate_by_message_count
 from app.services.llm import call_llm, call_llm_with_failover
 
 router = APIRouter(tags=["websocket"])
@@ -662,10 +663,12 @@ async def _call_with_failover():
         async def _on_failover(reason: str):
             await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
 
-        # To prevent tool call message pairs(assistant + tool) from being broken down.
-        _truncated = conversation[-ctx_size:]
-        while _truncated and _truncated[0].get("role") == "tool":
-            _truncated.pop(0)
+        # Pair-aware truncation: keep the last `ctx_size` messages while
+        # preserving assistant.tool_calls ↔ role=tool blocks atomically.
+        # Naive [-ctx_size:] slicing can leave orphan tool messages at the
+        # head when the cut lands mid-pair, which OpenAI rejects with
+        # "No tool call found for function call output" (issue #446).
+        _truncated = truncate_by_message_count(conversation, ctx_size)
 
         return await call_llm_with_failover(
             primary_model=llm_model,
diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py
new file mode 100644
index 000000000..54c0ce8a6
--- /dev/null
+++ b/backend/app/services/history_window.py
@@ -0,0 +1,166 @@
+"""Pair-aware conversation history truncation.
+
+Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
+``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
+atomic block — never half a pair, never orphan tool messages.
+
+Why: OpenAI Responses API and Chat Completions both reject input where a
+``function_call_output`` / ``role="tool"`` message has no matching
+``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
+``[-N:]`` slicing can leave such orphans at the head when the cut lands
+between an assistant message and its tool results. This is the failure mode
+reported in issue #446.
+
+Orphan detection is by ``tool_call_id`` matching, not by adjacency — a
+tool message inserted between a valid pair and other messages (from
+malformed persistence or upstream truncation) is dropped, not folded
+into an adjacent block. This makes the helper robust against orphans
+at any position, not just at the slice head.
+
+Input is expected to be in OpenAI chat-completion format (post-reorganization
+from DB ``role="tool_call"`` rows).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]:
+    """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has
+    no matching ``assistant.tool_calls`` earlier in the conversation.
+
+    OpenAI rejects the request the moment a ``function_call_output`` is
+    sent without its matching ``function_call``, regardless of whether
+    that tool message is at the head, middle, or end. So orphan detection
+    is by ID matching, not by position.
+    """
+    orphans: set[int] = set()
+    for i, msg in enumerate(messages):
+        if msg.get("role") != "tool":
+            continue
+        tcid = msg.get("tool_call_id")
+        if not tcid:
+            orphans.add(i)
+            continue
+        # Search backward for an assistant whose tool_calls contains this id.
+        # Walks past intervening user / system / other-assistant messages.
+        found = False
+        j = i - 1
+        while j >= 0:
+            m = messages[j]
+            if m.get("role") == "assistant" and m.get("tool_calls"):
+                ids = {tc.get("id") for tc in m["tool_calls"]}
+                if tcid in ids:
+                    found = True
+                    break
+            j -= 1
+        if not found:
+            orphans.add(i)
+    return orphans
+
+
+def truncate_by_message_count(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+) -> list[dict[str, Any]]:
+    """Keep at most ``max_messages`` recent messages, preserving tool-call pairs.
+
+    A "block" is either:
+    - a single non-tool, non-tool-calling message (user / system / assistant text), or
+    - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"``
+      message (identified by ``tool_call_id``, not adjacency).
+
+    Blocks are atomic: included whole or not at all. Orphan ``role="tool"``
+    messages — those whose ``tool_call_id`` has no matching assistant — are
+    silently dropped regardless of budget. Sending them to OpenAI causes the
+    #446 error.
+
+    Args:
+        messages: Conversation list in OpenAI format. Empty list is fine.
+        max_messages: Hard upper bound on the number of returned entries.
+            Values ``<= 0`` return ``[]``.
+
+    Returns:
+        A new list (input is never mutated) of at most ``max_messages`` entries
+        from the tail of ``messages``, with all tool-call pairs intact.
+    """
+    if max_messages <= 0 or not messages:
+        return []
+
+    orphans = _identify_orphans(messages)
+    n = len(messages)
+    consumed: set[int] = set(orphans)  # orphans drop unconditionally
+    blocks: list[set[int]] = []  # tail-to-head order
+
+    for i in range(n - 1, -1, -1):
+        if i in consumed:
+            continue
+        msg = messages[i]
+        role = msg.get("role")
+
+        if role == "tool":
+            # Find this tool's owning assistant by matching tool_call_id
+            tcid = msg.get("tool_call_id")
+            asst_idx = -1
+            j = i - 1
+            while j >= 0:
+                m = messages[j]
+                if m.get("role") == "assistant" and m.get("tool_calls"):
+                    ids = {tc.get("id") for tc in m["tool_calls"]}
+                    if tcid in ids:
+                        asst_idx = j
+                        break
+                j -= 1
+            if asst_idx < 0:
+                # Defensive — orphan detection should have caught this
+                consumed.add(i)
+                continue
+            # Block = assistant + ALL of its matching tool messages (siblings)
+            asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]}
+            block = {asst_idx}
+            for k in range(asst_idx + 1, n):
+                if k in consumed:
+                    continue
+                m = messages[k]
+                if (
+                    m.get("role") == "tool"
+                    and m.get("tool_call_id") in asst_tc_ids
+                ):
+                    block.add(k)
+            consumed |= block
+            blocks.append(block)
+        elif role == "assistant" and msg.get("tool_calls"):
+            # Encountered the assistant before any of its tools (e.g. tools
+            # were truncated upstream or are still in flight). Group with
+            # whatever matching tools follow it.
+            asst_tc_ids = {tc.get("id") for tc in msg["tool_calls"]}
+            block = {i}
+            for k in range(i + 1, n):
+                if k in consumed:
+                    continue
+                m = messages[k]
+                if (
+                    m.get("role") == "tool"
+                    and m.get("tool_call_id") in asst_tc_ids
+                ):
+                    block.add(k)
+            consumed |= block
+            blocks.append(block)
+        else:
+            consumed.add(i)
+            blocks.append({i})
+
+    # Walk blocks tail-to-head, taking until budget exhausted.
+    keep: set[int] = set()
+    budget = max_messages
+    for block in blocks:
+        size = len(block)
+        if size <= budget:
+            keep |= block
+            budget -= size
+        else:
+            # Block doesn't fit — stop. Do NOT partial-include (would split pair).
+            break
+
+    return [messages[k] for k in sorted(keep)]
diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py
new file mode 100644
index 000000000..934216758
--- /dev/null
+++ b/backend/tests/test_history_window.py
@@ -0,0 +1,288 @@
+"""Unit tests for pair-aware conversation history truncation.
+
+Validates that ``truncate_by_message_count`` preserves
+``assistant.tool_calls`` ↔ ``role="tool"`` blocks atomically — never produces
+orphan tool messages that would trigger the OpenAI rejection from issue #446.
+"""
+
+from app.services.history_window import truncate_by_message_count
+
+
+# ── Helpers ─────────────────────────────────────────────────────────────
+
+
+def _u(text: str) -> dict:
+    return {"role": "user", "content": text}
+
+
+def _a(text: str | None = None, tool_calls: list[dict] | None = None) -> dict:
+    msg: dict = {"role": "assistant", "content": text}
+    if tool_calls:
+        msg["tool_calls"] = tool_calls
+    return msg
+
+
+def _tc(call_id: str, name: str = "noop", args: str = "{}") -> dict:
+    return {"id": call_id, "type": "function", "function": {"name": name, "arguments": args}}
+
+
+def _t(call_id: str, content: str = "ok") -> dict:
+    return {"role": "tool", "tool_call_id": call_id, "content": content}
+
+
+def _roles(msgs: list[dict]) -> list[str]:
+    return [m.get("role", "?") for m in msgs]
+
+
+# ── Edge cases ──────────────────────────────────────────────────────────
+
+
+def test_empty_input_returns_empty():
+    assert truncate_by_message_count([], 10) == []
+
+
+def test_zero_or_negative_budget_returns_empty():
+    msgs = [_u("hi"), _u("there")]
+    assert truncate_by_message_count(msgs, 0) == []
+    assert truncate_by_message_count(msgs, -5) == []
+
+
+def test_within_budget_returns_all():
+    msgs = [_u("a"), _a("b"), _u("c")]
+    out = truncate_by_message_count(msgs, 10)
+    assert out == msgs
+    assert out is not msgs  # new list
+
+
+def test_input_not_mutated():
+    msgs = [_u("a"), _a("b"), _u("c"), _u("d")]
+    snapshot = list(msgs)
+    truncate_by_message_count(msgs, 2)
+    assert msgs == snapshot
+
+
+# ── Core pair-preservation behavior ─────────────────────────────────────
+
+
+def test_keeps_assistant_tool_pair_intact():
+    """Slicing must not split assistant.tool_calls from its tool result."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 3 — would naively keep [a+tc(X), t(X), u("done?")], that's clean
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["assistant", "tool", "user"]
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[1]["tool_call_id"] == "X"
+
+
+def test_drops_pair_entirely_when_budget_too_small():
+    """If budget can't fit the whole pair, drop it — never half."""
+    msgs = [
+        _u("hi"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+        _u("done?"),
+    ]
+    # Budget 2 — can't fit pair (needs 2) + final user, must drop pair
+    out = truncate_by_message_count(msgs, 2)
+    # Only the trailing user fits as a single block; pair (size 2) doesn't fit
+    # in remaining budget=1 after taking user.
+    assert _roles(out) == ["user"]
+    assert out[0]["content"] == "done?"
+
+
+def test_drops_orphan_tool_at_head():
+    """A role=tool with no preceding assistant.tool_calls is dropped."""
+    msgs = [
+        _t("X"),  # orphan — no assistant before
+        _u("hi"),
+        _a("ok"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_drops_orphan_tool_at_head_after_slicing():
+    """Slicing produces an orphan tool at head — must be dropped (the
+    classic #446 failure mode)."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),  # ← naive slice [-3:] would start here as orphan
+        _u("u2"),
+        _a("final"),
+    ]
+    # Budget 3: take from the end — _a("final") block (1), _u("u2") block (1).
+    # The (a+tc, t) pair is a single block of size 2 and doesn't fit the
+    # remaining budget of 1, so the walk stops. Result: [u("u2"), a("final")].
+    out = truncate_by_message_count(msgs, 3)
+    assert "tool" not in _roles(out)
+    # No orphan tool_call_id reaches output
+    for m in out:
+        if m.get("role") == "tool":
+            raise AssertionError(f"Orphan tool leaked: {m}")
+
+
+def test_multiple_parallel_tool_calls_in_one_assistant():
+    """Assistant with N tool_calls followed by N tools is one atomic block."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 5: take u("u2"), then the 4-entry block (a + 3 tools). budget=5-1-4=0
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["assistant", "tool", "tool", "tool", "user"]
+    # Verify the pair came through whole
+    assert out[0]["tool_calls"][0]["id"] == "X"
+    assert out[3]["tool_call_id"] == "Z"
+
+
+def test_parallel_tool_pair_dropped_if_too_big():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X"), _tc("Y"), _tc("Z")]),
+        _t("X"),
+        _t("Y"),
+        _t("Z"),
+        _u("u2"),
+    ]
+    # Budget 3: take u("u2"). Pair size 4, doesn't fit budget 2. Stop. Output [u].
+    out = truncate_by_message_count(msgs, 3)
+    assert _roles(out) == ["user"]
+
+
+def test_multiple_pairs_some_drop():
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("A")]),
+        _t("A"),
+        _u("u2"),
+        _a(None, tool_calls=[_tc("B")]),
+        _t("B"),
+        _u("u3"),
+    ]
+    # 7 entries. Budget 5: take u("u3") (1), pair B (2) → budget=2, take u("u2") (1) → budget=1, pair A (2) doesn't fit. Output: u2, a+B, t(B), u3.
+    out = truncate_by_message_count(msgs, 5)
+    assert _roles(out) == ["user", "assistant", "tool", "user"]
+    assert out[1]["tool_calls"][0]["id"] == "B"
+    assert out[2]["tool_call_id"] == "B"
+
+
+def test_no_partial_pair_when_budget_exactly_one_short():
+    """Exactly one short of fitting a pair → drop the pair, don't include
+    just the assistant."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("X")]),
+        _t("X"),
+    ]
+    # Budget 2: walking from the end, t(X) resolves to its assistant → pair
+    # block {1, 2} of size 2, which fits exactly (budget → 0). The walk then
+    # stops, so u is dropped. Output: [a+tc, t].
+    out = truncate_by_message_count(msgs, 2)
+    assert _roles(out) == ["assistant", "tool"]
+    # With budget 1 the pair (size 2) doesn't fit. Although u alone would fit,
+    # the walk breaks at the first non-fitting block to keep the kept window
+    # contiguous from the tail. Output: [].
+    out2 = truncate_by_message_count(msgs, 1)
+    assert out2 == []
+
+
+def test_mid_orphan_tool_dropped():
+    """A tool whose tool_call_id has no matching assistant nearby — defensive
+    drop. (Shouldn't happen with current persistence, but be robust.)"""
+    msgs = [
+        _u("u1"),
+        _t("ORPHAN_X"),  # malformed — no preceding assistant.tool_calls
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+    # Orphan dropped
+    assert "tool" not in _roles(out)
+    assert _roles(out) == ["user", "user"]
+
+
+def test_orphan_adjacent_to_valid_pair_still_dropped():
+    """Orphan tool message inserted right after a legitimate tool-call pair
+    must be dropped — adjacency to a valid pair does not legitimize it.
+
+    This is the bug class behind issue #446, and it bites even when slice
+    cut boundaries would otherwise be safe: any orphan reaching the wire,
+    regardless of position, makes the request invalid."""
+    msgs = [
+        _u("u1"),
+        _a(None, tool_calls=[_tc("VALID")]),
+        _t("VALID", "real result"),
+        _t("ORPHAN_id", "ghost result"),  # no assistant emits ORPHAN_id
+        _u("u2"),
+    ]
+    out = truncate_by_message_count(msgs, 10)
+
+    # The orphan must NOT survive — even though it's adjacent to a valid pair
+    orphan_present = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "ORPHAN_id"
+        for m in out
+    )
+    assert not orphan_present, "Orphan tool adjacent to valid pair must be dropped"
+
+    # The valid pair survives intact
+    valid_assistant = any(
+        m.get("role") == "assistant"
+        and m.get("tool_calls")
+        and any(tc["id"] == "VALID" for tc in m["tool_calls"])
+        for m in out
+    )
+    valid_tool = any(
+        m.get("role") == "tool" and m.get("tool_call_id") == "VALID"
+        for m in out
+    )
+    assert valid_assistant and valid_tool
+
+
+def test_system_message_treated_as_normal_block():
+    msgs = [
+        {"role": "system", "content": "you are an agent"},
+        _u("hi"),
+        _a("hello"),
+    ]
+    out = truncate_by_message_count(msgs, 2)
+    # Walk from end: a (size 1), u (size 1). budget 2: take both. system dropped.
+    assert _roles(out) == ["user", "assistant"]
+
+
+def test_realistic_long_conversation_truncation():
+    """End-to-end: simulate a long chat with many tool-call turns and ensure
+    the output never has orphan tools."""
+    msgs: list[dict] = [_u("start")]
+    for k in range(20):
+        msgs.append(_a(None, tool_calls=[_tc(f"call_{k}")]))
+        msgs.append(_t(f"call_{k}", content=f"result {k}"))
+        msgs.append(_u(f"next {k}"))
+    msgs.append(_a("final answer"))
+
+    # Truncate to 30 messages
+    out = truncate_by_message_count(msgs, 30)
+
+    # Sanity: budget respected
+    assert len(out) <= 30
+
+    # Critical invariant: no orphan tool messages anywhere
+    seen_tool_call_ids: set[str] = set()
+    for m in out:
+        if m.get("role") == "assistant" and m.get("tool_calls"):
+            for tc in m["tool_calls"]:
+                seen_tool_call_ids.add(tc["id"])
+    for m in out:
+        if m.get("role") == "tool":
+            tcid = m.get("tool_call_id")
+            assert tcid in seen_tool_call_ids, (
+                f"Orphan tool {tcid!r} in output without matching assistant.tool_calls"
+            )