65 changes: 65 additions & 0 deletions backend/alembic/versions/add_context_window_tokens.py
@@ -0,0 +1,65 @@
"""add agents.context_window_tokens for token-aware history truncation

Revision ID: add_context_window_tokens
Revises: rm_agent_credential_secrets
Create Date: 2026-04-27
"""

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "add_context_window_tokens"
down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
"""Add context_window_tokens with a DDL default of 50000.

The four-step pattern is required because earlier in the migration chain,
``alembic/versions/0000_initial_schema.py`` calls
``Base.metadata.create_all(checkfirst=True)``, which creates ``agents``
from the *current* model state — including any new columns. SQLAlchemy's
Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause,
so the column ends up ``NOT NULL`` with no default, and a naive
``ADD COLUMN IF NOT EXISTS ... DEFAULT 50000`` short-circuits and never
sets the default.

This four-step approach is idempotent regardless of pre-existing state:
- column missing → created (nullable, no default initially)
- column present without default → default set
- any rows with NULL → backfilled to 50000
- column made NOT NULL

Re-runnable: ALTER SET DEFAULT to the same value is a no-op; UPDATE
affecting 0 rows is a no-op; ALTER SET NOT NULL on an already-NOT-NULL
column is a no-op.
"""
# 1. Add the column if missing — do NOT specify NOT NULL or DEFAULT here,
# so existing rows (if any from create_all) aren't blocked.
op.execute(
"ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER"
)
# 2. Ensure the DDL default is set so future inserts that omit this
# column (raw SQL, restored backups, manual migrations) get 50000.
op.execute(
"ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000"
)
# 3. Backfill any rows that were created before the default landed.
op.execute(
"UPDATE agents SET context_window_tokens = 50000 "
"WHERE context_window_tokens IS NULL"
)
# 4. Now safe to enforce NOT NULL.
op.execute(
"ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL"
)


def downgrade() -> None:
# Downgrade omitted — dropping the column would lose per-tenant tuning.
pass
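
Whichever path created the column (the `create_all` bootstrap or this migration), the end state can be confirmed directly. A minimal check, assuming PostgreSQL and a placeholder DSN:

```python
# Hedged sketch: confirm the migration's end state on PostgreSQL.
# The DSN is hypothetical; point it at your own database.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://localhost/appdb")  # hypothetical DSN
with engine.connect() as conn:
    row = conn.execute(text(
        "SELECT column_default, is_nullable "
        "FROM information_schema.columns "
        "WHERE table_name = 'agents' "
        "AND column_name = 'context_window_tokens'"
    )).one()
# Expected after upgrade(): a DDL default of 50000 and NOT NULL.
assert row.column_default is not None and "50000" in row.column_default
assert row.is_nullable == "NO"
```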
20 changes: 16 additions & 4 deletions backend/app/api/feishu.py
@@ -18,6 +18,7 @@
from app.models.identity import IdentityProvider
from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
from app.services.feishu_service import feishu_service
from app.services.history_window import truncate_by_token_budget

router = APIRouter(tags=["feishu"])

@@ -656,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession
)
_pre_sess = _pre_sess_r.scalar_one_or_none()
_history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id
    # Load extra raw material so the app-level token-aware helper has room to choose from
history_result = await db.execute(
select(ChatMessage)
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id)
.order_by(ChatMessage.created_at.desc())
-        .limit(ctx_size)
+        .limit(max(ctx_size, 500))
)
history_msgs = history_result.scalars().all()
history = _build_llm_history_from_chat_messages(list(reversed(history_msgs)))
Expand Down Expand Up @@ -1373,11 +1375,12 @@ async def _handle_feishu_file(
# Load conversation history for LLM context
from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE
    # Load extra raw material so the app-level token-aware helper has room to choose from
_hist_r = await db.execute(
_select(ChatMessage)
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id)
.order_by(ChatMessage.created_at.desc())
-        .limit(ctx_size)
+        .limit(max(ctx_size, 500))
)
_history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all())))

@@ -1631,10 +1634,19 @@ async def _call_agent_llm(

# Build conversation messages (without system prompt — call_llm adds it)
messages: list[dict] = []
-    from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
+    from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS
ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS
if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
+        # Pair-aware truncation: the token budget is the primary bound, with the
+        # message count as a safety cap. Today _normalize_history_messages drops
+        # DB role="tool_call" rows, so this path carries no tool messages and the
+        # pair guard is a no-op; it will start to matter once a Feishu
+        # reorganization helper exists.
+        messages.extend(
+            truncate_by_token_budget(
+                _normalize_history_messages(history), tok_budget, message_cap=ctx_size,
+            )
+        )
messages.append({"role": "user", "content": user_text})

# Use actual user_id so the system prompt knows who it's chatting with
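The helper's implementation is not part of this diff. Inferred from its call sites here and the truncation comments in websocket.py below, a minimal sketch might look like the following; the name `truncate_by_token_budget` and its signature come from the diff, while the chars/4 token estimate and the exact grouping logic are assumptions, not the actual implementation:

```python
def _estimate_tokens(msg: dict) -> int:
    # Crude ~4-chars-per-token heuristic over content plus any tool calls.
    text = str(msg.get("content") or "") + str(msg.get("tool_calls") or "")
    return max(1, len(text) // 4)


def truncate_by_token_budget(
    messages: list[dict], token_budget: int, *, message_cap: int,
) -> list[dict]:
    """Keep the newest messages that fit both bounds without splitting an
    assistant.tool_calls message from its role="tool" results."""
    kept: list[dict] = []
    tokens = 0
    i = len(messages)
    while i > 0:
        # Build the next group, walking backwards: consecutive role="tool"
        # results travel together with the assistant turn that issued them.
        j = i - 1
        while j > 0 and messages[j].get("role") == "tool":
            j -= 1
        group = messages[j:i]
        cost = sum(_estimate_tokens(m) for m in group)
        over_tokens = tokens + cost > token_budget
        over_cap = len(kept) + len(group) > message_cap
        if kept and (over_tokens or over_cap):
            break  # either bound stops the walk; the newest group is always kept
        kept[:0] = group  # prepend, preserving chronological order
        tokens += cost
        i = j
    # Defensive: drop a leading orphan tool result if the walk landed on one.
    while kept and kept[0].get("role") == "tool":
        kept.pop(0)
    return kept
```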
38 changes: 32 additions & 6 deletions backend/app/api/websocket.py
@@ -19,6 +19,7 @@
from app.models.llm import LLMModel
from app.models.user import User
from app.services.chat_session_service import ensure_primary_platform_session
from app.services.history_window import truncate_by_token_budget
from app.services.llm import call_llm, call_llm_with_failover

router = APIRouter(tags=["websocket"])
@@ -213,7 +214,9 @@ async def websocket_chat(
role_description = agent.role_description or ""
welcome_message = agent.welcome_message or ""
ctx_size = agent.context_window_size or 100
logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}")
from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS
tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS
logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok")

# Load the agent's primary model
if agent.primary_model_id:
@@ -299,11 +302,14 @@
logger.info(f"[WS] Selected primary session {conv_id}")

try:
# Load extra raw material so the app-level token-aware helper
# (truncate_by_token_budget below) has room to choose from.
_db_load_cap = max(ctx_size, 500)
history_result = await db.execute(
select(ChatMessage)
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id)
.order_by(ChatMessage.created_at.desc())
-        .limit(ctx_size)
+        .limit(_db_load_cap)
)
history_messages = list(reversed(history_result.scalars().all()))
logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}")
@@ -662,10 +668,30 @@ async def _call_with_failover():
async def _on_failover(reason: str):
await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})

-    # To prevent tool call message pairs(assistant + tool) from being broken down.
-    _truncated = conversation[-ctx_size:]
-    while _truncated and _truncated[0].get("role") == "tool":
-        _truncated.pop(0)
+    # Pair-aware truncation with a token budget plus a message-count
+    # safety cap. Either bound stops the walk; pairs (assistant.tool_calls
+    # ↔ role=tool) are kept atomic. The token budget protects against
+    # one-tool-result-eats-the-window scenarios; the message cap protects
+    # against pathological floods of tiny messages. The pair guard fixes
+    # the orphan-tool failure mode reported in #446.
+    #
+    # The current user message (just appended at line ~416) is excluded
+    # from truncation and re-appended afterwards; otherwise a single huge
+    # input (large paste, base64 image_data) could push past the budget
+    # and cause the helper to drop the very message we're answering.
+    # If the input itself exceeds the model's context, the provider will
+    # surface a clear error rather than this code silently dropping it.
+    _current = (
+        conversation[-1]
+        if conversation and conversation[-1].get("role") == "user"
+        else None
+    )
+    _history = conversation[:-1] if _current is not None else conversation
+    _truncated = truncate_by_token_budget(
+        _history, tok_budget, message_cap=ctx_size,
+    )
+    if _current is not None:
+        _truncated.append(_current)

return await call_llm_with_failover(
primary_model=llm_model,
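A small illustration of the orphan-tool failure mode the pair guard addresses; the message shapes follow the OpenAI-style chat format these handlers already use:

```python
# Hedged illustration of the orphan-tool failure mode (#446).
conversation = [
    {"role": "user", "content": "What's on my calendar today?"},
    {"role": "assistant", "content": None, "tool_calls": [
        {"id": "c1", "type": "function",
         "function": {"name": "list_events", "arguments": "{}"}},
    ]},
    {"role": "tool", "tool_call_id": "c1", "content": "3 events found"},
    {"role": "user", "content": "thanks"},
]

naive = conversation[-2:]  # count-only slice, e.g. ctx_size == 2
# naive[0] is the role="tool" result, but the assistant turn that issued
# call "c1" was sliced away; OpenAI-style APIs reject such orphaned tool
# messages. Pair-aware truncation keeps or drops the pair as one unit.
```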
23 changes: 23 additions & 0 deletions backend/app/models/agent.py
@@ -15,6 +15,11 @@
# (see: https://github.com/dataelement/Clawith/issues/238).
DEFAULT_CONTEXT_WINDOW_SIZE = 100

# Default token budget for in-context history. Conservative for 128K-context
# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent
# override via Agent.context_window_tokens.
DEFAULT_CONTEXT_WINDOW_TOKENS = 50000


class Agent(Base):
"""Digital employee (Agent) instance.
@@ -81,6 +86,24 @@ class Agent(Base):
last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
tokens_used_total: Mapped[int] = mapped_column(Integer, default=0)
context_window_size: Mapped[int] = mapped_column(Integer, default=100)
# Token-aware secondary bound on history sent to the LLM. Truncation uses
# the smaller of context_window_size (message count) and this token budget,
# preserving assistant.tool_calls ↔ role=tool pairs intact.
#
# ``server_default`` matters: alembic/versions/0000_initial_schema.py uses
# ``Base.metadata.create_all`` which reads model state at runtime. Without
# a server_default, fresh-DB bootstrap would create this column NOT NULL
# without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration
# later in the chain would short-circuit, leaving direct-SQL inserts
# broken. ``server_default="50000"`` ensures the DDL has the default
# whether the column was created by create_all or by the explicit
# migration.
context_window_tokens: Mapped[int] = mapped_column(
Integer,
default=DEFAULT_CONTEXT_WINDOW_TOKENS,
server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS),
nullable=False,
)
max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50)

# Trigger limits (per-agent, configurable from Settings UI)
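The `default=` vs `server_default=` distinction the column comment hinges on shows up directly in the DDL SQLAlchemy emits. A minimal standalone sketch (table and column names hypothetical):

```python
# Hedged sketch: a Python-side default= is applied by the ORM at INSERT time
# and never reaches the schema; server_default= becomes a DDL DEFAULT clause.
from sqlalchemy import Column, Integer, MetaData, Table
from sqlalchemy.schema import CreateTable

metadata = MetaData()
demo = Table(
    "demo", metadata,
    Column("orm_only", Integer, default=50000),          # no DDL DEFAULT
    Column("ddl_too", Integer, server_default="50000"),  # DDL DEFAULT '50000'
)
print(CreateTable(demo))
# CREATE TABLE demo (
#     orm_only INTEGER,
#     ddl_too INTEGER DEFAULT '50000'
# )
```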
2 changes: 2 additions & 0 deletions backend/app/schemas/schemas.py
@@ -251,6 +251,7 @@ class AgentOut(BaseModel):
max_tokens_per_day: int | None = None
max_tokens_per_month: int | None = None
context_window_size: int = 100
context_window_tokens: int = 50000
max_tool_rounds: int = 50
max_triggers: int = 20
min_poll_interval_min: int = 5
@@ -286,6 +287,7 @@ class AgentUpdate(BaseModel):
primary_model_id: uuid.UUID | None = None
fallback_model_id: uuid.UUID | None = None
context_window_size: int | None = Field(default=None, ge=1, le=500)
context_window_tokens: int | None = Field(default=None, ge=1000, le=500000)
max_tokens_per_day: int | None = None
max_tokens_per_month: int | None = None
max_tool_rounds: int | None = None
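A quick check of how the new `Field` bounds behave at the API edge, assuming Pydantic v2 validation semantics:

```python
# Hedged sketch of the bound enforcement above (assumes Pydantic v2).
from pydantic import BaseModel, Field, ValidationError

class AgentUpdateDemo(BaseModel):  # hypothetical stand-in for AgentUpdate
    context_window_tokens: int | None = Field(default=None, ge=1000, le=500000)

AgentUpdateDemo(context_window_tokens=50000)    # accepted
try:
    AgentUpdateDemo(context_window_tokens=999)  # rejected: below ge=1000
except ValidationError as exc:
    print(exc.errors()[0]["type"])              # "greater_than_equal"
```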
12 changes: 12 additions & 0 deletions backend/app/services/agent_context.py
@@ -438,6 +438,18 @@ async def build_agent_context(agent_id: uuid.UUID, agent_name: str, role_descrip
3. **NEVER fabricate file contents or tool results from memory.**
Even if you saw a file before, you MUST call the tool again to get current data.

**Handling truncated tool results:** When a tool returns a large payload, the system
may truncate the in-context view and save the full output to your workspace. You will
see a marker like:

`[truncated. Full output (12453 tokens) saved to _tool_results/<call_id>.txt under your
workspace — use the read_file tool to retrieve specific sections]`

When you see this marker, the head excerpt above it is enough for an overview. If you
need the full content (specific search hit, page from a PDF, item N of a list), call
`read_file` with the path shown in the marker. Do NOT fabricate the missing content
from your prior knowledge — the saved file is the ground truth.

4. **Use `write_file` to update memory/memory.md with important information.**

5. **Use `write_file` to update focus.md with your current focus items.**
12 changes: 11 additions & 1 deletion backend/app/services/heartbeat.py
@@ -370,10 +370,20 @@ async def _execute_heartbeat(agent_id: uuid.UUID):
else:
tool_result = await execute_tool(tool_name, args, agent_id, agent_creator_id)

# Spill oversized tool results to disk; keep in-context bounded
_hb_content: str | list = str(tool_result)
if agent_id:
from pathlib import Path as _HbPath
from app.config import get_settings as _hb_get_settings
from app.services.tool_result_truncation import maybe_truncate_tool_result as _hb_trunc
_hb_ws = _HbPath(_hb_get_settings().AGENT_DATA_DIR) / str(agent_id)
_hb_content = _hb_trunc(
_hb_content, call_id=tc["id"], agent_workspace=_hb_ws,
)
llm_messages.append(LLMMessage(
role="tool",
tool_call_id=tc["id"],
-            content=str(tool_result),
+            content=_hb_content,
))
else:
reply = response.content or ""
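`maybe_truncate_tool_result` itself is not shown in this diff. From the heartbeat call site above and the marker wording added to agent_context.py, a rough sketch could look like the following; head size, spill threshold, and the chars/4 token estimate are all assumptions, and list-typed content handling (which the `str | list` annotation at the call site suggests the real helper supports) is omitted:

```python
# Hedged sketch of maybe_truncate_tool_result, inferred from its call site
# and the prompt marker; NOT the actual implementation.
from pathlib import Path

_HEAD_CHARS = 4000    # assumed size of the in-context excerpt
_SPILL_TOKENS = 2000  # assumed threshold before spilling to disk


def maybe_truncate_tool_result(
    content: str, *, call_id: str, agent_workspace: Path,
) -> str:
    """Return content unchanged if small; otherwise write the full output to
    _tool_results/<call_id>.txt and return a head excerpt plus a marker."""
    est_tokens = len(content) // 4  # crude chars-per-token estimate
    if est_tokens <= _SPILL_TOKENS:
        return content
    out_dir = agent_workspace / "_tool_results"
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / f"{call_id}.txt").write_text(content, encoding="utf-8")
    return (
        content[:_HEAD_CHARS]
        + f"\n[truncated. Full output ({est_tokens} tokens) saved to "
        + f"_tool_results/{call_id}.txt under your workspace - use the "
        + "read_file tool to retrieve specific sections]"
    )
```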