-
Notifications
You must be signed in to change notification settings - Fork 574
feat(agent): token-aware history truncation alongside message cap #488
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| """add agents.context_window_tokens for token-aware history truncation | ||
|
|
||
| Revision ID: add_context_window_tokens | ||
| Revises: rm_agent_credential_secrets | ||
| Create Date: 2026-04-27 | ||
| """ | ||
|
|
||
| from typing import Sequence, Union | ||
|
|
||
| from alembic import op | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision: str = "add_context_window_tokens" | ||
| down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets" | ||
| branch_labels: Union[str, Sequence[str], None] = None | ||
| depends_on: Union[str, Sequence[str], None] = None | ||
|
|
||
|
|
||
def upgrade() -> None:
    """Add ``agents.context_window_tokens`` with a DDL default of 50000.

    The four-step pattern is required because earlier in the migration chain,
    ``alembic/versions/0000_initial_schema.py`` calls
    ``Base.metadata.create_all(checkfirst=True)``, which creates ``agents``
    from the *current* model state — including any new columns. SQLAlchemy's
    Python-side ``default=`` does NOT translate to a DDL ``DEFAULT`` clause,
    so the column ends up ``NOT NULL`` with no default, and a naive
    ``ADD COLUMN IF NOT EXISTS ... DEFAULT 50000`` short-circuits and never
    sets the default.

    Idempotent regardless of pre-existing state:
    - column missing → created (nullable, no default initially)
    - column present without default → default set
    - any rows with NULL → backfilled to 50000
    - column made NOT NULL

    Re-runnable: SET DEFAULT to the same value, an UPDATE matching zero
    rows, and SET NOT NULL on an already-NOT-NULL column are all no-ops.
    """
    statements = (
        # 1. Create the column if absent — deliberately nullable and
        #    default-free here so rows already present from create_all
        #    (if any) cannot block the ADD COLUMN.
        "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER",
        # 2. Attach the DDL default so future inserts that omit this column
        #    (raw SQL, restored backups, manual migrations) get 50000.
        "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000",
        # 3. Backfill rows created before the default landed.
        "UPDATE agents SET context_window_tokens = 50000 "
        "WHERE context_window_tokens IS NULL",
        # 4. Only now is it safe to enforce NOT NULL.
        "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL",
    )
    for stmt in statements:
        op.execute(stmt)
|
|
||
|
|
||
def downgrade() -> None:
    """Intentional no-op.

    Dropping ``context_window_tokens`` on downgrade would discard
    per-tenant tuning, so the column is deliberately left in place.
    """
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
| from app.models.llm import LLMModel | ||
| from app.models.user import User | ||
| from app.services.chat_session_service import ensure_primary_platform_session | ||
| from app.services.history_window import truncate_by_token_budget | ||
| from app.services.llm import call_llm, call_llm_with_failover | ||
|
|
||
| router = APIRouter(tags=["websocket"]) | ||
|
|
@@ -213,7 +214,9 @@ async def websocket_chat( | |
| role_description = agent.role_description or "" | ||
| welcome_message = agent.welcome_message or "" | ||
| ctx_size = agent.context_window_size or 100 | ||
| logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}") | ||
| from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS | ||
| tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS | ||
| logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok") | ||
|
|
||
| # Load the agent's primary model | ||
| if agent.primary_model_id: | ||
|
|
@@ -299,11 +302,14 @@ async def websocket_chat( | |
| logger.info(f"[WS] Selected primary session {conv_id}") | ||
|
|
||
| try: | ||
| # Load extra raw material so the app-level token-aware helper | ||
| # (truncate_by_token_budget below) has room to choose from. | ||
| _db_load_cap = max(ctx_size, 500) | ||
| history_result = await db.execute( | ||
| select(ChatMessage) | ||
| .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id) | ||
| .order_by(ChatMessage.created_at.desc()) | ||
| .limit(ctx_size) | ||
| .limit(_db_load_cap) | ||
| ) | ||
| history_messages = list(reversed(history_result.scalars().all())) | ||
| logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}") | ||
|
|
@@ -662,10 +668,30 @@ async def _call_with_failover(): | |
| async def _on_failover(reason: str): | ||
| await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) | ||
|
|
||
| # To prevent tool call message pairs(assistant + tool) from being broken down. | ||
| _truncated = conversation[-ctx_size:] | ||
| while _truncated and _truncated[0].get("role") == "tool": | ||
| _truncated.pop(0) | ||
| # Pair-aware truncation with a token budget plus a message-count | ||
| # safety cap. Either bound stops the walk; pairs (assistant.tool_calls | ||
| # ↔ role=tool) are kept atomic. Token budget protects against | ||
| # one-tool-result-eats-the-window scenarios; message cap protects | ||
| # against pathological tiny-message floods. The pair guard fixes | ||
| # the orphan-tool failure mode reported in #446. | ||
| # | ||
| # The current user message (just appended at line ~416) is excluded | ||
| # from truncation and re-appended after — otherwise a single huge | ||
| # input (large paste, base64 image_data) could push past the budget | ||
| # and cause the helper to drop the very message we're answering. | ||
| # If the input itself exceeds the model's context, the provider will | ||
| # surface a clear error rather than silently dropping it here. | ||
| _current = ( | ||
| conversation[-1] | ||
| if conversation and conversation[-1].get("role") == "user" | ||
| else None | ||
| ) | ||
| _history = conversation[:-1] if _current is not None else conversation | ||
| _truncated = truncate_by_token_budget( | ||
| _history, tok_budget, message_cap=ctx_size, | ||
| ) | ||
|
Comment on lines
+690
to
+692
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This truncates … [review comment truncated in page capture]. Useful? React with 👍 / 👎. |
||
| if _current is not None: | ||
| _truncated.append(_current) | ||
|
|
||
| return await call_llm_with_failover( | ||
| primary_model=llm_model, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more. `websocket_chat` reloads agent/model config on each incoming message, but `tok_budget` is only set once at connection setup, so active sockets keep using the old `context_window_tokens` value for all later turns. This makes the new setting effectively inert until reconnect (for example, lowering the budget during an incident will not constrain prompt size on existing sessions), which defeats the runtime tuning added in this change. Useful? React with 👍 / 👎.
websocket_chatreloads agent/model config on each incoming message, buttok_budgetis only set once at connection setup, so active sockets keep using the oldcontext_window_tokensvalue for all later turns. This makes the new setting effectively inert until reconnect (for example, lowering the budget during an incident will not constrain prompt size on existing sessions), which defeats the runtime tuning added in this change.Useful? React with 👍 / 👎.