53 changes: 43 additions & 10 deletions CLAUDE.md

Large diffs are not rendered by default.

19 changes: 3 additions & 16 deletions THIRD_PARTY_NOTICES.md
@@ -24,8 +24,8 @@ These may be compiled from source and shipped alongside ChaosEngineAI.
 - **Copyright:** Copyright (c) 2023-2026 The ggml authors
 - **Binary:** `llama-server-turbo`, `llama-cli-turbo`
 - **Usage:** Adds turbo2/3/4 KV cache quantisation types used by the
-  RotorQuant and TurboQuant cache strategies. Actively maintained fork
-  with support for recent model architectures (Gemma 4, etc.).
+  TurboQuant cache strategy. Actively maintained fork with support for
+  recent model architectures (Gemma 4, etc.).
 
 > **MIT licence notice (applies to both llama.cpp and the TurboQuant fork):**
 >
@@ -46,18 +46,6 @@ These may be compiled from source and shipped alongside ChaosEngineAI.
 
 ---
 
-## Vendored Packages
-
-### ChaosEngine (PCA-based KV cache compression)
-
-- **Repository:** <https://github.com/cryptopoly/ChaosEngine>
-- **Licence:** Apache 2.0
-- **Submodule:** `vendor/ChaosEngine`
-- **Usage:** Desktop builds may bundle this into the runtime via
-  `npm run stage:runtime`.
-
----
-
 ## Optional Third-Party Cache Strategies
 
 ChaosEngineAI supports optional cache/compression strategy backends.
@@ -66,8 +54,7 @@ If installed by the user, each is subject to its own licence:
 
 | Strategy | Package | Repository | Licence |
 |----------|---------|-----------|---------|
 | TriAttention | `triattention` | <https://github.com/WeianMao/triattention> | See upstream |
-| RotorQuant (marker) | `turboquant` | <https://github.com/back2matching/turboquant> | Apache 2.0 |
-| TurboQuant MLX | `turboquant-mlx` | <https://github.com/sharpner/turboquant-mlx> | MIT |
+| TurboQuant MLX | `turboquant-mlx-full` | <https://github.com/arozanov/turboquant-mlx> | MIT |
 | MegaKernel | — | <https://github.com/Luce-Org/luce-megakernel> | See upstream |
 | TeaCache (diffusion) | vendored patches | <https://github.com/ali-vilab/TeaCache> | Apache 2.0 |

187 changes: 157 additions & 30 deletions backend_service/agent.py
@@ -12,6 +12,7 @@
 
 import json
 import logging
+import re
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -51,40 +52,141 @@ class AgentResult:
     total_completion_tokens: int = 0
 
 
+_TOOL_CALL_OPEN = re.compile(r"<tool_call>\s*", re.IGNORECASE)
+_TOOL_CALL_CLOSE = re.compile(r"\s*</tool_call>", re.IGNORECASE)
+
+
+def _strip_tool_call_xml(text: str) -> str:
+    """Remove every ``<tool_call>...`` blob from a model response.
+
+    FU-040: the chat UI shows ``result.text`` verbatim in the assistant
+    bubble, so when a model emits a ``<tool_call>`` block AND we
+    execute the call (either via the engine's structured field or via
+    ``_parse_tool_calls_from_response``), the user sees the same call
+    twice — once as raw XML noise and once as a ``ToolCallCard``. We
+    strip the XML from the text we hand back to the streaming layer.
+
+    Uses the same ``JSONDecoder.raw_decode`` walk as the parser so we
+    only remove the well-formed-JSON region the parser actually
+    consumed; everything around it (the model's natural-language
+    framing) stays put. A trailing ``</tool_call>`` close tag, when
+    present, is also swallowed.
+    """
+    if not text or "<tool_call>" not in text.lower():
+        return text
+    decoder = json.JSONDecoder()
+    out: list[str] = []
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(text, cursor)
+        if match is None:
+            out.append(text[cursor:])
+            break
+        out.append(text[cursor:match.start()])
+        start = match.end()
+        while start < len(text) and text[start].isspace():
+            start += 1
+        if start >= len(text):
+            break
+        try:
+            _payload, end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            # Malformed JSON after ``<tool_call>`` — drop the opener
+            # alone and continue. The garbage payload stays so the
+            # operator can see what the model emitted.
+            cursor = match.end()
+            continue
+        cursor = end
+        close = _TOOL_CALL_CLOSE.match(text, cursor)
+        if close is not None:
+            cursor = close.end()
+    cleaned = "".join(out)
+    # Collapse the double-blank-line that can appear when we strip a
+    # mid-paragraph tool_call. ``\n\n\n+`` → ``\n\n`` keeps paragraph
+    # breaks intact while removing the visible gap.
+    return re.sub(r"\n{3,}", "\n\n", cleaned).strip()
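
A quick sanity check of the stripper (illustrative only, not part of the diff; the input string is invented and this assumes the module's namespace):

```python
text = (
    "Let me fetch that.\n\n"
    '<tool_call>{"name": "fetch_url", "arguments": {"url": "https://example.com"}}</tool_call>\n\n'
    "Done."
)
# The JSON region and both tags are removed; the natural-language framing survives.
assert _strip_tool_call_xml(text) == "Let me fetch that.\n\nDone."
```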


 def _parse_tool_calls_from_response(response_text: str) -> list[dict[str, Any]] | None:
     """Attempt to extract tool calls from a text response.
 
     Models using the OpenAI tool-calling protocol return structured
     tool_calls in the response object. For models that embed tool calls
-    in their text output (e.g., Hermes/Functionary format), we try to
-    parse them from common patterns.
+    in their text output (e.g. Hermes / NousResearch / Qwen3-Coder-Next),
+    we parse them from the ``<tool_call>...</tool_call>`` XML-ish
+    convention.
+
+    FU-040 (2026-05-10): widened to handle three real-world shapes
+    Coder-Next emitted in a single chat session:
+
+    1. ``<tool_call>{"name": "x", "arguments": {...}}</tool_call>``
+       — the canonical Hermes shape. Always worked.
+    2. ``<tool_call>{"name": "x", "arguments": {...}}`` — no
+       closing tag. The previous regex required ``</tool_call>``
+       and silently dropped these, so the model's tool call
+       rendered as raw XML text in the assistant bubble with no
+       execution.
+    3. ``<tool_call> [ {url: ...}, {url: ...} ]`` — model
+       hallucinated a JSON ARRAY of pseudo-results instead of a
+       call object. Rejected (the array shape has no ``name`` /
+       ``arguments`` keys to dispatch from), but we keep parsing
+       so any well-formed call later in the same message still
+       lands.
+
+    The parser walks each ``<tool_call>`` opener and uses the stdlib
+    ``json.JSONDecoder.raw_decode`` to consume exactly the next valid
+    JSON value (object OR array) — that handles both shapes (1) and
+    (2) without requiring a closing tag, and shape (3) decodes to a
+    list which we discard. ``raw_decode`` also correctly skips nested
+    braces inside argument string values that a naive regex would
+    choke on.
     """
-    # Try the <tool_call> XML-ish format (Hermes/NousResearch)
-    calls: list[dict[str, Any]] = []
-    import re
+    if not response_text or "<tool_call>" not in response_text.lower():
+        return None
 
-    for match in re.finditer(
-        r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
-        response_text,
-        re.DOTALL,
-    ):
+    calls: list[dict[str, Any]] = []
+    decoder = json.JSONDecoder()
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(response_text, cursor)
+        if match is None:
+            break
+        start = match.end()
+        # Find the first non-whitespace character; ``raw_decode`` needs
+        # to start at the JSON token itself, not at preceding spaces.
+        while start < len(response_text) and response_text[start].isspace():
+            start += 1
+        if start >= len(response_text):
+            break
         try:
-            payload = json.loads(match.group(1))
-            name = payload.get("name") or payload.get("function")
-            arguments = payload.get("arguments") or payload.get("parameters") or {}
-            if isinstance(arguments, str):
-                arguments = json.loads(arguments)
-            if name:
-                calls.append({
-                    "id": f"call_{uuid.uuid4().hex[:8]}",
-                    "type": "function",
-                    "function": {
-                        "name": name,
-                        "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
-                    },
-                })
-        except (json.JSONDecodeError, KeyError):
+            payload, end = decoder.raw_decode(response_text, start)
+        except json.JSONDecodeError:
+            cursor = start + 1
             continue
+        cursor = end
+        # Shape (3): the model emitted hallucinated results as a list.
+        # No ``name`` to dispatch from — skip without aborting the
+        # outer loop so a later well-formed call in the same message
+        # still gets picked up.
+        if not isinstance(payload, dict):
+            continue
+        name = payload.get("name") or payload.get("function")
+        if not name:
+            continue
+        arguments = payload.get("arguments") or payload.get("parameters") or {}
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {"raw": arguments}
+        calls.append({
+            "id": f"call_{uuid.uuid4().hex[:8]}",
+            "type": "function",
+            "function": {
+                "name": name,
+                "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
+            },
+        })
 
     return calls if calls else None
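
Illustrative check of shape (2), the no-closing-tag case (invented model output, not part of the diff):

```python
import json

calls = _parse_tool_calls_from_response(
    'On it. <tool_call>{"name": "read_file", "arguments": {"path": "a.py"}}'
)
# One call parsed despite the missing </tool_call>; arguments round-trip as JSON.
assert calls is not None and calls[0]["function"]["name"] == "read_file"
assert json.loads(calls[0]["function"]["arguments"]) == {"path": "a.py"}
```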

@@ -99,8 +201,27 @@ def _execute_tool_call(
     tool_name = func.get("name", "unknown")
     raw_args = func.get("arguments", "{}")
 
+    # FU-039 (2026-05-10): coerce ``arguments`` to a dict at the source.
+    # Models occasionally emit ``{"arguments": null}`` (Coder-Next does
+    # this when the tool call has no parameters) or send a non-string,
+    # non-dict shape we don't recognise. Both routes used to set
+    # ``arguments = None``, which then landed in ``ToolCallResult``,
+    # serialised into the persisted session, and crashed the frontend's
+    # ``ToolCallCard`` at ``Object.entries(null)`` on every subsequent
+    # render. Result: a single bad tool turn permanently bricked the
+    # Chat tab. Defaulting to ``{}`` keeps the contract consumers
+    # already assume — and means the frontend boundary (also added in
+    # FU-039) only fires for genuinely corrupt records, not the common
+    # "no args" path.
     try:
-        arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
+        if raw_args is None:
+            arguments = {}
+        elif isinstance(raw_args, str):
+            arguments = json.loads(raw_args) if raw_args.strip() else {}
+        elif isinstance(raw_args, dict):
+            arguments = raw_args
+        else:
+            arguments = {"raw": raw_args}
     except json.JSONDecodeError:
         arguments = {"raw": raw_args}

@@ -244,9 +365,12 @@ def run_agent_loop(
             tool_calls = _parse_tool_calls_from_response(result.text)
 
         if not tool_calls:
-            # Model is done — return the final text
+            # Model is done — return the final text. Strip any
+            # ``<tool_call>`` XML the parser consumed so the chat
+            # bubble doesn't show raw call JSON next to a rendered
+            # ToolCallCard (FU-040).
             return AgentResult(
-                text=result.text,
+                text=_strip_tool_call_xml(result.text),
                 tool_calls=all_tool_results,
                 iterations=iteration + 1,
                 total_prompt_tokens=total_prompt,
@@ -356,8 +480,11 @@ def run_agent_loop_streaming(
 
         if not tool_calls:
             # Final response — stream it token by token for the user
-            # Since we already have the full text, emit it in chunks
-            text = result.text
+            # Since we already have the full text, emit it in chunks.
+            # Strip any ``<tool_call>`` XML blobs the parser already
+            # consumed so the assistant bubble doesn't show raw call
+            # JSON next to the rendered ToolCallCard (FU-040).
+            text = _strip_tool_call_xml(result.text)
             chunk_size = 4
             for i in range(0, len(text), chunk_size):
                 yield {"token": text[i:i + chunk_size]}
58 changes: 52 additions & 6 deletions backend_service/catalog/text_models.py
@@ -103,7 +103,16 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Agents", "Long context"],
-        "capabilities": ["reasoning", "coding", "tool-use", "vision"],
+        # FU-040 (2026-05-10): dropped ``vision`` from the family-level
+        # capabilities. Qwen3.6-27B (dense, Coder-Next branding) and
+        # Qwen3.6-35B-A3B (MoE) are both text-only — vision lives on a
+        # separate ``Qwen3.6-27B-VL`` variant we do not yet ship. The
+        # stale tag was promoting ``supportsVision: true`` for every
+        # community quant variant, which made ``ChatComposer`` render
+        # the "Attach image" affordance for a model that has no vision
+        # encoder. Add it back here only when an actual VL variant
+        # lands in the catalog.
+        "capabilities": ["reasoning", "coding", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.6-27B",
         "variants": [
             {
@@ -115,8 +124,9 @@
                 "sizeGb": 54.0,
                 "format": "Transformers",
                 "quantization": "BF16",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
-                "note": "Dense 27B Qwen3.6 release with vision and agentic coding tuning. Apache 2.0.",
+                # FU-040: text-only dense variant (Coder-Next branding).
+                "capabilities": ["reasoning", "coding", "tool-use"],
+                "note": "Dense 27B Qwen3.6 release with agentic coding tuning. Apache 2.0.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
                 "backend": "mlx",
@@ -131,7 +141,8 @@
                 "sizeGb": 28.0,
                 "format": "Transformers",
                 "quantization": "FP8",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                # FU-040: text-only dense variant.
+                "capabilities": ["reasoning", "coding", "tool-use"],
                 "note": "FP8 quantization of the 27B dense release for ~30 GB VRAM systems.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
@@ -163,7 +174,8 @@
                 "sizeGb": 15.5,
                 "format": "MLX",
                 "quantization": "4-bit",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                # FU-040: text-only dense variant.
+                "capabilities": ["reasoning", "coding", "tool-use"],
                 "note": "Community MLX 4-bit conversion for Apple Silicon — fastest local launch path.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -239,7 +251,10 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Long context"],
-        "capabilities": ["reasoning", "coding", "tool-use", "vision"],
+        # FU-040: Qwen3.5 dense + MoE variants are text-only. The
+        # ``vision`` tag at family-level was promoting false positives
+        # in ``supportsVision`` for every community quant variant.
+        "capabilities": ["reasoning", "coding", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.5-9B",
         "variants": [
             {
@@ -511,6 +526,37 @@
                 "launchMode": "convert",
                 "backend": "mlx",
             },
+            # FU-041 (2026-05-10): community MLX 4-bit conversion of the
+            # Qwen3-Next architecture (qwen3_next, sparse MoE w/ 512
+            # experts, ~3B active per token, hidden_size=2048). Without
+            # this variant the library matcher in src/utils/library.ts
+            # fuzzy-matched a local ``Qwen3-Coder-Next-MLX-4bit`` install
+            # to the unrelated ``mlx-community/Qwen3.6-27B-4bit`` (dense
+            # 27B Coder, completely different arch — hidden_size=5120,
+            # no MoE), which then surfaced the wrong canonicalRepo into
+            # the runtime snapshot, picked up the wrong capability set,
+            # and routed DFlash lookups to the wrong drafter. Adding the
+            # variant explicitly lets the matcher score 80+ on an exact
+            # repo-path substring hit instead of falling back to the
+            # closest-quant-and-format match.
+            {
+                "id": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                "name": "Qwen3 Coder Next MLX 4-bit",
+                "repo": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                "link": "https://huggingface.co/lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                # 80B total params, ~3B active per token; the on-disk
+                # 4-bit conversion fits ~45 GB.
+                "paramsB": 80.0,
+                "sizeGb": 45.0,
+                "format": "MLX",
+                "quantization": "4-bit",
+                "capabilities": ["coding", "agents", "tool-use", "reasoning", "thinking"],
+                "note": "Community MLX 4-bit conversion of the Qwen3-Next MoE coder for Apple Silicon — fastest local launch path.",
+                "contextWindow": "262K",
+                "launchMode": "direct",
+                "backend": "mlx",
+                "releaseDate": "2026-04",
+            },
         ],
         "readme": [
             "Qwen3 Coder Next is purpose-built for software engineering with function calling and agentic workflows.",
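
A minimal sketch of the substring-first matching behaviour the FU-041 comment relies on. The real logic lives in src/utils/library.ts; the function name and score values below are illustrative assumptions, not that implementation:

```python
def match_score(local_dir: str, catalog_repo: str) -> int:
    # Hypothetical scorer: an exact repo-path substring hit wins outright;
    # anything else would fall back to closest-quant-and-format scoring.
    repo_name = catalog_repo.split("/")[-1].lower()
    return 80 if repo_name in local_dir.lower() else 0

# With the new variant in the catalog, the local install hits the exact entry:
assert match_score(
    "models/Qwen3-Coder-Next-MLX-4bit",
    "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
) == 80
# ...instead of fuzzy-matching to the unrelated dense 27B conversion:
assert match_score(
    "models/Qwen3-Coder-Next-MLX-4bit",
    "mlx-community/Qwen3.6-27B-4bit",
) == 0
```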
2 changes: 0 additions & 2 deletions backend_service/helpers/cache.py
@@ -17,7 +17,6 @@ def _estimate_baseline_tok_s(system_stats: dict[str, Any]) -> float:
 def _strategy_speed_map(strategy: str) -> dict[int, float]:
     """Speed ratio maps by strategy and bit count (fraction of baseline FP16 speed)."""
     maps: dict[str, dict[int, float]] = {
-        "rotorquant": {1: 0.42, 2: 0.50, 3: 0.57, 4: 0.65},
         "triattention": {1: 0.48, 2: 0.56, 3: 0.63, 4: 0.70},
         "turboquant": {1: 0.44, 2: 0.52, 3: 0.60, 4: 0.67},
     }
@@ -27,7 +26,6 @@ def _strategy_quality_base(strategy: str) -> dict[int, float]:
 def _strategy_quality_base(strategy: str) -> dict[int, float]:
     """Base quality percentage by strategy and bit count (before fp16_layers bonus)."""
     maps: dict[str, dict[int, float]] = {
-        "rotorquant": {1: 88.0, 2: 91.0, 3: 93.5, 4: 96.0},
         "triattention": {1: 89.5, 2: 92.0, 3: 94.5, 4: 97.0},
         "turboquant": {1: 87.5, 2: 90.5, 3: 93.0, 4: 95.5},
     }
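
How these maps are consumed, illustratively. This assumes `_strategy_speed_map` returns the per-bit map for the named strategy and that `_estimate_baseline_tok_s` returns the host's FP16 tok/s; both bodies are partially elided in this diff:

```python
speed_map = _strategy_speed_map("turboquant")
baseline_tok_s = 42.0  # stand-in for _estimate_baseline_tok_s(system_stats)
# A 4-bit TurboQuant cache runs at ~67% of FP16 speed:
estimated = baseline_tok_s * speed_map[4]  # 42.0 * 0.67 = 28.14 tok/s
```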
4 changes: 2 additions & 2 deletions backend_service/inference/binaries.py
@@ -93,8 +93,8 @@ def _resolve_llama_server() -> str | None:
 def _resolve_llama_server_turbo() -> str | None:
     """Resolve the TurboQuant fork of llama-server (``llama-server-turbo``).
 
-    This fork supports all standard cache types **plus** iso/planar/turbo
-    cache types required by RotorQuant and TurboQuant strategies.
+    This fork supports all standard cache types **plus** turbo2/3/4
+    cache types required by the TurboQuant strategy.
     """
     override = os.getenv("CHAOSENGINE_LLAMA_SERVER_TURBO")
     if override:
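
Illustrative use of the documented override. The resolver body after `if override:` is elided in this diff; we assume it returns the override path when set, so the path below should point at a real binary:

```python
import os

os.environ["CHAOSENGINE_LLAMA_SERVER_TURBO"] = "/opt/llama-turbo/bin/llama-server-turbo"
binary = _resolve_llama_server_turbo()  # expected: the override path
```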