53 changes: 43 additions & 10 deletions CLAUDE.md

Large diffs are not rendered by default.

19 changes: 3 additions & 16 deletions THIRD_PARTY_NOTICES.md
@@ -24,8 +24,8 @@ These may be compiled from source and shipped alongside ChaosEngineAI.
 - **Copyright:** Copyright (c) 2023-2026 The ggml authors
 - **Binary:** `llama-server-turbo`, `llama-cli-turbo`
 - **Usage:** Adds turbo2/3/4 KV cache quantisation types used by the
-  RotorQuant and TurboQuant cache strategies. Actively maintained fork
-  with support for recent model architectures (Gemma 4, etc.).
+  TurboQuant cache strategy. Actively maintained fork with support for
+  recent model architectures (Gemma 4, etc.).
 
 > **MIT licence notice (applies to both llama.cpp and the TurboQuant fork):**
 >
@@ -46,18 +46,6 @@ These may be compiled from source and shipped alongside ChaosEngineAI.
 
 ---
 
-## Vendored Packages
-
-### ChaosEngine (PCA-based KV cache compression)
-
-- **Repository:** <https://github.com/cryptopoly/ChaosEngine>
-- **Licence:** Apache 2.0
-- **Submodule:** `vendor/ChaosEngine`
-- **Usage:** Desktop builds may bundle this into the runtime via
-  `npm run stage:runtime`.
-
----
-
 ## Optional Third-Party Cache Strategies
 
 ChaosEngineAI supports optional cache/compression strategy backends.
@@ -66,8 +54,7 @@ If installed by the user, each is subject to its own licence:
 
 | Strategy | Package | Repository | Licence |
 |----------|---------|-----------|---------|
 | TriAttention | `triattention` | <https://github.com/WeianMao/triattention> | See upstream |
-| RotorQuant (marker) | `turboquant` | <https://github.com/back2matching/turboquant> | Apache 2.0 |
-| TurboQuant MLX | `turboquant-mlx` | <https://github.com/sharpner/turboquant-mlx> | MIT |
+| TurboQuant MLX | `turboquant-mlx-full` | <https://github.com/arozanov/turboquant-mlx> | MIT |
 | MegaKernel | — | <https://github.com/Luce-Org/luce-megakernel> | See upstream |
 | TeaCache (diffusion) | vendored patches | <https://github.com/ali-vilab/TeaCache> | Apache 2.0 |

187 changes: 157 additions & 30 deletions backend_service/agent.py
@@ -12,6 +12,7 @@
 
 import json
 import logging
+import re
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -51,40 +52,141 @@ class AgentResult:
     total_completion_tokens: int = 0
 
 
+_TOOL_CALL_OPEN = re.compile(r"<tool_call>\s*", re.IGNORECASE)
+_TOOL_CALL_CLOSE = re.compile(r"\s*</tool_call>", re.IGNORECASE)
+
+
+def _strip_tool_call_xml(text: str) -> str:
+    """Remove every ``<tool_call>...`` blob from a model response.
+
+    FU-040: the chat UI shows ``result.text`` verbatim in the assistant
+    bubble, so when a model emits a ``<tool_call>`` block AND we
+    execute the call (either via the engine's structured field or via
+    ``_parse_tool_calls_from_response``), the user sees the same call
+    twice — once as raw XML noise and once as a ``ToolCallCard``. We
+    strip the XML from the text we hand back to the streaming layer.
+
+    Uses the same ``JSONDecoder.raw_decode`` walk as the parser so we
+    only remove the well-formed-JSON region the parser actually
+    consumed; everything around it (the model's natural-language
+    framing) stays put. A trailing ``</tool_call>`` close tag, when
+    present, is also swallowed.
+    """
+    if not text or "<tool_call>" not in text.lower():
+        return text
+    decoder = json.JSONDecoder()
+    out: list[str] = []
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(text, cursor)
+        if match is None:
+            out.append(text[cursor:])
+            break
+        out.append(text[cursor:match.start()])
+        start = match.end()
+        while start < len(text) and text[start].isspace():
+            start += 1
+        if start >= len(text):
+            break
+        try:
+            _payload, end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            # Malformed JSON after ``<tool_call>`` — drop the opener
+            # alone and continue. The garbage payload stays so the
+            # operator can see what the model emitted.
+            cursor = match.end()
+            continue
+        cursor = end
+        close = _TOOL_CALL_CLOSE.match(text, cursor)
+        if close is not None:
+            cursor = close.end()
+    cleaned = "".join(out)
+    # Collapse the double-blank-line that can appear when we strip a
+    # mid-paragraph tool_call. ``\n\n\n+`` → ``\n\n`` keeps paragraph
+    # breaks intact while removing the visible gap.
+    return re.sub(r"\n{3,}", "\n\n", cleaned).strip()
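
A quick sanity check of the stripper (illustrative only, not part of the diff; the input string is invented and this assumes the module's namespace):

```python
text = (
    "Let me fetch that.\n\n"
    '<tool_call>{"name": "fetch_url", "arguments": {"url": "https://example.com"}}</tool_call>\n\n'
    "Done."
)
# The JSON region and both tags are removed; the natural-language framing survives.
assert _strip_tool_call_xml(text) == "Let me fetch that.\n\nDone."
```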


 def _parse_tool_calls_from_response(response_text: str) -> list[dict[str, Any]] | None:
     """Attempt to extract tool calls from a text response.
 
     Models using the OpenAI tool-calling protocol return structured
     tool_calls in the response object. For models that embed tool calls
-    in their text output (e.g., Hermes/Functionary format), we try to
-    parse them from common patterns.
+    in their text output (e.g. Hermes / NousResearch / Qwen3-Coder-Next),
+    we parse them from the ``<tool_call>...</tool_call>`` XML-ish
+    convention.
+
+    FU-040 (2026-05-10): widened to handle three real-world shapes
+    Coder-Next emitted in a single chat session:
+
+    1. ``<tool_call>{"name": "x", "arguments": {...}}</tool_call>``
+       — the canonical Hermes shape. Always worked.
+    2. ``<tool_call>{"name": "x", "arguments": {...}}`` — no
+       closing tag. The previous regex required ``</tool_call>``
+       and silently dropped these, so the model's tool call
+       rendered as raw XML text in the assistant bubble with no
+       execution.
+    3. ``<tool_call> [ {url: ...}, {url: ...} ]`` — model
+       hallucinated a JSON ARRAY of pseudo-results instead of a
+       call object. Rejected (the array shape has no ``name`` /
+       ``arguments`` keys to dispatch from), but we keep parsing
+       so any well-formed call later in the same message still
+       lands.
+
+    The parser walks each ``<tool_call>`` opener and uses the stdlib
+    ``json.JSONDecoder.raw_decode`` to consume exactly the next valid
+    JSON value (object OR array) — that handles both shapes (1) and
+    (2) without requiring a closing tag, and shape (3) decodes to a
+    list which we discard. ``raw_decode`` also correctly skips nested
+    braces inside argument string values that a naive regex would
+    choke on.
     """
-    # Try the <tool_call> XML-ish format (Hermes/NousResearch)
-    calls: list[dict[str, Any]] = []
-    import re
+    if not response_text or "<tool_call>" not in response_text.lower():
+        return None
 
-    for match in re.finditer(
-        r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
-        response_text,
-        re.DOTALL,
-    ):
+    calls: list[dict[str, Any]] = []
+    decoder = json.JSONDecoder()
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(response_text, cursor)
+        if match is None:
+            break
+        start = match.end()
+        # Find the first non-whitespace character; ``raw_decode`` needs
+        # to start at the JSON token itself, not at preceding spaces.
+        while start < len(response_text) and response_text[start].isspace():
+            start += 1
+        if start >= len(response_text):
+            break
         try:
-            payload = json.loads(match.group(1))
-            name = payload.get("name") or payload.get("function")
-            arguments = payload.get("arguments") or payload.get("parameters") or {}
-            if isinstance(arguments, str):
-                arguments = json.loads(arguments)
-            if name:
-                calls.append({
-                    "id": f"call_{uuid.uuid4().hex[:8]}",
-                    "type": "function",
-                    "function": {
-                        "name": name,
-                        "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
-                    },
-                })
-        except (json.JSONDecodeError, KeyError):
+            payload, end = decoder.raw_decode(response_text, start)
+        except json.JSONDecodeError:
+            cursor = start + 1
             continue
+        cursor = end
+        # Shape (3): the model emitted hallucinated results as a list.
+        # No ``name`` to dispatch from — skip without aborting the
+        # outer loop so a later well-formed call in the same message
+        # still gets picked up.
+        if not isinstance(payload, dict):
+            continue
+        name = payload.get("name") or payload.get("function")
+        if not name:
+            continue
+        arguments = payload.get("arguments") or payload.get("parameters") or {}
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {"raw": arguments}
+        calls.append({
+            "id": f"call_{uuid.uuid4().hex[:8]}",
+            "type": "function",
+            "function": {
+                "name": name,
+                "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
+            },
+        })
 
     return calls if calls else None
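
Illustrative check of shape (2), the no-closing-tag case (invented model output, not part of the diff):

```python
import json

calls = _parse_tool_calls_from_response(
    'On it. <tool_call>{"name": "read_file", "arguments": {"path": "a.py"}}'
)
# One call parsed despite the missing </tool_call>; arguments round-trip as JSON.
assert calls is not None and calls[0]["function"]["name"] == "read_file"
assert json.loads(calls[0]["function"]["arguments"]) == {"path": "a.py"}
```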

@@ -99,8 +201,27 @@ def _execute_tool_call(
     tool_name = func.get("name", "unknown")
     raw_args = func.get("arguments", "{}")
 
+    # FU-039 (2026-05-10): coerce ``arguments`` to a dict at the source.
+    # Models occasionally emit ``{"arguments": null}`` (Coder-Next does
+    # this when the tool call has no parameters) or send a non-string,
+    # non-dict shape we don't recognise. Both routes used to set
+    # ``arguments = None``, which then landed in ``ToolCallResult``,
+    # serialised into the persisted session, and crashed the frontend's
+    # ``ToolCallCard`` at ``Object.entries(null)`` on every subsequent
+    # render. Result: a single bad tool turn permanently bricked the
+    # Chat tab. Defaulting to ``{}`` keeps the contract consumers
+    # already assume — and means the frontend boundary (also added in
+    # FU-039) only fires for genuinely corrupt records, not the common
+    # "no args" path.
     try:
-        arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
+        if raw_args is None:
+            arguments = {}
+        elif isinstance(raw_args, str):
+            arguments = json.loads(raw_args) if raw_args.strip() else {}
+        elif isinstance(raw_args, dict):
+            arguments = raw_args
+        else:
+            arguments = {"raw": raw_args}
     except json.JSONDecodeError:
         arguments = {"raw": raw_args}

@@ -244,9 +365,12 @@ def run_agent_loop(
             tool_calls = _parse_tool_calls_from_response(result.text)
 
         if not tool_calls:
-            # Model is done — return the final text
+            # Model is done — return the final text. Strip any
+            # ``<tool_call>`` XML the parser consumed so the chat
+            # bubble doesn't show raw call JSON next to a rendered
+            # ToolCallCard (FU-040).
             return AgentResult(
-                text=result.text,
+                text=_strip_tool_call_xml(result.text),
                 tool_calls=all_tool_results,
                 iterations=iteration + 1,
                 total_prompt_tokens=total_prompt,
@@ -356,8 +480,11 @@ def run_agent_loop_streaming(
 
         if not tool_calls:
             # Final response — stream it token by token for the user
-            # Since we already have the full text, emit it in chunks
-            text = result.text
+            # Since we already have the full text, emit it in chunks.
+            # Strip any ``<tool_call>`` XML blobs the parser already
+            # consumed so the assistant bubble doesn't show raw call
+            # JSON next to the rendered ToolCallCard (FU-040).
+            text = _strip_tool_call_xml(result.text)
             chunk_size = 4
             for i in range(0, len(text), chunk_size):
                 yield {"token": text[i:i + chunk_size]}
58 changes: 52 additions & 6 deletions backend_service/catalog/text_models.py
@@ -103,7 +103,16 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Agents", "Long context"],
-        "capabilities": ["reasoning", "coding", "tool-use", "vision"],
+        # FU-040 (2026-05-10): dropped ``vision`` from the family-level
+        # capabilities. Qwen3.6-27B (dense, Coder-Next branding) and
+        # Qwen3.6-35B-A3B (MoE) are both text-only — vision lives on a
+        # separate ``Qwen3.6-27B-VL`` variant we do not yet ship. The
+        # stale tag was promoting ``supportsVision: true`` for every
+        # community quant variant, which made ``ChatComposer`` render
+        # the "Attach image" affordance for a model that has no vision
+        # encoder. Add it back here only when an actual VL variant
+        # lands in the catalog.
+        "capabilities": ["reasoning", "coding", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.6-27B",
         "variants": [
             {
@@ -115,8 +124,9 @@
                 "sizeGb": 54.0,
                 "format": "Transformers",
                 "quantization": "BF16",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
-                "note": "Dense 27B Qwen3.6 release with vision and agentic coding tuning. Apache 2.0.",
+                # FU-040: text-only dense variant (Coder-Next branding).
+                "capabilities": ["reasoning", "coding", "tool-use"],
+                "note": "Dense 27B Qwen3.6 release with agentic coding tuning. Apache 2.0.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
                 "backend": "mlx",
@@ -131,7 +141,8 @@
                 "sizeGb": 28.0,
                 "format": "Transformers",
                 "quantization": "FP8",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                # FU-040: text-only dense variant.
+                "capabilities": ["reasoning", "coding", "tool-use"],
                 "note": "FP8 quantization of the 27B dense release for ~30 GB VRAM systems.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
@@ -163,7 +174,8 @@
                 "sizeGb": 15.5,
                 "format": "MLX",
                 "quantization": "4-bit",
-                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                # FU-040: text-only dense variant.
+                "capabilities": ["reasoning", "coding", "tool-use"],
                 "note": "Community MLX 4-bit conversion for Apple Silicon — fastest local launch path.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -239,7 +251,10 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Long context"],
-        "capabilities": ["reasoning", "coding", "tool-use", "vision"],
+        # FU-040: Qwen3.5 dense + MoE variants are text-only. The
+        # ``vision`` tag at family-level was promoting false positives
+        # in ``supportsVision`` for every community quant variant.
+        "capabilities": ["reasoning", "coding", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.5-9B",
         "variants": [
             {
@@ -511,6 +526,37 @@
                 "launchMode": "convert",
                 "backend": "mlx",
             },
+            # FU-041 (2026-05-10): community MLX 4-bit conversion of the
+            # Qwen3-Next architecture (qwen3_next, sparse MoE w/ 512
+            # experts, ~3B active per token, hidden_size=2048). Without
+            # this variant the library matcher in src/utils/library.ts
+            # fuzzy-matched a local ``Qwen3-Coder-Next-MLX-4bit`` install
+            # to the unrelated ``mlx-community/Qwen3.6-27B-4bit`` (dense
+            # 27B Coder, completely different arch — hidden_size=5120,
+            # no MoE), which then surfaced the wrong canonicalRepo into
+            # the runtime snapshot, picked up the wrong capability set,
+            # and routed DFlash lookups to the wrong drafter. Adding the
+            # variant explicitly lets the matcher score 80+ on an exact
+            # repo-path substring hit instead of falling back to the
+            # closest-quant-and-format match.
+            {
+                "id": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                "name": "Qwen3 Coder Next MLX 4-bit",
+                "repo": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                "link": "https://huggingface.co/lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+                # 80B total params, ~3B active per token; the on-disk
+                # 4-bit conversion fits ~45 GB.
+                "paramsB": 80.0,
+                "sizeGb": 45.0,
+                "format": "MLX",
+                "quantization": "4-bit",
+                "capabilities": ["coding", "agents", "tool-use", "reasoning", "thinking"],
+                "note": "Community MLX 4-bit conversion of the Qwen3-Next MoE coder for Apple Silicon — fastest local launch path.",
+                "contextWindow": "262K",
+                "launchMode": "direct",
+                "backend": "mlx",
+                "releaseDate": "2026-04",
+            },
         ],
         "readme": [
             "Qwen3 Coder Next is purpose-built for software engineering with function calling and agentic workflows.",
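
A minimal sketch of the substring-first matching behaviour the FU-041 comment relies on. The real logic lives in src/utils/library.ts; the function name and score values below are illustrative assumptions, not that implementation:

```python
def match_score(local_dir: str, catalog_repo: str) -> int:
    # Hypothetical scorer: an exact repo-path substring hit wins outright;
    # anything else would fall back to closest-quant-and-format scoring.
    repo_name = catalog_repo.split("/")[-1].lower()
    return 80 if repo_name in local_dir.lower() else 0

# With the new variant in the catalog, the local install hits the exact entry:
assert match_score(
    "models/Qwen3-Coder-Next-MLX-4bit",
    "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
) == 80
# ...instead of fuzzy-matching to the unrelated dense 27B conversion:
assert match_score(
    "models/Qwen3-Coder-Next-MLX-4bit",
    "mlx-community/Qwen3.6-27B-4bit",
) == 0
```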
2 changes: 0 additions & 2 deletions backend_service/helpers/cache.py
@@ -17,7 +17,6 @@ def _estimate_baseline_tok_s(system_stats: dict[str, Any]) -> float:
 def _strategy_speed_map(strategy: str) -> dict[int, float]:
     """Speed ratio maps by strategy and bit count (fraction of baseline FP16 speed)."""
     maps: dict[str, dict[int, float]] = {
-        "rotorquant": {1: 0.42, 2: 0.50, 3: 0.57, 4: 0.65},
         "triattention": {1: 0.48, 2: 0.56, 3: 0.63, 4: 0.70},
         "turboquant": {1: 0.44, 2: 0.52, 3: 0.60, 4: 0.67},
     }
@@ -27,7 +26,6 @@ def _strategy_quality_base(strategy: str) -> dict[int, float]:
 def _strategy_quality_base(strategy: str) -> dict[int, float]:
     """Base quality percentage by strategy and bit count (before fp16_layers bonus)."""
     maps: dict[str, dict[int, float]] = {
-        "rotorquant": {1: 88.0, 2: 91.0, 3: 93.5, 4: 96.0},
         "triattention": {1: 89.5, 2: 92.0, 3: 94.5, 4: 97.0},
         "turboquant": {1: 87.5, 2: 90.5, 3: 93.0, 4: 95.5},
     }
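
How these maps are consumed, illustratively. This assumes `_strategy_speed_map` returns the per-bit map for the named strategy and that `_estimate_baseline_tok_s` returns the host's FP16 tok/s; both bodies are partially elided in this diff:

```python
speed_map = _strategy_speed_map("turboquant")
baseline_tok_s = 42.0  # stand-in for _estimate_baseline_tok_s(system_stats)
# A 4-bit TurboQuant cache runs at ~67% of FP16 speed:
estimated = baseline_tok_s * speed_map[4]  # 42.0 * 0.67 = 28.14 tok/s
```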
4 changes: 2 additions & 2 deletions backend_service/inference/binaries.py
@@ -93,8 +93,8 @@ def _resolve_llama_server() -> str | None:
 def _resolve_llama_server_turbo() -> str | None:
     """Resolve the TurboQuant fork of llama-server (``llama-server-turbo``).
 
-    This fork supports all standard cache types **plus** iso/planar/turbo
-    cache types required by RotorQuant and TurboQuant strategies.
+    This fork supports all standard cache types **plus** turbo2/3/4
+    cache types required by the TurboQuant strategy.
     """
     override = os.getenv("CHAOSENGINE_LLAMA_SERVER_TURBO")
     if override:
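
Illustrative use of the documented override. The resolver body after `if override:` is elided in this diff; we assume it returns the override path when set, so the path below should point at a real binary:

```python
import os

os.environ["CHAOSENGINE_LLAMA_SERVER_TURBO"] = "/opt/llama-turbo/bin/llama-server-turbo"
binary = _resolve_llama_server_turbo()  # expected: the override path
```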