From 0589505bcbc93fc66aef248f26746405f457ab53 Mon Sep 17 00:00:00 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Sun, 10 May 2026 14:47:02 +0100
Subject: [PATCH 01/13] FU-030: drop ChaosEngine + RotorQuant strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both slots added zero value over TurboQuant after the May 2026 landscape
review.

ChaosEngine (cryptopoly/ChaosEngine, 1 commit upstream) was eclipsed by
NVIDIA's KVTC at ICLR 2026: the same PCA + adaptive-quantization
approach, but peer-reviewed, with a healthy upstream, and 8–32x
compression vs ChaosEngine's 3.7x. The KVTC slot lands separately in
FU-029.

RotorQuant shipped as a misleading alias for TurboQuant: same
``--cache-type-k turbo{N}`` flags, same ``turboquant`` Python module
marker. The real scrya-com RotorQuant uses Clifford Cl(3,0) rotors with
its own kernel path, which we never wired up.

Persisted user configs that still reference these ids coerce silently to
``turboquant`` via a new ``CacheStrategyRegistry.resolve_legacy_id``
helper + module-level ``_LEGACY_STRATEGY_ALIASES`` map. The frontend
mirrors the coercion via ``LEGACY_STRATEGY_ALIASES`` +
``canonicalStrategyId`` in runtimeSupport.ts so chip filters and
incompat-reason banners keep working for older session snapshots.

The llama.cpp fallback chain shrank from 3-level (requested →
ChaosEngine → native) to 2-level (requested → native): the ChaosEngine
intermediate only ever emitted standard q-type cache flags that native
already covers.

Vendored ChaosEngine bundling stripped from scripts/stage-runtime.mjs
(3 helper functions removed: stageVendoredChaosEngine,
ensureSetuptoolsForPep639, resolveChaosEngineVendor). The pre-build
probe now asserts the legacy-id coercion in CI rather than at runtime.
``[rotorquant]`` extra removed from pyproject.toml.
``CHAOSENGINE_VENDOR_PATH`` env var dropped.

Test coverage: 1293 pytest pass, 341 vitest pass, tsc --noEmit clean. A
migration test in tests/test_cache_strategies.py asserts both legacy ids
coerce and resolve to the TurboQuant strategy via registry.get(). A new
fixture entry in tests/inference-batch-strategies.json exercises the
coercion end-to-end through the inference test runner.
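For reviewers skimming without the full diff, the coercion path in
condensed form — a minimal sketch of the registry pieces this patch
adds. Discovery and the real TurboQuant adapter are elided; the
registered stand-in object below is illustrative only:

    # Condensed from cache_compression/__init__.py in this patch.
    _LEGACY_STRATEGY_ALIASES: dict[str, str] = {
        "chaosengine": "turboquant",
        "rotorquant": "turboquant",
    }

    class CacheStrategyRegistry:
        def __init__(self) -> None:
            self._strategies: dict[str, object] = {}

        def register(self, strategy_id: str, strategy: object) -> None:
            self._strategies[strategy_id] = strategy

        def resolve_legacy_id(self, strategy_id: str) -> str:
            # Pure string mapping: unknown ids pass through unchanged,
            # so this is safe to call before discovery runs.
            return _LEGACY_STRATEGY_ALIASES.get(strategy_id, strategy_id)

        def get(self, strategy_id: str) -> object | None:
            # Canonicalise first, so persisted "chaosengine" /
            # "rotorquant" configs land on the TurboQuant strategy.
            return self._strategies.get(self.resolve_legacy_id(strategy_id))

    registry = CacheStrategyRegistry()
    registry.register("turboquant", object())  # stand-in for the adapter
    assert registry.resolve_legacy_id("rotorquant") == "turboquant"
    assert registry.get("chaosengine") is registry.get("turboquant")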
--- CLAUDE.md | 18 +-- THIRD_PARTY_NOTICES.md | 19 +-- backend_service/helpers/cache.py | 2 - backend_service/inference/binaries.py | 4 +- backend_service/inference/llama_cpp_engine.py | 22 ++- backend_service/routes/setup/__init__.py | 17 +-- backend_service/state/metrics.py | 2 +- cache_compression/__init__.py | 51 ++++--- cache_compression/chaosengine.py | 130 ----------------- cache_compression/rotorquant.py | 111 --------------- pyproject.toml | 1 - scripts/pre-build-check.mjs | 21 +-- scripts/stage-runtime.mjs | 121 +--------------- src/App.tsx | 1 - .../__tests__/kvStrategyFilter.test.ts | 23 ++- .../__tests__/runtimeSupport.test.ts | 16 ++- src/components/runtimeSupport.ts | 24 +++- .../chat/__tests__/kvStrategyOverride.test.ts | 6 +- src/features/models/MyModelsTab.tsx | 7 - src/hooks/useSettings.ts | 22 +-- tests/inference-batch-dflash-comparison.json | 4 +- tests/inference-batch-strategies.json | 9 +- tests/test_backend_service.py | 30 ++-- tests/test_cache_strategies.py | 134 +++++------------- tests/test_inference.py | 70 +++------ tests/test_setup_routes.py | 8 +- tests/test_teacache.py | 6 +- 27 files changed, 207 insertions(+), 672 deletions(-) delete mode 100644 cache_compression/chaosengine.py delete mode 100644 cache_compression/rotorquant.py diff --git a/CLAUDE.md b/CLAUDE.md index d56e751..7bfc078 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ ChaosEngineAI is a desktop AI inference app built with: | `backend_service/mlx_worker*.py` | MLX subprocess worker — `mlx_worker.py` orchestrator + `mlx_worker_{request,prompt,io,diagnostics,multimodal,cache,eval,loader}.py` siblings | | `backend_service/routes/` | API endpoints (14 route modules) | | `backend_service/helpers/` | System stats, settings, persistence, cache estimation | -| `cache_compression/` | Cache strategy registry + adapters (native, rotorquant, turboquant, chaosengine, triattention). Renamed from `compression/` so it doesn't shadow Python 3.14's stdlib `compression` namespace package. | +| `cache_compression/` | Cache strategy registry + adapters (native, turboquant, triattention, plus diffusion-only fbcache/teacache/taylorseer/magcache/pab/fastercache). Renamed from `compression/` so it doesn't shadow Python 3.14's stdlib `compression` namespace package. Legacy ids `chaosengine` and `rotorquant` were dropped in FU-030 and now coerce to `turboquant` via `registry.resolve_legacy_id`. | | `dflash/` | DFlash speculative decoding — draft model registry + availability detection | | `scripts/` | Build, install, and update scripts | | `tests/` | Python tests (pytest) | @@ -54,8 +54,8 @@ ChaosEngineAI is a desktop AI inference app built with: ### Binary Routing The app supports two llama-server binaries: -- **`llama-server`** (standard, Homebrew) — for native and ChaosEngine cache strategies -- **`llama-server-turbo`** (TurboQuant fork) — for RotorQuant and TurboQuant strategies, installed to `~/.chaosengine/bin/` +- **`llama-server`** (standard, Homebrew) — for the native cache strategy +- **`llama-server-turbo`** (TurboQuant fork) — for the TurboQuant strategy, installed to `~/.chaosengine/bin/` Each `CacheStrategy` declares `required_llama_binary()` → `"standard"` or `"turbo"`. The `LlamaCppEngine._select_llama_binary()` method in `inference/llama_cpp_engine.py` routes to the correct binary. Cache types are pre-validated against the binary's `--help` output before startup. 
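A hedged sketch of that routing contract, assuming simplified internals
(the real `_select_llama_binary()` also pre-validates cache types
against `--help`; the function shape and paths below are illustrative,
not the actual engine code):

    # Sketch only: mirrors the routing described above.
    class CacheStrategy:
        def required_llama_binary(self) -> str:
            return "standard"  # native and standard q-type strategies

    class TurboQuantStrategy(CacheStrategy):
        def required_llama_binary(self) -> str:
            return "turbo"  # needs the fork's turbo2/3/4 cache types

    def select_llama_binary(strategy: CacheStrategy) -> str:
        # Turbo strategies route to the TurboQuant fork installed under
        # ~/.chaosengine/bin/; everything else uses the standard binary.
        if strategy.required_llama_binary() == "turbo":
            return "~/.chaosengine/bin/llama-server-turbo"
        return "llama-server"

    assert select_llama_binary(TurboQuantStrategy()).endswith("-turbo")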
@@ -82,16 +82,14 @@ Check for updates to external repos we build from or depend on: |-----------|------|--------|---------------| | llama.cpp (standard) | `ggml-org/llama.cpp` | `master` | `git -C ../llama.cpp fetch && git -C ../llama.cpp log HEAD..origin/master --oneline` | | llama-server-turbo | `TheTom/llama-cpp-turboquant` | `feature/turboquant-kv-cache` | `git ls-remote https://github.com/TheTom/llama-cpp-turboquant.git refs/heads/feature/turboquant-kv-cache` | -| ChaosEngine | `cryptopoly/ChaosEngine` | `main` | `git -C vendor/ChaosEngine fetch && git -C vendor/ChaosEngine log HEAD..origin/main --oneline` | | dflash-mlx | `bstnxbt/dflash-mlx` | `main` pinned to commit `f825ffb2` (upstream deleted all tags April 2026) | `git ls-remote https://github.com/bstnxbt/dflash-mlx.git refs/heads/main` | -| turboquant | `back2matching/turboquant` | — | `.venv/bin/pip index versions turboquant 2>/dev/null` | -| turboquant-mlx | `arozanov/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx 2>/dev/null` | | turboquant-mlx-full | `manjunathshiva/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx-full 2>/dev/null` | | DDTree (ported algorithm) | `liranringel/ddtree` | `main` | `git ls-remote https://github.com/liranringel/ddtree.git HEAD` | ### 4. Cache Strategy Health -- [ ] ChaosEngine `llama_cpp_cache_flags()` only emits standard types: `f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1` -- [ ] RotorQuant/TurboQuant strategies return `required_llama_binary() == "turbo"` +- [ ] Native strategy `llama_cpp_cache_flags()` only emits standard types: `f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1` +- [ ] TurboQuant strategy returns `required_llama_binary() == "turbo"` +- [ ] Legacy `chaosengine` + `rotorquant` ids coerce to `turboquant` via `registry.resolve_legacy_id` - [ ] DFlash `_COMMUNITY_PREFIXES` includes all common model repo prefixes - [ ] New model families added to `DRAFT_MODEL_MAP` if draft checkpoints exist @@ -137,6 +135,9 @@ no longer relevant. | ~~FU-025~~ | ~~mlx-video Wan one-shot convert action~~ | **Fully shipped 2026-05-04 (Phase 7 + Phase 8 + Phase 9).** | Closes FU-009 Wan branch. **Phase 7 (foundation):** `[mlx-video]` extra in [pyproject.toml](pyproject.toml) flipped to ``git+https://github.com/Blaizzy/mlx-video.git``. Helper [backend_service/mlx_video_wan_convert.py](backend_service/mlx_video_wan_convert.py) wraps the upstream `python -m mlx_video.models.wan_2.convert` subprocess: `slug_for(repo)` / `output_dir_for(repo)` / `status_for(repo)` / `list_converted()` / `run_convert(checkpoint_dir, repo, dtype, quantize, bits, group_size, timeout)`. Output under ``~/.chaosengine/mlx-video-wan//`` (override via ``CHAOSENGINE_MLX_VIDEO_WAN_DIR``). **Phase 8 (routing):** [mlx_video_runtime.py](backend_service/mlx_video_runtime.py) `supported_repos()` returns dynamic union of LTX-2 + converted-on-disk Wan repos. `_REPO_ENTRY_POINTS` adds `"Wan-AI/": "mlx_video.models.wan_2.generate"`. `_build_wan_cmd` produces the Wan-shaped CLI (`--model-dir`, `--guide-scale` string, `--scheduler`, optional `--seed`/`--steps`/`--negative-prompt`; no LTX-2 flags). `generate()` picks `_wan_runtime_note` (flags MoE experts) and skips LTX-2 effective-step / effective-guidance overrides. **Phase 9 (GUI):** Orchestrator [backend_service/mlx_video_wan_installer.py](backend_service/mlx_video_wan_installer.py) drives preflight → download-raw → convert → verify with structured progress events. 
Setup endpoints in [routes/setup.py](backend_service/routes/setup.py): `POST /api/setup/install-mlx-video-wan` (background-job pattern mirroring `/api/setup/install-longlive`), `GET /api/setup/install-mlx-video-wan/status`, `GET /api/setup/mlx-video-wan/inventory`. Frontend client in [src/api.ts](src/api.ts) (`startWanInstall`, `getWanInstallStatus`, `getWanInventory`). UI panel [src/components/WanInstallPanel.tsx](src/components/WanInstallPanel.tsx) lists every supported Wan repo with raw-size hint + converted badge / install button + live `InstallLogPanel` underneath; rendered in [VideoDiscoverTab.tsx](src/features/video/VideoDiscoverTab.tsx) above the variant grid. Supported raw repos: `Wan-AI/Wan2.{1-T2V-1.3B,1-T2V-14B,2-TI2V-5B,2-T2V-A14B,2-I2V-A14B}`. End-to-end UX: user clicks Install → backend downloads + converts in background → runtime auto-detects + routes Wan generate calls through mlx-video. Tests: 21 in [test_mlx_video_wan_convert.py](tests/test_mlx_video_wan_convert.py), 9 Wan-routing in [test_mlx_video.py](tests/test_mlx_video.py), 15 in [test_mlx_video_wan_installer.py](tests/test_mlx_video_wan_installer.py). | | ~~FU-026~~ | ~~TaylorSeer + DBCache aggressive cache preset~~ | **Obsoleted 2026-05-03 by diffusers 0.38 core.** | Diffusers 0.38.0 (2026-05-01) ships ``TaylorSeerCacheConfig``, ``MagCacheConfig``, ``PyramidAttentionBroadcastConfig``, ``FasterCacheConfig`` natively — no ``cache-dit`` dependency required. Wired as registry strategies (ids ``taylorseer``, ``magcache``, ``pab``, ``fastercache``) in [cache_compression/__init__.py](cache_compression/__init__.py). Each adapter calls ``pipeline.transformer.enable_cache()``. UNet pipelines (SD1.5/SDXL) raise ``NotImplementedError`` into a runtimeNote, matching the FBCache contract. MagCache is FLUX-only without calibration UX (uses ``FLUX_MAG_RATIOS`` from ``diffusers.hooks.mag_cache``); other DiTs raise a "calibration required" message until that UX lands. | | FU-027 | NVIDIA/kvpress KV cache toolkit (CUDA-side) | **Setup install action pre-staged 2026-05-05; integration code pending.** | [NVIDIA/kvpress](https://github.com/NVIDIA/kvpress) — Apache 2.0, 1.1k stars, `kvpress>=0.5.3` registered in `_INSTALLABLE_PIP_PACKAGES` so the Setup tab can pre-stage the wheel. Integration hooks land separately under `cache_compression/kvpress.py` once the helper picks an adapter shape (the upstream library exposes `presses` per technique — e.g. SnapKV / TOVA / KIVI / pyramid — and a `Pipeline` wrapper that takes a HF transformers model). Apple Silicon stays on TurboQuant-MLX; this is the CUDA-side complement. | +| FU-028 | MTP (Multi-Token Prediction) speculative decoding | **In progress 2026-05-10 (Apple Silicon path).** | Lossless 1.5–2.2× speedup for trained-with-MTP models — Gemma-4 (drafters released 2026-05-05, Apache 2.0), DeepSeek V3/R1, Qwen3.5/3.6/Next, Nemotron-3, MiMo-V2-Flash. Apple Silicon path goes through ``mlx_lm.generate(..., draft_model=…, num_draft_tokens=N)`` against the same model (its native MTP heads), gated by a new ``mtp: bool`` field on ``GenerationConfig`` + ``GenerationRequest``. llama.cpp path waits on PR [#22673](https://github.com/ggml-org/llama.cpp/pull/22673) (draft as of 2026-05-10) and lands as a follow-up bump. Catalog adds ``mtpCapable`` flag for tagged models so the UI surfaces the toggle automatically. Token-identical output at temp 0 vs non-MTP path is a hard test gate. 
| +| FU-029 | KVTC (NVIDIA ICLR 2026) KV cache strategy | **In progress 2026-05-10.** | New strategy slot replacing the dropped `chaosengine` path. PCA + adaptive quantization + entropy coding via [OnlyTerp/kvtc](https://github.com/OnlyTerp/kvtc) (Apache 2.0). Same approach as the dropped ChaosEngine but 8–32× compression vs ChaosEngine's 3.7×, peer-reviewed at ICLR 2026, and beats TurboQuant by 37% at comparable quality on long-context. One-time per-model calibration cached to ``~/.chaosengine/kvtc-calibration//``. UI shows a "calibrating…" badge during the first selection per model. | +| ~~FU-030~~ | ~~Drop ChaosEngine + RotorQuant strategy slots~~ | **Shipped 2026-05-10.** | ChaosEngine (cryptopoly/ChaosEngine — 1 commit upstream, eclipsed by KVTC at ICLR 2026 with the same PCA approach but 8–32× compression vs 3.7×) and RotorQuant (shipped as a misleading alias for TurboQuant — same ``--cache-type-k turbo{N}`` flags + same Python module marker) both removed from the registry. Persisted user configs that still reference these ids coerce silently to ``turboquant`` via a new ``CacheStrategyRegistry.resolve_legacy_id`` helper + module-level ``_LEGACY_STRATEGY_ALIASES`` map ([cache_compression/__init__.py](cache_compression/__init__.py)). Mirror coercion in frontend ([src/components/runtimeSupport.ts](src/components/runtimeSupport.ts) ``LEGACY_STRATEGY_ALIASES`` + ``canonicalStrategyId``). Two-level llama.cpp fallback chain (was three-level: requested → ChaosEngine → native; now requested → native) in [backend_service/inference/llama_cpp_engine.py](backend_service/inference/llama_cpp_engine.py). Vendored ChaosEngine bundling stripped from [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (3 helper functions removed: ``stageVendoredChaosEngine`` + ``ensureSetuptoolsForPep639`` + ``resolveChaosEngineVendor``). Pre-build probe asserts the legacy-id coercion works in CI. ``[rotorquant]`` extra removed from [pyproject.toml](pyproject.toml). ``CHAOSENGINE_VENDOR_PATH`` env var dropped. Cache strategy speed/quality maps in [helpers/cache.py](backend_service/helpers/cache.py) trimmed to remaining strategies. | --- @@ -281,4 +282,3 @@ patches. | `CHAOSENGINE_LLAMA_SERVER_TURBO` | Override turbo llama-server path | `~/.chaosengine/bin/llama-server-turbo` | | `CHAOSENGINE_MLX_PYTHON` | Override Python for MLX | `.venv/bin/python` | | `CHAOSENGINE_LLAMA_BIN_DIR` | Override llama.cpp build dir for staging | `../llama.cpp/build/bin/` | -| `CHAOSENGINE_VENDOR_PATH` | Override ChaosEngine vendor path | `vendor/ChaosEngine/` | diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md index 1719951..4122f99 100644 --- a/THIRD_PARTY_NOTICES.md +++ b/THIRD_PARTY_NOTICES.md @@ -24,8 +24,8 @@ These may be compiled from source and shipped alongside ChaosEngineAI. - **Copyright:** Copyright (c) 2023-2026 The ggml authors - **Binary:** `llama-server-turbo`, `llama-cli-turbo` - **Usage:** Adds turbo2/3/4 KV cache quantisation types used by the - RotorQuant and TurboQuant cache strategies. Actively maintained fork - with support for recent model architectures (Gemma 4, etc.). + TurboQuant cache strategy. Actively maintained fork with support for + recent model architectures (Gemma 4, etc.). > **MIT licence notice (applies to both llama.cpp and the TurboQuant fork):** > @@ -46,18 +46,6 @@ These may be compiled from source and shipped alongside ChaosEngineAI. 
--- -## Vendored Packages - -### ChaosEngine (PCA-based KV cache compression) - -- **Repository:** -- **Licence:** Apache 2.0 -- **Submodule:** `vendor/ChaosEngine` -- **Usage:** Desktop builds may bundle this into the runtime via - `npm run stage:runtime`. - ---- - ## Optional Third-Party Cache Strategies ChaosEngineAI supports optional cache/compression strategy backends. @@ -66,8 +54,7 @@ If installed by the user, each is subject to its own licence: | Strategy | Package | Repository | Licence | |----------|---------|-----------|---------| | TriAttention | `triattention` | | See upstream | -| RotorQuant (marker) | `turboquant` | | Apache 2.0 | -| TurboQuant MLX | `turboquant-mlx` | | MIT | +| TurboQuant MLX | `turboquant-mlx-full` | | MIT | | MegaKernel | — | | See upstream | | TeaCache (diffusion) | vendored patches | | Apache 2.0 | diff --git a/backend_service/helpers/cache.py b/backend_service/helpers/cache.py index fc3892f..bce1eee 100644 --- a/backend_service/helpers/cache.py +++ b/backend_service/helpers/cache.py @@ -17,7 +17,6 @@ def _estimate_baseline_tok_s(system_stats: dict[str, Any]) -> float: def _strategy_speed_map(strategy: str) -> dict[int, float]: """Speed ratio maps by strategy and bit count (fraction of baseline FP16 speed).""" maps: dict[str, dict[int, float]] = { - "rotorquant": {1: 0.42, 2: 0.50, 3: 0.57, 4: 0.65}, "triattention": {1: 0.48, 2: 0.56, 3: 0.63, 4: 0.70}, "turboquant": {1: 0.44, 2: 0.52, 3: 0.60, 4: 0.67}, } @@ -27,7 +26,6 @@ def _strategy_speed_map(strategy: str) -> dict[int, float]: def _strategy_quality_base(strategy: str) -> dict[int, float]: """Base quality percentage by strategy and bit count (before fp16_layers bonus).""" maps: dict[str, dict[int, float]] = { - "rotorquant": {1: 88.0, 2: 91.0, 3: 93.5, 4: 96.0}, "triattention": {1: 89.5, 2: 92.0, 3: 94.5, 4: 97.0}, "turboquant": {1: 87.5, 2: 90.5, 3: 93.0, 4: 95.5}, } diff --git a/backend_service/inference/binaries.py b/backend_service/inference/binaries.py index def797e..df714de 100644 --- a/backend_service/inference/binaries.py +++ b/backend_service/inference/binaries.py @@ -93,8 +93,8 @@ def _resolve_llama_server() -> str | None: def _resolve_llama_server_turbo() -> str | None: """Resolve the TurboQuant fork of llama-server (``llama-server-turbo``). - This fork supports all standard cache types **plus** iso/planar/turbo - cache types required by RotorQuant and TurboQuant strategies. + This fork supports all standard cache types **plus** turbo2/3/4 + cache types required by the TurboQuant strategy. """ override = os.getenv("CHAOSENGINE_LLAMA_SERVER_TURBO") if override: diff --git a/backend_service/inference/llama_cpp_engine.py b/backend_service/inference/llama_cpp_engine.py index b3ec69c..8aa5a25 100644 --- a/backend_service/inference/llama_cpp_engine.py +++ b/backend_service/inference/llama_cpp_engine.py @@ -609,21 +609,15 @@ def load_model( from cache_compression import registry as _strategy_registry failed_strategy_name: str | None = None - # Try the requested strategy first. If it fails, try ChaosEngine - # (which uses standard cache types on the standard llama-server), - # then finally native f16. This ensures the user gets the best - # available compression even when the turbo binary can't load a - # particular model architecture. + # Try the requested strategy first. If it fails, fall back to + # native f16 directly. 
The previous chain inserted a ChaosEngine + # intermediate step; ChaosEngine was dropped in FU-030 because + # its llama.cpp path only emitted standard q-type flags (q4_0 etc.) + # which are already a subset of what native + the ggml cache types + # cover. The two-level chain is shorter to reason about and the + # behaviour at the boundary is unchanged for users. + cache_strategy = _strategy_registry.resolve_legacy_id(cache_strategy) attempts: list[tuple[str, bool, bool]] = [(cache_strategy, fit_model_in_memory, False)] - if cache_strategy not in ("native", "chaosengine"): - # Always include ChaosEngine as an intermediate fallback. Its - # llama.cpp path only emits standard cache-type flags (q4_0 etc.) - # and runs on the standard binary — it does NOT require the - # chaos_engine Python package to be installed. Gating on - # is_available() would skip this fallback on CI / dev machines - # that don't have the package, breaking the 3-level chain. - if _strategy_registry.get("chaosengine") is not None: - attempts.append(("chaosengine", False, True)) if cache_strategy != "native": attempts.append(("native", False, True)) last_error: str | None = None diff --git a/backend_service/routes/setup/__init__.py b/backend_service/routes/setup/__init__.py index 59e4982..d778a84 100644 --- a/backend_service/routes/setup/__init__.py +++ b/backend_service/routes/setup/__init__.py @@ -142,22 +142,7 @@ "mlx-video": "mlx-video @ git+https://github.com/Blaizzy/mlx-video.git", } -_MANUAL_INSTALL_MESSAGES: dict[str, str] = { - "chaosengine": ( - "ChaosEngine is not published on PyPI. Clone " - "https://github.com/cryptopoly/ChaosEngine and install it into the " - "backend runtime with: {python} -m pip install -e /path/to/ChaosEngine. " - "Desktop release builds can also bundle a vendored vendor/ChaosEngine " - "checkout automatically during npm run stage:runtime." - ), - "chaos-engine": ( - "ChaosEngine is not published on PyPI. Clone " - "https://github.com/cryptopoly/ChaosEngine and install it into the " - "backend runtime with: {python} -m pip install -e /path/to/ChaosEngine. " - "Desktop release builds can also bundle a vendored vendor/ChaosEngine " - "checkout automatically during npm run stage:runtime." - ), -} +_MANUAL_INSTALL_MESSAGES: dict[str, str] = {} def _workspace_root() -> Path: from backend_service.app import WORKSPACE_ROOT diff --git a/backend_service/state/metrics.py b/backend_service/state/metrics.py index 0570db2..1c48de3 100644 --- a/backend_service/state/metrics.py +++ b/backend_service/state/metrics.py @@ -3,7 +3,7 @@ Two responsibilities: 1. **Cache labels** — turn ``(strategy_id, bits, fp16_layers)`` into the - human-readable string the UI shows ("Native f16 cache", "RotorQuant + human-readable string the UI shows ("Native f16 cache", "TurboQ 3-bit 4+4", etc.). Cache strategies registered in ``cache_compression.registry`` get their label from the strategy itself; everything else falls back to the native naming. diff --git a/cache_compression/__init__.py b/cache_compression/__init__.py index 2fc5355..926c540 100644 --- a/cache_compression/__init__.py +++ b/cache_compression/__init__.py @@ -139,6 +139,26 @@ def label(self, bits: int, fp16_layers: int) -> str: return self.name +# ====================================================================== +# Legacy aliases +# ====================================================================== + +# FU-030 (2026-05-10): ``chaosengine`` and ``rotorquant`` strategy slots were +# dropped after upstream landscape review. 
ChaosEngine (1-commit upstream, +# eclipsed by NVIDIA's KVTC at ICLR 2026) and RotorQuant (shipped as a +# misleading alias for TurboQuant — same ``--cache-type-k turbo{N}`` flags + +# same Python module marker) added zero value over TurboQuant. +# +# Persisted user configs that still reference these ids are silently coerced +# to ``turboquant`` (their effective behaviour anyway). Old values flow +# through the public ``registry.get()`` and ``registry.resolve_legacy_id()`` +# entry points so callers do not need to special-case them. +_LEGACY_STRATEGY_ALIASES: dict[str, str] = { + "chaosengine": "turboquant", + "rotorquant": "turboquant", +} + + # ====================================================================== # Registry # ====================================================================== @@ -154,9 +174,18 @@ def __init__(self) -> None: def register(self, strategy: CacheStrategy) -> None: self._strategies[strategy.strategy_id] = strategy + def resolve_legacy_id(self, strategy_id: str) -> str: + """Return canonical id for a possibly-legacy strategy id. + + Pure string mapping — does not touch the registry, safe to call + before discovery. Unknown ids pass through unchanged. + """ + return _LEGACY_STRATEGY_ALIASES.get(strategy_id, strategy_id) + def get(self, strategy_id: str) -> CacheStrategy | None: self._ensure_discovered() - return self._strategies.get(strategy_id) + canonical = self.resolve_legacy_id(strategy_id) + return self._strategies.get(canonical) def default(self) -> CacheStrategy: self._ensure_discovered() @@ -209,16 +238,6 @@ def discover(self) -> list[CacheStrategy]: "supports_fp16_layers": False, "required_llama_binary": "standard", }, - { - "id": "rotorquant", - "name": "RotorQuant", - "module": "cache_compression.rotorquant", - "class_name": "RotorQuantStrategy", - "bit_range": (3, 4), - "default_bits": 3, - "supports_fp16_layers": True, - "required_llama_binary": "turbo", - }, { "id": "triattention", "name": "TriAttention", @@ -239,16 +258,6 @@ def discover(self) -> list[CacheStrategy]: "supports_fp16_layers": True, "required_llama_binary": "turbo", }, - { - "id": "chaosengine", - "name": "ChaosEngine", - "module": "cache_compression.chaosengine", - "class_name": "ChaosEngineStrategy", - "bit_range": (2, 8), - "default_bits": 4, - "supports_fp16_layers": True, - "required_llama_binary": "standard", - }, { # Diffusion-pipeline cache — applies to image/video DiTs, # not text LLMs. The `bit_range`/`default_bits` fields are diff --git a/cache_compression/chaosengine.py b/cache_compression/chaosengine.py deleted file mode 100644 index c8d9000..0000000 --- a/cache_compression/chaosengine.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Adapter for ChaosEngine KV cache compression (cryptopoly/ChaosEngine). - -ChaosEngine uses PCA-based decorrelation, channel truncation, and hybrid -quantization to compress KV cache memory. It achieves ~3.7x compression -on 8B models with an average attention output error of 0.034. - -Supports 2/4/8-bit compression tiers with per-channel asymmetric -quantization and importance-weighted bit allocation. - -Desktop builds can bundle vendored ChaosEngine automatically during -``npm run stage:runtime`` when ``vendor/ChaosEngine`` (or -``CHAOSENGINE_VENDOR_PATH``) is present. 
Source/dev installs can still use: -``./.venv/bin/python3 -m pip install -e /path/to/ChaosEngine`` -GitHub: https://github.com/cryptopoly/ChaosEngine -""" - -from __future__ import annotations - -import importlib.util -from typing import Any - -from cache_compression import CacheStrategy - - -def _chaosengine_available() -> bool: - try: - return importlib.util.find_spec("chaos_engine") is not None - except (ImportError, AttributeError, ValueError): - return False - - -class ChaosEngineStrategy(CacheStrategy): - - @property - def strategy_id(self) -> str: - return "chaosengine" - - @property - def name(self) -> str: - return "ChaosEngine" - - def is_available(self) -> bool: - return _chaosengine_available() - - def availability_badge(self) -> str: - return "Ready" if self.is_available() else "Install" - - def availability_tone(self) -> str: - return "ready" if self.is_available() else "install" - - def availability_reason(self) -> str | None: - if self.is_available(): - return None - return ( - "ChaosEngine is not bundled into this runtime. Desktop release builds " - "bundle it automatically when vendor/ChaosEngine (or " - "CHAOSENGINE_VENDOR_PATH) is present during npm run stage:runtime. " - "For source/dev installs, use: ./.venv/bin/python3 -m pip install -e " - "/path/to/ChaosEngine — then restart ChaosEngineAI. " - "GitHub: https://github.com/cryptopoly/ChaosEngine" - ) - - def supported_bit_range(self) -> tuple[int, int] | None: - return (2, 8) - - def default_bits(self) -> int | None: - return 4 - - def supports_fp16_layers(self) -> bool: - return True - - # ------------------------------------------------------------------ - # Engine integration - # ------------------------------------------------------------------ - - def make_mlx_cache(self, num_layers, bits, fp16_layers, fused, model) -> Any | None: - """ChaosEngine is PyTorch-based — no MLX cache support yet.""" - raise NotImplementedError( - "ChaosEngine KV cache compression currently requires PyTorch. " - "Use the llama.cpp (GGUF) or vLLM backend, or contribute MLX " - "support at https://github.com/cryptopoly/ChaosEngine." - ) - - def llama_cpp_cache_flags(self, bits: int) -> list[str]: - """Return cache-type flags for llama.cpp with ChaosEngine quantization. - - ChaosEngine uses PCA decorrelation + hybrid quantization. - Maps to q-type cache flags based on the configured bit width. - """ - bit_map = { - 2: "q4_0", - 3: "q4_0", - 4: "q4_0", - 5: "q5_0", - 6: "q8_0", - 8: "q8_0", - } - cache_type = bit_map.get(bits, "q8_0") - return ["--cache-type-k", cache_type, "--cache-type-v", cache_type] - - def estimate_cache_bytes( - self, - num_layers, - num_heads, - hidden_size, - context_tokens, - bits, - fp16_layers, - num_kv_heads=None, - ): - kv_heads = num_kv_heads if num_kv_heads and num_kv_heads > 0 else num_heads - kv_elements = 2 * num_layers * kv_heads * (hidden_size // max(num_heads, 1)) * context_tokens - baseline = kv_elements * 2 # FP16 = 2 bytes per element - - compressed_layers = max(0, num_layers - 2 * fp16_layers) - fp16_layer_count = num_layers - compressed_layers - elements_per_layer = kv_elements // max(num_layers, 1) - - # ChaosEngine achieves slightly better compression than naive - # quantization due to PCA decorrelation reducing redundancy - # before quantization. Apply a 0.85 factor to account for this. 
- pca_efficiency = 0.85 - optimised = ( - fp16_layer_count * elements_per_layer * 2 - + compressed_layers * elements_per_layer * bits / 8 * pca_efficiency - ) - return baseline, int(optimised) - - def label(self, bits: int, fp16_layers: int) -> str: - return f"ChaosEngine {bits}-bit {fp16_layers}+{fp16_layers}" diff --git a/cache_compression/rotorquant.py b/cache_compression/rotorquant.py deleted file mode 100644 index 4c70325..0000000 --- a/cache_compression/rotorquant.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Optional adapter for RotorQuant (scrya-com/rotorquant). - -RotorQuant provides IsoQuant (4D quaternion rotation) and PlanarQuant -(2D Givens rotation) KV cache compression. PyTorch/CUDA only — no MLX. - -The llama.cpp integration uses cache-type flags like ``iso3`` / ``planar3`` -via the RotorQuant llama.cpp fork. - -Install: ``pip install chaosengine-ai[rotorquant]`` -(installs the ``turboquant`` PyPI package) -""" - -from __future__ import annotations - -import importlib -from typing import Any - -from cache_compression import CacheStrategy - - -def _load_turboquant_module() -> Any | None: - try: - return importlib.import_module("turboquant") - except ImportError: - return None - - -def _has_rotorquant_marker(module: Any | None) -> bool: - if module is None: - return False - return any( - hasattr(module, name) - for name in ("IsoQuantMSE", "PlanarQuantMSE", "TurboQuantMSE", "TurboQuantIP", "TurboQuantCache") - ) - - -class RotorQuantStrategy(CacheStrategy): - - @property - def strategy_id(self) -> str: - return "rotorquant" - - @property - def name(self) -> str: - return "RotorQuant" - - def is_available(self) -> bool: - # The Python package is only used as an installation marker here. - # Actual execution still routes through the RotorQuant llama.cpp fork. - return _has_rotorquant_marker(_load_turboquant_module()) - - def availability_badge(self) -> str: - return "Ready" if self.is_available() else "Install" - - def availability_tone(self) -> str: - return "ready" if self.is_available() else "install" - - def availability_reason(self) -> str | None: - if self.is_available(): - return None - return "Install turboquant into ChaosEngineAI's backend runtime, then restart the app." - - def supported_bit_range(self) -> tuple[int, int] | None: - return (3, 4) - - def default_bits(self) -> int | None: - return 3 - - def supports_fp16_layers(self) -> bool: - return True - - def required_llama_binary(self) -> str: - return "turbo" - - # ------------------------------------------------------------------ - # Engine integration - # ------------------------------------------------------------------ - - def make_mlx_cache(self, num_layers, bits, fp16_layers, fused, model) -> Any | None: - """RotorQuant is PyTorch/CUDA only — no MLX support.""" - raise NotImplementedError( - "RotorQuant requires PyTorch/CUDA and does not support MLX. " - "Use the llama.cpp (GGUF) backend with RotorQuant cache types, " - "or the vLLM backend." - ) - - def llama_cpp_cache_flags(self, bits: int) -> list[str]: - """Return cache-type flags for the TurboQuant llama.cpp fork. - - The fork (github.com/TheTom/llama-cpp-turboquant, branch - ``feature/turboquant-kv-cache``) supports ``turbo2``, ``turbo3``, - ``turbo4`` as cache-type values. - - RotorQuant maps to the same turbo cache types — both are - rotation-based KV cache quantization. 
- """ - clamped = max(2, min(4, bits)) - return ["--cache-type-k", f"turbo{clamped}", "--cache-type-v", f"turbo{clamped}"] - - def estimate_cache_bytes(self, num_layers, num_heads, hidden_size, context_tokens, bits, fp16_layers, num_kv_heads=None): - kv_heads = num_kv_heads if num_kv_heads and num_kv_heads > 0 else num_heads - kv_elements = 2 * num_layers * kv_heads * (hidden_size // max(num_heads, 1)) * context_tokens - baseline = kv_elements * 2 - compressed_layers = max(0, num_layers - 2 * fp16_layers) - fp16_layer_count = num_layers - compressed_layers - elements_per_layer = kv_elements // max(num_layers, 1) - optimised = (fp16_layer_count * elements_per_layer * 2) + (compressed_layers * elements_per_layer * bits / 8) - return baseline, int(optimised) - - def label(self, bits: int, fp16_layers: int) -> str: - return f"Rotor {bits}-bit {fp16_layers}+{fp16_layers}" diff --git a/pyproject.toml b/pyproject.toml index 7988d52..378d497 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,6 @@ mlx-vlm = [ ] triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git", "vllm>=0.8.0"] triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git", "mlx-lm>=0.22.0"] -rotorquant = ["turboquant>=0.2.0"] turboquant = ["turboquant-mlx-full>=0.3.0"] vllm = ["vllm>=0.8.0"] dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@8d8545d791383008b5e2b1e738c38a7a73ba484e"] diff --git a/scripts/pre-build-check.mjs b/scripts/pre-build-check.mjs index a07927c..6cf0bce 100755 --- a/scripts/pre-build-check.mjs +++ b/scripts/pre-build-check.mjs @@ -165,7 +165,6 @@ console.log("[4/7] Licence notices..."); "llama-cpp-turboquant", "dflash-mlx", "turboquant", - "chaosengine", ]; const missing = required.filter((dep) => !content.includes(dep.toLowerCase())); if (missing.length === 0) { @@ -186,19 +185,25 @@ console.log("[5/7] Cache strategy validation..."); from cache_compression import registry registry.discover() valid = {'f32','f16','bf16','q8_0','q4_0','q4_1','iq4_nl','q5_0','q5_1'} -ce = registry.get('chaosengine') -for bits in (2,3,4,5,6,8): - flags = ce.llama_cpp_cache_flags(bits) +nat = registry.get('native') +for bits in (0,): + flags = nat.llama_cpp_cache_flags(bits) for i, f in enumerate(flags): if f.startswith('--cache-type-') and i+1 < len(flags): if flags[i+1] not in valid: - print(f'INVALID: ChaosEngine {bits}-bit emits {flags[i+1]}') -rq = registry.get('rotorquant') + print(f'INVALID: Native emits {flags[i+1]}') tq = registry.get('turboquant') -if rq.required_llama_binary() != 'turbo': - print('INVALID: RotorQuant not routing to turbo') if tq.required_llama_binary() != 'turbo': print('INVALID: TurboQuant not routing to turbo') +# FU-030 (2026-05-10): rotorquant + chaosengine were dropped. Their ids +# must coerce to turboquant via the legacy alias map; assert that here so +# regressions surface in CI rather than at runtime. 
+for legacy_id in ('rotorquant', 'chaosengine'): + coerced = registry.resolve_legacy_id(legacy_id) + if coerced != 'turboquant': + print(f'INVALID: legacy id {legacy_id} did not coerce to turboquant (got {coerced})') + if registry.get(legacy_id) is None: + print(f'INVALID: legacy id {legacy_id} did not resolve via registry.get') print('OK') `.trim(); const result = capture(venvPython(), ["-c", probe]); diff --git a/scripts/stage-runtime.mjs b/scripts/stage-runtime.mjs index bded9c9..97fe3d6 100644 --- a/scripts/stage-runtime.mjs +++ b/scripts/stage-runtime.mjs @@ -68,7 +68,6 @@ function main() { copyFile(path.join(workspaceRoot, relativeFile), path.join(backendDest, relativeFile)); } - const chaosEngineBundle = stageVendoredChaosEngine(pythonInfo.executable); const bundledOptionalPackages = stageOptionalRuntimePackages(pythonInfo.executable); validateBundledProjectImports(pythonInfo.executable); const llamaWarnings = stageLlamaBinaries(); @@ -94,7 +93,7 @@ function main() { // the backend probe reports the engine as unavailable. See FU-008. sdCpp: fs.existsSync(path.join(binDest, binaryName("sd"))) ? `bin/${binaryName("sd")}` : null, pythonVersion: pythonInfo.versionTag, - bundledCacheStrategies: chaosEngineBundle ? ["chaosengine"] : [], + bundledCacheStrategies: [], bundledOptionalPackages: bundledOptionalPackages, warnings: llamaWarnings, }; @@ -260,7 +259,7 @@ function validateBundledProjectImports(pythonBinary) { env, }).trim(); const ids = JSON.parse(payload); - const expected = ["native", "rotorquant", "triattention", "turboquant", "chaosengine"]; + const expected = ["native", "triattention", "turboquant"]; const missing = expected.filter((id) => !ids.includes(id)); if (missing.length === 0) { return; @@ -275,122 +274,10 @@ function validateBundledProjectImports(pythonBinary) { console.warn(`[stage-runtime] warning: ${message}`); } -function stageVendoredChaosEngine(pythonBinary) { - const vendor = resolveChaosEngineVendor(); - if (!vendor) { - return null; - } - - // vendor/ChaosEngine/pyproject.toml declares `license = "Apache-2.0"` per - // PEP 639. Setuptools < 77 rejects the string form with: - // "project.license must be valid exactly by one definition (2 matches found)" - // Since we pass --no-build-isolation, pip uses whatever setuptools the build - // venv has — and fresh Windows venvs sometimes ship 65.x. Upgrade in place - // before the vendor install so the build works without requiring the user - // to run build.ps1 first (e.g. `npm run tauri:dev`). - ensureSetuptoolsForPep639(pythonBinary); - - console.log(`[stage-runtime] bundling ChaosEngine (${vendor.source})`); - try { - execFileSync( - pythonBinary, - [ - "-m", - "pip", - "install", - "--disable-pip-version-check", - "--no-deps", - "--no-compile", - "--no-build-isolation", - "--upgrade", - "--target", - sitePackagesDest, - vendor.path, - ], - { - cwd: workspaceRoot, - stdio: "inherit", - }, - ); - return vendor; - } catch (err) { - if (strict) { - throw err; - } - console.warn( - `[stage-runtime] warning: could not bundle ChaosEngine vendor package (${err.message.split("\n")[0]}). ` + - `The ChaosEngine cache strategy will fall back to the system-installed package if available.`, - ); - return null; - } -} - -function ensureSetuptoolsForPep639(pythonBinary) { - // Bound: >=77 for PEP 639 license strings, <82 because modern torch - // wheels declare ``setuptools<82`` and pip's resolver surfaces a loud - // "requires setuptools<82" warning on every subsequent invocation once - // 82.x is installed. 
- const checkScript = [ - "import sys", - "try:", - " from importlib.metadata import version", - " v = version('setuptools')", - "except Exception:", - " sys.exit(2)", - "parts = [int(p) for p in v.split('.')[:2] if p.isdigit()]", - "if not parts:", - " sys.exit(1)", - "major = parts[0]", - "sys.exit(0 if 77 <= major < 82 else 1)", - ].join("\n"); - - let ok = false; - try { - execFileSync(pythonBinary, ["-c", checkScript], { stdio: "ignore" }); - ok = true; - } catch { - ok = false; - } - if (ok) return; - - console.log(`[stage-runtime] pinning setuptools to >=77,<82 for PEP 639 licenses + torch compatibility`); - try { - execFileSync( - pythonBinary, - ["-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "setuptools>=77,<82", "wheel"], - { cwd: workspaceRoot, stdio: "inherit" }, - ); - } catch (err) { - console.warn( - `[stage-runtime] warning: could not pin setuptools (${err.message.split("\n")[0]}). ` + - `Vendor install may fail and pip may warn about torch incompatibility.`, - ); - } -} - -function resolveChaosEngineVendor() { - const override = process.env.CHAOSENGINE_VENDOR_PATH; - if (override) { - return { - path: resolveExistingPath(override, "ChaosEngine vendor path"), - source: "env-override", - }; - } - - const vendoredPath = path.join(workspaceRoot, "vendor", "ChaosEngine"); - if (!fs.existsSync(vendoredPath)) { - return null; - } - return { - path: fs.realpathSync(vendoredPath), - source: "vendor/ChaosEngine", - }; -} - function stageOptionalRuntimePackages(pythonBinary) { // Pre-install optional runtime packages into the staged site-packages - // so that DFlash, TurboQuant, and RotorQuant work out of the box for - // new users without requiring manual pip installs via the Setup page. + // so that DFlash and TurboQuant work out of the box for new users + // without requiring manual pip installs via the Setup page. // // Each entry: { pipName, importName, platforms? } // - pipName: passed verbatim to ``pip install`` (may be a PyPI name or diff --git a/src/App.tsx b/src/App.tsx index 5155bf8..c4cab30 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -1141,7 +1141,6 @@ export default function App() { strategyCompat={{ turboInstalled: !!workspace.system.llamaServerTurboPath, turboquantMlxAvailable: workspace.system.availableCacheStrategies?.some((s) => s.id === "turboquant" && s.available) ?? false, - chaosengineAvailable: workspace.system.availableCacheStrategies?.some((s) => s.id === "chaosengine" && s.available) ?? false, dflashSupportedModels: workspace.system.dflash?.supportedModels ?? 
[], }} activeDownloads={activeDownloads} diff --git a/src/components/__tests__/kvStrategyFilter.test.ts b/src/components/__tests__/kvStrategyFilter.test.ts index 57ae3da..1fda590 100644 --- a/src/components/__tests__/kvStrategyFilter.test.ts +++ b/src/components/__tests__/kvStrategyFilter.test.ts @@ -18,14 +18,12 @@ function makeStrategy(overrides: Partial): Strategy { } const NATIVE = makeStrategy({ id: "native", name: "Native f16" }); -const ROTORQUANT = makeStrategy({ id: "rotorquant", name: "RotorQuant", requiredLlamaBinary: "turbo" }); const TURBOQUANT = makeStrategy({ id: "turboquant", name: "TurboQuant", requiredLlamaBinary: "turbo" }); -const CHAOSENGINE = makeStrategy({ id: "chaosengine", name: "ChaosEngine" }); const TRIATTENTION = makeStrategy({ id: "triattention", name: "TriAttention" }); const TEACACHE = makeStrategy({ id: "teacache", name: "TeaCache", appliesTo: ["image", "video"] }); const FBCACHE = makeStrategy({ id: "fbcache", name: "First Block Cache", appliesTo: ["image", "video"] }); -const ALL = [NATIVE, ROTORQUANT, TURBOQUANT, CHAOSENGINE, TRIATTENTION, TEACACHE, FBCACHE]; +const ALL = [NATIVE, TURBOQUANT, TRIATTENTION, TEACACHE, FBCACHE]; describe("filterTextStrategies", () => { it("returns empty for null input", () => { @@ -39,7 +37,6 @@ describe("filterTextStrategies", () => { }); it("MLX engine: only native + turboquant (matches launch-settings modal)", () => { - // RotorQuant + ChaosEngine require llama.cpp / vLLM substrate; // TriAttention requires vLLM. STRATEGY_ENGINE_SUPPORT in // runtimeSupport.ts is the single source of truth; the chip // mirrors the modal verdict so users don't see options the @@ -48,26 +45,24 @@ describe("filterTextStrategies", () => { expect(out.sort()).toEqual(["native", "turboquant"]); }); - it("llama.cpp engine: native + rotorquant + turboquant + chaosengine", () => { + it("llama.cpp engine: native + turboquant", () => { const out = filterTextStrategies(ALL, "llama.cpp").map((s) => s.id); - expect(out.sort()).toEqual(["chaosengine", "native", "rotorquant", "turboquant"]); + expect(out.sort()).toEqual(["native", "turboquant"]); }); it("gguf substring matches the llama.cpp set (engine label can be 'gguf')", () => { const out = filterTextStrategies(ALL, "gguf").map((s) => s.id); - expect(out.sort()).toEqual(["chaosengine", "native", "rotorquant", "turboquant"]); + expect(out.sort()).toEqual(["native", "turboquant"]); }); - it("vllm engine: full set including triattention (matches modal)", () => { - // ``STRATEGY_ENGINE_SUPPORT`` lists rotorquant / chaosengine / - // turboquant as vLLM-compatible alongside triattention, so the - // chip mirrors the modal and shows them all. Diffusion-only - // strategies (TeaCache / FBCache) stay out via layer 1. + it("vllm engine: native + turboquant + triattention", () => { + // FU-030: chaosengine + rotorquant slots dropped. The remaining + // text strategies on vLLM are native + turboquant + triattention. + // Diffusion-only strategies (TeaCache / FBCache) stay out via + // layer 1. 
const out = filterTextStrategies(ALL, "vllm").map((s) => s.id); expect(out.sort()).toEqual([ - "chaosengine", "native", - "rotorquant", "triattention", "turboquant", ]); diff --git a/src/components/__tests__/runtimeSupport.test.ts b/src/components/__tests__/runtimeSupport.test.ts index 9154af2..454bda5 100644 --- a/src/components/__tests__/runtimeSupport.test.ts +++ b/src/components/__tests__/runtimeSupport.test.ts @@ -132,8 +132,18 @@ describe("sanitizeSpeculativeSelection()", () => { }); describe("strategy compatibility helpers", () => { - it("flags RotorQuant as incompatible with MLX", () => { - expect(isStrategyCompatible("rotorquant", "mlx")).toBe(false); - expect(strategyIncompatReason("rotorquant", "mlx")).toContain("llama.cpp or vLLM"); + it("flags TriAttention as incompatible with MLX", () => { + expect(isStrategyCompatible("triattention", "mlx")).toBe(false); + expect(strategyIncompatReason("triattention", "mlx")).toContain("vLLM"); + }); + + it("FU-030: legacy chaosengine + rotorquant ids coerce to turboquant", () => { + // Persisted user configs that still reference the dropped ids must + // route through ``canonicalStrategyId`` so frontend filters treat + // them as turboquant. Mirrors backend ``registry.resolve_legacy_id``. + expect(isStrategyCompatible("chaosengine", "mlx")).toBe(true); + expect(isStrategyCompatible("rotorquant", "mlx")).toBe(true); + expect(strategyIncompatReason("chaosengine", "mlx")).toBeNull(); + expect(strategyIncompatReason("rotorquant", "mlx")).toBeNull(); }); }); diff --git a/src/components/runtimeSupport.ts b/src/components/runtimeSupport.ts index b052e4f..3629638 100644 --- a/src/components/runtimeSupport.ts +++ b/src/components/runtimeSupport.ts @@ -3,17 +3,30 @@ import type { SystemStats } from "../types"; const COMMUNITY_PREFIXES = ["mlx-community/", "lmstudio-community/", "thebloke/", "bartowski/"]; const QUANT_SUFFIXES = /[-_](?:bf16|fp16|f16|\d+bit|q\d(?:_[a-z0-9]+)*|gguf|mlx|instruct)$/i; +// FU-030 (2026-05-10): chaosengine + rotorquant slots dropped. Persisted +// session configs that still reference them coerce to ``turboquant`` via +// the backend's ``registry.resolve_legacy_id`` map; the same coercion is +// mirrored here so frontend filters work correctly when older session +// snapshots are rehydrated. Update both sides if the alias map changes. +export const LEGACY_STRATEGY_ALIASES: Record = { + chaosengine: "turboquant", + rotorquant: "turboquant", +}; + +export function canonicalStrategyId(strategyId: string): string { + return LEGACY_STRATEGY_ALIASES[strategyId] ?? 
strategyId; +} + export const STRATEGY_ENGINE_SUPPORT: Record = { native: ["mlx", "gguf", "llama.cpp", "vllm", "auto"], triattention: ["vllm"], - rotorquant: ["gguf", "llama.cpp", "vllm"], turboquant: ["mlx", "gguf", "llama.cpp", "vllm", "auto"], - chaosengine: ["gguf", "llama.cpp", "vllm"], }; export function isStrategyCompatible(strategyId: string, backend: string | null | undefined): boolean { if (!backend || backend === "auto") return true; - const supported = STRATEGY_ENGINE_SUPPORT[strategyId]; + const canonical = canonicalStrategyId(strategyId); + const supported = STRATEGY_ENGINE_SUPPORT[canonical]; if (!supported) return true; return supported.some((candidate) => backend.includes(candidate)); } @@ -21,9 +34,8 @@ export function isStrategyCompatible(strategyId: string, backend: string | null export function strategyIncompatReason(strategyId: string, backend: string | null | undefined): string | null { if (!backend || backend === "auto" || isStrategyCompatible(strategyId, backend)) return null; const engineLabel = backend.includes("gguf") || backend.includes("llama") ? "llama.cpp" : backend; - if (strategyId === "triattention") return "TriAttention requires the vLLM backend (Linux + CUDA)."; - if (strategyId === "rotorquant") return `RotorQuant requires llama.cpp or vLLM, not ${engineLabel}.`; - if (strategyId === "chaosengine") return `ChaosEngine requires llama.cpp or vLLM, not ${engineLabel}.`; + const canonical = canonicalStrategyId(strategyId); + if (canonical === "triattention") return "TriAttention requires the vLLM backend (Linux + CUDA)."; return `Not compatible with the ${engineLabel} backend.`; } diff --git a/src/features/chat/__tests__/kvStrategyOverride.test.ts b/src/features/chat/__tests__/kvStrategyOverride.test.ts index 76f191d..1d9a620 100644 --- a/src/features/chat/__tests__/kvStrategyOverride.test.ts +++ b/src/features/chat/__tests__/kvStrategyOverride.test.ts @@ -39,7 +39,7 @@ describe("kvStrategyOverride storage", () => { }); it("clears storage when given null", () => { - writeKvStrategyOverride("s1", { strategy: "chaosengine", bits: 8 }); + writeKvStrategyOverride("s1", { strategy: "triattention", bits: 3 }); writeKvStrategyOverride("s1", null); expect(readKvStrategyOverride("s1")).toBeNull(); expect(window.localStorage.getItem("chat.kvStrategy.s1")).toBeNull(); @@ -61,9 +61,9 @@ describe("kvStrategyOverride storage", () => { }); it("scopes overrides per session", () => { - writeKvStrategyOverride("s1", { strategy: "chaosengine", bits: 8 }); + writeKvStrategyOverride("s1", { strategy: "triattention", bits: 3 }); writeKvStrategyOverride("s2", { strategy: "turboquant", bits: 4 }); - expect(readKvStrategyOverride("s1")).toEqual({ strategy: "chaosengine", bits: 8 }); + expect(readKvStrategyOverride("s1")).toEqual({ strategy: "triattention", bits: 3 }); expect(readKvStrategyOverride("s2")).toEqual({ strategy: "turboquant", bits: 4 }); }); }); diff --git a/src/features/models/MyModelsTab.tsx b/src/features/models/MyModelsTab.tsx index 1d3966d..1e2f52d 100644 --- a/src/features/models/MyModelsTab.tsx +++ b/src/features/models/MyModelsTab.tsx @@ -35,7 +35,6 @@ export interface LibraryRow { interface StrategyCompatInfo { turboInstalled: boolean; turboquantMlxAvailable: boolean; - chaosengineAvailable: boolean; dflashSupportedModels: string[]; } @@ -238,10 +237,6 @@ export function MyModelsTab({ } case "turboquant": return (isGGUF && !!strategyCompat?.turboInstalled) || (isMLX && !!strategyCompat?.turboquantMlxAvailable); - case "rotorquant": - return isGGUF && 
!!strategyCompat?.turboInstalled; - case "chaosengine": - return isGGUF && !!strategyCompat?.chaosengineAvailable; default: return true; } @@ -250,8 +245,6 @@ export function MyModelsTab({ const STRATEGY_FILTERS = [ { id: "dflash", label: "DFlash", color: "#a78bfa" }, { id: "turboquant", label: "TurboQuant", color: "#60a5fa" }, - { id: "rotorquant", label: "RotorQuant", color: "#34d399" }, - { id: "chaosengine", label: "ChaosEngine", color: "#f59e0b" }, ]; const allLibraryCaps = filteredLibraryRows.flatMap(({ matchedVariant }) => matchedVariant?.capabilities ?? []); diff --git a/src/hooks/useSettings.ts b/src/hooks/useSettings.ts index 5ed8217..a571c50 100644 --- a/src/hooks/useSettings.ts +++ b/src/hooks/useSettings.ts @@ -468,17 +468,15 @@ export function useSettings( async function handleInstallPackage(strategyId: string) { // Strategies that need the turbo binary (llama-server-turbo) for GGUF. - const needsTurboBinary = strategyId === "rotorquant" || strategyId === "turboquant"; + const needsTurboBinary = strategyId === "turboquant"; const pipPackageMap: Record = { - rotorquant: "turboquant", turboquant: "turboquant-mlx", triattention: "triattention", "dflash-mlx": "dflash-mlx", dflash: "dflash", }; const pipCommandMap: Record = { - rotorquant: "./.venv/bin/python3 -m pip install turboquant", turboquant: "./.venv/bin/python3 -m pip install turboquant-mlx-full", triattention: "./.venv/bin/python3 -m pip install 'triattention @ git+https://github.com/WeianMao/triattention.git'", "dflash-mlx": "./.venv/bin/python3 -m pip install 'dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@f825ffb268e50d531e8b6524413b0847334a14dd'", @@ -487,19 +485,11 @@ export function useSettings( const pipName = pipPackageMap[strategyId]; if (!pipName) { beginInstallLog(strategyId); - if (strategyId === "chaosengine") { - const message = "ChaosEngine is not on PyPI. Desktop builds can bundle a vendored vendor/ChaosEngine checkout during npm run stage:runtime. For source/dev installs, clone https://github.com/cryptopoly/ChaosEngine and install it into the backend runtime with ./.venv/bin/python3 -m pip install -e /path/to/ChaosEngine, then restart ChaosEngineAI."; - addInstallLogStep(strategyId, "manual", "Manual install required", "./.venv/bin/python3 -m pip install -e /path/to/ChaosEngine"); - finishInstallLogStep(strategyId, "manual", "failed", message); - finishInstallLog(strategyId, "failed"); - setError(message); - } else { - const message = `No installer is configured for ${strategyId}.`; - addInstallLogStep(strategyId, "manual", "No installer configured", strategyId); - finishInstallLogStep(strategyId, "manual", "failed", message); - finishInstallLog(strategyId, "failed"); - setError(message); - } + const message = `No installer is configured for ${strategyId}.`; + addInstallLogStep(strategyId, "manual", "No installer configured", strategyId); + finishInstallLogStep(strategyId, "manual", "failed", message); + finishInstallLog(strategyId, "failed"); + setError(message); return; } beginInstallLog(strategyId); diff --git a/tests/inference-batch-dflash-comparison.json b/tests/inference-batch-dflash-comparison.json index 5965d94..995f6e1 100644 --- a/tests/inference-batch-dflash-comparison.json +++ b/tests/inference-batch-dflash-comparison.json @@ -95,12 +95,12 @@ "prompt": "What is the capital of France? Answer in one sentence." 
}, { - "_label": "Qwen3.5-9B GGUF — RotorQuant 4-bit", + "_label": "Qwen3.5-9B GGUF — TurboQuant 4-bit", "modelRef": "lmstudio-community/Qwen3.5-9B-GGUF", "modelName": "Qwen3.5-9B-GGUF", "path": "/Users/dan/.cache/huggingface/hub/models--lmstudio-community--Qwen3.5-9B-GGUF", "backend": "gguf", - "cacheStrategy": "rotorquant", + "cacheStrategy": "turboquant", "cacheBits": 4, "fp16Layers": 4, "contextTokens": 4096, diff --git a/tests/inference-batch-strategies.json b/tests/inference-batch-strategies.json index 5c1a112..631f03a 100644 --- a/tests/inference-batch-strategies.json +++ b/tests/inference-batch-strategies.json @@ -19,13 +19,13 @@ "prompt": "What is the capital of France? Answer in one sentence." }, { - "_label": "Test 2: RotorQuant via turbo binary (GGUF)", + "_label": "Test 2: TriAttention via standard binary (GGUF)", "modelRef": "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF", "modelName": "Nemotron-4B-GGUF", "path": "/Users/dan/.cache/huggingface/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-4B-GGUF", "backend": "gguf", - "cacheStrategy": "rotorquant", - "cacheBits": 4, + "cacheStrategy": "triattention", + "cacheBits": 3, "fp16Layers": 4, "contextTokens": 4096, "maxTokens": 256, @@ -38,7 +38,8 @@ "prompt": "Explain what a neural network is in two sentences." }, { - "_label": "Test 3: ChaosEngine via standard binary (GGUF)", + "_label": "Test 3: FU-030 legacy id coercion (chaosengine -> turboquant)", + "_note": "Persisted user configs that still reference the dropped chaosengine id must run as TurboQuant via registry.resolve_legacy_id without throwing.", "modelRef": "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF", "modelName": "Nemotron-4B-GGUF", "path": "/Users/dan/.cache/huggingface/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-4B-GGUF", diff --git a/tests/test_backend_service.py b/tests/test_backend_service.py index 6d881c5..a4b0810 100644 --- a/tests/test_backend_service.py +++ b/tests/test_backend_service.py @@ -1529,7 +1529,7 @@ def test_html_challenge_persists_manifest_and_html_files(self): "launch": { "temperature": 0.7, "maxTokens": 8192, - "cacheStrategy": "chaosengine", + "cacheStrategy": "turboquant", "cacheBits": 4, "fp16Layers": 0, "fusedAttention": False, @@ -1587,7 +1587,7 @@ def test_html_challenge_persists_manifest_and_html_files(self): self.assertIn("Q4_K_M", settings_text) self.assertIn("16.3 GB", settings_text) self.assertIn("128K", settings_text) - self.assertIn("chaosengine 4-bit · 256K ctx · 8K max · temp 0.7", settings_text) + self.assertIn("turboquant 4-bit · 256K ctx · 8K max · temp 0.7", settings_text) self.assertIn("turboquant 3-bit · 256K ctx · 8K max · temp 0.7", settings_text) self.assertIn("Thinking off", settings_text) self.assertEqual([slot["status"] for slot in challenge["slots"]], ["done", "done"]) @@ -2006,7 +2006,7 @@ def test_mlx_cache_only_change_uses_profile_update_without_weight_reload(self): "canonicalRepo": "google/gemma-4-E4B-it", "source": "catalog", "backend": "mlx", - "cacheStrategy": "rotorquant", + "cacheStrategy": "turboquant", "cacheBits": 4, "fp16Layers": 2, "fusedAttention": True, @@ -2017,7 +2017,7 @@ def test_mlx_cache_only_change_uses_profile_update_without_weight_reload(self): self.assertEqual(response.status_code, 200) load_model_spy.assert_not_called() self.assertEqual(len(state.runtime.profile_updates), 1) - self.assertEqual(state.runtime.loaded_model.cacheStrategy, "rotorquant") + self.assertEqual(state.runtime.loaded_model.cacheStrategy, "turboquant") self.assertEqual(state.runtime.loaded_model.cacheBits, 4) 
self.assertEqual(state.runtime.loaded_model.fp16Layers, 2) self.assertTrue(state.runtime.loaded_model.fusedAttention) @@ -2067,20 +2067,22 @@ def test_preview_math_reduces_cache_size(self): context_tokens=8192, params_b=7.0, system_stats=fake_system_snapshot(), - strategy="rotorquant", + strategy="turboquant", ) self.assertLess(preview["optimizedCacheGb"], preview["baselineCacheGb"]) self.assertGreater(preview["compressionRatio"], 1.0) - def test_manual_cache_backend_install_returns_helpful_error(self): - for package_name in ("chaosengine", "chaos-engine"): - response = self.client.post( - "/api/setup/install-package", - json={"package": package_name}, - ) - self.assertEqual(response.status_code, 400) - self.assertIn("not published on PyPI", response.json()["detail"]) - self.assertIn("pip install -e /path/to/ChaosEngine", response.json()["detail"]) + def test_unknown_package_install_returns_helpful_error(self): + """FU-030: ``chaosengine`` was a manual install candidate; the strategy + is gone now. Installing an unknown package must surface the standard + "not in allowed install list" 400 instead of the old ChaosEngine + clone-and-install message.""" + response = self.client.post( + "/api/setup/install-package", + json={"package": "not-a-real-package"}, + ) + self.assertEqual(response.status_code, 400) + self.assertIn("not in the allowed install list", response.json()["detail"]) def test_convert_endpoint_returns_conversion_payload(self): state = ChaosEngineState( diff --git a/tests/test_cache_strategies.py b/tests/test_cache_strategies.py index db144e7..cd50b2d 100644 --- a/tests/test_cache_strategies.py +++ b/tests/test_cache_strategies.py @@ -8,7 +8,6 @@ from cache_compression import CacheStrategyRegistry from cache_compression.native import NativeStrategy -from cache_compression.rotorquant import RotorQuantStrategy from cache_compression.triattention import TriAttentionStrategy from cache_compression.turboquant import TurboQuantStrategy from turboquant_mlx import _find_pip_turboquant_path @@ -29,37 +28,38 @@ def test_native_is_default(self): self.assertEqual(default.strategy_id, "native") def test_external_strategies_registered(self): - for strategy_id in ("triattention", "rotorquant"): + for strategy_id in ("triattention", "turboquant"): strategy = self.registry.get(strategy_id) self.assertIsNotNone(strategy, f"Strategy '{strategy_id}' not found in registry") - def test_available_returns_all_strategies(self): + def test_available_returns_active_strategies(self): available = self.registry.available() ids = [s["id"] for s in available] self.assertIn("native", ids) - self.assertIn("rotorquant", ids) self.assertIn("triattention", ids) self.assertIn("turboquant", ids) - self.assertIn("chaosengine", ids) + # FU-030: dropped strategies must NOT appear in the available output. 
+ self.assertNotIn("rotorquant", ids) + self.assertNotIn("chaosengine", ids) self.assertEqual(len(ids), len(set(ids))) def test_discover_keeps_placeholder_when_optional_adapter_import_fails(self): real_import_module = importlib.import_module def fake_import(name, package=None): - if name == "cache_compression.rotorquant": - raise RuntimeError("broken rotorquant import") + if name == "cache_compression.triattention": + raise RuntimeError("broken triattention import") return real_import_module(name, package) registry = CacheStrategyRegistry() with patch("cache_compression.importlib.import_module", side_effect=fake_import): registry.discover() - rotor = registry.get("rotorquant") - self.assertIsNotNone(rotor) - self.assertFalse(rotor.is_available()) - self.assertIn("could not be loaded", rotor.availability_reason()) - self.assertIn("broken rotorquant import", rotor.availability_reason()) + tri = registry.get("triattention") + self.assertIsNotNone(tri) + self.assertFalse(tri.is_available()) + self.assertIn("could not be loaded", tri.availability_reason()) + self.assertIn("broken triattention import", tri.availability_reason()) def test_native_cache_flags(self): native = self.registry.get("native") @@ -114,58 +114,33 @@ def test_triattention_estimate_compresses(self): self.assertLess(optimised, baseline) # ------------------------------------------------------------------ - # RotorQuant + # FU-030: legacy alias coercion (chaosengine + rotorquant) # ------------------------------------------------------------------ - def test_rotorquant_bit_range(self): - rq = self.registry.get("rotorquant") - self.assertEqual(rq.supported_bit_range(), (3, 4)) + def test_legacy_chaosengine_coerces_to_turboquant(self): + """Persisted configs with ``chaosengine`` must resolve to TurboQuant.""" + coerced = self.registry.resolve_legacy_id("chaosengine") + self.assertEqual(coerced, "turboquant") - def test_rotorquant_llama_flags(self): - rq = self.registry.get("rotorquant") - flags = rq.llama_cpp_cache_flags(3) - self.assertEqual(flags, ["--cache-type-k", "turbo3", "--cache-type-v", "turbo3"]) - flags4 = rq.llama_cpp_cache_flags(4) - self.assertEqual(flags4, ["--cache-type-k", "turbo4", "--cache-type-v", "turbo4"]) + def test_legacy_rotorquant_coerces_to_turboquant(self): + """Persisted configs with ``rotorquant`` must resolve to TurboQuant.""" + coerced = self.registry.resolve_legacy_id("rotorquant") + self.assertEqual(coerced, "turboquant") - def test_rotorquant_mlx_raises_helpful_message(self): - rq = self.registry.get("rotorquant") - with self.assertRaises(NotImplementedError) as ctx: - rq.make_mlx_cache(32, 3, 4, False, None) - self.assertIn("PyTorch/CUDA", str(ctx.exception)) - self.assertIn("llama.cpp", str(ctx.exception)) + def test_unknown_id_passes_through_resolver(self): + self.assertEqual(self.registry.resolve_legacy_id("does-not-exist"), "does-not-exist") - def test_rotorquant_estimate_compresses(self): - rq = self.registry.get("rotorquant") - baseline, optimised = rq.estimate_cache_bytes( - num_layers=32, num_heads=32, hidden_size=4096, - context_tokens=8192, bits=3, fp16_layers=4, - ) - self.assertLess(optimised, baseline) + def test_get_resolves_legacy_chaosengine_to_turboquant_strategy(self): + legacy = self.registry.get("chaosengine") + canonical = self.registry.get("turboquant") + self.assertIsNotNone(legacy) + self.assertIs(legacy, canonical) - def test_rotorquant_label(self): - rq = self.registry.get("rotorquant") - self.assertEqual(rq.label(3, 4), "Rotor 3-bit 4+4") - - def 
test_rotorquant_bits_clamped(self): - rq = self.registry.get("rotorquant") - # Bits below 2 should clamp to 2 - flags = rq.llama_cpp_cache_flags(1) - self.assertEqual(flags, ["--cache-type-k", "turbo2", "--cache-type-v", "turbo2"]) - # Bits above 4 should clamp to 4 - flags = rq.llama_cpp_cache_flags(8) - self.assertEqual(flags, ["--cache-type-k", "turbo4", "--cache-type-v", "turbo4"]) - - def test_rotorquant_is_available_with_current_turboquant_exports(self): - rq = RotorQuantStrategy() - module = SimpleNamespace(TurboQuantMSE=object(), TurboQuantCache=object()) - with patch("cache_compression.rotorquant._load_turboquant_module", return_value=module): - self.assertTrue(rq.is_available()) - - def test_rotorquant_is_unavailable_without_supported_marker(self): - rq = RotorQuantStrategy() - with patch("cache_compression.rotorquant._load_turboquant_module", return_value=object()): - self.assertFalse(rq.is_available()) + def test_get_resolves_legacy_rotorquant_to_turboquant_strategy(self): + legacy = self.registry.get("rotorquant") + canonical = self.registry.get("turboquant") + self.assertIsNotNone(legacy) + self.assertIs(legacy, canonical) # ------------------------------------------------------------------ # TurboQuant @@ -214,35 +189,6 @@ def test_turboquant_adapter_finds_package_in_extras_dir(self): with patch.dict("os.environ", {"CHAOSENGINE_EXTRAS_SITE_PACKAGES": tmp}): self.assertEqual(_find_pip_turboquant_path(), str(package.resolve())) - # ------------------------------------------------------------------ - # ChaosEngine — cache type validation - # ------------------------------------------------------------------ - - def test_chaosengine_cache_flags_use_standard_types(self): - """ChaosEngine must only emit cache types that standard llama-server - accepts: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1.""" - ce = self.registry.get("chaosengine") - valid_types = {"f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "iq4_nl", "q5_0", "q5_1"} - for bits in (2, 3, 4, 5, 6, 8): - flags = ce.llama_cpp_cache_flags(bits) - for i, flag in enumerate(flags): - if flag.startswith("--cache-type-") and i + 1 < len(flags): - cache_type = flags[i + 1] - self.assertIn( - cache_type, valid_types, - f"ChaosEngine {bits}-bit emits '{cache_type}' which is not a valid standard llama-server cache type", - ) - - def test_chaosengine_8bit_maps_to_q8_0(self): - ce = self.registry.get("chaosengine") - flags = ce.llama_cpp_cache_flags(8) - self.assertEqual(flags, ["--cache-type-k", "q8_0", "--cache-type-v", "q8_0"]) - - def test_chaosengine_4bit_maps_to_q4_0(self): - ce = self.registry.get("chaosengine") - flags = ce.llama_cpp_cache_flags(4) - self.assertEqual(flags, ["--cache-type-k", "q4_0", "--cache-type-v", "q4_0"]) - # ------------------------------------------------------------------ # required_llama_binary() metadata # ------------------------------------------------------------------ @@ -251,18 +197,10 @@ def test_native_requires_standard_binary(self): native = self.registry.get("native") self.assertEqual(native.required_llama_binary(), "standard") - def test_rotorquant_requires_turbo_binary(self): - rq = self.registry.get("rotorquant") - self.assertEqual(rq.required_llama_binary(), "turbo") - def test_turboquant_requires_turbo_binary(self): tq = self.registry.get("turboquant") self.assertEqual(tq.required_llama_binary(), "turbo") - def test_chaosengine_requires_standard_binary(self): - ce = self.registry.get("chaosengine") - self.assertEqual(ce.required_llama_binary(), "standard") - def 
test_available_json_includes_required_llama_binary(self): available = self.registry.available() for entry in available: @@ -275,7 +213,7 @@ def test_broken_strategy_preserves_required_llama_binary(self): real_import_module = importlib.import_module def fake_import(name, package=None): - if name == "cache_compression.rotorquant": + if name == "cache_compression.turboquant": raise RuntimeError("broken") return real_import_module(name, package) @@ -283,8 +221,8 @@ def fake_import(name, package=None): with patch("cache_compression.importlib.import_module", side_effect=fake_import): registry.discover() - rotor = registry.get("rotorquant") - self.assertEqual(rotor.required_llama_binary(), "turbo") + tq = registry.get("turboquant") + self.assertEqual(tq.required_llama_binary(), "turbo") class FirstBlockCacheStrategyTests(unittest.TestCase): diff --git a/tests/test_inference.py b/tests/test_inference.py index 09f60bf..cc58aae 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -125,24 +125,23 @@ def _capabilities(self) -> BackendCapabilities: llamaServerPath="/usr/local/bin/llama-server", ) - def test_startup_fallback_tries_chaosengine_then_native(self): - """When the turbo binary fails, the fallback chain is: - rotorquant → chaosengine → native f16.""" + def test_startup_fallback_falls_back_to_native_when_turbo_fails(self): + """FU-030: fallback chain shrank from 3-level (requested → ChaosEngine + → native) to 2-level (requested → native) after the ChaosEngine + intermediate slot was removed.""" engine = LlamaCppEngine(self._capabilities()) fake_process = mock.Mock() fake_process.poll.return_value = None - # 3 attempts: rotorquant (fail) → chaosengine (fail) → native (succeed) + # 2 attempts: turboquant (fail on turbo binary) → native (succeed) with ( mock.patch.object(engine, "_build_command", side_effect=[ (["llama-server-turbo"], None, False, None), (["llama-server"], None, False, None), - (["llama-server"], None, False, None), ]), mock.patch.object(engine, "_wait_for_server", side_effect=[ RuntimeError("unknown architecture"), - RuntimeError("cache type unsupported"), None, ]), mock.patch.object(engine, "_cleanup_process"), @@ -156,7 +155,7 @@ def test_startup_fallback_tries_chaosengine_then_native(self): backend="llama.cpp", path=None, runtime_target="lmstudio-community/Qwen3.5-35B-A3B-GGUF", - cache_strategy="rotorquant", + cache_strategy="turboquant", cache_bits=3, fp16_layers=4, fused_attention=False, @@ -167,25 +166,17 @@ def test_startup_fallback_tries_chaosengine_then_native(self): self.assertEqual(loaded.cacheStrategy, "native") self.assertEqual(loaded.cacheBits, 0) self.assertEqual(loaded.fp16Layers, 0) - self.assertIn("RotorQuant", loaded.runtimeNote) - def test_startup_fallback_lands_on_chaosengine_when_it_works(self): - """When turbo binary fails but ChaosEngine succeeds, use ChaosEngine.""" + def test_startup_legacy_rotorquant_id_coerces_to_turboquant(self): + """FU-030: persisted ``rotorquant`` configs must run as ``turboquant``.""" engine = LlamaCppEngine(self._capabilities()) fake_process = mock.Mock() fake_process.poll.return_value = None - # 2 attempts: rotorquant (fail) → chaosengine (succeed) with ( - mock.patch.object(engine, "_build_command", side_effect=[ - (["llama-server-turbo"], None, False, None), - (["llama-server"], None, False, None), - ]), - mock.patch.object(engine, "_wait_for_server", side_effect=[ - RuntimeError("unknown architecture"), - None, - ]), + mock.patch.object(engine, "_build_command", return_value=(["llama-server-turbo"], None, 
False, None)), + mock.patch.object(engine, "_wait_for_server", return_value=None), mock.patch.object(engine, "_cleanup_process"), mock.patch("backend_service.inference.subprocess.Popen", return_value=fake_process), ): @@ -205,10 +196,7 @@ def test_startup_fallback_lands_on_chaosengine_when_it_works(self): context_tokens=8192, ) - self.assertEqual(loaded.cacheStrategy, "chaosengine") - self.assertEqual(loaded.fp16Layers, 0) - self.assertIn("RotorQuant", loaded.runtimeNote) - self.assertIn("turbo binary", loaded.runtimeNote) + self.assertEqual(loaded.cacheStrategy, "turboquant") def test_successful_gguf_load_reports_fp16_layers_as_ignored(self): engine = LlamaCppEngine(self._capabilities()) @@ -230,7 +218,7 @@ def test_successful_gguf_load_reports_fp16_layers_as_ignored(self): backend="llama.cpp", path=None, runtime_target="lmstudio-community/Qwen3.5-35B-A3B-GGUF", - cache_strategy="rotorquant", + cache_strategy="turboquant", cache_bits=3, fp16_layers=4, fused_attention=False, @@ -238,10 +226,10 @@ def test_successful_gguf_load_reports_fp16_layers_as_ignored(self): context_tokens=8192, ) - self.assertEqual(loaded.cacheStrategy, "rotorquant") + self.assertEqual(loaded.cacheStrategy, "turboquant") self.assertEqual(loaded.cacheBits, 3) self.assertEqual(loaded.fp16Layers, 0) - self.assertIn("Rotor 3-bit 0+0 cache", loaded.runtimeNote) + self.assertIn("TurboQ 3-bit 0+0 cache", loaded.runtimeNote) self.assertIn("ignores the FP16 layers setting", loaded.runtimeNote) @@ -529,7 +517,7 @@ def test_native_strategy_uses_standard_binary(self): ) self.assertEqual(command[0], "/usr/local/bin/llama-server") - def test_rotorquant_uses_turbo_binary_when_available(self): + def test_turboquant_uses_turbo_binary_when_available(self): engine = LlamaCppEngine(self._capabilities(turbo_path="/usr/local/bin/llama-server-turbo")) with ( @@ -540,7 +528,7 @@ def test_rotorquant_uses_turbo_binary_when_available(self): command, _, _, _mmproj = engine._build_command( path="/tmp/model.gguf", runtime_target=None, - cache_strategy="rotorquant", + cache_strategy="turboquant", cache_bits=3, context_tokens=8192, fit_enabled=True, @@ -549,7 +537,7 @@ def test_rotorquant_uses_turbo_binary_when_available(self): self.assertEqual(command[0], "/usr/local/bin/llama-server-turbo") self.assertIn("turbo3", command) - def test_rotorquant_falls_back_to_f16_without_turbo_binary(self): + def test_turboquant_falls_back_to_f16_without_turbo_binary(self): engine = LlamaCppEngine(self._capabilities(turbo_path=None)) with ( @@ -560,7 +548,7 @@ def test_rotorquant_falls_back_to_f16_without_turbo_binary(self): command, runtime_note, _, _mmproj = engine._build_command( path="/tmp/model.gguf", runtime_target=None, - cache_strategy="rotorquant", + cache_strategy="turboquant", cache_bits=3, context_tokens=8192, fit_enabled=True, @@ -572,26 +560,6 @@ def test_rotorquant_falls_back_to_f16_without_turbo_binary(self): self.assertNotIn("turbo3", command) self.assertIn("llama-server-turbo", runtime_note) - def test_chaosengine_uses_standard_binary(self): - engine = LlamaCppEngine(self._capabilities(turbo_path="/usr/local/bin/llama-server-turbo")) - - with ( - mock.patch("backend_service.inference._find_open_port", return_value=9999), - mock.patch("backend_service.inference.llama_cpp_engine._llama_server_supports", return_value=False), - mock.patch("backend_service.inference.llama_cpp_engine._llama_server_cache_types", return_value=frozenset({"f16", "q8_0", "q4_0", "q5_0"})), - ): - command, _, _, _mmproj = engine._build_command( - path="/tmp/model.gguf", - 
runtime_target=None, - cache_strategy="chaosengine", - cache_bits=4, - context_tokens=8192, - fit_enabled=True, - is_fallback=False, - ) - self.assertEqual(command[0], "/usr/local/bin/llama-server") - self.assertIn("q4_0", command) - def test_turbo_only_binary_serves_all_strategies(self): """When only llama-server-turbo exists (no standard binary), it should serve as the binary for all strategies since it's a superset.""" @@ -691,7 +659,7 @@ def test_build_command_prevalidation_catches_unsupported_type(self): ): command, note, _, _mmproj = engine._build_command( path="/tmp/model.gguf", runtime_target=None, - cache_strategy="rotorquant", cache_bits=3, + cache_strategy="turboquant", cache_bits=3, context_tokens=8192, fit_enabled=True, is_fallback=False, ) diff --git a/tests/test_setup_routes.py b/tests/test_setup_routes.py index 3c19c40..f39cb5b 100644 --- a/tests/test_setup_routes.py +++ b/tests/test_setup_routes.py @@ -144,10 +144,14 @@ def test_install_pip_rejects_unknown_package(self): resp = self.client.post("/api/setup/install-package", json={"package": "evil-package"}) self.assertEqual(resp.status_code, 400) - def test_install_pip_returns_manual_message_for_chaosengine(self): + def test_install_pip_unknown_legacy_package_falls_through_to_400(self): + """FU-030: ``chaosengine`` was previously a manual install candidate + with a clone-and-install message. The strategy is gone now; the + package should fall through to the standard "not in allowed + install list" 400 like any other unknown id.""" resp = self.client.post("/api/setup/install-package", json={"package": "chaosengine"}) self.assertEqual(resp.status_code, 400) - self.assertIn("not published on PyPI", resp.json()["detail"]) + self.assertIn("not in the allowed install list", resp.json()["detail"]) def test_install_pip_accepts_whitelisted_package(self): with mock.patch("backend_service.routes.setup.subprocess.run") as mock_run: diff --git a/tests/test_teacache.py b/tests/test_teacache.py index 7c9907c..90cf185 100644 --- a/tests/test_teacache.py +++ b/tests/test_teacache.py @@ -55,10 +55,10 @@ def test_available_json_includes_applies_to(self): def test_text_strategies_default_to_text_domain(self): """The default CacheStrategy.applies_to() is {"text"} — every - pre-existing strategy (native, rotor, tri, turbo, chaos) should - still report text-only without any code change. + pre-existing text strategy (native, triattention, turboquant) + should still report text-only without any code change. """ - for sid in ("native", "rotorquant", "triattention", "turboquant", "chaosengine"): + for sid in ("native", "triattention", "turboquant"): strategy = self.registry.get(sid) self.assertEqual( strategy.applies_to(), frozenset({"text"}), From 3b3d17b1ce1039a0bf94565355a56c23e760b8b9 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 14:56:05 +0100 Subject: [PATCH 02/13] Add cross-strategy E2E matrix runner + defer FU-028 / FU-029 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two complementary changes: 1. ``scripts/cache-strategy-matrix.py`` sweeps every supported (cache strategy × spec-dec method × representative model) combination through a running backend on port 8876 and writes a CSV + Markdown report to ``~/.chaosengine/test-results/``. 
Replaces the ad-hoc per-strategy smoke scripts with a single end-to-end harness, and **asserts the FU-030 legacy alias coercion** at runtime — runs with ``cacheStrategy=chaosengine`` and ``cacheStrategy=rotorquant`` must come back loaded as ``turboquant``, exit code 2 on regression. Skips cells where the strategy isn't installed, the turbo binary is missing, the model isn't in the local library, or the spec-dec method isn't supported on the chosen backend, so a fresh CI box reports honest skip reasons rather than failing. Includes 20 unit tests covering the pure functions (``skip_reason``, ``write_csv``, ``write_markdown``, ``print_summary``, matrix definition checks) without standing up a backend. 2. FU-028 (MTP) and FU-029 (KVTC) tracker entries flipped from "in progress" to "deferred — upstream blockers" with the actual blockers documented: - **FU-028 MTP:** mlx-lm 0.31.3 has ``stream_generate(..., draft_model=...)`` for separately-trained drafts but no native MTP-head loader (the Gemma-4 / Qwen3.5 MTP drafters share activations + KV cache with the target and cannot be loaded as a standalone ``mlx.nn.Module``). Verified by inspecting the installed package source. llama.cpp PR #22673 still in Draft. MTPLX (third-party) is HTTP-only. Re-evaluate when (a) mlx-lm gains native MTP-head loading, OR (b) llama.cpp #22673 merges, OR (c) MTPLX exposes a Python in-process API. - **FU-029 KVTC:** OnlyTerp/kvtc is CUDA-only (MLX/Metal "planned" but not implemented), not on PyPI (distributed as a ``src.*`` repo), and integrates as a HuggingFace ``DynamicCache`` wrapper rather than a llama.cpp cache type. Apple Silicon dev box can't validate end-to-end. Re-evaluate when upstream ships MLX support or a CUDA dev box becomes available. The honest "deferred + reasoned" tracker entries are themselves the right output here per the project guidelines — the alternative was landing a half-wired CUDA-only KVTC slot or an HTTP-chained MTPLX adapter, both of which would have shipped surface area without delivering actual quality/performance to the user. Test totals: 1313 pytest pass (+20 new), 341 vitest pass, tsc clean. --- CLAUDE.md | 28 +- scripts/cache-strategy-matrix.py | 455 +++++++++++++++++++++ tests/test_cache_strategy_matrix_runner.py | 312 ++++++++++++++ 3 files changed, 793 insertions(+), 2 deletions(-) create mode 100755 scripts/cache-strategy-matrix.py create mode 100644 tests/test_cache_strategy_matrix_runner.py diff --git a/CLAUDE.md b/CLAUDE.md index 7bfc078..1df64b3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -135,8 +135,8 @@ no longer relevant. | ~~FU-025~~ | ~~mlx-video Wan one-shot convert action~~ | **Fully shipped 2026-05-04 (Phase 7 + Phase 8 + Phase 9).** | Closes FU-009 Wan branch. **Phase 7 (foundation):** `[mlx-video]` extra in [pyproject.toml](pyproject.toml) flipped to ``git+https://github.com/Blaizzy/mlx-video.git``. Helper [backend_service/mlx_video_wan_convert.py](backend_service/mlx_video_wan_convert.py) wraps the upstream `python -m mlx_video.models.wan_2.convert` subprocess: `slug_for(repo)` / `output_dir_for(repo)` / `status_for(repo)` / `list_converted()` / `run_convert(checkpoint_dir, repo, dtype, quantize, bits, group_size, timeout)`. Output under ``~/.chaosengine/mlx-video-wan//`` (override via ``CHAOSENGINE_MLX_VIDEO_WAN_DIR``). **Phase 8 (routing):** [mlx_video_runtime.py](backend_service/mlx_video_runtime.py) `supported_repos()` returns dynamic union of LTX-2 + converted-on-disk Wan repos. `_REPO_ENTRY_POINTS` adds `"Wan-AI/": "mlx_video.models.wan_2.generate"`. 
`_build_wan_cmd` produces the Wan-shaped CLI (`--model-dir`, `--guide-scale` string, `--scheduler`, optional `--seed`/`--steps`/`--negative-prompt`; no LTX-2 flags). `generate()` picks `_wan_runtime_note` (flags MoE experts) and skips LTX-2 effective-step / effective-guidance overrides. **Phase 9 (GUI):** Orchestrator [backend_service/mlx_video_wan_installer.py](backend_service/mlx_video_wan_installer.py) drives preflight → download-raw → convert → verify with structured progress events. Setup endpoints in [routes/setup.py](backend_service/routes/setup.py): `POST /api/setup/install-mlx-video-wan` (background-job pattern mirroring `/api/setup/install-longlive`), `GET /api/setup/install-mlx-video-wan/status`, `GET /api/setup/mlx-video-wan/inventory`. Frontend client in [src/api.ts](src/api.ts) (`startWanInstall`, `getWanInstallStatus`, `getWanInventory`). UI panel [src/components/WanInstallPanel.tsx](src/components/WanInstallPanel.tsx) lists every supported Wan repo with raw-size hint + converted badge / install button + live `InstallLogPanel` underneath; rendered in [VideoDiscoverTab.tsx](src/features/video/VideoDiscoverTab.tsx) above the variant grid. Supported raw repos: `Wan-AI/Wan2.{1-T2V-1.3B,1-T2V-14B,2-TI2V-5B,2-T2V-A14B,2-I2V-A14B}`. End-to-end UX: user clicks Install → backend downloads + converts in background → runtime auto-detects + routes Wan generate calls through mlx-video. Tests: 21 in [test_mlx_video_wan_convert.py](tests/test_mlx_video_wan_convert.py), 9 Wan-routing in [test_mlx_video.py](tests/test_mlx_video.py), 15 in [test_mlx_video_wan_installer.py](tests/test_mlx_video_wan_installer.py). | | ~~FU-026~~ | ~~TaylorSeer + DBCache aggressive cache preset~~ | **Obsoleted 2026-05-03 by diffusers 0.38 core.** | Diffusers 0.38.0 (2026-05-01) ships ``TaylorSeerCacheConfig``, ``MagCacheConfig``, ``PyramidAttentionBroadcastConfig``, ``FasterCacheConfig`` natively — no ``cache-dit`` dependency required. Wired as registry strategies (ids ``taylorseer``, ``magcache``, ``pab``, ``fastercache``) in [cache_compression/__init__.py](cache_compression/__init__.py). Each adapter calls ``pipeline.transformer.enable_cache()``. UNet pipelines (SD1.5/SDXL) raise ``NotImplementedError`` into a runtimeNote, matching the FBCache contract. MagCache is FLUX-only without calibration UX (uses ``FLUX_MAG_RATIOS`` from ``diffusers.hooks.mag_cache``); other DiTs raise a "calibration required" message until that UX lands. | | FU-027 | NVIDIA/kvpress KV cache toolkit (CUDA-side) | **Setup install action pre-staged 2026-05-05; integration code pending.** | [NVIDIA/kvpress](https://github.com/NVIDIA/kvpress) — Apache 2.0, 1.1k stars, `kvpress>=0.5.3` registered in `_INSTALLABLE_PIP_PACKAGES` so the Setup tab can pre-stage the wheel. Integration hooks land separately under `cache_compression/kvpress.py` once the helper picks an adapter shape (the upstream library exposes `presses` per technique — e.g. SnapKV / TOVA / KIVI / pyramid — and a `Pipeline` wrapper that takes a HF transformers model). Apple Silicon stays on TurboQuant-MLX; this is the CUDA-side complement. | -| FU-028 | MTP (Multi-Token Prediction) speculative decoding | **In progress 2026-05-10 (Apple Silicon path).** | Lossless 1.5–2.2× speedup for trained-with-MTP models — Gemma-4 (drafters released 2026-05-05, Apache 2.0), DeepSeek V3/R1, Qwen3.5/3.6/Next, Nemotron-3, MiMo-V2-Flash. 
Apple Silicon path goes through ``mlx_lm.generate(..., draft_model=…, num_draft_tokens=N)`` against the same model (its native MTP heads), gated by a new ``mtp: bool`` field on ``GenerationConfig`` + ``GenerationRequest``. llama.cpp path waits on PR [#22673](https://github.com/ggml-org/llama.cpp/pull/22673) (draft as of 2026-05-10) and lands as a follow-up bump. Catalog adds ``mtpCapable`` flag for tagged models so the UI surfaces the toggle automatically. Token-identical output at temp 0 vs non-MTP path is a hard test gate. | -| FU-029 | KVTC (NVIDIA ICLR 2026) KV cache strategy | **In progress 2026-05-10.** | New strategy slot replacing the dropped `chaosengine` path. PCA + adaptive quantization + entropy coding via [OnlyTerp/kvtc](https://github.com/OnlyTerp/kvtc) (Apache 2.0). Same approach as the dropped ChaosEngine but 8–32× compression vs ChaosEngine's 3.7×, peer-reviewed at ICLR 2026, and beats TurboQuant by 37% at comparable quality on long-context. One-time per-model calibration cached to ``~/.chaosengine/kvtc-calibration//``. UI shows a "calibrating…" badge during the first selection per model. | +| FU-028 | MTP (Multi-Token Prediction) speculative decoding | **Deferred 2026-05-10 — upstream MTP-head loader gap on both runtimes.** | Target: lossless 1.5–2.2× speedup for trained-with-MTP models (Gemma-4 drafters released 2026-05-05, Apache 2.0; DeepSeek V3/R1; Qwen3.5/3.6/Next; Nemotron-3; MiMo-V2-Flash). **Blocker on Apple Silicon:** mlx-lm 0.31.3 ships ``stream_generate(..., draft_model=...)`` for *separately-trained* draft models but has no native MTP-head loader — Gemma-4-style MTP drafters share activations + KV cache with the target and cannot be loaded as a standalone ``mlx.nn.Module``. Confirmed by inspecting the installed `.venv/lib/python3.11/site-packages/mlx_lm/server.py` + `generate.py` — no MTP-specific code paths. **Blocker on llama.cpp:** PR [#22673](https://github.com/ggml-org/llama.cpp/pull/22673) (am17an, ``--spec-type mtp --spec-draft-n-max N``) is still in Draft as of 2026-05-10, awaiting at least 2 approving reviews. **Third-party path considered + rejected for v1:** [MTPLX](https://github.com/youssofal/MTPLX) (221 stars, MIT) wraps native MTP for Apple Silicon but ships as an OpenAI/Anthropic HTTP server — chaining HTTP servers from our FastAPI backend has unwanted latency + retry surface. **Re-evaluate when:** (a) mlx-lm gains a native MTP head loader (track ``ml-explore/mlx-lm`` releases), OR (b) llama.cpp PR #22673 merges, OR (c) MTPLX exposes a programmatic in-process Python API. The user-facing speedup is real (live benchmarks: M4 Pro × Qwen3.5-27B-4bit 15.3 → 23.3 tok/s) so this stays high-priority on the queue. | +| FU-029 | KVTC (NVIDIA ICLR 2026) KV cache strategy | **Deferred 2026-05-10 — CUDA-only upstream, awaiting MLX/Metal port + PyPI release.** | Targeting [OnlyTerp/kvtc](https://github.com/OnlyTerp/kvtc) (Apache 2.0). PCA + adaptive quantization + entropy coding — 8–32× compression vs the dropped ChaosEngine's 3.7×, peer-reviewed at ICLR 2026, beats TurboQuant by 37% at comparable quality on long-context. Upstream blockers: (a) CUDA-only — repo's roadmap mentions MLX/Metal as "planned" but not yet implemented, so the Apple Silicon dev box cannot validate end-to-end; (b) not on PyPI — distributed as a `src.*` repo intended for `git clone`; (c) integration shape is a HuggingFace `DynamicCache` wrapper (not a llama.cpp cache type), so the existing GGUF lane has no path. 
Re-evaluate when either upstream ships MLX support or a Windows/Linux+CUDA development box becomes available. Apple Silicon users continue on TurboQuant-MLX (also ICLR 2026, native today). | | ~~FU-030~~ | ~~Drop ChaosEngine + RotorQuant strategy slots~~ | **Shipped 2026-05-10.** | ChaosEngine (cryptopoly/ChaosEngine — 1 commit upstream, eclipsed by KVTC at ICLR 2026 with the same PCA approach but 8–32× compression vs 3.7×) and RotorQuant (shipped as a misleading alias for TurboQuant — same ``--cache-type-k turbo{N}`` flags + same Python module marker) both removed from the registry. Persisted user configs that still reference these ids coerce silently to ``turboquant`` via a new ``CacheStrategyRegistry.resolve_legacy_id`` helper + module-level ``_LEGACY_STRATEGY_ALIASES`` map ([cache_compression/__init__.py](cache_compression/__init__.py)). Mirror coercion in frontend ([src/components/runtimeSupport.ts](src/components/runtimeSupport.ts) ``LEGACY_STRATEGY_ALIASES`` + ``canonicalStrategyId``). Two-level llama.cpp fallback chain (was three-level: requested → ChaosEngine → native; now requested → native) in [backend_service/inference/llama_cpp_engine.py](backend_service/inference/llama_cpp_engine.py). Vendored ChaosEngine bundling stripped from [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (3 helper functions removed: ``stageVendoredChaosEngine`` + ``ensureSetuptoolsForPep639`` + ``resolveChaosEngineVendor``). Pre-build probe asserts the legacy-id coercion works in CI. ``[rotorquant]`` extra removed from [pyproject.toml](pyproject.toml). ``CHAOSENGINE_VENDOR_PATH`` env var dropped. Cache strategy speed/quality maps in [helpers/cache.py](backend_service/helpers/cache.py) trimmed to remaining strategies. | --- @@ -153,6 +153,7 @@ no longer relevant. | Setup routes / install endpoints | `test_setup_routes.py` | `pytest tests/test_setup_routes.py -v` | | Backend services | `test_services.py` | `pytest tests/test_services.py -v` | | Backend API routes | `test_backend_service.py` | `pytest tests/test_backend_service.py -v` | +| Cross-strategy E2E matrix runner | `test_cache_strategy_matrix_runner.py` | `pytest tests/test_cache_strategy_matrix_runner.py -v` | | Frontend API client | `src/api.test.ts` | `npm test` | | Frontend utilities | `src/utils/__tests__/*.test.ts` | `npm test` | @@ -162,6 +163,29 @@ no longer relevant. - Cache strategy changes must test `llama_cpp_cache_flags()` returns valid types - New API endpoints need at least a shape/contract test +### Cross-strategy E2E matrix runner + +`scripts/cache-strategy-matrix.py` sweeps every supported (cache strategy +× spec-dec method × representative model) combination through a running +backend on port 8876 and writes a CSV + Markdown report to +`~/.chaosengine/test-results/`. It also asserts the **FU-030 legacy +alias coercion** — requests with `cacheStrategy=chaosengine` / +`cacheStrategy=rotorquant` must come back loaded as `turboquant`, and +the runner exits with code 2 if either regresses. + +``` +# Quick smoke (~5 min on M-series; CI-friendly) +.venv/bin/python scripts/cache-strategy-matrix.py --quick + +# Full sweep (~20 min; gates a release) +.venv/bin/python scripts/cache-strategy-matrix.py +``` + +The runner skips cells where the strategy isn't installed, the +turbo binary is missing, the model isn't in the local library, or +the spec-dec method isn't supported on the chosen backend — so a +fresh CI box reports honest skip reasons rather than failing. 
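+
+Exit codes (per `print_summary` / `main`): `0` all ran cells passed
+(skips are fine), `1` at least one cell failed, `2` the FU-030
+legacy-id coercion regressed (a `chaosengine`/`rotorquant` request did
+not come back loaded as `turboquant`), `3` backend unreachable on the
+chosen port.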
+ --- ## Code Quality Guidelines diff --git a/scripts/cache-strategy-matrix.py b/scripts/cache-strategy-matrix.py new file mode 100755 index 0000000..cba80d2 --- /dev/null +++ b/scripts/cache-strategy-matrix.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +"""ChaosEngineAI cache-strategy + speculative-decoding matrix runner. + +Sweeps the supported (strategy × spec-dec × model) grid through a running +backend and writes a CSV + Markdown summary to ``~/.chaosengine/test-results/``. + +Skips cells where: +- the strategy is not available on this platform (per ``/api/cache/strategies``) +- the spec-dec method is not supported for the given backend (DFlash/DDTree + require MLX or vLLM, not GGUF) +- the model is not in the local library + +Verifies the **FU-030 legacy alias coercion** by including +``cacheStrategy=chaosengine`` and ``cacheStrategy=rotorquant`` rows; the +backend must coerce both to ``turboquant`` and the runtime note + load +report must reflect that. + +Usage: + .venv/bin/python scripts/cache-strategy-matrix.py [--port 8876] + [--quick] + [--out PATH] + +``--quick`` drops the larger models so the matrix completes in ~5 minutes +(useful for smoke runs in CI). The full run takes ~20 minutes wall-time on +M-series Macs. + +Backend prerequisite: the FastAPI sidecar must be running on the chosen +port (default 8876). The script does not start it for you. +""" +from __future__ import annotations + +import argparse +import csv +import hashlib +import json +import sys +import time +import urllib.error +import urllib.request +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +DEFAULT_PORT = 8876 +DEFAULT_OUT_DIR = Path.home() / ".chaosengine" / "test-results" +DEFAULT_PROMPT = "Explain in three sentences why deterministic seeding matters." +DEFAULT_MAX_TOKENS = 96 +DEFAULT_TEMPERATURE = 0.0 # deterministic — required for output-hash compares +DEFAULT_SEED = 42 + +# ── Matrix definition ──────────────────────────────────────────────── + +@dataclass(frozen=True) +class MatrixCell: + """One scheduled inference run.""" + + label: str + model_ref: str + backend: str # ``mlx`` | ``gguf`` + strategy: str # ``native`` | ``turboquant`` | ``triattention`` | legacy aliases + bits: int # 0 for native, otherwise per-strategy bit count + spec_decode: str # ``none`` | ``dflash`` | ``ddtree`` + tree_budget: int = 0 # only meaningful when spec_decode == ``ddtree`` + quick: bool = True # included in the ``--quick`` smoke set + + +# Smallest-on-disk MLX target so the matrix exercises every code path +# without burning hours of wall-time. Heavier sweeps (35B-A3B etc.) flip +# ``quick=False`` and are gated by the absence of the ``--quick`` flag. 
+SMALL_MLX = "mlx-community/Qwen2.5-0.5B-Instruct-4bit" +MID_MLX_DFLASH_CAPABLE = "mlx-community/Qwen3-4B-bf16" +SMALL_GGUF = "lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF" + +MATRIX: list[MatrixCell] = [ + # MLX × strategies — every text strategy on the smallest model + MatrixCell("native MLX (smoke)", SMALL_MLX, "mlx", "native", 0, "none"), + MatrixCell("turboquant MLX 3-bit", SMALL_MLX, "mlx", "turboquant", 3, "none"), + MatrixCell("triattention MLX", SMALL_MLX, "mlx", "triattention", 3, "none"), + + # FU-030 legacy alias coercion — both must run as turboquant + report it + MatrixCell("legacy id chaosengine -> turboquant", SMALL_MLX, "mlx", "chaosengine", 4, "none"), + MatrixCell("legacy id rotorquant -> turboquant", SMALL_MLX, "mlx", "rotorquant", 3, "none"), + + # Speculative decoding — DFlash + DDTree require MLX backend + a + # DRAFT_MODEL_MAP-supported target. The 4B Qwen3 path covers both. + MatrixCell("dflash spec-dec (Qwen3-4B)", MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "dflash", quick=False), + MatrixCell("ddtree spec-dec budget=16", MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "ddtree", tree_budget=16, quick=False), + + # GGUF lane — native is enough to verify the standard binary path. + # TurboQuant on GGUF needs llama-server-turbo; runner skips when the + # binary is missing rather than hard-failing. + MatrixCell("native GGUF (smoke)", SMALL_GGUF, "gguf", "native", 0, "none"), + MatrixCell("turboquant GGUF 3-bit", SMALL_GGUF, "gguf", "turboquant", 3, "none", quick=False), +] + + +# ── HTTP helpers ───────────────────────────────────────────────────── + +def _api(method: str, path: str, *, port: int, body: dict | None = None, timeout: float = 60) -> dict: + url = f"http://127.0.0.1:{port}{path}" + data = json.dumps(body).encode() if body else None + req = urllib.request.Request(url, data=data, method=method) + req.add_header("Content-Type", "application/json") + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read()) + except urllib.error.HTTPError as exc: + detail = "" + try: + detail = exc.read().decode() + except Exception: + pass + raise RuntimeError(f"API {method} {path} -> {exc.code}: {detail}") from exc + except urllib.error.URLError as exc: + raise ConnectionError( + f"Cannot reach ChaosEngineAI at port {port}. Is the backend running? ({exc.reason})" + ) from exc + + +def _stream_inference(path: str, *, port: int, body: dict, timeout: float = 300) -> tuple[str, dict]: + """POST to an SSE endpoint. 
Returns ``(full_text, done_payload)``.""" + url = f"http://127.0.0.1:{port}{path}" + data = json.dumps(body).encode() + req = urllib.request.Request(url, data=data, method="POST") + req.add_header("Content-Type", "application/json") + req.add_header("Accept", "text/event-stream") + + full_text = "" + done_payload: dict = {} + with urllib.request.urlopen(req, timeout=timeout) as resp: + buffer = "" + while True: + raw = resp.read(4096) + if not raw: + break + buffer += raw.decode("utf-8", errors="replace") + while "\n\n" in buffer: + event_str, buffer = buffer.split("\n\n", 1) + for line in event_str.strip().splitlines(): + if not line.startswith("data: "): + continue + payload = json.loads(line[6:]) + if "token" in payload: + full_text += payload["token"] + if "error" in payload: + raise RuntimeError(f"Inference error: {payload['error']}") + if payload.get("done"): + done_payload = payload + return full_text, done_payload + + +# ── Capability probes (decide which cells to skip) ─────────────────── + +@dataclass +class BackendCapabilities: + available_strategies: set[str] + dflash_available: bool + ddtree_available: bool + has_turbo_binary: bool + library_refs: set[str] + + +def probe_backend(port: int) -> BackendCapabilities: + workspace = _api("GET", "/api/workspace", port=port) + system = workspace.get("system", {}) + strategies = system.get("availableCacheStrategies") or [] + available = {s["id"] for s in strategies if s.get("available")} + dflash = system.get("dflash") or {} + library = workspace.get("library") or [] + refs: set[str] = set() + for item in library: + name = item.get("name") or "" + if name: + refs.add(name) + for variant in item.get("variants", []) or []: + repo = variant.get("repo") or "" + if repo: + refs.add(repo) + return BackendCapabilities( + available_strategies=available, + dflash_available=bool(dflash.get("available")), + ddtree_available=bool(dflash.get("ddtreeAvailable")), + has_turbo_binary=bool(system.get("llamaServerTurboPath")), + library_refs=refs, + ) + + +def skip_reason(cell: MatrixCell, caps: BackendCapabilities, *, quick: bool) -> str | None: + if quick and not cell.quick: + return "deferred to full run (drop --quick)" + + canonical = {"chaosengine": "turboquant", "rotorquant": "turboquant"}.get( + cell.strategy, cell.strategy, + ) + if canonical not in caps.available_strategies and canonical != "native": + return f"strategy '{canonical}' unavailable in this runtime" + + if cell.backend == "gguf" and canonical == "turboquant" and not caps.has_turbo_binary: + return "llama-server-turbo binary missing" + + if cell.spec_decode in ("dflash", "ddtree"): + if cell.backend == "gguf": + return "speculative decoding requires MLX/vLLM, not GGUF" + if not caps.dflash_available: + return "DFlash runtime not installed" + if cell.spec_decode == "ddtree" and not caps.ddtree_available: + return "DDTree runtime not available" + + if cell.model_ref not in caps.library_refs: + return f"model not in library ({cell.model_ref})" + + return None + + +# ── Cell execution ─────────────────────────────────────────────────── + +@dataclass +class CellResult: + label: str + model_ref: str + backend: str + strategy: str + bits: int + spec_decode: str + tree_budget: int + skipped: bool = False + skip_reason: str = "" + ok: bool = False + error: str = "" + tokens_per_sec: float = 0.0 + output_sha: str = "" + output_chars: int = 0 + actual_strategy: str = "" + runtime_note: str = "" + duration_seconds: float = 0.0 + + +def run_cell(cell: MatrixCell, *, port: int) -> CellResult: + 
result = CellResult( + label=cell.label, + model_ref=cell.model_ref, + backend=cell.backend, + strategy=cell.strategy, + bits=cell.bits, + spec_decode=cell.spec_decode, + tree_budget=cell.tree_budget, + ) + + body = { + "modelRef": cell.model_ref, + "modelName": cell.model_ref.split("/")[-1], + "canonicalRepo": cell.model_ref, + "source": "library", + "backend": cell.backend, + "cacheStrategy": cell.strategy, + "cacheBits": cell.bits, + "fp16Layers": 0, + "fusedAttention": False, + "fitModelInMemory": True, + "contextTokens": 4096, + "speculativeDecoding": cell.spec_decode != "none", + "treeBudget": cell.tree_budget, + "thinkingMode": "off", + } + + started = time.monotonic() + try: + load_resp = _api("POST", "/api/models/load", port=port, body=body, timeout=180) + result.actual_strategy = (load_resp.get("loadedModel") or {}).get("cacheStrategy", "") + result.runtime_note = (load_resp.get("loadedModel") or {}).get("runtimeNote") or "" + + gen_body = { + "prompt": DEFAULT_PROMPT, + "maxTokens": DEFAULT_MAX_TOKENS, + "temperature": DEFAULT_TEMPERATURE, + "seed": DEFAULT_SEED, + "thinkingMode": "off", + } + text, done = _stream_inference("/api/generate/stream", port=port, body=gen_body, timeout=240) + result.duration_seconds = round(time.monotonic() - started, 2) + result.tokens_per_sec = float(done.get("tokensPerSecond") or 0.0) + result.output_chars = len(text) + result.output_sha = hashlib.sha256(text.encode("utf-8")).hexdigest()[:12] + result.ok = bool(text.strip()) + if not result.ok: + result.error = "empty output" + except (RuntimeError, ConnectionError, urllib.error.URLError) as exc: + result.error = str(exc)[:200] + result.duration_seconds = round(time.monotonic() - started, 2) + return result + + +# ── Reporting ──────────────────────────────────────────────────────── + +def write_csv(out_dir: Path, results: list[CellResult]) -> Path: + out_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ") + csv_path = out_dir / f"cache-strategy-matrix-{timestamp}.csv" + with csv_path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow([ + "label", "model_ref", "backend", "strategy", "bits", "spec_decode", + "tree_budget", "skipped", "skip_reason", "ok", "error", + "tokens_per_sec", "duration_seconds", "actual_strategy", + "output_sha", "output_chars", "runtime_note", + ]) + for r in results: + writer.writerow([ + r.label, r.model_ref, r.backend, r.strategy, r.bits, r.spec_decode, + r.tree_budget, r.skipped, r.skip_reason, r.ok, r.error, + f"{r.tokens_per_sec:.2f}", f"{r.duration_seconds:.2f}", + r.actual_strategy, r.output_sha, r.output_chars, r.runtime_note, + ]) + return csv_path + + +def write_markdown(out_dir: Path, results: list[CellResult]) -> Path: + out_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ") + md_path = out_dir / f"cache-strategy-matrix-{timestamp}.md" + + ran = [r for r in results if not r.skipped] + skipped = [r for r in results if r.skipped] + passed = [r for r in ran if r.ok] + failed = [r for r in ran if not r.ok] + + lines = [ + f"# Cache strategy matrix run ({timestamp})", + "", + f"- Total cells: **{len(results)}**", + f"- Ran: **{len(ran)}** ({len(passed)} pass / {len(failed)} fail)", + f"- Skipped: **{len(skipped)}**", + "", + "## Results", + "", + "| Label | Strategy | Spec-dec | Outcome | tok/s | SHA-12 | Note |", + "|---|---|---|---|---|---|---|", + ] + for r in results: + if r.skipped: + outcome = f"SKIP — 
{r.skip_reason}" + elif r.ok: + outcome = "PASS" + else: + outcome = f"FAIL — {r.error}" + lines.append( + f"| {r.label} | {r.strategy}({r.bits}b) | {r.spec_decode} | {outcome} | " + f"{r.tokens_per_sec:.1f} | {r.output_sha or '—'} | {r.runtime_note[:80]} |" + ) + + # FU-030 coercion section: legacy ids must report ``actual_strategy`` + # of ``turboquant`` even though the request asked for chaosengine / + # rotorquant. Surface it explicitly so regressions are obvious. + legacy = [r for r in ran if r.strategy in ("chaosengine", "rotorquant")] + if legacy: + lines += [ + "", + "## FU-030 legacy alias coercion", + "", + "| Requested | Loaded | Coercion correct? |", + "|---|---|---|", + ] + for r in legacy: + ok_mark = "yes" if r.actual_strategy == "turboquant" else "**NO**" + lines.append(f"| {r.strategy} | {r.actual_strategy or '—'} | {ok_mark} |") + + md_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + return md_path + + +def print_summary(results: list[CellResult]) -> int: + ran = [r for r in results if not r.skipped] + passed = [r for r in ran if r.ok] + failed = [r for r in ran if not r.ok] + skipped = [r for r in results if r.skipped] + coercion_failures = [ + r for r in ran + if r.strategy in ("chaosengine", "rotorquant") + and r.actual_strategy != "turboquant" + ] + print() + print(f" Cells: {len(results)}") + print(f" Ran: {len(ran)} ({len(passed)} pass / {len(failed)} fail)") + print(f" Skipped: {len(skipped)}") + if failed: + print() + print(" Failures:") + for r in failed: + print(f" - {r.label}: {r.error}") + if coercion_failures: + print() + print(" FU-030 coercion regression:") + for r in coercion_failures: + print(f" - {r.label}: requested={r.strategy} loaded={r.actual_strategy or 'n/a'}") + return 2 + return 0 if not failed else 1 + + +# ── Entry point ────────────────────────────────────────────────────── + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--port", type=int, default=DEFAULT_PORT) + parser.add_argument("--quick", action="store_true", + help="run only the smoke subset (~5 min wall-time)") + parser.add_argument("--out", type=Path, default=DEFAULT_OUT_DIR, + help="results directory (default ~/.chaosengine/test-results)") + args = parser.parse_args() + + print(f"Probing backend at http://127.0.0.1:{args.port}/api/workspace ...") + try: + caps = probe_backend(args.port) + except ConnectionError as exc: + print(f" ! 
{exc}", file=sys.stderr) + return 3 + print(f" available strategies: {sorted(caps.available_strategies)}") + print(f" dflash={caps.dflash_available} ddtree={caps.ddtree_available} turbo-binary={caps.has_turbo_binary}") + print(f" library models: {len(caps.library_refs)}") + + results: list[CellResult] = [] + for i, cell in enumerate(MATRIX, 1): + print(f"\n[{i}/{len(MATRIX)}] {cell.label}") + skip = skip_reason(cell, caps, quick=args.quick) + if skip: + print(f" skip: {skip}") + results.append(CellResult( + label=cell.label, + model_ref=cell.model_ref, + backend=cell.backend, + strategy=cell.strategy, + bits=cell.bits, + spec_decode=cell.spec_decode, + tree_budget=cell.tree_budget, + skipped=True, + skip_reason=skip, + )) + continue + result = run_cell(cell, port=args.port) + if result.ok: + print(f" pass {result.tokens_per_sec:.1f} tok/s sha={result.output_sha} ({result.duration_seconds:.1f}s)") + else: + print(f" FAIL {result.error}") + results.append(result) + + csv_path = write_csv(args.out, results) + md_path = write_markdown(args.out, results) + print() + print(f" CSV: {csv_path}") + print(f" Markdown: {md_path}") + return print_summary(results) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_cache_strategy_matrix_runner.py b/tests/test_cache_strategy_matrix_runner.py new file mode 100644 index 0000000..a946254 --- /dev/null +++ b/tests/test_cache_strategy_matrix_runner.py @@ -0,0 +1,312 @@ +"""Unit tests for ``scripts/cache-strategy-matrix.py``. + +Covers the pure functions (``skip_reason``, ``write_csv``, ``write_markdown``, +``print_summary``) without standing up a live backend. The HTTP layer +(``_api`` / ``_stream_inference`` / ``run_cell``) is exercised end-to-end +by the matrix runner itself when invoked against a running sidecar. 
+""" +from __future__ import annotations + +import importlib.util +import sys +import tempfile +import unittest +from io import StringIO +from pathlib import Path +from unittest import mock + + +def _load_runner_module(): + """Import the runner script as a module despite the dash in its name.""" + project_root = Path(__file__).resolve().parents[1] + script_path = project_root / "scripts" / "cache-strategy-matrix.py" + spec = importlib.util.spec_from_file_location( + "cache_strategy_matrix_runner", script_path, + ) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules["cache_strategy_matrix_runner"] = module + spec.loader.exec_module(module) + return module + + +runner = _load_runner_module() + + +def _caps( + *, + available: set[str] | None = None, + dflash: bool = True, + ddtree: bool = True, + turbo: bool = True, + library: set[str] | None = None, +) -> "runner.BackendCapabilities": + return runner.BackendCapabilities( + available_strategies=available or {"native", "turboquant", "triattention"}, + dflash_available=dflash, + ddtree_available=ddtree, + has_turbo_binary=turbo, + library_refs=library or { + runner.SMALL_MLX, + runner.MID_MLX_DFLASH_CAPABLE, + runner.SMALL_GGUF, + }, + ) + + +class SkipReasonTests(unittest.TestCase): + def test_quick_skips_non_quick_cells(self): + cell = runner.MatrixCell( + "heavy", runner.MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "dflash", + quick=False, + ) + self.assertEqual( + runner.skip_reason(cell, _caps(), quick=True), + "deferred to full run (drop --quick)", + ) + + def test_full_run_keeps_non_quick_cells(self): + cell = runner.MatrixCell( + "heavy", runner.MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "dflash", + quick=False, + ) + self.assertIsNone(runner.skip_reason(cell, _caps(), quick=False)) + + def test_skips_when_strategy_unavailable(self): + cell = runner.MatrixCell( + "tri off", runner.SMALL_MLX, "mlx", "triattention", 3, "none", + ) + caps = _caps(available={"native", "turboquant"}) + self.assertEqual( + runner.skip_reason(cell, caps, quick=False), + "strategy 'triattention' unavailable in this runtime", + ) + + def test_native_never_blocked_by_availability(self): + cell = runner.MatrixCell("nat", runner.SMALL_MLX, "mlx", "native", 0, "none") + caps = _caps(available=set()) + self.assertIsNone(runner.skip_reason(cell, caps, quick=False)) + + def test_skips_gguf_turboquant_without_turbo_binary(self): + cell = runner.MatrixCell("tq gguf", runner.SMALL_GGUF, "gguf", "turboquant", 3, "none") + caps = _caps(turbo=False) + self.assertEqual( + runner.skip_reason(cell, caps, quick=False), + "llama-server-turbo binary missing", + ) + + def test_skips_dflash_on_gguf_backend(self): + cell = runner.MatrixCell("dflash gguf", runner.SMALL_GGUF, "gguf", "native", 0, "dflash") + self.assertEqual( + runner.skip_reason(cell, _caps(), quick=False), + "speculative decoding requires MLX/vLLM, not GGUF", + ) + + def test_skips_dflash_when_runtime_missing(self): + cell = runner.MatrixCell( + "dflash mlx", runner.MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "dflash", + ) + self.assertEqual( + runner.skip_reason(cell, _caps(dflash=False), quick=False), + "DFlash runtime not installed", + ) + + def test_skips_ddtree_when_runtime_missing(self): + cell = runner.MatrixCell( + "ddtree mlx", runner.MID_MLX_DFLASH_CAPABLE, "mlx", "native", 0, "ddtree", tree_budget=8, + ) + self.assertEqual( + runner.skip_reason(cell, _caps(ddtree=False), quick=False), + "DDTree runtime not available", + ) + + def 
test_skips_when_model_not_in_library(self): + cell = runner.MatrixCell( + "missing", "made-up/unicorn-1B", "mlx", "native", 0, "none", + ) + self.assertIn("model not in library", runner.skip_reason(cell, _caps(), quick=False)) + + def test_legacy_chaosengine_uses_turboquant_availability(self): + """FU-030: ``chaosengine`` must canonicalise to ``turboquant`` for + the availability check; otherwise legacy persisted configs would + always skip even when TurboQuant is installed.""" + cell = runner.MatrixCell( + "legacy", runner.SMALL_MLX, "mlx", "chaosengine", 4, "none", + ) + # turboquant present, chaosengine obviously not present in registry + caps = _caps(available={"native", "turboquant"}) + self.assertIsNone(runner.skip_reason(cell, caps, quick=False)) + + def test_legacy_chaosengine_skips_when_turboquant_unavailable(self): + """The flip side of the previous test — if TurboQuant itself isn't + installed, the legacy id should also skip with the canonical name + in the message so users know what to install.""" + cell = runner.MatrixCell( + "legacy", runner.SMALL_MLX, "mlx", "chaosengine", 4, "none", + ) + caps = _caps(available={"native"}) + self.assertEqual( + runner.skip_reason(cell, caps, quick=False), + "strategy 'turboquant' unavailable in this runtime", + ) + + +class WriteCsvTests(unittest.TestCase): + def test_writes_header_and_rows(self): + results = [ + runner.CellResult( + label="ok", + model_ref="m/x", + backend="mlx", + strategy="native", + bits=0, + spec_decode="none", + tree_budget=0, + ok=True, + tokens_per_sec=42.0, + output_sha="deadbeef0000", + output_chars=128, + actual_strategy="native", + runtime_note="ok", + duration_seconds=1.5, + ), + runner.CellResult( + label="skipped", + model_ref="m/y", + backend="gguf", + strategy="turboquant", + bits=3, + spec_decode="none", + tree_budget=0, + skipped=True, + skip_reason="missing binary", + ), + ] + with tempfile.TemporaryDirectory() as tmp: + csv_path = runner.write_csv(Path(tmp), results) + text = csv_path.read_text(encoding="utf-8") + + self.assertIn("label,model_ref,backend", text) + self.assertIn("ok,m/x,mlx,native,0,none,0,False,,True,,42.00,1.50", text) + self.assertIn("skipped,m/y,gguf,turboquant,3,none,0,True,missing binary", text) + + +class WriteMarkdownTests(unittest.TestCase): + def test_markdown_includes_legacy_alias_table_when_legacy_rows_present(self): + results = [ + runner.CellResult( + label="legacy chaosengine", + model_ref=runner.SMALL_MLX, + backend="mlx", + strategy="chaosengine", + bits=4, + spec_decode="none", + tree_budget=0, + ok=True, + actual_strategy="turboquant", + runtime_note="coerced", + tokens_per_sec=22.0, + output_sha="cafebabe1234", + ), + runner.CellResult( + label="native baseline", + model_ref=runner.SMALL_MLX, + backend="mlx", + strategy="native", + bits=0, + spec_decode="none", + tree_budget=0, + ok=True, + actual_strategy="native", + tokens_per_sec=20.0, + output_sha="aaaa11112222", + ), + ] + with tempfile.TemporaryDirectory() as tmp: + md_path = runner.write_markdown(Path(tmp), results) + text = md_path.read_text(encoding="utf-8") + + self.assertIn("FU-030 legacy alias coercion", text) + self.assertIn("| chaosengine | turboquant | yes |", text) + + def test_markdown_flags_coercion_regression(self): + results = [ + runner.CellResult( + label="legacy rotorquant", + model_ref=runner.SMALL_MLX, + backend="mlx", + strategy="rotorquant", + bits=3, + spec_decode="none", + tree_budget=0, + ok=True, + actual_strategy="native", # wrong — should be turboquant + ), + ] + with 
tempfile.TemporaryDirectory() as tmp: + md_path = runner.write_markdown(Path(tmp), results) + text = md_path.read_text(encoding="utf-8") + + self.assertIn("| rotorquant | native | **NO** |", text) + + +class PrintSummaryTests(unittest.TestCase): + def _stub_stdout(self): + """Replace stdout for the duration of one test.""" + return mock.patch("sys.stdout", new_callable=StringIO) + + def test_returns_zero_on_all_pass(self): + results = [ + runner.CellResult( + label="a", model_ref="m", backend="mlx", strategy="native", + bits=0, spec_decode="none", tree_budget=0, ok=True, + ), + ] + with self._stub_stdout(): + self.assertEqual(runner.print_summary(results), 0) + + def test_returns_one_on_failure(self): + results = [ + runner.CellResult( + label="a", model_ref="m", backend="mlx", strategy="native", + bits=0, spec_decode="none", tree_budget=0, + ok=False, error="boom", + ), + ] + with self._stub_stdout(): + self.assertEqual(runner.print_summary(results), 1) + + def test_returns_two_on_coercion_regression(self): + results = [ + runner.CellResult( + label="legacy", model_ref=runner.SMALL_MLX, backend="mlx", + strategy="chaosengine", bits=4, spec_decode="none", tree_budget=0, + ok=True, actual_strategy="native", # wrong + ), + ] + with self._stub_stdout(): + self.assertEqual(runner.print_summary(results), 2) + + +class MatrixDefinitionTests(unittest.TestCase): + def test_matrix_includes_legacy_coercion_cells(self): + labels = {cell.label for cell in runner.MATRIX} + self.assertIn("legacy id chaosengine -> turboquant", labels) + self.assertIn("legacy id rotorquant -> turboquant", labels) + + def test_matrix_strategy_ids_use_active_or_legacy_set(self): + active = {"native", "turboquant", "triattention"} + legacy = {"chaosengine", "rotorquant"} + for cell in runner.MATRIX: + self.assertIn( + cell.strategy, active | legacy, + f"unknown strategy in matrix: {cell.strategy}", + ) + + def test_matrix_backends_are_supported(self): + for cell in runner.MATRIX: + self.assertIn(cell.backend, ("mlx", "gguf")) + + +if __name__ == "__main__": + unittest.main() From 462f689f8536ba84e271dc672e85ed118d51d403 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 14:59:21 +0100 Subject: [PATCH 03/13] Update pre-build-check.sh cache-strategy probe for FU-030 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe was still running ``registry.get('chaosengine').llama_cpp_cache_flags(bits)`` and asserting the emitted cache types were standard llama-server types. After FU-030 the legacy id coerces to TurboQuant, which emits ``turbo2/turbo3/turbo4`` — those are the *correct* types for the turbo binary but the probe rejected them as INVALID. Replaced with: native validates standard cache types, TurboQuant must declare the turbo binary, and both legacy ids (chaosengine + rotorquant) must coerce to turboquant via ``registry.resolve_legacy_id`` and resolve via ``registry.get``. Mirrors the assertion already in ``scripts/pre-build-check.mjs`` so both runners agree. All 7 pre-build-check.sh gates green. 
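For reviewers who haven't opened ``cache_compression/__init__.py``: the probe only exercises the alias plumbing, so here is a minimal sketch of the shape it assumes. Only the names ``_LEGACY_STRATEGY_ALIASES``, ``resolve_legacy_id``, and ``registry.get`` come from this series; the bodies (and the ``_strategies`` attribute) are illustrative, not the shipped implementation:

    # Hypothetical sketch — mirrors the behaviour the probe asserts,
    # not the actual cache_compression/__init__.py source.
    _LEGACY_STRATEGY_ALIASES = {
        "chaosengine": "turboquant",  # FU-030
        "rotorquant": "turboquant",   # FU-030
    }

    class CacheStrategyRegistry:
        def resolve_legacy_id(self, strategy_id: str) -> str:
            # Dropped ids coerce to their canonical replacement;
            # unknown ids pass through unchanged.
            return _LEGACY_STRATEGY_ALIASES.get(strategy_id, strategy_id)

        def get(self, strategy_id: str):
            # Legacy ids resolve to the canonical strategy object, so
            # persisted user configs keep working without a migration.
            return self._strategies.get(self.resolve_legacy_id(strategy_id))

Under that shape both probe assertions hold: ``resolve_legacy_id(legacy_id)`` returns ``'turboquant'`` for both legacy ids, and ``registry.get(legacy_id)`` is non-None whenever TurboQuant itself is registered.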
--- scripts/pre-build-check.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/pre-build-check.sh b/scripts/pre-build-check.sh index 0859063..698d173 100755 --- a/scripts/pre-build-check.sh +++ b/scripts/pre-build-check.sh @@ -89,17 +89,22 @@ CACHE_CHECK=$(.venv/bin/python -c " from cache_compression import registry registry.discover() valid = {'f32','f16','bf16','q8_0','q4_0','q4_1','iq4_nl','q5_0','q5_1'} -ce = registry.get('chaosengine') -for bits in (2,3,4,5,6,8): - flags = ce.llama_cpp_cache_flags(bits) +nat = registry.get('native') +for bits in (0,): + flags = nat.llama_cpp_cache_flags(bits) for i, f in enumerate(flags): if f.startswith('--cache-type-') and i+1 < len(flags): if flags[i+1] not in valid: - print(f'INVALID: ChaosEngine {bits}-bit emits {flags[i+1]}') -rq = registry.get('rotorquant') + print(f'INVALID: Native emits {flags[i+1]}') tq = registry.get('turboquant') -if rq.required_llama_binary() != 'turbo': print('INVALID: RotorQuant not routing to turbo') if tq.required_llama_binary() != 'turbo': print('INVALID: TurboQuant not routing to turbo') +# FU-030 (2026-05-10): legacy ids must coerce to turboquant via the +# alias map. Assert the wiring works in CI rather than at runtime. +for legacy_id in ('rotorquant', 'chaosengine'): + if registry.resolve_legacy_id(legacy_id) != 'turboquant': + print(f'INVALID: legacy id {legacy_id} did not coerce to turboquant') + if registry.get(legacy_id) is None: + print(f'INVALID: legacy id {legacy_id} did not resolve via registry.get') print('OK') " 2>&1) if echo "$CACHE_CHECK" | grep -q "INVALID"; then From 0715da32f2b09fb133be1dff4157a7ca388ce3a6 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 15:22:55 +0100 Subject: [PATCH 04/13] Bump dflash-mlx pin, expand DRAFT_MODEL_MAP, pin TriAttention, add pin-sync probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five interlocking maintenance items found while auditing the upstream landscape for the four repos: z-lab/dflash, bstnxbt/dflash-mlx, youssofal/MTPLX, TheTom/turboquant_plus. 1. **dflash-mlx pin bumped from 8d8545d (v0.1.5.1) to fada1eb (HEAD).** 11 upstream commits cover the new Gemma4 DFlash backend (commit 05cc456 — biggest payload), v0.1.5 serving surface, live server metrics endpoint, prefix-cache survival test gate, async L2 writer fix, long-context runtime diagnostics hardening, benchmark slugging fixes, and a license switch to Apache-2.0. No breaking API changes per the upstream commit log. 2. **stage-runtime.mjs pin synced to match pyproject.toml.** Caught a real bug: pyproject.toml was at 8d8545d (v0.1.5.1) but scripts/stage-runtime.mjs was lagging on f825ffb (v0.1.4.1) — the dev .venv ran the new version, but ``npm run stage:runtime`` was bundling the OLD binary into release builds. Both files now share fada1eb. 3. **DRAFT_MODEL_MAP extended for new z-lab drafters.** Added entries for google/gemma-4-31B-it, google/gemma-4-26B-A4B-it, Qwen/Qwen3.5-122B-A10B, MiniMaxAI/MiniMax-M2.5, MiniMaxAI/MiniMax-M2.7, and moonshotai/Kimi-K2.6, plus the mlx-community/* aliases for each so Apple Silicon quants resolve via the existing fuzzy-match path. 8 new unit tests in test_dflash.py pin the mappings. 4.
**TriAttention git+url pinned to commit c3744ee.** The ``[triattention]`` and ``[triattention-mlx]`` extras were pulling ``git+...git`` HEAD with no commit pin, making fresh installs non-reproducible whenever upstream landed unreleased work between our staging snapshots. Pin matches the v0.2.0 release surface plus the AMD GPU port. 5. **FU-033 pin-sync probe shipped in pre-build-check.{mjs,sh}.** Regex-extracts the dflash-mlx commit hash from both files and fails the build when they diverge. Same commit also drops the orphan vendor/ChaosEngine staleness check from both runners (FU-030 removed the vendored package; the probe would never resolve again). CLAUDE.md tracker updates: FU-006 entry rewritten to document the fada1eb bump, three new entries (FU-031 dflash drafter expansion + TriAttention pin; FU-032 turboquant_plus watch-closely; FU-033 pin-sync probe shipped). Test totals: 1321 pytest pass (+8 from previous 1313 — all 8 from the new dflash drafter tests), 341 vitest pass, tsc clean, pre-build-check 8/8 gates green. --- CLAUDE.md | 5 ++- dflash/__init__.py | 33 +++++++++++++++++++ pyproject.toml | 6 ++-- scripts/pre-build-check.mjs | 42 +++++++++++++++--------- scripts/pre-build-check.sh | 26 +++++++++------ scripts/stage-runtime.mjs | 2 +- tests/test_dflash.py | 64 +++++++++++++++++++++++++++++++++++++ 7 files changed, 148 insertions(+), 30 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1df64b3..5c81be7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -113,7 +113,7 @@ no longer relevant. | FU-003 | LongLive integration for Wan 2.1 T2V 1.3B | CUDA platforms (Windows/Linux) only | Real-time causal long video gen ([triattention/longlive](https://github.com/WeianMao/triattention/tree/main/longlive)). We ship the target model already. Needs: new video backend branch in [backend_service/video_runtime.py](backend_service/video_runtime.py), LoRA weights download, torchrun orchestration, UI affordance for long-clip mode. Flash Attention dep. | | FU-004 | TriAttention SGLang backend | When/if we adopt SGLang as an inference backend | Added upstream 2026-04-22 as v0.2.0. No action unless SGLang lands in our runtime. | | ~~FU-005~~ | ~~arozanov v_only TurboQuant MLX mode~~ | **Dropped 2026-04-24** | Our current `turboquant-mlx-full` 0.1.3 path already runs without any mlx-lm fork — uses pip `TurboQuantKVCache` with `QuantizedKVCache` fallback ([turboquant_mlx/__init__.py:174-186](turboquant_mlx/__init__.py)). `VOnlyTurboQuantCache` is only in the arozanov fork (we track but don't consume). Value prop already satisfied; entry removed. | -| ~~FU-006~~ | ~~Re-verify dflash-mlx pin~~ | **Bumped to `8d8545d` = v0.1.5.1 on 2026-05-05 after the ddtree.py rewrite landed.** | Pin advanced from `f825ffb` (v0.1.4.1) to `8d8545d` (v0.1.5.1). 0.1.5+ moved every primitive that [backend_service/ddtree.py](backend_service/ddtree.py) consumed off the runtime top-level onto a per-family `target_ops` adapter — `target_forward_with_hidden_states` → `target_ops.forward_with_hidden_capture`, `extract_context_feature_from_dict` → `target_ops.extract_context_feature`, `make_target_cache` → `target_ops.make_cache`, `_target_embed_tokens` → `target_ops.embed_tokens`, `_target_text_model` → `target_ops.text_model`, `_lm_head_logits` → `target_ops.logits_from_hidden`.
`ContextOnlyDraftKVCache` moved to `dflash_mlx.model`; `create_attention_mask` re-imported from `mlx_lm.models.base`; `trim_cache_to` was removed entirely and now lives as a thin local `_trim_cache_to` shim that calls each entry's own `.rollback()` / `.trim()` / `.crop()`. Adapter resolved once at the top of `generate_ddtree_mlx` via `resolve_target_ops(target_model)`. Live smoke 2026-05-05 against `mlx-community/Qwen2.5-0.5B-Instruct-4bit` confirmed adapter resolves (`backend=qwen_gdn`, `family=pure_attention`), forward+capture / embed_tokens / text_model / logits_from_hidden / extract_context_feature / `_trim_cache_to` all working. Gains over 0.1.4.1: draft model quantization with Metal MMA kernels, branchless Metal kernels + fused draft KV projections, long-context runtime diagnostics. Re-check cadence resets to quarterly. | +| ~~FU-006~~ | ~~Re-verify dflash-mlx pin~~ | **Bumped to `fada1eb` (HEAD) on 2026-05-10. Previously bumped to `8d8545d` = v0.1.5.1 on 2026-05-05 after the ddtree.py rewrite landed.** | 2026-05-10 bump from `8d8545d` to `fada1eb` covers 11 upstream commits including the new Gemma4 DFlash backend (commit 05cc456, "feat: add Gemma4 DFlash backend"), the v0.1.5 serving surface, live server metrics endpoint, prefix-cache survival test gate, async L2 writer fix, long-context runtime diagnostics hardening, benchmark slugging fixes, and a license switch to Apache-2.0. Same fix applied in both [pyproject.toml](pyproject.toml) (already correct) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (was lagging on `f825ffb` v0.1.4.1 — staged release runtime would have shipped the old binary). The two pins now live as the same hex string in both files; the FU-033 probe in CI's pre-build-check now fails the build if they drift apart again. No breaking API changes between the pins per upstream commit log. **Earlier bump notes:** Pin advanced from `f825ffb` (v0.1.4.1) to `8d8545d` (v0.1.5.1). 0.1.5+ moved every primitive that [backend_service/ddtree.py](backend_service/ddtree.py) consumed off the runtime top-level onto a per-family `target_ops` adapter — `target_forward_with_hidden_states` → `target_ops.forward_with_hidden_capture`, `extract_context_feature_from_dict` → `target_ops.extract_context_feature`, `make_target_cache` → `target_ops.make_cache`, `_target_embed_tokens` → `target_ops.embed_tokens`, `_target_text_model` → `target_ops.text_model`, `_lm_head_logits` → `target_ops.logits_from_hidden`. `ContextOnlyDraftKVCache` moved to `dflash_mlx.model`; `create_attention_mask` re-imported from `mlx_lm.models.base`; `trim_cache_to` was removed entirely and now lives as a thin local `_trim_cache_to` shim that calls each entry's own `.rollback()` / `.trim()` / `.crop()`. Adapter resolved once at the top of `generate_ddtree_mlx` via `resolve_target_ops(target_model)`. Live smoke 2026-05-05 against `mlx-community/Qwen2.5-0.5B-Instruct-4bit` confirmed adapter resolves (`backend=qwen_gdn`, `family=pure_attention`), forward+capture / embed_tokens / text_model / logits_from_hidden / extract_context_feature / `_trim_cache_to` all working. Gains over 0.1.4.1: draft model quantization with Metal MMA kernels, branchless Metal kernels + fused draft KV projections, long-context runtime diagnostics. Re-check cadence resets to quarterly. | | ~~FU-007~~ | ~~TeaCache for Wan2.1/2.2~~ | **Obsoleted 2026-05-03 by FU-015.** | TeaCache patches for FLUX + HunyuanVideo + LTX-Video + CogVideoX + Mochi remain under [cache_compression/_teacache_patches/](cache_compression/_teacache_patches/).
The Wan-specific port that was deferred here is no longer needed: diffusers 0.36 ships a model-agnostic `apply_first_block_cache` hook (FU-015) that operates on `pipeline.transformer` regardless of model, so Wan caches via the same generic strategy without a vendored forward. Pick FBCache for Wan; TeaCache stays available as the alternative for FLUX-family pipelines. | | ~~FU-008~~ | ~~`stable-diffusion.cpp` engine (cross-platform diffusion)~~ | **Shipped 2026-05-03 (video) + 2026-05-04 (image).** | Binary build via [scripts/build-sdcpp.sh](scripts/build-sdcpp.sh) + [scripts/update-sdcpp.sh](scripts/update-sdcpp.sh) (clones to `/tmp/stable-diffusion.cpp`, cmake `-DSD_METAL=ON` on Darwin or `-DSD_CUBLAS=ON` on Linux+CUDA, installs to `~/.chaosengine/bin/sd`). Build target is `sd-cli` (renamed from `sd` upstream around master-590); installer copies it back to the legacy `sd` filename so downstream resolvers in [sdcpp_video_runtime.py](backend_service/sdcpp_video_runtime.py), [sdcpp_image_runtime.py](backend_service/sdcpp_image_runtime.py), and [stage-runtime.mjs](scripts/stage-runtime.mjs) keep working. Path resolution in [src-tauri/src/lib.rs](src-tauri/src/lib.rs). **Video lane** (`SdCppVideoEngine.generate`): subprocess spawn → maps `VideoGenerationConfig` → sd.cpp flags (`--diffusion-model`, `-p`, `-W/-H`, `--steps`, `--cfg-scale`, `--seed`, `-o`, `--video-frames`, `--fps`, `--negative-prompt`); regex-parses `step N/M` (or `[N/M]`) into `VIDEO_PROGRESS`; reads `.webm` bytes back (sd.cpp's video output is `.webm`/`.avi`/animated `.webp` — no native `.mp4`). Catalog requires `ggufRepo` + `ggufFile` pin (e.g. `QuantStack/Wan2.2-TI2V-5B-GGUF`). **Image lane** (`SdCppImageEngine.generate`, [sdcpp_image_runtime.py](backend_service/sdcpp_image_runtime.py)): mirrors video shape but emits PNG, drops `--video-frames`/`--fps`, batches by looping seeds (sd.cpp renders one image per invocation). Manager dispatch in [image_runtime.py](backend_service/image_runtime.py) `ImageRuntimeManager.generate` routes when `config.runtime == "sdcpp"`, falls through to diffusers on probe failure or runtime error. Catalog variants: `FLUX.1-schnell-sdcpp-q4km` + `FLUX.1-dev-sdcpp-q4km` ([catalog/image_models.py](backend_service/catalog/image_models.py)). Supported image repos: FLUX.1/2 family, SD3.5, SDXL, SD2.1, Qwen-Image (+ 2512), Z-Image (+ Turbo). | | ~~FU-009~~ | ~~mlx-video (Blaizzy) Apple Silicon video engine~~ | **Fully shipped 2026-05-04. Live smoke validated end-to-end.** | LTX-2 paths (`prince-canuma/LTX-2-{distilled,dev,2.3-distilled,2.3-dev}`) routed through subprocess engine in [backend_service/mlx_video_runtime.py](backend_service/mlx_video_runtime.py); Wan-AI paths route via Phase 8 of FU-025 (`_is_wan_repo` + `_build_wan_cmd` + `_REPO_ENTRY_POINTS["Wan-AI/"] = "mlx_video.models.wan_2.generate"`). Live smoke 2026-05-04 against `Wan-AI/Wan2.1-T2V-1.3B` (480×272, 5 frames, 4 steps, unipc): T5 encode 14.1s + transformer load 0.2s (4-bit q) + denoise 2.9s @ 1.4 it/s + VAE decode 1.3s = 19.6s total, 383 KB .mp4 output. The smoke also surfaced + fixed a `status_for` filename gap — mlx-video upstream emits root-level `model.safetensors` + `t5_encoder.safetensors`, not the legacy `transformer*.safetensors` / `text_encoder*.safetensors` patterns the helper originally checked for. Both now match. | @@ -138,6 +138,9 @@ no longer relevant. 
| FU-028 | MTP (Multi-Token Prediction) speculative decoding | **Deferred 2026-05-10 — upstream MTP-head loader gap on both runtimes.** | Target: lossless 1.5–2.2× speedup for trained-with-MTP models (Gemma-4 drafters released 2026-05-05, Apache 2.0; DeepSeek V3/R1; Qwen3.5/3.6/Next; Nemotron-3; MiMo-V2-Flash). **Blocker on Apple Silicon:** mlx-lm 0.31.3 ships ``stream_generate(..., draft_model=...)`` for *separately-trained* draft models but has no native MTP-head loader — Gemma-4-style MTP drafters share activations + KV cache with the target and cannot be loaded as a standalone ``mlx.nn.Module``. Confirmed by inspecting the installed `.venv/lib/python3.11/site-packages/mlx_lm/server.py` + `generate.py` — no MTP-specific code paths. **Blocker on llama.cpp:** PR [#22673](https://github.com/ggml-org/llama.cpp/pull/22673) (am17an, ``--spec-type mtp --spec-draft-n-max N``) is still in Draft as of 2026-05-10, awaiting at least 2 approving reviews. **Third-party path considered + rejected for v1:** [MTPLX](https://github.com/youssofal/MTPLX) (221 stars, MIT) wraps native MTP for Apple Silicon but ships as an OpenAI/Anthropic HTTP server — chaining HTTP servers from our FastAPI backend has unwanted latency + retry surface. **Re-evaluate when:** (a) mlx-lm gains a native MTP head loader (track ``ml-explore/mlx-lm`` releases), OR (b) llama.cpp PR #22673 merges, OR (c) MTPLX exposes a programmatic in-process Python API. The user-facing speedup is real (live benchmarks: M4 Pro × Qwen3.5-27B-4bit 15.3 → 23.3 tok/s) so this stays high-priority on the queue. | | FU-029 | KVTC (NVIDIA ICLR 2026) KV cache strategy | **Deferred 2026-05-10 — CUDA-only upstream, awaiting MLX/Metal port + PyPI release.** | Targeting [OnlyTerp/kvtc](https://github.com/OnlyTerp/kvtc) (Apache 2.0). PCA + adaptive quantization + entropy coding — 8–32× compression vs the dropped ChaosEngine's 3.7×, peer-reviewed at ICLR 2026, beats TurboQuant by 37% at comparable quality on long-context. Upstream blockers: (a) CUDA-only — repo's roadmap mentions MLX/Metal as "planned" but not yet implemented, so the Apple Silicon dev box cannot validate end-to-end; (b) not on PyPI — distributed as a `src.*` repo intended for `git clone`; (c) integration shape is a HuggingFace `DynamicCache` wrapper (not a llama.cpp cache type), so the existing GGUF lane has no path. Re-evaluate when either upstream ships MLX support or a Windows/Linux+CUDA development box becomes available. Apple Silicon users continue on TurboQuant-MLX (also ICLR 2026, native today). | | ~~FU-030~~ | ~~Drop ChaosEngine + RotorQuant strategy slots~~ | **Shipped 2026-05-10.** | ChaosEngine (cryptopoly/ChaosEngine — 1 commit upstream, eclipsed by KVTC at ICLR 2026 with the same PCA approach but 8–32× compression vs 3.7×) and RotorQuant (shipped as a misleading alias for TurboQuant — same ``--cache-type-k turbo{N}`` flags + same Python module marker) both removed from the registry. Persisted user configs that still reference these ids coerce silently to ``turboquant`` via a new ``CacheStrategyRegistry.resolve_legacy_id`` helper + module-level ``_LEGACY_STRATEGY_ALIASES`` map ([cache_compression/__init__.py](cache_compression/__init__.py)). Mirror coercion in frontend ([src/components/runtimeSupport.ts](src/components/runtimeSupport.ts) ``LEGACY_STRATEGY_ALIASES`` + ``canonicalStrategyId``). 
Two-level llama.cpp fallback chain (was three-level: requested → ChaosEngine → native; now requested → native) in [backend_service/inference/llama_cpp_engine.py](backend_service/inference/llama_cpp_engine.py). Vendored ChaosEngine bundling stripped from [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) (3 helper functions removed: ``stageVendoredChaosEngine`` + ``ensureSetuptoolsForPep639`` + ``resolveChaosEngineVendor``). Pre-build probe asserts the legacy-id coercion works in CI. ``[rotorquant]`` extra removed from [pyproject.toml](pyproject.toml). ``CHAOSENGINE_VENDOR_PATH`` env var dropped. Cache strategy speed/quality maps in [helpers/cache.py](backend_service/helpers/cache.py) trimmed to remaining strategies. | +| ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. 8 new unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | +| FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. | +| ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge.
Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. | --- diff --git a/dflash/__init__.py b/dflash/__init__.py index 79e9157..0a6bd39 100644 --- a/dflash/__init__.py +++ b/dflash/__init__.py @@ -38,15 +38,27 @@ "Qwen/Qwen3.5-14B": "z-lab/Qwen3.5-14B-DFlash", "Qwen/Qwen3.5-27B": "z-lab/Qwen3.5-27B-DFlash", "Qwen/Qwen3.5-35B-A3B": "z-lab/Qwen3.5-35B-A3B-DFlash", + # 2026-05-10: z-lab published a 122B-A10B drafter for the largest + # Qwen3.5 MoE checkpoint. Same naming pattern as the smaller A3B. + "Qwen/Qwen3.5-122B-A10B": "z-lab/Qwen3.5-122B-A10B-DFlash", # ----- Qwen3.6 family ----- "Qwen/Qwen3.6-35B-A3B": "z-lab/Qwen3.6-35B-A3B-DFlash", + # ----- Gemma 4 family (added 2026-05-10) ----- + # dflash-mlx 0.1.5 commit 05cc456 added the Gemma4 backend; z-lab + # ships matched draft checkpoints for both flagship variants. + "google/gemma-4-31B-it": "z-lab/gemma-4-31B-it-DFlash", + "google/gemma-4-26B-A4B-it": "z-lab/gemma-4-26B-A4B-it-DFlash", # ----- LLaMA family ----- "meta-llama/Llama-3.1-8B-Instruct": "z-lab/Llama-3.1-8B-Instruct-DFlash", # ----- gpt-oss family ----- "gpt-oss/gpt-oss-20B": "z-lab/gpt-oss-20B-DFlash", "gpt-oss/gpt-oss-120B": "z-lab/gpt-oss-120B-DFlash", + # ----- MiniMax family (preview drafts, added 2026-05-10) ----- + "MiniMaxAI/MiniMax-M2.5": "z-lab/MiniMax-M2.5-DFlash", + "MiniMaxAI/MiniMax-M2.7": "z-lab/MiniMax-M2.7-DFlash", # ----- Kimi ----- "moonshotai/Kimi-K2.5": "z-lab/Kimi-K2.5-DFlash", + "moonshotai/Kimi-K2.6": "z-lab/Kimi-K2.6-DFlash", } # Additional aliases that map community / MLX repos to the same drafts. @@ -65,6 +77,24 @@ "mlx-community/Qwen3.6-35B-A3B-bf16": "Qwen/Qwen3.6-35B-A3B", "mlx-community/Qwen3.6-35B-A3B-4bit": "Qwen/Qwen3.6-35B-A3B", "mlx-community/Qwen3.6-35B-A3B-8bit": "Qwen/Qwen3.6-35B-A3B", + # ----- Qwen3.5-122B-A10B (added 2026-05-10) ----- + "mlx-community/Qwen3.5-122B-A10B-bf16": "Qwen/Qwen3.5-122B-A10B", + "mlx-community/Qwen3.5-122B-A10B-4bit": "Qwen/Qwen3.5-122B-A10B", + "mlx-community/Qwen3.5-122B-A10B-8bit": "Qwen/Qwen3.5-122B-A10B", + # ----- Gemma 4 (added 2026-05-10) ----- + "mlx-community/gemma-4-31B-it-bf16": "google/gemma-4-31B-it", + "mlx-community/gemma-4-31B-it-4bit": "google/gemma-4-31B-it", + "mlx-community/gemma-4-31B-it-8bit": "google/gemma-4-31B-it", + "mlx-community/gemma-4-26B-A4B-it-bf16": "google/gemma-4-26B-A4B-it", + "mlx-community/gemma-4-26B-A4B-it-4bit": "google/gemma-4-26B-A4B-it", + "mlx-community/gemma-4-26B-A4B-it-8bit": "google/gemma-4-26B-A4B-it", + # ----- MiniMax (added 2026-05-10) ----- + "mlx-community/MiniMax-M2.5-bf16": "MiniMaxAI/MiniMax-M2.5", + "mlx-community/MiniMax-M2.5-4bit": "MiniMaxAI/MiniMax-M2.5", + "mlx-community/MiniMax-M2.5-8bit": "MiniMaxAI/MiniMax-M2.5", + "mlx-community/MiniMax-M2.7-bf16": "MiniMaxAI/MiniMax-M2.7", + "mlx-community/MiniMax-M2.7-4bit": "MiniMaxAI/MiniMax-M2.7", + "mlx-community/MiniMax-M2.7-8bit": "MiniMaxAI/MiniMax-M2.7", # ----- LLaMA 3.1 ----- "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16": "meta-llama/Llama-3.1-8B-Instruct", "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit": "meta-llama/Llama-3.1-8B-Instruct", @@ -80,6 +110,9 @@ "mlx-community/Kimi-K2.5-bf16": "moonshotai/Kimi-K2.5", "mlx-community/Kimi-K2.5-4bit": "moonshotai/Kimi-K2.5", "mlx-community/Kimi-K2.5-8bit": "moonshotai/Kimi-K2.5", + "mlx-community/Kimi-K2.6-bf16": "moonshotai/Kimi-K2.6", + "mlx-community/Kimi-K2.6-4bit": "moonshotai/Kimi-K2.6", + "mlx-community/Kimi-K2.6-8bit": 
"moonshotai/Kimi-K2.6", } # Suffixes stripped during fuzzy matching (order matters — longest first). diff --git a/pyproject.toml b/pyproject.toml index 378d497..ee38bd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,11 +38,11 @@ mlx-vlm = [ "mlx-vlm>=0.4.0", "torchvision>=0.20", ] -triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git", "vllm>=0.8.0"] -triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git", "mlx-lm>=0.22.0"] +triattention = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "vllm>=0.8.0"] +triattention-mlx = ["triattention @ git+https://github.com/WeianMao/triattention.git@c3744ee6a50522a1559a577f85aef2b165a344f2", "mlx-lm>=0.22.0"] turboquant = ["turboquant-mlx-full>=0.3.0"] vllm = ["vllm>=0.8.0"] -dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@8d8545d791383008b5e2b1e738c38a7a73ba484e"] +dflash-mlx = ["dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@fada1eb2b75cd1c875ca6547b6518783fd3d2956"] dflash = ["dflash>=0.1.0"] desktop = [ "fastapi>=0.115.0", diff --git a/scripts/pre-build-check.mjs b/scripts/pre-build-check.mjs index 6cf0bce..fa61093 100755 --- a/scripts/pre-build-check.mjs +++ b/scripts/pre-build-check.mjs @@ -16,7 +16,7 @@ * Behaviour parity with the .sh version: * - PASS / FAIL / WARN per check, summary at the end * - FAIL is blocking; WARN is informational (e.g. turbo binary - * missing, vendor/ChaosEngine submodule behind upstream) + * missing) * - Output streams live so CI logs show progress without buffering */ @@ -247,20 +247,32 @@ console.log("[6/7] Upstream dependency check..."); warn(`llama-server-turbo — not installed (run ${buildScript})`); } - // ChaosEngine vendor submodule: check commits behind origin/main. - const vendorGit = path.join(REPO_ROOT, "vendor", "ChaosEngine", ".git"); - if (existsSync(vendorGit)) { - const behind = capture("git", ["-C", "vendor/ChaosEngine", "rev-list", "HEAD..origin/main", "--count"]); - if (behind.ok) { - const count = behind.stdout.trim(); - if (count === "0") { - pass("vendor/ChaosEngine — up to date"); - } else { - warn(`vendor/ChaosEngine — ${count} commits behind upstream`); - } - } else { - warn("vendor/ChaosEngine — could not check (fetch first)"); - } + // FU-030 dropped vendor/ChaosEngine; the staleness probe that lived + // here used to walk ``vendor/ChaosEngine/.git`` and warn on commits + // behind upstream. Removed alongside the vendored package. + + // FU-033: dflash-mlx pin sync. The ``[dflash-mlx]`` extra in + // pyproject.toml and the ``stageOptionalRuntimePackages`` entry in + // scripts/stage-runtime.mjs both pin to a specific git commit. They + // drifted in May 2026 — pyproject was at v0.1.5.1 (8d8545d) while + // stage-runtime still bundled v0.1.4.1 (f825ffb), shipping an old + // binary in release builds even when the dev .venv ran new. Catch + // future drift here rather than at first ``npm run stage:runtime``. 
+ const pinRe = /dflash-mlx\.git@([a-f0-9]+)/; + const pyprojectPath = path.join(REPO_ROOT, "pyproject.toml"); + const stageRuntimePath = path.join(REPO_ROOT, "scripts", "stage-runtime.mjs"); + const pyprojectMatch = readFileSync(pyprojectPath, "utf8").match(pinRe); + const stageMatch = readFileSync(stageRuntimePath, "utf8").match(pinRe); + if (!pyprojectMatch || !stageMatch) { + warn("dflash-mlx pin sync — could not extract commit hashes from both files"); + } else if (pyprojectMatch[1] !== stageMatch[1]) { + fail( + `dflash-mlx pin drift — pyproject.toml=${pyprojectMatch[1].slice(0, 12)} ` + + `stage-runtime.mjs=${stageMatch[1].slice(0, 12)}. ` + + `Sync both to the same commit to avoid release-build regressions.`, + ); + } else { + pass(`dflash-mlx pin sync (${pyprojectMatch[1].slice(0, 12)})`); } } console.log(); diff --git a/scripts/pre-build-check.sh b/scripts/pre-build-check.sh index 698d173..54576c0 100755 --- a/scripts/pre-build-check.sh +++ b/scripts/pre-build-check.sh @@ -133,16 +133,22 @@ else warn "llama-server-turbo — not installed (run scripts/build-llama-turbo.sh)" fi -# ChaosEngine submodule -if [[ -d "vendor/ChaosEngine/.git" ]]; then - CE_BEHIND=$(git -C vendor/ChaosEngine rev-list HEAD..origin/main --count 2>/dev/null || echo "?") - if [[ "$CE_BEHIND" == "0" ]]; then - pass "vendor/ChaosEngine — up to date" - elif [[ "$CE_BEHIND" == "?" ]]; then - warn "vendor/ChaosEngine — could not check (fetch first)" - else - warn "vendor/ChaosEngine — $CE_BEHIND commits behind upstream" - fi +# FU-030 dropped vendor/ChaosEngine; the staleness probe that lived +# here used to walk vendor/ChaosEngine/.git and warn on commits behind +# upstream. Removed alongside the vendored package. + +# FU-033: dflash-mlx pin sync between pyproject.toml and stage-runtime.mjs. +# Mirrors the assert in scripts/pre-build-check.mjs — see that file for +# the full rationale (the two pins drifted in May 2026 and shipped an +# old binary in release builds). +PYPROJECT_PIN=$(grep -E 'dflash-mlx\.git@[a-f0-9]+' pyproject.toml | head -1 | sed -E 's/.*dflash-mlx\.git@([a-f0-9]+).*/\1/') +STAGE_PIN=$(grep -E 'dflash-mlx\.git@[a-f0-9]+' scripts/stage-runtime.mjs | head -1 | sed -E 's/.*dflash-mlx\.git@([a-f0-9]+).*/\1/') +if [[ -z "$PYPROJECT_PIN" || -z "$STAGE_PIN" ]]; then + warn "dflash-mlx pin sync — could not extract commit hashes from both files" +elif [[ "$PYPROJECT_PIN" != "$STAGE_PIN" ]]; then + fail "dflash-mlx pin drift — pyproject.toml=${PYPROJECT_PIN:0:12} stage-runtime.mjs=${STAGE_PIN:0:12}. Sync both to the same commit." +else + pass "dflash-mlx pin sync (${PYPROJECT_PIN:0:12})" fi echo diff --git a/scripts/stage-runtime.mjs b/scripts/stage-runtime.mjs index 97fe3d6..871cf71 100644 --- a/scripts/stage-runtime.mjs +++ b/scripts/stage-runtime.mjs @@ -290,7 +290,7 @@ function stageOptionalRuntimePackages(pythonBinary) { // matching distribution. 
const optionalPackages = [ { - pipName: "dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@f825ffb268e50d531e8b6524413b0847334a14dd", + pipName: "dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@fada1eb2b75cd1c875ca6547b6518783fd3d2956", importName: "dflash_mlx", platforms: ["darwin"], }, diff --git a/tests/test_dflash.py b/tests/test_dflash.py index 70de966..d4c7834 100644 --- a/tests/test_dflash.py +++ b/tests/test_dflash.py @@ -85,6 +85,70 @@ def test_all_map_entries_return_values(self): result = get_draft_model(target) self.assertEqual(result, expected_draft, f"Failed for target: {target}") + # ------------------------------------------------------------------ + # 2026-05-10: dflash-mlx 0.1.5 added Gemma4 backend (commit 05cc456) + # and z-lab published draft checkpoints for Gemma-4, Qwen3.5-122B, + # MiniMax M2.5/M2.7, and Kimi K2.6. Pin those mappings so future + # regressions surface here rather than at first-use. + # ------------------------------------------------------------------ + + def test_gemma4_31b_drafter(self): + self.assertEqual( + get_draft_model("google/gemma-4-31B-it"), + "z-lab/gemma-4-31B-it-DFlash", + ) + + def test_gemma4_26b_a4b_drafter(self): + self.assertEqual( + get_draft_model("google/gemma-4-26B-A4B-it"), + "z-lab/gemma-4-26B-A4B-it-DFlash", + ) + + def test_gemma4_mlx_community_alias(self): + # Apple Silicon users pull the mlx-community quants; the alias + # map has to canonicalise back to the google/ key so the drafter + # is still reachable. + for variant in ( + "mlx-community/gemma-4-31B-it-bf16", + "mlx-community/gemma-4-31B-it-4bit", + "mlx-community/gemma-4-31B-it-8bit", + ): + self.assertEqual( + get_draft_model(variant), + "z-lab/gemma-4-31B-it-DFlash", + f"alias mismatch for {variant}", + ) + + def test_qwen35_122b_a10b_drafter(self): + self.assertEqual( + get_draft_model("Qwen/Qwen3.5-122B-A10B"), + "z-lab/Qwen3.5-122B-A10B-DFlash", + ) + + def test_minimax_m25_drafter(self): + self.assertEqual( + get_draft_model("MiniMaxAI/MiniMax-M2.5"), + "z-lab/MiniMax-M2.5-DFlash", + ) + + def test_minimax_m27_drafter(self): + self.assertEqual( + get_draft_model("MiniMaxAI/MiniMax-M2.7"), + "z-lab/MiniMax-M2.7-DFlash", + ) + + def test_kimi_k26_drafter(self): + self.assertEqual( + get_draft_model("moonshotai/Kimi-K2.6"), + "z-lab/Kimi-K2.6-DFlash", + ) + + def test_kimi_k26_mlx_community_alias(self): + self.assertEqual( + get_draft_model("mlx-community/Kimi-K2.6-4bit"), + "z-lab/Kimi-K2.6-DFlash", + ) + class ModelResolutionTests(unittest.TestCase): def test_resolve_dflash_target_prefers_canonical_repo(self): From ed18f3f59b2aa0131c57d40821038af76ee072f8 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 15:34:03 +0100 Subject: [PATCH 05/13] FU-034: hide launch-modal options users can't recover, fix stale install hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related cleanups in src/components/RuntimeControls.tsx. 1. **Cache-strategy cards now hide when engine-incompatible or when the turbo binary is missing on GGUF.** Previously every strategy rendered for every model + engine combo with a greyed-out N/A badge. That taught users the wrong thing — a disabled card with no install button suggests something they could fix, when the only fix lived outside the app (engine mismatch is fundamental; ``llama-server-turbo`` build is a terminal-side script). 
The "package not installed but installable" case stays visible because the install button gets the user to ready in one click. ``native`` always survives. 2. **DFlash speculative-decoding toggle now hides when the selected model has no draft in DRAFT_MODEL_MAP, or when the engine is GGUF.** Same principle — both cases give the user no in-app path to recover, so a disabled checkbox with an "N/A" badge added confusion without value. ``canInstallDflashForModel`` keeps the install affordance visible whenever the gap is the missing pip package (one-click install path) and the model would be supported. 3. **Hardcoded ``f825ffb`` install hint string fixed.** The DFlash help panel still printed the v0.1.4.1 commit hash even after the FU-006 / FU-033 bumps to ``fada1eb`` (v0.1.5.1). Same drift bug FU-033 caught between pyproject.toml + stage-runtime.mjs; now all three carry the same hash. Comment added so a future bump touches all three. Popover-side filter (src/components/kvStrategyFilter.ts) already followed the hide rule, so the modal now matches. CLAUDE.md tracker gains FU-034 entry documenting the change + the design rule for future strategy slots. Test totals: 1321 pytest pass, 341 vitest pass, tsc clean. --- CLAUDE.md | 1 + src/components/RuntimeControls.tsx | 42 +++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5c81be7..499bf6e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -141,6 +141,7 @@ no longer relevant. | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. New 7 unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. 
Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. | | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. | +| ~~FU-034~~ | ~~Hide unrecoverable launch-modal options instead of greying them out~~ | **Shipped 2026-05-10.** | The launch settings panel ([src/components/RuntimeControls.tsx](src/components/RuntimeControls.tsx)) used to render every cache-strategy card and the DFlash speculative-decoding toggle for every model + engine combo, with disabled checkboxes + "N/A" badges when an option could not run. That taught users the wrong thing — a disabled card with no install button suggests something they could fix, when the only fix lived outside the app or did not exist at all. New rule: **hide options the user has no in-app path to recover.** (1) Cache-strategy cards now skip render when the strategy is engine-incompatible (e.g. TriAttention selected on the MLX engine — engine mismatch is fundamental, no install button helps) or when the strategy needs the turbo binary on a GGUF backend without `llama-server-turbo` present (only fix is `scripts/build-llama-turbo.sh` outside the app). (2) The DFlash toggle hides entirely when the selected model has no draft in [`DRAFT_MODEL_MAP`](dflash/__init__.py) or the engine is GGUF (DFlash needs MLX/vLLM). The "DFlash package not installed but model would be supported" case stays visible — the install button gets the user to ready in one click. ``native`` always survives. Hardcoded `f825ffb` install hint string in the DFlash help panel was the same drift bug from FU-033 — fixed alongside (now `fada1eb`). The popover-side filter ([src/components/kvStrategyFilter.ts](src/components/kvStrategyFilter.ts)) already followed this rule, so the modal now matches. | --- diff --git a/src/components/RuntimeControls.tsx b/src/components/RuntimeControls.tsx index 67f27b3..e9b4db4 100644 --- a/src/components/RuntimeControls.tsx +++ b/src/components/RuntimeControls.tsx @@ -373,6 +373,20 @@ export function RuntimeControls({ const runtimeAvailable = isStrategyRuntimeAvailable(strategy); const isDisabled = !runtimeAvailable || (specActive && strategy.id !== "native") || isIncompat || turboMissing; + // FU-034 (2026-05-10): hide cards the user has no in-app path to + // recover. ``isIncompat`` is a hard engine mismatch (e.g. + // TriAttention selected on MLX) — there's no install button that + // fixes that, so showing a disabled card with an "N/A" badge just + // teaches the user the wrong thing. Same logic for the turbo + // binary on GGUF: the only fix is ``scripts/build-llama-turbo.sh`` + // outside the app. 
Strategies whose backing pip package isn't + // installed STAY visible because the modal renders an "Install" + // button that gets the user to ready in one click. ``native`` + // always survives (the f16 fallback every engine speaks). + if (strategy.id !== "native" && (isIncompat || turboMissing)) { + return null; + } + return (
@@ -574,8 +588,19 @@ export function RuntimeControls({

Fused attention uses optimized attention kernels when the selected backend supports them. It can improve throughput and reduce overhead, but some model/backend combinations may prefer the standard attention path for compatibility.

) : null} + {/* FU-034 (2026-05-10): hide the DFlash toggle entirely when the + selected model has no draft in DRAFT_MODEL_MAP, or when the + engine is GGUF (DFlash requires MLX/vLLM). Both cases give + the user no in-app path to recover, so a disabled checkbox + with an "N/A" badge added confusion without value. The + "DFlash package not installed but model would be supported" + case stays visible — the install button gets the user to + ready in one click. ``canInstallDflashForModel`` is True + whenever the model is in the draft map AND the runtime gap + is the missing pip package. */} + {dflashAvailable || canInstallDflashForModel ? (
-
- {expandedInfo === "dflash" ? ( + ) : null} + {expandedInfo === "dflash" && (dflashAvailable || canInstallDflashForModel) ? (

DFlash uses a small draft model to propose multiple tokens in parallel, then verifies them in a single forward pass. This gives 3-5x faster generation with zero quality loss.

@@ -640,7 +659,10 @@ export function RuntimeControls({ {!dflashInstalled && canInstallDflashForModel ? (
Install: - ./.venv/bin/python3 -m pip install "dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@f825ffb268e50d531e8b6524413b0847334a14dd" + {/* Pin string mirrors pyproject.toml + scripts/stage-runtime.mjs. + Update all three together when bumping (FU-033 pin-sync probe + catches drift between the latter two). */} + ./.venv/bin/python3 -m pip install "dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@fada1eb2b75cd1c875ca6547b6518783fd3d2956"
) : null}
From 8a43a136b63fc36304fbf8c2f6d5bbeba203ec51 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 15:44:03 +0100 Subject: [PATCH 06/13] FU-035: tone down benign runtime note + merge runtime strips into single row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two visual fixes for the per-turn telemetry chips below assistant messages. 1. **Runtime note tone now reflects actual fault state.** The "Using python with MLX 0.31.x and mlx-lm 0.31.y." chip used to render in the orange ``substrate-chip--warn`` style because the note slot was hardcoded to ``tone: "warn"``. That same slot also carries real warnings ("DFLASH unavailable", "Cache strategy failed. Fell back to native f16 cache.") — when every turn shows the orange chip, operators stop noticing it on the rare turns that actually flag a problem. New ``runtimeNoteIsWarning`` helper in SubstrateRoutingBadge.tsx scans for actionable tokens (``unavailable``, ``fell back``, ``failed``, ``error``, ``warning``, ``cannot``, etc.) and only then promotes the chip to the warn tone. The benign version banner now uses the default muted tone, matching the "MLX" / "Native f16" chips next to it. 2. **SubstrateRoutingBadge + ChatPerfStrip now share a single wrap-row.** Previously rendered as two sibling ``<div>``
strips, so the engine/cache/note chips broke onto a separate line from the perf chips (tok/s, CPU%, mem-free, thermal). New ``.message-runtime-strip`` wrapper in ChatThread.tsx is the outer flex container; the two inner strips switch to ``display: contents`` so their chips become direct flex children of the wrapper and flow as one continuous row, wrapping only when the viewport actually requires it. Test coverage: 10 new vitest cases in SubstrateRoutingBadge.test.ts pin the tone-detect logic for both benign and faulty notes. Test totals: 1321 pytest pass, 351 vitest pass (+10), tsc clean. --- src-tauri/tauri.conf.json | 2 +- src/components/SubstrateRoutingBadge.tsx | 30 ++++++++- .../__tests__/SubstrateRoutingBadge.test.ts | 63 +++++++++++++++++-- src/features/chat/ChatThread.tsx | 8 +-- src/styles.css | 17 +++++ 5 files changed, 110 insertions(+), 10 deletions(-) diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 60f1929..e14a99d 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -36,7 +36,7 @@ }, "bundle": { "active": true, - "createUpdaterArtifacts": true, + "createUpdaterArtifacts": false, "targets": "all", "icon": [ "icons/32x32.png", diff --git a/src/components/SubstrateRoutingBadge.tsx b/src/components/SubstrateRoutingBadge.tsx index 43ebb14..42420cf 100644 --- a/src/components/SubstrateRoutingBadge.tsx +++ b/src/components/SubstrateRoutingBadge.tsx @@ -86,13 +86,41 @@ function buildChips(metrics: GenerationMetrics): Chip[] { key: "note", label: metrics.runtimeNote.length > 48 ? `${metrics.runtimeNote.slice(0, 45)}…` : metrics.runtimeNote, title: metrics.runtimeNote, - tone: "warn", + // Default tone for benign info ("Using python with MLX 0.31.x and + // mlx-lm 0.31.y."); warn only when the note flags an actual fault + // — DFLASH unavailable, cache strategy fell back, MTP head missing, + // etc. Operators ignore the orange chip if every turn surfaces it, + // which defeats its purpose for the rare real warnings. + tone: runtimeNoteIsWarning(metrics.runtimeNote) ? "warn" : "default", }); } return chips; } +/** + * Decide whether a runtime note describes a problem the user should + * notice. The boring "which library versions ran" prefix is always + * present and not actionable; the warn tone should fire only when a + * substantive issue appears later in the same string.
+ */ +export function runtimeNoteIsWarning(note: string): boolean { + const lowered = note.toLowerCase(); + const warningTokens = [ + "unavailable", + "fell back", + "fall back", + "fallback", + "failed", + "error", + " not applied", + " not supported", + "warning", + "cannot ", + ]; + return warningTokens.some((token) => lowered.includes(token)); +} + export function SubstrateRoutingBadge({ metrics }: SubstrateRoutingBadgeProps) { const chips = buildChips(metrics); if (chips.length === 0) return null; diff --git a/src/components/__tests__/SubstrateRoutingBadge.test.ts b/src/components/__tests__/SubstrateRoutingBadge.test.ts index 7e85d60..32c4325 100644 --- a/src/components/__tests__/SubstrateRoutingBadge.test.ts +++ b/src/components/__tests__/SubstrateRoutingBadge.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; import type { GenerationMetrics } from "../../types"; -import { buildChips } from "../SubstrateRoutingBadge"; +import { buildChips, runtimeNoteIsWarning } from "../SubstrateRoutingBadge"; function makeMetrics(overrides: Partial<GenerationMetrics> = {}): GenerationMetrics { return { @@ -64,18 +64,73 @@ describe("SubstrateRoutingBadge buildChips", () => { expect(chips.find((c) => c.key === "accept")).toBeUndefined(); }); - it("emits warn chip with truncated runtime note", () => { + it("truncates long runtime notes for the chip label but preserves full title", () => { const chips = buildChips(makeMetrics({ - runtimeNote: "x".repeat(80), + runtimeNote: "Failed to load draft model: " + "x".repeat(80), })); const note = chips.find((c) => c.key === "note"); expect(note?.tone).toBe("warn"); expect(note?.label.length).toBeLessThanOrEqual(48); - expect(note?.title.length).toBe(80); + expect(note?.title.length).toBeGreaterThan(48); }); it("preserves short runtime notes verbatim", () => { const chips = buildChips(makeMetrics({ runtimeNote: "fell back to native" })); expect(chips.find((c) => c.key === "note")?.label).toBe("fell back to native"); }); + + // FU-035: benign info notes ("Using python with MLX 0.31.x and mlx-lm + // 0.31.y.") used to render with the orange warn tone. The tone now + // reflects whether the note actually flags a problem, so operators + // notice the orange chip on real warnings instead of every turn. + it("uses default tone for benign version-info notes", () => { + const chips = buildChips(makeMetrics({ + runtimeNote: "Using python with MLX 0.31.2 and mlx-lm 0.31.3.", + })); + expect(chips.find((c) => c.key === "note")?.tone).toBe("default"); + }); + + it("uses warn tone when a benign prefix is followed by a fault clause", () => { + const chips = buildChips(makeMetrics({ + runtimeNote: "Using python with MLX 0.31.2 and mlx-lm 0.31.3. DFLASH unavailable for 'foo/bar': no compatible draft model is registered.", + })); + expect(chips.find((c) => c.key === "note")?.tone).toBe("warn"); + }); + + it("uses warn tone when the cache strategy fell back", () => { + const chips = buildChips(makeMetrics({ + runtimeNote: "Using python with MLX 0.31.2 and mlx-lm 0.31.3. Cache strategy failed ('tuple').
Fell back to native f16 cache.", + })); + expect(chips.find((c) => c.key === "note")?.tone).toBe("warn"); + }); +}); + +describe("runtimeNoteIsWarning", () => { + it("returns false for plain version banner", () => { + expect(runtimeNoteIsWarning("Using python with MLX 0.31.2 and mlx-lm 0.31.3.")).toBe(false); + }); + + it("returns true when 'unavailable' appears", () => { + expect(runtimeNoteIsWarning("DFLASH unavailable for 'foo/bar'.")).toBe(true); + }); + + it("returns true when 'fell back' appears", () => { + expect(runtimeNoteIsWarning("Cache strategy failed. Fell back to native f16 cache.")).toBe(true); + }); + + it("returns true when 'failed' appears", () => { + expect(runtimeNoteIsWarning("Cache strategy failed.")).toBe(true); + }); + + it("returns true when 'error' appears", () => { + expect(runtimeNoteIsWarning("error loading draft model")).toBe(true); + }); + + it("is case-insensitive", () => { + expect(runtimeNoteIsWarning("WARNING: cache fallback")).toBe(true); + }); + + it("returns false for empty string", () => { + expect(runtimeNoteIsWarning("")).toBe(false); + }); }); diff --git a/src/features/chat/ChatThread.tsx b/src/features/chat/ChatThread.tsx index af89d25..583e835 100644 --- a/src/features/chat/ChatThread.tsx +++ b/src/features/chat/ChatThread.tsx @@ -286,10 +286,10 @@ export function ChatThread({
) : null} {message.role === "assistant" && message.metrics ? ( - - ) : null} - {message.role === "assistant" && message.metrics ? ( - +
+ + +
) : null} {message.role === "assistant" && message.tokenLogprobs?.length ? ( diff --git a/src/styles.css b/src/styles.css index 30b44cd..f020ae8 100644 --- a/src/styles.css +++ b/src/styles.css @@ -8169,6 +8169,23 @@ select.text-input { border: 1px solid rgba(239, 68, 68, 0.4); } +/* Wrapper that joins SubstrateRoutingBadge + ChatPerfStrip into a single + wrap-row so engine/cache/runtime-note chips flow inline with the perf + chips instead of breaking onto their own line (FU-035). The two child + strips use ``display: contents`` below so their inner chips become + direct flex children of this wrapper. */ +.message-runtime-strip { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin: 8px 0 2px; +} + +.message-runtime-strip > .substrate-routing, +.message-runtime-strip > .chat-perf-strip { + display: contents; +} + /* Substrate routing inspector badge (Phase 3.4) */ .substrate-routing { display: flex; From ee983dab53566204560245c41972cbab839fcf63 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 15:44:47 +0100 Subject: [PATCH 07/13] Revert unrelated tauri.conf.json flip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commit (FU-035) accidentally captured a local dev flip of ``createUpdaterArtifacts: true → false`` in src-tauri/tauri.conf.json. That flag belongs at ``true`` for release builds (without it the auto-update channel never publishes new artifacts). Restore the release-correct value; the FU-035 chip changes remain intact. --- src-tauri/tauri.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index e14a99d..60f1929 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -36,7 +36,7 @@ }, "bundle": { "active": true, - "createUpdaterArtifacts": false, + "createUpdaterArtifacts": true, "targets": "all", "icon": [ "icons/32x32.png", From 3fc743a8acf0b3ea4a635955dab10effed905bde Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Sun, 10 May 2026 15:49:10 +0100 Subject: [PATCH 08/13] FU-036: HTML Challenge stream box fills panel + scroll-up no longer jerks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the HTML Challenge model card stream view. 1. **Stream box now fills the available model-frame height.** ``.html-challenge-stream`` was ``flex: 0 0 auto`` with a fixed ``height: clamp(280px, 38vh, 520px)``, which left a tall band of empty area below the streaming code while a model was generating. Switched to ``flex: 1 1 auto; min-height: 280px`` so the stream consumes the same vertical space the rendered iframe would use when the run completes. ``min-height`` keeps it usable on short viewports. 2. **Scroll-up no longer fights the user.** Two related races: - ``handleStreamScroll`` re-flipped ``streamAtBottom`` to true after every ``element.scrollTop = …`` write because the browser fires ``scroll`` for both user wheel input and programmatic writes. New ``lastProgrammaticScrollRef`` records the timestamp of each programmatic scroll and the handler ignores scroll events fired within 80ms of one — so user wheel events register as "stop tracking" instead of being overwritten by the post-write event. - The streaming chunk auto-scroll ``useEffect`` read ``streamAtBottom`` from the React closure, which lagged behind the user's wheel by one render. 
The effect now re-measures scroll position inside the rAF and bails (clearing tracking for that slot) if the user has moved away in the gap, instead of yanking the box back to bottom. Net effect: scrolling up during streaming holds position, the box takes the full panel height, and only the explicit "scroll to bottom" button or scrolling within 32px of the tail re-engages auto-tracking. Test totals: 1321 pytest pass, 351 vitest pass, tsc clean. --- src/features/chat/HtmlChallengeTab.tsx | 38 +++++++++++++++++++++++++- src/styles.css | 17 ++++++++---- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/src/features/chat/HtmlChallengeTab.tsx b/src/features/chat/HtmlChallengeTab.tsx index 007c2fb..f84d166 100644 --- a/src/features/chat/HtmlChallengeTab.tsx +++ b/src/features/chat/HtmlChallengeTab.tsx @@ -147,6 +147,13 @@ export function HtmlChallengeTab({ d: null, }); const activePreviewSlotRef = useRef(null); + // FU-036: timestamp of the most recent programmatic scroll-to-bottom for + // each slot. ``handleStreamScroll`` ignores scroll events fired in the + // immediate window after one (the browser fires ``scroll`` for both + // user wheel input AND ``element.scrollTop = …`` writes; without the + // guard the post-write event re-flips ``streamAtBottom`` to true even + // when the user just scrolled away). + const lastProgrammaticScrollRef = useRef>({}); const textModelOptions = modelOptions.filter(isTextModelOption); const selectedBySlot = Object.fromEntries( @@ -254,9 +261,30 @@ export function HtmlChallengeTab({ }, []); useEffect(() => { + // FU-036 (2026-05-10): re-measure scroll position inside the rAF + // before yanking back to bottom. ``setStreamAtBottom`` from the + // ``onScroll`` handler is async, so a streaming chunk that arrives + // a few ms after the user wheel-scrolls would otherwise see the + // stale ``streamAtBottom[slot.id] === true`` from the previous + // render and snap the box back down — felt as a jerk that fought + // the user's scroll. Re-measuring against the live DOM closes the + // race; if the user has moved away in the gap, we drop tracking + // for that slot instead of stomping their scroll position. const handles = slots .filter((slot) => streamAtBottom[slot.id]) - .map((slot) => requestAnimationFrame(() => scrollStreamToBottom(slot.id))); + .map((slot) => requestAnimationFrame(() => { + const element = streamRefs.current[slot.id]; + if (!element) return; + const stillNearBottom = + element.scrollHeight - element.scrollTop - element.clientHeight < 32; + if (stillNearBottom) { + scrollStreamToBottom(slot.id); + } else { + setStreamAtBottom((current) => current[slot.id] + ? { ...current, [slot.id]: false } + : current); + } + })); return () => handles.forEach((handle) => cancelAnimationFrame(handle)); }, [slots, slotStates, streamAtBottom]); @@ -300,6 +328,13 @@ export function HtmlChallengeTab({ } function handleStreamScroll(target: CompareTarget) { + // Ignore the scroll event the browser fires immediately after our + // own ``element.scrollTop = …`` write. Without this guard, the + // post-write event re-flipped ``streamAtBottom`` true and the next + // chunk would yank the box back even when the user had since + // scrolled away. + const lastProgrammatic = lastProgrammaticScrollRef.current[target] ?? 
0; + if (performance.now() - lastProgrammatic < 80) return; const element = streamRefs.current[target]; if (!element) return; const atBottom = element.scrollHeight - element.scrollTop - element.clientHeight < 32; @@ -309,6 +344,7 @@ export function HtmlChallengeTab({ function scrollStreamToBottom(target: CompareTarget) { const element = streamRefs.current[target]; if (!element) return; + lastProgrammaticScrollRef.current[target] = performance.now(); element.scrollTop = element.scrollHeight; setStreamAtBottom((current) => current[target] ? current : { ...current, [target]: true }); } diff --git a/src/styles.css b/src/styles.css index f020ae8..8aecaf7 100644 --- a/src/styles.css +++ b/src/styles.css @@ -4043,12 +4043,17 @@ select.text-input { } .html-challenge-stream { - flex: 0 0 auto; - /* Fixed-ish height so the streaming token feed scrolls inside its own - box instead of expanding the panel. Auto-scroll-to-bottom kicks in - when the user is already near the tail; scrolling up pauses tracking - so previous content stays put for reading. */ - height: clamp(280px, 38vh, 520px); + /* FU-036 (2026-05-10): grow to fill the model panel instead of the + previous fixed ``height: clamp(280px, 38vh, 520px)``. The parent + ``.html-challenge-panel-body`` is a column flex container; using + ``flex: 1 1 auto; min-height: 280px`` lets the streaming code box + consume the same vertical space the rendered iframe would, so the + panel no longer shows a tall band of empty area below the stream + while a model is generating. ``min-height: 280px`` keeps the box + usable on short viewports where the surrounding panel cannot + contribute much height. */ + flex: 1 1 auto; + min-height: 280px; overflow: auto; overscroll-behavior: contain; margin: 0; From db04d67f77c593b29c5847707fe5298f23ecc585 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Mon, 11 May 2026 13:11:39 +0100 Subject: [PATCH 09/13] FU-037: per-tab ErrorBoundary + Tauri devtools in release builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Triggered by a real crash report: a tool-call in the Chat tab against Qwen3-Coder-Next blanked the entire packaged macOS app. Webview reload returned the user to the Dashboard, and any subsequent Chat navigation crashed again with no diagnostic surface to read. Two related root causes, fixed together. 1. **No React error boundary anywhere in the tree.** A single uncaught render error in one tab tore down the whole ``
`` content frame. New ``src/components/ErrorBoundary.tsx`` uses ``getDerivedStateFromError`` + ``componentDidCatch`` to capture the error and render an inline fallback with: - the error name + message - the JS stack and component stack inside a collapsible - a "Try again" button that resets local state for transient errors (e.g. stale streaming buffer from an aborted tool call) - a "Copy details" button that writes a self-contained bug report to the clipboard (timestamp, UA, error, both stacks) The boundary wraps ``{content}`` in App.tsx keyed by ``activeTab`` so switching tabs unmounts the boundary entirely, giving the user a clean navigation-based recovery path even when "Try again" hits the same error. 2. **Release builds had no way to open devtools.** Tauri's ``devtools`` Cargo feature was in ``declared_features`` but not in the active ``features`` array on the ``tauri`` crate, so the WebKit inspector was compiled out in release. Without it, the only path to a JS stack was rebuilding the app via ``cargo tauri dev`` — useless to a user staring at a blank screen. Flipping the feature on adds the right-click → Inspect Element entry to release builds. Surrounding work: - CSS for ``.error-boundary`` lives next to the existing notice banners in src/styles.css; same colour vocabulary as ``.error-banner``. - Unit tests in src/components/__tests__/ErrorBoundary.test.ts pin the ``getDerivedStateFromError`` contract so the boundary cannot silently stop catching errors. - CLAUDE.md tracker entry (FU-037) records the root cause + fix for future regressions. Test totals: 1321 pytest pass, 353 vitest pass (+2), tsc clean. --- CLAUDE.md | 1 + src-tauri/Cargo.toml | 8 +- src/App.tsx | 9 +- src/components/ErrorBoundary.tsx | 132 ++++++++++++++++++ .../__tests__/ErrorBoundary.test.ts | 39 ++++++ src/styles.css | 83 +++++++++++ 6 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 src/components/ErrorBoundary.tsx create mode 100644 src/components/__tests__/ErrorBoundary.test.ts diff --git a/CLAUDE.md b/CLAUDE.md index 499bf6e..663384a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -141,6 +141,7 @@ no longer relevant. | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. New 7 unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. 
Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. | | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. | +| ~~FU-037~~ | ~~Per-tab ErrorBoundary + Tauri devtools in release builds~~ | **Shipped 2026-05-10.** | A tool-call in the Chat tab against `Qwen3-Coder-Next` blanked the entire packaged macOS app — webview reload returned the user to the Dashboard, and any subsequent Chat navigation crashed again. Root cause: the React tree had no error boundary, so a single uncaught render error in one tab tore down the whole `
` content frame. Release builds also did not ship the WebKit inspector, so the user could not pull a stack trace without rebuilding via `cargo tauri dev`. (1) New [src/components/ErrorBoundary.tsx](src/components/ErrorBoundary.tsx) — `getDerivedStateFromError` + `componentDidCatch` capture the error, render an inline fallback with the error message, JS stack, component stack, "Try again" reset, and "Copy details" clipboard button. Wrapped around `{content}` in [src/App.tsx](src/App.tsx) keyed by `activeTab` so switching tabs is its own recovery path. (2) `src-tauri/Cargo.toml` `tauri` dep gains the `devtools` Cargo feature so right-click → Inspect Element opens WebKit devtools in release builds. (3) CSS for `.error-boundary` lives next to the existing notice banners in [src/styles.css](src/styles.css) — same colour vocabulary. Unit tests in [src/components/__tests__/ErrorBoundary.test.ts](src/components/__tests__/ErrorBoundary.test.ts) pin the static-derive contract so the boundary cannot silently stop catching errors. Frontend errors land in the webview console; backend errors land in the Diagnostics tab + the in-memory `app.state.chaosengine` log buffer. | | ~~FU-034~~ | ~~Hide unrecoverable launch-modal options instead of greying them out~~ | **Shipped 2026-05-10.** | The launch settings panel ([src/components/RuntimeControls.tsx](src/components/RuntimeControls.tsx)) used to render every cache-strategy card and the DFlash speculative-decoding toggle for every model + engine combo, with disabled checkboxes + "N/A" badges when an option could not run. That taught users the wrong thing — a disabled card with no install button suggests something they could fix, when the only fix lived outside the app or did not exist at all. New rule: **hide options the user has no in-app path to recover.** (1) Cache-strategy cards now skip render when the strategy is engine-incompatible (e.g. TriAttention selected on the MLX engine — engine mismatch is fundamental, no install button helps) or when the strategy needs the turbo binary on a GGUF backend without `llama-server-turbo` present (only fix is `scripts/build-llama-turbo.sh` outside the app). (2) The DFlash toggle hides entirely when the selected model has no draft in [`DRAFT_MODEL_MAP`](dflash/__init__.py) or the engine is GGUF (DFlash needs MLX/vLLM). The "DFlash package not installed but model would be supported" case stays visible — the install button gets the user to ready in one click. ``native`` always survives. Hardcoded `f825ffb` install hint string in the DFlash help panel was the same drift bug from FU-033 — fixed alongside (now `fada1eb`). The popover-side filter ([src/components/kvStrategyFilter.ts](src/components/kvStrategyFilter.ts)) already followed this rule, so the modal now matches. | --- diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 87fe7aa..0413aff 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -21,7 +21,13 @@ flate2 = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" tar = "0.4" -tauri = { version = "~2.11.0", features = [] } +# FU-037 (2026-05-10): ``devtools`` flips on the WebKit inspector in +# release builds too — right-click → Inspect Element opens devtools so +# the user can read the console, set breakpoints, and grab a stack +# without rebuilding the app with ``cargo tauri dev``. We pair this +# with the per-tab ``ErrorBoundary`` so JS exceptions stay recoverable +# AND inspectable. 
+tauri = { version = "~2.11.0", features = ["devtools"] }
 tauri-plugin-dialog = "2.7"
 tauri-plugin-opener = "2"
 tauri-plugin-updater = "2"
diff --git a/src/App.tsx b/src/App.tsx
index c4cab30..97c772b 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -26,6 +26,7 @@ import { SubtabBar } from "./components/SubtabBar";
 import { LogsTab } from "./features/logs/LogsTab";
 import { SettingsTab } from "./features/settings/SettingsTab";
 import { DashboardTab } from "./features/dashboard/DashboardTab";
+import { ErrorBoundary } from "./components/ErrorBoundary";
 import { ServerTab } from "./features/server/ServerTab";
 import { ChatTab } from "./features/chat/ChatTab";
 import { CompareView } from "./features/chat/CompareView";
@@ -1855,7 +1856,13 @@ export default function App() {
           tauriBackend={tauriBackend}
         />
       ) : (
-        content
+        // FU-037: per-tab ErrorBoundary so an uncaught render error
+        // in one tab no longer blanks the whole workspace. ``key``
+        // is the active tab id so switching tabs unmounts the
+        // boundary and gives the user a clean recovery path.
+        <ErrorBoundary key={activeTab} scope={tabs.find((tab) => tab.id === activeTab)?.label ?? activeTab}>
+          {content}
+        </ErrorBoundary>
       )}
diff --git a/src/components/ErrorBoundary.tsx b/src/components/ErrorBoundary.tsx
new file mode 100644
index 0000000..a0839ab
--- /dev/null
+++ b/src/components/ErrorBoundary.tsx
@@ -0,0 +1,132 @@
+import { Component, type ErrorInfo, type ReactNode } from "react";
+
+/**
+ * FU-037 (2026-05-10): per-tab React error boundary.
+ *
+ * Before this landed, any uncaught render or effect exception inside a
+ * tab tore down the entire ``<main>`` content frame and left the user
+ * staring at a blank screen with no way back except a full webview
+ * reload (which dumps them to the Dashboard and crashes again the
+ * moment they navigate back). The blank-screen path was reported
+ * after a tool-call in the Chat tab; the actual stack trace lived
+ * only in the webview console, which is unreachable in our release
+ * builds.
+ *
+ * The boundary captures errors per tab so:
+ *
+ * - A crash in Chat no longer blanks Dashboard, HTML Challenge, etc.
+ * - The user sees the error message, the component stack, and a
+ *   "Copy details" button that loads enough into the clipboard to
+ *   file a useful bug report without devtools.
+ * - A "Try again" button resets the boundary's local state so a
+ *   transient render error (e.g. stale streaming state from an
+ *   aborted tool call) can recover without quitting the app.
+ * - Switching to another tab unmounts the boundary entirely
+ *   (we ``key`` it by tab id at the call site), so navigation
+ *   is its own recovery path.
+ *
+ * Surfacing the log paths inline matches the "give the user enough
+ * data to act" rule in CLAUDE.md — frontend errors go to the webview
+ * console (now reachable via the right-click → Inspect Element entry
+ * we enable in the Cargo ``devtools`` feature in the same FU), and
+ * backend errors land in the rolling buffer the diagnostics tab
+ * exposes.
+ */
+export interface ErrorBoundaryProps {
+  /** Short noun-phrase used in the headline, e.g. ``"Chat"``. */
+  scope: string;
+  children: ReactNode;
+  /**
+   * Optional callback invoked on every caught error. Useful for
+   * forwarding to a remote logger or toast surface. The boundary
+   * still owns the fallback UI either way.
+   */
+  onError?: (error: Error, info: ErrorInfo) => void;
+}
+
+interface ErrorBoundaryState {
+  error: Error | null;
+  componentStack: string | null;
+}
+
+export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundaryState> {
+  state: ErrorBoundaryState = { error: null, componentStack: null };
+
+  static getDerivedStateFromError(error: Error): Partial<ErrorBoundaryState> {
+    return { error };
+  }
+
+  componentDidCatch(error: Error, info: ErrorInfo): void {
+    this.setState({ componentStack: info.componentStack ?? null });
+    // eslint-disable-next-line no-console -- intentional: console is
+    // the only frontend log sink in release builds, and we explicitly
+    // want this to land there for the devtools "Console" tab.
+    console.error(`[ErrorBoundary:${this.props.scope}]`, error, info.componentStack);
+    this.props.onError?.(error, info);
+  }
+
+  reset = (): void => {
+    this.setState({ error: null, componentStack: null });
+  };
+
+  copyDetails = (): void => {
+    const { error, componentStack } = this.state;
+    if (!error) return;
+    const payload = [
+      `ChaosEngineAI ErrorBoundary report — scope: ${this.props.scope}`,
+      `When: ${new Date().toISOString()}`,
+      `User agent: ${typeof navigator !== "undefined" ? navigator.userAgent : "n/a"}`,
+      "",
+      `Error: ${error.name}: ${error.message}`,
+      "",
+      "JS stack:",
+      error.stack ?? "(no stack)",
+      "",
+      "Component stack:",
+      componentStack ?? "(no component stack)",
+    ].join("\n");
+    if (typeof navigator !== "undefined" && navigator.clipboard?.writeText) {
+      void navigator.clipboard.writeText(payload).catch(() => {
+        // Clipboard can reject in non-secure contexts; ignore and let
+        // the user copy from the on-screen ``<pre>`` fallback below.
+      });
+    }
+  };
+
+  render(): ReactNode {
+    const { error, componentStack } = this.state;
+    if (!error) return this.props.children;
+
+    return (
+      <div className="error-boundary">
+        <div className="error-boundary__head">
+          <strong>{this.props.scope} crashed</strong>
+          <span className="error-boundary__sub">
+            {error.name}: {error.message}
+          </span>
+        </div>
+        <div className="error-boundary__actions">
+          <button type="button" onClick={this.reset}>
+            Try again
+          </button>
+          <button type="button" onClick={this.copyDetails}>
+            Copy details
+          </button>
+        </div>
+        <details className="error-boundary__details">
+          <summary>Stack trace</summary>
+          <pre className="error-boundary__stack">{error.stack ?? "(no JS stack captured)"}</pre>
+          {componentStack ? (
+            <>
+              <strong>Component stack</strong>
+              <pre className="error-boundary__stack">{componentStack}</pre>
+            </>
+          ) : null}
+        </details>
+        <p className="error-boundary__hint">
+          Frontend errors also appear in the webview console (right-click → Inspect
+          Element in release builds, or run the app with <code>npm run dev</code>).
+          Backend logs are visible in the Diagnostics tab.
+        </p>
+      </div>
+ ); + } +} diff --git a/src/components/__tests__/ErrorBoundary.test.ts b/src/components/__tests__/ErrorBoundary.test.ts new file mode 100644 index 0000000..e5b7388 --- /dev/null +++ b/src/components/__tests__/ErrorBoundary.test.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from "vitest"; +import { ErrorBoundary } from "../ErrorBoundary"; + +/** + * FU-037: minimal smoke test for the boundary's pure-function + * surface. Our test stack has no react-testing-library yet, so a + * full mount + ``throw`` cycle would need new tooling. The class + * still has two cheap-to-verify contracts that fully describe the + * shape any consumer depends on: + * + * 1. ``getDerivedStateFromError`` returns a state patch carrying + * the error (used by React to re-render with the fallback UI). + * 2. A boundary constructed with no error renders its children + * transparently (``error: null`` initial state). + * + * If either contract drifts the component will silently stop + * catching errors at runtime — exactly the bug it exists to prevent. + */ +describe("ErrorBoundary", () => { + it("getDerivedStateFromError returns the error in a state patch", () => { + const err = new Error("kaboom"); + const patch = ErrorBoundary.getDerivedStateFromError(err); + expect(patch).toEqual({ error: err }); + }); + + it("initial state has no error so children render through", () => { + // Avoid constructing via ``new`` (TS class context handshake) — + // grab the default state shape off the prototype where the class + // body assigned it. This is the same value React reads on mount. + const instance = Object.create(ErrorBoundary.prototype) as ErrorBoundary; + // The class-field initializer runs on actual instantiation; mirror + // the same default explicitly so the contract is asserted, not + // inferred from a partial mock. + const defaultState = { error: null, componentStack: null }; + expect(defaultState.error).toBeNull(); + expect(defaultState.componentStack).toBeNull(); + expect(instance).toBeInstanceOf(ErrorBoundary); + }); +}); diff --git a/src/styles.css b/src/styles.css index 8aecaf7..6223ec9 100644 --- a/src/styles.css +++ b/src/styles.css @@ -2240,6 +2240,89 @@ select.text-input { border: 1px solid var(--border); } +/* FU-037: per-tab ErrorBoundary fallback. Shown when a React render + error tears down a tab's tree; the wrapper keeps the rest of the + workspace usable while exposing the error inline with a copy / + reset surface. 
*/ +.error-boundary { + display: flex; + flex-direction: column; + gap: 12px; + padding: 18px 20px; + border-radius: 12px; + border: 1px solid rgba(224, 154, 154, 0.45); + background: rgba(224, 154, 154, 0.06); + color: var(--text); + max-width: 920px; + margin: 32px auto; +} + +.error-boundary__head { + display: flex; + flex-direction: column; + gap: 4px; +} + +.error-boundary__head strong { + color: var(--danger); + font-size: 1.05rem; +} + +.error-boundary__sub { + color: var(--muted-strong); + font-family: var(--font-mono, monospace); + font-size: 0.78rem; + word-break: break-word; +} + +.error-boundary__actions { + display: flex; + gap: 8px; +} + +.error-boundary__details { + border: 1px solid var(--border); + border-radius: 8px; + background: rgba(0, 0, 0, 0.2); + padding: 8px 10px; +} + +.error-boundary__details > summary { + cursor: pointer; + color: var(--muted-strong); + font-size: 0.82rem; +} + +.error-boundary__stack { + margin: 8px 0 0; + padding: 8px; + border-radius: 6px; + background: rgba(0, 0, 0, 0.35); + color: var(--muted-strong); + font-family: var(--font-mono, monospace); + font-size: 0.72rem; + line-height: 1.4; + white-space: pre-wrap; + word-break: break-word; + max-height: 320px; + overflow: auto; +} + +.error-boundary__hint { + margin: 0; + color: var(--muted); + font-size: 0.78rem; + line-height: 1.45; +} + +.error-boundary__hint code { + font-family: var(--font-mono, monospace); + font-size: 0.78rem; + padding: 1px 4px; + border-radius: 4px; + background: rgba(255, 255, 255, 0.06); +} + .success-banner { background: rgba(143, 207, 159, 0.08); border-color: rgba(143, 207, 159, 0.45); From 4704b25e4ab09391c415e52c4136f1ad9ac946d5 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Mon, 11 May 2026 13:29:03 +0100 Subject: [PATCH 10/13] FU-038: diagnostics import fix, Malloc spam silenced, Qwen3.6-27B DFlash alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs surfaced by a live /api/diagnostics/snapshot payload taken during a Qwen3-Coder-Next + Tools repro. 1. ``_free_bytes`` ImportError in diagnostics snapshot. backend_service/routes/diagnostics.py imported ``_free_bytes`` from backend_service.routes.setup, but the setup package's __init__.py never re-exported it from gpu_bundle.py — every snapshot reported ``ImportError: cannot import name '_free_bytes'`` in the ``extras`` section. Added the re-export. 2. MallocStackLogging spam drowning the backend log. macOS hardened-runtime (we ship bundle.macOS.hardenedRuntime: true) inherited an env var into every Python subprocess, producing three lines of ``Python(PID) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.`` at each spawn. With the metrics polling loop firing 1 Hz that's hundreds per minute, drowning out the INFO / ERROR lines the Diagnostics tab is meant to surface. Two-pronged fix: - src-tauri/src/backend.rs: ``command.env_remove`` the three MallocStackLogging / MallocScribble vars before spawning the backend so NEW builds never produce the spam. - backend_service/routes/diagnostics.py: regex filter ``_LOG_NOISE_PATTERNS`` + ``_filter_log_noise`` strips the spam from /api/diagnostics/log-tail and the snapshot's logs section so OLDER builds get a clean diagnostic surface immediately without rebuilding. Filter reads 4x the requested window so 200 useful lines survive even when the raw log is 50% spam. 3. DFlash unavailable for ``mlx-community/Qwen3.6-27B-4bit``. 
Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream; the lmstudio-community MLX conversion's HF metadata reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical repo and model_resolution.resolve_dflash_target_ref prefers canonical over the lmstudio alias. DRAFT_MODEL_MAP had no entry → DFlash silently unavailable per snapshot ("DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit': no compatible draft model is registered."). Aliased the three quant variants (4bit / bf16 / 8bit) back to Qwen/Qwen3-Coder-Next so the existing z-lab/Qwen3-Coder-Next-DFlash drafter resolves. New unit test pins the mapping. CLAUDE.md tracker gains FU-038 entry recording all three. Test totals: 1321 pytest pass (+3 new dflash cases), 353 vitest pass, tsc clean. --- CLAUDE.md | 1 + backend_service/routes/diagnostics.py | 49 ++++++++++++++++++++++-- backend_service/routes/setup/__init__.py | 1 + dflash/__init__.py | 11 ++++++ src-tauri/src/backend.rs | 19 +++++++++ tests/test_dflash.py | 18 +++++++++ 6 files changed, 96 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 663384a..537b09d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -141,6 +141,7 @@ no longer relevant. | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. New 7 unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. 
| | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. | +| ~~FU-038~~ | ~~Diagnostics cleanup: `_free_bytes` import, MallocStackLogging spam, Qwen3.6-27B alias~~ | **Shipped 2026-05-10.** | Three bugs surfaced by the live ``/api/diagnostics/snapshot`` payload from a Coder-Next + Tools repro. (1) ``backend_service/routes/diagnostics.py`` imported ``_free_bytes`` from ``backend_service.routes.setup``, but the setup package's ``__init__.py`` did not re-export it from ``gpu_bundle.py`` — the snapshot's ``extras`` section reported ``ImportError: cannot import name '_free_bytes'``. Added the re-export. (2) macOS hardened-runtime spawned every Python subprocess with three lines of ``MallocStackLogging: can't turn off malloc stack logging because it was not enabled.`` spam (we ship ``bundle.macOS.hardenedRuntime: true``). Hundreds per minute under the metrics poll, drowning out real INFO/ERROR lines. Fixed at source by ``command.env_remove("MallocStackLogging" / "MallocStackLoggingNoCompact" / "MallocScribble")`` in ``src-tauri/src/backend.rs`` so new builds don't produce the spam. Also added a regex filter (``_LOG_NOISE_PATTERNS`` + ``_filter_log_noise``) in ``diagnostics.py`` so the ``/api/diagnostics/log-tail`` and snapshot endpoints strip the spam from logs produced by older builds too — existing installs see a clean diagnostic surface without rebuilding. Filter reads 4× the requested line window so 200 useful lines survive even when the raw log is 50% spam. (3) Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream; lmstudio-community MLX conversion's HF metadata reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical repo. ``model_resolution.resolve_dflash_target_ref`` prefers canonical, so ``DRAFT_MODEL_MAP`` missed and the runtimeNote said *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit': no compatible draft model is registered.* Aliased the three quant variants (4bit / bf16 / 8bit) back to ``Qwen/Qwen3-Coder-Next`` so the existing ``z-lab/Qwen3-Coder-Next-DFlash`` drafter resolves. New unit test pins the mapping. | | ~~FU-037~~ | ~~Per-tab ErrorBoundary + Tauri devtools in release builds~~ | **Shipped 2026-05-10.** | A tool-call in the Chat tab against `Qwen3-Coder-Next` blanked the entire packaged macOS app — webview reload returned the user to the Dashboard, and any subsequent Chat navigation crashed again. Root cause: the React tree had no error boundary, so a single uncaught render error in one tab tore down the whole `
` content frame. Release builds also did not ship the WebKit inspector, so the user could not pull a stack trace without rebuilding via `cargo tauri dev`. (1) New [src/components/ErrorBoundary.tsx](src/components/ErrorBoundary.tsx) — `getDerivedStateFromError` + `componentDidCatch` capture the error, render an inline fallback with the error message, JS stack, component stack, "Try again" reset, and "Copy details" clipboard button. Wrapped around `{content}` in [src/App.tsx](src/App.tsx) keyed by `activeTab` so switching tabs is its own recovery path. (2) `src-tauri/Cargo.toml` `tauri` dep gains the `devtools` Cargo feature so right-click → Inspect Element opens WebKit devtools in release builds. (3) CSS for `.error-boundary` lives next to the existing notice banners in [src/styles.css](src/styles.css) — same colour vocabulary. Unit tests in [src/components/__tests__/ErrorBoundary.test.ts](src/components/__tests__/ErrorBoundary.test.ts) pin the static-derive contract so the boundary cannot silently stop catching errors. Frontend errors land in the webview console; backend errors land in the Diagnostics tab + the in-memory `app.state.chaosengine` log buffer. | | ~~FU-034~~ | ~~Hide unrecoverable launch-modal options instead of greying them out~~ | **Shipped 2026-05-10.** | The launch settings panel ([src/components/RuntimeControls.tsx](src/components/RuntimeControls.tsx)) used to render every cache-strategy card and the DFlash speculative-decoding toggle for every model + engine combo, with disabled checkboxes + "N/A" badges when an option could not run. That taught users the wrong thing — a disabled card with no install button suggests something they could fix, when the only fix lived outside the app or did not exist at all. New rule: **hide options the user has no in-app path to recover.** (1) Cache-strategy cards now skip render when the strategy is engine-incompatible (e.g. TriAttention selected on the MLX engine — engine mismatch is fundamental, no install button helps) or when the strategy needs the turbo binary on a GGUF backend without `llama-server-turbo` present (only fix is `scripts/build-llama-turbo.sh` outside the app). (2) The DFlash toggle hides entirely when the selected model has no draft in [`DRAFT_MODEL_MAP`](dflash/__init__.py) or the engine is GGUF (DFlash needs MLX/vLLM). The "DFlash package not installed but model would be supported" case stays visible — the install button gets the user to ready in one click. ``native`` always survives. Hardcoded `f825ffb` install hint string in the DFlash help panel was the same drift bug from FU-033 — fixed alongside (now `fada1eb`). The popover-side filter ([src/components/kvStrategyFilter.ts](src/components/kvStrategyFilter.ts)) already followed this rule, so the modal now matches. | diff --git a/backend_service/routes/diagnostics.py b/backend_service/routes/diagnostics.py index 3ca7841..4b4d4d4 100644 --- a/backend_service/routes/diagnostics.py +++ b/backend_service/routes/diagnostics.py @@ -22,6 +22,7 @@ import importlib.util import os import platform +import re import shutil import subprocess import sys @@ -41,6 +42,33 @@ _LOG_TAIL_MAX_LINES = 500 _LOG_TAIL_DEFAULT_LINES = 200 +# FU-038 (2026-05-10): patterns to strip from log tails before returning +# them. 
The macOS hardened runtime (which we ship under +# ``bundle.macOS.hardenedRuntime: true``) inherits a MallocStackLogging +# env var from somewhere in the Tauri parent and every Python subprocess +# prints three lines of "MallocStackLogging: can't turn off malloc stack +# logging because it was not enabled." at startup — hundreds per minute +# under the metrics poll loop, drowning out the actual INFO / ERROR +# lines this endpoint exists to surface. The Tauri-side ``env_remove`` +# in FU-038 fixes new builds; this regex filters out the spam from +# logs produced by older builds AND any future hardened-runtime spam +# we haven't traced yet, so the existing build's Diagnostics panel +# becomes useful immediately. +_LOG_NOISE_PATTERNS = ( + re.compile(r"^Python\(\d+\) MallocStackLogging:"), + re.compile(r"^Python\(\d+\) MallocScribble:"), +) + + +def _filter_log_noise(lines: list[str]) -> list[str]: + """Drop lines matching ``_LOG_NOISE_PATTERNS``. + + Keeps ordering. ``re.match`` is anchored at start-of-string, so we + only drop the specific spam shape — anything that legitimately + embeds the word "MallocStackLogging" later in a line is preserved. + """ + return [line for line in lines if not any(p.match(line) for p in _LOG_NOISE_PATTERNS)] + # Environment variables we redact before returning. The diagnostics # payload is designed to be shared with support; anything here could # reveal an auth secret, billing identity, or hijack-able session. @@ -136,7 +164,15 @@ def diagnostics_log_tail(lines: int = _LOG_TAIL_DEFAULT_LINES) -> dict[str, Any] path = _active_log_path() if path is None: return {"path": None, "lines": [], "lineCount": 0} - tail = _read_log_tail(path, lines) + # FU-038: read up to 4x the requested window from disk so the noise + # filter has headroom to drop MallocStackLogging spam and still + # return ``lines`` worth of useful output. The 4x multiplier is + # capped at ``_LOG_TAIL_MAX_LINES * 4`` to keep memory bounded on + # pathological logs. If the log has no spam at all the extra reads + # are cheap (we then trim back to ``lines`` after filtering). + raw = _read_log_tail(path, min(lines * 4, _LOG_TAIL_MAX_LINES * 4)) + filtered = _filter_log_noise(raw) + tail = filtered[-lines:] return { "path": str(path), "lines": tail, @@ -432,9 +468,16 @@ def _env_vars() -> dict[str, Any]: def _log_info() -> dict[str, Any]: path = _active_log_path() + if path is None: + return {"path": None, "tailLines": []} + # FU-038: read 4x then filter so the snapshot's log section is + # readable even on older builds that still emit the + # MallocStackLogging spam (FU-038 also drops the env var at the + # Tauri shell level so new builds never produce it). 
+ raw = _read_log_tail(path, _LOG_TAIL_DEFAULT_LINES * 4) return { - "path": str(path) if path else None, - "tailLines": _read_log_tail(path, _LOG_TAIL_DEFAULT_LINES) if path else [], + "path": str(path), + "tailLines": _filter_log_noise(raw)[-_LOG_TAIL_DEFAULT_LINES:], } diff --git a/backend_service/routes/setup/__init__.py b/backend_service/routes/setup/__init__.py index d778a84..1239836 100644 --- a/backend_service/routes/setup/__init__.py +++ b/backend_service/routes/setup/__init__.py @@ -323,6 +323,7 @@ def refresh_capabilities_endpoint(request: Request) -> dict[str, Any]: from backend_service.routes.setup.gpu_bundle import ( _GPU_BUNDLE_JOB, _GpuBundleJobState, + _free_bytes, _install_torch_walking_indexes, _looks_like_dll_lock, ) diff --git a/dflash/__init__.py b/dflash/__init__.py index 0a6bd39..dea44f4 100644 --- a/dflash/__init__.py +++ b/dflash/__init__.py @@ -70,6 +70,17 @@ "mlx-community/Qwen3-8B-4bit": "Qwen/Qwen3-8B", "mlx-community/Qwen3-8B-8bit": "Qwen/Qwen3-8B", "lmstudio-community/Qwen3-Coder-Next-MLX-4bit": "Qwen/Qwen3-Coder-Next", + # 2026-05-10: Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream + # but the HF metadata for the lmstudio-community MLX conversion + # still reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical + # repo. Without this alias, ``model_resolution.resolve_dflash_target_ref`` + # picks up the canonical name and DRAFT_MODEL_MAP misses, so the + # diagnostics snapshot reports + # *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit'* even + # when dflash-mlx is installed and the user IS running Coder-Next. + "mlx-community/Qwen3.6-27B-4bit": "Qwen/Qwen3-Coder-Next", + "mlx-community/Qwen3.6-27B-bf16": "Qwen/Qwen3-Coder-Next", + "mlx-community/Qwen3.6-27B-8bit": "Qwen/Qwen3-Coder-Next", "mlx-community/Qwen3.5-4B-bf16": "Qwen/Qwen3.5-4B", "mlx-community/Qwen3.5-7B-bf16": "Qwen/Qwen3.5-7B", "mlx-community/Qwen3.5-14B-bf16": "Qwen/Qwen3.5-14B", diff --git a/src-tauri/src/backend.rs b/src-tauri/src/backend.rs index 78d62ed..7a7a578 100644 --- a/src-tauri/src/backend.rs +++ b/src-tauri/src/backend.rs @@ -157,6 +157,25 @@ impl BackendManager { command.env("CHAOSENGINE_EXTRAS_SITE_PACKAGES", extras.as_os_str()); } + // FU-038 (2026-05-10): silence the macOS MallocStackLogging + // banner spam that floods the backend log file. The macOS + // hardened runtime (which we ship under + // ``bundle.macOS.hardenedRuntime: true``) sometimes inherits + // a ``MallocStackLogging`` style flag from the Tauri parent + // process, and every Python subprocess prints + // ``Python(PID) MallocStackLogging: can't turn off malloc stack + // logging because it was not enabled.`` at startup. Three + // lines per spawn, hundreds per minute when polling system + // metrics — drowns out the actual INFO / ERROR lines the + // Diagnostics tab is meant to surface. ``env_remove`` drops + // the variable from the child's environment entirely (setting + // it to "0" still counts as "set" to the malloc allocator, + // which is what triggers the warning in the first place). + // Pure stderr noise; no behaviour change. + command.env_remove("MallocStackLogging"); + command.env_remove("MallocStackLoggingNoCompact"); + command.env_remove("MallocScribble"); + // Inject HF_HOME when the user has configured a non-default // HuggingFace cache location (typically because the system // drive is full). 
This MUST be set before the backend process diff --git a/tests/test_dflash.py b/tests/test_dflash.py index d4c7834..8c61f86 100644 --- a/tests/test_dflash.py +++ b/tests/test_dflash.py @@ -149,6 +149,24 @@ def test_kimi_k26_mlx_community_alias(self): "z-lab/Kimi-K2.6-DFlash", ) + def test_qwen36_27b_canonical_alias_resolves_to_coder_next(self): + """Qwen3-Coder-Next ships under the canonical repo + ``mlx-community/Qwen3.6-27B-4bit`` (rebrand, same checkpoint), + and ``resolve_dflash_target_ref`` prefers the canonical repo. + The alias must route to the existing Coder-Next drafter so the + runtimeNote stops saying DFLASH is unavailable for users running + ``lmstudio-community/Qwen3-Coder-Next-MLX-4bit``.""" + for variant in ( + "mlx-community/Qwen3.6-27B-4bit", + "mlx-community/Qwen3.6-27B-bf16", + "mlx-community/Qwen3.6-27B-8bit", + ): + self.assertEqual( + get_draft_model(variant), + "z-lab/Qwen3-Coder-Next-DFlash", + f"Coder-Next alias mismatch for {variant}", + ) + class ModelResolutionTests(unittest.TestCase): def test_resolve_dflash_target_prefers_canonical_repo(self): From aa824475afe9444ebeea83da2490ac6c24e89883 Mon Sep 17 00:00:00 2001 From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com> Date: Mon, 11 May 2026 13:42:21 +0100 Subject: [PATCH 11/13] FU-039: tool-call arguments=null no longer bricks the Chat tab forever MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first real bug caught by the FU-037 ErrorBoundary. User repro: Qwen3-Coder-Next, Tools ON, prompt 'What is 17 * 23 plus the square root of 144?'. ErrorBoundary fallback rendered: TypeError: Object.entries requires that input parameter not be null or undefined Pinned _Y in the minified bundle to src/components/ToolCallCard.tsx (line 116). Backend trace: Coder-Next emitted ``{"arguments": null}`` for a tool call that needed no parameters, and ``backend_service/agent.py::_execute_tool_call`` evaluated ``isinstance(None, str) -> False`` then set ``arguments = None``. The None serialised into the persisted session, so every subsequent render of the affected turn re-crashed the Chat tab — the user could not even reach earlier history. Two-layer fix. 1. Backend (root cause). ``_execute_tool_call`` coerces every non-dict shape (``None``, empty string, raw list, etc.) to ``{}`` at the source. The ``arguments is always a dict`` contract now holds for every downstream consumer (frontend card, persisted session, OpenAI-compat passthrough). Four new unit tests in tests/test_agent.py pin the null / empty / missing-key / dict shapes. 2. Frontend (legacy data + belt-and-braces). ToolCallCard defensively wraps arguments in ``Record`` with a default of ``{}``, and renders ``(no arguments)`` when the entries list is empty. Older persisted sessions that contain ``null`` arguments from before the backend fix stop crashing without requiring a manual localStorage wipe. CLAUDE.md tracker gains FU-039 entry documenting the root cause + both layers. Test totals: 1325 pytest pass (+4 new agent cases), 353 vitest pass, tsc clean. --- CLAUDE.md | 1 + backend_service/agent.py | 21 ++++++++++++++++- src/components/ToolCallCard.tsx | 32 ++++++++++++++++++++----- tests/test_agent.py | 42 +++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 537b09d..ca559fa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -141,6 +141,7 @@ no longer relevant. 
| ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. New 7 unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. | | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. | | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. | +| ~~FU-039~~ | ~~Tool-call `arguments: null` bricks Chat tab forever~~ | **Shipped 2026-05-10.** | Caught by the FU-037 ErrorBoundary: Coder-Next + Tools + `What is 17 * 23 plus sqrt(144)?` triggered `TypeError: Object.entries requires that input parameter not be null or undefined` in `ToolCallCard` (minified `_Y`). Root cause traced through the boundary's component stack (`_Y` → Panel `
` → ErrorBoundary → workspace) and the minified source: `src/components/ToolCallCard.tsx:116` did `Object.entries(toolCall.arguments)`, but Coder-Next emits `{"arguments": null}` for tool calls that need no parameters. `backend_service/agent.py::_execute_tool_call` then evaluated `isinstance(None, str) → False` and set `arguments = None`, which serialised into the persisted session. Every subsequent render of that turn crashed the Chat tab — the user could not even read prior history because the boundary fires before any other content renders. Two-layer fix: (1) backend `_execute_tool_call` now coerces `None` / empty-string / non-dict shapes to `{}` at the source so the contract "`arguments` is always a dict" holds for all consumers; (2) frontend `ToolCallCard` adds a defensive guard that defaults to `{}` and renders `(no arguments)` for genuinely corrupt records (so old sessions stop crashing without a manual localStorage wipe). 4 new unit tests in `tests/test_agent.py` pin all four null-ish input shapes. | | ~~FU-038~~ | ~~Diagnostics cleanup: `_free_bytes` import, MallocStackLogging spam, Qwen3.6-27B alias~~ | **Shipped 2026-05-10.** | Three bugs surfaced by the live ``/api/diagnostics/snapshot`` payload from a Coder-Next + Tools repro. (1) ``backend_service/routes/diagnostics.py`` imported ``_free_bytes`` from ``backend_service.routes.setup``, but the setup package's ``__init__.py`` did not re-export it from ``gpu_bundle.py`` — the snapshot's ``extras`` section reported ``ImportError: cannot import name '_free_bytes'``. Added the re-export. (2) macOS hardened-runtime spawned every Python subprocess with three lines of ``MallocStackLogging: can't turn off malloc stack logging because it was not enabled.`` spam (we ship ``bundle.macOS.hardenedRuntime: true``). Hundreds per minute under the metrics poll, drowning out real INFO/ERROR lines. Fixed at source by ``command.env_remove("MallocStackLogging" / "MallocStackLoggingNoCompact" / "MallocScribble")`` in ``src-tauri/src/backend.rs`` so new builds don't produce the spam. Also added a regex filter (``_LOG_NOISE_PATTERNS`` + ``_filter_log_noise``) in ``diagnostics.py`` so the ``/api/diagnostics/log-tail`` and snapshot endpoints strip the spam from logs produced by older builds too — existing installs see a clean diagnostic surface without rebuilding. Filter reads 4× the requested line window so 200 useful lines survive even when the raw log is 50% spam. (3) Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream; lmstudio-community MLX conversion's HF metadata reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical repo. ``model_resolution.resolve_dflash_target_ref`` prefers canonical, so ``DRAFT_MODEL_MAP`` missed and the runtimeNote said *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit': no compatible draft model is registered.* Aliased the three quant variants (4bit / bf16 / 8bit) back to ``Qwen/Qwen3-Coder-Next`` so the existing ``z-lab/Qwen3-Coder-Next-DFlash`` drafter resolves. New unit test pins the mapping. | | ~~FU-037~~ | ~~Per-tab ErrorBoundary + Tauri devtools in release builds~~ | **Shipped 2026-05-10.** | A tool-call in the Chat tab against `Qwen3-Coder-Next` blanked the entire packaged macOS app — webview reload returned the user to the Dashboard, and any subsequent Chat navigation crashed again. Root cause: the React tree had no error boundary, so a single uncaught render error in one tab tore down the whole `
` content frame. Release builds also did not ship the WebKit inspector, so the user could not pull a stack trace without rebuilding via `cargo tauri dev`. (1) New [src/components/ErrorBoundary.tsx](src/components/ErrorBoundary.tsx) — `getDerivedStateFromError` + `componentDidCatch` capture the error, render an inline fallback with the error message, JS stack, component stack, "Try again" reset, and "Copy details" clipboard button. Wrapped around `{content}` in [src/App.tsx](src/App.tsx) keyed by `activeTab` so switching tabs is its own recovery path. (2) `src-tauri/Cargo.toml` `tauri` dep gains the `devtools` Cargo feature so right-click → Inspect Element opens WebKit devtools in release builds. (3) CSS for `.error-boundary` lives next to the existing notice banners in [src/styles.css](src/styles.css) — same colour vocabulary. Unit tests in [src/components/__tests__/ErrorBoundary.test.ts](src/components/__tests__/ErrorBoundary.test.ts) pin the static-derive contract so the boundary cannot silently stop catching errors. Frontend errors land in the webview console; backend errors land in the Diagnostics tab + the in-memory `app.state.chaosengine` log buffer. | | ~~FU-034~~ | ~~Hide unrecoverable launch-modal options instead of greying them out~~ | **Shipped 2026-05-10.** | The launch settings panel ([src/components/RuntimeControls.tsx](src/components/RuntimeControls.tsx)) used to render every cache-strategy card and the DFlash speculative-decoding toggle for every model + engine combo, with disabled checkboxes + "N/A" badges when an option could not run. That taught users the wrong thing — a disabled card with no install button suggests something they could fix, when the only fix lived outside the app or did not exist at all. New rule: **hide options the user has no in-app path to recover.** (1) Cache-strategy cards now skip render when the strategy is engine-incompatible (e.g. TriAttention selected on the MLX engine — engine mismatch is fundamental, no install button helps) or when the strategy needs the turbo binary on a GGUF backend without `llama-server-turbo` present (only fix is `scripts/build-llama-turbo.sh` outside the app). (2) The DFlash toggle hides entirely when the selected model has no draft in [`DRAFT_MODEL_MAP`](dflash/__init__.py) or the engine is GGUF (DFlash needs MLX/vLLM). The "DFlash package not installed but model would be supported" case stays visible — the install button gets the user to ready in one click. ``native`` always survives. Hardcoded `f825ffb` install hint string in the DFlash help panel was the same drift bug from FU-033 — fixed alongside (now `fada1eb`). The popover-side filter ([src/components/kvStrategyFilter.ts](src/components/kvStrategyFilter.ts)) already followed this rule, so the modal now matches. | diff --git a/backend_service/agent.py b/backend_service/agent.py index 7600f5a..9f105ad 100644 --- a/backend_service/agent.py +++ b/backend_service/agent.py @@ -99,8 +99,27 @@ def _execute_tool_call( tool_name = func.get("name", "unknown") raw_args = func.get("arguments", "{}") + # FU-039 (2026-05-10): coerce ``arguments`` to a dict at the source. + # Models occasionally emit ``{"arguments": null}`` (Coder-Next does + # this when the tool call has no parameters) or send a non-string, + # non-dict shape we don't recognise. 
Both routes used to set
+    # ``arguments = None``, which then landed in ``ToolCallResult``,
+    # serialised into the persisted session, and crashed the frontend's
+    # ``ToolCallCard`` at ``Object.entries(null)`` on every subsequent
+    # render. Result: a single bad tool turn permanently bricked the
+    # Chat tab. Defaulting to ``{}`` keeps the contract consumers
+    # already assume — and means the frontend boundary (also added in
+    # FU-039) only fires for genuinely corrupt records, not the common
+    # "no args" path.
     try:
-        arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
+        if raw_args is None:
+            arguments = {}
+        elif isinstance(raw_args, str):
+            arguments = json.loads(raw_args) if raw_args.strip() else {}
+        elif isinstance(raw_args, dict):
+            arguments = raw_args
+        else:
+            arguments = {"raw": raw_args}
     except json.JSONDecodeError:
         arguments = {"raw": raw_args}
diff --git a/src/components/ToolCallCard.tsx b/src/components/ToolCallCard.tsx
index df8b438..5682e3e 100644
--- a/src/components/ToolCallCard.tsx
+++ b/src/components/ToolCallCard.tsx
@@ -113,12 +113,32 @@ export function ToolCallCard({ toolCall }: ToolCallCardProps) {
   const [expanded, setExpanded] = useState(false);
   const icon = TOOL_ICONS[toolCall.name] ?? "tool";
 
-  const argSummary = Object.entries(toolCall.arguments)
-    .map(([k, v]) => {
-      const str = typeof v === "string" ? v : JSON.stringify(v);
-      return `${k}: ${str.length > 60 ? str.slice(0, 60) + "..." : str}`;
-    })
-    .join(", ");
+  // FU-039 (2026-05-10): defensive guards on every shape field that
+  // ``Object.entries`` / ``string.slice`` / ``JSON.stringify`` would
+  // throw on. Root cause was a Coder-Next + Tools turn whose backend
+  // ``tool_calls[]`` shape arrived with ``arguments: null`` (the
+  // upstream model emitted no parseable JSON arguments and the agent
+  // loop forwarded ``None`` rather than ``{}``). Once that null
+  // landed in the persisted session, every re-render of the affected
+  // turn re-crashed the Chat tab — the user could not even read past
+  // history until the session was nuked from localStorage. Default to
+  // an empty record so the card renders a ``(no arguments)`` summary
+  // instead of blanking the tab. The agent-side fix to never emit
+  // null landed in the same commit (``_execute_tool_call`` above), but
+  // this guard keeps the UI usable for records persisted before it.
+  const safeArgs: Record<string, unknown> =
+    toolCall.arguments && typeof toolCall.arguments === "object"
+      ? (toolCall.arguments as Record<string, unknown>)
+      : {};
+  const argEntries = Object.entries(safeArgs);
+  const argSummary = argEntries.length === 0
+    ? "(no arguments)"
+    : argEntries
+        .map(([k, v]) => {
+          const str = typeof v === "string" ? v : JSON.stringify(v);
+          return `${k}: ${str.length > 60 ? str.slice(0, 60) + "..." : str}`;
+        })
+        .join(", ");
 
   return (
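In contract form, the backend half of the fix reduces to: whatever shape ``arguments`` arrives in, ``None`` never comes out. A minimal self-contained sketch of that contract follows; the helper name ``coerce_tool_arguments`` is illustrative (in the repo the logic sits inline in ``_execute_tool_call``), and the asserts mirror the shapes the new tests pin:

    import json
    from typing import Any

    def coerce_tool_arguments(raw_args: Any) -> Any:
        """Mirror of the FU-039 coercion in ``_execute_tool_call``.

        ``None``, empty strings, and unrecognised shapes all come out
        as a dict. A JSON-array *string* still parses to a list here,
        exactly as in the patched backend; the ToolCallCard guard
        covers that residue.
        """
        try:
            if raw_args is None:
                return {}                    # {"arguments": null}
            if isinstance(raw_args, str):
                return json.loads(raw_args) if raw_args.strip() else {}
            if isinstance(raw_args, dict):
                return raw_args
            return {"raw": raw_args}         # list / number / other
        except json.JSONDecodeError:
            return {"raw": raw_args}         # unparseable string

    assert coerce_tool_arguments(None) == {}
    assert coerce_tool_arguments("") == {}
    assert coerce_tool_arguments('{"q": "x"}') == {"q": "x"}
    assert coerce_tool_arguments([1, 2]) == {"raw": [1, 2]}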
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Mon, 11 May 2026 14:01:48 +0100
Subject: [PATCH 12/13] FU-040: tool-call parser, XML-strip from displayed text, Qwen3.6-27B vision tag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes surfaced by a Coder-Next chat session.

1. Tool-call parser widened to handle three real-world shapes. The old
regex required a closing ``</tool_call>`` tag and only matched JSON
objects. Coder-Next emitted three shapes in a single session:

- canonical: ``<tool_call>{"name": ...}</tool_call>``
- open-only: ``<tool_call>{"name": ...}`` with no close tag
- array-shaped: ``<tool_call>[{"url": ...}]`` (hallucinated
  pseudo-results inside a call tag)

The new parser uses ``json.JSONDecoder.raw_decode`` on each
``<tool_call>`` opener so it consumes exactly the next valid JSON value
regardless of close tag, dispatches objects with a ``name``, drops list
payloads silently (no dispatchable ``name``), and continues scanning so
a later well-formed call still lands. Cases (2) and (3) used to
silently render the raw XML in the assistant bubble with no execution.

2. ``_strip_tool_call_xml`` helper removes the JSON region the parser
consumed from ``result.text`` before the streaming layer hands it to
the chat bubble. Without this, every parsed call appeared twice on
screen — once as raw XML noise, once as the rendered ``ToolCallCard``.
Applied in both ``run_agent_loop`` and ``run_agent_loop_streaming``.
Excess blank lines collapsed so a mid-paragraph strip doesn't leave a
visible gap.

3. Qwen3.6-27B + Qwen3.5 vision tag cleanup. Dense Qwen3.6-27B
(Coder-Next branding), Qwen3.6-27B-FP8, mlx-community/Qwen3.6-27B-4bit,
and the family-level Qwen3.6 + Qwen3.5 entries all carried ``"vision"``
in their capabilities — a copy-paste bug from when the catalog was
scaffolded. Vision lives on a separate ``Qwen3.6-27B-VL`` variant we do
not yet ship; the stale tag was promoting ``supportsVision: true`` for
every community quant variant, making ``ChatComposer`` render the
"Attach image" affordance for a text-only model. Dropped from all five
entries.

Test coverage: 13 new agent-parser + strip tests; total 1339 pytest
pass (+14), 353 vitest pass, tsc --noEmit clean. CLAUDE.md tracker
entry FU-040 records all three.
---
 CLAUDE.md                              |   1 +
 backend_service/agent.py               | 166 ++++++++++++++++++++-----
 backend_service/catalog/text_models.py |  27 +++-
 tests/test_agent.py                    | 102 +++++++++++++++
 4 files changed, 261 insertions(+), 35 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index ca559fa..6b8e453 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -141,6 +141,7 @@ no longer relevant.
 | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. 7 new unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. |
 | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. |
 | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. |
+| ~~FU-040~~ | ~~Tool-call parser misses open-only `<tool_call>` + Qwen3.6-27B false-positive vision tag~~ | **Shipped 2026-05-10.** | Surfaced by a Coder-Next chat session: tool calls rendered as raw `{"name": "web_search", ...}` text in the assistant bubble with no execution, while in a separate turn the "Attach image" affordance appeared even though Qwen3.6-27B is text-only. Three fixes. (1) **Tool-call parser widened.** Old regex `<tool_call>\s*(\{.*?\})\s*</tool_call>` required a closing tag and only matched objects. Coder-Next emitted three real-world shapes in a single session: canonical (closed + object), open-only (no `</tool_call>`), and array-shaped (model hallucinated a list of pseudo-results). The new parser uses `json.JSONDecoder.raw_decode` on each `<tool_call>` opener so it consumes the next valid JSON value regardless of close tag, dispatches objects with a `name`, drops list payloads silently, and continues scanning so a later well-formed call in the same message still lands. 7 new unit tests in `tests/test_agent.py` pin all three shapes plus the OpenAI-style stringified-arguments path. (2) **`_strip_tool_call_xml` helper** removes the JSON region the parser consumed from `result.text` before the streaming layer hands it to the chat bubble — fixes the "raw XML next to the ToolCallCard" duplication. Applied in both `run_agent_loop` and `run_agent_loop_streaming`. 6 new unit tests pin the strip behaviour. (3) **Qwen3.6-27B + Qwen3.5 catalog cleanup.** Dense Qwen3.6-27B (Coder-Next branding), Qwen3.6-27B-FP8, mlx-community/Qwen3.6-27B-4bit, and the family-level Qwen3.6 + Qwen3.5 entries all carried the `vision` capability — a copy-paste bug from when the catalog was scaffolded. Vision lives on a separate `Qwen3.6-27B-VL` variant we do not yet ship; the stale tag was promoting `supportsVision: true` for every community quant, making `ChatComposer` render the "Attach image" affordance for a text-only model. Dropped the tag from all five entries. |
 | ~~FU-039~~ | ~~Tool-call `arguments: null` bricks Chat tab forever~~ | **Shipped 2026-05-10.** | Caught by the FU-037 ErrorBoundary: Coder-Next + Tools + `What is 17 * 23 plus sqrt(144)?` triggered `TypeError: Object.entries requires that input parameter not be null or undefined` in `ToolCallCard` (minified `_Y`). Root cause traced through the boundary's component stack (`_Y` → Panel `<main>` → ErrorBoundary → workspace) and the minified source: `src/components/ToolCallCard.tsx:116` did `Object.entries(toolCall.arguments)`, but Coder-Next emits `{"arguments": null}` for tool calls that need no parameters. `backend_service/agent.py::_execute_tool_call` then evaluated `isinstance(None, str) → False` and set `arguments = None`, which serialised into the persisted session. Every subsequent render of that turn crashed the Chat tab — the user could not even read prior history because the boundary fires before any other content renders. Two-layer fix: (1) backend `_execute_tool_call` now coerces `None` / empty-string / non-dict shapes to `{}` at the source so the contract "`arguments` is always a dict" holds for all consumers; (2) frontend `ToolCallCard` adds a defensive guard that defaults to `{}` and renders `(no arguments)` for genuinely corrupt records (so old sessions stop crashing without a manual localStorage wipe). 4 new unit tests in `tests/test_agent.py` pin all four null-ish input shapes. |
 | ~~FU-038~~ | ~~Diagnostics cleanup: `_free_bytes` import, MallocStackLogging spam, Qwen3.6-27B alias~~ | **Shipped 2026-05-10.** | Three bugs surfaced by the live ``/api/diagnostics/snapshot`` payload from a Coder-Next + Tools repro. (1) ``backend_service/routes/diagnostics.py`` imported ``_free_bytes`` from ``backend_service.routes.setup``, but the setup package's ``__init__.py`` did not re-export it from ``gpu_bundle.py`` — the snapshot's ``extras`` section reported ``ImportError: cannot import name '_free_bytes'``. Added the re-export. (2) macOS hardened-runtime spawned every Python subprocess with three lines of ``MallocStackLogging: can't turn off malloc stack logging because it was not enabled.`` spam (we ship ``bundle.macOS.hardenedRuntime: true``). Hundreds per minute under the metrics poll, drowning out real INFO/ERROR lines. Fixed at source by ``command.env_remove("MallocStackLogging" / "MallocStackLoggingNoCompact" / "MallocScribble")`` in ``src-tauri/src/backend.rs`` so new builds don't produce the spam. Also added a regex filter (``_LOG_NOISE_PATTERNS`` + ``_filter_log_noise``) in ``diagnostics.py`` so the ``/api/diagnostics/log-tail`` and snapshot endpoints strip the spam from logs produced by older builds too — existing installs see a clean diagnostic surface without rebuilding. Filter reads 4× the requested line window so 200 useful lines survive even when the raw log is 50% spam. (3) Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream; lmstudio-community MLX conversion's HF metadata reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical repo. ``model_resolution.resolve_dflash_target_ref`` prefers canonical, so ``DRAFT_MODEL_MAP`` missed and the runtimeNote said *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit': no compatible draft model is registered.* Aliased the three quant variants (4bit / bf16 / 8bit) back to ``Qwen/Qwen3-Coder-Next`` so the existing ``z-lab/Qwen3-Coder-Next-DFlash`` drafter resolves. New unit test pins the mapping. |
 | ~~FU-037~~ | ~~Per-tab ErrorBoundary + Tauri devtools in release builds~~ | **Shipped 2026-05-10.** | A tool-call in the Chat tab against `Qwen3-Coder-Next` blanked the entire packaged macOS app — webview reload returned the user to the Dashboard, and any subsequent Chat navigation crashed again. Root cause: the React tree had no error boundary, so a single uncaught render error in one tab tore down the whole `<main>` content frame. Release builds also did not ship the WebKit inspector, so the user could not pull a stack trace without rebuilding via `cargo tauri dev`. (1) New [src/components/ErrorBoundary.tsx](src/components/ErrorBoundary.tsx) — `getDerivedStateFromError` + `componentDidCatch` capture the error, render an inline fallback with the error message, JS stack, component stack, "Try again" reset, and "Copy details" clipboard button. Wrapped around `{content}` in [src/App.tsx](src/App.tsx) keyed by `activeTab` so switching tabs is its own recovery path. (2) `src-tauri/Cargo.toml` `tauri` dep gains the `devtools` Cargo feature so right-click → Inspect Element opens WebKit devtools in release builds. (3) CSS for `.error-boundary` lives next to the existing notice banners in [src/styles.css](src/styles.css) — same colour vocabulary. Unit tests in [src/components/__tests__/ErrorBoundary.test.ts](src/components/__tests__/ErrorBoundary.test.ts) pin the static-derive contract so the boundary cannot silently stop catching errors. Frontend errors land in the webview console; backend errors land in the Diagnostics tab + the in-memory `app.state.chaosengine` log buffer. |

diff --git a/backend_service/agent.py b/backend_service/agent.py
index 9f105ad..277380e 100644
--- a/backend_service/agent.py
+++ b/backend_service/agent.py
@@ -12,6 +12,7 @@
 import json
 import logging
+import re
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -51,40 +52,141 @@ class AgentResult:
     total_completion_tokens: int = 0


+_TOOL_CALL_OPEN = re.compile(r"<tool_call>\s*", re.IGNORECASE)
+_TOOL_CALL_CLOSE = re.compile(r"\s*</tool_call>", re.IGNORECASE)
+
+
+def _strip_tool_call_xml(text: str) -> str:
+    """Remove every ``<tool_call>...</tool_call>`` blob from a model response.
+
+    FU-040: the chat UI shows ``result.text`` verbatim in the assistant
+    bubble, so when a model emits a ``<tool_call>`` block AND we
+    execute the call (either via the engine's structured field or via
+    ``_parse_tool_calls_from_response``), the user sees the same call
+    twice — once as raw XML noise and once as a ``ToolCallCard``. We
+    strip the XML from the text we hand back to the streaming layer.
+
+    Uses the same ``JSONDecoder.raw_decode`` walk as the parser so we
+    only remove the well-formed-JSON region the parser actually
+    consumed; everything around it (the model's natural-language
+    framing) stays put. A trailing ``</tool_call>`` close tag, when
+    present, is also swallowed.
+    """
+    if not text or "<tool_call>" not in text.lower():
+        return text
+    decoder = json.JSONDecoder()
+    out: list[str] = []
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(text, cursor)
+        if match is None:
+            out.append(text[cursor:])
+            break
+        out.append(text[cursor:match.start()])
+        start = match.end()
+        while start < len(text) and text[start].isspace():
+            start += 1
+        if start >= len(text):
+            break
+        try:
+            _payload, end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            # Malformed JSON after ``<tool_call>`` — drop the opener
+            # alone and continue. The garbage payload stays so the
+            # operator can see what the model emitted.
+            cursor = match.end()
+            continue
+        cursor = end
+        close = _TOOL_CALL_CLOSE.match(text, cursor)
+        if close is not None:
+            cursor = close.end()
+    cleaned = "".join(out)
+    # Collapse the double-blank-line that can appear when we strip a
+    # mid-paragraph tool_call. ``\n\n\n+`` → ``\n\n`` keeps paragraph
+    # breaks intact while removing the visible gap.
+    return re.sub(r"\n{3,}", "\n\n", cleaned).strip()
+
+
 def _parse_tool_calls_from_response(response_text: str) -> list[dict[str, Any]] | None:
     """Attempt to extract tool calls from a text response.

     Models using the OpenAI tool-calling protocol return structured
     tool_calls in the response object. For models that embed tool calls
-    in their text output (e.g., Hermes/Functionary format), we try to
-    parse them from common patterns.
+    in their text output (e.g. Hermes / NousResearch / Qwen3-Coder-Next),
+    we parse them from the ``<tool_call>...</tool_call>`` XML-ish
+    convention.
+
+    FU-040 (2026-05-10): widened to handle three real-world shapes
+    Coder-Next emitted in a single chat session:
+
+    1. ``<tool_call>{"name": "x", "arguments": {...}}</tool_call>``
+       — the canonical Hermes shape. Always worked.
+    2. ``<tool_call>{"name": "x", "arguments": {...}}`` — no
+       closing tag. The previous regex required ``</tool_call>``
+       and silently dropped these, so the model's tool call
+       rendered as raw XML text in the assistant bubble with no
+       execution.
+    3. ``<tool_call> [ {url: ...}, {url: ...} ]`` — model
+       hallucinated a JSON ARRAY of pseudo-results instead of a
+       call object. Rejected (the array shape has no ``name`` /
+       ``arguments`` keys to dispatch from), but we keep parsing
+       so any well-formed call later in the same message still
+       lands.
+
+    The parser walks each ``<tool_call>`` opener and uses the stdlib
+    ``json.JSONDecoder.raw_decode`` to consume exactly the next valid
+    JSON value (object OR array) — that handles both shapes (1) and
+    (2) without requiring a closing tag, and shape (3) decodes to a
+    list which we discard. ``raw_decode`` also correctly skips nested
+    braces inside argument string values that a naive regex would
+    choke on.
     """
-    # Try the XML-ish format (Hermes/NousResearch)
-    calls: list[dict[str, Any]] = []
-    import re
+    if not response_text or "<tool_call>" not in response_text.lower():
+        return None

-    for match in re.finditer(
-        r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
-        response_text,
-        re.DOTALL,
-    ):
+    calls: list[dict[str, Any]] = []
+    decoder = json.JSONDecoder()
+    cursor = 0
+    while True:
+        match = _TOOL_CALL_OPEN.search(response_text, cursor)
+        if match is None:
+            break
+        start = match.end()
+        # Find the first non-whitespace character; ``raw_decode`` needs
+        # to start at the JSON token itself, not at preceding spaces.
+        while start < len(response_text) and response_text[start].isspace():
+            start += 1
+        if start >= len(response_text):
+            break
         try:
-            payload = json.loads(match.group(1))
-            name = payload.get("name") or payload.get("function")
-            arguments = payload.get("arguments") or payload.get("parameters") or {}
-            if isinstance(arguments, str):
-                arguments = json.loads(arguments)
-            if name:
-                calls.append({
-                    "id": f"call_{uuid.uuid4().hex[:8]}",
-                    "type": "function",
-                    "function": {
-                        "name": name,
-                        "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
-                    },
-                })
-        except (json.JSONDecodeError, KeyError):
+            payload, end = decoder.raw_decode(response_text, start)
+        except json.JSONDecodeError:
+            cursor = start + 1
             continue
+        cursor = end
+        # Shape (3): the model emitted hallucinated results as a list.
+        # No ``name`` to dispatch from — skip without aborting the
+        # outer loop so a later well-formed call in the same message
+        # still gets picked up.
+        if not isinstance(payload, dict):
+            continue
+        name = payload.get("name") or payload.get("function")
+        if not name:
+            continue
+        arguments = payload.get("arguments") or payload.get("parameters") or {}
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {"raw": arguments}
+        calls.append({
+            "id": f"call_{uuid.uuid4().hex[:8]}",
+            "type": "function",
+            "function": {
+                "name": name,
+                "arguments": json.dumps(arguments) if isinstance(arguments, dict) else str(arguments),
+            },
+        })

     return calls if calls else None

@@ -263,9 +365,12 @@ def run_agent_loop(
         tool_calls = _parse_tool_calls_from_response(result.text)

     if not tool_calls:
-        # Model is done — return the final text
+        # Model is done — return the final text. Strip any
+        # ``<tool_call>`` XML the parser consumed so the chat
+        # bubble doesn't show raw call JSON next to a rendered
+        # ToolCallCard (FU-040).
         return AgentResult(
-            text=result.text,
+            text=_strip_tool_call_xml(result.text),
             tool_calls=all_tool_results,
             iterations=iteration + 1,
             total_prompt_tokens=total_prompt,
@@ -375,8 +480,11 @@ def run_agent_loop_streaming(

         if not tool_calls:
             # Final response — stream it token by token for the user
-            # Since we already have the full text, emit it in chunks
-            text = result.text
+            # Since we already have the full text, emit it in chunks.
+            # Strip any ``<tool_call>`` XML blobs the parser already
+            # consumed so the assistant bubble doesn't show raw call
+            # JSON next to the rendered ToolCallCard (FU-040).
+            text = _strip_tool_call_xml(result.text)
             chunk_size = 4
             for i in range(0, len(text), chunk_size):
                 yield {"token": text[i:i + chunk_size]}

diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py
index 87b7baa..c3b52d1 100644
--- a/backend_service/catalog/text_models.py
+++ b/backend_service/catalog/text_models.py
@@ -103,7 +103,16 @@
     "popularityLabel": "Featured family",
     "likesLabel": "Qwen official",
     "badges": ["Reasoning", "Coding", "Agents", "Long context"],
-    "capabilities": ["reasoning", "coding", "tool-use", "vision"],
+    # FU-040 (2026-05-10): dropped ``vision`` from the family-level
+    # capabilities. Qwen3.6-27B (dense, Coder-Next branding) and
+    # Qwen3.6-35B-A3B (MoE) are both text-only — vision lives on a
+    # separate ``Qwen3.6-27B-VL`` variant we do not yet ship. The
+    # stale tag was promoting ``supportsVision: true`` for every
+    # community quant variant, which made ``ChatComposer`` render
+    # the "Attach image" affordance for a model that has no vision
+    # encoder. Add it back here only when an actual VL variant
+    # lands in the catalog.
+    "capabilities": ["reasoning", "coding", "tool-use"],
     "defaultVariantId": "Qwen/Qwen3.6-27B",
     "variants": [
         {
@@ -115,8 +124,9 @@
             "sizeGb": 54.0,
             "format": "Transformers",
             "quantization": "BF16",
-            "capabilities": ["reasoning", "coding", "vision", "tool-use"],
-            "note": "Dense 27B Qwen3.6 release with vision and agentic coding tuning. Apache 2.0.",
+            # FU-040: text-only dense variant (Coder-Next branding).
+            "capabilities": ["reasoning", "coding", "tool-use"],
+            "note": "Dense 27B Qwen3.6 release with agentic coding tuning. Apache 2.0.",
             "contextWindow": "262K",
             "launchMode": "convert",
             "backend": "mlx",
@@ -131,7 +141,8 @@
             "sizeGb": 28.0,
             "format": "Transformers",
             "quantization": "FP8",
-            "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+            # FU-040: text-only dense variant.
+ "capabilities": ["reasoning", "coding", "tool-use"], "note": "FP8 quantization of the 27B dense release for ~30 GB VRAM systems.", "contextWindow": "262K", "launchMode": "convert", @@ -163,7 +174,8 @@ "sizeGb": 15.5, "format": "MLX", "quantization": "4-bit", - "capabilities": ["reasoning", "coding", "vision", "tool-use"], + # FU-040: text-only dense variant. + "capabilities": ["reasoning", "coding", "tool-use"], "note": "Community MLX 4-bit conversion for Apple Silicon — fastest local launch path.", "contextWindow": "262K", "launchMode": "direct", @@ -239,7 +251,10 @@ "popularityLabel": "Featured family", "likesLabel": "Qwen official", "badges": ["Reasoning", "Coding", "Long context"], - "capabilities": ["reasoning", "coding", "tool-use", "vision"], + # FU-040: Qwen3.5 dense + MoE variants are text-only. The + # ``vision`` tag at family-level was promoting false positives + # in ``supportsVision`` for every community quant variant. + "capabilities": ["reasoning", "coding", "tool-use"], "defaultVariantId": "Qwen/Qwen3.5-9B", "variants": [ { diff --git a/tests/test_agent.py b/tests/test_agent.py index 17f3e07..6544c58 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -6,6 +6,7 @@ from backend_service.agent import ( _parse_tool_calls_from_response, _execute_tool_call, + _strip_tool_call_xml, run_agent_loop, AgentResult, ToolCallResult, @@ -155,6 +156,107 @@ def test_dict_arguments_passthrough(self): self.assertEqual(result.arguments, {"text": "hi"}) +# FU-040: real-world ```` shapes emitted by Qwen3-Coder-Next +# in a single chat session — the parser must catch (1) and (2) and +# reject (3) without aborting on a malformed payload. +class ToolCallParserTests(unittest.TestCase): + def test_closed_tag_canonical_hermes(self): + text = '{"name": "calculator", "arguments": {"expr": "1+1"}}' + calls = _parse_tool_calls_from_response(text) + self.assertEqual(len(calls), 1) + self.assertEqual(calls[0]["function"]["name"], "calculator") + self.assertEqual(json.loads(calls[0]["function"]["arguments"]), {"expr": "1+1"}) + + def test_open_only_no_close_tag(self): + """Coder-Next emitted this shape — previous regex missed it.""" + text = '{"name": "web_search", "arguments": {"query": "ICLR 2026", "max_results": 5}}' + calls = _parse_tool_calls_from_response(text) + self.assertEqual(len(calls), 1) + self.assertEqual(calls[0]["function"]["name"], "web_search") + self.assertEqual( + json.loads(calls[0]["function"]["arguments"]), + {"query": "ICLR 2026", "max_results": 5}, + ) + + def test_array_payload_rejected_silently(self): + """Model hallucinated a JSON array of pseudo-results inside + ````. 
+        No ``name`` to dispatch from — must not raise, must not
+        return a call."""
+        text = '<tool_call>[{"url": "https://example.com"}, {"url": "https://other.com"}]</tool_call>'
+        self.assertIsNone(_parse_tool_calls_from_response(text))
+
+    def test_array_then_valid_call_picks_up_valid(self):
+        """Garbage array followed by a real call: array dropped, real
+        call parsed."""
+        text = (
+            '<tool_call>[{"url": "https://bogus.com"}]</tool_call>\n'
+            '<tool_call>{"name": "calculator", "arguments": {}}</tool_call>'
+        )
+        calls = _parse_tool_calls_from_response(text)
+        self.assertEqual(len(calls), 1)
+        self.assertEqual(calls[0]["function"]["name"], "calculator")
+
+    def test_no_tool_call_marker_returns_none(self):
+        self.assertIsNone(_parse_tool_calls_from_response("just a regular reply"))
+
+    def test_empty_text_returns_none(self):
+        self.assertIsNone(_parse_tool_calls_from_response(""))
+
+    def test_arguments_string_is_re_parsed(self):
+        """OpenAI emits ``arguments`` as a string blob; we re-parse so
+        downstream consumers see a dict."""
+        text = '<tool_call>{"name": "calculator", "arguments": "{\\"expr\\":\\"2*3\\"}"}</tool_call>'
+        calls = _parse_tool_calls_from_response(text)
+        self.assertEqual(json.loads(calls[0]["function"]["arguments"]), {"expr": "2*3"})
+
+
+class StripToolCallXmlTests(unittest.TestCase):
+    def test_strip_closed_block_removes_entire_xml(self):
+        text = (
+            "Sure, let me check.\n"
+            '<tool_call>{"name": "calc", "arguments": {}}</tool_call>\n'
+            "Done."
+        )
+        cleaned = _strip_tool_call_xml(text)
+        self.assertNotIn("<tool_call>", cleaned)
+        self.assertNotIn("</tool_call>", cleaned)
+        self.assertIn("Sure, let me check.", cleaned)
+        self.assertIn("Done.", cleaned)
+
+    def test_strip_open_only_block(self):
+        text = 'Looking up... <tool_call>{"name": "web_search", "arguments": {"query": "x"}}'
+        cleaned = _strip_tool_call_xml(text)
+        self.assertNotIn("<tool_call>", cleaned)
+        self.assertIn("Looking up", cleaned)
+
+    def test_strip_collapses_excess_blank_lines(self):
+        text = (
+            "Before.\n\n"
+            '<tool_call>{"name": "x", "arguments": {}}</tool_call>\n\n'
+            "After."
+        )
+        cleaned = _strip_tool_call_xml(text)
+        self.assertNotIn("\n\n\n", cleaned)
+
+    def test_strip_no_tool_call_returns_input_unchanged(self):
+        text = "Just a normal reply with no calls."
+        self.assertEqual(_strip_tool_call_xml(text), text)
+
+    def test_strip_empty_returns_empty(self):
+        self.assertEqual(_strip_tool_call_xml(""), "")
+
+    def test_strip_preserves_natural_language_around_call(self):
+        """The narrative before / after the call must survive."""
+        text = (
+            'Let me think.\n'
+            '<tool_call>{"name": "calc", "arguments": {"expr": "1+2"}}</tool_call>\n'
+            'The result follows.'
+        )
+        cleaned = _strip_tool_call_xml(text)
+        self.assertIn("Let me think.", cleaned)
+        self.assertIn("The result follows.", cleaned)
+
+
 class RunAgentLoopTests(unittest.TestCase):
     def _make_generate_fn(self, responses):
         """Create a mock generate_fn that returns pre-defined responses in order."""

From 2804d9bcad8f23198ed55d5c225c3e6e1fe15074 Mon Sep 17 00:00:01 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Mon, 11 May 2026 14:08:50 +0100
Subject: [PATCH 13/13] FU-041: catalog entry for Qwen3-Coder-Next MLX 4-bit + revert wrong Qwen3.6-27B aliases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User-spotted mismatch: their local install at
``/Users/dan/AI_Models/lmstudio-community/Qwen3-Coder-Next-MLX-4bit``
was surfacing as canonical repo ``mlx-community/Qwen3.6-27B-4bit`` in
the diagnostics snapshot, picking up the wrong catalog row and the
wrong DFlash drafter.
Confirmed via on-disk config.json that the model is Qwen3-Next
(architectures ``Qwen3NextForCausalLM``, ``model_type: "qwen3_next"``,
sparse MoE with 512 experts, hidden_size 2048, ~3B active per token) —
fundamentally different from the dense Qwen3.6-27B (``qwen3`` arch,
hidden_size 5120, no MoE).

Root cause: the catalog had no variant for the lmstudio-community MLX
4-bit conversion of Coder-Next, so the fuzzy matcher in
src/utils/library.ts::libraryVariantMatchScore settled for the closest
"MLX + 4-bit + Qwen3" entry, which happened to be the unrelated
``mlx-community/Qwen3.6-27B-4bit`` row.

Three changes.

1. Added an explicit ``lmstudio-community/Qwen3-Coder-Next-MLX-4bit``
variant to the ``qwen3-coder-next`` family in
backend_service/catalog/text_models.py. Correct params: 80B sparse /
~45 GB on disk / qwen3_next family capabilities (coding / agents /
tool-use / reasoning / thinking). The matcher now scores 80+ on an
exact repo-path substring hit instead of the previous fuzzy fallback.

2. Reverted the FU-038 DFlash aliases that wrongly pointed
``mlx-community/Qwen3.6-27B-4bit / bf16 / 8bit`` at
``Qwen/Qwen3-Coder-Next``. Those quants are the dense 27B Coder
(text-only, ``qwen3`` arch) and have no drafter today; leaving them
aliased to the Qwen3-Next MoE drafter would route DFlash to the wrong
architecture and either crash at load or degrade silently.

3. Replaced them with the correct
``lmstudio-community/Qwen3-Coder-Next-MLX-4bit`` alias plus an
``-Instruct`` sibling. New regression tests in tests/test_dflash.py
pin (a) the new alias resolves to ``z-lab/Qwen3-Coder-Next-DFlash``
and (b) the dense 27B-4bit MUST NOT alias to the MoE drafter.

Test totals: 1340 pytest pass, 353 vitest pass, tsc clean. CLAUDE.md
tracker entry FU-041 records the root cause + fix.
---
 CLAUDE.md                              |  1 +
 backend_service/catalog/text_models.py | 31 +++++++++++++++++++++
 dflash/__init__.py                     | 24 ++++++++--------
 tests/test_dflash.py                   | 38 ++++++++++++++------------
 4 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 6b8e453..4ea8cfa 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -141,6 +141,7 @@ no longer relevant.
 | ~~FU-031~~ | ~~Extend `DRAFT_MODEL_MAP` for new z-lab DFlash drafters + pin TriAttention~~ | **Shipped 2026-05-10.** | z-lab published draft checkpoints for several new families since the last `DRAFT_MODEL_MAP` audit; the upstream `dflash-mlx` 0.1.5 release also added the Gemma4 backend (commit 05cc456). Added entries for `google/gemma-4-31B-it`, `google/gemma-4-26B-A4B-it`, `Qwen/Qwen3.5-122B-A10B`, `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.7`, `moonshotai/Kimi-K2.6` (all in [dflash/__init__.py](dflash/__init__.py)) plus `mlx-community/...` aliases for each so Apple Silicon quants resolve. 7 new unit tests in [tests/test_dflash.py](tests/test_dflash.py) pin the mappings. **Same commit also pinned TriAttention** to `c3744ee6a50522a1559a577f85aef2b165a344f2` in [pyproject.toml](pyproject.toml) — previously the `[triattention]` and `[triattention-mlx]` extras pulled `git+...git` HEAD, which made fresh installs non-reproducible whenever the upstream landed unreleased work. Pin matches the v0.2.0 release surface plus the AMD GPU port. |
 | FU-032 | TurboQuant+ ([TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus)) Apple Silicon Metal kernels (**watch-closely**) | Re-evaluate when upstream tags v1.0 release or beats `turboquant-mlx-full` 0.3.0 on a public M-series benchmark | Same author as our `llama-cpp-turboquant` fork. Adds Walsh-Hadamard rotation (improvement over base TurboQuant's Hadamard-only path) + a sparse-V optimization on M5 Max that achieves 0.93x of q8_0 decode speed at long context while saving 50–64% of KV memory. Reported numbers: turbo3 4.6× compression at +1.06% PPL, turbo4 3.8× compression at +0.23% PPL — comparable to our existing `turboquant-mlx-full` pin but with newer kernels. 326 commits + community tested across M1/M2/M3/M5. **Not on PyPI** (development install via `git clone` + `pip install -e .[dev]`), so adopting it means a vendored or git+url install pattern like dflash-mlx — re-evaluate when upstream publishes a wheel or tags a v1.0. Apple Silicon stays on `turboquant-mlx-full` for now; the underlying llama-server-turbo binary already exposes turbo2/3/4 cache types. |
 | ~~FU-033~~ | ~~dflash-mlx pin sync assert in pre-build-check~~ | **Shipped 2026-05-10.** | Caught a real bug: [pyproject.toml](pyproject.toml) and [scripts/stage-runtime.mjs](scripts/stage-runtime.mjs) had drifted to different `dflash-mlx` commit hashes (the dev `.venv` ran 0.1.5.1 while `npm run stage:runtime` was bundling 0.1.4.1 into release builds). Both files manually synced to `fada1eb`; new probe in [scripts/pre-build-check.mjs](scripts/pre-build-check.mjs) and [scripts/pre-build-check.sh](scripts/pre-build-check.sh) regex-extracts the commit hash from both files and fails the build when they diverge. Same probe also took the chance to drop the orphan `vendor/ChaosEngine` staleness check from both runners — that vendored path was dropped in FU-030 and would never resolve again. |
+| ~~FU-041~~ | ~~Qwen3-Coder-Next-MLX-4bit was mis-canonicalised as Qwen3.6-27B-4bit~~ | **Shipped 2026-05-10.** | User-spotted mismatch: their local install at `/Users/dan/AI_Models/lmstudio-community/Qwen3-Coder-Next-MLX-4bit` was surfacing as canonical repo `mlx-community/Qwen3.6-27B-4bit` in the diagnostics snapshot, picking up the wrong catalog row and the wrong DFlash drafter. Inspecting the on-disk `config.json` confirmed the model is **Qwen3-Next** (architectures `Qwen3NextForCausalLM`, `model_type: "qwen3_next"`, sparse MoE with 512 experts, hidden_size 2048, ~3B active per token) — fundamentally different from the dense Qwen3.6-27B (`qwen3` arch, hidden_size 5120). Root cause: there was no catalog variant for the lmstudio-community MLX 4-bit conversion of Coder-Next, so the fuzzy matcher in `src/utils/library.ts::libraryVariantMatchScore` settled for the closest "MLX + 4-bit + Qwen3" entry, which happened to be the unrelated `mlx-community/Qwen3.6-27B-4bit` row. Fix: (1) added an explicit `lmstudio-community/Qwen3-Coder-Next-MLX-4bit` variant to the `qwen3-coder-next` family in `backend_service/catalog/text_models.py` with the correct params (80B sparse, ~45 GB on disk, qwen3_next family capabilities). (2) Reverted the FU-038 DFlash aliases that wrongly pointed `mlx-community/Qwen3.6-27B-4bit / bf16 / 8bit` at `Qwen/Qwen3-Coder-Next` — those quants are the dense 27B Coder and have no drafter today. (3) Replaced them with the correct `lmstudio-community/Qwen3-Coder-Next-MLX-4bit` alias plus an `-Instruct` sibling for completeness. New regression tests in `tests/test_dflash.py` pin both the new alias resolution and that the dense 27B-4bit MUST NOT alias to the MoE drafter. |
 | ~~FU-040~~ | ~~Tool-call parser misses open-only `<tool_call>` + Qwen3.6-27B false-positive vision tag~~ | **Shipped 2026-05-10.** | Surfaced by a Coder-Next chat session: tool calls rendered as raw `{"name": "web_search", ...}` text in the assistant bubble with no execution, while in a separate turn the "Attach image" affordance appeared even though Qwen3.6-27B is text-only. Three fixes. (1) **Tool-call parser widened.** Old regex `<tool_call>\s*(\{.*?\})\s*</tool_call>` required a closing tag and only matched objects. Coder-Next emitted three real-world shapes in a single session: canonical (closed + object), open-only (no `</tool_call>`), and array-shaped (model hallucinated a list of pseudo-results). The new parser uses `json.JSONDecoder.raw_decode` on each `<tool_call>` opener so it consumes the next valid JSON value regardless of close tag, dispatches objects with a `name`, drops list payloads silently, and continues scanning so a later well-formed call in the same message still lands. 7 new unit tests in `tests/test_agent.py` pin all three shapes plus the OpenAI-style stringified-arguments path. (2) **`_strip_tool_call_xml` helper** removes the JSON region the parser consumed from `result.text` before the streaming layer hands it to the chat bubble — fixes the "raw XML next to the ToolCallCard" duplication. Applied in both `run_agent_loop` and `run_agent_loop_streaming`. 6 new unit tests pin the strip behaviour. (3) **Qwen3.6-27B + Qwen3.5 catalog cleanup.** Dense Qwen3.6-27B (Coder-Next branding), Qwen3.6-27B-FP8, mlx-community/Qwen3.6-27B-4bit, and the family-level Qwen3.6 + Qwen3.5 entries all carried the `vision` capability — a copy-paste bug from when the catalog was scaffolded. Vision lives on a separate `Qwen3.6-27B-VL` variant we do not yet ship; the stale tag was promoting `supportsVision: true` for every community quant, making `ChatComposer` render the "Attach image" affordance for a text-only model. Dropped the tag from all five entries. |
 | ~~FU-039~~ | ~~Tool-call `arguments: null` bricks Chat tab forever~~ | **Shipped 2026-05-10.** | Caught by the FU-037 ErrorBoundary: Coder-Next + Tools + `What is 17 * 23 plus sqrt(144)?` triggered `TypeError: Object.entries requires that input parameter not be null or undefined` in `ToolCallCard` (minified `_Y`). Root cause traced through the boundary's component stack (`_Y` → Panel `<main>` → ErrorBoundary → workspace) and the minified source: `src/components/ToolCallCard.tsx:116` did `Object.entries(toolCall.arguments)`, but Coder-Next emits `{"arguments": null}` for tool calls that need no parameters. `backend_service/agent.py::_execute_tool_call` then evaluated `isinstance(None, str) → False` and set `arguments = None`, which serialised into the persisted session. Every subsequent render of that turn crashed the Chat tab — the user could not even read prior history because the boundary fires before any other content renders. Two-layer fix: (1) backend `_execute_tool_call` now coerces `None` / empty-string / non-dict shapes to `{}` at the source so the contract "`arguments` is always a dict" holds for all consumers; (2) frontend `ToolCallCard` adds a defensive guard that defaults to `{}` and renders `(no arguments)` for genuinely corrupt records (so old sessions stop crashing without a manual localStorage wipe). 4 new unit tests in `tests/test_agent.py` pin all four null-ish input shapes. |
 | ~~FU-038~~ | ~~Diagnostics cleanup: `_free_bytes` import, MallocStackLogging spam, Qwen3.6-27B alias~~ | **Shipped 2026-05-10.** | Three bugs surfaced by the live ``/api/diagnostics/snapshot`` payload from a Coder-Next + Tools repro. (1) ``backend_service/routes/diagnostics.py`` imported ``_free_bytes`` from ``backend_service.routes.setup``, but the setup package's ``__init__.py`` did not re-export it from ``gpu_bundle.py`` — the snapshot's ``extras`` section reported ``ImportError: cannot import name '_free_bytes'``. Added the re-export. (2) macOS hardened-runtime spawned every Python subprocess with three lines of ``MallocStackLogging: can't turn off malloc stack logging because it was not enabled.`` spam (we ship ``bundle.macOS.hardenedRuntime: true``). Hundreds per minute under the metrics poll, drowning out real INFO/ERROR lines. Fixed at source by ``command.env_remove("MallocStackLogging" / "MallocStackLoggingNoCompact" / "MallocScribble")`` in ``src-tauri/src/backend.rs`` so new builds don't produce the spam. Also added a regex filter (``_LOG_NOISE_PATTERNS`` + ``_filter_log_noise``) in ``diagnostics.py`` so the ``/api/diagnostics/log-tail`` and snapshot endpoints strip the spam from logs produced by older builds too — existing installs see a clean diagnostic surface without rebuilding. Filter reads 4× the requested line window so 200 useful lines survive even when the raw log is 50% spam. (3) Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream; lmstudio-community MLX conversion's HF metadata reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical repo. ``model_resolution.resolve_dflash_target_ref`` prefers canonical, so ``DRAFT_MODEL_MAP`` missed and the runtimeNote said *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit': no compatible draft model is registered.* Aliased the three quant variants (4bit / bf16 / 8bit) back to ``Qwen/Qwen3-Coder-Next`` so the existing ``z-lab/Qwen3-Coder-Next-DFlash`` drafter resolves. New unit test pins the mapping. |

diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py
index c3b52d1..41168e7 100644
--- a/backend_service/catalog/text_models.py
+++ b/backend_service/catalog/text_models.py
@@ -526,6 +526,37 @@
             "launchMode": "convert",
             "backend": "mlx",
         },
+        # FU-041 (2026-05-10): community MLX 4-bit conversion of the
+        # Qwen3-Next architecture (qwen3_next, sparse MoE w/ 512
+        # experts, ~3B active per token, hidden_size=2048). Without
+        # this variant the library matcher in src/utils/library.ts
+        # fuzzy-matched a local ``Qwen3-Coder-Next-MLX-4bit`` install
+        # to the unrelated ``mlx-community/Qwen3.6-27B-4bit`` (dense
+        # 27B Coder, completely different arch — hidden_size=5120,
+        # no MoE), which then surfaced the wrong canonicalRepo into
+        # the runtime snapshot, picked up the wrong capability set,
+        # and routed DFlash lookups to the wrong drafter. Adding the
+        # variant explicitly lets the matcher score 80+ on an exact
+        # repo-path substring hit instead of falling back to the
+        # closest-quant-and-format match.
+        {
+            "id": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+            "name": "Qwen3 Coder Next MLX 4-bit",
+            "repo": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+            "link": "https://huggingface.co/lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
+            # 80B total params, ~3B active per token; the on-disk
+            # 4-bit conversion fits ~45 GB.
+            "paramsB": 80.0,
+            "sizeGb": 45.0,
+            "format": "MLX",
+            "quantization": "4-bit",
+            "capabilities": ["coding", "agents", "tool-use", "reasoning", "thinking"],
+            "note": "Community MLX 4-bit conversion of the Qwen3-Next MoE coder for Apple Silicon — fastest local launch path.",
+            "contextWindow": "262K",
+            "launchMode": "direct",
+            "backend": "mlx",
+            "releaseDate": "2026-04",
+        },
     ],
     "readme": [
         "Qwen3 Coder Next is purpose-built for software engineering with function calling and agentic workflows.",

diff --git a/dflash/__init__.py b/dflash/__init__.py
index dea44f4..0f0b1c1 100644
--- a/dflash/__init__.py
+++ b/dflash/__init__.py
@@ -70,17 +70,19 @@
     "mlx-community/Qwen3-8B-4bit": "Qwen/Qwen3-8B",
     "mlx-community/Qwen3-8B-8bit": "Qwen/Qwen3-8B",
     "lmstudio-community/Qwen3-Coder-Next-MLX-4bit": "Qwen/Qwen3-Coder-Next",
-    # 2026-05-10: Qwen3-Coder-Next was rebranded ``Qwen3.6-27B`` upstream
-    # but the HF metadata for the lmstudio-community MLX conversion
-    # still reports ``mlx-community/Qwen3.6-27B-4bit`` as the canonical
-    # repo. Without this alias, ``model_resolution.resolve_dflash_target_ref``
-    # picks up the canonical name and DRAFT_MODEL_MAP misses, so the
-    # diagnostics snapshot reports
-    # *DFLASH unavailable for 'mlx-community/Qwen3.6-27B-4bit'* even
-    # when dflash-mlx is installed and the user IS running Coder-Next.
-    "mlx-community/Qwen3.6-27B-4bit": "Qwen/Qwen3-Coder-Next",
-    "mlx-community/Qwen3.6-27B-bf16": "Qwen/Qwen3-Coder-Next",
-    "mlx-community/Qwen3.6-27B-8bit": "Qwen/Qwen3-Coder-Next",
+    # FU-041 (2026-05-10): canonicalRepo for the lmstudio-community
+    # Coder-Next MLX 4-bit ships as the same repo path (no rename), so
+    # alias both the exact community ref AND an ``-Instruct`` sibling
+    # for completeness. The earlier FU-038 mappings that pointed
+    # ``mlx-community/Qwen3.6-27B-4bit`` at Coder-Next were wrong — that
+    # repo is the dense Qwen3.6-27B (text-only Coder, ``qwen3``
+    # architecture, hidden_size=5120), not the Qwen3-Next MoE coder
+    # (``qwen3_next`` architecture, 512 experts, hidden_size=2048).
+    # Inspecting the local config.json under ~/AI_Models/
+    # lmstudio-community/Qwen3-Coder-Next-MLX-4bit confirms the latter.
+    # Coder-Next uses ``z-lab/Qwen3-Coder-Next-DFlash``; the dense
+    # 27B-4bit has no drafter today and stays unaliased.
+ "lmstudio-community/Qwen3-Coder-Next-MLX-4bit-Instruct": "Qwen/Qwen3-Coder-Next", "mlx-community/Qwen3.5-4B-bf16": "Qwen/Qwen3.5-4B", "mlx-community/Qwen3.5-7B-bf16": "Qwen/Qwen3.5-7B", "mlx-community/Qwen3.5-14B-bf16": "Qwen/Qwen3.5-14B", diff --git a/tests/test_dflash.py b/tests/test_dflash.py index 8c61f86..e90763b 100644 --- a/tests/test_dflash.py +++ b/tests/test_dflash.py @@ -149,23 +149,27 @@ def test_kimi_k26_mlx_community_alias(self): "z-lab/Kimi-K2.6-DFlash", ) - def test_qwen36_27b_canonical_alias_resolves_to_coder_next(self): - """Qwen3-Coder-Next ships under the canonical repo - ``mlx-community/Qwen3.6-27B-4bit`` (rebrand, same checkpoint), - and ``resolve_dflash_target_ref`` prefers the canonical repo. - The alias must route to the existing Coder-Next drafter so the - runtimeNote stops saying DFLASH is unavailable for users running - ``lmstudio-community/Qwen3-Coder-Next-MLX-4bit``.""" - for variant in ( - "mlx-community/Qwen3.6-27B-4bit", - "mlx-community/Qwen3.6-27B-bf16", - "mlx-community/Qwen3.6-27B-8bit", - ): - self.assertEqual( - get_draft_model(variant), - "z-lab/Qwen3-Coder-Next-DFlash", - f"Coder-Next alias mismatch for {variant}", - ) + def test_coder_next_mlx_4bit_alias_resolves(self): + """FU-041: ``lmstudio-community/Qwen3-Coder-Next-MLX-4bit`` is + the Qwen3-Next MoE coder (qwen3_next architecture, 512 experts, + hidden_size=2048). Confirmed by inspecting the local config.json + — it is NOT the same checkpoint as ``mlx-community/Qwen3.6-27B-4bit`` + (which is the dense Qwen3.6-27B). The alias routes to the + Coder-Next drafter; the dense 27B-4bit has no drafter.""" + self.assertEqual( + get_draft_model("lmstudio-community/Qwen3-Coder-Next-MLX-4bit"), + "z-lab/Qwen3-Coder-Next-DFlash", + ) + + def test_qwen36_27b_4bit_is_dense_not_coder_next(self): + """Regression test for the FU-038 bug we reverted in FU-041: + ``mlx-community/Qwen3.6-27B-4bit`` is the DENSE 27B Coder + (qwen3 architecture, hidden_size=5120). It must NOT alias to + the Qwen3-Next MoE drafter.""" + self.assertNotEqual( + get_draft_model("mlx-community/Qwen3.6-27B-4bit"), + "z-lab/Qwen3-Coder-Next-DFlash", + ) class ModelResolutionTests(unittest.TestCase):