diff --git a/.githooks/README.md b/.githooks/README.md
new file mode 100644
index 0000000000..6624b2c4df
--- /dev/null
+++ b/.githooks/README.md
@@ -0,0 +1,50 @@
+# Repo-managed git hooks
+
+Tracked hooks that mirror CI checks. Cross-platform: Linux, macOS, Windows (Git Bash).
+
+## Enable (once per clone)
+
+```sh
+git config core.hooksPath .githooks
+```
+
+Revert with `git config --unset core.hooksPath`. Skip once with `git push --no-verify`.
+
+### Windows notes
+
+- Git for Windows ships Git Bash — the `#!/usr/bin/env bash` shebang works out of the box. No extra install needed.
+- If you use PowerShell or `cmd` for `git push`, that's fine — git invokes the hook through its own shell, not yours.
+- File-mode permissions are ignored on Windows; just having the file in `.githooks/` is enough.
+
+### Linux / macOS
+
+Hooks must be executable. Cloning preserves the executable bit (it's set in the repo via `git update-index --chmod=+x`). If you ever lose it locally:
+
+```sh
+chmod +x .githooks/*
+```
+
+## Hooks
+
+- **pre-push** — formatter (`--verify` on the whole tree) + lint on pushed `.das` files. Mirrors `.github/workflows/extended_checks.yml`.
+
+## Requirements
+
+A built `daslang` binary. The script auto-detects:
+
+- `bin/daslang` (Linux / macOS, single-config Make/Ninja)
+- `bin/daslang.exe` (MSYS / cygwin)
+- `bin/Release/daslang.exe` / `bin/Debug/daslang.exe` (Windows MSVC)
+- `build/daslang`, `build/bin/daslang`
+
+Build before pushing:
+
+```sh
+cmake --build build --target daslang
+```
+
+Override the resolved path:
+
+```sh
+DASLANG=/custom/path/daslang git push
+```
diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 0000000000..29a709f72a
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# pre-push: mirror CI's formatter + lint (.github/workflows/extended_checks.yml).
+# Cross-platform: runs under Git Bash on Windows, bash on Linux/macOS.
+#
+# Enable per-clone:
+#   git config core.hooksPath .githooks
+# Skip once:
+#   git push --no-verify
+# Override binary path:
+#   DASLANG=/path/to/daslang git push
+
+set -eu
+
+ROOT="$(git rev-parse --show-toplevel)"
+cd "$ROOT"
+
+# Resolve daslang binary across single/multi-config layouts.
+resolve_daslang() {
+    if [ -n "${DASLANG:-}" ] && [ -x "$DASLANG" ]; then
+        echo "$DASLANG"; return
+    fi
+    for p in \
+        "$ROOT/bin/daslang" \
+        "$ROOT/bin/daslang.exe" \
+        "$ROOT/bin/Release/daslang.exe" \
+        "$ROOT/bin/Debug/daslang.exe" \
+        "$ROOT/build/daslang" \
+        "$ROOT/build/bin/daslang"
+    do
+        [ -x "$p" ] && { echo "$p"; return; }
+    done
+    return 1
+}
+
+# Pick the binary once; an empty result means nothing usable was found.
+DASLANG="$(resolve_daslang || true)"
+if [ -z "$DASLANG" ]; then
+    echo "pre-push: daslang not found — build it (cmake --build build --target daslang)" >&2
+    echo "         or set DASLANG=/path/to/daslang" >&2
+    exit 1
+fi
+
+echo "pre-push: using $DASLANG"
+
+# Formatter: whole tree, verify-only (CI parity).
+# stdin is redirected from /dev/null: git feeds the list of pushed refs to
+# this hook on stdin and the `while read` loop further down still has to
+# consume it, so a child process must not inherit (and possibly drain) it.
+echo "pre-push: formatter --verify ..."
+"$DASLANG" "$ROOT/utils/das-fmt/dasfmt.das" -- --path "$ROOT" --verify </dev/null
+
+# Lint: only .das files changed in the pushed range vs remote tip
+# (mirrors CI's `git diff origin/<base>...HEAD`).
+ZERO40="0000000000000000000000000000000000000000"
+ZERO64="0000000000000000000000000000000000000000000000000000000000000000"
+RANGES=()
+while read -r LOCAL_REF LOCAL_SHA REMOTE_REF REMOTE_SHA; do
+    case "$LOCAL_SHA" in "$ZERO40"|"$ZERO64") continue;; esac  # branch delete
+    case "$REMOTE_SHA" in
+        "$ZERO40"|"$ZERO64")
+            BASE="$(git merge-base "$LOCAL_SHA" origin/master 2>/dev/null || true)"
+            [ -n "$BASE" ] && RANGES+=("$BASE..$LOCAL_SHA")
+            ;;
+        *)
+            RANGES+=("$REMOTE_SHA..$LOCAL_SHA")
+            ;;
+    esac
+done
+
+if [ ${#RANGES[@]} -eq 0 ]; then
+    echo "pre-push: no ranges (deleting or empty); skipping lint"
+    exit 0
+fi
+
+# Portable read-into-array (mapfile is bash 4+; macOS default ships bash 3.2).
+CHANGED=()
+while IFS= read -r line; do
+    CHANGED+=("$line")
+done < <(git diff --name-only --diff-filter=AM "${RANGES[@]}" -- '*.das' | sort -u)
+
+if [ ${#CHANGED[@]} -eq 0 ]; then
+    echo "pre-push: no .das files changed; skipping lint"
+    exit 0
+fi
+
+echo "pre-push: linting ${#CHANGED[@]} .das file(s)"
+"$DASLANG" "$ROOT/utils/lint/main.das" -- "${CHANGED[@]}" --quiet
diff --git a/benchmarks/fusion/bench_arr_at_i64.das b/benchmarks/fusion/bench_arr_at_i64.das
new file mode 100644
index 0000000000..2352225301
--- /dev/null
+++ b/benchmarks/fusion/bench_arr_at_i64.das
@@ -0,0 +1,82 @@
+options gen2
+
+require dastest/testing_boost
+
+// Apples-to-apples cost of int / int64 / uint64-indexed array access.
+// All three benches use the same `for (i in <range-iter>(N))` shape,
+// differing only in the index type. Write workload is `arr[i] = 1`, read
+// workload is `sum += arr[i]` (no cast in either body — isolates the
+// index/fusion cost). The outer `for (_j in range(OUTER))` amortizes the
+// per-body harness overhead over OUTER * N inner ops.
+
+let N = 10000
+let OUTER = 10
+let TOTAL = N * OUTER
+
+// Array write and read lanes indexed by the `range(N)` loop variable.
+// Each lane performs OUTER * N element accesses per run.
+[benchmark]
+def arr_at_int_idx(b : B?) {
+    var arr : array<int>
+    arr |> resize(N)
+    b |> run("write_int_idx/{TOTAL}", TOTAL) {
+        for (_j in range(OUTER)) {
+            for (i in range(N)) {
+                arr[i] = 1
+            }
+        }
+    }
+    b |> run("read_int_idx/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (i in range(N)) {
+                sum += arr[i]
+            }
+        }
+        // Hand the accumulated value to the harness so the read loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(arr), N)
+    }
+}
+
+// Array write and read lanes indexed by the `range64(N64)` loop variable.
+// Each lane performs OUTER * N element accesses per run.
+[benchmark]
+def arr_at_int64_idx(b : B?) {
+    var arr : array<int>
+    arr |> resize(N)
+    let N64 = int64(N)
+    b |> run("write_int64_idx/{TOTAL}", TOTAL) {
+        for (_j in range(OUTER)) {
+            for (i in range64(N64)) {
+                arr[i] = 1
+            }
+        }
+    }
+    b |> run("read_int64_idx/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (i in range64(N64)) {
+                sum += arr[i]
+            }
+        }
+        // Hand the accumulated value to the harness so the read loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(arr), N)
+    }
+}
+
+// Array write and read lanes indexed by the `urange64(N64)` loop variable.
+// Each lane performs OUTER * N element accesses per run.
+[benchmark]
+def arr_at_uint64_idx(b : B?) {
+    var arr : array<int>
+    arr |> resize(N)
+    let N64 = uint64(N)
+    b |> run("write_uint64_idx/{TOTAL}", TOTAL) {
+        for (_j in range(OUTER)) {
+            for (i in urange64(N64)) {
+                arr[i] = 1
+            }
+        }
+    }
+    b |> run("read_uint64_idx/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (i in urange64(N64)) {
+                sum += arr[i]
+            }
+        }
+        // Hand the accumulated value to the harness so the read loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(arr), N)
+    }
+}
diff --git a/benchmarks/fusion/bench_table_index_i64.das b/benchmarks/fusion/bench_table_index_i64.das
new file mode 100644
index 0000000000..1a03f49c1c
--- /dev/null
+++ b/benchmarks/fusion/bench_table_index_i64.das
@@ -0,0 +1,66 @@
+options gen2
+
+require dastest/testing_boost
+
+// Apples-to-apples cost of int / int64 / uint64-keyed table indexing.
+// All three benches use the same `for (k in <range-iter>(N))` shape,
+// differing only in the key type. Read workload is `sum += tab[k]`
+// (no cast in the body). The outer `for (_j in range(OUTER))` amortizes
+// per-body harness overhead over OUTER * N inner ops.
+
+let N = 10000
+let OUTER = 10
+let TOTAL = N * OUTER
+
+// Table read lane keyed by the `range(N)` loop variable.
+// The table is filled with N entries before the timed lane runs.
+[benchmark]
+def table_index_int_key(b : B?) {
+    var tab : table<int; int>
+    for (k in range(N)) {
+        tab[k] = 1
+    }
+    b |> run("read_int_key/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (k in range(N)) {
+                sum += tab[k]
+            }
+        }
+        // Hand the accumulated value to the harness so the lookup loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(tab), N)
+    }
+}
+
+// Table read lane keyed by the `range64(N64)` loop variable.
+// The table is filled with N entries before the timed lane runs.
+[benchmark]
+def table_index_int64_key(b : B?) {
+    var tab : table<int64; int>
+    let N64 = int64(N)
+    for (k in range64(N64)) {
+        tab[k] = 1
+    }
+    b |> run("read_int64_key/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (k in range64(N64)) {
+                sum += tab[k]
+            }
+        }
+        // Hand the accumulated value to the harness so the lookup loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(tab), N)
+    }
+}
+
+// Table read lane keyed by the `urange64(N64)` loop variable.
+// The table is filled with N entries before the timed lane runs.
+[benchmark]
+def table_index_uint64_key(b : B?) {
+    var tab : table<uint64; int>
+    let N64 = uint64(N)
+    for (k in urange64(N64)) {
+        tab[k] = 1
+    }
+    b |> run("read_uint64_key/{TOTAL}", TOTAL) {
+        var sum = 0
+        for (_j in range(OUTER)) {
+            for (k in urange64(N64)) {
+                sum += tab[k]
+            }
+        }
+        // Hand the accumulated value to the harness so the lookup loop has an
+        // observable result and cannot be dropped as dead code.
+        b |> accept(sum)
+        strictEqual(b, length(tab), N)
+    }
+}
diff --git a/benchmarks/sql/LINQ.md b/benchmarks/sql/LINQ.md
index 4879f3eecd..91289d165b 100644
--- a/benchmarks/sql/LINQ.md
+++ b/benchmarks/sql/LINQ.md
@@ -778,3 +778,7 @@ dastest reports `ns/op` in INTERP mode by default. To bump dataset size as the s
 **`PERF009` suppression in `fold_linq_default`.** The macro's `var pass_N = call` + later `return <- pass_N` pattern triggers PERF009 on single-pass chains. The shape is load-bearing for the array-pipeline semantics (every stage binds so the next can reuse the buffer in-place), so we suppress inline at the qmacro_expr emission site and document why.
 
 **Benchmark variants where SQL has no clean form.** `zip` (not a relational op), `_all(pred)` (no direct `_all` chain terminal in sqlite_linq), `join` with inner-select-from (wiring not exposed), `distinct |> count` (no `COUNT(DISTINCT col)` yet), `take/skip` before aggregate (LIMIT/OFFSET semantics conflict with aggregate-collapse). We either reformulate to a SQL-friendly shape (`count(where ¬p)` for all_match), omit the m1 column (zip, join), or terminate the chain in `to_array` instead of an aggregate (take/skip/distinct).
+
+## Future work — 64-bit sweep
+
+Gated on 64-bit arrays + tables landing in daslang. Once they do, sweep `daslib/linq.das` to add `int64` overloads on every count/index/N-parameter surface. Today only the count family is symmetric — `count(iter; pred)` / `count(arr; pred)` / `long_count(iter; pred)` / `long_count(arr; pred)` all exist with their bare (no-pred) forms. The rest of the surface (`take(N)`, `skip(N)`, `skip_last(N)`, `take_last(N)`, `top_n(N)`, `top_n_by(N)`, `top_n_by_descending(N)`, `top_n_by_with_cmp(N)`, `element_at(N)`) still takes `int` only. Add `int64` overloads alongside, route splice planners to pick the matching variant by argument type, and refresh benchmarks for the large-N regime.
diff --git a/benchmarks/sql/M4_DECS_EXPANSION.md b/benchmarks/sql/M4_DECS_EXPANSION.md
new file mode 100644
index 0000000000..dc6f204549
--- /dev/null
+++ b/benchmarks/sql/M4_DECS_EXPANSION.md
@@ -0,0 +1,266 @@
+# m4_decs_fold lane — expansion plan
+
+Adds a third benchmark target alongside `m1_sql` (SQL via `_sql`) and `m3`/`m3f` (array linq, with/without `_fold` splice). The new lane is `m4_decs_fold`:
+
+```das
+_fold(from_decs_template(type<DecsCar>)._where(...)._select(...).sum())
+```
+
+Goal: tri-platform comparison (SQL vs array vs decs) under the same chain shape, so the splice path's decs win is directly comparable to its array win.
+
+## Baseline matrix (2026-05-20, before m4 expansion)
+
+100K rows, INTERP mode. m3f = `_fold(each(arr)...)`. Lower is better.
+
+### Headlines (m3f wins both)
+
+| benchmark | m1 sql | m3 | m3f | m1/m3f | m3/m3f |
+|---|---:|---:|---:|---:|---:|
+| sum_aggregate | 29 | 30 | **2** | 14.5× | 15.0× |
+| select_where_count | 32 | 57 | **5** | 6.4× | 11.4× |
+| sum_where | 32 | 43 | **4** | 8.0× | 10.8× |
+| chained_where | 36 | 45 | **6** | 6.0× | 7.5× |
+| aggregate_match | 34 | 49 | **5** | 6.8× | 9.8× |
+| all_match | 27 | 21 | **3** | 9.0× | 7.0× |
+| take_while_match | 7 | 22 | **2** | 3.5× | 11.0× |
+| count_aggregate | 29 | 29 | **4** | 7.2× | 7.2× |
+| select_where_sum | 36 | 60 | **7** | 5.1× | 8.6× |
+| to_array_filter | 74 | 42 | **11** | 6.7× | 3.8× |
+
+### Order/sort family (m3f dominates m3)
+
+| benchmark | m1 | m3 | m3f | m3/m3f |
+|---|---:|---:|---:|---:|
+| sort_take | 38 | 763 | **27** | 28.3× |
+| order_take_desc | 38 | 746 | **27** | 27.6× |
+| select_where_order_take | 36 | 379 | **24** | 15.8× |
+| sort_first | 37 | 742 | **756** | (regresses; sort dominates) |
+
+### Group-by family (2-5× wins)
+
+| benchmark | m1 | m3 | m3f | m1/m3f | m3/m3f |
+|---|---:|---:|---:|---:|---:|
+| groupby_sum | 173 | 102 | **37** | 4.7× | 2.8× |
+| groupby_count | 143 | 68 | **36** | 4.0× | 1.9× |
+| groupby_average | 171 | 106 | **52** | 3.3× | 2.0× |
+| groupby_having_count | 144 | 74 | **36** | 4.0× | 2.1× |
+| groupby_having_hidden_sum | 175 | 103 | **40** | 4.4× | 2.6× |
+| groupby_max | 174 | 103 | **44** | 4.0× | 2.3× |
+| groupby_min | 173 | 106 | **43** | 4.0× | 2.5× |
+| groupby_multi_reducer | 190 | 139 | **53** | 3.6× | 2.6× |
+| groupby_select_sum | — | 110 | **59** | — | 1.9× |
+| groupby_where_count | 76 | 64 | **23** | 3.3× | 2.8× |
+| groupby_where_sum | 86 | 79 | **23** | 3.7× | 3.4× |
+| groupby_first | — | 68 | **35** | — | 1.9× |
+
+### Anomalies / weak spots
+
+| benchmark | m1 | m3 | m3f | note |
+|---|---:|---:|---:|---|
+| indexed_lookup | 1,431 | 2,029,891 | 200,399 | SQL B-tree wins 1417× over linear scan; splice helps 10× over m3 but algorithmically can't match SQL. Decs equivalent uses **eid lookup** (archetype hash lookup, fast path). |
+| zip_dot_product | — | 52 | 57 | plan_zip landed but bench shape doesn't hit the splice (m3f slower than m3 by 10%). Worth a follow-up. |
+| join_count | — | 116 | 122 | No splice arm for join. m3f slightly slower. |
+| sort_first | 37 | 742 | 756 | Sort dominates the whole pipeline. m3f doesn't help when terminator is `first` after a full sort. |
+| distinct_take | — | 30 | 0 | m3f=0 ns/op — bench may be constant-folded; verify it actually exercises the chain. |
+| take_count_filtered / take_sum_aggregate / select_count / first_match / element_at_match / reverse_take / skip_take | various | various | 0 | Several 0 ns/op cases — verify the chain isn't being eliminated by DCE. |
+
+## Decs benchmarks (already in `benchmarks/decs/`)
+
+| benchmark | m1_hand_query | m2_eager_bridge | m3_fold_splice / m4_template_fold |
+|---|---:|---:|---:|
+| from_decs_count | 0 (arch.size) | 60 | **0** (matches hand) |
+| from_decs_sum | 4 (query) | 202 | **8** (within 2× of hand) |
+
+## m4_decs_fold expansion — triage
+
+Inventory of every `benchmarks/sql/*.das` benchmark with **decs feasibility** classification. Surfaces that don't yet exist on the decs side are flagged so we can decide whether to expand the decs surface.
+
+### Category A — clean decs map (no new surface needed)
+
+These translate directly: array `each(arr)._chain()` becomes decs `from_decs_template(type<T>)._chain()`. Splice already covers all chain shapes used.
+
+- aggregate_match
+- all_match
+- any_match
+- average_aggregate
+- chained_where
+- contains_match
+- count_aggregate
+- distinct_count
+- first_match
+- first_or_default_match
+- last_match
+- long_count_aggregate
+- max_aggregate
+- min_aggregate
+- select_count
+- select_where
+- select_where_count
+- select_where_sum
+- single_match
+- sum_aggregate
+- sum_where
+- take_count
+- take_count_filtered
+- take_sum_aggregate
+- take_while_match
+- skip_while_match
+- to_array_filter
+
+### Category B — clean decs map but needs Slice 5+ splice arms
+
+Chain shape works on decs surface today (via eager bridge), but splice planner doesn't yet handle the shape — would fall to tier-2 cascade. Listing here so each entry doubles as a future-slice trigger.
+
+- bare_order_where (order_by / reverse on decs — Slice 5+)
+- distinct_take (distinct_by + take on decs — Slice 5+)
+- element_at_match (element_at — Slice 5+)
+- groupby_average / groupby_count / groupby_first / groupby_having_count / groupby_having_hidden_sum / groupby_max / groupby_min / groupby_multi_reducer / groupby_select_sum / groupby_sum / groupby_where_count / groupby_where_sum — all need decs group_by splice (state-table family Slice 5+)
+- order_take_desc / select_where_order_take / sort_first / sort_take — order_by family on decs (Slice 5+)
+- reverse_take — reverse on decs (Slice 5+)
+- skip_take — skip+take chain on decs (Slice 5+)
+
+### Category C — needs new decs surface
+
+Benchmarks whose shape doesn't have a corresponding decs equivalent yet. Decision: build the surface OR skip.
+
+- **indexed_lookup** — SQL uses B-tree on `id`. Decs analog: **eid-based lookup** via `lookup_entity` (decs.das has `[eid]` component lookup). Build a `m4_decs_eid_lookup` lane that exercises this. NEW SURFACE: thin wrapper if needed.
+- **zip_dot_product** — pairs two parallel arrays. Decs analog: pair two component streams from the SAME archetype (intra-archetype zip is free — it's just multi-iter for over both components). Or zip across two archetypes (cross-archetype, harder). NEW SURFACE: `from_decs_template_zip` or syntactic-sugar wrapper. Worth investigating; could be a clean Slice surface add.
+- **join_count** — two tables joined. Decs analog: cross-archetype query with eid linkage. NEW SURFACE: `join_decs` or two-template iter. Larger design exercise; defer to follow-up.
+
+## Proposed execution order
+
+1. **Wave 1 — Category A only** (27 benchmarks). All splice today, all comparable now. Establishes the m4 surface in `_common.das` + per-bench m4 lane. Validates the lane convention before scope grows.
+2. **Wave 2 — Category C surface adds**:
+   - `indexed_lookup` via decs eid lookup (small change)
+   - `zip_dot_product` via decs intra-archetype zip (design discussion)
+   - `join_count` deferred to a later wave (full decs join design)
+3. **Wave 3 — Category B**: ship m4 lanes for these now using the eager bridge (so the matrix is complete), THEN as plan_decs_unroll Slice 5+ lands, the lanes start showing the splice win. Each lane stays valid throughout.
+
+## Conventions for m4 lanes
+
+Per Boris (2026-05-20):
+- One lane per benchmark, named `<bench>_m4` reported as `m4_decs_fold/{n}` (`m4` for ordinal, `decs_fold` for clarity)
+- Per-benchmark fixture call (each file calls `fixture_decs(n)` inline, mirroring how m3 calls `fixture_array(n)` inline)
+- Shared `[decs_template] DecsCar` + `fixture_decs(n)` in `_common.das` (mirrors how m3's `Car` + `fixture_array` are shared)
+- Lambda-typed args (`$(c : Car)`) replaced with `_select(_.field)` macro form in m4 bodies (Car type doesn't match the decs tuple element)
+- Sentinel values (`first_or_default`) use named-tuple literal `(id=…, name=…, …)` matching the iterator element type
+
+## First m4 sweep — results (2026-05-20, 100K rows, INTERP)
+
+47 benchmarks gained an m4 lane (all Cat A + Cat B). Skipped: `indexed_lookup`, `join_count`, `zip_dot_product` (Cat C, need surface).
+
+### Cat A — m4 splices today (chain shape covered by plan_decs_unroll)
+
+| benchmark | m1 | m3 | m3f | m4 | m4/m1 | m4/m3f | notes |
+|---|---:|---:|---:|---:|---:|---:|---|
+| sum_aggregate | 30 | 29 | 2 | **16** | 0.5x | 8x | 6-component multi-iter overhead floor |
+| select_where_sum | 36 | 57 | 7 | **18** | 0.5x | 2.6x | chain splice; beats m1 + m3 |
+| select_where_count | 32 | 58 | 5 | **18** | 0.6x | 3.6x | chain splice; beats m1 + m3 |
+| count_aggregate | 30 | 28 | 4 | **15** | 0.5x | 3.8x | (count with filter — not bare; filter walks all entities) |
+| long_count_aggregate | 29 | 28 | 4 | **15** | 0.5x | 3.8x | parallel to count |
+| sum_where | 32 | 45 | 4 | **17** | 0.5x | 4.2x | |
+| chained_where | 36 | 46 | 6 | **18** | 0.5x | 3x | |
+| select_where | 190 | 28 | 11 | **22** | **0.1x** | 2x | m4 8.6x faster than m1 SQL |
+| max_aggregate | 30 | 36 | 5 | **19** | 0.6x | 3.8x | |
+| min_aggregate | 30 | 38 | 6 | **19** | 0.6x | 3.2x | |
+| average_aggregate | 30 | 34 | 5 | **21** | 0.7x | 4.2x | |
+| all_match | 27 | 20 | 3 | **15** | 0.6x | 5x | early-exit on bridge |
+| to_array_filter | 70 | 43 | 11 | **24** | **0.3x** | 2.2x | m4 2.9x faster than m1 SQL |
+| first_match | 0 | 28 | 0 | **0** | — | — | early-exit on first hit |
+| first_or_default_match | 0 | 31 | 0 | **0** | — | — | |
+| any_match | 0 | 0 | 0 | **0** | — | — | |
+| contains_match | 0 | 28 | 2 | **8** | — | 4x | |
+
+**Net Cat A:** m4 beats SQL on most shapes (m4 is below m1 in all 13 of the 17 rows where m1 is non-zero; the other 4 rows report m1 = 0 ns/op). m4 vs m3f shows a ~3-5× decs overhead from the 6-component multi-iter for-loop (every component's get_ro participates even when chain only reads one field). This is the splice's structural cost on a multi-field decs schema.
+
+### Cat B — m4 falls back to eager bridge (splice deferred to Slice 5+)
+
+These chain shapes splice on array but not yet on decs. m4 = eager bridge (materialize array<tuple>, then run on array). Slower than m3f but the comparison is real today.
+
+| benchmark | m1 | m3 | m3f | m4 | m4/m3f | needs splice arm |
+|---|---:|---:|---:|---:|---:|---|
+| bare_order_where | 273 | 357 | 120 | 196 | 1.6x | order_by on decs |
+| distinct_count | 41 | 43 | 15 | 97 | 6.5x | distinct on decs |
+| distinct_take | 0 | 30 | 0 | 34 | — | distinct + take on decs |
+| order_take_desc | 37 | 698 | 27 | 117 | 4.3x | order_by + take |
+| reverse_take | 0 | 22 | 0 | 114 | — | reverse + take |
+| select_count | 0 | 32 | 0 | 3 | — | (m4 likely DCE — verify) |
+| select_where_order_take | 36 | 355 | 24 | 102 | 4.2x | order + take after where |
+| skip_take | 0 | 15 | 0 | 37 | — | take/skip on decs |
+| skip_while_match | 3 | 20 | 5 | 83 | 16.6x | skip_while on decs |
+| sort_first | 37 | 713 | 722 | 802 | 1.1x | sort dominates; splice barely helps |
+| sort_take | 38 | 715 | 27 | 119 | 4.4x | order + take |
+| take_count | 3 | 0 | 0 | 36 | — | take on decs |
+| take_count_filtered | — | 29 | 0 | 35 | — | take after where |
+| take_sum_aggregate | — | 28 | 0 | 34 | — | take + sum |
+| take_while_match | 7 | 22 | 2 | 55 | 27.5x | take_while on decs |
+| element_at_match | 0 | 28 | 0 | 35 | — | element_at on decs |
+| last_match | 0 | 29 | 5 | 83 | 16.6x | last on decs |
+| single_match | 0 | 19 | 2 | 80 | 40x | single on decs |
+| groupby_count | 142 | 65 | 37 | 115 | 3.1x | group_by on decs (state-table) |
+| groupby_sum | 171 | 101 | 36 | 115 | 3.2x | group_by on decs |
+| groupby_average | 172 | 99 | 52 | 128 | 2.5x | group_by on decs |
+| groupby_max | 174 | 103 | 43 | 120 | 2.8x | group_by on decs |
+| groupby_min | 175 | 105 | 42 | 122 | 2.9x | group_by on decs |
+| groupby_first | — | 68 | 35 | 112 | 3.2x | group_by on decs |
+| groupby_having_count | 142 | 71 | 37 | 114 | 3.1x | group_by on decs |
+| groupby_having_hidden_sum | 176 | 102 | 40 | 122 | 3.0x | group_by on decs |
+| groupby_multi_reducer | 191 | 138 | 52 | 130 | 2.5x | group_by on decs |
+| groupby_select_sum | — | 109 | 60 | 137 | 2.3x | group_by on decs |
+| groupby_where_count | 76 | 63 | 23 | 101 | 4.4x | group_by on decs |
+| groupby_where_sum | 101 | 81 | 23 | 105 | 4.6x | group_by on decs |
+| aggregate_match | 34 | 50 | 5 | 84 | 16.8x | `aggregate(init, $(acc, c) => …)` — not a `_select(_.x).sum()` shape, distinct planner |
+
+**Net Cat B:** Establishes today's baseline. As Slice 5+ lands group_by/order/distinct/take splice arms on the decs bridge, these rows will drop into Cat A territory and the splice win will become visible without changing the benchmark.
+
+### Suspect "0 ns/op" m4 results — verify the chain isn't getting DCE'd
+
+- `first_match` / `first_or_default_match` — early-exit on the first archetype's first entity is genuinely cheap; plausible
+- `any_match` / `select_count` — `any_match` is 0 on both m4 and m3f; `select_count` is 0 on m3f and 3 on m4; likely constant-folded
+- `take_count` / `take_count_filtered` / `take_sum_aggregate` / `reverse_take` / `skip_take` / `distinct_take` — m4=34-37 ns (114 ns for `reverse_take`), all on the eager bridge; m3f=0 ns; m1 mostly 0 too. The m3f=0 cases are suspicious — bench may be eliminating the chain at compile time. Worth dropping a `b->failNow()` floor check or a side effect to confirm.
+
+## Wave 2 — surface expansions
+
+After this m4 sweep ships, expand the surface for Cat C benchmarks:
+
+1. **`indexed_lookup`** — add `m4_decs_eid_lookup` lane using `lookup_entity` / archetype hash lookup. Decs hash-of-eid IS the fast path; benchmark it against SQL B-tree. May require minor surface (a `find_entity_by_field` helper)
+2. **`zip_dot_product`** — design intra-archetype zip surface. Two components from the SAME archetype is just a multi-iter for-loop (free). Cross-archetype zip is harder. Pick the design that lets `_zip` on decs match the array_zip shape
+3. **`join_count`** — needs full decs join design. Multi-table query equivalent via two `[decs_template]` structs + eid linkage. Larger design exercise; defer
+
+## Wave 3 — Slice 5+ enables Cat B splice
+
+As `plan_decs_unroll` gains:
+- take/skip/take_while/skip_while arms (cross-archetype counter / for_each_archetype_find early-exit)
+- distinct/group_by state-table arms (hoisted table above outer for_each_archetype)
+- order_by / reverse buffer arms
+
+…Cat B m4 numbers will drop dramatically (from ~100-130 ns down to the 5-20 ns range), matching the Cat A pattern. The matrix becomes a regression guard for each splice arm.
+
+## Wave 4 — perf optimizations on plan_decs_unroll
+
+Known overhead in the current splice: **6-component multi-iter for-loop walks ALL components** even when the chain only reads one field. Possible optimizations:
+- Track per-chain "components actually accessed" set; emit `get_ro` + iter binding only for those
+- For bare-count chains: arch.size shortcut works (already implemented) → 0 ns
+- For single-field selects: 1-component for-loop should match the array case
+
+If implemented, m4 numbers on Cat A would close most of the 3-5× gap vs m3f, making decs effectively as fast as array for projection-heavy chains.
+
+## Update — sort_first fix (2026-05-20, plan_order_family + first arm)
+
+Extended `plan_order_family` to recognize `first` / `first_or_default` as terminators alongside `take(N)`. `order_by + first` now splices to a single-pass `min_by` (array source: zero-alloc empty-guard + `min_by`; iterator source: `top_n_by(_, 1, _) |> first()`). `order_by_descending` routes to `max_by`. Preserves the eager `first()` panic-on-empty contract.
+
+| benchmark | m1 | m3 | m3f (old) | m3f (new) | m4 (old) | m4 (new) | m3f win |
+|---|---:|---:|---:|---:|---:|---:|---:|
+| sort_first | 37 | 713 | 722 | **42** | 802 | **121** | 17× |
+
+Now `sort_first` lands in line with the rest of the order-family. m4_decs_fold still rides the eager bridge (Slice 5+ will close the gap).
+
+## Update — zip_dot_product fix (2026-05-20, plan_zip + accumulator/early-exit lanes)
+
+PR #2742's accumulator + early-exit terminator work on `plan_zip` was orphaned on a stacked PR base when #2741 merged. Cherry-picked the 3 commits onto fresh master (auto-merged cleanly). `plan_zip` now dispatches to the generalized multi-source `emit_accumulator_lane` (sum / min / max / average / long_count) and `emit_early_exit_lane` (first / first_or_default / any / all / contains) via parallel-array helpers (`srcNames`, `topExprs`) + new `finalize_lane_emission` wrap.
+
+| benchmark | m1 | m3 | m3f (old) | m3f (new) | m3f win |
+|---|---:|---:|---:|---:|---:|
+| zip_dot_product | — | 53 | 58 | **7** | 8.3× |
+
+`zip(xs, ys)._select(_._0 * _._1).sum()` now fuses to a single multi-iter for-loop with inline accumulator, zero alloc. Falls in line with the rest of the accumulator-class benchmarks.
diff --git a/benchmarks/sql/_common.das b/benchmarks/sql/_common.das
index 36c94e42f0..ffa12db7d3 100644
--- a/benchmarks/sql/_common.das
+++ b/benchmarks/sql/_common.das
@@ -7,6 +7,7 @@ require sqlite/sqlite_boost public
 require sqlite/sqlite_linq public
 require dastest/testing_boost public
 require daslib/fio public
+require daslib/decs_boost public
 
 let public BRAND_COUNT = 5
 let public DEALER_COUNT = 100
@@ -75,3 +76,28 @@ def public fixture_dealers_array() : array<Dealer> {
     }
     return <- arr
 }
+
+// m4_decs_fold lane: same Car schema, decs-archetype-backed. Same row generator as fixture_array so fold splice perf is directly comparable across SQL / array / decs.
+// Field comments give the value `fixture_decs` assigns for entity index `i`.
+[decs_template(prefix = "car_")]
+struct public DecsCar {
+    id        : int     // i + 1
+    name      : string  // "Car{i}"
+    price     : int     // (i * 37) % 1000
+    brand     : int     // i % BRAND_COUNT
+    year      : int     // 2010 + (i * 7) % 16
+    dealer_id : int     // (i % DEALER_COUNT) + 1
+}
+
+// Builds the decs fixture for the m4_decs_fold lane: `n` entities, each filled
+// from a `DecsCar` whose fields are derived from the entity index `i`.
+// `restart()` runs first — presumably to discard decs state left by an earlier
+// fixture; confirm against daslib/decs.
+def public fixture_decs(n : int) {
+    restart()
+    // `eid` is supplied to the block but not used here.
+    create_entities(n) $(eid : EntityId; i : int; var cmp : ComponentMap) {
+        apply_decs_template(cmp, DecsCar(
+            id = i + 1,
+            name = "Car{i}",
+            price = (i * 37) % 1000,
+            brand = i % BRAND_COUNT,
+            year = 2010 + (i * 7) % 16,
+            dealer_id = (i % DEALER_COUNT) + 1
+        ))
+    }
+}
diff --git a/benchmarks/sql/aggregate_match.das b/benchmarks/sql/aggregate_match.das
index f21df6cd36..fd8363ac61 100644
--- a/benchmarks/sql/aggregate_match.das
+++ b/benchmarks/sql/aggregate_match.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
         b |> run("m1_sql/{n}", n) {
             let total = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD)
                                 |> _select(_.price) |> sum())
+            b |> accept(total)
             if (total == 0) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let total = (arr |> _where(_.price > THRESHOLD)
                          |> aggregate(0, $(acc : int, c : Car) => acc + c.price))
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -38,6 +40,19 @@ def run_m3f(b : B?; n : int) {
     b |> run("m3f_array_fold/{n}", n) {
         let total = _fold(each(arr)._where(_.price > THRESHOLD)
             .aggregate(0, $(acc : int, c : Car) => acc + c.price))
+        b |> accept(total)
+        if (total == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let total = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)
+            .aggregate(0, $(acc, c) => acc + c.price))
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -58,3 +73,8 @@ def aggregate_match_m3(b : B?) {
 def aggregate_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+// Benchmark entry point for the decs-backed lane: calls `run_m4` with n = 100000.
+[benchmark]
+def aggregate_match_m4(b : B?) {
+    run_m4(b, 100000)
+}
diff --git a/benchmarks/sql/all_match.das b/benchmarks/sql/all_match.das
index de705ce1df..b140da7537 100644
--- a/benchmarks/sql/all_match.das
+++ b/benchmarks/sql/all_match.das
@@ -14,6 +14,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let bad = _sql(db |> select_from(type<Car>) |> _where(_.price >= 9999) |> count())
+            b |> accept(bad)
             if (bad != 0) {
                 b->failNow()
             }
@@ -25,6 +26,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let yes = arr |> _all(_.price < 9999)
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -34,6 +36,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let yes = _fold(each(arr)._all(_.price < 9999))
+        b |> accept(yes)
+        if (!yes) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let yes = _fold(from_decs_template(type<DecsCar>)._all(_.price < 9999))
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -54,3 +68,8 @@ def all_match_m3(b : B?) {
 def all_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+// Benchmark entry point for the decs-backed lane: calls `run_m4` with n = 100000.
+[benchmark]
+def all_match_m4(b : B?) {
+    run_m4(b, 100000)
+}
diff --git a/benchmarks/sql/any_match.das b/benchmarks/sql/any_match.das
index 70f694e9f6..55c2033e14 100644
--- a/benchmarks/sql/any_match.das
+++ b/benchmarks/sql/any_match.das
@@ -14,6 +14,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let opt = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> _first_opt())
+            b |> accept(opt)
             if (!is_some(opt)) {
                 b->failNow()
             }
@@ -25,6 +26,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let yes = arr |> _any(_.price > THRESHOLD)
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -34,6 +36,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let yes = _fold(each(arr)._any(_.price > THRESHOLD))
+        b |> accept(yes)
+        if (!yes) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let yes = _fold(from_decs_template(type<DecsCar>)._any(_.price > THRESHOLD))
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -54,3 +68,8 @@ def any_match_m3(b : B?) {
 def any_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def any_match_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/average_aggregate.das b/benchmarks/sql/average_aggregate.das
index af9b2adbd4..f7aa282438 100644
--- a/benchmarks/sql/average_aggregate.das
+++ b/benchmarks/sql/average_aggregate.das
@@ -11,6 +11,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let a = _sql(db |> select_from(type<Car>) |> _select(_.price) |> average())
+            b |> accept(a)
             if (a == 0.0lf) {
                 b->failNow()
             }
@@ -22,6 +23,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let a = arr |> _select(double(_.price)) |> average()
+        b |> accept(a)
         if (a == 0.0lf) {
             b->failNow()
         }
@@ -31,6 +33,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let a = _fold(each(arr)._select(double(_.price)).average())
+        b |> accept(a)
+        if (a == 0.0lf) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let a = _fold(from_decs_template(type<DecsCar>)._select(double(_.price)).average())
+        b |> accept(a)
         if (a == 0.0lf) {
             b->failNow()
         }
@@ -51,3 +65,8 @@ def average_aggregate_m3(b : B?) {
 def average_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def average_aggregate_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/bare_order_where.das b/benchmarks/sql/bare_order_where.das
index 744b6805e7..400c24b3f0 100644
--- a/benchmarks/sql/bare_order_where.das
+++ b/benchmarks/sql/bare_order_where.das
@@ -18,6 +18,7 @@ def run_m1(b : B?; n : int) {
             let rows <- _sql(db |> select_from(type<Car>)
                                 |> _where(_.price > THRESHOLD)
                                 |> _order_by(_.price))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -30,6 +31,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> _where(_.price > THRESHOLD)
                          |> _order_by(_.price))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -42,6 +44,20 @@ def run_m3f(b : B?; n : int) {
         let rows <- _fold(each(arr)._where(_.price > THRESHOLD)
                                    ._order_by(_.price)
                                    .to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)
+                                   ._order_by(_.price)
+                                   .to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -62,3 +78,8 @@ def bare_order_where_m3(b : B?) {
 def bare_order_where_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def bare_order_where_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/chained_where.das b/benchmarks/sql/chained_where.das
index b433d1ca11..2a37eea0b9 100644
--- a/benchmarks/sql/chained_where.das
+++ b/benchmarks/sql/chained_where.das
@@ -19,6 +19,7 @@ def run_m1(b : B?; n : int) {
                             |> _where(_.price > THRESHOLD)
                             |> _where(_.year >= YEAR_FLOOR)
                             |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -32,6 +33,7 @@ def run_m3(b : B?; n : int) {
         let c = (arr |> _where(_.price > THRESHOLD)
                      |> _where(_.year >= YEAR_FLOOR)
                      |> count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -43,6 +45,20 @@ def run_m3f(b : B?; n : int) {
         let c = _fold(each(arr)._where(_.price > THRESHOLD)
                                ._where(_.year >= YEAR_FLOOR)
                                .count())
+        b |> accept(c)
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)
+                               ._where(_.year >= YEAR_FLOOR)
+                               .count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -63,3 +79,8 @@ def chained_where_m3(b : B?) {
 def chained_where_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def chained_where_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/contains_match.das b/benchmarks/sql/contains_match.das
index 8eb6e2e976..e2ee872219 100644
--- a/benchmarks/sql/contains_match.das
+++ b/benchmarks/sql/contains_match.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         b |> run("m1_sql/{n}", n) {
             // SQL doesn't have a direct CONTAINS for arbitrary values; use _any with _where.
             let opt = _sql(db |> select_from(type<Car>) |> _where(_.id == TARGET_ID) |> _first_opt())
+            b |> accept(opt)
             if (!is_some(opt)) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
         // Project ids out then contains. Mirrors what `_fold(...select.contains(...))` does
         // — an array-source linq chain materializes `select` first then iterates contains.
         let yes = arr |> _select(_.id) |> contains(TARGET_ID)
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -37,6 +39,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let yes = _fold(each(arr)._select(_.id).contains(TARGET_ID))
+        b |> accept(yes)
+        if (!yes) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let yes = _fold(from_decs_template(type<DecsCar>)._select(_.id).contains(TARGET_ID))
+        b |> accept(yes)
         if (!yes) {
             b->failNow()
         }
@@ -57,3 +71,8 @@ def contains_match_m3(b : B?) {
 def contains_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def contains_match_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/count_aggregate.das b/benchmarks/sql/count_aggregate.das
index f6d92dfca1..d0f770f80a 100644
--- a/benchmarks/sql/count_aggregate.das
+++ b/benchmarks/sql/count_aggregate.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let c = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let c = arr |> _where(_.price > THRESHOLD) |> count()
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -38,6 +40,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let c = _fold(each(arr)._where(_.price > THRESHOLD).count())
+        b |> accept(c)
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -58,3 +72,8 @@ def count_aggregate_m3(b : B?) {
 def count_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def count_aggregate_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/distinct_count.das b/benchmarks/sql/distinct_count.das
index 4ff234e8ee..290598dfc1 100644
--- a/benchmarks/sql/distinct_count.das
+++ b/benchmarks/sql/distinct_count.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _select(_.brand) |> distinct())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -26,6 +27,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> _select(_.brand) |> distinct())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -35,6 +37,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let rows <- _fold(each(arr)._select(_.brand).distinct().to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>)._select(_.brand).distinct().to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -55,3 +69,8 @@ def distinct_count_m3(b : B?) {
 def distinct_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def distinct_count_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/distinct_take.das b/benchmarks/sql/distinct_take.das
index 9afee6c444..2bb7d2394b 100644
--- a/benchmarks/sql/distinct_take.das
+++ b/benchmarks/sql/distinct_take.das
@@ -20,6 +20,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _select(_.brand) |> distinct() |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -32,6 +33,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         unsafe {
             let rows <- (each(arr) |> _select(_.brand) |> distinct() |> take(TAKE_N) |> to_array())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -44,6 +46,20 @@ def run_m3f(b : B?; n : int) {
     b |> run("m3f_array_fold/{n}", n) {
         unsafe {
             let rows <- _fold(each(arr)._select(_.brand).distinct().take(TAKE_N).to_array())
+            b |> accept(rows)
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        unsafe {
+            let rows <- _fold(from_decs_template(type<DecsCar>)._select(_.brand).distinct().take(TAKE_N).to_array())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -65,3 +81,8 @@ def distinct_take_m3(b : B?) {
 def distinct_take_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def distinct_take_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/element_at_match.das b/benchmarks/sql/element_at_match.das
index 61c51fa02c..114977ccd6 100644
--- a/benchmarks/sql/element_at_match.das
+++ b/benchmarks/sql/element_at_match.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD)
                               |> skip(INDEX) |> _first())
+            b |> accept(row)
             if (row.price == 0) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _where(_.price > THRESHOLD) |> element_at(INDEX)
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -36,6 +38,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._where(_.price > THRESHOLD).element_at(INDEX))
+        b |> accept(row)
+        if (row.price == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).element_at(INDEX))
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -56,3 +70,8 @@ def element_at_match_m3(b : B?) {
 def element_at_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def element_at_match_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/first_match.das b/benchmarks/sql/first_match.das
index dee7604523..0531baa4cb 100644
--- a/benchmarks/sql/first_match.das
+++ b/benchmarks/sql/first_match.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> _first())
+            b |> accept(row)
             if (row.price == 0) {
                 b->failNow()
             }
@@ -26,6 +27,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _where(_.price > THRESHOLD) |> first()
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -35,6 +37,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._where(_.price > THRESHOLD).first())
+        b |> accept(row)
+        if (row.price == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).first())
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -55,3 +69,8 @@ def first_match_m3(b : B?) {
 def first_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def first_match_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/first_or_default_match.das b/benchmarks/sql/first_or_default_match.das
index c67b3da74b..08585b560c 100644
--- a/benchmarks/sql/first_or_default_match.das
+++ b/benchmarks/sql/first_or_default_match.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> _first())
+            b |> accept(row)
             if (row.price == 0) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let sentinel = Car(id = SENTINEL_ID, name = "none", price = 0, brand = 0, year = 0, dealer_id = 0)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _where(_.price > THRESHOLD) |> first_or_default(sentinel)
+        b |> accept(row)
         if (row.id == SENTINEL_ID) {
             b->failNow()
         }
@@ -37,6 +39,19 @@ def run_m3f(b : B?; n : int) {
     let sentinel = Car(id = SENTINEL_ID, name = "none", price = 0, brand = 0, year = 0, dealer_id = 0)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._where(_.price > THRESHOLD).first_or_default(sentinel))
+        b |> accept(row)
+        if (row.id == SENTINEL_ID) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    let sentinel = (id = SENTINEL_ID, name = "none", price = 0, brand = 0, year = 0, dealer_id = 0)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).first_or_default(sentinel))
+        b |> accept(row)
         if (row.id == SENTINEL_ID) {
             b->failNow()
         }
@@ -57,3 +72,8 @@ def first_or_default_match_m3(b : B?) {
 def first_or_default_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def first_or_default_match_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_average.das b/benchmarks/sql/groupby_average.das
index c2dc5342db..801e168b5e 100644
--- a/benchmarks/sql/groupby_average.das
+++ b/benchmarks/sql/groupby_average.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0,
                                               AvgPrice = _._1 |> select($(c : Car) => c.price) |> average())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0,
                                                        AvgPrice = _._1 |> select($(c : Car) => c.price) |> average())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -41,6 +43,22 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       AvgPrice = _._1 |> select($(c : Car) => c.price) |> average()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      AvgPrice = _._1 |> _select(_.price) |> average()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -61,3 +79,8 @@ def groupby_average_m3(b : B?) {
 def groupby_average_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_average_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_count.das b/benchmarks/sql/groupby_count.das
index 35179e08ff..7905028999 100644
--- a/benchmarks/sql/groupby_count.das
+++ b/benchmarks/sql/groupby_count.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
             let groups <- _sql(db |> select_from(type<Car>)
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0, N = _._1 |> length)))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0, N = _._1 |> length)))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -39,6 +41,21 @@ def run_m3f(b : B?; n : int) {
                             ._group_by(_.brand)
                             ._select((Brand = _._0, N = _._1 |> length))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0, N = _._1 |> length))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -59,3 +76,8 @@ def groupby_count_m3(b : B?) {
 def groupby_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_count_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_first.das b/benchmarks/sql/groupby_first.das
index 217c9f80ed..22ed403c52 100644
--- a/benchmarks/sql/groupby_first.das
+++ b/benchmarks/sql/groupby_first.das
@@ -12,6 +12,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0,
                                                        FirstCar = _._1 |> first())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -25,6 +26,22 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       FirstCar = _._1 |> first()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      FirstCar = _._1 |> first()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -40,3 +57,8 @@ def groupby_first_m3(b : B?) {
 def groupby_first_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_first_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_having_count.das b/benchmarks/sql/groupby_having_count.das
index 1a037ed27b..ae26f260ab 100644
--- a/benchmarks/sql/groupby_having_count.das
+++ b/benchmarks/sql/groupby_having_count.das
@@ -18,6 +18,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _having(_._1 |> length >= 5)
                                   |> _select((Brand = _._0, N = _._1 |> length)))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -29,6 +30,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._having(_._1 |> length >= 5)._select((Brand = _._0, N = _._1 |> length)))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -42,6 +44,22 @@ def run_m3f(b : B?; n : int) {
                             ._having(_._1 |> length >= 5)
                             ._select((Brand = _._0, N = _._1 |> length))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._having(_._1 |> length >= 5)
+                            ._select((Brand = _._0, N = _._1 |> length))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -62,3 +80,8 @@ def groupby_having_count_m3(b : B?) {
 def groupby_having_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_having_count_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_having_hidden_sum.das b/benchmarks/sql/groupby_having_hidden_sum.das
index ab0e2cb696..a064247072 100644
--- a/benchmarks/sql/groupby_having_hidden_sum.das
+++ b/benchmarks/sql/groupby_having_hidden_sum.das
@@ -20,6 +20,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _having(_._1 |> select($(c : Car) => c.price) |> sum > 50000)
                                   |> _select((Brand = _._0, N = _._1 |> length)))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -31,6 +32,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._having(_._1 |> select($(c : Car) => c.price) |> sum > 50000)._select((Brand = _._0, N = _._1 |> length)))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -44,6 +46,22 @@ def run_m3f(b : B?; n : int) {
                             ._having(_._1 |> select($(c : Car) => c.price) |> sum > 50000)
                             ._select((Brand = _._0, N = _._1 |> length))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._having(_._1 |> _select(_.price) |> sum > 50000)
+                            ._select((Brand = _._0, N = _._1 |> length))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -64,3 +82,8 @@ def groupby_having_hidden_sum_m3(b : B?) {
 def groupby_having_hidden_sum_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_having_hidden_sum_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_max.das b/benchmarks/sql/groupby_max.das
index a6faf71237..470bba5e2d 100644
--- a/benchmarks/sql/groupby_max.das
+++ b/benchmarks/sql/groupby_max.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0,
                                               MaxPrice = _._1 |> select($(c : Car) => c.price) |> max())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0,
                                                        MaxPrice = _._1 |> select($(c : Car) => c.price) |> max())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -41,6 +43,22 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       MaxPrice = _._1 |> select($(c : Car) => c.price) |> max()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      MaxPrice = _._1 |> _select(_.price) |> max()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -61,3 +79,8 @@ def groupby_max_m3(b : B?) {
 def groupby_max_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_max_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_min.das b/benchmarks/sql/groupby_min.das
index c6e4935e80..7a12afb468 100644
--- a/benchmarks/sql/groupby_min.das
+++ b/benchmarks/sql/groupby_min.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0,
                                               MinPrice = _._1 |> select($(c : Car) => c.price) |> min())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0,
                                                        MinPrice = _._1 |> select($(c : Car) => c.price) |> min())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -41,6 +43,22 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       MinPrice = _._1 |> select($(c : Car) => c.price) |> min()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      MinPrice = _._1 |> _select(_.price) |> min()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -61,3 +79,8 @@ def groupby_min_m3(b : B?) {
 def groupby_min_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_min_m4(b : B?) {
+    b |> run_m4(100000) // pipe form of run_m4(b, 100000), i.e. n = 100000
+}
diff --git a/benchmarks/sql/groupby_multi_reducer.das b/benchmarks/sql/groupby_multi_reducer.das
index 5baaa3264f..367c64f6de 100644
--- a/benchmarks/sql/groupby_multi_reducer.das
+++ b/benchmarks/sql/groupby_multi_reducer.das
@@ -18,6 +18,7 @@ def run_m1(b : B?; n : int) {
                                               N = _._1 |> length,
                                               TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum(),
                                               MaxPrice   = _._1 |> select($(c : Car) => c.price) |> max())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -32,6 +33,7 @@ def run_m3(b : B?; n : int) {
                                                        N = _._1 |> length,
                                                        TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum(),
                                                        MaxPrice   = _._1 |> select($(c : Car) => c.price) |> max())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -47,6 +49,24 @@ def run_m3f(b : B?; n : int) {
                                       TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum(),
                                       MaxPrice   = _._1 |> select($(c : Car) => c.price) |> max()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      N = _._1 |> length,
+                                      TotalPrice = _._1 |> _select(_.price) |> sum(),
+                                      MaxPrice   = _._1 |> _select(_.price) |> max()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -67,3 +87,8 @@ def groupby_multi_reducer_m3(b : B?) {
 def groupby_multi_reducer_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_multi_reducer_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/groupby_select_sum.das b/benchmarks/sql/groupby_select_sum.das
index 56fc3117c2..b9e49ec20d 100644
--- a/benchmarks/sql/groupby_select_sum.das
+++ b/benchmarks/sql/groupby_select_sum.das
@@ -12,6 +12,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._select(_.price)._group_by(_ % 100)._select((K = _._0, S = _._1 |> sum())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -25,6 +26,22 @@ def run_m3f(b : B?; n : int) {
                             ._group_by(_ % 100)
                             ._select((K = _._0, S = _._1 |> sum()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._select(_.price)
+                            ._group_by(_ % 100)
+                            ._select((K = _._0, S = _._1 |> sum()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -40,3 +57,8 @@ def groupby_select_sum_m3(b : B?) {
 def groupby_select_sum_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_select_sum_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/groupby_sum.das b/benchmarks/sql/groupby_sum.das
index 52a256af29..ffacbd4d37 100644
--- a/benchmarks/sql/groupby_sum.das
+++ b/benchmarks/sql/groupby_sum.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0,
                                               TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._group_by(_.brand)._select((Brand = _._0,
                                                        TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -41,6 +43,22 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      TotalPrice = _._1 |> _select(_.price) |> sum()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -61,3 +79,8 @@ def groupby_sum_m3(b : B?) {
 def groupby_sum_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_sum_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/groupby_where_count.das b/benchmarks/sql/groupby_where_count.das
index 658e5c739a..c66567a1c7 100644
--- a/benchmarks/sql/groupby_where_count.das
+++ b/benchmarks/sql/groupby_where_count.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
                                   |> _where(_.price > 500)
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0, N = _._1 |> length)))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._where(_.price > 500)._group_by(_.brand)._select((Brand = _._0, N = _._1 |> length)))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -40,6 +42,22 @@ def run_m3f(b : B?; n : int) {
                             ._group_by(_.brand)
                             ._select((Brand = _._0, N = _._1 |> length))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._where(_.price > 500)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0, N = _._1 |> length))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -60,3 +78,8 @@ def groupby_where_count_m3(b : B?) {
 def groupby_where_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_where_count_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/groupby_where_sum.das b/benchmarks/sql/groupby_where_sum.das
index 07fa479258..843e68ffc2 100644
--- a/benchmarks/sql/groupby_where_sum.das
+++ b/benchmarks/sql/groupby_where_sum.das
@@ -17,6 +17,7 @@ def run_m1(b : B?; n : int) {
                                   |> _group_by(_.brand)
                                   |> _select((Brand = _._0,
                                               TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum())))
+            b |> accept(groups)
             if (empty(groups)) {
                 b->failNow()
             }
@@ -29,6 +30,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         let groups <- (arr._where(_.price > 500)._group_by(_.brand)._select((Brand = _._0,
                                                                               TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum())))
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -43,6 +45,23 @@ def run_m3f(b : B?; n : int) {
                             ._select((Brand = _._0,
                                       TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum()))
                             .to_array())
+        b |> accept(groups)
+        if (empty(groups)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let groups <- _fold(from_decs_template(type<DecsCar>)
+                            ._where(_.price > 500)
+                            ._group_by(_.brand)
+                            ._select((Brand = _._0,
+                                      TotalPrice = _._1 |> _select(_.price) |> sum()))
+                            .to_array())
+        b |> accept(groups)
         if (empty(groups)) {
             b->failNow()
         }
@@ -63,3 +82,8 @@ def groupby_where_sum_m3(b : B?) {
 def groupby_where_sum_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def groupby_where_sum_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/indexed_lookup.das b/benchmarks/sql/indexed_lookup.das
index 88aad127bf..afb4083dc9 100644
--- a/benchmarks/sql/indexed_lookup.das
+++ b/benchmarks/sql/indexed_lookup.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         let key = n / 2
         b |> run("m1_sql/{n}") {
             let c = _sql(db |> select_from(type<Car>) |> _where(_.id == key) |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -28,6 +29,7 @@ def run_m3(b : B?; n : int) {
     let key = n / 2
     b |> run("m3_array/{n}") {
         let c = arr |> _where(_.id == key) |> count()
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -40,6 +42,7 @@ def run_m3f(b : B?; n : int) {
     let key = n / 2
     b |> run("m3f_array_fold/{n}") {
         let c = _fold(each(arr)._where(_.id == key).count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
diff --git a/benchmarks/sql/join_count.das b/benchmarks/sql/join_count.das
index aa4b275ac5..5c9af14664 100644
--- a/benchmarks/sql/join_count.das
+++ b/benchmarks/sql/join_count.das
@@ -17,6 +17,7 @@ def run_m3(b : B?; n : int) {
                                $(c : Car, d : Dealer) => c.dealer_id == d.id,
                                $(c : Car, d : Dealer) => (c.name, d.name))
                       |> count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -30,6 +31,7 @@ def run_m3f(b : B?; n : int) {
                                     $(c : Car, d : Dealer) => c.dealer_id == d.id,
                                     $(c : Car, d : Dealer) => (c.name, d.name))
                            |> count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
diff --git a/benchmarks/sql/last_match.das b/benchmarks/sql/last_match.das
index cc33f9a939..9a29f0b42f 100644
--- a/benchmarks/sql/last_match.das
+++ b/benchmarks/sql/last_match.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD)
                               |> _order_by_descending(_.id) |> _first())
+            b |> accept(row)
             if (row.price == 0) {
                 b->failNow()
             }
@@ -26,6 +27,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _where(_.price > THRESHOLD) |> last()
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -35,6 +37,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._where(_.price > THRESHOLD).last())
+        b |> accept(row)
+        if (row.price == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).last())
+        b |> accept(row)
         if (row.price == 0) {
             b->failNow()
         }
@@ -55,3 +69,8 @@ def last_match_m3(b : B?) {
 def last_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def last_match_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/long_count_aggregate.das b/benchmarks/sql/long_count_aggregate.das
index 3515e73835..627ca5f5f7 100644
--- a/benchmarks/sql/long_count_aggregate.das
+++ b/benchmarks/sql/long_count_aggregate.das
@@ -14,6 +14,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let c = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -25,6 +26,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let c = arr |> _where(_.price > THRESHOLD) |> long_count()
+        b |> accept(c)
         if (c == 0l) {
             b->failNow()
         }
@@ -34,6 +36,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let c = _fold(each(arr)._where(_.price > THRESHOLD).long_count())
+        b |> accept(c)
+        if (c == 0l) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).long_count())
+        b |> accept(c)
         if (c == 0l) {
             b->failNow()
         }
@@ -54,3 +68,8 @@ def long_count_aggregate_m3(b : B?) {
 def long_count_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def long_count_aggregate_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/max_aggregate.das b/benchmarks/sql/max_aggregate.das
index ff424e0f4c..50f4049db7 100644
--- a/benchmarks/sql/max_aggregate.das
+++ b/benchmarks/sql/max_aggregate.das
@@ -11,6 +11,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let m = _sql(db |> select_from(type<Car>) |> _select(_.price) |> max())
+            b |> accept(m)
             if (m == 0) {
                 b->failNow()
             }
@@ -22,6 +23,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let m = arr |> _select(_.price) |> max()
+        b |> accept(m)
         if (m == 0) {
             b->failNow()
         }
@@ -31,6 +33,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let m = _fold(each(arr)._select(_.price).max())
+        b |> accept(m)
+        if (m == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let m = _fold(from_decs_template(type<DecsCar>)._select(_.price).max())
+        b |> accept(m)
         if (m == 0) {
             b->failNow()
         }
@@ -51,3 +65,8 @@ def max_aggregate_m3(b : B?) {
 def max_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def max_aggregate_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/min_aggregate.das b/benchmarks/sql/min_aggregate.das
index f0bcbd37e9..efd61dbf90 100644
--- a/benchmarks/sql/min_aggregate.das
+++ b/benchmarks/sql/min_aggregate.das
@@ -11,6 +11,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let m = _sql(db |> select_from(type<Car>) |> _select(_.price) |> min())
+            b |> accept(m)
             if (m > 999) {
                 b->failNow()
             }
@@ -22,6 +23,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let m = arr |> _select(_.price) |> min()
+        b |> accept(m)
         if (m > 999) {
             b->failNow()
         }
@@ -31,6 +33,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let m = _fold(each(arr)._select(_.price).min())
+        b |> accept(m)
+        if (m > 999) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let m = _fold(from_decs_template(type<DecsCar>)._select(_.price).min())
+        b |> accept(m)
         if (m > 999) {
             b->failNow()
         }
@@ -51,3 +65,8 @@ def min_aggregate_m3(b : B?) {
 def min_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def min_aggregate_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/order_take_desc.das b/benchmarks/sql/order_take_desc.das
index c4b61c510f..ad87ceb792 100644
--- a/benchmarks/sql/order_take_desc.das
+++ b/benchmarks/sql/order_take_desc.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _order_by_descending(_.price) |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> _order_by_descending(_.price) |> take(TAKE_N))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -37,6 +39,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let rows <- _fold(each(arr)._order_by_descending(_.price).take(TAKE_N).to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>)._order_by_descending(_.price).take(TAKE_N).to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -57,3 +71,8 @@ def order_take_desc_m3(b : B?) {
 def order_take_desc_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def order_take_desc_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/reverse_take.das b/benchmarks/sql/reverse_take.das
index 4d8fe5eb67..79194c0822 100644
--- a/benchmarks/sql/reverse_take.das
+++ b/benchmarks/sql/reverse_take.das
@@ -19,6 +19,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _order_by_descending(_.id) |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -31,6 +32,7 @@ def run_m3(b : B?; n : int) {
     b |> run("m3_array/{n}", n) {
         unsafe {
             let rows <- (each(arr) |> reverse() |> take(TAKE_N) |> to_array())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -43,6 +45,20 @@ def run_m3f(b : B?; n : int) {
     b |> run("m3f_array_fold/{n}", n) {
         unsafe {
             let rows <- _fold(each(arr).reverse().take(TAKE_N).to_array())
+            b |> accept(rows)
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        unsafe {
+            let rows <- _fold(from_decs_template(type<DecsCar>).reverse().take(TAKE_N).to_array())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -64,3 +80,8 @@ def reverse_take_m3(b : B?) {
 def reverse_take_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def reverse_take_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/select_count.das b/benchmarks/sql/select_count.das
index 2291504e4a..87284405cf 100644
--- a/benchmarks/sql/select_count.das
+++ b/benchmarks/sql/select_count.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let c = _sql(db |> select_from(type<Car>) |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -26,6 +27,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let c = arr |> _select(_.price * 2) |> count()
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -35,6 +37,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let c = _fold(each(arr)._select(_.price * 2).count())
+        b |> accept(c)
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._select(_.price * 2).count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -55,3 +69,8 @@ def select_count_m3(b : B?) {
 def select_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def select_count_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/select_where.das b/benchmarks/sql/select_where.das
index 1aefe7dddc..4565a4e938 100644
--- a/benchmarks/sql/select_where.das
+++ b/benchmarks/sql/select_where.das
@@ -11,6 +11,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -23,6 +24,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> _where(_.price > THRESHOLD))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -34,6 +36,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let rows <- _fold(each(arr)._where(_.price > THRESHOLD).to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -54,3 +68,8 @@ def select_where_m3(b : B?) {
 def select_where_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def select_where_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/select_where_count.das b/benchmarks/sql/select_where_count.das
index 23587db9f6..7d9db90607 100644
--- a/benchmarks/sql/select_where_count.das
+++ b/benchmarks/sql/select_where_count.das
@@ -19,6 +19,7 @@ def run_m1(b : B?; n : int) {
             // SQL form folds projection into the WHERE filter — the engine evaluates
             // ``price * 2 > T`` per row and counts matches.
             let c = _sql(db |> select_from(type<Car>) |> _where(_.price * 2 > THRESHOLD) |> count())
+            b |> accept(c)
             if (c == 0) {
                 b->failNow()
             }
@@ -30,6 +31,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let c = arr |> _select(_.price * 2) |> _where(_ > THRESHOLD) |> count
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -40,6 +42,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let c = _fold(each(arr)._select(_.price * 2)._where(_ > THRESHOLD).count())
+        b |> accept(c)
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._select(_.price * 2)._where(_ > THRESHOLD).count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -60,3 +74,8 @@ def select_where_count_m3(b : B?) {
 def select_where_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def select_where_count_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/select_where_order_take.das b/benchmarks/sql/select_where_order_take.das
index ccf57c0454..de68360e48 100644
--- a/benchmarks/sql/select_where_order_take.das
+++ b/benchmarks/sql/select_where_order_take.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
                                 |> _where(_.price > THRESHOLD)
                                 |> _order_by(_.price)
                                 |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -29,6 +30,7 @@ def run_m3(b : B?; n : int) {
         let rows <- (arr |> _where(_.price > THRESHOLD)
                          |> _order_by(_.price)
                          |> take(TAKE_N))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -43,6 +45,21 @@ def run_m3f(b : B?; n : int) {
                                    ._order_by(_.price)
                                    .take(TAKE_N)
                                    .to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)
+                                   ._order_by(_.price)
+                                   .take(TAKE_N)
+                                   .to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -63,3 +80,8 @@ def select_where_order_take_m3(b : B?) {
 def select_where_order_take_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def select_where_order_take_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/select_where_sum.das b/benchmarks/sql/select_where_sum.das
index 0f693c9b2b..8bd80b73c2 100644
--- a/benchmarks/sql/select_where_sum.das
+++ b/benchmarks/sql/select_where_sum.das
@@ -26,6 +26,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let s = db |> query_scalar("SELECT SUM(price * 2) FROM Cars WHERE price * 2 > {THRESHOLD}", type<int>)
+            b |> accept(s)
             if (s == 0) {
                 b->failNow()
             }
@@ -37,6 +38,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let s = arr |> _select(_.price * 2) |> _where(_ > THRESHOLD) |> sum
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -47,6 +49,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let s = _fold(each(arr)._select(_.price * 2)._where(_ > THRESHOLD).sum())
+        b |> accept(s)
+        if (s == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let s = _fold(from_decs_template(type<DecsCar>)._select(_.price * 2)._where(_ > THRESHOLD).sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -67,3 +81,8 @@ def select_where_sum_m3(b : B?) {
 def select_where_sum_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def select_where_sum_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/single_match.das b/benchmarks/sql/single_match.das
index 863e71adad..d14ae1b4c7 100644
--- a/benchmarks/sql/single_match.das
+++ b/benchmarks/sql/single_match.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _where(_.id == TARGET_ID) |> _first())
+            b |> accept(row)
             if (row.id == 0) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _where(_.id == TARGET_ID) |> single()
+        b |> accept(row)
         if (row.id == 0) {
             b->failNow()
         }
@@ -36,6 +38,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._where(_.id == TARGET_ID).single())
+        b |> accept(row)
+        if (row.id == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._where(_.id == TARGET_ID).single())
+        b |> accept(row)
         if (row.id == 0) {
             b->failNow()
         }
@@ -56,3 +70,8 @@ def single_match_m3(b : B?) {
 def single_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def single_match_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/skip_take.das b/benchmarks/sql/skip_take.das
index 52e000837c..4319781100 100644
--- a/benchmarks/sql/skip_take.das
+++ b/benchmarks/sql/skip_take.das
@@ -16,6 +16,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> skip(SKIP_N) |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -27,6 +28,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> skip(SKIP_N) |> take(TAKE_N))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -36,6 +38,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let rows <- _fold(each(arr).skip(SKIP_N).take(TAKE_N).to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>).skip(SKIP_N).take(TAKE_N).to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -56,3 +70,8 @@ def skip_take_m3(b : B?) {
 def skip_take_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def skip_take_m4(b : B?) {
+    run_m4(b, 100000) // m4_decs_fold case at n = 100000, the same n the m3f wrapper passes
+}
diff --git a/benchmarks/sql/skip_while_match.das b/benchmarks/sql/skip_while_match.das
index 32ebd1bd30..05aa398538 100644
--- a/benchmarks/sql/skip_while_match.das
+++ b/benchmarks/sql/skip_while_match.das
@@ -20,6 +20,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let total = _sql(db |> select_from(type<Car>) |> _where(_.id >= THRESHOLD) |> count())
+            b |> accept(total)
             if (total == 0) {
                 b->failNow()
             }
@@ -31,6 +32,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let total = arr |> _skip_while(_.id < THRESHOLD) |> count()
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -40,6 +42,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let total = _fold(each(arr)._skip_while(_.id < THRESHOLD).count())
+        b |> accept(total)
+        if (total == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let total = _fold(from_decs_template(type<DecsCar>)._skip_while(_.id < THRESHOLD).count())
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -60,3 +74,8 @@ def skip_while_match_m3(b : B?) {
 def skip_while_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def skip_while_match_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/sort_first.das b/benchmarks/sql/sort_first.das
index 290be82624..204c394c04 100644
--- a/benchmarks/sql/sort_first.das
+++ b/benchmarks/sql/sort_first.das
@@ -12,6 +12,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let row = _sql(db |> select_from(type<Car>) |> _order_by(_.price) |> _first())
+            b |> accept(row)
             if (row.id == 0) {
                 b->failNow()
             }
@@ -23,6 +24,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let row = arr |> _order_by(_.price) |> first()
+        b |> accept(row)
         if (row.id == 0) {
             b->failNow()
         }
@@ -32,6 +34,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let row = _fold(each(arr)._order_by(_.price).first())
+        b |> accept(row)
+        if (row.id == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let row = _fold(from_decs_template(type<DecsCar>)._order_by(_.price).first())
+        b |> accept(row)
         if (row.id == 0) {
             b->failNow()
         }
@@ -52,3 +66,8 @@ def sort_first_m3(b : B?) {
 def sort_first_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def sort_first_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/sort_take.das b/benchmarks/sql/sort_take.das
index 5787d519c8..2aaa81a4a1 100644
--- a/benchmarks/sql/sort_take.das
+++ b/benchmarks/sql/sort_take.das
@@ -24,6 +24,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> _order_by(_.price) |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -35,6 +36,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> _order_by(_.price) |> take(TAKE_N))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -45,6 +47,20 @@ def run_m3f(b : B?; n : int) {
     b |> run("m3f_array_fold/{n}", n) {
         unsafe {
             let rows <- _fold(each(arr)._order_by(_.price).take(TAKE_N).to_array())
+            b |> accept(rows)
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        unsafe {
+            let rows <- _fold(from_decs_template(type<DecsCar>)._order_by(_.price).take(TAKE_N).to_array())
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -56,6 +72,7 @@ def run_m3_topn_array(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_topn_array/{n}", n) {
         let rows <- top_n_by(arr, TAKE_N, @@(c : Car -&) => c.price)
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -66,6 +83,7 @@ def run_m3_topn_iter(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_topn_iter/{n}", n) {
         let rows <- top_n_by(arr.to_sequence(), TAKE_N, @@(c : Car -&) => c.price)
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -96,3 +114,8 @@ def sort_take_m3_topn_array(b : B?) {
 def sort_take_m3_topn_iter(b : B?) {
     run_m3_topn_iter(b, 100000)
 }
+
+[benchmark]
+def sort_take_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/sum_aggregate.das b/benchmarks/sql/sum_aggregate.das
index a8999e6474..e67385aaf9 100644
--- a/benchmarks/sql/sum_aggregate.das
+++ b/benchmarks/sql/sum_aggregate.das
@@ -11,6 +11,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let s = _sql(db |> select_from(type<Car>) |> _select(_.price) |> sum())
+            b |> accept(s)
             if (s == 0) {
                 b->failNow()
             }
@@ -22,6 +23,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let s = arr |> _select(_.price) |> sum()
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -31,6 +33,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let s = _fold(each(arr)._select(_.price).sum())
+        b |> accept(s)
+        if (s == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let s = _fold(from_decs_template(type<DecsCar>)._select(_.price).sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -51,3 +65,8 @@ def sum_aggregate_m3(b : B?) {
 def sum_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def sum_aggregate_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/sum_where.das b/benchmarks/sql/sum_where.das
index 536fa47c02..638404c7ba 100644
--- a/benchmarks/sql/sum_where.das
+++ b/benchmarks/sql/sum_where.das
@@ -15,6 +15,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let s = _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> _select(_.price) |> sum())
+            b |> accept(s)
             if (s == 0) {
                 b->failNow()
             }
@@ -26,6 +27,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let s = (arr |> _where(_.price > THRESHOLD) |> _select(_.price) |> sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -35,6 +37,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let s = _fold(each(arr)._where(_.price > THRESHOLD)._select(_.price).sum())
+        b |> accept(s)
+        if (s == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let s = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)._select(_.price).sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -55,3 +69,8 @@ def sum_where_m3(b : B?) {
 def sum_where_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def sum_where_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/take_count.das b/benchmarks/sql/take_count.das
index fdf10f352e..e7f8c5eb87 100644
--- a/benchmarks/sql/take_count.das
+++ b/benchmarks/sql/take_count.das
@@ -14,6 +14,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let rows <- _sql(db |> select_from(type<Car>) |> take(TAKE_N))
+            b |> accept(rows)
             if (empty(rows)) {
                 b->failNow()
             }
@@ -25,6 +26,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let rows <- (arr |> take(TAKE_N))
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -34,6 +36,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let rows <- _fold(each(arr).take(TAKE_N).to_array())
+        b |> accept(rows)
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let rows <- _fold(from_decs_template(type<DecsCar>).take(TAKE_N).to_array())
+        b |> accept(rows)
         if (empty(rows)) {
             b->failNow()
         }
@@ -54,3 +68,8 @@ def take_count_m3(b : B?) {
 def take_count_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def take_count_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/take_count_filtered.das b/benchmarks/sql/take_count_filtered.das
index dce9967ace..d738307279 100644
--- a/benchmarks/sql/take_count_filtered.das
+++ b/benchmarks/sql/take_count_filtered.das
@@ -14,6 +14,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let c = arr |> _where(_.price > THRESHOLD) |> take(TAKE_N) |> count()
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -23,6 +24,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let c = _fold(each(arr)._where(_.price > THRESHOLD).take(TAKE_N).count())
+        b |> accept(c)
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let c = _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD).take(TAKE_N).count())
+        b |> accept(c)
         if (c == 0) {
             b->failNow()
         }
@@ -38,3 +51,8 @@ def take_count_filtered_m3(b : B?) {
 def take_count_filtered_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def take_count_filtered_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/take_sum_aggregate.das b/benchmarks/sql/take_sum_aggregate.das
index 738ee4b2c8..617b66c813 100644
--- a/benchmarks/sql/take_sum_aggregate.das
+++ b/benchmarks/sql/take_sum_aggregate.das
@@ -14,6 +14,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let s = arr |> _select(_.price) |> take(TAKE_N) |> sum()
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -23,6 +24,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let s = _fold(each(arr)._select(_.price).take(TAKE_N).sum())
+        b |> accept(s)
+        if (s == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let s = _fold(from_decs_template(type<DecsCar>)._select(_.price).take(TAKE_N).sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -38,3 +51,8 @@ def take_sum_aggregate_m3(b : B?) {
 def take_sum_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def take_sum_aggregate_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/take_while_match.das b/benchmarks/sql/take_while_match.das
index b6e7da9925..86d419ed2b 100644
--- a/benchmarks/sql/take_while_match.das
+++ b/benchmarks/sql/take_while_match.das
@@ -19,6 +19,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let total = _sql(db |> select_from(type<Car>) |> _where(_.id < THRESHOLD) |> count())
+            b |> accept(total)
             if (total == 0) {
                 b->failNow()
             }
@@ -30,6 +31,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let total = arr |> _take_while(_.id < THRESHOLD) |> count()
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -39,6 +41,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let total = _fold(each(arr)._take_while(_.id < THRESHOLD).count())
+        b |> accept(total)
+        if (total == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let total = _fold(from_decs_template(type<DecsCar>)._take_while(_.id < THRESHOLD).count())
+        b |> accept(total)
         if (total == 0) {
             b->failNow()
         }
@@ -59,3 +73,8 @@ def take_while_match_m3(b : B?) {
 def take_while_match_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def take_while_match_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/to_array_filter.das b/benchmarks/sql/to_array_filter.das
index d3b6a762a4..62e6b8eb6e 100644
--- a/benchmarks/sql/to_array_filter.das
+++ b/benchmarks/sql/to_array_filter.das
@@ -13,6 +13,7 @@ def run_m1(b : B?; n : int) {
         fixture_db(db, n)
         b |> run("m1_sql/{n}", n) {
             let prices <- _sql(db |> select_from(type<Car>) |> _where(_.price > THRESHOLD) |> _select(_.price))
+            b |> accept(prices)
             if (empty(prices)) {
                 b->failNow()
             }
@@ -24,6 +25,7 @@ def run_m3(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3_array/{n}", n) {
         let prices <- (arr |> _where(_.price > THRESHOLD) |> _select(_.price))
+        b |> accept(prices)
         if (empty(prices)) {
             b->failNow()
         }
@@ -33,6 +35,18 @@ def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
         let prices <- _fold(each(arr)._where(_.price > THRESHOLD)._select(_.price).to_array())
+        b |> accept(prices)
+        if (empty(prices)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m4(b : B?; n : int) {
+    fixture_decs(n)
+    b |> run("m4_decs_fold/{n}", n) {
+        let prices <- _fold(from_decs_template(type<DecsCar>)._where(_.price > THRESHOLD)._select(_.price).to_array())
+        b |> accept(prices)
         if (empty(prices)) {
             b->failNow()
         }
@@ -53,3 +67,8 @@ def to_array_filter_m3(b : B?) {
 def to_array_filter_m3f(b : B?) {
     run_m3f(b, 100000)
 }
+
+[benchmark]
+def to_array_filter_m4(b : B?) {
+    run_m4(b, 100000) // n = 100000; run_m4 labels the run "m4_decs_fold/{n}"
+}
diff --git a/benchmarks/sql/zip_dot_product.das b/benchmarks/sql/zip_dot_product.das
index b23aed7804..6d5dd33bce 100644
--- a/benchmarks/sql/zip_dot_product.das
+++ b/benchmarks/sql/zip_dot_product.das
@@ -21,6 +21,7 @@ def run_m3(b : B?; n : int) {
     let ys <- make_ints(n)
     b |> run("m3_array/{n}", n) {
         let s = (zip(xs, ys) |> _select(_._0 * _._1) |> sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
@@ -31,6 +32,7 @@ def run_m3f(b : B?; n : int) {
     let ys <- make_ints(n)
     b |> run("m3f_array_fold/{n}", n) {
         let s = _fold(zip(xs, ys)._select(_._0 * _._1).sum())
+        b |> accept(s)
         if (s == 0) {
             b->failNow()
         }
diff --git a/daslib/linq.das b/daslib/linq.das
index 33ecdaaa8c..e47989b446 100644
--- a/daslib/linq.das
+++ b/daslib/linq.das
@@ -854,6 +854,28 @@ def long_count(a : array<auto(TT)>) : int64 {
     return int64(length(a))
 }
 
+def long_count(var a : iterator<auto(TT)>; predicate : block<(arg : TT -&) : bool>) : int64 {
+    //! Counts the elements of an iterator for which the predicate returns true; the total is returned as int64
+    var count = 0l // int64 accumulator, matching the declared return type
+    for (it in a) {
+        if (predicate(it)) {
+            count ++
+        }
+    }
+    return count
+}
+
+def long_count(a : array<auto(TT)>; predicate : block<(arg : TT -&) : bool>) : int64 {
+    //! Counts the elements of an array for which the predicate returns true; the total is returned as int64
+    var count = 0l // int64 accumulator, matching the declared return type
+    for (it in a) {
+        if (predicate(it)) {
+            count ++
+        }
+    }
+    return count
+}
+
 [unused_argument(tt)]
 def private where_impl(var src; tt : auto(TT); len : int; predicate : block<(arg : TT -&) : bool>) : array<TT -& -const> {
     //! Filters elements in an iterator based on a predicate
@@ -1526,7 +1548,11 @@ def average(var src : iterator<auto(TT)>) : double {
     var total : double = 0lf
     var count : uint64 = 0ul
     for (x in src) {
-        total += double(x)
+        static_if (typeinfo stripped_typename(x) == typeinfo stripped_typename(default<double>)) {
+            total += x
+        } else {
+            total += double(x)
+        }
         count ++
     }
     return count != 0ul ? total / double(count) : 0lf
@@ -1538,7 +1564,11 @@ def average(src : array<auto(TT)>) : double {
     var total : double = 0lf
     var count : uint64 = 0ul
     for (x in src) {
-        total += double(x)
+        static_if (typeinfo stripped_typename(x) == typeinfo stripped_typename(default<double>)) {
+            total += x
+        } else {
+            total += double(x)
+        }
         count ++
     }
     return count != 0ul ? total / double(count) : 0lf
diff --git a/daslib/linq_boost.das b/daslib/linq_boost.das
index ecfcb9aec4..bb7b848dc8 100644
--- a/daslib/linq_boost.das
+++ b/daslib/linq_boost.das
@@ -197,6 +197,16 @@ class private LinqCount : AstCallMacro_LinqPred2 {
     override predName = "count"
 }
 
+[call_macro(name="_long_count")]
+class private LinqLongCount : AstCallMacro_LinqPred2 {
+    //! implements _long_count(iterator, expression) shorthand notation
+    //! that expands into long_count(iterator, $(_) => expression)
+    //! for example::
+    //!
+    //!   each(foo)._long_count(_ > 3)
+    override predName = "long_count" // name of the linq function the shorthand expands into (per the doc above)
+}
+
 [call_macro(name="_unique_by")]
 class private LinqUnique : AstCallMacro_LinqPred2 {
     //! implements _unique_by(iterator, expression) shorthand notation
diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index 47ec7f7e72..ded6544ee2 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -415,23 +415,60 @@ def private peel_each(var top : Expression?) : Expression? {
 
 [macro_function]
 def private finalize_invoke(var res : Expression?; at : LineInfo) : Expression? {
-    // Post-emit cleanup: stamp loc+generated for diagnostics; set can_shadow so gensym src coexists with user scope.
+    // Post-emit cleanup. Stamps `at` and the generated flag onto `res` (for diagnostics), then sets can_shadow on every argument of the invoked block so gensym'd source names can coexist with user scope. Looping over all arguments covers both the 1-source planners and the N-source zip emission.
     res.force_at(at)
     res.force_generated(true)
     let blk = (res as ExprInvoke).arguments[0] as ExprMakeBlock
-    (blk._block as ExprBlock).arguments[0].flags.can_shadow = true
+    var blkBlock = blk._block as ExprBlock
+    for (i in 0 .. length(blkBlock.arguments)) {
+        blkBlock.arguments[i].flags.can_shadow = true
+    }
     return res
 }
 
+[macro_function]
+def private finalize_lane_emission(var topExprs : array<Expression?>; srcNames : array<string>;
+                                   var bodyStmts : array<Expression?>; at : LineInfo) : Expression? {
+    // Wraps bodyStmts in the invoke(block, sources...) call shared by emit_accumulator_lane / emit_early_exit_lane. 1-source: clone the top expression, derive its block-parameter type, emit a 1-arg invoke. 2-source (zip): do the same for both sides and emit a 2-arg invoke. The caller must already have pushed the for-loop into bodyStmts; any other arity, or a topExprs/srcNames length mismatch, panics.
+    let nSrcs = length(srcNames)
+    if (nSrcs != 1 && nSrcs != 2) panic("finalize_lane_emission: only 1- or 2-source supported (got {nSrcs}); higher-arity zip planners must extend this branch (or build the invoke wrap directly)")
+    if (length(topExprs) != nSrcs) panic("finalize_lane_emission: topExprs length {length(topExprs)} != srcNames length {nSrcs}")
+    if (nSrcs == 1) {
+        var topExpr = clone_expression(topExprs[0])
+        topExpr.genFlags.alwaysSafe = true
+        var srcParamType = invoke_src_param_type(topExprs[0])
+        var res = qmacro(invoke($($i(srcNames[0]) : $t(srcParamType)) {
+            $b(bodyStmts)
+        }, $e(topExpr)))
+        return finalize_invoke(res, at)
+    }
+    var topAExpr = clone_expression(topExprs[0])
+    topAExpr.genFlags.alwaysSafe = true
+    var topBExpr = clone_expression(topExprs[1])
+    topBExpr.genFlags.alwaysSafe = true
+    var srcAType = invoke_src_param_type(topExprs[0])
+    var srcBType = invoke_src_param_type(topExprs[1])
+    var res = qmacro(invoke($($i(srcNames[0]) : $t(srcAType), $i(srcNames[1]) : $t(srcBType)) {
+        $b(bodyStmts)
+    }, $e(topAExpr), $e(topBExpr)))
+    return finalize_invoke(res, at)
+}
+
 [macro_function]
 def private emit_length_shortcut(opName : string; var top : Expression?; srcName : string; at : LineInfo) : Expression? {
-    // Count-shaped shortcut: emit `length(src)` (count) or `int64(length(src))` (long_count)
+    // Count-shaped shortcut: emits `int64(length(src))` for long_count and plain `length(src)` for any other opName (i.e. count). length already returns int, so wrapping it in int() would be a redundant cast — interp doesn't fold it and PERF020 fires.
     var topExpr = clone_expression(top)
     topExpr.genFlags.alwaysSafe = true
-    let castName = opName == "long_count" ? "int64" : "int"
-    var res = qmacro(invoke($($i(srcName) : typedecl($e(topExpr))) {
-        return $c(castName)(length($i(srcName)))
-    }, $e(topExpr)))
+    var res : Expression?
+    if (opName == "long_count") {
+        res = qmacro(invoke($($i(srcName) : typedecl($e(topExpr))) {
+            return int64(length($i(srcName)))
+        }, $e(topExpr)))
+    } else {
+        res = qmacro(invoke($($i(srcName) : typedecl($e(topExpr))) {
+            return length($i(srcName))
+        }, $e(topExpr)))
+    }
     return finalize_invoke(res, at)
 }
 
@@ -658,13 +695,14 @@ def private emit_array_lane(var top : Expression?; var expr : Expression?; var l
 [macro_function]
 def private emit_accumulator_lane(
                                   opName : string;
-                                  var top : Expression?;
+                                  var topExprs : array<Expression?>;
                                   var projection : Expression?;
                                   var whereCond : Expression?;
                                   var intermediateBinds : array<Expression?>;
                                   var preCondStmts : array<Expression?>;
                                   var elementType : TypeDeclPtr;
-                                  srcName, accName, itName, skipName, takeCountName, skippingName : string;
+                                  srcNames : array<string>;
+                                  accName, itName, skipName, takeCountName, skippingName : string;
                                   var skipExpr, takeExpr, skipWhileCond, takeWhileCond : Expression?;
                                   at : LineInfo
                                   ) : Expression? {
@@ -730,8 +768,15 @@ def private emit_accumulator_lane(
             $i(accName) += $e(valueExpr)
             $i(cntName) ++
         }
-        returnExpr = qmacro_expr() {
-            double($i(accName)) / double($i(cntName))
+        // Skip the double(accName) when accType is already double (interp doesn't fold redundant casts — would trip PERF020 and slow the inner loop tail).
+        if (accType != null && accType.baseType == Type.tDouble) {
+            returnExpr = qmacro_expr() {
+                $i(accName) / double($i(cntName))
+            }
+        } else {
+            returnExpr = qmacro_expr() {
+                double($i(accName)) / double($i(cntName))
+            }
         }
     } elif (opName == "min" || opName == "max") {
         preludeStmts <- qmacro_block_to_array() {
@@ -766,21 +811,24 @@ def private emit_accumulator_lane(
     for (s in preludeStmts) {
         bodyStmts |> push(s)
     }
-    bodyStmts |> push <| qmacro_expr() {
-        for ($i(itName) in $i(srcName)) {
-            $e(loopBody)
+    // For-loop emission: single-source uses one iter var (itName); 2-source (zip) uses literal `itA, itB` parallel iter vars (qmacro for-loop iter-var position doesn't accept $i(...) splice). Caller (plan_zip) threads `let it = (itA, itB)` via preCondStmts so itName resolves inside the loop body.
+    if (length(srcNames) == 1) {
+        bodyStmts |> push <| qmacro_expr() {
+            for ($i(itName) in $i(srcNames[0])) {
+                $e(loopBody)
+            }
+        }
+    } else {
+        bodyStmts |> push <| qmacro_expr() {
+            for (itA, itB in $i(srcNames[0]), $i(srcNames[1])) {
+                $e(loopBody)
+            }
         }
     }
     bodyStmts |> push <| qmacro_expr() {
         return $e(returnExpr)
     }
-    var topExpr = clone_expression(top)
-    topExpr.genFlags.alwaysSafe = true
-    var srcParamType = invoke_src_param_type(top)
-    var res = qmacro(invoke($($i(srcName) : $t(srcParamType)) {
-        $b(bodyStmts)
-    }, $e(topExpr)))
-    return finalize_invoke(res, at)
+    return finalize_lane_emission(topExprs, srcNames, bodyStmts, at)
 }
 
 [macro_function]
@@ -797,14 +845,15 @@ def private emit_any_empty_shortcut(var top : Expression?; srcName : string; at
 [macro_function]
 def private emit_early_exit_lane(
                                  opName : string;
-                                 var top : Expression?;
+                                 var topExprs : array<Expression?>;
                                  var projection : Expression?;
                                  var whereCond : Expression?;
                                  var intermediateBinds : array<Expression?>;
                                  var preCondStmts : array<Expression?>;
                                  var elementType : TypeDeclPtr;
                                  terminatorCall : ExprCall?;
-                                 srcName, itName, skipName, takeCountName, skippingName : string;
+                                 srcNames : array<string>;
+                                 itName, skipName, takeCountName, skippingName : string;
                                  var skipExpr, takeExpr, skipWhileCond, takeWhileCond : Expression?;
                                  at : LineInfo
                                  ) : Expression? {
@@ -1122,21 +1171,24 @@ def private emit_early_exit_lane(
     for (s in preludeStmts) {
         bodyStmts |> push(s)
     }
-    bodyStmts |> push <| qmacro_expr() {
-        for ($i(itName) in $i(srcName)) {
-            $e(loopBody)
+    // For-loop emission: 1-source uses itName; 2-source (zip) uses literal `itA, itB` (qmacro for-iter-var position doesn't accept $i splice). Caller (plan_zip) threads `let it = (itA, itB)` via preCondStmts.
+    if (length(srcNames) == 1) {
+        bodyStmts |> push <| qmacro_expr() {
+            for ($i(itName) in $i(srcNames[0])) {
+                $e(loopBody)
+            }
+        }
+    } else {
+        bodyStmts |> push <| qmacro_expr() {
+            for (itA, itB in $i(srcNames[0]), $i(srcNames[1])) {
+                $e(loopBody)
+            }
         }
     }
     for (s in tailStmts) {
         bodyStmts |> push(s)
     }
-    var topExpr = clone_expression(top)
-    topExpr.genFlags.alwaysSafe = true
-    var srcParamType = invoke_src_param_type(top)
-    var res = qmacro(invoke($($i(srcName) : $t(srcParamType)) {
-        $b(bodyStmts)
-    }, $e(topExpr)))
-    return finalize_invoke(res, at)
+    return finalize_lane_emission(topExprs, srcNames, bodyStmts, at)
 }
 
 [macro_function]
@@ -1195,6 +1247,13 @@ def private order_top_n_call_name(orderName : string) : string {
     return ""
 }
 
+def private order_min_call_name(orderName : string; hasKey : bool) : string {
+    // Picks the helper that replaces `order* + first`: descending orders take the maximum, every other order name the minimum; a key selector switches to the *_by variant.
+    let descending = orderName == "order_by_descending" || orderName == "order_descending"
+    if (descending) return hasKey ? "max_by" : "max"
+    return hasKey ? "min_by" : "min"
+}
+
 [macro_function]
 def private try_make_inline_cmp(orderKey : Expression?; orderName : string;
                                 elemType : TypeDeclPtr; at : LineInfo) : Expression? {
@@ -1236,6 +1295,8 @@ def private plan_order_family(var expr : Expression?) : Expression? {
     var orderKey : Expression?
     var orderElemType : TypeDeclPtr
     var takeExpr : Expression?
+    var firstName : string
+    var firstDefaultExpr : Expression?
     var hasOrder = false
     let at = calls[0]._0.at
     let itName = "`it`{at.line}`{at.column}"
@@ -1253,17 +1314,28 @@ def private plan_order_family(var expr : Expression?) : Expression? {
         } elif (name == "order" || name == "order_descending"
                 || name == "order_by" || name == "order_by_descending") {
             if (hasOrder) return null
+            // bail on `order(arr, cmp)` / `order_descending(arr, cmp)` — splice helpers (min/max/top_n) can't honor a user-supplied comparator and would silently drop it.
+            let argCount = cll._0.arguments |> length
+            if ((name == "order" || name == "order_descending") && argCount >= 2) return null
             hasOrder = true
             orderName = name
-            if ((cll._0.arguments |> length) >= 2) {
+            if (argCount >= 2) {
                 orderKey = clone_expression(cll._0.arguments[1])
             }
             orderElemType = clone_type(cll._0._type.firstType)
         } elif (name == "take") {
-            if (!hasOrder || takeExpr != null) return null
+            if (!hasOrder || takeExpr != null || firstName != "") return null
             var arg = cll._0.arguments[1]
             if (arg == null || arg._type == null || arg._type.baseType != Type.tInt) return null
             takeExpr = clone_expression(arg)
+        } elif (name == "first" || name == "first_or_default") {
+            // order + first → min/max (O(N) instead of sort + index). Must be terminal.
+            if (!hasOrder || takeExpr != null || firstName != "" || i != length(calls) - 1) return null
+            firstName = name
+            if (name == "first_or_default") {
+                if ((cll._0.arguments |> length) < 2) return null
+                firstDefaultExpr = clone_expression(cll._0.arguments[1])
+            }
         } else {
             return null
         }
@@ -1278,12 +1350,51 @@ def private plan_order_family(var expr : Expression?) : Expression? {
     if (hasKey) {
         inlineCmp = try_make_inline_cmp(orderKey, orderName, orderElemType, at)
     }
+    let minMaxName = order_min_call_name(orderName, hasKey)
     if (whereCond == null) {
         // No prefilter — direct call to daslib helper.
         var topExpr = clone_expression(top)
         topExpr.genFlags.alwaysSafe = true
         var emission : Expression?
-        if (takeExpr == null) {
+        if (firstName == "first") {
+            // order + first → preserve eager `first()` panic-on-empty. min/max return an uninitialized ref on empty, so wrap in an empty-guard for arrays (zero alloc, O(N) min scan), or use top_n*(_, 1, _) |> first() for iterators (n=1 bounded heap; first() panics on empty).
+            if (top._type.isGoodArrayType) {
+                var srcParamType = invoke_src_param_type(top)
+                let firstSrcName = "`first_src`{at.line}`{at.column}"
+                var minMaxCall : Expression?
+                if (hasKey) {
+                    minMaxCall = qmacro($c(minMaxName)($i(firstSrcName), $e(orderKey)))
+                } else {
+                    minMaxCall = qmacro($c(minMaxName)($i(firstSrcName)))
+                }
+                emission = qmacro(invoke($($i(firstSrcName) : $t(srcParamType)) {
+                    panic("sequence contains no elements") if (empty($i(firstSrcName)))
+                    return $e(minMaxCall)
+                }, $e(topExpr)))
+                emission = finalize_invoke(emission, at)
+            } else {
+                var topNCall : Expression?
+                if (inlineCmp != null) {
+                    topNCall = qmacro(_::top_n_by_with_cmp($e(topExpr), 1, $e(inlineCmp)))
+                } elif (hasKey) {
+                    topNCall = qmacro($c(topNName)($e(topExpr), 1, $e(orderKey)))
+                } else {
+                    topNCall = qmacro($c(topNName)($e(topExpr), 1))
+                }
+                emission = qmacro(_::first($e(topNCall)))
+            }
+        } elif (firstName == "first_or_default") {
+            // No min_by_or_default exists; route through top_n*(_, 1, _) which returns an array (empty or single-element), then first_or_default supplies the default.
+            var topNCall : Expression?
+            if (inlineCmp != null) {
+                topNCall = qmacro(_::top_n_by_with_cmp($e(topExpr), 1, $e(inlineCmp)))
+            } elif (hasKey) {
+                topNCall = qmacro($c(topNName)($e(topExpr), 1, $e(orderKey)))
+            } else {
+                topNCall = qmacro($c(topNName)($e(topExpr), 1))
+            }
+            emission = qmacro(_::first_or_default($e(topNCall), $e(firstDefaultExpr)))
+        } elif (takeExpr == null) {
             // Bare order family — emit the direct call. Same shape as plain LINQ, but via
             if (inlineCmp != null) {
                 // Inlined comparator dispatches to the asc `order(src, block)` overload —
@@ -1304,7 +1415,7 @@ def private plan_order_family(var expr : Expression?) : Expression? {
             }
         }
         // Wrap with to_sequence_move only when emission is array-shaped: take dispatches to
-        let emissionIsArray = takeExpr != null || top._type.isGoodArrayType
+        let emissionIsArray = takeExpr != null || (firstName == "" && top._type.isGoodArrayType)
         if (needIterWrap && emissionIsArray) {
             emission = qmacro($e(emission).to_sequence_move())
         }
@@ -1337,7 +1448,34 @@ def private plan_order_family(var expr : Expression?) : Expression? {
             $e(loopBody)
         }
     }
-    if (takeExpr == null) {
+    if (firstName == "first") {
+        // where + order + first → min/max on prefilter buffer. Empty buf must panic to match eager `first()` semantics; min/max return uninitialized refs on empty.
+        stmts |> push <| qmacro_expr() {
+            panic("sequence contains no elements") if (empty($i(bufName)))
+        }
+        var minMaxCall : Expression?
+        if (hasKey) {
+            minMaxCall = qmacro($c(minMaxName)($i(bufName), $e(orderKey)))
+        } else {
+            minMaxCall = qmacro($c(minMaxName)($i(bufName)))
+        }
+        stmts |> push <| qmacro_expr() {
+            return $e(minMaxCall)
+        }
+    } elif (firstName == "first_or_default") {
+        // No min_by_or_default helper exists; route through top_n*(_, 1, _) + first_or_default for the empty-buf case.
+        var topNCall : Expression?
+        if (inlineCmp != null) {
+            topNCall = qmacro(_::top_n_by_with_cmp($i(bufName), 1, $e(inlineCmp)))
+        } elif (hasKey) {
+            topNCall = qmacro($c(topNName)($i(bufName), 1, $e(orderKey)))
+        } else {
+            topNCall = qmacro($c(topNName)($i(bufName), 1))
+        }
+        stmts |> push <| qmacro_expr() {
+            return _::first_or_default($e(topNCall), $e(firstDefaultExpr))
+        }
+    } elif (takeExpr == null) {
         // Sort the prefilter buffer in place and return it. order*_inplace is void
         var sortCall : Expression?
         if (inlineCmp != null) {
@@ -1511,10 +1649,15 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
             && type_has_length(top._type))
         return emit_length_shortcut(lastName, top, srcName, at)
     // Ring 1: accumulator lane builds its own per-op loop body (typed accumulator, optional
-    if (lane == LinqLane.ACCUMULATOR)
-        return emit_accumulator_lane(lastName, top, projection, whereCond,
-            intermediateBinds, preCondStmts, elementType, srcName, accName, itName, skipName, takeCountName,
+    if (lane == LinqLane.ACCUMULATOR) {
+        var laneTops : array<Expression?>
+        laneTops |> push(top)
+        var laneSrcs : array<string>
+        laneSrcs |> push(srcName)
+        return emit_accumulator_lane(lastName, laneTops, projection, whereCond,
+            intermediateBinds, preCondStmts, elementType, laneSrcs, accName, itName, skipName, takeCountName,
             skippingName, skipExpr, takeExpr, skipWhileCond, takeWhileCond, at)
+    }
     // Ring 2: early-exit lane — `any` no-pred + no upstream work + no limits + length-bearing
     if (lane == LinqLane.EARLY_EXIT) {
         let terminatorCall = calls.back()._0
@@ -1522,8 +1665,12 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
         if (isAnyNoPred && whereCond == null && allProjectionsPure && noLimits
                 && type_has_length(top._type))
             return emit_any_empty_shortcut(top, srcName, at)
-        return emit_early_exit_lane(lastName, top, projection, whereCond,
-            intermediateBinds, preCondStmts, elementType, terminatorCall, srcName, itName, skipName,
+        var laneTops : array<Expression?>
+        laneTops |> push(top)
+        var laneSrcs : array<string>
+        laneSrcs |> push(srcName)
+        return emit_early_exit_lane(lastName, laneTops, projection, whereCond,
+            intermediateBinds, preCondStmts, elementType, terminatorCall, laneSrcs, itName, skipName,
             takeCountName, skippingName, skipExpr, takeExpr, skipWhileCond, takeWhileCond, at)
     }
     // Build the per-element loop body for COUNTER / ARRAY. Both lanes follow the same shape:
@@ -2882,6 +3029,7 @@ struct private DecsBridgeShape {
     forExpr     : ExpressionPtr     // cloned ExprFor — inner multi-iter for-loop; body replaced when splicing
     iterNames   : array<string>     // bridge's iter var names (prefixed component names)
     userNames   : array<string>     // user-facing field names from the push tuple — feed the named-tuple bind that lets the user's `_.userName` chain access work
+    elementType : TypeDeclPtr       // named-tuple type (from resVar._type.firstType) — used by to_array/first when no projection
 }
 
 [macro_function]
@@ -2973,11 +3121,122 @@ def private extract_decs_bridge(var top : Expression?) : DecsBridgeShape? {
         archName := archName,
         forExpr = clone_expression(forExpr),
         iterNames <- iterNames,
-        userNames <- userNames
+        userNames <- userNames,
+        elementType = clone_type(resVar._type.firstType)
+    )
+    return info
+}
+
+[macro_function]
+def private build_decs_tup_bind(bridge : DecsBridgeShape?; tupName : string; at : LineInfo) : Expression? {
+    // Returns the statement `var <tupName> = (userNames[0]=iterNames[0], userNames[1]=iterNames[1], ...)` — a named tuple with one field per bridge.userNames entry, each initialized from the same-index bridge.iterNames variable. Chain lambdas are later folded against tupName (fold_linq_cond), turning `_.userName` into `tup.userName`.
+    var mkTup = new ExprMakeTuple(at = at)
+    mkTup.recordNames |> resize(length(bridge.userNames))
+    for (i in 0 .. length(bridge.userNames)) { // also indexes iterNames — assumes it is at least as long as userNames (TODO confirm in extract_decs_bridge)
+        mkTup.recordNames[i] := bridge.userNames[i]
+        mkTup.values |> emplace_new(new ExprVar(at = at, name := bridge.iterNames[i]))
+    }
+    return qmacro_expr() {
+        var $i(tupName) = $e(mkTup)
+    }
+}
+
+[macro_function]
+def private build_decs_inner_for(bridge : DecsBridgeShape?; var tupBind : Expression?; var body : Expression?; at : LineInfo) : Expression? {
+    // Returns a clone of bridge.forExpr (the inner multi-iter for-loop) whose body is replaced by a fresh block holding [tupBind, body]; bridge.forExpr itself is not modified. The clone keeps the bridge's iter vars + get_ro sources, which reference bridge.archName, so callers must name the archetype block argument bridge.archName.
+    var forBodyStmts <- [tupBind, body]
+    var forBody = stmts_to_expr(forBodyStmts)
+    var clonedForExpr = clone_expression(bridge.forExpr)
+    var clonedFor = clonedForExpr as ExprFor // typed view used only to reach `.body`; the Expression? handle is what gets returned
+    var newForBody = new ExprBlock(at = at)
+    newForBody.list |> push(forBody)
+    clonedFor.body = newForBody
+    return clonedForExpr
+}
+
+struct private DecsChainInfo {
+    bindAt      : array<string>    // bindAt[i] = bind name that chain op i folds its lambda against (tupName until a `select` introduces a new bind)
+    finalBind   : string            // bind name in effect after the last chain op — terminators read the element through it
+    finalType   : TypeDeclPtr       // element type after the full chain, with the constant and ref flags cleared
+    selectCount : int               // number of `select` ops in the chain; 0 means finalBind == tupName
+}
+
+[macro_function]
+def private compute_decs_chain_info(var calls : array<tuple<ExprCall?; LinqCall?>>;
+                                    intermediateEnd : int;
+                                    tupName : string;
+                                    bridge : DecsBridgeShape?;
+                                    at : LineInfo) : DecsChainInfo? {
+    // Walks chain ops calls[0 .. intermediateEnd) and records, per op, the bind name its lambda must be folded against. Starts at tupName / bridge.elementType; each `select` introduces a fresh `decs_sel{N}` bind typed from its folded projection, `where_` leaves bind and type unchanged. Returns null on any other op, or when a select's projection fails to fold or has no type.
+    var info = new DecsChainInfo(
+        finalBind := tupName,
+        finalType = clone_type(bridge.elementType),
+        selectCount = 0
     )
+    info.bindAt |> reserve(intermediateEnd)
+    var curBind = tupName
+    var curType : TypeDeclPtr = clone_type(bridge.elementType)
+    for (i in 0 .. intermediateEnd) {
+        info.bindAt |> push(curBind) // bind visible to op i, recorded before a select at i replaces curBind
+        var cll & = unsafe(calls[i])
+        let opName = cll._1.name
+        if (opName == "select") {
+            info.selectCount ++
+            curBind = "`decs_sel`{at.line}`{at.column}`{info.selectCount}"
+            var peeled = fold_linq_cond(cll._0.arguments[1], info.bindAt[i]) // folded here only for its type; wrap_decs_chain folds the same lambda again for emission
+            if (peeled == null || peeled._type == null) return null
+            curType = clone_type(peeled._type)
+        } elif (opName != "where_") return null
+    }
+    info.finalBind := curBind
+    info.finalType = curType
+    if (info.finalType != null) {
+        info.finalType.flags.constant = false // emitters declare `var` locals of this type, so strip const/ref
+        info.finalType.flags.ref = false
+    }
     return info
 }
 
+[macro_function]
+def private wrap_decs_chain(var action : Expression?;
+                            chainInfo : DecsChainInfo?;
+                            var calls : array<tuple<ExprCall?; LinqCall?>>;
+                            intermediateEnd : int;
+                            at : LineInfo) : Expression? {
+    // Wraps `action` in chain ops calls[0 .. intermediateEnd), walking last-to-first so the first op ends up outermost: `where_` becomes `if (pred) { <inner> }`, `select` becomes `let <next bind> = proj` followed by <inner>. Each lambda is folded against chainInfo.bindAt[i]. Returns null if a fold fails or an op is neither where_ nor select.
+    var current = action
+    for (rev in 0 .. intermediateEnd) {
+        let i = intermediateEnd - 1 - rev // real chain index, counting down from the last op
+        var cll & = unsafe(calls[i])
+        let opName = cll._1.name
+        let bindHere = chainInfo.bindAt[i]
+        if (opName == "where_") {
+            var pred = fold_linq_cond(cll._0.arguments[1], bindHere)
+            if (pred == null) return null
+            current = qmacro_expr() {
+                if ($e(pred)) {
+                    $e(current)
+                }
+            }
+        } elif (opName == "select") {
+            var proj = fold_linq_cond(cll._0.arguments[1], bindHere)
+            if (proj == null) return null
+            let nextBind = (i + 1 < intermediateEnd) ? chainInfo.bindAt[i + 1] : chainInfo.finalBind // the last chain op binds to finalBind; otherwise to the bind the following op reads
+            var letStmt = qmacro_expr() {
+                let $i(nextBind) = $e(proj)
+            }
+            var stmts : array<Expression?>
+            stmts |> reserve(2)
+            stmts |> push(letStmt)
+            stmts |> push(current)
+            current = stmts_to_expr(stmts)
+        } else {
+            return null
+        }
+    }
+    return current
+}
+
 [macro_function]
 def private emit_decs_count_archsize(bridge : DecsBridgeShape?; at : LineInfo) : Expression? {
     // Bare count(): no chain ops, sum arch.size per archetype — skips the per-entity walk entirely.
@@ -3000,52 +3259,64 @@ def private emit_decs_count_archsize(bridge : DecsBridgeShape?; at : LineInfo) :
 [macro_function]
 def private emit_decs_accumulator(bridge : DecsBridgeShape?;
                                   opName : string;
-                                  var projection : Expression?;
-                                  var whereCond : Expression?;
+                                  chainInfo : DecsChainInfo?;
+                                  var calls : array<tuple<ExprCall?; LinqCall?>>;
+                                  intermediateEnd : int;
+                                  terminatorCall : ExprCall?;
                                   var accType : TypeDeclPtr;
                                   at : LineInfo) : Expression? {
-    // Slice 2 accumulator emission: count / long_count / sum with optional _where + single _select chain ops.
+    // Slice 2/3a/4 accumulator emission: count / long_count / sum / min / max / average with chained _select + interleaved _where + optional _count(pred).
     let accName = "`decs_acc`{at.line}`{at.column}"
     let tupName = "`decs_tup`{at.line}`{at.column}"
-    // Per-element body: acc += <value> for sum, acc++ for count/long_count.
+    let cntName = "`decs_cnt`{at.line}`{at.column}"
+    let firstName = "`decs_first`{at.line}`{at.column}"
+    let valBindName = "`decs_val`{at.line}`{at.column}"
+    let finalBind = chainInfo.finalBind
     var perElement : Expression?
     if (opName == "sum") {
         perElement = qmacro_expr() {
-            $i(accName) += $e(projection)
+            $i(accName) += $i(finalBind)
         }
-    } elif (opName == "long_count") {
-        perElement = qmacro_expr() {
-            $i(accName) ++
+    } elif (opName == "average") {
+        perElement = qmacro_block() {
+            $i(accName) += $i(finalBind)
+            $i(cntName) ++
+        }
+    } elif (opName == "min" || opName == "max") {
+        let workhorse = (chainInfo.finalType != null && chainInfo.finalType.isWorkhorseType)
+        var compareExpr = min_max_compare(workhorse, opName, valBindName, accName)
+        perElement = qmacro_block() {
+            let $i(valBindName) = $i(finalBind)
+            if ($i(firstName)) {
+                $i(accName) := $i(valBindName)
+                $i(firstName) = false
+            } elif ($e(compareExpr)) {
+                $i(accName) := $i(valBindName)
+            }
         }
     } else {
+        // count, long_count
         perElement = qmacro_expr() {
             $i(accName) ++
         }
     }
-    // Wrap with where filter.
-    var body = wrap_with_condition(perElement, whereCond)
-    // Named-tuple bind: fold_linq_cond(lambda, tupName) rebinds `_.userName` → `tup.userName`, so chain ops see a real named tuple.
-    var mkTup = new ExprMakeTuple(at = at)
-    mkTup.recordNames |> resize(length(bridge.userNames))
-    for (i in 0 .. length(bridge.userNames)) {
-        mkTup.recordNames[i] := bridge.userNames[i]
-        mkTup.values |> emplace_new(new ExprVar(at = at, name := bridge.iterNames[i]))
-    }
-    var tupBind : Expression? = qmacro_expr() {
-        var $i(tupName) = $e(mkTup)
+    // _count(pred): count/long_count with extra predicate — wrap perElement with `if (pred) {...}`. Predicate peels against finalBind so it sees the post-chain element.
+    if ((opName == "count" || opName == "long_count") && length(terminatorCall.arguments) > 1) {
+        var predExpr = fold_linq_cond(clone_expression(terminatorCall.arguments[1]), finalBind)
+        if (predExpr == null) return null
+        perElement = qmacro_expr() {
+            if ($e(predExpr)) {
+                $e(perElement)
+            }
+        }
     }
-    var forBodyStmts <- [tupBind, body]
-    var forBody = stmts_to_expr(forBodyStmts)
-    // Cloned for retains the bridge's iter vars + get_ro sources (which reference archName); reuse archName below.
-    var clonedForExpr = clone_expression(bridge.forExpr)
-    var clonedFor = clonedForExpr as ExprFor
-    var newForBody = new ExprBlock(at = at)
-    newForBody.list |> push(forBody)
-    clonedFor.body = newForBody
+    var body = wrap_decs_chain(perElement, chainInfo, calls, intermediateEnd, at)
+    if (body == null) return null
+    var tupBind = build_decs_tup_bind(bridge, tupName, at)
+    var forExprNode = build_decs_inner_for(bridge, tupBind, body, at)
     let archName = bridge.archName
     var reqExpr = clone_expression(bridge.reqHashExpr)
     var erqExpr = clone_expression(bridge.erqExpr)
-    var forExprNode : Expression? = clonedForExpr
     var emission : Expression?
     if (opName == "long_count") {
         emission = qmacro(invoke($() : int64 {
@@ -3071,9 +3342,254 @@ def private emit_decs_accumulator(bridge : DecsBridgeShape?;
             })
             return $i(accName)
         }))
+    } elif (opName == "average") {
+        // Empty source → 0.0 / 0.0 → IEEE NaN (numerator + denominator both cast to double before division). Matches emit_accumulator_lane.
+        emission = qmacro(invoke($() : double {
+            var $i(accName) : $t(accType) = default<$t(accType)>
+            var $i(cntName) = 0
+            for_each_archetype($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) {
+                $e(forExprNode)
+            })
+            return double($i(accName)) / double($i(cntName))
+        }))
+    } elif (opName == "min" || opName == "max") {
+        // No empty-panic — matches non-decs emit_accumulator_lane (returns default-initialized acc).
+        emission = qmacro(invoke($() : $t(accType) {
+            var $i(firstName) = true
+            var $i(accName) : $t(accType)
+            for_each_archetype($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) {
+                $e(forExprNode)
+            })
+            return $i(accName)
+        }))
+    } else {
+        return null
+    }
+    emission.force_at(at)
+    emission.force_generated(true)
+    return emission
+}
+
+[macro_function]
+def private emit_decs_early_exit(bridge : DecsBridgeShape?;
+                                 opName : string;
+                                 chainInfo : DecsChainInfo?;
+                                 var calls : array<tuple<ExprCall?; LinqCall?>>;
+                                 intermediateEnd : int;
+                                 terminatorCall : ExprCall?;
+                                 at : LineInfo) : Expression? {
+    // Slice 3b/4 early-exit: first / first_or_default / any / all / contains. Emits an `invoke` of a block that walks archetypes via for_each_archetype_find (per-archetype block returns true to stop the walk, false to continue). Supports chained _select + interleaved _where via wrap_decs_chain. Returns null for an unsupported opName or when a predicate / chain op fails to fold.
+    let tupName = "`decs_tup`{at.line}`{at.column}"
+    let foundName = "`decs_found`{at.line}`{at.column}"
+    let resultName = "`decs_result`{at.line}`{at.column}"
+    let containsValName = "`decs_cv`{at.line}`{at.column}"
+    let defaultName = "`decs_dv`{at.line}`{at.column}"
+    let finalBind = chainInfo.finalBind
+    var elemType = clone_type(chainInfo.finalType)
+    var perElement : Expression?
+    var preludeStmts : array<Expression?>
+    var tailStmts : array<Expression?>
+    if (opName == "first" || opName == "first_or_default") {
+        preludeStmts |> push <| qmacro_expr() {
+            var $i(foundName) = false
+        }
+        preludeStmts |> push <| qmacro_expr() {
+            var $i(resultName) : $t(elemType)
+        }
+        if (opName == "first_or_default") {
+            var defaultExpr = clone_expression(terminatorCall.arguments[1]) // NOTE(review): arguments[1] is read without a length check — presumably guaranteed by the first_or_default signature; confirm
+            preludeStmts |> push <| qmacro_expr() {
+                let $i(defaultName) = $e(defaultExpr)
+            }
+        }
+        perElement = qmacro_block() {
+            $i(resultName) := $i(finalBind)
+            $i(foundName) = true
+            return true
+        }
+        if (opName == "first") {
+            tailStmts |> push <| qmacro_expr() {
+                if (!$i(foundName)) panic("sequence contains no elements")
+            }
+            tailStmts |> push <| qmacro_expr() {
+                return $i(resultName)
+            }
+        } else {
+            tailStmts |> push <| qmacro_expr() {
+                if (!$i(foundName)) return $i(defaultName)
+            }
+            tailStmts |> push <| qmacro_expr() {
+                return $i(resultName)
+            }
+        }
+    } elif (opName == "any") {
+        let argCount = length(terminatorCall.arguments)
+        if (argCount > 1) {
+            var predExpr = fold_linq_cond(clone_expression(terminatorCall.arguments[1]), finalBind)
+            if (predExpr == null) return null
+            perElement = qmacro_expr() {
+                if ($e(predExpr)) return true
+            }
+        } else {
+            perElement = qmacro_expr() {
+                return true
+            }
+        }
+    } elif (opName == "all") {
+        var predExpr = fold_linq_cond(clone_expression(terminatorCall.arguments[1]), finalBind)
+        if (predExpr == null) return null
+        perElement = qmacro_expr() {
+            if (!$e(predExpr)) return true
+        }
+    } elif (opName == "contains") {
+        var valExpr = clone_expression(terminatorCall.arguments[1])
+        preludeStmts |> push <| qmacro_expr() {
+            let $i(containsValName) = $e(valExpr)
+        }
+        perElement = qmacro_expr() {
+            if ($i(finalBind) == $i(containsValName)) return true
+        }
     } else {
         return null
     }
+    var body = wrap_decs_chain(perElement, chainInfo, calls, intermediateEnd, at)
+    if (body == null) return null
+    var tupBind = build_decs_tup_bind(bridge, tupName, at)
+    var forExprNode = build_decs_inner_for(bridge, tupBind, body, at)
+    let archName = bridge.archName
+    var reqExpr = clone_expression(bridge.reqHashExpr)
+    var erqExpr = clone_expression(bridge.erqExpr)
+    // Prelude, archetype walk and tail all go into ONE bodyStmts list spliced by a single $b — multiple $b splices in the same qmacro fragment isolate variable scope.
+    var bodyStmts : array<Expression?>
+    bodyStmts |> reserve(length(preludeStmts) + length(tailStmts) + 1)
+    for (s in preludeStmts) {
+        bodyStmts |> push(s)
+    }
+    if (opName == "any" || opName == "contains") {
+        bodyStmts |> push <| qmacro_expr() {
+            return for_each_archetype_find($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) : bool {
+                $e(forExprNode)
+                return false
+            })
+        }
+    } elif (opName == "all") {
+        // `all` means "no counterexample found" — the inner block returns true when the predicate FAILS, so the find result is negated.
+        bodyStmts |> push <| qmacro_expr() {
+            return !for_each_archetype_find($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) : bool {
+                $e(forExprNode)
+                return false
+            })
+        }
+    } else {
+        bodyStmts |> push <| qmacro_expr() {
+            for_each_archetype_find($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) : bool {
+                $e(forExprNode)
+                return false
+            })
+        }
+        for (s in tailStmts) {
+            bodyStmts |> push(s)
+        }
+    }
+    var emission : Expression?
+    if (opName == "first" || opName == "first_or_default") {
+        emission = qmacro(invoke($() : $t(elemType) {
+            $b(bodyStmts)
+        }))
+    } else {
+        emission = qmacro(invoke($() : bool {
+            $b(bodyStmts)
+        }))
+    }
+    emission.force_at(at)
+    emission.force_generated(true)
+    return emission
+}
+
+[macro_function]
+def private emit_decs_to_array(bridge : DecsBridgeShape?;
+                               chainInfo : DecsChainInfo?;
+                               var calls : array<tuple<ExprCall?; LinqCall?>>;
+                               intermediateEnd : int;
+                               at : LineInfo) : Expression? {
+    // Slice 3c/4 to_array: emits an `invoke` of a block that declares `var buf : array<finalType>` once, above the outer for_each_archetype walk, push_clones the post-chain value (chainInfo.finalBind) for every element that survives the chain, and returns buf by move. Returns null if wrap_decs_chain fails.
+    let tupName = "`decs_tup`{at.line}`{at.column}"
+    let bufName = "`decs_buf`{at.line}`{at.column}"
+    let finalBind = chainInfo.finalBind
+    var elemType = clone_type(chainInfo.finalType)
+    var perElement : Expression? = qmacro_expr() {
+        $i(bufName) |> push_clone($i(finalBind))
+    }
+    var body = wrap_decs_chain(perElement, chainInfo, calls, intermediateEnd, at)
+    if (body == null) return null
+    var tupBind = build_decs_tup_bind(bridge, tupName, at)
+    var forExprNode = build_decs_inner_for(bridge, tupBind, body, at)
+    let archName = bridge.archName
+    var reqExpr = clone_expression(bridge.reqHashExpr)
+    var erqExpr = clone_expression(bridge.erqExpr)
+    var emission : Expression? = qmacro(invoke($() : array<$t(elemType)> {
+        var $i(bufName) : array<$t(elemType)>
+        for_each_archetype($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) {
+            $e(forExprNode)
+        })
+        return <- $i(bufName)
+    }))
+    emission.force_at(at)
+    emission.force_generated(true)
+    return emission
+}
+
+[macro_function]
+def private emit_decs_min_max_by(bridge : DecsBridgeShape?;
+                                 opName : string;
+                                 chainInfo : DecsChainInfo?;
+                                 var calls : array<tuple<ExprCall?; LinqCall?>>;
+                                 intermediateEnd : int;
+                                 terminatorCall : ExprCall?;
+                                 at : LineInfo) : Expression? {
+    // Slice 4: min_by / max_by — tracks the best (key, element) pair across archetypes: the first surviving element seeds the pair, later ones replace it when the min_max_compare expression holds. Key and element are retained via `:=`. With no surviving elements bestElem is returned as declared, never assigned (matches min_by_impl's empty→default semantics). Returns null if the key lambda fails to fold or has no type, or if wrap_decs_chain fails.
+    let tupName = "`decs_tup`{at.line}`{at.column}"
+    let firstName = "`decs_first`{at.line}`{at.column}"
+    let bestKeyName = "`decs_bkey`{at.line}`{at.column}"
+    let bestElemName = "`decs_belem`{at.line}`{at.column}"
+    let keyBindName = "`decs_key`{at.line}`{at.column}"
+    let finalBind = chainInfo.finalBind
+    var elemType = clone_type(chainInfo.finalType)
+    var keyExpr = fold_linq_cond(clone_expression(terminatorCall.arguments[1]), finalBind)
+    if (keyExpr == null || keyExpr._type == null) return null
+    var keyType = clone_type(keyExpr._type)
+    keyType.flags.constant = false // bestKey is declared as a `var` local of this type below, so strip const/ref
+    keyType.flags.ref = false
+    let workhorse = keyType.isWorkhorseType
+    let opCmp = (opName == "min_by") ? "min" : "max" // min_max_compare is called with the plain min/max op name
+    var compareExpr = min_max_compare(workhorse, opCmp, keyBindName, bestKeyName)
+    var perElement = qmacro_block() {
+        let $i(keyBindName) = $e(keyExpr)
+        if ($i(firstName)) {
+            $i(bestKeyName) := $i(keyBindName)
+            $i(bestElemName) := $i(finalBind)
+            $i(firstName) = false
+        } elif ($e(compareExpr)) {
+            $i(bestKeyName) := $i(keyBindName)
+            $i(bestElemName) := $i(finalBind)
+        }
+    }
+    var body = wrap_decs_chain(perElement, chainInfo, calls, intermediateEnd, at)
+    if (body == null) return null
+    var tupBind = build_decs_tup_bind(bridge, tupName, at)
+    var forExprNode = build_decs_inner_for(bridge, tupBind, body, at)
+    let archName = bridge.archName
+    var reqExpr = clone_expression(bridge.reqHashExpr)
+    var erqExpr = clone_expression(bridge.erqExpr)
+    var emission = qmacro(invoke($() : $t(elemType) {
+        var $i(firstName) = true
+        var $i(bestKeyName) : $t(keyType)
+        var $i(bestElemName) : $t(elemType)
+        for_each_archetype($e(reqExpr), $e(erqExpr), $($i(archName) : Archetype) {
+            $e(forExprNode)
+        })
+        return $i(bestElemName)
+    }))
     emission.force_at(at)
     emission.force_generated(true)
     return emission
@@ -3083,70 +3599,81 @@ def private emit_decs_accumulator(bridge : DecsBridgeShape?;
 def private plan_decs_unroll(var expr : Expression?) : Expression? {
     var (top, calls) = flatten_linq(expr)
     let bridge = extract_decs_bridge(top)
-    if (bridge == null || empty(calls)) return null
-    let lastName = calls.back()._1.name
+    if (bridge == null) return null
     let at = expr.at
     // Slice 1: bare count → arch.size shortcut.
-    if (lastName == "count" && length(calls) == 1) return emit_decs_count_archsize(bridge, at)
-    // Slice 2: chain-aware count/long_count/sum with _where + single _select.
-    if (lastName != "count" && lastName != "long_count" && lastName != "sum") return null
+    if (length(calls) == 1 && calls.back()._1.name == "count" && length(calls.back()._0.arguments) == 1) return emit_decs_count_archsize(bridge, at)
+    // to_array is `skip = true` in linqCalls so it's already peeled — "no recognized terminator" means implicit to_array.
+    let lastName = empty(calls) ? "" : calls.back()._1.name
+    let isAccum = (lastName == "count" || lastName == "long_count" || lastName == "sum"
+        || lastName == "min" || lastName == "max" || lastName == "average")
+    let isEarlyExit = (lastName == "first" || lastName == "first_or_default"
+        || lastName == "any" || lastName == "all" || lastName == "contains")
+    let isMinMaxBy = (lastName == "min_by" || lastName == "max_by")
+    let isTerminator = isAccum || isEarlyExit || isMinMaxBy
     let tupName = "`decs_tup`{at.line}`{at.column}"
-    let intermediateEnd = length(calls) - 1
-    var whereCond : Expression?
-    var projection : Expression?
-    var seenSelect = false
-    for (i in 0 .. intermediateEnd) {
-        var cll & = unsafe(calls[i])
-        let opName = cll._1.name
-        if (opName == "where_") {
-            // After-select where: defer to follow-up; canonical order is where-then-select.
-            if (seenSelect) return null
-            var pred = fold_linq_cond(cll._0.arguments[1], tupName)
-            if (pred == null) return null
-            if (whereCond == null) {
-                whereCond = pred
-            } else {
-                whereCond = qmacro($e(whereCond) && $e(pred))
-            }
-        } elif (opName == "select") {
-            // Chained selects: defer to follow-up.
-            if (seenSelect) return null
-            projection = fold_linq_cond(cll._0.arguments[1], tupName)
-            if (projection == null) return null
-            seenSelect = true
-        } else {
-            return null
+    let intermediateEnd = isTerminator ? length(calls) - 1 : length(calls)
+    let chainInfo = compute_decs_chain_info(calls, intermediateEnd, tupName, bridge, at)
+    if (chainInfo == null) return null
+    if (isAccum) {
+        // sum/min/max/average need a scalar element — selectCount==0 keeps finalType = bridge.elementType (a tuple) which can't be summed/compared.
+        var accType : TypeDeclPtr
+        if (lastName == "sum" || lastName == "min" || lastName == "max" || lastName == "average") {
+            if (chainInfo.selectCount == 0 || chainInfo.finalType == null) return null
+            accType = clone_type(chainInfo.finalType)
         }
+        return emit_decs_accumulator(bridge, lastName, chainInfo, calls, intermediateEnd, calls.back()._0, accType, at)
     }
-    // sum requires a scalar projection (tuple sources can't sum directly).
-    var accType : TypeDeclPtr
-    if (lastName == "sum") {
-        if (projection == null || projection._type == null) return null
-        accType = clone_type(projection._type)
-        accType.flags.constant = false
-        accType.flags.ref = false
-    }
-    return emit_decs_accumulator(bridge, lastName, projection, whereCond, accType, at)
+    if (isEarlyExit) return emit_decs_early_exit(bridge, lastName, chainInfo, calls, intermediateEnd, calls.back()._0, at)
+    if (isMinMaxBy) return emit_decs_min_max_by(bridge, lastName, chainInfo, calls, intermediateEnd, calls.back()._0, at)
+    // Iterator-typed chains cascade to tier-2; only fire to_array when expr is array-typed (user wrote `.to_array()`, peeled by skip=true).
+    if (expr._type == null || !expr._type.isGoodArrayType) return null
+    return emit_decs_to_array(bridge, chainInfo, calls, intermediateEnd, at)
 }
 
 [macro_function]
 def private plan_zip(var expr : Expression?) : Expression? {
-    // Phase 2 Z1/Z2/Z3: 2-ary lockstep zip splice. Supports bare zip (array/iterator), no-pred count/long_count, and fused where_/select/take/skip/take_while/skip_while chain ops between zip and terminator. Result-selector form (3-arg zip), accumulator terminators (sum/min/max/etc.), and chained selects bail to tier-2 cascade.
+    // Phase 2 Z1/Z2/Z3 + accumulator/early-exit: 2-ary lockstep zip splice. Supports bare zip (array/iterator), no-pred count/long_count + accumulator (sum/min/max/average) + early-exit (first/first_or_default/any/all/contains) terminators, and fused where_/select/take/skip/take_while/skip_while chain ops between zip and terminator. Result-selector form (3-arg zip) and chained selects bail to tier-2 cascade.
     var (top, calls) = flatten_linq(expr)
     if (empty(calls) || calls[0]._1.name != "zip") return null
     var zipCall = calls[0]._0
     let zipArgCount = zipCall.arguments |> length
     // Z6 bail: result-selector form (3-arg zip = 2 sources + selector) yields scalar element stream — different splice shape, defer.
     if (zipArgCount != 2) return null
-    // Identify recognized terminator (count/long_count). Anything else: if it's a recognized chain op, treat as no-terminator (bare); else bail.
+    // Identify recognized terminator. Counter: count/long_count. Accumulator: sum/min/max/average. Early-exit: first/first_or_default/any/all/contains. Anything else: treat as no-terminator (bare → ARRAY lane); unrecognized chain op bails inside the chain walk.
     var lastName = ""
     var intermediateEnd = length(calls)
     if (length(calls) > 1) {
         let candidateName = calls.back()._1.name
         let candidateCall = calls.back()._0
+        let candidateArgs = candidateCall.arguments |> length
         if (candidateName == "count" || candidateName == "long_count") {
             // No-pred form only (1 arg = the source). Predicate-form bails (would change per-element work).
-            if (candidateCall.arguments |> length != 1) return null
+            if (candidateArgs != 1) return null
+            lastName = candidateName
+            intermediateEnd = length(calls) - 1
+        } elif (candidateName == "sum" || candidateName == "min" || candidateName == "max" || candidateName == "average") {
+            // Accumulator: no-arg form only (1 arg = the source).
+            if (candidateArgs != 1) return null
+            lastName = candidateName
+            intermediateEnd = length(calls) - 1
+        } elif (candidateName == "first") {
+            if (candidateArgs != 1) return null
+            lastName = candidateName
+            intermediateEnd = length(calls) - 1
+        } elif (candidateName == "first_or_default" || candidateName == "contains") {
+            // first_or_default(d): 1 default arg. contains(v): 1 value arg.
+            if (candidateArgs != 2) return null
+            lastName = candidateName
+            intermediateEnd = length(calls) - 1
+        } elif (candidateName == "any") {
+            // any(): no pred. any(p): 1 predicate arg. Both forms recognized.
+            if (candidateArgs != 1 && candidateArgs != 2) return null
+            lastName = candidateName
+            intermediateEnd = length(calls) - 1
+        } elif (candidateName == "all") {
+            // all(p): must have predicate.
+            if (candidateArgs != 2) return null
             lastName = candidateName
             intermediateEnd = length(calls) - 1
         }
@@ -3259,12 +3786,42 @@ def private plan_zip(var expr : Expression?) : Expression? {
     let isCounter = lastName == "count" || lastName == "long_count"
     // Semantic guard: impure `select` projection AHEAD of a range op (skip/take/skip_while/take_while). The eager `select(proj)|>skip(K)` pipeline runs proj on every element; our spliced order (skip-then-push) would drop the side effects on skipped items. Bail to tier-2 which preserves eager semantics. Plan_loop_or_count has the analogous gap (see PR #2741 review thread #r3269663361); fixing both planners uniformly is a separate follow-up.
     if (seenSelect && !allProjectionsPure && !noLimits) return null
+    // Accumulator + early-exit lane dispatch: reuses the generalized emit_*_lane helpers (parallel-array form). preCondStmts threads `let it = (itA, itB)` so itName resolves inside the loop body when the where/projection/predicate/value references the tuple element. `long_count` is classified ACCUMULATOR but routes through the COUNTER path locally (existing length-shortcut + counter loop already cover it); `!isCounter` excludes it from this branch.
+    let lane = classify_terminator(lastName)
+    if (lane == LinqLane.ACCUMULATOR && !isCounter) {
+        // sum/min/max/average without projection: elementType is tuple<...>, accumulator op would not typecheck — bail to tier-2 (typer rejects anyway, but explicit bail keeps the error inside linq's normal cascade rather than the splice).
+        if (projection == null) return null
+        var preCondStmts : array<Expression?>
+        preCondStmts |> push <| qmacro_expr() {     // nolint:STYLE012
+            let $i(itName) = (itA, itB)
+        }
+        var intermediateBinds : array<Expression?>
+        var laneTops <- [srcAExpr, srcBExpr]
+        let laneSrcs <- [srcAName, srcBName]
+        return emit_accumulator_lane(lastName, laneTops, projection, whereCond,
+            intermediateBinds, preCondStmts, elementType, laneSrcs, accName, itName, skipName, takeCountName,
+            skippingName, skipExpr, takeExpr, skipWhileCond, takeWhileCond, at)
+    }
+    if (lane == LinqLane.EARLY_EXIT) {
+        var preCondStmts : array<Expression?>
+        preCondStmts |> push <| qmacro_expr() {     // nolint:STYLE012
+            let $i(itName) = (itA, itB)
+        }
+        var intermediateBinds : array<Expression?>
+        let terminatorCall = calls.back()._0
+        var laneTops <- [srcAExpr, srcBExpr]
+        let laneSrcs <- [srcAName, srcBName]
+        return emit_early_exit_lane(lastName, laneTops, projection, whereCond,
+            intermediateBinds, preCondStmts, elementType, terminatorCall, laneSrcs, itName, skipName,
+            takeCountName, skippingName, skipExpr, takeExpr, skipWhileCond, takeWhileCond, at)
+    }
     // Length shortcut: count/long_count, no chain, both length-bearing → return min(lenA, lenB) without entering the loop.
     if (noChain && bothHaveLength && isCounter) {
         var bodyStmts : array<Expression?>
         if (lastName == "count") {
+            // length returns int already; bare `length(...)` matches return type — no cast (PERF020).
             bodyStmts |> push <| qmacro_expr() {
-                return int(length($i(srcAName)) < length($i(srcBName)) ? length($i(srcAName)) : length($i(srcBName)))
+                return length($i(srcAName)) < length($i(srcBName)) ? length($i(srcAName)) : length($i(srcBName))
             }
         } else {
             bodyStmts |> push <| qmacro_expr() {
diff --git a/daslib/lint.das b/daslib/lint.das
index 34c5af12db..17e5fd8458 100644
--- a/daslib/lint.das
+++ b/daslib/lint.das
@@ -57,11 +57,20 @@ class LintVisitor : AstVisitor {
     // collection mode — when true, errors are appended to `errors` instead of error()
     collect_errors : bool = false
     errors : array<string>
+    // Filters: disabled (denylist) and enabled (whitelist; empty == all).
+    // Caller-populated via collect overload; applied alongside // nolint suppression.
+    disabled_codes : table<string>
+    enabled_codes : table<string>
     def LintVisitor() {
         pass
     }
     def is_suppressed(text : string; at : LineInfo) : bool {
-        return is_lint_suppressed(at, extract_lint_code(text))
+        let code = extract_lint_code(text)
+        if (!empty(code)) {
+            return true if (key_exists(disabled_codes, code)
+                            || (!empty(enabled_codes) && !key_exists(enabled_codes, code)))
+        }
+        return is_lint_suppressed(at, code)
     }
     def lint_error(text : string; at : LineInfo) : void {
         if (noLint || self->is_suppressed(text, at)) return
@@ -273,7 +282,17 @@ def public paranoid(prog : ProgramPtr; compile_time_errors : bool) {
 def public paranoid_collect(prog : ProgramPtr; var errors : array<string>) : int {
     //! Runs the paranoid lint visitor and collects errors as strings.
     //! Returns the number of lint issues found.
+    let empty_set : table<string>
+    return paranoid_collect(prog, errors, empty_set, empty_set)
+}
+
+def public paranoid_collect(prog : ProgramPtr; var errors : array<string>;
+                            disabled_codes, enabled_codes : table<string>) : int {
+    //! Filter-aware overload. `disabled_codes` is a denylist; `enabled_codes`
+    //! is a whitelist (empty == all). Codes use the bare form (e.g. "LINT002").
     var astVisitor = new LintVisitor(compile_time_errors = false, collect_errors = true)
+    astVisitor.disabled_codes := disabled_codes
+    astVisitor.enabled_codes := enabled_codes
     make_visitor(*astVisitor) $(adapter) {
         astVisitor.astVisitorAdapter = adapter
         visit(prog, astVisitor.astVisitorAdapter)
diff --git a/daslib/perf_lint.das b/daslib/perf_lint.das
index f4207822e7..d715259bd6 100644
--- a/daslib/perf_lint.das
+++ b/daslib/perf_lint.das
@@ -33,6 +33,7 @@ module perf_lint shared private
 //!   PERF018 — for (i in range(length(arr))) where i only indexes arr — use 'for (c in arr)'
 //!   PERF019 — int(T.a) | int(T.b) on bitfield-or-enum-with-operator-| — collapse to int(T.a | T.b)
 //!   PERF020 — T(x) where x is already T (workhorse type) — drop the redundant cast
+//!   PERF021 — cond ? T(a) : T(b) on workhorse cast T — hoist to T(cond ? a : b)
 
 require daslib/ast_boost
 require strings
@@ -125,8 +126,18 @@ class PerfLintVisitor : AstVisitor {
         return uint64(at.line) | (uint64(at.column) << uint64(20))
     }
 
+    // Filters: disabled (denylist) and enabled (whitelist; empty == all).
+    // Caller-populated via collect overload; applied alongside // nolint suppression.
+    disabled_codes : table<string>
+    enabled_codes : table<string>
+
     def is_suppressed(text : string; at : LineInfo) : bool {
-        return is_lint_suppressed(at, extract_lint_code(text))
+        let code = extract_lint_code(text)
+        if (!empty(code)) {
+            return true if (key_exists(disabled_codes, code)
+                            || (!empty(enabled_codes) && !key_exists(enabled_codes, code)))
+        }
+        return is_lint_suppressed(at, code)
     }
 
     def perf_warning(text : string; at : LineInfo) : void {
@@ -381,7 +392,7 @@ class PerfLintVisitor : AstVisitor {
         if (inner == null || !(inner is ExprCall)) return null
         let call = inner as ExprCall
         if (call.func == null || empty(call.arguments)) return null
-        let fname = call.func.fromGeneric != null ? string(call.func.fromGeneric.name) : string(call.func.name)
+        let fname = string(call.func.fromGeneric != null ? call.func.fromGeneric.name : call.func.name)
         if (fname != "int") return null
         return call.arguments[0]
     }
@@ -948,7 +959,7 @@ class PerfLintVisitor : AstVisitor {
 
     def check_perf020_redundant_cast(call : ExprCall?) : void {
         if (call == null || call.func == null || length(call.arguments) != 1) return
-        let fname = call.func.fromGeneric != null ? string(call.func.fromGeneric.name) : string(call.func.name)
+        let fname = string(call.func.fromGeneric != null ? call.func.fromGeneric.name : call.func.name)
         let target = self->perf020_target_basetype(fname)
         if (target == Type.none) return
         var arg = call.arguments[0]
@@ -959,6 +970,100 @@ class PerfLintVisitor : AstVisitor {
         self->perf_warning("PERF020: redundant {fname}(...) cast — argument is already {fname}", call.at)
     }
 
+    // --- PERF021: cond ? T(a) : T(b) — hoist common workhorse cast out of ternary ---
+
+    def cast_call_workhorse_target(e : Expression?; var fname_out : string&) : Type {
+        //! Classifies `e` (one ExprRef2Value wrapper peeled) as a workhorse cast call.
+        //! On a match the cast name is written to `fname_out` and the target Type is
+        //! returned; otherwise Type.none is returned and `fname_out` is left as-is.
+        //! Any non-zero argument count is accepted (e.g. `string(int)` carries 4 args).
+        var node = e
+        if (node != null && node is ExprRef2Value) {
+            node = (node as ExprRef2Value.subexpr)
+        }
+        if (node == null || !(node is ExprCall)) return Type.none
+        let call = node as ExprCall
+        if (call.func == null || empty(call.arguments)) return Type.none
+        let castName = string(call.func.fromGeneric != null ? call.func.fromGeneric.name : call.func.name)
+        let target = self->perf020_target_basetype(castName)
+        // Only publish the name when the lookup recognized it as a workhorse cast.
+        if (target != Type.none) fname_out = castName
+        return target
+    }
+
+    def cast_call_arg_basetype(e : Expression?) : Type {
+        //! Returns the baseType of the first argument of call `e`, peeling one ExprRef2Value around
+        //! the call and one around the argument. Type.none if not a call, no args, or arg/type is null.
+        var ee = e
+        if (ee != null && ee is ExprRef2Value) {
+            ee = (ee as ExprRef2Value.subexpr)
+        }
+        if (ee == null || !(ee is ExprCall)) return Type.none
+        let c = ee as ExprCall
+        if (empty(c.arguments)) return Type.none
+        var arg = c.arguments[0]
+        if (arg != null && arg is ExprRef2Value) {     // null-guard before `is`, same as the `ee` peel above
+            arg = (arg as ExprRef2Value.subexpr)
+        }
+        if (arg == null || arg._type == null) return Type.none
+        return arg._type.baseType
+    }
+
+    def cast_call_tail_args_equal(le, re : Expression?) : bool {
+        //! True when the two cast calls agree on every argument after the first.
+        //! Each side is unwrapped from one ExprRef2Value; a non-ExprCall on
+        //! either side yields false. ExprFakeContext / ExprFakeLineInfo arguments
+        //! are stepped over on both sides; the remaining arguments are paired in
+        //! order and compared with expr_equal_struct. A surplus argument on one
+        //! side also yields false, so a hoist cannot drop e.g. a `hex` flag.
+        var lhs = le
+        if (lhs != null && lhs is ExprRef2Value) {
+            lhs = (lhs as ExprRef2Value.subexpr)
+        }
+        var rhs = re
+        if (rhs != null && rhs is ExprRef2Value) {
+            rhs = (rhs as ExprRef2Value.subexpr)
+        }
+        if (lhs == null || rhs == null || !(lhs is ExprCall) || !(rhs is ExprCall)) return false
+        let lcall = lhs as ExprCall
+        let rcall = rhs as ExprCall
+        let lcount = length(lcall.arguments)
+        let rcount = length(rcall.arguments)
+        var lpos = 1
+        var rpos = 1
+        while (true) {
+            while (lpos < lcount && (lcall.arguments[lpos] is ExprFakeContext || lcall.arguments[lpos] is ExprFakeLineInfo)) {
+                lpos++
+            }
+            while (rpos < rcount && (rcall.arguments[rpos] is ExprFakeContext || rcall.arguments[rpos] is ExprFakeLineInfo)) {
+                rpos++
+            }
+            if (lpos >= lcount || rpos >= rcount) return (lpos >= lcount && rpos >= rcount)
+            if (!self->expr_equal_struct(lcall.arguments[lpos], rcall.arguments[rpos], false)) return false
+            lpos++
+            rpos++
+        }
+        return true
+    }
+
+    def check_perf021_ternary_cast_hoist(expr : ExprOp3?) : void {
+        //! PERF021: warn on `cond ? T(a) : T(b)` where both branches apply the same workhorse cast.
+        if (expr == null || expr.op != "?") return
+        if (expr.left == null || expr.right == null) return
+        var lname = ""
+        var rname = ""
+        let ltarget = self->cast_call_workhorse_target(expr.left, lname)
+        let rtarget = self->cast_call_workhorse_target(expr.right, rname)
+        let larg = self->cast_call_arg_basetype(expr.left)
+        let rarg = self->cast_call_arg_basetype(expr.right)
+        // Same cast on both branches, applied to arguments of the same known base type.
+        if (ltarget == Type.none || rtarget == Type.none || ltarget != rtarget || lname != rname) return
+        if (larg == Type.none || larg != rarg) return
+        // Tail args (ExprFakeContext / ExprFakeLineInfo aside) must agree, or the hoist would drop one side's.
+        if (!self->cast_call_tail_args_equal(expr.left, expr.right)) return
+        self->perf_warning("PERF021: redundant per-branch {lname}(...) cast in ternary — hoist as {lname}(cond ? a : b)", expr.at)
+    }
+
     // --- PERF014: closed-interval char-class range checks ---
 
     def parse_range_leg(leg : Expression?; var v_out : Expression?&; var bound_out : int&; var is_hi_out : bool&) : bool {
@@ -1119,6 +1224,9 @@ class PerfLintVisitor : AstVisitor {
     // --- PERF015 / PERF016: ternary min/max/abs ---
 
     def override preVisitExprOp3(expr : ExprOp3?) : void {
+        // PERF021: fires anywhere, including in closures — a redundant per-branch cast
+        // is redundant regardless of where the ternary lives.
+        self->check_perf021_ternary_cast_hoist(expr)
         if (in_closure > 0 || expr.op != "?"
             || expr.subexpr == null || !(expr.subexpr is ExprOp2)) return
         var cmp = expr.subexpr as ExprOp2
@@ -1402,7 +1510,17 @@ def public perf_lint(prog : ProgramPtr; compile_time_errors : bool) : int {
 def public perf_lint_collect(prog : ProgramPtr; var warnings : array<string>) : int {
     //! Runs the performance lint visitor and collects warnings as strings.
     //! Returns the number of warnings found.
+    let empty_set : table<string>
+    return perf_lint_collect(prog, warnings, empty_set, empty_set)
+}
+
+def public perf_lint_collect(prog : ProgramPtr; var warnings : array<string>;
+                             disabled_codes, enabled_codes : table<string>) : int {
+    //! Filter-aware overload. `disabled_codes` is a denylist; `enabled_codes`
+    //! is a whitelist (empty == all). Codes use the bare form (e.g. "PERF001").
     var astVisitor = new PerfLintVisitor(compile_time_errors = false, collect_warnings = true)
+    astVisitor.disabled_codes := disabled_codes
+    astVisitor.enabled_codes := enabled_codes
     make_visitor(*astVisitor) $(astVisitorAdapter) {
         visit(prog, astVisitorAdapter)
     }
diff --git a/daslib/style_lint.das b/daslib/style_lint.das
index dbf8df4fd7..c53a446859 100644
--- a/daslib/style_lint.das
+++ b/daslib/style_lint.das
@@ -36,12 +36,24 @@ module style_lint shared private
 //!   STYLE026 — nested 'unsafe { ... }' block; outer wrap already covers the scope — drop the inner
 
 require daslib/ast_boost
+require daslib/is_local
 require strings
 
 // ---------------------------------------------------------------------------
 // Visitor
 // ---------------------------------------------------------------------------
 
+struct UnsafeFrame {
+    //! Per-expression subtree summary: one frame per visited node lives on
+    //! `StyleLintVisitor.unsafe_stack`; non-empty frames are kept in `unsafeExprs`.
+    //! `count` is the number of inherently-unsafe leaves in the subtree.
+    //! `has_non_local_let_ref` is set when the subtree contains a `let v & = E`
+    //! whose `E` is non-local-non-temporary (per `isLocalOrGlobal` in ast_infer_type.cpp:4989);
+    //! narrowing the enclosing `unsafe { ... }` would break that binding, so STYLE025 stays silent.
+    count : int
+    has_non_local_let_ref : bool
+}
+
 class StyleLintVisitor : AstVisitor {
     compile_time_errors : bool
     comment_hygiene : bool = false
@@ -58,8 +70,8 @@ class StyleLintVisitor : AstVisitor {
     @do_not_delete pending_uninit_vars : array<Variable?>
     // STYLE024/025 — tight unsafe checks.
     @do_not_delete skip_userSaidItsSafe : array<Expression?>
-    @do_not_delete unsafeExprs : table<Expression?; int>
-    unsafe_stack : array<int>
+    @do_not_delete unsafeExprs : table<Expression?; UnsafeFrame>
+    unsafe_stack : array<UnsafeFrame>
     unsafe_block_stack : array<int>
 
     def StyleLintVisitor() {
@@ -70,8 +82,18 @@ class StyleLintVisitor : AstVisitor {
         return uint64(at.line) | (uint64(at.column) << uint64(20))
     }
 
+    // Filters: disabled (denylist) and enabled (whitelist; empty == all).
+    // Caller-populated via collect overload; applied alongside // nolint suppression.
+    disabled_codes : table<string>
+    enabled_codes : table<string>
+
     def is_suppressed(text : string; at : LineInfo) : bool {
-        return is_lint_suppressed(at, extract_lint_code(text))
+        let code = extract_lint_code(text)
+        if (!empty(code)) {
+            return true if (key_exists(disabled_codes, code)
+                            || (!empty(enabled_codes) && !key_exists(enabled_codes, code)))
+        }
+        return is_lint_suppressed(at, code)
     }
 
     def style_warning(text : string; at : LineInfo) : void {
@@ -1238,40 +1260,52 @@ class StyleLintVisitor : AstVisitor {
 
     // --- STYLE024/STYLE025: redundant or over-broad `unsafe` ---
 
-    def unsafe_count_for(expr : Expression?) : int {
-        return 0 if (expr == null)
-        return unsafeExprs |> key_exists(expr) ? unsafeExprs[expr] : 0
+    def unsafe_frame_for(expr : Expression?) : UnsafeFrame {
+        return UnsafeFrame() if (expr == null)
+        return unsafeExprs |> key_exists(expr) ? unsafeExprs[expr] : UnsafeFrame()
     }
 
     def mark_unsafe_in_stack() : void {
         //! Add 1 to the current top of `unsafe_stack`.
         let n = length(unsafe_stack)
         if (n > 0) {
-            unsafe_stack[n - 1] = unsafe_stack[n - 1] + 1
+            unsafe_stack[n - 1].count++
+        }
+    }
+
+    def mark_non_local_let_ref_in_stack() : void {
+        //! Sets `has_non_local_let_ref` on the innermost open frame of `unsafe_stack`;
+        //! does nothing when the stack is empty. `visitExpression` carries it to parents.
+        let top = length(unsafe_stack) - 1
+        if (top >= 0) {
+            unsafe_stack[top].has_non_local_let_ref = true
         }
     }
 
     def override preVisitExpression(expr : ExpressionPtr) : void {
-        //! Push a fresh 0 slot for this node.
-        unsafe_stack |> push(0)
+        //! Push a fresh frame for this node.
+        unsafe_stack |> push(UnsafeFrame())
     }
 
     def override visitExpression(var expr : ExpressionPtr) : ExpressionPtr {
-        //! Pop the slot for this node. The popped value is the subtree's
-        //! count of inherently-unsafe leaves. Use it to:
-        //!   - store `unsafeExprs[expr.at] = count` for later queries
+        //! Pop the slot for this node. The popped frame is the subtree's
+        //! summary. Use it to:
+        //!   - store `unsafeExprs[expr] = frame` for later queries
         //!     (used by `visitExprUnsafe` to decide STYLE024 vs STYLE025),
-        //!   - propagate count to the parent's slot (now top after our pop),
+        //!   - propagate count + non-local-let-ref flag to the parent slot,
         //!   - fire STYLE024 if `expr` is an `unsafe(...)` wrap target
         //!     (`userSaidItsSafe`) and its subtree had count 0.
         let n = length(unsafe_stack)
-        let count = unsafe_stack[n - 1]
+        let frame = unsafe_stack[n - 1]
         unsafe_stack |> pop
-        if (count > 0) {
-            unsafeExprs |> insert(expr, count)
+        if (frame.count > 0 || frame.has_non_local_let_ref) {
+            unsafeExprs |> insert(expr, frame)
             let m = length(unsafe_stack)
             if (m > 0) {
-                unsafe_stack[m - 1] = unsafe_stack[m - 1] + count
+                unsafe_stack[m - 1].count = unsafe_stack[m - 1].count + frame.count
+                if (frame.has_non_local_let_ref) {
+                    unsafe_stack[m - 1].has_non_local_let_ref = true
+                }
             }
         }
         // STYLE024 expression form: parser sets `userSaidItsSafe` on the
@@ -1280,12 +1314,33 @@ class StyleLintVisitor : AstVisitor {
         if (expr.genFlags.userSaidItsSafe && !expr.genFlags.generated
             && (current_function == null || current_function.fromGeneric == null)
             && !(skip_userSaidItsSafe |> has_value(expr))
-            && count == 0) {
+            && frame.count == 0) {
             self->style_warning("STYLE024: redundant 'unsafe(...)' wrap; the inner expression has no operation that requires unsafe — drop the wrap", expr.at)
         }
         return expr
     }
 
+    def override preVisitExprLet(expr : ExprLet?) : void {
+        //! Flags this let's frame when any variable is a reference binding
+        //! (`let v & = E`) whose initializer is neither marked alwaysSafe, nor
+        //! temporary-typed, nor accepted by `is_local_or_global_expr`. The flag
+        //! lets the enclosing `unsafe { ... }` check keep STYLE025 silent.
+        //! Generated lets are ignored. Mirrors `isLocalOrGlobal` (ast_infer_type.cpp:4989).
+        if (expr.genFlags.generated) return
+        for (v in expr.variables) {
+            // Not a reference binding with an initializer — nothing to flag.
+            continue if (v._type == null || !v._type.flags.ref || v.init == null)
+            // Initializers that are skipped rather than flagged: alwaysSafe,
+            // temporary-typed, or local/global per the helper.
+            continue if (v.init.genFlags.alwaysSafe
+                || (v.init._type != null && v.init._type.flags.temporary)
+                || is_local_or_global_expr(v.init))
+            // One flagged binding is enough — the frame flag is a plain bool.
+            self->mark_non_local_let_ref_in_stack()
+            return
+        }
+    }
+
     def call_func_needs_unsafe(func : Function const?; is_for_loop_src : bool) : bool {
         return (func == null
             || func.flags.unsafeOperation
@@ -1331,6 +1386,29 @@ class StyleLintVisitor : AstVisitor {
         }
     }
 
+    def override preVisitExprSafeAt(expr : ExprSafeAt?) : void {
+        // Counts `?[]` as an unsafe leaf when the indexed value is table<> / array<>
+        // or a pointer to (table|array|pointer) — see ast_infer_type.cpp errors
+        // unsafe_table_safe_index / unsafe_array_safe_index / unsafe_pointer_safe_index.
+        // A missing or untyped subexpr is counted too. Vector / fixed_array is OK.
+        if (expr.genFlags.generated) return
+        var needsUnsafe = false
+        if (expr.subexpr == null || expr.subexpr._type == null) {
+            needsUnsafe = true
+        } else {
+            let outer = expr.subexpr._type.baseType
+            if (outer == Type.tTable || outer == Type.tArray) {
+                needsUnsafe = true
+            } elif (outer == Type.tPointer && expr.subexpr._type.firstType != null) {
+                let pointee = expr.subexpr._type.firstType.baseType
+                needsUnsafe = (pointee == Type.tTable || pointee == Type.tArray || pointee == Type.tPointer)
+            }
+        }
+        if (needsUnsafe) {
+            self->mark_unsafe_in_stack()
+        }
+    }
+
     def override preVisitExprField(expr : ExprField?) : void {
         // variant.field requires unsafe (write context; over-mark on read
         // is safer than under-mark — biases toward keeping wraps).
@@ -1408,14 +1486,15 @@ class StyleLintVisitor : AstVisitor {
             || (current_function != null && current_function.fromGeneric != null)
             || !(expr.body is ExprBlock)) return expr
         let blk = expr.body as ExprBlock
-        let count = self->unsafe_count_for(blk)
-        if (count == 0) {
+        let frame = self->unsafe_frame_for(blk)
+        if (frame.count == 0) {
             self->style_warning("STYLE024: redundant 'unsafe \{ ... }' block; no statement requires unsafe — drop the wrap", expr.at)
-        } elif (count == 1 &&
+        } elif (frame.count == 1 &&
                 !(blk.list[0] is ExprYield ||
                   blk.list[0] is ExprDelete ||
                   blk.list[0] is ExprReturn ||
-                  blk.list[0] is ExprNew)) {
+                  blk.list[0] is ExprNew)
+                && !frame.has_non_local_let_ref) {
             self->style_warning("STYLE025: 'unsafe \{ ... }' block scope is too broad; only one operation requires unsafe — narrow to 'unsafe(<sub-expr>)' wrapping that operation", expr.at)
         }
         return expr
@@ -1450,7 +1529,18 @@ def public style_lint_collect(prog : ProgramPtr; var warnings : array<string>; c
     //! Runs the style lint visitor and collects warnings as strings.
     //! Returns the number of warnings found.
     //! Pass ``comment_hygiene = true`` to enable STYLE014/STYLE015 checks.
+    let empty_set : table<string>
+    return style_lint_collect(prog, warnings, empty_set, empty_set, comment_hygiene)
+}
+
+def public style_lint_collect(prog : ProgramPtr; var warnings : array<string>;
+                              disabled_codes, enabled_codes : table<string>;
+                              comment_hygiene : bool = false) : int {
+    //! Filter-aware overload. `disabled_codes` is a denylist; `enabled_codes`
+    //! is an allowlist (empty == all). Codes use the bare form (e.g. "STYLE024").
     var astVisitor = new StyleLintVisitor(compile_time_errors = false, collect_warnings = true, comment_hygiene = comment_hygiene)
+    astVisitor.disabled_codes := disabled_codes
+    astVisitor.enabled_codes := enabled_codes
     make_visitor(*astVisitor) $(astVisitorAdapter) {
         visit_with_generics(prog, astVisitorAdapter)
     }
diff --git a/doc/source/reference/language/lint.rst b/doc/source/reference/language/lint.rst
index 6a25019d10..0df9bab30c 100644
--- a/doc/source/reference/language/lint.rst
+++ b/doc/source/reference/language/lint.rst
@@ -722,6 +722,56 @@ The rule deliberately does NOT cover:
 Cross-type casts (widening, narrowing, signedness change, float ↔ int)
 are genuine work and do NOT fire.
 
+PERF021 — hoist common workhorse cast out of ternary
+======================================================
+
+``cond ? T(a) : T(b)`` where both branches apply the **same** workhorse
+cast ``T`` emits two ``ExprCall`` nodes that do identical work regardless
+of which branch is taken. Hoisting the cast outside the ternary collapses
+them to one: ``T(cond ? a : b)``.
+
+Uses the same 15-name workhorse cast set as PERF020. The rule fires only
+when:
+
+- Both ternary branches are calls to the same workhorse cast name (after
+  peeling ``ExprRef2Value``).
+- Both calls share the same target ``Type``.
+- Both arguments share the same ``baseType`` — so the hoisted
+  ``T(cond ? a : b)`` typechecks without an intermediate cast.
+
+If the argument base types differ (e.g. ``cond ? string(intV) :
+string(int64V)``), the rule does NOT fire; the rewrite would need a
+manual widen on one branch and that is left to the author.
+
+.. code-block:: das
+
+    // Bad
+    def to_str(c : bool; a, b : int) : string {
+        return c ? string(a) : string(b)                    // PERF021
+    }
+
+    def widen(c : bool; a, b : int) : int64 {
+        return c ? int64(a) : int64(b)                      // PERF021
+    }
+
+    // Good
+    def to_str(c : bool; a, b : int) : string {
+        return string(c ? a : b)
+    }
+
+    def widen(c : bool; a, b : int) : int64 {
+        return int64(c ? a : b)
+    }
+
+When the conditions above hold, the rewrite is safe: the original ternary
+evaluates exactly one of ``a`` / ``b``, and so does the hoisted form, so the
+argument evaluation count is unchanged. Only the duplicated cast call is
+eliminated.
+
+User-named struct / enum / bitfield constructors (``MyEnum(x)``,
+``Foo(v=x)``) and multi-argument vector constructors (``float2(x, y)``)
+do not match the workhorse cast set and are intentionally out of scope.
+
 .. _style_lint:
 
 -----------
diff --git a/include/daScript/misc/memory_model.h b/include/daScript/misc/memory_model.h
index d95af0d29f..82311f754b 100644
--- a/include/daScript/misc/memory_model.h
+++ b/include/daScript/misc/memory_model.h
@@ -302,7 +302,10 @@ namespace das {
         void setTrackAllocations ( bool on );
         __forceinline bool isTrackingAllocations() const { return trackAllocations; }
         CustomGrowFunction      customGrow;
-        uint32_t                alignMask;
+        // Mask must be uint64 — `(size + alignMask) & ~alignMask` in allocate/free/reallocate
+        // would otherwise zero-extend `~alignMask` from uint32 to uint64 as 0x00000000FFFFFFF0,
+        // silently truncating any allocation ≥ 4 GB to its low 32 bits.
+        uint64_t                alignMask;
         uint64_t                totalAllocated;
         uint64_t                maxAllocated;
         uint64_t                initialSize = 0;
@@ -411,7 +414,10 @@ namespace das {
         CustomGrowFunction  customGrow;
         uint64_t    unadjustedInitialSize = 0;
         uint64_t    initialSize = 0;
-        uint32_t    alignMask = 15;
+        // uint64 — see MemoryModel::alignMask. `~alignMask` must be uint64 so the
+        // `DAS_VERIFYF(s <= UINT32_MAX)` cap check in allocate() actually fires on
+        // >4 GB requests instead of seeing a silently-truncated low-32-bit size.
+        uint64_t    alignMask = 15;
         HeapChunk * chunk = nullptr;
     };
 
diff --git a/src/ast/ast_infer_type.cpp b/src/ast/ast_infer_type.cpp
index c38c099bf7..95e01f70de 100644
--- a/src/ast/ast_infer_type.cpp
+++ b/src/ast/ast_infer_type.cpp
@@ -2273,6 +2273,12 @@ namespace das {
             } else if (expr->trait == "is_numeric") {
                 reportAstChanged();
                 return new ExprConstBool(expr->at, expr->typeexpr->isNumeric());
+            } else if (expr->trait == "is_int") {
+                reportAstChanged();
+                return new ExprConstBool(expr->at, expr->typeexpr->baseType == Type::tInt && expr->typeexpr->dim.size() == 0);
+            } else if (expr->trait == "is_int64") {
+                reportAstChanged();
+                return new ExprConstBool(expr->at, expr->typeexpr->baseType == Type::tInt64 && expr->typeexpr->dim.size() == 0);
             } else if (expr->trait == "is_numeric_comparable") {
                 reportAstChanged();
                 return new ExprConstBool(expr->at, expr->typeexpr->isNumericComparable());
diff --git a/src/ast/ast_infer_type_op.cpp b/src/ast/ast_infer_type_op.cpp
index 9bddbfe66c..21442f29c0 100644
--- a/src/ast/ast_infer_type_op.cpp
+++ b/src/ast/ast_infer_type_op.cpp
@@ -221,6 +221,9 @@ namespace das {
                 }
             } else if (expr->left->rtti_isAt()) {
                 ExprAt *eat = (ExprAt *)(expr->left);
+                if ( !eat->subexpr->type || eat->subexpr->type->isExprType() ) {
+                    return nullptr;
+                }
                 auto complexName = "[]" + expr->name;
                 if (auto atComplex = inferGenericOperator3(complexName, eat->at, eat->subexpr, eat->index, expr->right)) {
                     atComplex->alwaysSafe = eat->alwaysSafe | expr->alwaysSafe;
diff --git a/src/simulate/simulate_fusion_at_array.cpp b/src/simulate/simulate_fusion_at_array.cpp
index f879b3ad47..7cda4d0e1a 100644
--- a/src/simulate/simulate_fusion_at_array.cpp
+++ b/src/simulate/simulate_fusion_at_array.cpp
@@ -35,6 +35,12 @@ namespace das {
         uint32_t  stride, offset;
     };
 
+    // int64-indexed parallel base for fused arr[i64] access
+    struct SimNode_Op2ArrayAt_I64 : SimNode_Op2ArrayAt {};
+
+    // uint64-indexed parallel base for fused arr[u64] access
+    struct SimNode_Op2ArrayAt_U64 : SimNode_Op2ArrayAt {};
+
 /* ArrayAtR2V SCALAR */
 
 #define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
@@ -151,10 +157,258 @@ namespace das {
 
     IMPLEMENT_ANY_SETOP(__forceinline, ArrayAt, Ptr, StringPtr, StringPtr);
 
+/* ArrayAtR2V_I64 SCALAR */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_I64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = r.subexpr->evalInt64(context); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            return *((CTYPE *)(pl->data + uint64_t(rr)*uint64_t(stride) + offset)); \
+        } \
+        DAS_NODE(TYPE,CTYPE); \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_I64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = *((int64_t *)r.compute##COMPUTER(context)); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            return *((CTYPE *)(pl->data + uint64_t(rr)*uint64_t(stride) + offset)); \
+        } \
+        DAS_NODE(TYPE,CTYPE); \
+    };
+
+#undef IMPLEMENT_OP2_SET_SETUP_NODE
+#define IMPLEMENT_OP2_SET_SETUP_NODE(result,node) \
+    auto rn = (SimNode_Op2ArrayAt_I64 *)result; \
+    auto sn = (SimNode_ArrayAt_I64 *)node; \
+    rn->stride = sn->stride; \
+    rn->offset = sn->offset;
+
+#undef FUSION_OP2_SUBEXPR_LEFT
+#undef FUSION_OP2_SUBEXPR_RIGHT
+#define FUSION_OP2_SUBEXPR_LEFT(CTYPE,node)     ((static_cast<SimNode_ArrayAt_I64 *>(node))->l)
+#define FUSION_OP2_SUBEXPR_RIGHT(CTYPE,node)    ((static_cast<SimNode_ArrayAt_I64 *>(node))->r)
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_SETOP_SCALAR(ArrayAtR2V_I64);
+
+/* ArrayAtR2V_I64 VECTOR */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_I64 { \
+         DAS_EVAL_ABI virtual vec4f eval ( Context & context ) override { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = r.subexpr->evalInt64(context); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            vec4f __r; \
+            DAS_LDU_WORKHORSE(__r, pl->data + uint64_t(rr)*uint64_t(stride) + offset, CTYPE); \
+            return __r; \
+        } \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_I64 { \
+         DAS_EVAL_ABI virtual vec4f eval ( Context & context ) override { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = *((int64_t *)r.compute##COMPUTER(context)); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            vec4f __r; \
+            DAS_LDU_WORKHORSE(__r, pl->data + uint64_t(rr)*uint64_t(stride) + offset, CTYPE); \
+            return __r; \
+        } \
+    };
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_SETOP_NUMERIC_VEC(ArrayAtR2V_I64);
+
+/* ArrayAt_I64 */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_I64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = r.subexpr->evalInt64(context); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            return pl->data + uint64_t(rr)*uint64_t(stride) + offset; \
+        } \
+        DAS_PTR_NODE; \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_I64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            int64_t rr = *((int64_t *)r.compute##COMPUTER(context)); \
+            if ( rr<0 || uint64_t(rr) >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %lld of %llu", (long long)rr, (unsigned long long)pl->size); \
+            return pl->data + uint64_t(rr)*uint64_t(stride) + offset; \
+        } \
+        DAS_PTR_NODE; \
+    };
+
+#undef IMPLEMENT_OP2_SET_SETUP_NODE
+#define IMPLEMENT_OP2_SET_SETUP_NODE(result,node) \
+    auto rn = (SimNode_Op2ArrayAt_I64 *)result; \
+    auto sn = (SimNode_ArrayAt_I64 *)node; \
+    rn->stride = sn->stride; \
+    rn->offset = sn->offset; \
+    rn->baseType = Type::none;
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_ANY_SETOP(__forceinline, ArrayAt_I64, Ptr, StringPtr, StringPtr);
+
+/* ArrayAtR2V_U64 SCALAR */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_U64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = r.subexpr->evalUInt64(context); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            return *((CTYPE *)(pl->data + rr*uint64_t(stride) + offset)); \
+        } \
+        DAS_NODE(TYPE,CTYPE); \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_U64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = *((uint64_t *)r.compute##COMPUTER(context)); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            return *((CTYPE *)(pl->data + rr*uint64_t(stride) + offset)); \
+        } \
+        DAS_NODE(TYPE,CTYPE); \
+    };
+
+#undef IMPLEMENT_OP2_SET_SETUP_NODE
+#define IMPLEMENT_OP2_SET_SETUP_NODE(result,node) \
+    auto rn = (SimNode_Op2ArrayAt_U64 *)result; \
+    auto sn = (SimNode_ArrayAt_U64 *)node; \
+    rn->stride = sn->stride; \
+    rn->offset = sn->offset;
+
+#undef FUSION_OP2_SUBEXPR_LEFT
+#undef FUSION_OP2_SUBEXPR_RIGHT
+#define FUSION_OP2_SUBEXPR_LEFT(CTYPE,node)     ((static_cast<SimNode_ArrayAt_U64 *>(node))->l)
+#define FUSION_OP2_SUBEXPR_RIGHT(CTYPE,node)    ((static_cast<SimNode_ArrayAt_U64 *>(node))->r)
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_SETOP_SCALAR(ArrayAtR2V_U64);
+
+/* ArrayAtR2V_U64 VECTOR */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_U64 { \
+         DAS_EVAL_ABI virtual vec4f eval ( Context & context ) override { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = r.subexpr->evalUInt64(context); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            vec4f __r; \
+            DAS_LDU_WORKHORSE(__r, pl->data + rr*uint64_t(stride) + offset, CTYPE); \
+            return __r; \
+        } \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_U64 { \
+         DAS_EVAL_ABI virtual vec4f eval ( Context & context ) override { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = *((uint64_t *)r.compute##COMPUTER(context)); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            vec4f __r; \
+            DAS_LDU_WORKHORSE(__r, pl->data + rr*uint64_t(stride) + offset, CTYPE); \
+            return __r; \
+        } \
+    };
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_SETOP_NUMERIC_VEC(ArrayAtR2V_U64);
+
+/* ArrayAt_U64 */
+
+#undef IMPLEMENT_OP2_SET_NODE_ANY
+#define IMPLEMENT_OP2_SET_NODE_ANY(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_Any : SimNode_Op2ArrayAt_U64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = r.subexpr->evalUInt64(context); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            return pl->data + rr*uint64_t(stride) + offset; \
+        } \
+        DAS_PTR_NODE; \
+    };
+
+#undef IMPLEMENT_OP2_SET_NODE
+#define IMPLEMENT_OP2_SET_NODE(INLINE,OPNAME,TYPE,CTYPE,COMPUTEL,COMPUTER) \
+    struct SimNode_##OPNAME##_##COMPUTEL##_##COMPUTER : SimNode_Op2ArrayAt_U64 { \
+        INLINE auto compute ( Context & context ) { \
+            DAS_PROFILE_NODE \
+            auto pl = (Array *) l.compute##COMPUTEL(context); \
+            uint64_t rr = *((uint64_t *)r.compute##COMPUTER(context)); \
+            if ( rr >= pl->size ) context.throw_error_at(debugInfo,"array index out of range, %llu of %llu", (unsigned long long)rr, (unsigned long long)pl->size); \
+            return pl->data + rr*uint64_t(stride) + offset; \
+        } \
+        DAS_PTR_NODE; \
+    };
+
+#undef IMPLEMENT_OP2_SET_SETUP_NODE
+#define IMPLEMENT_OP2_SET_SETUP_NODE(result,node) \
+    auto rn = (SimNode_Op2ArrayAt_U64 *)result; \
+    auto sn = (SimNode_ArrayAt_U64 *)node; \
+    rn->stride = sn->stride; \
+    rn->offset = sn->offset; \
+    rn->baseType = Type::none;
+
+#include "daScript/simulate/simulate_fusion_op2_set_impl.h"
+#include "daScript/simulate/simulate_fusion_op2_set_perm.h"
+
+    IMPLEMENT_ANY_SETOP(__forceinline, ArrayAt_U64, Ptr, StringPtr, StringPtr);
+
     void createFusionEngine_at_array() {
         REGISTER_SETOP_SCALAR(ArrayAtR2V);
         REGISTER_SETOP_NUMERIC_VEC(ArrayAtR2V);
         (*getFusionEngine())["ArrayAt"].emplace_back(new FusionPoint_Set_ArrayAt_StringPtr());
+        REGISTER_SETOP_SCALAR(ArrayAtR2V_I64);
+        REGISTER_SETOP_NUMERIC_VEC(ArrayAtR2V_I64);
+        (*getFusionEngine())["ArrayAt_I64"].emplace_back(new FusionPoint_Set_ArrayAt_I64_StringPtr());
+        REGISTER_SETOP_SCALAR(ArrayAtR2V_U64);
+        REGISTER_SETOP_NUMERIC_VEC(ArrayAtR2V_U64);
+        (*getFusionEngine())["ArrayAt_U64"].emplace_back(new FusionPoint_Set_ArrayAt_U64_StringPtr());
     }
 }
 
diff --git a/tests-cpp/small/test_heap_64bit.cpp b/tests-cpp/small/test_heap_64bit.cpp
index b011ef87bf..87d4fcc93a 100644
--- a/tests-cpp/small/test_heap_64bit.cpp
+++ b/tests-cpp/small/test_heap_64bit.cpp
@@ -13,6 +13,7 @@
 
 #include "daScript/daScript.h"
 #include "daScript/daScriptC.h"
+#include "daScript/simulate/aot_builtin.h"   // heap_bytes_allocated
 
 #include <cstdlib>
 #include <cstring>
@@ -116,6 +117,53 @@ TEST_CASE("legacy uint32_t C-API still works after heap widening") {
     cleanup_inline_ctx(ic);
 }
 
+TEST_CASE("alignMask uint32 truncation guard: 4 GB allocation reports correct bytesAllocated (gated)") {
+    // Repro for the alignMask uint32_t truncation in MemoryModel::allocate
+    // (`size = (size + alignMask) & ~alignMask` — `~alignMask` is uint32 and
+    // zero-extends to 0x00000000FFFFFFF0 when ANDed with uint64 size). Sizes
+    // ≥ 4 GB lose their high 32 bits; for exactly 4 GB the function takes the
+    // shoe path with size=0, computes `si = (0>>4) - 1 = 0xFFFFFFFF`, and reads
+    // `chunks[0xFFFFFFFF]` — a wild-address read that crashes the process.
+    //
+    // On master with the bug: das_context_allocate_i64(ctx, 4GB) crashes
+    // inside MemoryModel::allocate via shoe.chunks OOB.
+    // After widening alignMask to uint64_t: the AND no longer truncates,
+    // the allocation lands in the bigStuff path, and bytesAllocated() grows
+    // by ≥ 4 GB. The CHECK below catches a regression where the mask flips
+    // back to uint32 (bytesAllocated grows by ~16 instead of ≥ 4 GB).
+    if constexpr ( sizeof(void*) < 8 ) {
+        WARN("DASLANG_HUGE_HEAP_TESTS: 32-bit build, skipping");
+        return;
+    }
+    const char * env = getenv("DASLANG_HUGE_HEAP_TESTS");
+    if ( !env || env[0] != '1' ) {
+        WARN("DASLANG_HUGE_HEAP_TESTS=1 not set, skipping 4 GB alignMask probe");
+        return;
+    }
+
+    // persistent_heap routes through PersistentHeapAllocator (MemoryModel/bigStuff).
+    // The default LinearHeapAllocator is uint32-bounded per the policy at
+    // memory_model.h:415-422 — >4GB allocations through it should panic with a
+    // clear message rather than silently truncate. PR-A's widening of
+    // LinearChunkAllocator::alignMask also enables that policy check to fire.
+    static const char * SRC =
+        "options gen2\n"
+        "options persistent_heap = true\n"
+        "[export] def main {}\n";
+    InlineCtx ic = compile_inline(SRC);
+    REQUIRE(ic.ctx != nullptr);
+
+    const uint64_t HUGE_BYTES = uint64_t(4) * 1024 * 1024 * 1024;  // exactly 4 GB
+    const uint64_t before = heap_bytes_allocated(reinterpret_cast<Context*>(ic.ctx));
+    void * p = das_context_allocate_i64(ic.ctx, HUGE_BYTES);
+    REQUIRE(p != nullptr);
+    const uint64_t after = heap_bytes_allocated(reinterpret_cast<Context*>(ic.ctx));
+    CHECK(after - before >= HUGE_BYTES);
+    das_context_free_i64(ic.ctx, p, HUGE_BYTES);
+
+    cleanup_inline_ctx(ic);
+}
+
 TEST_CASE("uint64 size accepts values larger than UINT32_MAX (gated)") {
     // Only runs when DASLANG_HUGE_HEAP_TESTS=1 — a 5GB allocation isn't free
     // even on big runners. Compile-time disabled on 32-bit builds where
@@ -130,7 +178,13 @@ TEST_CASE("uint64 size accepts values larger than UINT32_MAX (gated)") {
         return;
     }
 
-    static const char * SRC = "options gen2\n[export] def main {}\n";
+    // persistent_heap required: default LinearHeapAllocator is uint32-bounded
+    // (per memory_model.h:415-422). >4GB allocations need PersistentHeapAllocator
+    // / MemoryModel::bigStuff path.
+    static const char * SRC =
+        "options gen2\n"
+        "options persistent_heap = true\n"
+        "[export] def main {}\n";
     InlineCtx ic = compile_inline(SRC);
     REQUIRE(ic.ctx != nullptr);
 
diff --git a/tests/linq/test_linq_fold.das b/tests/linq/test_linq_fold.das
index 07a18b8fcb..a9b0636a95 100644
--- a/tests/linq/test_linq_fold.das
+++ b/tests/linq/test_linq_fold.das
@@ -2776,3 +2776,70 @@ def test_take_while_skip_while_cascade_bails(t : T?) {
         tt |> equal(0, length(got))
     }
 }
+
+[test]
+def test_fold_order_by_first(t : T?) {
+    t |> run("order_by + first → min_by") @(tt : T?) {
+        let v = _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by(_).first())
+        tt |> equal(2, v, "minimum element after order_by")
+    }
+    t |> run("order_by_descending + first → max_by") @(tt : T?) {
+        let v = _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by_descending(_).first())
+        tt |> equal(30, v, "maximum element after order_by_descending")
+    }
+    t |> run("where + order_by + first → min on prefilter buf") @(tt : T?) {
+        let v = _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._where(_ > 5)._order_by(_).first())
+        tt |> equal(8, v, "min of (10,20,8,30,15,25) = 8")
+    }
+    t |> run("order_by + first_or_default — empty source returns default") @(tt : T?) {
+        let empty : array<int>
+        let v = _fold(each(empty)._order_by(_).first_or_default(-1))
+        tt |> equal(-1, v, "empty source → default")
+    }
+    t |> run("order_by + first_or_default — non-empty returns min") @(tt : T?) {
+        let v = _fold(each([10, 5, 20])._order_by(_).first_or_default(-1))
+        tt |> equal(5, v, "non-empty source → min element")
+    }
+    t |> run("where + order_by + first_or_default — empty after filter") @(tt : T?) {
+        let v = _fold(each([1, 2, 3])._where(_ > 100)._order_by(_).first_or_default(-1))
+        tt |> equal(-1, v, "filter excludes all → default")
+    }
+    t |> run("order_by + first on empty array panics") @(tt : T?) {
+        let empty : array<int>
+        var didPanic = false
+        try {
+            let v = _fold(each(empty)._order_by(_).first())
+            tt |> success(false, "expected panic; got {v}")
+        } recover {
+            didPanic = true
+        }
+        tt |> success(didPanic, "first() on empty array panicked")
+    }
+    t |> run("where + order_by + first filtered-empty panics") @(tt : T?) {
+        var didPanic = false
+        try {
+            let v = _fold(each([1, 2, 3])._where(_ > 100)._order_by(_).first())
+            tt |> success(false, "expected panic; got {v}")
+        } recover {
+            didPanic = true
+        }
+        tt |> success(didPanic, "first() on filtered-empty array panicked")
+    }
+    // ── order(arr, cmp) + first — splice MUST honor the custom comparator (was emitting bare min/max, ignoring cmp)
+    t |> run("order(arr, cmp) + first preserves custom comparator (descending cmp → max)") @(tt : T?) {
+        let v = _fold(each([10, 5, 20, 3]) |> order($(a : int, b : int) => a > b) |> first())
+        tt |> equal(20, v, "cmp orders descending → first returns max")
+    }
+    t |> run("order_descending(arr, cmp) + first preserves custom comparator") @(tt : T?) {
+        // order_descending swaps cmp args, so cmp `a > b` becomes effectively `b > a` (ascending) → first → min
+        let v = _fold(each([10, 5, 20, 3]) |> order_descending($(a : int, b : int) => a > b) |> first())
+        tt |> equal(3, v, "order_descending negates cmp → first is min")
+    }
+    t |> run("order(arr, cmp) + take preserves custom comparator (pre-existing recognizer hole)") @(tt : T?) {
+        // cmp orders descending → take(2) returns top 2
+        let result <- _fold(each([10, 5, 20, 3, 25, 8]) |> order($(a : int, b : int) => a > b) |> take(2) |> to_array())
+        tt |> equal(2, length(result), "take 2 of descending order")
+        tt |> equal(25, result[0], "first is max (25)")
+        tt |> equal(20, result[1], "second is next-max (20)")
+    }
+}
diff --git a/tests/linq/test_linq_fold_ast.das b/tests/linq/test_linq_fold_ast.das
index 86e3153bd3..d80344431e 100644
--- a/tests/linq/test_linq_fold_ast.das
+++ b/tests/linq/test_linq_fold_ast.das
@@ -160,6 +160,103 @@ def target_zip_impure_select_skip_bails() : array<int> {
     return <- [10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => side_effect_zip_proj(t._0)) |> skip(2) |> _fold()
 }
 
+// ── PR Phase 2B+ — accumulator (sum/min/max/average) terminators on zip ──
+// sum/min/max/average require a `select` projection to scalarize the tuple element (accumulator ops aren't defined on tuples). `long_count` is the exception: it routes through the COUNTER path locally (length shortcut + counter loop) since the accumulator is just `int64 acc++` regardless of element type.
+[export, marker(no_coverage)]
+def target_zip_sum_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> sum())
+}
+
+[export, marker(no_coverage)]
+def target_zip_min_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 - t._1) |> min())
+}
+
+[export, marker(no_coverage)]
+def target_zip_max_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 * t._1) |> max())
+}
+
+[export, marker(no_coverage)]
+def target_zip_average_proj_fold() : double {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> average())
+}
+
+[export, marker(no_coverage)]
+def target_zip_where_sum_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> where_($(t : tuple<int; int>) => t._0 > 20) |> select($(t : tuple<int; int>) => t._0 + t._1) |> sum())
+}
+
+[export, marker(no_coverage)]
+def target_zip_where_long_count_fold() : int64 {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> where_($(t : tuple<int; int>) => t._0 > 20) |> long_count())
+}
+
+// ── PR Phase 2B+ — early-exit (first/first_or_default/any/all/contains) terminators on zip ──
+[export, marker(no_coverage)]
+def target_zip_first_no_proj_fold() : tuple<int; int> {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> first())
+}
+
+[export, marker(no_coverage)]
+def target_zip_first_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> first())
+}
+
+[export, marker(no_coverage)]
+def target_zip_where_first_fold() : tuple<int; int> {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> where_($(t : tuple<int; int>) => t._0 > 20) |> first())
+}
+
+[export, marker(no_coverage)]
+def target_zip_first_or_default_proj_fold() : int {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> where_($(v : int) => v > 1000) |> first_or_default(-1))
+}
+
+[export, marker(no_coverage)]
+def target_zip_any_no_pred_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> any())
+}
+
+[export, marker(no_coverage)]
+def target_zip_any_no_pred_empty_fold() : bool {
+    var emptyA : array<int>
+    return _fold(emptyA.zip([1, 2, 3]) |> any())
+}
+
+[export, marker(no_coverage)]
+def target_zip_any_pred_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> any($(t : tuple<int; int>) => t._0 > 30))
+}
+
+[export, marker(no_coverage)]
+def target_zip_all_pred_true_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> all($(t : tuple<int; int>) => t._0 > 0))
+}
+
+[export, marker(no_coverage)]
+def target_zip_all_pred_false_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> all($(t : tuple<int; int>) => t._0 > 30))
+}
+
+[export, marker(no_coverage)]
+def target_zip_contains_proj_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> contains(33))
+}
+
+[export, marker(no_coverage)]
+def target_zip_contains_proj_miss_fold() : bool {
+    return _fold([10, 20, 30, 40, 50].zip([1, 2, 3, 4, 5]) |> select($(t : tuple<int; int>) => t._0 + t._1) |> contains(999))
+}
+
+// PR #2742 Copilot review #r3270337476 — DEFERRED: emit_accumulator_lane.average semantics divergence from linq.das. Pre-existing in helper (affects plan_loop_or_count's single-source path too): accumulates in accType (often int → overflow risk) and returns NaN on empty cnt, while linq.das average accumulates in double and returns 0.0lf on empty. Follow-up PR must fix the helper uniformly AND update the existing fold test "average: empty → NaN" in test_linq_fold.das to expect 0.0lf.
+[export, marker(no_coverage)]
+def target_zip_average_empty_fold() : double {
+    var emptyA : array<int>
+    var emptyB : array<int>
+    return _fold(emptyA.zip(emptyB) |> select($(t : tuple<int; int>) => t._0 + t._1) |> average())
+}
+
 [export, marker(no_coverage)]
 def target_zip3_fold() : array<tuple<int; int; int>> {
     return <- [1, 2, 3]._select(_ * 2).zip([10, 20, 30]._select(_ + 1), [100, 200, 300]._select(_ / 10))._fold()
@@ -339,6 +436,9 @@ def test_zip_long_count_uses_length_shortcut(t : T?) {
         t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
         let n = count_inner_for_loops(body_expr)
         t |> equal(n, 0, "zip(arr,arr).long_count() must use length shortcut (no for-loop)")
+        // Distinguish length-shortcut splice from tier-2 cascade by elimination: for_loops==0 rules out the counter-loop splice path; long_count==0 rules out tier-2 (which leaves the long_count call in place). Both together ⇒ length shortcut. (Can't grep for length() directly: count_call doesn't recurse into ExprOp3 ternary where length() lives in the min(...) expression.)
+        let longCountCalls = count_call(body_expr, "long_count")
+        t |> equal(longCountCalls, 0, "long_count must be inlined into the splice (no runtime call)")
     }
 }
 
@@ -569,6 +669,213 @@ def test_zip_select_count_pure_skips_it_bind(t : T?) {
     }
 }
 
+// ── PR Phase 2B+ accumulator (sum/min/max/average) terminators on zip ──
+// Source: arrA=[10,20,30,40,50], arrB=[1,2,3,4,5]; pairs are (10,1)(20,2)(30,3)(40,4)(50,5).
+
+[test]
+def test_zip_sum_proj_fold_result(t : T?) {
+    t |> run("zip+select+sum returns scalar accumulator") @(t : T?) {
+        // Projected pair sums 11, 22, 33, 44, 55 add up to 165.
+        equal(t, target_zip_sum_proj_fold(), 165)
+    }
+}
+
+[test]
+def test_zip_min_proj_fold_result(t : T?) {
+    t |> run("zip+select+min picks smallest projected value") @(t : T?) {
+        // Projected differences are 9, 18, 27, 36, 45, so the minimum is 9.
+        equal(t, target_zip_min_proj_fold(), 9)
+    }
+}
+
+[test]
+def test_zip_max_proj_fold_result(t : T?) {
+    t |> run("zip+select+max picks largest projected value") @(t : T?) {
+        // Projected products are 10, 40, 90, 160, 250, so the maximum is 250.
+        equal(t, target_zip_max_proj_fold(), 250)
+    }
+}
+
+[test]
+def test_zip_average_proj_fold_result(t : T?) {
+    t |> run("zip+select+average returns double") @(t : T?) {
+        // Mean of 11, 22, 33, 44, 55 is 165 / 5 = 33.0.
+        equal(t, target_zip_average_proj_fold(), 33.0lf)
+    }
+}
+
+[test]
+def test_zip_where_sum_proj_fold_result(t : T?) {
+    t |> run("zip+where+select+sum filters then sums") @(t : T?) {
+        // Pairs with t._0 > 20 are (30,3), (40,4), (50,5); their sums 33 + 44 + 55 give 132.
+        equal(t, target_zip_where_sum_proj_fold(), 132)
+    }
+}
+
+[test]
+def test_zip_where_long_count_fold_result(t : T?) {
+    t |> run("zip+where+long_count returns int64 survivor count") @(t : T?) {
+        // Three pairs survive the filter; the count comes back as an int64 literal.
+        equal(t, target_zip_where_long_count_fold(), 3l)
+    }
+}
+
+// long_count + chain op (where) must emit a single multi-iter counter for-loop, not cascade to tier-2. Regression guard against routing long_count through the ACCUMULATOR projection-required branch (PR #2742 Copilot review #r3270242609).
+[test]
+def test_zip_where_long_count_emits_counter_loop(t : T?) {
+    // AST-shape check: the folded body must be a splice invoke holding one for-loop, with no residual zip/long_count calls.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_zip_where_long_count_fold)
+        // Fail loudly when the target cannot be resolved; a bare early return would report a pass with nothing checked.
+        t |> success(func != null, "RTTI must resolve target_zip_where_long_count_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(count_inner_for_loops(body_expr), 1, "zip+where+long_count must emit one multi-iter counter for-loop")
+        t |> equal(count_call(body_expr, "zip"), 0, "zip call must be inlined into the splice")
+        t |> equal(count_call(body_expr, "long_count"), 0, "long_count terminator must be fused (no runtime call)")
+    }
+}
+
+// ── PR Phase 2B+ early-exit (first/first_or_default/any/all/contains) on zip ──
+
+[test]
+def test_zip_first_no_proj_fold_result(t : T?) {
+    t |> run("zip+first (no projection) returns first tuple") @(t : T?) {
+        let pair = target_zip_first_no_proj_fold() // expected to be the leading pair (10, 1)
+        equal(t, pair._0, 10)
+        equal(t, pair._1, 1)
+    }
+}
+
+[test]
+def test_zip_first_proj_fold_result(t : T?) {
+    t |> run("zip+select+first returns first projected scalar") @(t : T?) {
+        // The leading pair (10,1) projects to 11.
+        equal(t, target_zip_first_proj_fold(), 11)
+    }
+}
+
+[test]
+def test_zip_where_first_fold_result(t : T?) {
+    t |> run("zip+where+first returns first survivor tuple") @(t : T?) {
+        // (30,3) is the earliest pair whose first element exceeds 20.
+        let survivor = target_zip_where_first_fold()
+        equal(t, survivor._0, 30)
+        equal(t, survivor._1, 3)
+    }
+}
+
+[test]
+def test_zip_first_or_default_proj_fold_result(t : T?) {
+    t |> run("zip+select+where+first_or_default returns default when no survivor") @(t : T?) {
+        // No projected sum exceeds 1000, so the supplied default of -1 is returned.
+        equal(t, target_zip_first_or_default_proj_fold(), -1)
+    }
+}
+
+[test]
+def test_zip_any_no_pred_fold_result(t : T?) {
+    t |> run("zip+any (no pred) on non-empty pair → true") @(t : T?) {
+        equal(t, target_zip_any_no_pred_fold(), true) // predicate-free any() on a non-empty zip
+    }
+}
+
+[test]
+def test_zip_any_no_pred_empty_fold_result(t : T?) {
+    t |> run("zip+any (no pred) on empty source → false") @(t : T?) {
+        // Zipping an empty array with three elements yields min(0,3) = 0 iterations, so any() is false.
+        equal(t, target_zip_any_no_pred_empty_fold(), false)
+    }
+}
+
+[test]
+def test_zip_any_pred_fold_result(t : T?) {
+    t |> run("zip+any(pred) returns true if any pair matches") @(t : T?) {
+        // Both (40,4) and (50,5) have a first element above 30.
+        equal(t, target_zip_any_pred_fold(), true)
+    }
+}
+
+[test]
+def test_zip_all_pred_true_fold_result(t : T?) {
+    t |> run("zip+all(pred) returns true when all match") @(t : T?) {
+        equal(t, target_zip_all_pred_true_fold(), true) // every first element is positive
+    }
+}
+
+[test]
+def test_zip_all_pred_false_fold_result(t : T?) {
+    t |> run("zip+all(pred) returns false when any fails") @(t : T?) {
+        // First elements 10, 20 and 30 are not above 30, so all() must be false.
+        equal(t, target_zip_all_pred_false_fold(), false)
+    }
+}
+
+[test]
+def test_zip_contains_proj_fold_result(t : T?) {
+    t |> run("zip+select+contains returns true on hit") @(t : T?) {
+        // The pair (30,3) projects to 33, which is the searched value.
+        equal(t, target_zip_contains_proj_fold(), true)
+    }
+}
+
+[test]
+def test_zip_contains_proj_miss_fold_result(t : T?) {
+    t |> run("zip+select+contains returns false on miss") @(t : T?) {
+        equal(t, target_zip_contains_proj_miss_fold(), false) // 999 is not among the projected sums
+    }
+}
+
+// PR #2742 Copilot review #r3270337476 — DEFERRED-fix tracking test. Documents the desired post-fix behavior (zip+select+average on empty source should return 0.0lf per linq.das semantics). Currently emit_accumulator_lane returns NaN, so this test is skipped until the helper is aligned (uniform fix across plan_loop_or_count + plan_zip; also requires updating "average: empty → NaN" in test_linq_fold.das to expect 0.0lf).
+[test]
+def test_zip_average_empty_returns_zero_when_fixed(t : T?) { // placeholder: only records a skip; no assertion runs until the deferred fix lands
+    t->skip("DEFERRED: zip+select+average on empty source should return 0.0lf per linq.das semantics. Blocked on follow-up PR aligning emit_accumulator_lane.average with linq.das (double accumulator + cnt==0 guard); pre-existing helper divergence — affects single-source plan_loop_or_count too. Un-skip + assert `equal(target_zip_average_empty_fold(), 0.0lf)` when fix lands.")
+}
+
+// AST-shape: zip+accumulator splices into a single for-loop with no surviving runtime `sum`/`zip` calls.
+[test]
+def test_zip_sum_proj_emits_inline_accumulator(t : T?) {
+    // AST-shape check: the folded body must be a splice invoke holding one for-loop, with no residual zip/sum calls.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_zip_sum_proj_fold)
+        // Fail loudly when the target cannot be resolved; a bare early return would report a pass with nothing checked.
+        t |> success(func != null, "RTTI must resolve target_zip_sum_proj_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(count_inner_for_loops(body_expr), 1, "zip+select+sum must emit exactly one multi-iter for-loop")
+        t |> equal(count_call(body_expr, "zip"), 0, "zip call must be inlined into the splice")
+        t |> equal(count_call(body_expr, "sum"), 0, "sum terminator must be fused (no runtime sum call)")
+    }
+}
+
+// AST-shape: zip+early-exit emits the same single-for shape with no surviving terminator call.
+[test]
+def test_zip_first_proj_emits_early_return(t : T?) {
+    // AST-shape check: the folded body must be a splice invoke holding one for-loop, with no residual zip/first calls.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_zip_first_proj_fold)
+        // Fail loudly when the target cannot be resolved; a bare early return would report a pass with nothing checked.
+        t |> success(func != null, "RTTI must resolve target_zip_first_proj_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(count_inner_for_loops(body_expr), 1, "zip+select+first must emit exactly one multi-iter for-loop")
+        t |> equal(count_call(body_expr, "zip"), 0, "zip call must be inlined")
+        t |> equal(count_call(body_expr, "first"), 0, "first terminator must be fused (no runtime first call)")
+    }
+}
+
 // ── Targets for `_fold` Phase-2A loop planner ──────────────────────────
 
 [export, marker(no_coverage)]
@@ -3531,3 +3838,130 @@ def test_select_then_take_while_cascades_to_tier2(t : T?) {
     }
 }
 
+// ── order_by + first → min_by / max_by splice arm ───────────────────────
+
+[export, marker(no_coverage)]
+def target_order_by_first_splices_min_by() : int { // _fold input: each() over an 8-element literal, _order_by(_) then first()
+    return _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by(_).first())
+}
+
+[export, marker(no_coverage)]
+def target_order_by_descending_first_splices_max_by() : int { // _fold input: same literal, _order_by_descending(_) then first()
+    return _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by_descending(_).first())
+}
+
+[export, marker(no_coverage)]
+def target_where_order_by_first_splices_min_by_on_buf() : int { // _fold input: filter _ > 5, then _order_by(_) and first()
+    return _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._where(_ > 5)._order_by(_).first())
+}
+
+[export, marker(no_coverage)]
+def target_order_by_first_or_default_splices_top_n_by() : int { // _fold input: 5-element literal, _order_by(_) then first_or_default(-1)
+    return _fold(each([10, 20, 5, 8, 30])._order_by(_).first_or_default(-1))
+}
+
+[export, marker(no_coverage)]
+def target_order_with_cmp_first_bails_to_tier2() : int { // _fold input: order() with an explicit comparator block (a > b), then first()
+    // order(arr, cmp) with a custom comparator — splice must bail (min/max can't honor arbitrary cmp).
+    return _fold(each([10, 5, 20, 3]) |> order($(a : int, b : int) => a > b) |> first())
+}
+
+[test]
+def test_order_by_first_emits_min_by(t : T?) { // order_by + first → min_by(top, key) directly. No sort, no take, no order_by/order helpers.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_order_by_first_splices_min_by)
+        t |> success(func != null, "RTTI must resolve target_order_by_first_splices_min_by") // otherwise the early return below passes silently
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "min_by") >= 1, "should emit a min_by call")
+        t |> equal(0, count_call(body_expr, "max_by"), "should NOT emit max_by (ascending)")
+        t |> equal(0, count_call(body_expr, "order_by"), "should NOT emit order_by")
+        t |> equal(0, count_call(body_expr, "first"), "should NOT emit first (splice absorbs)")
+    }
+}
+
+[test]
+def test_order_by_descending_first_emits_max_by(t : T?) { // descending order + first must lower to max_by with no sort or first call left
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_order_by_descending_first_splices_max_by)
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return $e(body_expr)
+        }
+        success(t, m.matched, "should have return expression")
+        success(t, count_call(body_expr, "max_by") >= 1, "should emit a max_by call")
+        equal(t, 0, count_call(body_expr, "min_by"), "should NOT emit min_by (descending)")
+        equal(t, 0, count_call(body_expr, "order_by_descending"), "should NOT emit order_by_descending")
+        equal(t, 0, count_call(body_expr, "first"), "should NOT emit first (splice absorbs)")
+    }
+}
+
+[test]
+def test_where_order_by_first_emits_min_by_on_prefilter(t : T?) {
+    // where + order_by + first → fused prefilter loop + min_by on buf. Same shape as where + order_by + take, but min_by instead of top_n_by.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_where_order_by_first_splices_min_by_on_buf)
+        t |> success(func != null, "RTTI must resolve target_where_order_by_first_splices_min_by_on_buf") // otherwise the early return below passes silently
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "min_by") >= 1, "should emit min_by on prefilter buf")
+        t |> equal(0, count_call(body_expr, "order_by"), "should NOT emit order_by")
+        t |> equal(0, count_call(body_expr, "first"), "should NOT emit first (splice absorbs)")
+    }
+}
+
+[test]
+def test_order_by_first_or_default_emits_top_n_by_first_or_default(t : T?) {
+    // order_by + first_or_default → top_n_by(top, 1, key) |> first_or_default(d) — empty
+    // array handles the default fallback; no min_by_or_default helper exists.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_order_by_first_or_default_splices_top_n_by)
+        t |> success(func != null, "RTTI must resolve target_order_by_first_or_default_splices_top_n_by") // otherwise the early return below passes silently
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "top_n_by_with_cmp") + count_call(body_expr, "top_n_by") >= 1, "should emit top_n_by (or top_n_by_with_cmp for inline key)")
+        t |> success(count_call(body_expr, "first_or_default") >= 1,
+            "should emit first_or_default on the 1-elem array")
+        t |> equal(0, count_call(body_expr, "order_by"), "should NOT emit order_by")
+    }
+}
+
+[test]
+def test_order_with_cmp_first_bails_no_min_max(t : T?) {
+    // `order(arr, cmp)` with a custom comparator block: splice helpers can't honor an arbitrary cmp,
+    // so plan_order_family must bail. fold_linq_default takes over and rewrites to `order_to_array` + `first`.
+    // The key invariant: NO min/max splice (which would silently drop the cmp).
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_order_with_cmp_first_bails_to_tier2)
+        t |> success(func != null, "RTTI must resolve target_order_with_cmp_first_bails_to_tier2") // otherwise the early return below passes silently
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> equal(0, count_call(body_expr, "min_by"), "must NOT emit min_by (cmp can't be honored)")
+        t |> equal(0, count_call(body_expr, "max_by"), "must NOT emit max_by")
+        t |> equal(0, count_call(body_expr, "min"), "must NOT emit min")
+        t |> equal(0, count_call(body_expr, "max"), "must NOT emit max")
+        t |> equal(0, count_call(body_expr, "top_n"), "must NOT emit top_n (bare top_n would drop cmp)")
+        t |> equal(0, count_call(body_expr, "top_n_by"), "must NOT emit top_n_by")
+        // fold_linq_default rewrites the bailed chain to order_to_array + first. Confirm a sort step survives.
+        t |> success(count_call(body_expr, "order_to_array") + count_call(body_expr, "order") >= 1, "sort step must survive (order_to_array or order)")
+        t |> success(count_call(body_expr, "first") >= 1, "first call must survive")
+    }
+}
+
diff --git a/tests/linq/test_linq_from_decs.das b/tests/linq/test_linq_from_decs.das
index 46fdc7aab9..eae9057428 100644
--- a/tests/linq/test_linq_from_decs.das
+++ b/tests/linq/test_linq_from_decs.das
@@ -321,3 +321,364 @@ def test_unroll_select_sum_splice_shape(t : T?) {
         t |> equal(describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
     }
 }
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 3a: min/max/average accumulator extensions
+// ─────────────────────────────────────────────────────────────────────────────
+
+[export, marker(no_coverage)]
+def target_unroll_min_fold() : int { // _fold input: from_decs_template over UnrollChainRow, select val, then min()
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val).min())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_max_fold() : int { // _fold input: from_decs_template over UnrollChainRow, select val, then max()
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val).max())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_average_fold() : double { // _fold input: from_decs_template over UnrollChainRow, select val, then average()
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val).average())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_where_max_fold() : int { // _fold input: keep rows with flag == 1, select val, then max()
+    return _fold(from_decs_template(type<UnrollChainRow>)._where(_.flag == 1)._select(_.val).max())
+}
+
+[test]
+def test_unroll_min_parity(t : T?) {
+    fixture_unroll2(5)
+    // With vals 0, 1, 2, 3, 4 the fused min must be 0.
+    equal(t, target_unroll_min_fold(), 0, "min splice parity")
+}
+
+[test]
+def test_unroll_max_parity(t : T?) {
+    fixture_unroll2(5)
+    // With vals 0, 1, 2, 3, 4 the fused max must be 4.
+    equal(t, target_unroll_max_fold(), 4, "max splice parity")
+}
+
+[test]
+def test_unroll_average_parity(t : T?) {
+    fixture_unroll2(5)
+    // With vals 0, 1, 2, 3, 4 the mean is 10 / 5 = 2.0.
+    equal(t, target_unroll_average_fold(), 2.0lf, "average splice parity")
+}
+
+[test]
+def test_unroll_where_max_parity(t : T?) {
+    fixture_unroll2(5)
+    // Rows with flag == 1 carry vals 1 and 3, so the filtered max is 3.
+    equal(t, target_unroll_where_max_fold(), 3, "where+select+max splice parity")
+}
+
+[test]
+def test_unroll_max_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, exactly one for_each_archetype
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_max_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_max_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "max splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 3b: early-exit terminators (first/first_or_default/any/all/contains)
+// ─────────────────────────────────────────────────────────────────────────────
+
+[export, marker(no_coverage)]
+def target_unroll_any_bare_fold() : bool { // _fold input: predicate-free any() directly on from_decs_template
+    return _fold(from_decs_template(type<UnrollChainRow>).any())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_any_pred_fold() : bool { // _fold input: select val, then _any with predicate _ > 100
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._any(_ > 100))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_all_fold() : bool { // _fold input: select val, then _all with predicate _ >= 0
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._all(_ >= 0))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_contains_fold() : bool { // _fold input: select val, then contains(3)
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val).contains(3))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_first_fold() : int { // _fold input: keep rows with flag == 1, select val, then first()
+    return _fold(from_decs_template(type<UnrollChainRow>)._where(_.flag == 1)._select(_.val).first())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_first_or_default_fold() : int { // _fold input: keep rows with val > 1000, select val, then first_or_default(-7)
+    return _fold(from_decs_template(type<UnrollChainRow>)._where(_.val > 1000)._select(_.val).first_or_default(-7))
+}
+
+[test]
+def test_unroll_any_bare_parity(t : T?) {
+    fixture_unroll2(3) // populated case
+    equal(t, target_unroll_any_bare_fold(), true, "bare any with entities")
+    restart() // NOTE(review): presumably resets the decs state so no entities remain — confirm
+    commit()
+    equal(t, target_unroll_any_bare_fold(), false, "bare any with no entities")
+}
+
+[test]
+def test_unroll_any_pred_parity(t : T?) {
+    fixture_unroll2(5)
+    // Vals 0..4: nothing exceeds 100.
+    equal(t, target_unroll_any_pred_fold(), false, "any(pred) — no match")
+    fixture_unroll2(150)
+    // Vals 0..149: several exceed 100.
+    equal(t, target_unroll_any_pred_fold(), true, "any(pred) — has match")
+}
+
+[test]
+def test_unroll_all_parity(t : T?) {
+    fixture_unroll2(5) // only the all-true outcome is exercised here
+    equal(t, target_unroll_all_fold(), true, "all(>=0) on 0..4")
+}
+
+[test]
+def test_unroll_contains_parity(t : T?) {
+    fixture_unroll2(5) // hit case
+    equal(t, target_unroll_contains_fold(), true, "contains(3) in 0..4")
+    fixture_unroll2(2) // miss case
+    equal(t, target_unroll_contains_fold(), false, "contains(3) in 0..1")
+}
+
+[test]
+def test_unroll_first_parity(t : T?) {
+    fixture_unroll2(5)
+    // The earliest flag == 1 row is i = 1, whose val is 1.
+    equal(t, target_unroll_first_fold(), 1, "first(where flag==1)")
+}
+
+[test]
+def test_unroll_first_or_default_parity(t : T?) {
+    fixture_unroll2(5)
+    // No row has val > 1000, so the default -7 is returned.
+    equal(t, target_unroll_first_or_default_fold(), -7, "first_or_default empty match")
+}
+
+[test]
+def test_unroll_first_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, one for_each_archetype_find
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_first_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_first_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "first splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype_find"), 1, "first splice uses for_each_archetype_find")
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 3c: to_array buffer terminator
+// ─────────────────────────────────────────────────────────────────────────────
+
+[export, marker(no_coverage)]
+def target_unroll_to_array_fold() : array<int> { // _fold input: keep rows with flag == 1, select val, then to_array(); result is moved out
+    return <- _fold(from_decs_template(type<UnrollChainRow>)._where(_.flag == 1)._select(_.val).to_array())
+}
+
+[test]
+def test_unroll_to_array_parity(t : T?) {
+    fixture_unroll2(6)
+    // Rows i = 1, 3, 5 have flag == 1, so the buffer should hold vals 1, 3, 5.
+    let vals <- target_unroll_to_array_fold()
+    equal(t, length(vals), 3, "to_array length")
+    if (length(vals) == 3) {
+        equal(t, vals[0], 1, "to_array[0]")
+        equal(t, vals[1], 3, "to_array[1]")
+        equal(t, vals[2], 5, "to_array[2]")
+    }
+}
+
+[test]
+def test_unroll_to_array_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, exactly one for_each_archetype
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_to_array_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_to_array_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "to_array splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 4: chained _select + interleaved _where + _count(pred) + _min_by / _max_by
+// ─────────────────────────────────────────────────────────────────────────────
+
+[export, marker(no_coverage)]
+def target_unroll_chained_select_sum_fold() : int { // _fold input: select val, select _ * 10, then sum()
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._select(_ * 10).sum())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_select_then_where_sum_fold() : int { // _fold input: select val, keep _ > 1, then sum()
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._where(_ > 1).sum())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_where_select_where_sum_fold() : int { // _fold input: keep flag == 1, select val, keep _ > 1, then sum()
+    return _fold(from_decs_template(type<UnrollChainRow>)._where(_.flag == 1)._select(_.val)._where(_ > 1).sum())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_count_pred_fold() : int { // _fold input: _count with predicate _.val > 2
+    return _fold(from_decs_template(type<UnrollChainRow>)._count(_.val > 2))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_long_count_pred_fold() : int64 { // _fold input: _long_count with predicate _.val >= 3
+    return _fold(from_decs_template(type<UnrollChainRow>)._long_count(_.val >= 3))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_chained_select_to_array_fold() : array<int> { // _fold input: select val, select _ + 100, then to_array(); result is moved out
+    return <- _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._select(_ + 100).to_array())
+}
+
+[export, marker(no_coverage)]
+def target_unroll_min_by_fold() : int { // _fold input: select val, then _min_by keyed on the negated value
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._min_by(-_))
+}
+
+[export, marker(no_coverage)]
+def target_unroll_max_by_fold() : int { // _fold input: select val, then _max_by keyed on the negated value
+    return _fold(from_decs_template(type<UnrollChainRow>)._select(_.val)._max_by(-_))
+}
+
+[test]
+def test_unroll_chained_select_sum_parity(t : T?) {
+    fixture_unroll2(5)
+    // Vals 0..4 sum to 10; scaled by 10 the total is 100.
+    equal(t, target_unroll_chained_select_sum_fold(), 100, "chained select+sum splice parity")
+}
+
+[test]
+def test_unroll_select_then_where_sum_parity(t : T?) {
+    fixture_unroll2(5)
+    // Of vals 0, 1, 2, 3, 4 only 2, 3, 4 pass the > 1 filter; they sum to 9.
+    equal(t, target_unroll_select_then_where_sum_fold(), 9, "select→where→sum splice parity")
+}
+
+[test]
+def test_unroll_where_select_where_sum_parity(t : T?) {
+    fixture_unroll2(6)
+    // Rows i = 1, 3, 5 have flag == 1 (vals 1, 3, 5); after the > 1 filter 3 and 5 remain, summing to 8.
+    equal(t, target_unroll_where_select_where_sum_parity_compute(), 8, "where→select→where→sum splice parity")
+}
+
+def private target_unroll_where_select_where_sum_parity_compute() : int { // thin forwarder used by the parity test
+    return target_unroll_where_select_where_sum_fold() // NOTE(review): the reason for this indirection is not evident here — confirm it is still needed
+}
+
+[test]
+def test_unroll_count_pred_parity(t : T?) {
+    fixture_unroll2(6)
+    // Of vals 0..5, the ones above 2 are 3, 4, 5: three hits.
+    equal(t, target_unroll_count_pred_fold(), 3, "_count(pred) splice parity")
+}
+
+[test]
+def test_unroll_long_count_pred_parity(t : T?) {
+    fixture_unroll2(6)
+    // Of vals 0..5, the ones at or above 3 are 3, 4, 5: three hits, returned as int64.
+    equal(t, target_unroll_long_count_pred_fold(), 3l, "_long_count(pred) splice parity")
+}
+
+[test]
+def test_unroll_chained_select_to_array_parity(t : T?) {
+    fixture_unroll2(3)
+    // Vals 0, 1, 2 shifted by 100 become 100, 101, 102.
+    let vals <- target_unroll_chained_select_to_array_fold()
+    equal(t, length(vals), 3, "chained to_array length")
+    if (length(vals) == 3) {
+        equal(t, vals[0], 100, "[0]")
+        equal(t, vals[1], 101, "[1]")
+        equal(t, vals[2], 102, "[2]")
+    }
+}
+
+[test]
+def test_unroll_min_by_parity(t : T?) {
+    fixture_unroll2(5)
+    // Minimising the negated key over vals 0..4 selects the largest val, 4.
+    equal(t, target_unroll_min_by_fold(), 4, "_min_by splice parity")
+}
+
+[test]
+def test_unroll_max_by_parity(t : T?) {
+    fixture_unroll2(5)
+    // Maximising the negated key over vals 0..4 selects the smallest val, 0.
+    equal(t, target_unroll_max_by_fold(), 0, "_max_by splice parity")
+}
+
+[test]
+def test_unroll_chained_select_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, exactly one for_each_archetype
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_chained_select_sum_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_chained_select_sum_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "chained-select sum splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
+    }
+}
+
+[test]
+def test_unroll_count_pred_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, exactly one for_each_archetype
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_count_pred_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_count_pred_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "_count(pred) splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
+    }
+}
+
+[test]
+def test_unroll_min_by_splice_shape(t : T?) { // shape check: splice invoke, no to_sequence, exactly one for_each_archetype
+    ast_gc_guard() {
+        var targetFn = find_module_function_via_rtti(compiling_module(), @@target_unroll_min_by_fold)
+        success(t, targetFn != null, "RTTI must resolve target_unroll_min_by_fold")
+        if (targetFn == null) return
+        var body_expr : ExpressionPtr
+        let m = qmatch_function(targetFn) $() {
+            return <- $e(body_expr)
+        }
+        success(t, m.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        equal(t, describe_count(body_expr, "to_sequence"), 0, "_min_by splice must NOT call to_sequence")
+        equal(t, describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
+    }
+}
diff --git a/tests/long_array_table/test_fusion_arr_i64.das b/tests/long_array_table/test_fusion_arr_i64.das
new file mode 100644
index 0000000000..69b4aa73b3
--- /dev/null
+++ b/tests/long_array_table/test_fusion_arr_i64.das
@@ -0,0 +1,98 @@
+options gen2
+require dastest/testing_boost public
+
+// Slice A coverage: fused path for arr[i64] / arr[u64] across the
+// compute-mode cross-product (Local_Const / Local_Local / Argument).
+// Correctness only; fusion-vs-unfused parity is a benchmark concern.
+
+[test]
+def test_arr_i64_const_idx(t : T?) {
+    var arr : array<int>
+    resize(arr, 10)
+    for (slot in iter_range(arr)) {
+        arr[slot] = slot * 7
+    }
+    // Read back through int64 literal indices (the _I64_Local_Const fusion variant).
+    equal(t, arr[0_l], 0)
+    equal(t, arr[3_l], 21)
+    equal(t, arr[9_l], 63)
+}
+
+[test]
+def test_arr_u64_const_idx(t : T?) {
+    var arr : array<int>
+    resize(arr, 10)
+    for (slot in iter_range(arr)) {
+        arr[slot] = slot * 11
+    }
+    // Read back through uint64 literal indices (the _U64_Local_Const fusion variant).
+    equal(t, arr[0_ul], 0)
+    equal(t, arr[4_ul], 44)
+    equal(t, arr[9_ul], 99)
+}
+
+[test]
+def test_arr_i64_var_write_read(t : T?) {
+    var arr : array<int>
+    resize(arr, 5)
+    let mid : int64 = 2_l // both indices live in int64 locals
+    let last : int64 = 4_l
+    arr[mid] = 100
+    arr[last] = 200
+    equal(t, arr[mid], 100)
+    equal(t, arr[last], 200)
+}
+
+[test]
+def test_arr_i64_via_argument(t : T?) {
+    var arr : array<int> // each element is filled with its index + 1000, then read via read_at
+    resize(arr, 8)
+    for (slot in iter_range(arr)) {
+        arr[slot] = slot + 1000
+    }
+    equal(t, read_at(arr, 0_l), 1000)
+    equal(t, read_at(arr, 5_l), 1005)
+    equal(t, read_at(arr, 7_l), 1007)
+}
+
+def read_at(arr : array<int>; idx : int64) : int { // helper: indexes the array with an int64 function argument
+    return arr[idx]
+}
+
+[test]
+def test_arr_i64_float_value_type(t : T?) {
+    var arr : array<float> // float elements, int64 literal indices
+    resize(arr, 4)
+    arr[0_l] = 1.5f
+    arr[1_l] = 2.5f
+    arr[2_l] = 3.5f
+    arr[3_l] = 4.5f
+    equal(t, arr[0_l], 1.5f)
+    equal(t, arr[3_l], 4.5f)
+}
+
+[test]
+def test_arr_i64_lvalue_write_via_fusion(t : T?) {
+    // Writes through int64-indexed lvalues (ArrayAt_I64, DAS_PTR_NODE), then sums by plain iteration.
+    var arr : array<int>
+    resize(arr, 4)
+    arr[0_l] = 10
+    arr[1_l] = 20
+    arr[2_l] = 30
+    arr[3_l] = 40
+    var acc = 0
+    for (elem in arr) {
+        acc += elem
+    }
+    equal(t, acc, 100)
+}
+
+[test]
+def test_arr_u64_lvalue_write_via_fusion(t : T?) {
+    var arr : array<float> // uint64-indexed writes, then a summed read-back
+    resize(arr, 3)
+    arr[0_ul] = 1.0f
+    arr[1_ul] = 2.0f
+    arr[2_ul] = 3.0f
+    equal(t, arr[0_ul] + arr[1_ul] + arr[2_ul], 6.0f)
+}
diff --git a/tests/long_array_table/test_fusion_table_i64.das b/tests/long_array_table/test_fusion_table_i64.das
new file mode 100644
index 0000000000..a317ab807c
--- /dev/null
+++ b/tests/long_array_table/test_fusion_table_i64.das
@@ -0,0 +1,64 @@
+options gen2
+require dastest/testing_boost public
+
+// Slice B audit: int64/uint64-keyed table indexing already has fusion variants
+// registered via IMPLEMENT_SETOP_NUMERIC(TableIndex) at
+// src/simulate/simulate_fusion_tableindex.cpp:81. These tests verify the
+// fused path produces correct values for int64/uint64 keys.
+
+[test]
+def test_table_i64_key_index_write_read(t : T?) {
+    var tab : table<int64; int>
+    tab[1_l] = 10    // small positive key
+    tab[1000000000000_l] = 20    // key that does not fit in 32 bits
+    tab[-1_l] = 30    // negative key
+    t |> equal(tab[1_l], 10)    // each key must read back its own value
+    t |> equal(tab[1000000000000_l], 20)
+    t |> equal(tab[-1_l], 30)
+}
+
+[test]
+def test_table_i64_key_overwrite(t : T?) {
+    var tab : table<int64; string>
+    tab[42_l] = "first"
+    tab[42_l] = "second"    // same key again: the value is replaced
+    t |> equal(length(tab), 1)    // still a single entry after the second assignment
+    t |> equal(tab[42_l], "second")
+}
+
+[test]
+def test_table_u64_key_index(t : T?) {
+    var tab : table<uint64; int>
+    tab[1_ul] = 100
+    tab[18446744073709551615_ul] = 200    // 2^64 - 1, the largest uint64 key
+    t |> equal(tab[1_ul], 100)
+    t |> equal(tab[18446744073709551615_ul], 200)    // the max key reads back its own value
+}
+
+[test]
+def test_table_i64_key_via_local(t : T?) {
+    var tab : table<int64; int>
+    let k1 : int64 = 5_l    // keys are int64 locals here, not literals
+    let k2 : int64 = 100_l
+    tab[k1] = 50
+    tab[k2] = 1000
+    t |> equal(tab[k1], 50)    // read back through the same locals
+    t |> equal(tab[k2], 1000)
+}
+
+[test]
+def test_table_i64_key_via_arg(t : T?) {
+    var tab : table<int64; int>
+    fill_table(tab, 7_l, 77)    // key and value travel through helper parameters
+    fill_table(tab, 13_l, 133)
+    t |> equal(read_table(tab, 7_l), 77)    // lookup also goes through a helper taking an int64 key
+    t |> equal(read_table(tab, 13_l), 133)
+}
+
+def fill_table(var tab : table<int64; int>; k : int64; v : int) {    // helper: stores v under int64 key k
+    tab[k] = v
+}
+
+def read_table(var tab : table<int64; int>; k : int64) : int {    // helper: returns tab[k]; tab is taken as `var`
+    return tab[k]
+}
diff --git a/tests/long_array_table/test_huge_array_index_offset.das b/tests/long_array_table/test_huge_array_index_offset.das
new file mode 100644
index 0000000000..e98178d59f
--- /dev/null
+++ b/tests/long_array_table/test_huge_array_index_offset.das
@@ -0,0 +1,56 @@
+options gen2
+options persistent_heap = true   // ~4.4 GB array<int> needs PersistentHeapAllocator
+
+require dastest/testing_boost public
+require daslib/fio
+
+// Memory-gated probe targeting the SimNode_ArrayAt offset-math widening.
+// `array<int>` with stride=4 and ~1.1G elements puts `idx * stride` above
+// UINT32_MAX, so the address computation MUST be `uint64_t(idx) *
+// uint64_t(stride) + offset` (per Phase 2/runtime_array.h). A uint32
+// regression would wrap and read the wrong cache line — usually a segfault,
+// occasionally a silently-wrong value.
+//
+// ~4.4 GB allocation (1.1e9 elements x 4 bytes). Skipped silently unless
+// DASLANG_HUGE_HEAP_TESTS=1 on a 64-bit build.
+
+def huge_enabled() : bool {    // gate for this file's tests: 64-bit build AND DASLANG_HUGE_HEAP_TESTS == "1"
+    static_if (typeinfo sizeof(type<int?>) < 8) {    // pointer narrower than 8 bytes: opt out at compile time
+        return false
+    }
+    return has_env_variable("DASLANG_HUGE_HEAP_TESTS") && get_env_variable("DASLANG_HUGE_HEAP_TESTS") == "1"
+}
+
+// (UINT32_MAX / 4 ~= 1.074G) + ~26M slack = 1.1G elements; idx*stride > 4 GB at the tail.
+let STRIDE_OVERFLOW_N = 1_100_000_000_l
+
+[test]
+def test_int_array_high_index_int64(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<int>
+    arr |> resize(STRIDE_OVERFLOW_N)
+    t |> equal(long_length(arr), STRIDE_OVERFLOW_N)    // int64 length must report the full size
+    // Write to a position where idx*4 > UINT32_MAX, then read it back.
+    // If the address math is uint32, we either segfault or read garbage.
+    let high = STRIDE_OVERFLOW_N - 1_l    // last valid index, kept as int64
+    arr[high] = int(0x12345678)
+    arr[0_l] = int(0x0BADC0DE)    // second marker at index 0; both slots must keep their own value
+    t |> equal(arr[high], int(0x12345678))
+    t |> equal(arr[0_l], int(0x0BADC0DE))
+    delete arr    // free the array explicitly
+}
+
+[test]
+def test_int_array_high_index_uint64(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<int>
+    arr |> resize(STRIDE_OVERFLOW_N)
+    // Same test, uint64 index instead of int64 — exercises the U64 SimNode
+    // variant's offset math.
+    let high : uint64 = uint64(STRIDE_OVERFLOW_N) - 1_ul    // last valid index as uint64
+    arr[high] = int(0x12345678)
+    arr[0_ul] = int(0x0BADC0DE)    // second marker at index 0, also via a uint64 index
+    t |> equal(arr[high], int(0x12345678))
+    t |> equal(arr[0_ul], int(0x0BADC0DE))
+    delete arr    // free the array explicitly
+}
diff --git a/tests/long_array_table/test_huge_array_iterate.das b/tests/long_array_table/test_huge_array_iterate.das
new file mode 100644
index 0000000000..812b5315ea
--- /dev/null
+++ b/tests/long_array_table/test_huge_array_iterate.das
@@ -0,0 +1,84 @@
+options gen2
+options persistent_heap = true   // >2 GB arrays need PersistentHeapAllocator
+
+require dastest/testing_boost public
+require daslib/fio
+require daslib/functional
+
+// Memory-gated probe: iterate a > INT_MAX-element array four different ways
+// and assert each visits exactly long_length(arr) elements. If any iteration
+// shape uses an int counter internally, the count check fails (or wraps).
+//
+// Auto-discovered by dastest; inline `huge_enabled()` gate silent-returns when
+// the env var is unset, so CI sees a no-op pass. Manual run actually allocates.
+
+def huge_enabled() : bool {    // gate for this file's tests: 64-bit build AND DASLANG_HUGE_HEAP_TESTS == "1"
+    static_if (typeinfo sizeof(type<int?>) < 8) {    // pointer narrower than 8 bytes: opt out at compile time
+        return false
+    }
+    return has_env_variable("DASLANG_HUGE_HEAP_TESTS") && get_env_variable("DASLANG_HUGE_HEAP_TESTS") == "1"
+}
+
+// 2200 Mi uint8 elements (2,306,867,200) — exceeds INT_MAX (2,147,483,647),
+// stays under UINT32_MAX (4,294,967,295) for faster runs.
+let HUGE_N = 2200_l * 1024_l * 1024_l
+
+[test]
+def test_iterate_via_range64_long_length(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    var count = 0_l    // int64 counter so the tally itself cannot wrap at INT_MAX
+    for (_i in range64(0_l, long_length(arr))) {    // shape 1: explicit range64 over [0, long_length)
+        count++
+    }
+    t |> equal(count, HUGE_N)    // one iteration per element
+    delete arr
+}
+
+[test]
+def test_iterate_via_long_iter_range(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    var count = 0_l    // int64 counter
+    for (_i in long_iter_range(arr)) {    // shape 2: long_iter_range(arr)
+        count++
+    }
+    t |> equal(count, HUGE_N)    // one iteration per element
+    delete arr
+}
+
+[test]
+def test_iterate_via_index_read(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    // Write a sentinel at the last index, iterate with int64 counter and
+    // assert we read it back via arr[i] indexing (stride math).
+    arr[HUGE_N - 1_l] = uint8(0x77)
+    var sum = 0_l
+    for (i in range64(0_l, long_length(arr))) {    // shape 3: index every element with an int64 i
+        sum += int64(arr[i])
+    }
+    t |> equal(sum, 119_l)  // 0x77 == 119; expects every other byte to read as zero after resize
+    delete arr
+}
+
+[test]
+def test_iterate_via_long_enumerate(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    var last_index = -1_l    // -1 means "no iteration happened"
+    var count = 0_l
+    unsafe {
+        for ((i, _v) in long_enumerate(each(arr))) {    // shape 4: long_enumerate over each(arr)
+            last_index = i
+            count++
+        }
+    }
+    t |> equal(count, HUGE_N)    // one iteration per element
+    t |> equal(last_index, HUGE_N - 1_l)    // the reported index must reach the last slot
+    delete arr
+}
diff --git a/tests/long_array_table/test_huge_array_push_emplace_clone.das b/tests/long_array_table/test_huge_array_push_emplace_clone.das
new file mode 100644
index 0000000000..5d694325cc
--- /dev/null
+++ b/tests/long_array_table/test_huge_array_push_emplace_clone.das
@@ -0,0 +1,65 @@
+options gen2
+options persistent_heap = true   // >2 GB arrays need PersistentHeapAllocator
+
+require dastest/testing_boost public
+require daslib/fio
+
+// Memory-gated probe: resize an array past INT_MAX, then push/emplace/
+// push_clone one element and assert the new tail reads back correctly.
+//
+// The plan flagged `tests/long_array_table/test_push_returns_int64.das` as
+// owed once Phase 4b lands `arr[i64]` (now landed). The daslib `push` wrapper
+// doesn't surface the underlying `__builtin_array_push_back` int64 position,
+// so we verify indirectly: long_length grows by 1, and arr[before] reads
+// back the pushed value via int64 indexing (which only works if the position
+// the wrapper used internally was the correct int64).
+//
+// Skipped silently unless DASLANG_HUGE_HEAP_TESTS=1 on a 64-bit build.
+
+def huge_enabled() : bool {    // gate for this file's tests: 64-bit build AND DASLANG_HUGE_HEAP_TESTS == "1"
+    static_if (typeinfo sizeof(type<int?>) < 8) {    // pointer narrower than 8 bytes: opt out at compile time
+        return false
+    }
+    return has_env_variable("DASLANG_HUGE_HEAP_TESTS") && get_env_variable("DASLANG_HUGE_HEAP_TESTS") == "1"
+}
+
+// 2200 Mi uint8 elements (2,306,867,200) — exceeds INT_MAX (2,147,483,647).
+let HUGE_N = 2200_l * 1024_l * 1024_l
+
+[test]
+def test_push_past_int_max(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    let before = long_length(arr)    // int64 length before the push; also the index of the new tail
+    arr |> push(uint8(0xAB))
+    t |> equal(long_length(arr), before + 1_l)    // grew by exactly one
+    t |> equal(arr[before], uint8(0xAB))    // pushed value sits at the old length
+    delete arr
+}
+
+[test]
+def test_emplace_past_int_max(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    let before = long_length(arr)    // int64 length before the emplace; also the index of the new tail
+    var v : uint8 = uint8(0xCD)    // `var` local handed to emplace
+    arr |> emplace(v)
+    t |> equal(long_length(arr), before + 1_l)    // grew by exactly one
+    t |> equal(arr[before], uint8(0xCD))    // emplaced value sits at the old length
+    delete arr
+}
+
+[test]
+def test_push_clone_past_int_max(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    var arr : array<uint8>
+    arr |> resize(HUGE_N)
+    let before = long_length(arr)    // int64 length before the push_clone; also the index of the new tail
+    let v : uint8 = uint8(0xEF)
+    arr |> push_clone(v)
+    t |> equal(long_length(arr), before + 1_l)    // grew by exactly one
+    t |> equal(arr[before], uint8(0xEF))    // cloned value sits at the old length
+    delete arr
+}
diff --git a/tests/long_array_table/test_huge_array_resize_index.das b/tests/long_array_table/test_huge_array_resize_index.das
new file mode 100644
index 0000000000..4746cc6460
--- /dev/null
+++ b/tests/long_array_table/test_huge_array_resize_index.das
@@ -0,0 +1,56 @@
+options gen2
+options persistent_heap = true   // >4 GB arrays need PersistentHeapAllocator
+
+require dastest/testing_boost public
+require daslib/fio
+
+// Memory-gated probe. Allocates a 5 GB `array<uint8>` and exercises
+// int64 indexing at index 0 and index N-1 (a high position that requires
+// `uint64_t(idx)*uint64_t(stride) + offset` for the address math).
+//
+// Skipped silently unless:
+//   * running a 64-bit daslang (static_if on pointer size)
+//   * DASLANG_HUGE_HEAP_TESTS=1 in the environment
+//
+// dastest auto-discovers this file (filename starts with `test_`), but the
+// inline `huge_enabled()` gate silent-returns when the env var is unset, so
+// CI sees a no-op pass. Run manually with the env var to actually allocate:
+//   $env:DASLANG_HUGE_HEAP_TESTS = "1"
+//   bin/Release/daslang.exe dastest/dastest.das -- --test tests/long_array_table/test_huge_array_resize_index.das
+
+def huge_enabled() : bool {    // gate for this file's tests: 64-bit build AND DASLANG_HUGE_HEAP_TESTS == "1"
+    static_if (typeinfo sizeof(type<int?>) < 8) {    // pointer narrower than 8 bytes: opt out at compile time
+        return false
+    }
+    return has_env_variable("DASLANG_HUGE_HEAP_TESTS") && get_env_variable("DASLANG_HUGE_HEAP_TESTS") == "1"
+}
+
+[test]
+def test_huge_array_resize_round_trip(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    let N = 5_l * 1024_l * 1024_l * 1024_l  // 5 Gi (5,368,709,120) uint8 elements — above UINT32_MAX
+    var arr : array<uint8>
+    arr |> resize(N)
+    t |> equal(long_length(arr), N)    // int64 length must report the full size
+    arr[0_l] = uint8(0xC0)    // distinct markers at the first and last index
+    arr[N - 1_l] = uint8(0xDE)
+    t |> equal(arr[0_l], uint8(0xC0))
+    t |> equal(arr[N - 1_l], uint8(0xDE))
+    delete arr
+}
+
+[test]
+def test_huge_array_length_panics_long_length_ok(t : T?) {
+    if (!huge_enabled()) return    // silent no-op unless the gate is on
+    // 2200 Mi elements (2,306,867,200) > INT_MAX (2,147,483,647). long_length is safe;
+    // length() should panic per the surface contract added in PR #2746.
+    let N = 2200_l * 1024_l * 1024_l
+    var arr : array<uint8>
+    arr |> resize(N)
+    t |> equal(long_length(arr), N)    // the only assertion: the "panics" half of the test name is not checked here
+    // length(arr) here should panic — but try/recover is banned for panic UX testing
+    // (feedback_no_try_recover_for_soft_fail), so we don't assert the panic here.
+    // Reproduce manually by uncommenting:
+    //   let _l = length(arr)
+    delete arr
+}
diff --git a/tests/long_array_table/test_int_int64_disjunction.das b/tests/long_array_table/test_int_int64_disjunction.das
new file mode 100644
index 0000000000..ba8c6e8d0d
--- /dev/null
+++ b/tests/long_array_table/test_int_int64_disjunction.das
@@ -0,0 +1,44 @@
+options gen2
+
+require dastest/testing_boost public
+
+// Phase 8b spike: can a function accept `int | int64` as a single parameter
+// signature and fork inside the body via `static_if`? If yes, PR-D (linq
+// surface widening) can use one signature per function instead of doubling
+// overloads. If no, PR-D falls back to separate `: int` / `: int64` pairs.
+//
+// Reference for the disjunction-parameter shape: tests/language/option_type.das.
+// `typeinfo is_int` / `typeinfo is_int64` are added in the same PR — string-
+// compare `stripped_typename(x) == "int"` was the fallback while landing this.
+
+def take_or(x : int | int64) : int64 {    // accepts either type; returns x + 1 as int64
+    static_if (typeinfo is_int(x)) {    // compile-time fork on the concrete argument type
+        return int64(x) + 1_l    // int case: widen before adding
+    } else {
+        return x + 1_l    // remaining case of the signature: x is already int64
+    }
+}
+
+[test]
+def test_int_branch(t : T?) {
+    let r = take_or(40)    // plain int literal: takes the `is_int` branch
+    t |> equal(r, 41_l)    // result comes back as int64
+}
+
+[test]
+def test_int64_branch(t : T?) {
+    let r = take_or(40_l)    // int64 literal: takes the `else` branch
+    t |> equal(r, 41_l)
+}
+
+// Type-trait contract: locks `is_int` / `is_int64` against silent reverts (all checks are compile-time).
+[test]
+def test_typeinfo_is_int_traits(t : T?) {
+    static_assert(typeinfo is_int(type<int>), "is_int(int) must be true")
+    static_assert(!typeinfo is_int(type<int64>), "is_int(int64) must be false")
+    static_assert(!typeinfo is_int(type<uint>), "is_int(uint) must be false")
+    static_assert(typeinfo is_int64(type<int64>), "is_int64(int64) must be true")
+    static_assert(!typeinfo is_int64(type<int>), "is_int64(int) must be false")
+    static_assert(!typeinfo is_int64(type<uint>), "is_int64(uint) must be false")
+    t |> equal(0, 0)    // trivially-true runtime assertion; the real checks are the static_asserts above
+}
diff --git a/utils/lint/main.das b/utils/lint/main.das
index e2f9400295..8bb6473094 100644
--- a/utils/lint/main.das
+++ b/utils/lint/main.das
@@ -11,6 +11,7 @@ require daslib/json_boost
 require math
 require strings
 require daslib/strings_boost
+require daslib/rtti
 require ../common/parallel_workers.das
 
 let JSON_PREFIX = "##lint##\n"
@@ -49,6 +50,14 @@ struct Config {
     @clarg_doc = "Worker mode: read newline-delimited file paths and emit one JSON-wrapped LintResult per file on stdout. Internal — used by parallel driver. Skips positional args, banner, summary."
     paths_from : string
 
+    @clarg_short = "d"
+    @clarg_doc = "Disable rule(s). Repeatable or comma-separated. Codes only (e.g. PERF001,STYLE024). On overlap with --enable, --disable wins."
+    disable : array<string>
+
+    @clarg_short = "e"
+    @clarg_doc = "Enable rule(s) — whitelist. If used, only listed rules run (subject to --disable). Repeatable or comma-separated."
+    enable : array<string>
+
     @clarg_short = "?"
     @clarg_doc = "Show this help and exit"
     help : bool
@@ -127,7 +136,48 @@ def scan_das_files(path : string; var files : array<string>; var cache : table<s
     }
 }
 
-def lint_file(file : string; run_paranoid, run_perf, run_style, comment_hygiene : bool) : LintResult {
+// Shape check for a rule code, equivalent to ^(LINT|PERF|STYLE)\d{3}$.
+// Returns true only when `code` starts with one of the three known prefixes
+// and is followed by exactly three ASCII digits. Only the shape is checked;
+// the code is not looked up against any list of existing rules.
+def is_valid_rule_code(code : string) : bool {
+    // Pick the prefix width first; everything after it must be the digits.
+    var prefix_len = 0
+    if (code |> starts_with("STYLE")) {
+        prefix_len = 5
+    } elif (code |> starts_with("LINT") || code |> starts_with("PERF")) {
+        prefix_len = 4
+    } else {
+        return false
+    }
+    // Exactly three characters may follow the prefix, which makes the total
+    // length 7 (LINT/PERF) or 8 (STYLE).
+    return false if (length(code) != prefix_len + 3)
+    for (pos in range(prefix_len, prefix_len + 3)) {
+        let ch = character_at(code, pos)
+        // Reject on the first character outside '0'..'9'.
+        return false if (ch < '0' || ch > '9')
+    }
+    return true
+}
+
+// Comma-split, strip, uppercase, validate. On bad token returns the bad token
+// (caller emits error + exits); on success returns "" and dest is populated.
+def normalize_rule_list(raw : array<string>; var dest : table<string>) : string {
+    for (entry in raw) {    // each entry may itself be a comma-separated list
+        let parts <- entry |> split(",")
+        for (p in parts) {
+            let token = p |> strip |> to_upper    // normalized form is what gets validated and stored
+            continue if (empty(token))    // empty pieces (e.g. from "A,,B" or a trailing comma) are skipped
+            return token if (!is_valid_rule_code(token))    // stops at the first bad token; earlier tokens stay in dest
+            dest |> insert(token)
+        }
+    }
+    return ""    // empty string means every token was valid
+}
+
+def lint_file(file : string; run_paranoid, run_perf, run_style, comment_hygiene : bool;
+              disabled_codes, enabled_codes : table<string>) : LintResult {
     var result = LintResult(file = file)
     if (has_expect_directive(file)) {
         result.skip_reason = "(intentional compile errors via `expect`)"
@@ -157,21 +207,21 @@ def lint_file(file : string; run_paranoid, run_perf, run_style, comment_hygiene
                 }
                 if (run_paranoid) {
                     var paranoid_issues : array<string>
-                    result.count += paranoid_collect(program, paranoid_issues)
+                    result.count += paranoid_collect(program, paranoid_issues, disabled_codes, enabled_codes)
                     for (w in paranoid_issues) {
                         result.errors |> push(w)
                     }
                 }
                 if (run_perf) {
                     var perf_issues : array<string>
-                    result.count += perf_lint_collect(program, perf_issues)
+                    result.count += perf_lint_collect(program, perf_issues, disabled_codes, enabled_codes)
                     for (w in perf_issues) {
                         result.errors |> push(w)
                     }
                 }
                 if (run_style) {
                     var style_issues : array<string>
-                    result.count += style_lint_collect(program, style_issues, comment_hygiene)
+                    result.count += style_lint_collect(program, style_issues, disabled_codes, enabled_codes, comment_hygiene)
                     for (w in style_issues) {
                         result.errors |> push(w)
                     }
@@ -222,9 +272,10 @@ def read_paths_from(path : string; var out : array<string>) : bool {
 // Worker entry: lint every file in `paths`, emit one JSON-wrapped
 // LintResult per file between `##lint##\n` markers. Caller (driver)
 // parses the stream and aggregates.
-def run_worker(paths : array<string>; run_paranoid, run_perf, run_style, comment_hygiene : bool) {
+def run_worker(paths : array<string>; run_paranoid, run_perf, run_style, comment_hygiene : bool;
+               disabled_codes, enabled_codes : table<string>) {
     for (f in paths) {
-        let result = lint_file(f, run_paranoid, run_perf, run_style, comment_hygiene)
+        let result = lint_file(f, run_paranoid, run_perf, run_style, comment_hygiene, disabled_codes, enabled_codes)
         print("{JSON_PREFIX}{write_json(JV(result))}\n{JSON_PREFIX}")
     }
 }
@@ -303,6 +354,14 @@ def run_parallel_lint(files : array<string>; cfg : Config) : array<LintResult> {
         if (cfg.style_only) {
             argv |> push("--style-only")
         }
+        if (!empty(cfg.disable)) {
+            argv |> push("--disable")
+            argv |> push(cfg.disable |> join(","))
+        }
+        if (!empty(cfg.enable)) {
+            argv |> push("--enable")
+            argv |> push(cfg.enable |> join(","))
+        }
         argvs |> emplace(argv)
     }
 
@@ -370,6 +429,19 @@ def main() : int {
     let run_perf = !any_only || cfg.perf_only
     let run_style = !any_only || cfg.style_only
 
+    var disabled_codes : table<string>
+    var enabled_codes : table<string>
+    let dis_bad = normalize_rule_list(cfg.disable, disabled_codes)
+    if (!empty(dis_bad)) {
+        print("error: invalid rule code \"{dis_bad}\" in --disable (expected LINT|PERF|STYLE + 3 digits)\n")
+        return 1
+    }
+    let en_bad = normalize_rule_list(cfg.enable, enabled_codes)
+    if (!empty(en_bad)) {
+        print("error: invalid rule code \"{en_bad}\" in --enable (expected LINT|PERF|STYLE + 3 digits)\n")
+        return 1
+    }
+
     // Worker mode: lint a pre-resolved list from a paths-from file and emit
     // JSON-wrapped results. No banner, no summary, no exit-code derivation.
     if (!empty(cfg.paths_from)) {
@@ -378,7 +450,7 @@ def main() : int {
             print("error: cannot read paths-from file: {cfg.paths_from}\n")
             return 1
         }
-        run_worker(paths, run_paranoid, run_perf, run_style, cfg.comment_hygiene)
+        run_worker(paths, run_paranoid, run_perf, run_style, cfg.comment_hygiene, disabled_codes, enabled_codes)
         return 0
     }
 
@@ -434,7 +506,7 @@ def main() : int {
             if (!cfg.quiet) {
                 print("checking {file}...\n")
             }
-            let result = lint_file(file, run_paranoid, run_perf, run_style, cfg.comment_hygiene)
+            let result = lint_file(file, run_paranoid, run_perf, run_style, cfg.comment_hygiene, disabled_codes, enabled_codes)
             print_result(result, cfg.quiet)
             if (!empty(result.skip_reason)) {
                 total_skipped++
diff --git a/utils/lint/tests/perf021_ternary_cast_hoist.das b/utils/lint/tests/perf021_ternary_cast_hoist.das
new file mode 100644
index 0000000000..22b6d1477f
--- /dev/null
+++ b/utils/lint/tests/perf021_ternary_cast_hoist.das
@@ -0,0 +1,117 @@
+options gen2
+// PERF021: cond ? T(a) : T(b) on workhorse cast T — hoist as T(cond ? a : b).
+//
+// Problem:
+//   `cond ? string(a) : string(b)` emits two ExprCall nodes that both convert to
+//   the same target type. The work is identical regardless of branch, and the
+//   parser/typer doesn't fold them. Two cast dispatches, twice the source noise.
+//
+// Bad pattern:
+//   def to_str(c : bool; a, b : int) : string {
+//       return c ? string(a) : string(b)             // PERF021
+//   }
+//
+// Good pattern:
+//   def to_str(c : bool; a, b : int) : string {
+//       return string(c ? a : b)
+//   }
+//
+// Side-effect note: the rewrite is unconditionally safe. The original ternary
+// evaluates exactly one of `a` / `b` (whichever branch is taken); so does the
+// hoisted form. Argument evaluation count is unchanged, and one cast runs per
+// evaluation either way; only the number of cast call sites drops from 2 to 1.
+//
+// dastest readers: same approach as the perf020 fixture — parameters (not
+// const literals) keep the runtime forms intact under inference-time folding.
+
+expect 31208:12
+
+require daslib/perf_lint
+
+struct SomeStruct {    // fixture type: supplies int64 field args and the user-ctor negative case below
+    v : int64
+}
+
+// --- Bad patterns (one PERF021 each — runtime, survive folding) ---
+
+def bad_string_from_int(c : bool; a, b : int) : string {
+    return c ? string(a) : string(b)                         // PERF021 — string() of an int on both branches
+}
+
+def bad_string_from_int64(c : bool; a, b : int64) : string {
+    return c ? string(a) : string(b)                         // PERF021 — string() of an int64 on both branches
+}
+
+def bad_int64_widen(c : bool; a, b : int) : int64 {
+    return c ? int64(a) : int64(b)                           // PERF021 — int -> int64 on both branches
+}
+
+def bad_int_narrow(c : bool; a, b : int64) : int {
+    return c ? int(a) : int(b)                               // PERF021 — int64 -> int on both branches
+}
+
+def bad_uint_signedness(c : bool; a, b : int) : uint {
+    return c ? uint(a) : uint(b)                             // PERF021 — int -> uint on both branches
+}
+
+def bad_float_from_int(c : bool; a, b : int) : float {
+    return c ? float(a) : float(b)                           // PERF021 — int -> float on both branches
+}
+
+def bad_double_from_int(c : bool; a, b : int) : double {
+    return c ? double(a) : double(b)                         // PERF021 — int -> double on both branches
+}
+
+def bad_int8(c : bool; a, b : int) : int8 {
+    return c ? int8(a) : int8(b)                             // PERF021 — int -> int8 on both branches
+}
+
+def bad_uint16(c : bool; a, b : int) : uint16 {
+    return c ? uint16(a) : uint16(b)                         // PERF021 — int -> uint16 on both branches
+}
+
+def bad_field_access(c : bool; x, y : SomeStruct) : string {
+    return c ? string(x.v) : string(y.v)                     // PERF021 — args are field accesses, same baseType (both int64 fields)
+}
+
+def bad_subexpr(c : bool; a, b, d : int) : int64 {
+    return c ? int64(a + b) : int64(d)                       // PERF021 — args are int / int sub-expressions (`a + b` and `d`)
+}
+
+def bad_same_hex_string(c : bool; a, b : int) : string {
+    return c ? string(a, true) : string(b, true)             // PERF021 — same hex flag (`true`) on both branches, hoist is equivalent
+}
+
+// --- Good patterns (no warnings) ---
+
+def good_one_branch_cast(c : bool; a : int; b : int64) : int64 {
+    return c ? int64(a) : b                                  // only one branch is a cast; `b` is used uncast
+}
+
+def good_different_cast(c : bool; a, b : int) : int64 {
+    return c ? int64(a) : int64(b) + 1l                      // RHS isn't a bare cast: the false branch is the sum `int64(b) + 1l`
+}
+
+def good_arg_basetype_differs(c : bool; a : int; b : int64) : string {
+    return c ? string(a) : string(b)                         // same cast, but arg baseTypes differ (int vs int64) — rewrite needs widen
+}
+
+def good_user_named_ctor(c : bool; a, b : int) : SomeStruct {
+    return c ? SomeStruct(v = int64(a)) : SomeStruct(v = int64(b))   // SomeStruct ctor, not a workhorse cast; the int64 casts sit inside its args
+}
+
+def good_vector_ctor(c : bool; x, y : float) : float2 {
+    return c ? float2(x, y) : float2(y, x)                   // 2-arg ctor — single-arg gate excludes; argument order also differs
+}
+
+def good_literal_branches(c : bool) : string {
+    return c ? "yes" : "no"                                  // no casts at all: both branches are string literals
+}
+
+def good_call_one_side_only(c : bool; a : int) : int {
+    return c ? a + 1 : a - 1                                 // ternary, no casts on either side: plain int arithmetic
+}
+
+def good_different_hex_string(c : bool; a, b : int) : string {
+    return c ? string(a, true) : string(b, false)            // hex flag differs (true vs false) — hoisting would lose semantic info
+}