forksnd · pull · May 19, 2026 · May 18, 2026 · May 18, 2026 · May 19, 2026
diff --git a/benchmarks/sql/LINQ.md b/benchmarks/sql/LINQ.md
@@ -31,7 +31,10 @@ See `~/.claude/plans/keen-hopping-balloon.md` and `~/.claude/plans/enumerated-ba
 | 1 cascade | Retire `_old_fold`, drop the 7 `g_foldSeq` patterns, restructure `_fold` into a three-tier cascade (splice → `fold_linq_default` → raw clone). Cascade is always-safe — `_fold(chain)` is observationally equivalent to `chain`. `_select|_where` etc. cases that used to fall to raw clone now cascade to tier 2 array-shape with `_inplace` reuse. Phase 2D "fail-loudly contract" scrapped — always-safe cascade obviates it. | ✅ done |
 | 3a/b/c | BufferTopN splice arm via new `plan_order_family` planner: `[where_*]* + order/order_by/order_descending/order_by_descending [+ take(K)]`. No `take` → direct call to `order*` helper (or fused prefilter loop + `order*_inplace` when where is present). With `take(K)` → dispatch to `top_n[_by][_descending]` (or fused prefilter loop + `top_n*` on the buffer when where is present). New `top_n_by_descending` + `top_n_descending` library helpers added to `daslib/linq.das` (4 functions, mirrors of `top_n_by` / `top_n` with flipped comparator). | ✅ done |
 | 3d | `select + where + terminator` splice arm via `replaceVariablePeeling` (new helper in `daslib/templates_boost.das`). Substitutes the projection into the where predicate, peeling the typer-inserted ExprRef2Value wrappers that would otherwise be left orphaned around a non-reference value. Bails to tier 2 when the projection has side effects (would double-evaluate). Lands all four terminator lanes (ARRAY / COUNTER / ACCUMULATOR / EARLY_EXIT). | ✅ done |
-| 3+ | Remaining buffer-required emit modes (deferred): BufferDistinct, BufferGroupBy, BufferReverse, MultiSourceZip, BufferedJoin. Currently cascade to tier 2 array-shape. | ⏳ |
+| 3+ BufferReverse | `[where_*]?[select?] \|> reverse [\|> take(N)]? [\|> {to_array, count, first, first_or_default}]?` — single-pass prefilter into buffer + `reverse_inplace` + optional `resize(min(N,len))`; count/first lanes elide the buffer entirely (counter or last-survivor tracker). | ✅ |
+| 3+ BufferDistinct | `[where_*]?[select?] \|> distinct[_by(key)] [\|> take(N)]? [\|> {to_array, count, sum, long_count}]?` — streaming dedup via `table<unique_key>` integrated into the prefilter loop; take(N) → break-after-N early-exit (guard placed BEFORE the consume site so non-positive N short-circuits); count terminator elides the buffer and returns `length(seen)`. `first`/`first_or_default`/`min`/`max`/`average` after `distinct` cascade to tier 2. | ✅ core |
+| 3+ BufferGroupBy | `src \|> group_by[_lazy](key) \|> select(group_proj) [\|> {to_array, count}]?` — incremental-aggregate fusion: emits `table<unique_key; tuple<KT; AccT>>` updated per element. Recognized reducers: `_._1 \|> {length, count, sum, long_count}`. Bare + named-tuple wrap `(K=_._0, V=_._1\|>reducer)` both supported. Bails to tier 2 for inner-select-sum, multi-reducer, `having_*`, **or any upstream `where_*`/`select*` between source and group_by** (upstream fusion deferred). | ✅ core |
+| 3+ Buffer remainder | MultiSourceZip, BufferedJoin, inner-select-sum (groupby_sum gap), group_by min/max/first/average reducers, multi-reducer named tuples. Currently cascade to tier 2 array-shape. | ⏳ |
 | 4 | Final coverage pass + docs; full 3-way comparison table refresh; parity-test sweep | ⏳ |
 
 ## Baselines (100K rows, INTERP mode)
@@ -54,11 +57,11 @@ Notation: `—` means the variant is not applicable for this benchmark (operator
 | to_array_filter | `where → select → to_array` | 70 | 43 | 11 |
 | take_count | `take → to_array` | 3 | 0\* | 0\* |
 | skip_take | `skip → take → to_array` | 0\* | 16 | 23 |
-| distinct_count | `select → distinct → to_array` | 41 | 43 | 33 |
+| distinct_count | `select → distinct → to_array` | 41 | 43 | 33 → **15** (this PR) |
 | sort_first | `order_by → first` | 37 | 2170 | 2238 |
 | sort_take | `order_by → take` | 38 | 2188 | 2269 |
-| groupby_count | `group_by → select(_, length)` | 140 | 70 | 76 |
-| groupby_sum | `group_by → select(_, sum)` | 172 | 101 | 107 |
+| groupby_count | `group_by → select(_, length)` | 140 | 70 | 76 → **33** (this PR) |
+| groupby_sum | `group_by → select(_, select → sum)` | 172 | 101 | 107 (deferred — inner-select-sum) |
 | chained_where | `where → where → count` | 36 | 45 | 17 |
 | zip_dot_product | `zip → select → sum` | — | 53 | 37 |
 | join_count | `join → count` | —\*\* | 116 | 122 |
@@ -300,6 +303,77 @@ The sort/order rows now BEAT `m1` SQLite by ~30%. PR #2707 closed the comparator
 
 **Parser bonus:** the multi-arg `$($i(a) : T, $i(b) : T) { ... }` qmacro form failed parse with `30701: block argument is already declared MACRO``TAG` because the parser stamped every `$i(...)` in block-arg position with the literal placeholder name and dup-checked them before macro tag resolution. Fixed in `src/parser/parser_impl.cpp:885` by skipping the dup check when `name_at.tag != nullptr` (genuine post-resolution dups surface as ordinary local-lookup conflicts during type inference). General-purpose fix — usable by any macro that needs to emit a typed block with N tagged-name args.
 
+## Phase 3+ — buffer-required splice arms (this PR)
+
+Three new planners — `plan_reverse`, `plan_distinct`, `plan_group_by` — slot into the cascade between `plan_order_family` and `plan_loop_or_count`. Each is name-gated and rejects early on shape mismatch. `is_buffer_required_op` still lists all buffer-required ops (`order_by`, `distinct`/`distinct_by`, `reverse`, `group_by`/`group_by_lazy`, `zip`, `join`/`left_join`/`group_join`); it acts as a fallback marker for the shapes the dedicated planners don't cover (`distinct().first()`, `order_by(...).first()`, etc. — terminators outside the spliced set cascade to `is_buffer_required_op` which triggers the tier-2 fallthrough). `zip`/`join`/`left_join`/`group_join` cascade unconditionally pending BufferZip / BufferedJoin.
+
+### plan_reverse
+
+Recognized chain: `src [|> where_(p)]* [|> select(proj)]? |> reverse [|> take(N)]? [|> {to_array, count, first, first_or_default}]`.
+
+| Terminator | Emission |
+|---|---|
+| `to_array` (default) | prefilter loop → push into buffer → `reverse_inplace(buf)` → `return <- buf` |
+| `take(N) \|> to_array` | prefilter → push → `reverse_inplace` → `resize(min(N, length(buf)))` → `return <- buf` (semantics: "last N of source in reverse order") |
+| `count` | counter loop only (no buffer, no reverse) — reverse doesn't change count |
+| `first` | last-survivor tracker — keep last element seen, `panic` if none |
+| `first_or_default(d)` | last-survivor tracker — keep last element seen, return `d` if none |
+
+### plan_distinct
+
+Recognized chain: `src [|> where_(p)]* [|> select(proj)]? |> distinct[_by(key)] [|> take(N)]? [|> terminator]`. Two emission shapes by terminator class:
+
+**A. Buffer-required terminator** (`to_array`, with optional `take(N)`): streaming dedup. Single pass, push only if key not in seen-table, optional `take(N)` early-exit. Inside the "fresh key" branch the take guard runs BEFORE `seen.insert` / `buf.push_clone` so non-positive N short-circuits without ever mutating `seen` or `buf`:
+
+```
+if ($e(whereCond)) {
+    $e(intermediateBinds)
+    let `k` = _::unique_key(<keyExpr>)
+    if (!`seen` |> key_exists(`k`)) {
+        if (`taken` >= N) break    // only when take(N) present
+        `taken`++                  // only when take(N) present
+        `seen` |> insert(`k`)
+        `buf` |> push_clone(<projection>)
+    }
+}
+```
+
+**B. Buffer-not-required terminators** (`count`, `sum`, `long_count`): elide the buffer; only the seen-table is materialized. `count` returns `length(seen)`; `long_count` returns `int64(length(seen))` (both are count-shaped — no per-element accumulator); `sum` folds inline at the freshly-inserted site with `acc += <projection>`. `first`/`first_or_default` after `distinct` are not yet spliced — they cascade to tier 2.
+
+**Intentional divergence — `distinct[_by] |> take(N)` is streaming, not eager**: the runtime helpers (`distinct_impl` / `distinct_by_impl`) materialize the FULL distinct set first, then `take(N)` slices the head. The splice instead breaks the source loop the moment N distinct keys are observed. For pure code this is observationally identical (and strictly faster — same result, less work). For side-effecting key functions or upstream `where_`/`select` projections, the splice fires the side-effect on **fewer source elements** than the runtime would. This is by design: the splice gives `distinct().take()` the iterator-form semantics regardless of source shape (the iterator-form `distinct()` in `daslib/linq.das:668` is already a lazy generator and stops early under `take` even without the splice). If you need exact runtime parity for side-effect counts, hoist the side-effect out of the key/projection before the splice fires.
+
+### plan_group_by
+
+Recognized chain: `src |> group_by[_lazy](key) |> select(group_proj) [|> {to_array, count}]?`. Avoids bucket-array materialization entirely by emitting `table<unique_key; tuple<KT; AccT>>` updated per element. The `select(group_proj)` body is recognized via `is_bucket_reducer_call` ([linq_fold.das:1666](daslib/linq_fold.das#L1666)); recognized shapes are:
+
+| `group_proj` body shape | Acc | Per-element update |
+|---|---|---|
+| `_._1 \|> length` / `\|> count` | `int = 0` | `entry._1++` |
+| `_._1 \|> long_count` | `int64 = 0l` | `entry._1++` |
+| `_._1 \|> sum` (numeric bucket) | `T = default<T>` | `entry._1 += it` |
+
+Named-tuple wrap forms `(K = _._0, V = <recognized>)` preserve the user's field-name hints by using `group_proj._type` as the table value type directly (no AST mutation; `kv` flowing out of `values(tab)` is already shaped correctly via `push_clone(kv)`). Terminators: `to_array` (default) builds the result; `count` returns `length(tab)` (number of groups).
+
+Bails to tier 2 cascade on: inner-select-sum (`_._1 |> select(<inner>) |> sum` — see deferred follow-up below), multi-reducer named tuples, `_._1` appearing outside the recognized reducer chains, `having_*` between `group_by` and `select(group_proj)`, **any upstream `where_*`/`select*` between source and `group_by_lazy`** (upstream fusion deferred). Side-effectful key bodies are NOT a bail — the emitted code evaluates each key once per element (same as plain `group_by_lazy`).
+
+### Headline benchmarks (100K rows, INTERP, this PR)
+
+| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f (prev) | m3f (this PR) | Win |
+|---|---|---:|---:|---:|---:|---:|
+| distinct_count | `each(arr) → select(brand) → distinct → to_array` | 41 | 43 | 33 | **16** | **2.7× over m3 / 2.1× over prev / faster than m1 SQL** |
+| distinct_take (NEW) | `each(arr) → select(brand) → distinct → take(3) → to_array` | 0 | 30 | — | **0** | early-exit at 3 unique brands — splice eliminates generator-dispatch overhead |
+| groupby_count | `each(arr) → group_by(brand) → select(brand, length) → to_array` | 141 | 71 | 76 | **37** | **2.1× over prev / 1.9× over m3 / 3.8× over m1 SQL** |
+| reverse_take (NEW) | `each(arr) → reverse → take(10) → to_array` | 0\* | 22 | — | **34** | regression vs m3 — see footnote |
+| groupby_sum (deferred) | `each(arr) → group_by(brand) → select(brand, select(price) → sum) → to_array` | 173 | 98 | 107 | **108** | unchanged — inner-select-sum (see follow-up) |
+
+**`distinct_count` win**: plain LINQ materializes a full distinct array then counts. The splice fuses streaming dedup into a single pass over the source, producing the buffer once. **`distinct_take`** is the extreme case — both m3 and m3f use matching `each(arr)` iterator sources (so plain LINQ's lazy `distinct().take()` *also* early-exits on the iterator). The splice's win over m3 (~30 → ~0 ns/op) comes from eliminating per-yield generator-dispatch overhead in the lazy chain, not from "lazy vs eager source" comparison.
+
+**`groupby_count` win**: plain LINQ materializes per-bucket arrays then counts each. The splice keeps only `table<brand → int>` updated per element with `entry._1++`. The result-build loop iterates `values(tab)` (5 entries) — total work is one source pass + 5 emplaces vs tier 2's per-bucket array allocation+push+length-call. Result beats SQL by 4.3× because no engine round-trip overhead.
+
+**`reverse_take` regression vs m3** (34 vs 22 ns/op): both variants materialize the full source into a buffer (the iterator-form `reverse()` in [linq.das:234](daslib/linq.das#L234) does it via `generator capture` + per-yield indexing; the splice does it via `push_clone` in the prefilter loop). The cost difference is the second pass: m3's generator yields `buffer[len-i-1]` lazily, so `take(TAKE_N)` only triggers N yields; the splice instead calls `reverse_inplace(buf)` (full O(length) swap) and then `resize(N)`. For TAKE_N << length the lazy-yield approach wins because the reverse is bounded to N instead of length. The splice still wins the headline shape (`reverse |> to_array` without take) where both variants pay full-length cost but the splice avoids the generator-dispatch overhead. A future optimization could detect `reverse |> take(N)` on array-typed sources and emit a backward index loop bounded to `[max(0, len-N), len)` — deferred. \* m1 SQL: `ORDER BY id DESC LIMIT 10` collapses to 0 ns/op via index.
+
+**`groupby_sum` unchanged**: the benchmark uses `g._1 |> _select($(c : Car) => c.price) |> _sum()` — an inner-select-sum chain. The G4 recognizer covers bare `_._1 |> sum` but doesn't yet inline the inner projection into the `+=` site. Falls back to tier 2 array-shape (correct but slow). Deferred to follow-up (~80 LOC; see plan).
+
 ## Operator-coverage checklist (parity tests)
 
 The 24 benchmarks above cover the most common shapes. The end-game target is one benchmark per `_fold`-applicable scenario in the broader `tests/linq/` operator suite. Tracking the long-tail coverage below; PRs that add splice support for new operators should add a benchmark here if not already present.
@@ -310,11 +384,11 @@ The 24 benchmarks above cover the most common shapes. The end-game target is one
 | `test_linq_aggregation.das` | count/sum/min/max/avg/aggregate | count/sum/min/max/average_aggregate, sum_where, long_count_aggregate | ✅ core; `aggregate(seed, fn)` ⏳ |
 | `test_linq_querying.das` | any/all/contains | any_match, all_match, contains_match | ✅ core |
 | `test_linq_transform.das` | select/select_many/zip | to_array_filter, zip_dot_product | ✅ select/zip; `select_many` ⏳ |
-| `test_linq_sorting.das` | order/order_by/reverse | sort_first, sort_take, select_where_order_take | ✅ ascending; `order_descending` + `reverse` ⏳ |
-| `test_linq_group_by.das` | group_by/group_by_lazy/having | groupby_count, groupby_sum | ✅ basic; `having_` ⏳ |
+| `test_linq_sorting.das` | order/order_by/reverse | sort_first, sort_take, select_where_order_take, reverse_take | ✅ ascending + `order_descending` (Phase 3); ✅ `reverse` (this PR; `reverse \|> take(N)` shape is a regression vs lazy iterator — see Phase 3+ notes) |
+| `test_linq_group_by.das` | group_by/group_by_lazy/having | groupby_count, groupby_sum | ✅ count/long_count/bare-sum reducers + named-tuple wrap (this PR); ⏳ inner-select-sum, min/max/first/average, multi-reducer, `having_` |
 | `test_linq_join.das` | join/left_join/right_join/full_outer/cross | join_count | ✅ inner; outer joins + cross ⏳ |
 | `test_linq_partition.das` | take/skip/take_while/skip_while/chunk | take_count, skip_take, take_sum_aggregate, take_count_filtered | ✅ take/skip in splice lanes; `_while` + `chunk` ⏳ |
-| `test_linq_set.das` | distinct/union/except/intersect/unique | distinct_count | ✅ distinct; set ops ⏳ |
+| `test_linq_set.das` | distinct/union/except/intersect/unique | distinct_count, distinct_take | ✅ distinct + distinct_by (streaming dedup, this PR); union/except/intersect/unique ⏳ |
 | `test_linq_element.das` | first/last/single/element_at + _or_default | first_match, first_or_default_match | ✅ first/first_or_default; last/single/element_at ⏳ |
 | `test_linq_concat.das` | concat/prepend/append | — | ⏳ |
 | `test_linq_generation.das` | range/repeat/etc. | — | ⏳ |

diff --git a/benchmarks/sql/distinct_take.das b/benchmarks/sql/distinct_take.das
@@ -0,0 +1,67 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let TAKE_N = 3
+
+// _select(_.brand) |> distinct() |> take(N) — early-exit dedup pattern.
+// SQL: SELECT DISTINCT brand FROM Cars LIMIT N — engine bounds the dedup hashset.
+// Both m3 and m3f use each(arr) for apples-to-apples comparison.
+// m3 (plain LINQ on iterator) routes through linq.das:668 — a lazy `distinct()` generator
+// composed with lazy `take(N)`, both yielding per element. Stops at N distinct yields.
+// m3f (spliced via plan_distinct): single fused for-loop with outer-loop take-guard
+// at the top — breaks as soon as the Nth distinct value has been consumed (true O(N)
+// source elements worst-case). No generator overhead. With BRAND_COUNT=5 and TAKE_N=3
+// the splice exits at source position ~3.
+
+def run_m1(b : B?; n : int) {
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n)
+        b |> run("m1_sql/{n}", n) {
+            let rows <- _sql(db |> select_from(type<Car>) |> _select(_.brand) |> distinct() |> take(TAKE_N))
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3_array/{n}", n) {
+        unsafe {
+            let rows <- (each(arr) |> _select(_.brand) |> distinct() |> take(TAKE_N) |> to_array())
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3f_array_fold/{n}", n) {
+        unsafe {
+            let rows <- _fold(each(arr)._select(_.brand).distinct().take(TAKE_N).to_array())
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+[benchmark]
+def distinct_take_m1(b : B?) {
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def distinct_take_m3(b : B?) {
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def distinct_take_m3f(b : B?) {
+    run_m3f(b, 100000)
+}
diff --git a/benchmarks/sql/reverse_take.das b/benchmarks/sql/reverse_take.das
@@ -0,0 +1,66 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let TAKE_N = 10
+
+// _reverse() |> take(N) — last N rows of the source, reversed.
+// SQL: SELECT * FROM Cars ORDER BY id DESC LIMIT N — engine indexes-and-limits.
+// m3 (plain LINQ iterator-reverse): generator captures the source, fills an internal buffer,
+// then yields buffer[len-i-1] lazily — take(N) only triggers N yields.
+// m3f (spliced via plan_reverse): single prefilter pass into a buffer + reverse_inplace
+// + resize(min(N, length(buf))). The full O(length) reverse is paid before resize, so for
+// TAKE_N << length this is a REGRESSION vs m3 (~34 vs ~22 ns/op at 100K). See LINQ.md
+// "Phase 3+" notes — a backward-index-loop optimization is deferred.
+
+def run_m1(b : B?; n : int) {
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n)
+        b |> run("m1_sql/{n}", n) {
+            let rows <- _sql(db |> select_from(type<Car>) |> _order_by_descending(_.id) |> take(TAKE_N))
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3_array/{n}", n) {
+        unsafe {
+            let rows <- (each(arr) |> reverse() |> take(TAKE_N) |> to_array())
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3f_array_fold/{n}", n) {
+        unsafe {
+            let rows <- _fold(each(arr).reverse().take(TAKE_N).to_array())
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+[benchmark]
+def reverse_take_m1(b : B?) {
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def reverse_take_m3(b : B?) {
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def reverse_take_m3f(b : B?) {
+    run_m3f(b, 100000)
+}