From de9dd75089376674fc74f795c3e6eb30046efe38 Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 06:41:42 -0700
Subject: [PATCH 1/7] ast,daslib: fix spurious 30151 on concept_assert under
 lint flags (#2830)

Two independent bugs surfaced together by the linter's
`no_infer_time_folding=true` + `no_optimizations=true` policy combo.

1. Compiler: `concept_assert` / `static_assert` cond never gets folded.
   `typeinfo sizeof(T) <= typeinfo sizeof(U)` rewrites to `ExprConstInt <=
   ExprConstInt` at infer time, but `InferTypes::visit(ExprOp1/Op2/Op3)`
   only folds when `enableInferTimeFolding` is on. With lint disabling
   it and `no_optimizations` skipping `ConstFolding`, the binop stayed
   unfolded and `ContractFolding::visit(ExprStaticAssert*)` raised the
   spurious 30151. Mirror the existing static_if save+force-enable+
   restore pattern in `preVisit`/`visit(ExprStaticAssert*)` so the cond
   subtree always folds.

2. Daslib: 6 sites used `if (typeinfo X)` instead of `static_if`. These
   relied on the same infer-time folding to elide the dead branch
   (whose body references fields/operations only valid in the true
   branch's universe). Under lint flags, both branches survive and
   the dead one fails to resolve. Convert all to `static_if`:
   `decs_boost.das:244` (`a._aka`), `builtin.das:403/892/914/1183`,
   `json_boost.das:477`.

`extended_checks` gates the lint step to `matrix.target == 'linux'`, so
this surfaced as "linux x64 only" in CI; verified platform-independent
locally on Windows daslang.exe.

Also: 2 pre-existing STYLE028 hits in `decs_boost.das` (`self->implement`
-> `implement`), required by the "every changed .das file lint-clean" PR
rule once the PR touches that file.

Regression fixture: `tests/_issue_2830_lint_repro.das`. CI lint runs
over changed .das files; the fixture exercises the failing path on
this PR and continues to exercise it on any future PR that touches it.

Verified locally on WSL Ubuntu2404-CI (clang 18.1.3, Release):
  - lint clean on all 4 repro variants
  - tests/decs   245/245 pass
  - tests/json   266/266 pass
  - tests/lint   8/8 + utils/lint/tests 38/38 pass

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 daslib/builtin.das                    |  8 +++---
 daslib/decs_boost.das                 |  6 ++---
 daslib/json_boost.das                 |  2 +-
 include/daScript/ast/ast_infer_type.h |  1 +
 src/ast/ast_infer_type.cpp            | 15 +++++++++++
 tests/_issue_2830_lint_repro.das      | 37 +++++++++++++++++++++++++++
 6 files changed, 61 insertions(+), 8 deletions(-)
 create mode 100644 tests/_issue_2830_lint_repro.das
diff --git a/daslib/builtin.das b/daslib/builtin.das
index 1d8d8ec4d..c142a1988 100644
--- a/daslib/builtin.das
+++ b/daslib/builtin.das
@@ -400,7 +400,7 @@ def push_clone(var Arr : array<auto(numT)>; var varr : numT[] ==const) {
 def push_clone(var Arr : array<auto(numT)[]>; varr : numT[] ==const) {
     static_if (typeinfo can_copy(type<numT>)) {
         static_if (typeinfo sizeof(Arr[0]) == typeinfo sizeof(varr)) {
-            if (typeinfo can_clone_from_const(varr)) {
+            static_if (typeinfo can_clone_from_const(varr)) {
                 for (t in varr) {
                     Arr[__builtin_array_push_back_zero(Arr, typeinfo sizeof(Arr[0]))] := t
                 }
@@ -889,7 +889,7 @@ def insert_clone(var Tab : table<auto(keyT); auto(valT)>; at : keyT | #; var val
 [unused_argument(Tab, at, val)]
 def insert_clone(var Tab : table<auto(keyT); auto(valT)>; at : keyT | #; val : valT ==const | #) {
     static_if (typeinfo can_clone(val)) {
-        if (typeinfo can_clone_from_const(val)) {
+        static_if (typeinfo can_clone_from_const(val)) {
             unsafe(Tab[at]) := val
         } else {
             concept_assert(false, "can't insert value, which can't be cloned from const")
@@ -911,7 +911,7 @@ def insert_clone(var Tab : table<auto(keyT); auto(valT)[]>; at : keyT | #; var v
 [unused_argument(Tab, at, val)]
 def insert_clone(var Tab : table<auto(keyT); auto(valT)[]>; at : keyT | #; val : valT[] ==const | #) {
     static_if (typeinfo can_clone(val)) {
-        if (typeinfo can_clone_from_const(val)) {
+        static_if (typeinfo can_clone_from_const(val)) {
             unsafe(Tab[at]) := val
         } else {
             concept_assert(false, "can't insert value, which can't be cloned from const")
@@ -1180,7 +1180,7 @@ def clone_to_move(var clone_src : auto(TT) ==const | #) : TT -const -# {
 
 def clone_dim(var a; b : auto | #) {
     static_if (typeinfo is_dim(a) && typeinfo is_dim(b) && typeinfo dim(a) == typeinfo dim(b)) {
-        if (typeinfo is_pod(a)) {
+        static_if (typeinfo is_pod(a)) {
             unsafe {
                 memcpy(addr(a[0]), addr(b[0]), typeinfo sizeof(a[0]) * length(a))
             }
diff --git a/daslib/decs_boost.das b/daslib/decs_boost.das
index 40043d1f4..4ded362ca 100644
--- a/daslib/decs_boost.das
+++ b/daslib/decs_boost.das
@@ -241,7 +241,7 @@ def private append_iterator(arch_name : string; var qloop : ExprFor?; a; prefix,
     qloop.iterators |> resize(qli + 1)
     qloop.iterators[qli] := "{prefix}{a.name}{suffix}"
     qloop.iteratorsAka |> resize(qli + 1)
-    if (typeinfo has_field<_aka>(a)) {
+    static_if (typeinfo has_field<_aka>(a)) {
         qloop.iteratorsAka[qli] := a._aka
     } else {
         qloop.iteratorsAka[qli] := ""
@@ -384,7 +384,7 @@ class DecsQueryMacro : AstCallMacro {
         macro_verify(totalArgs == 1 || totalArgs == 2, prog, expr.at, "expecting query($(block_with_arguments)) or query(eid,$(block_with_arguments))")
         let qt = totalArgs == 2 ? DecsQueryType.eid_query : DecsQueryType.query
         let block_arg_index = totalArgs - 1
-        return <- self->implement(expr, block_arg_index, qt)
+        return <- implement(expr, block_arg_index, qt)
     }
     def implement(var expr : ExprCallMacro?; block_arg_index : int; qt : DecsQueryType) : ExpressionPtr {
         for (arg in expr.arguments) {
@@ -539,7 +539,7 @@ class DecsFindQueryMacro : DecsQueryMacro {
     //! Note: if return is missing, or end of find_query block is reached - its assumed that find_query did not find anything, and will return false.
     def override visit(prog : ProgramPtr; mod : Module?; var expr : ExprCallMacro?) : ExpressionPtr {
         macro_verify(length(expr.arguments) == 1, prog, expr.at, "expecting find_query($(block_with_arguments))")
-        return <- self->implement(expr, 0, DecsQueryType.find_query)
+        return <- implement(expr, 0, DecsQueryType.find_query)
     }
 }
 
diff --git a/daslib/json_boost.das b/daslib/json_boost.das
index ccc63a165..9fab0a3ab 100644
--- a/daslib/json_boost.das
+++ b/daslib/json_boost.das
@@ -474,7 +474,7 @@ def from_JV(v : JsonValue const explicit?; anything : auto(TT)) {
             let arr & = v.value as _array
             ret |> reserve(arr |> long_length)
             for (a in arr) {
-                if (typeinfo can_copy(anything[0])) {
+                static_if (typeinfo can_copy(anything[0])) {
                     ret |> push_clone <| _::from_JV(a, decltype_noref(anything[0]))
                 } else {
                     ret |> emplace <| _::from_JV(a, decltype_noref(anything[0]))
diff --git a/include/daScript/ast/ast_infer_type.h b/include/daScript/ast/ast_infer_type.h
index 28fc0c52c..f7480e937 100644
--- a/include/daScript/ast/ast_infer_type.h
+++ b/include/daScript/ast/ast_infer_type.h
@@ -62,6 +62,7 @@ namespace das {
         bool enableInferTimeFolding = true;
         bool savedFoldingForEnum = true;        // preVisitEnumerationValue / visitEnumerationValue save-restore
         bool savedFoldingForStaticIf = true;    // preVisit(ExprIfThenElse) / visit(ExprIfThenElse) save-restore (block hooks skipped for static_if)
+        bool savedFoldingForStaticAssert = true; // preVisit(ExprStaticAssert) / visit(ExprStaticAssert) save-restore
         bool disableAot = false;
         bool multiContext = false;
         bool standaloneContext = false;
diff --git a/src/ast/ast_infer_type.cpp b/src/ast/ast_infer_type.cpp
index 27e435d85..71d63f99a 100644
--- a/src/ast/ast_infer_type.cpp
+++ b/src/ast/ast_infer_type.cpp
@@ -1215,8 +1215,23 @@ namespace das {
         for (auto &arg : expr->arguments) {
             markNoDiscard(arg);
         }
+        // static_assert / concept_assert needs the cond to fold to a const
+        // before verifyAndFoldContracts runs. Mirror the static_if path
+        // above: with `no_infer_time_folding` set (lint policies) plus
+        // `no_optimizations`, `int_const op int_const` shapes (typically
+        // `typeinfo sizeof(X) <= typeinfo sizeof(Y)` after typeinfo rewrites
+        // itself to ExprConstInt) stay as unfolded ExprOp1/Op2/Op3, and the
+        // contract pass raises a spurious "static assert condition is not
+        // constexpr or const" (30151). Force-enable folding for the cond
+        // subtree; restore in visit().
+        savedFoldingForStaticAssert = enableInferTimeFolding;
+        if (!enableInferTimeFolding) {
+            enableInferTimeFolding = true;
+        }
     }
     ExpressionPtr InferTypes::visit(ExprStaticAssert *expr) {
+        // Restore folding state before any early-return path below.
+        enableInferTimeFolding = savedFoldingForStaticAssert;
         if (expr->argumentsFailedToInfer) {
             if (func)
                 func->notInferred();
diff --git a/tests/_issue_2830_lint_repro.das b/tests/_issue_2830_lint_repro.das
new file mode 100644
index 000000000..5d67299bb
--- /dev/null
+++ b/tests/_issue_2830_lint_repro.das
@@ -0,0 +1,37 @@
+options gen2
+
+// Regression fixture for https://github.com/GaijinEntertainment/daScript/issues/2830
+// `[decs_template]` synthesizes a per-field `decs::set(cmp, name, src.field)` call
+// chain; `decs::set` is a generic with a `concept_assert(typeinfo sizeof(value) <=
+// typeinfo sizeof(cmp[0].data), ...)`. Under lint flags
+// (`no_infer_time_folding=true` plus `no_optimizations=true`) the `int_const <=
+// int_const` cond stays as an unfolded `ExprOp2`, and `ContractFolding` raised a
+// spurious `error[30151]` (static assert condition is not constexpr or const).
+//
+// `decs_boost`'s `append_iterator` also used a plain `if (typeinfo has_field<_aka>(a))`
+// where the dead branch's `a._aka` reference fails to resolve once infer-time folding
+// is off — same root cause, separate site.
+//
+// `extended_checks` runs lint on changed .das files (linux x64 only by gate); this
+// file lives in `tests/` so the lint job picks it up.
+
+require daslib/decs_boost
+require daslib/linq_boost
+require daslib/linq_fold
+
+[decs_template(prefix = "x_")]
+struct X {
+    a : int
+    b : int
+    c : int
+}
+
+[export]
+def target() : int {
+    return _fold(from_decs_template(type<X>)._where(_.a + _.b + _.c > 0).count())
+}
+
+[export]
+def main {
+    print("count={target()}\n")
+}

From 72257687b956b8341e0825bf6ba394f8676250b7 Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 05:01:18 -0700
Subject: [PATCH 2/7] linq_fold: trivial-let elision + reverse_take
 skip-into-tail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two slices closing the final m4-vs-m3f gaps from #2824's residual outliers.

Slice 1 — trivial-let elision (closes sum_aggregate_m4 1.3ns gap):
When `_select(_.userName)` peels to a single `decs_tup.<field>` reference,
rename the chain bind directly to the iter var instead of synthesizing
`decs_sel_N`. wrap_decs_chain skips emitting the `let decs_sel_N = car_price`
binding entirely; the action's `acc += <iterVar>` references the iter var
natively. Required extending DecsTupUsageScanner with an iter-var-→-user-name
reverse map so bare iter-var refs still seed the pruner (previously: empty
usedNames fell through to unpruned-default, defeating the elision).

Slice 2 — reverse_take skip-into-tail (closes reverse_take_m4 5.2× gap):
For `from_decs(...).reverse().take(N).to_array()` with no where/select,
emit a two-pass invoke: pass 1 sums `arch.size` (no entity load), pass 2
uses for_each_archetype_find to skip whole archetypes whose cumulative
size still fits below the skip threshold, then a per-iter skip-counter
through the partial archetype, push into a takeN-sized buffer, and `return
true` to stop iteration once buf is full. reverse_inplace runs on the
small N buffer at end, not the full source. where/select fall through to
the legacy buffer+reverse_inplace+resize emit unchanged.

Bench (INTERP, 100K rows, ns/op):
- sum_aggregate_m4    3.4 → 2.1   matches m3f (was the systemic 1.3ns gap)
- reverse_take_m4    48.0 → 9.2   5.2× win, allocs 42B → 1B
- select_where_sum_m4 7.5 → 7.5   matches m3f (elision benefits this too)
- contains_match_m4   2.1 → 1.4   beats m3f at 2.2
- chained_where_m4    6.6 → 6.6   no regression
- count_aggregate_m4  4.1 → 4.1   no regression

Tests:
- New splice-shape assertions: trivial-let elision (no decs_sel binding
  for `_select(_.val).sum()` and `_where(_)._select(_.val).sum()`)
- New splice-shape for skip-into-tail (for_each_archetype_find count==1,
  decs_skips local presence)
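- New parity tests: reverse+take with an unrelated second archetype
  present, take(N>total), empty source — covers the partial-archetype
  skip-counter, clamp-to-total, and empty-source early-return arms. Only
  one archetype matches the request in these fixtures, so the
  whole-archetype-skip arm is not yet exercised.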

1388/1388 linq + 245/245 decs + 782/782 dasSQLITE green INTERP. MCP + CI
lint clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 daslib/linq_fold.das               | 112 ++++++++++++++++++++++--
 tests/linq/test_linq_from_decs.das | 132 +++++++++++++++++++++++++++++
 2 files changed, 237 insertions(+), 7 deletions(-)

diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index be5be7a36..143a8b37d 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -3078,14 +3078,16 @@ def private build_decs_inner_for(bridge : DecsBridgeShape?; var tupBind : Expres
     return clonedForExpr
 }
 
-// Walks a chain body to collect which fields of the decs_tup named-tuple bind are actually referenced. allFieldsUsed flips when the bind is referenced as a whole-var (not via field access) — e.g. push_clone(decs_tup) in a bare to_array, or pass-to-user-fn — so the helper falls back to unpruned emission and preserves the user-visible output shape.
+// Walks a chain body to collect which fields of the decs_tup named-tuple bind are actually referenced. allFieldsUsed flips when the bind is referenced as a whole-var (not via field access) — e.g. push_clone(decs_tup) in a bare to_array, or pass-to-user-fn — so the helper falls back to unpruned emission and preserves the user-visible output shape. Also picks up bare iter-var refs (when `iterToUser` is non-empty) so the trivial-let-elision path — which rewrites `decs_sel_N`-bound projections to the iter var directly, bypassing `decs_tup` entirely — still hits the pruned arm instead of falling through to unpruned-default.
 class private DecsTupUsageScanner : AstVisitor {
     tupName       : string
+    iterToUser    : table<string; string>
     usedFields    : table<string>
     allFieldsUsed : bool = false
     inTargetField : bool = false
-    def DecsTupUsageScanner(n : string) {
+    def DecsTupUsageScanner(n : string; var i2u : table<string; string>) {
         tupName = n
+        iterToUser <- i2u
     }
     def override preVisitExprField(expr : ExprField?) : void {
         var v = expr.value
@@ -3104,17 +3106,23 @@ class private DecsTupUsageScanner : AstVisitor {
     def override preVisitExprVar(expr : ExprVar?) : void {
         if (expr.name == tupName && !inTargetField) {
             allFieldsUsed = true
+            return
+        }
+        let nm = string(expr.name)
+        if (key_exists(iterToUser, nm)) {
+            usedFields |> insert(iterToUser[nm])
         }
     }
 }
 
 [macro_function]
-def private collect_decs_tup_usage(var e : Expression?; tupName : string) : tuple<bool; array<string>> {
-    // Returns (allFieldsUsed, usedFieldNames). usedFieldNames is unordered — caller filters bridge.userNames in original order to preserve get_ro emission order.
+def private collect_decs_tup_usage(var e : Expression?; tupName : string; bridge : DecsBridgeShape?) : tuple<bool; array<string>> {
+    // Returns (allFieldsUsed, usedFieldNames). usedFieldNames is unordered — caller filters bridge.userNames in original order to preserve get_ro emission order. The bridge feeds an iter-var → user-name reverse map so bare iter-var refs (from trivial-let elision) count as field usage too.
     var allUsed = false
     var used : array<string>
     if (e == null) return (allUsed, used)
-    var sc = new DecsTupUsageScanner(tupName)
+    var iterToUser <- {for (i in 0 .. length(bridge.iterNames)); bridge.iterNames[i] => bridge.userNames[i]}
+    var sc = new DecsTupUsageScanner(tupName, iterToUser)
     make_visitor(*sc) $(astVisitorAdapter) {
         visit_expression(e, astVisitorAdapter)
     }
@@ -3167,7 +3175,7 @@ def private build_decs_inner_for_pruned(bridge : DecsBridgeShape?;
                                         var body : Expression?;
                                         at : LineInfo) : Expression? {
     // Walks body for `tupName.<field>` references; when pruning is safe + beneficial, emits the inner multi-iter for with unused get_ro slots dropped and a matching shrunk named-tuple bind. Otherwise falls through to the unpruned path so user-visible shape stays intact (bare to_array, push_clone(decs_tup), pass-to-user-fn).
-    let (allUsed, usedNames) = collect_decs_tup_usage(body, tupName)
+    let (allUsed, usedNames) = collect_decs_tup_usage(body, tupName, bridge)
     // Fallback to the unpruned bind ONLY for whole-var refs (bare to_array, push_clone(decs_tup), pass-to-user-fn) or the edge case where the body never touches decs_tup at all. The "all fields used via field access" case still benefits from flatten + bind elision — no slots dropped, but the per-iter tuple-make and field reads disappear.
     if (allUsed || empty(usedNames)) {
         var tupBind = build_decs_tup_bind(bridge, tupName, at)
@@ -3211,6 +3219,7 @@ def private build_decs_inner_for_pruned(bridge : DecsBridgeShape?;
 
 struct private DecsChainInfo {
     bindAt      : array<string>    // bind name visible at each chain position
+    elidedAt    : array<bool>      // true when the select at position i was elided (peeled to a single decs_tup field access → reused iter var as next bind, no `let decs_sel_N = ...` emit needed)
     finalBind   : string            // bind name AFTER full chain — what terminator references
     finalType   : TypeDeclPtr       // element type AFTER full chain (constant + ref stripped)
     selectCount : int               // number of `select` ops in chain; 0 means finalBind == tupName
@@ -3229,17 +3238,47 @@ def private compute_decs_chain_info(var calls : array<tuple<ExprCall?; LinqCall?
         selectCount = 0
     )
     info.bindAt |> reserve(intermediateEnd)
+    info.elidedAt |> reserve(intermediateEnd)
     var curBind = tupName
     var curType : TypeDeclPtr = clone_type(bridge.elementType)
     for (i in 0 .. intermediateEnd) {
         info.bindAt |> push(curBind)
+        info.elidedAt |> push(false)
         var cll & = unsafe(calls[i])
         let opName = cll._1.name
         if (opName == "select") {
             info.selectCount ++
-            curBind = "`decs_sel`{at.line}`{at.column}`{info.selectCount}"
             var peeled = peel_lambda_rename_var(cll._0.arguments[1], info.bindAt[i])
             if (peeled == null || peeled._type == null) return null
+            // Trivial-let elision: `_select(_.userName)` against decs_tup → iter var directly. The pruned-for keeps the iter var (its name = bridge.iterNames[idx]) and the flattener would otherwise rewrite the synthetic `let decs_sel_N = decs_tup.userName` to `let decs_sel_N = <iterVar>` — a pure copy. Renaming finalBind to the iter var name skips the binding entirely and the action references it natively. Typer wraps both the projection root (`peeled` may be ExprRef2Value) AND the ExprVar inside the ExprField in ExprRef2Value — peel both.
+            var elided = false
+            if (curBind == tupName) {
+                var top = peeled
+                if (top is ExprRef2Value) {
+                    top = (top as ExprRef2Value).subexpr
+                }
+                if (top is ExprField) {
+                    var pf = top as ExprField
+                    var pv = pf.value
+                    if (pv is ExprRef2Value) {
+                        pv = (pv as ExprRef2Value).subexpr
+                    }
+                    if (pv is ExprVar && (pv as ExprVar).name == tupName) {
+                        let fname = string(pf.name)
+                        for (ui in 0 .. length(bridge.userNames)) {
+                            if (bridge.userNames[ui] == fname) {
+                                curBind = bridge.iterNames[ui]
+                                elided = true
+                                info.elidedAt[i] = true
+                                break
+                            }
+                        }
+                    }
+                }
+            }
+            if (!elided) {
+                curBind = "`decs_sel`{at.line}`{at.column}`{info.selectCount}"
+            }
             curType = clone_type(peeled._type)
         } elif (opName != "where_") return null
     }
@@ -3274,6 +3313,8 @@ def private wrap_decs_chain(var action : Expression?;
                 }
             }
         } elif (opName == "select") {
+            // Skip emission entirely when this select was elided in compute_decs_chain_info — the next bind is already the iter var the flattener would have produced for `let nextBind = decs_tup.userName`. No copy needed; downstream `current` already references the iter var by the elided name.
+            continue if (chainInfo.elidedAt[i])
             var proj = peel_lambda_rename_var(cll._0.arguments[1], bindHere)
             if (proj == null) return null
             let nextBind = (i + 1 < intermediateEnd) ? chainInfo.bindAt[i + 1] : chainInfo.finalBind
@@ -4634,6 +4675,63 @@ def private plan_decs_reverse(var expr : Expression?) : Expression? {
     let bufName = qn("decs_buf", at)
     let needIterWrap = expr._type.isIterator
     var bufElemType = strip_const_ref(clone_type(projection != null ? projection._type : bridge.elementType))
+    // Skip-into-tail fast path: `reverse |> take(N) |> to_array` with no where/select. Walk archetypes once to sum `arch.size` (cheap, no entity load), compute skip = total - takeN, then for_each_archetype_find skips whole archetypes whose size still fits below the skip threshold and short-circuits once the buffer reaches takeN. `where` would invalidate the size-based skip (count after filter is unknown without iterating); `select` would only affect element shape, not count, but is skipped here to keep v1 minimal.
+    if (takeExpr != null && whereCond == null && projection == null) {
+        let takeNName = qn("take_n", at)
+        let totalName = qn("decs_total", at)
+        let actualName = qn("decs_actual", at)
+        let skipName = qn("decs_skip", at)
+        let seenName = qn("decs_seen", at)
+        let skipsLeftName = qn("decs_skips", at)
+        let tupBind = build_decs_tup_bind(bridge, tupName, at)
+        // Inner-for body: skip-counter early-out before the named-tuple wrap (so skipped iters pay no per-component load); push + break-on-quota after.
+        var innerBody : Expression? = qmacro_block() {
+            if ($i(skipsLeftName) > 0_l) {
+                $i(skipsLeftName) --
+                continue
+            }
+            $e(tupBind)
+            $i(bufName) |> push_clone($i(tupName))
+            if (int64(length($i(bufName))) >= $i(actualName)) {
+                break
+            }
+        }
+        var clonedForExpr = clone_expression(bridge.forExpr)
+        var clonedFor = clonedForExpr as ExprFor
+        var newForBody = new ExprBlock(at = at)
+        newForBody.list |> push(innerBody)
+        clonedFor.body = newForBody
+        var emission : Expression? = qmacro(invoke($() : array<$t(bufElemType)> {
+            // Pass 1: arch.size sum — no entity walk, just archetype-header iteration.
+            var $i(totalName) = 0_l
+            for_each_archetype($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) {
+                $i(totalName) += $i(archName).size
+            })
+            let $i(takeNName) = $e(takeExpr)
+            let $i(actualName) = ($i(takeNName) <= 0) ? 0_l : ((int64($i(takeNName)) < $i(totalName)) ? int64($i(takeNName)) : $i(totalName))
+            let $i(skipName) = $i(totalName) - $i(actualName)
+            var $i(bufName) : array<$t(bufElemType)>
+            if ($i(actualName) == 0_l) {
+                return <- $i(bufName)
+            }
+            $i(bufName) |> reserve(int($i(actualName)))
+            // Pass 2: skip whole archetypes via size sum; partial archetype uses skip-counter; subsequent archetypes feed directly. Returns true once buf reaches actualTake to stop iteration across remaining archetypes.
+            var $i(seenName) = 0_l
+            for_each_archetype_find($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) : bool {
+                if ($i(seenName) + $i(archName).size <= $i(skipName)) {
+                    $i(seenName) += $i(archName).size
+                    return false
+                }
+                var $i(skipsLeftName) = ($i(skipName) > $i(seenName)) ? ($i(skipName) - $i(seenName)) : 0_l
+                $e(clonedForExpr)
+                $i(seenName) += $i(archName).size
+                return int64(length($i(bufName))) >= $i(actualName)
+            })
+            _::reverse_inplace($i(bufName))
+            return <- $i(bufName)
+        }))
+        return finalize_decs_emission(emission, at, needIterWrap)
+    }
     var pushExpr : Expression?
     if (projection != null) {
         pushExpr = qmacro_expr() {
diff --git a/tests/linq/test_linq_from_decs.das b/tests/linq/test_linq_from_decs.das
index 2ae9239f7..131a8273d 100644
--- a/tests/linq/test_linq_from_decs.das
+++ b/tests/linq/test_linq_from_decs.das
@@ -3088,3 +3088,135 @@ def test_unroll_take_last_splice_shape(t : T?) {
     }
 }
 
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 1: trivial-let elision for `_select(_.userName)` → iter var (skips
+//          the `let decs_sel_N = car_price` no-op binding when the projection
+//          is a single field access on decs_tup)
+// ─────────────────────────────────────────────────────────────────────────────
+
+[test]
+def test_unroll_select_sum_trivial_let_elision_splice_shape(t : T?) {
+    // `_select(_.val).sum()` peels to a single ExprField(decs_tup.val) which the flattener would rewrite to a bare iter var anyway — slice 1 elides the synthetic `let decs_sel_N = car_price` entirely and rewrites finalBind to the iter var, leaving body as `acc += <iterVar>` with no intermediate binding.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_unroll_select_sum_fold)
+        t |> success(func != null, "RTTI must resolve target_unroll_select_sum_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(describe_count(body_expr, "decs_sel"), 0, "trivial single-field _select elided — no synthetic decs_sel binding")
+        t |> equal(describe_count(body_expr, "for_each_archetype"), 1, "still exactly one for_each_archetype walk")
+    }
+}
+
+[test]
+def test_unroll_where_select_sum_trivial_let_elision_splice_shape(t : T?) {
+    // Same elision applies through a where filter: where(_.flag==1).select(_.val).sum() — the select still peels to a single ExprField on decs_tup, so elision fires and the binding is dropped. Asserts the elision is not gated on whether the chain is bare or filtered.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_unroll_where_select_sum_fold)
+        t |> success(func != null, "RTTI must resolve target_unroll_where_select_sum_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(describe_count(body_expr, "decs_sel"), 0, "trivial single-field _select elided after a where_")
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Slice 2: reverse + take(N) skip-into-tail fast path on decs
+//          Two-pass: sum arch.size, then for_each_archetype_find skips whole
+//          archetypes that fit below the threshold + early-exits once the
+//          takeN-sized buffer is full. Reverses the small N buffer at end.
+// ─────────────────────────────────────────────────────────────────────────────
+
+[test]
+def test_unroll5d_reverse_take_skip_into_tail_splice_shape(t : T?) {
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_unroll5d_reverse_take_fold)
+        t |> success(func != null, "RTTI must resolve target_unroll5d_reverse_take_fold")
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected splice invoke wrapper")
+        t |> equal(describe_count(body_expr, "for_each_archetype_find"), 1, "skip-into-tail uses _find for early-exit once buf reaches takeN")
+        t |> equal(describe_count(body_expr, "for_each_archetype"), 2, "two walks total: 1 size-sum + 1 _find (describe_count is substring match)")
+        // decs_skips is the slice-2 unique skip-counter local (qn("decs_skips", at)). Its presence proves the fast path fired vs. the legacy buffer+reverse_inplace+resize emit.
+        t |> success(describe_count(body_expr, "decs_skips") >= 1, "decs_skips local present — slice-2 fast path fired")
+    }
+}
+
+// ── Reverse + take with an unrelated second archetype present: exercises the partial-archetype skip-counter arm (only archetype A matches the request, so the whole-archetype-skip arm is not reached) ──
+
+[decs_template(prefix = "rev2_")]
+struct RevTakeMultiArchA {
+    id : int
+}
+
+[decs_template(prefix = "rev2_b_")]
+struct RevTakeMultiArchB {
+    bid : int
+    bval : int
+}
+
+[test]
+def test_reverse_take_multi_archetype_parity(t : T?) {
+    // Two archetype classes live in the decs state: A (4 rows, `rev2_id`) and B (5 rows, `rev2_b_*`). Only A matches the from_decs_template request, so the splice sees 4 rows (skip = 4 - 3 = 1) and takes the partial-archetype skip-counter path; B is present only to check that unrelated archetypes never leak into the result.
+    restart()
+    for (i in 0..4) {
+        create_entity() @(eid, cmp) {
+            cmp.eid := eid
+            cmp.rev2_id := i
+        }
+    }
+    for (i in 0..5) {
+        create_entity() @(eid, cmp) {
+            cmp.eid := eid
+            cmp.rev2_b_bid := i + 100
+            cmp.rev2_b_bval := i * 10
+        }
+    }
+    commit()
+    let got <- _fold(from_decs_template(type<RevTakeMultiArchA>).reverse().take(3).to_array())
+    t |> equal(got.length(), 3, "reverse+take(3) on multi-archetype world returns 3 rows from the matching archetype only")
+    // All returned ids must be from the A archetype (0..3); none from B (100..104).
+    for (r in got) {
+        t |> success(r.id >= 0 && r.id < 4, "row id {r.id} must be from RevTakeMultiArchA archetype")
+    }
+    // Pin the exact order too: a range/distinctness check alone would also pass for the head rows {0,1,2} or for an unreversed tail. Rows are created as ids 0..3, so reverse().take(3) must be [3, 2, 1].
+    if (got.length() != 3) return
+    t |> equal(got[0].id, 3, "reverse().take(3) row 0 must be the last-created row")
+    t |> equal(got[1].id, 2, "reverse().take(3) row 1 must be the second-to-last row")
+    t |> equal(got[2].id, 1, "reverse().take(3) row 2 must be the third-to-last row")
+}
+
+[test]
+def test_reverse_take_skip_zero_when_take_exceeds_total(t : T?) {
+    // takeN > totalCount → the take is clamped to totalCount and skip = 0, so every row is pushed and the quota check trips only on the last row. Only the row count is asserted here, not the reversed order.
+    restart()
+    for (i in 0..3) {
+        create_entity() @(eid, cmp) {
+            cmp.eid := eid
+            cmp.rev2_id := i
+        }
+    }
+    commit()
+    let got <- _fold(from_decs_template(type<RevTakeMultiArchA>).reverse().take(99).to_array())
+    t |> equal(got.length(), 3, "take(N) where N > total returns all rows")
+}
+
+[test]
+def test_reverse_take_empty_source(t : T?) {
+    // No matching archetypes → totalCount = 0 → early-return empty buf before for_each_archetype_find.
+    restart()
+    commit()
+    let got <- _fold(from_decs_template(type<RevTakeMultiArchA>).reverse().take(3).to_array())
+    t |> success(empty(got), "empty source returns empty")
+}
+

From eec60024f9252c5c5df39ff23996c3643c9942f2 Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 10:01:31 -0700
Subject: [PATCH 3/7] tests/linq: bring back test_wave4_all_fields_via_access
 from #2828
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#2835 fixed the typer-pass-order bug (#2830) that originally tripped this
test on the extended_checks (linux, 64) lane. With master now containing
the fix, the test compiles cleanly on all lanes. Re-adding it covers a
case the current suite missed:

  from_decs_template(type<Row>)
      ._where(_.a >= 0)._where(_.b >= 0)._where(_.c > 0).count()

Three chained single-field `_where` calls — all 3 fields read via field access,
no whole-var ref. The splice must keep all 3 get_ros (no slot pruning)
but elide the named-tuple bind (no decs_tup in the body, iter vars read
directly). Lesson saved to memory: not every CI lane runs every test, so
"platform-specific" failures often mean "we only check this on one
platform" — not that the bug itself is platform-specific.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/linq/test_linq_from_decs.das | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/linq/test_linq_from_decs.das b/tests/linq/test_linq_from_decs.das
index 131a8273d..8671766de 100644
--- a/tests/linq/test_linq_from_decs.das
+++ b/tests/linq/test_linq_from_decs.das
@@ -2525,6 +2525,43 @@ def test_wave4_take_count_no_field_ref_parity(t : T?) {
     t |> equal(target_wave4_take_count_no_field_ref_fold(), 5, "take(5).count() splice must still return 5")
 }
 
+[export, marker(no_coverage)]
+def target_wave4_all_fields_via_access_fold() : int {
+    // Splice target: counts Wave4Row rows through three chained single-field `_where` calls, so all 3 fields (brand, price, year) are referenced via field access. Same all-fields trigger as a compound predicate, but each peeled body has a single field-access node, matching the predicate shape the rest of the suite uses.
+    return _fold(from_decs_template(type<Wave4Row>)
+        ._where(_.brand >= 0)
+        ._where(_.price >= 0)
+        ._where(_.year > 0)
+        .count())
+}
+
+[test]
+def test_wave4_all_fields_via_access_parity(t : T?) {
+    fixture_wave4(10)
+    // Expected fixture data (fixture_wave4 is defined outside this hunk): brand = i%4 (always >= 0), price = i*10 (always >= 0), year = 2000+(i%25) (always > 0) → every predicate holds, so all 10 rows match.
+    t |> equal(target_wave4_all_fields_via_access_fold(), 10, "all-fields-access predicate must return all rows")
+}
+
+[test]
+def test_wave4_all_fields_via_access_splice_shape(t : T?) {
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_wave4_all_fields_via_access_fold)
+        t |> success(func != null, "RTTI must resolve target")
+        if (func == null) return // nothing to inspect; the failure was already recorded by the check above
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched, "qmatch must capture body")
+        // Shape checks on the captured return expression. No slots pruned (all 3 fields read), but the bind is elided and body references iter vars directly. The `>= 2` thresholds below allow one occurrence for the slot plus at least one body reference per iter var.
+        t |> equal(describe_count(body_expr, "get_ro"), 3, "all 3 get_ros present (no slot pruning)")
+        t |> equal(describe_count(body_expr, "decs_tup"), 0, "named-tuple bind elided when no whole-var ref")
+        t |> success(describe_count(body_expr, "wave4_brand") >= 2, "brand iter var read directly (slot + body ref)")
+        t |> success(describe_count(body_expr, "wave4_price") >= 2, "price iter var read directly")
+        t |> success(describe_count(body_expr, "wave4_year") >= 2, "year iter var read directly")
+    }
+}
+
 // ── C2: buffer-required planners (order/reverse/distinct) ──────────────────────
 
 [export, marker(no_coverage)]

From 75df958f06dc6a0066e3a005b069f50b89ebda49 Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 10:07:41 -0700
Subject: [PATCH 4/7] PR #2834 Copilot C3: fix multi-archetype reverse_take
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original test created one A-only and one B-only archetype, but
the query `from_decs_template(type<RevTakeMultiArchA>)` only matches A
— the B archetype never enters for_each_archetype, so the cross-
archetype skipping arm wasn't actually exercised.

Now creates two MATCHING archetypes: both have `rev2_id` (so both
satisfy the query), but the second group also has the rev2_b_* extras
which lands it in a separate archetype class. With A1=4 + A2=5 →
totalCount=9, take(3) → skip=6: A1 (size 4) skipped via the size-sum
arm, A2 enters with skipsLeft=2 → drains 2, pushes 3, returns true.
Exercises both the whole-archetype-skip and partial-archetype +
early-exit paths.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/linq/test_linq_from_decs.das | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/linq/test_linq_from_decs.das b/tests/linq/test_linq_from_decs.das
index 8671766de..2d20b6811 100644
--- a/tests/linq/test_linq_from_decs.das
+++ b/tests/linq/test_linq_from_decs.das
@@ -3198,13 +3198,13 @@ struct RevTakeMultiArchA {
 
 [decs_template(prefix = "rev2_b_")]
 struct RevTakeMultiArchB {
-    bid : int
+    bid  : int
     bval : int
 }
 
 [test]
 def test_reverse_take_multi_archetype_parity(t : T?) {
-    // Creates two archetypes (A with 4 rows, B with 5 rows). A 9-row sum across archetypes; reverse + take(3) must return 3 rows. The unrelated B archetype is filtered out by the from_decs_template request anyway — but its presence in the decs state ensures for_each_archetype iterates across more than one archetype-class even when the request matches just one.
+    // Creates two MATCHING archetypes — both have `rev2_id` so both satisfy `from_decs_template(type<RevTakeMultiArchA>)`. Group A1: just rev2_id. Group A2: rev2_id + extra rev2_b_* components — same query matches, but the extra components land it in a separate archetype. With A1=4 + A2=5 → totalCount=9, take(3) → skip=6: A1 (size 4) skipped entirely via the size-sum arm, A2 enters with skipsLeft=2 → drains 2 then pushes 3 then returns true. Exercises both the whole-archetype-skip and partial-archetype skip-counter + early-exit.
     restart()
     for (i in 0..4) {
         create_entity() @(eid, cmp) {
@@ -3215,22 +3215,25 @@ def test_reverse_take_multi_archetype_parity(t : T?) {
     for (i in 0..5) {
         create_entity() @(eid, cmp) {
             cmp.eid := eid
-            cmp.rev2_b_bid := i + 100
+            cmp.rev2_id := i + 100
+            cmp.rev2_b_bid := i + 1000
             cmp.rev2_b_bval := i * 10
         }
     }
     commit()
     let got <- _fold(from_decs_template(type<RevTakeMultiArchA>).reverse().take(3).to_array())
-    t |> equal(got.length(), 3, "reverse+take(3) on multi-archetype world returns 3 rows from the matching archetype only")
+    t |> equal(got.length(), 3, "reverse+take(3) on two matching archetypes returns 3 rows")
     var idSet : table<int>
     for (r in got) {
         idSet |> insert(r.id)
     }
-    // All returned ids must be from the A archetype (0..3); none from B (100..104).
+    t |> equal(length(idSet), 3, "all 3 ids are distinct")
+    // All returned ids must be from one of the two matching archetypes: A1 (ids 0..3) or A2 (ids 100..104). The exact subset depends on archetype iteration order (a decs-internal detail), so we just check membership.
     for (r in got) {
-        t |> success(r.id >= 0 && r.id < 4, "row id {r.id} must be from RevTakeMultiArchA archetype")
+        let inA1 = r.id >= 0 && r.id < 4
+        let inA2 = r.id >= 100 && r.id < 105
+        t |> success(inA1 || inA2, "row id {r.id} must be from one of the two matching archetypes")
     }
-    t |> equal(length(idSet), 3, "all 3 ids are distinct")
 }
 
 [test]

From 7a7991be373d056df3ed3c20014342025576a962 Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 10:26:27 -0700
Subject: [PATCH 5/7] daslib: drop dead `var res` in json_boost.from_JV<EnumT>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR b42a9524e (daslib: tighten unsafe usage) condensed the enum-name
lookup loop to a single `return unsafe(reinterpret<EnumTT>(ef.value)) if (name == ef.name)`,
which left `var res : auto(EnumTT) = default<EnumT>` as dead code — only
used to bind `EnumTT` (an alias of `EnumT` since `default<EnumT>` has
type `EnumT`).

Drop the dead var and inline the rename: `reinterpret<EnumTT>` →
`reinterpret<EnumT>`. No behavior change.

Verified by `tests/json/test_json_edge.das::test_enum_json` (128 tests
pass; covers both the string-name path and the round-trip path at
:630/:634).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 daslib/json_boost.das | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/daslib/json_boost.das b/daslib/json_boost.das
index 9fab0a3ab..84902d67c 100644
--- a/daslib/json_boost.das
+++ b/daslib/json_boost.das
@@ -233,10 +233,9 @@ def from_JV(v : JsonValue const explicit?; ent : auto(EnumT); defV : EnumT = def
     if (v == null || !((v.value is _string) || (v.value is _number) || (v.value is _longint))) return defV
     if (v.value is _string) {
         let name = v.value as _string
-        var res : auto(EnumTT) = default<EnumT>
         let ti = typeinfo rtti_typeinfo(type<EnumT>)
         for (ef in *ti.enumType) {
-            return unsafe(reinterpret<EnumTT>(ef.value)) if (name == ef.name)
+            return unsafe(reinterpret<EnumT>(ef.value)) if (name == ef.name)
         }
         panic("not a valid enumeration {name} in {typeinfo typename(type<EnumT>)}")
     } else {

From 3e72e39703cdd71d39995537f5e39d5aa80747ea Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 11:58:55 -0700
Subject: [PATCH 6/7] linq_fold: bounded-heap / streaming-min for
 plan_decs_order_family
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For `from_decs._order_by(KEY).take(N).to_array()` with an inline-able key,
emit a bounded heap of size N maintained during the for_each_archetype walk
instead of materializing the full M-element buffer and then partial-sorting.
For `from_decs._order_by(KEY).first()` / `first_or_default(d)`, emit a
streaming-min: single `best` + `seen` flag instead of buf + min_by.

Cuts 100K push_clones (full DecsCar struct + string alloc) down to N
push_clones for the heap fill, plus one clone-assign per later element
that wins the heap test; every other element only pays a compare.

Bench results (INTERP, 100K rows, ns/op):
  sort_first_m4         72.0 → 23.9 (3.0×, BEATS m3f 41.3)
  sort_take_m4          52.1 → 30.6 (1.7×, +7.9 vs m3f 22.7)
  order_take_desc_m4    52.1 → 30.5 (1.7×, +8.2 vs m3f 22.3)
  select_where_order_take_m4   35.1 → 25.1 (1.4×, +3.5 vs m3f 21.6)

Adds two thin re-exports in linq.das (`spliced_push_heap`, `spliced_pop_heap`)
so the splice can call sort_boost::{push,pop}_heap from any user module
without requiring sort_boost directly. The bounded-heap less-test uses a
new `make_inline_less_call` helper that templates the key body twice with
direct operand expressions — no block dispatch on the hot path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 daslib/linq.das                    |  10 +++
 daslib/linq_fold.das               | 121 ++++++++++++++++++++++++++++-
 tests/linq/test_linq_from_decs.das |  16 ++--
 3 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/daslib/linq.das b/daslib/linq.das
index bb7a43270..5634b95e8 100644
--- a/daslib/linq.das
+++ b/daslib/linq.das
@@ -601,6 +601,16 @@ def top_n_by_with_cmp(var a : iterator<auto(TT)>; n : int; cmp : block<(v1 : TT
     return <- buf
 }
 
+def public spliced_push_heap(var buf : array<auto(TT)>; cmp : block<(x, y : TT) : bool>) {
+    //! Thin re-export of sort_boost::push_heap (forwards ``buf`` and ``cmp`` unchanged) so plan_decs_order_family's bounded-heap splice can call it via _:: from any user module without requiring sort_boost directly. The splice calls it right after placing the new element in the last slot of ``buf``.
+    sort_boost::push_heap(buf, cmp)
+}
+
+def public spliced_pop_heap(var buf : array<auto(TT)>; cmp : block<(x, y : TT) : bool>) {
+    //! Thin re-export of sort_boost::pop_heap (forwards ``buf`` and ``cmp`` unchanged) for the bounded-heap splice (see spliced_push_heap). The splice overwrites the last slot of ``buf`` right after this call — presumably pop_heap moves the heap top there; confirm against sort_boost.
+    sort_boost::pop_heap(buf, cmp)
+}
+
 def unique_key(a) {
     //! generates unique key of workhorse type for the value
     static_if (typeinfo is_workhorse(a)) {
diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index 143a8b37d..f953efea3 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -1200,12 +1200,35 @@ def private try_make_inline_cmp(orderKey : Expression?; orderName : string;
     } else {
         cmpExpr = qmacro(_::less($e(b1), $e(b2)))
     }
-    // Emit untyped block args (`$(v1, v2) { ... }`). The typer infers v1/v2 types from
+    // Emit untyped block args (`$(v1, v2) { ... }`); typer infers v1/v2 types from the call site (top_n_by_with_cmp / order_inplace cmp-param signatures).
     return qmacro($(v1, v2) {
         return $e(cmpExpr)
     })
 }
 
+def private make_inline_less_call(orderKey : Expression?; orderName : string;
+                                  var lhsExpr, rhsExpr : Expression?; at : LineInfo) : Expression? {
+    // Direct-call mirror of try_make_inline_cmp: builds `less(KEY_BODY[lhsExpr], KEY_BODY[rhsExpr])` (or flipped for descending) for inline use without a block dispatch — needed for the bounded-heap less-test where invoking a block on a 100K-element hot path costs ~5ns/op extra. Returns null unless orderKey is a one-argument block whose body is a single side-effect-free `return`. NOTE(review): `at` is not referenced in this body — the templates are applied at b1.at / b2.at.
+    if (orderKey == null || !(orderKey is ExprMakeBlock)) return null // key must be a block literal
+    var mblk = orderKey as ExprMakeBlock
+    var blk = mblk._block as ExprBlock
+    if (blk.arguments |> length != 1 || blk.list |> length != 1 // exactly one argument and one statement...
+            || !(blk.list[0] is ExprReturn)) return null // ...and that statement must be a `return`
+    var ret = blk.list[0] as ExprReturn
+    if (ret.subexpr == null || has_sideeffects(ret.subexpr)) return null // the key body is duplicated below (once per operand), so side-effecting keys are rejected
+    let argName = string(blk.arguments[0].name)
+    var b1 = clone_expression(ret.subexpr) // independent copy of the key body for the lhs operand
+    var b2 = clone_expression(ret.subexpr) // independent copy of the key body for the rhs operand
+    var r1 : Template
+    r1 |> replaceVariable(argName, lhsExpr) // b1: block argument → lhsExpr
+    var r2 : Template
+    r2 |> replaceVariable(argName, rhsExpr) // b2: block argument → rhsExpr
+    apply_template(r1, b1.at, b1)
+    apply_template(r2, b2.at, b2)
+    return qmacro(_::less($e(b2), $e(b1))) if (orderName == "order_by_descending") // descending: same test with the operands swapped
+    return qmacro(_::less($e(b1), $e(b2)))
+}
+
 [macro_function]
 def private plan_order_family(var expr : Expression?) : Expression? {
     var (top, calls) = flatten_linq(expr)
@@ -4424,6 +4447,99 @@ def private plan_decs_order_family(var expr : Expression?) : Expression? {
     if (hasKey) {
         inlineCmp = try_make_inline_cmp(orderKey, orderName, elemType, at)
     }
+    // Bounded-heap / streaming-min fast paths: when the key is inline-able, skip the materialize-all + min_by/top_n pattern in favor of a per-walk state (single best for first[_or_default], heap of size N for take). Slashes 100K push_clones to ~N — the rest of the elements only pay a cmp.
+    let useBoundedHeap = takeExpr != null && inlineCmp != null && firstName == ""
+    let useStreamingMin = firstName != "" && inlineCmp != null
+    let archName = bridge.archName
+    let needIterWrap = expr._type.isIterator
+    var emission : Expression?
+    if (useStreamingMin) {
+        let bestName = qn("decs_best", at)
+        let seenName = qn("decs_seen", at)
+        var lessTest = make_inline_less_call(orderKey, orderName,
+            qmacro($i(tupName)), qmacro($i(bestName)), at)
+        var perElement : Expression? = qmacro_expr() {
+            if (!$i(seenName)) {
+                $i(bestName) := $i(tupName)
+                $i(seenName) = true
+            } elif ($e(lessTest)) {
+                $i(bestName) := $i(tupName)
+            }
+        }
+        if (whereCond != null) {
+            perElement = qmacro_expr() {
+                if ($e(whereCond)) {
+                    $e(perElement)
+                }
+            }
+        }
+        var forExprNode = build_decs_inner_for_pruned(bridge, tupName, perElement, at)
+        if (firstName == "first") {
+            emission = qmacro(invoke($() : $t(elemType) {
+                var $i(bestName) = default<$t(elemType)>
+                var $i(seenName) = false
+                for_each_archetype($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) {
+                    $e(forExprNode)
+                })
+                panic("sequence contains no elements") if (!$i(seenName))
+                return $i(bestName)
+            }))
+        } else {
+            let dBindName = qn("order_d", at)
+            emission = qmacro(invoke($() : $t(elemType) {
+                let $i(dBindName) = $e(firstDefaultExpr)
+                var $i(bestName) = default<$t(elemType)>
+                var $i(seenName) = false
+                for_each_archetype($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) {
+                    $e(forExprNode)
+                })
+                return $i(bestName) if ($i(seenName))
+                return $i(dBindName)
+            }))
+        }
+        return finalize_decs_emission(emission, at, false)
+    }
+    if (useBoundedHeap) {
+        let takeNName = qn("decs_take_n", at)
+        // Direct less-test on the hot path: `less(KEY[decs_tup], KEY[buf[0]])` inlined, no block dispatch.
+        var lessTest = make_inline_less_call(orderKey, orderName,
+            qmacro($i(tupName)), qmacro($i(bufName)[0]), at)
+        var perElement : Expression? = qmacro_expr() {
+            if (length($i(bufName)) < $i(takeNName)) {
+                $i(bufName) |> push_clone($i(tupName))
+                _::spliced_push_heap($i(bufName), $e(inlineCmp))
+            } elif ($e(lessTest)) {
+                _::spliced_pop_heap($i(bufName), $e(inlineCmp))
+                $i(bufName)[length($i(bufName)) - 1] := $i(tupName)
+                _::spliced_push_heap($i(bufName), $e(inlineCmp))
+            }
+        }
+        if (whereCond != null) {
+            perElement = qmacro_expr() {
+                if ($e(whereCond)) {
+                    $e(perElement)
+                }
+            }
+        }
+        var forExprNode = build_decs_inner_for_pruned(bridge, tupName, perElement, at)
+        var bhStmts : array<Expression?>
+        bhStmts |> reserve(7)
+        bhStmts |> push_from <| qmacro_block_to_array() {
+            let $i(takeNName) = $e(takeExpr)
+            var $i(bufName) : array<$t(elemType)>
+            return <- $i(bufName) if ($i(takeNName) <= 0)
+            $i(bufName) |> reserve($i(takeNName))
+            for_each_archetype($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) {
+                $e(forExprNode)
+            })
+            _::order_inplace($i(bufName), $e(inlineCmp))
+            return <- $i(bufName)
+        }
+        emission = qmacro(invoke($() : array<$t(elemType)> {
+            $b(bhStmts)
+        }))
+        return finalize_decs_emission(emission, at, needIterWrap)
+    }
     var perElement : Expression? = qmacro_expr() {
         $i(bufName) |> push_clone($i(tupName))
     }
@@ -4435,7 +4551,6 @@ def private plan_decs_order_family(var expr : Expression?) : Expression? {
         }
     }
     var forExprNode = build_decs_inner_for_pruned(bridge, tupName, perElement, at)
-    let archName = bridge.archName
     var bodyStmts : array<Expression?>
     bodyStmts |> reserve(5)
     bodyStmts |> push_from <| qmacro_block_to_array() {
@@ -4444,8 +4559,6 @@ def private plan_decs_order_family(var expr : Expression?) : Expression? {
             $e(forExprNode)
         })
     }
-    let needIterWrap = expr._type.isIterator
-    var emission : Expression?
     if (firstName == "first") {
         // order + first → min/max on buffer. Empty buf must panic to match eager `first()` semantics.
         bodyStmts |> push <| qmacro_expr() {
diff --git a/tests/linq/test_linq_from_decs.das b/tests/linq/test_linq_from_decs.das
index 2d20b6811..10e25b6e4 100644
--- a/tests/linq/test_linq_from_decs.das
+++ b/tests/linq/test_linq_from_decs.das
@@ -1987,8 +1987,10 @@ def test_unroll5d_order_by_take_splice_shape(t : T?) {
         t |> equal(describe_count(body_expr, "for_each_archetype_find"), 0, "order family uses for_each_archetype (no early-exit needed)")
         // Buffer hoisted ABOVE for_each_archetype so it survives the archetype walk.
         t |> success(describe_count(body_expr, "decs_buf") >= 2, "decs_buf declared + populated")
-        // top_n_by call replaces the array-side to_array / sort sequence.
-        t |> equal(describe_count(body_expr, "top_n_by"), 1, "splice dispatches to top_n_by")
+        // Bounded-heap emit: spliced_push_heap appears twice (fill + replace), spliced_pop_heap once (replace). top_n_by is no longer used.
+        t |> equal(describe_count(body_expr, "spliced_push_heap"), 2, "bounded-heap fill + replace each call spliced_push_heap")
+        t |> equal(describe_count(body_expr, "spliced_pop_heap"), 1, "bounded-heap replace calls spliced_pop_heap once")
+        t |> equal(describe_count(body_expr, "top_n_by"), 0, "bounded-heap path does NOT call top_n_by")
     }
 }
 
@@ -2005,10 +2007,12 @@ def test_unroll5d_order_by_first_splice_shape(t : T?) {
         t |> success(r.matched, "qmatch must capture body")
         t |> equal(describe_count(body_expr, "to_sequence"), 0, "order_by+first splice must NOT fall to tier-2 to_sequence")
         t |> equal(describe_count(body_expr, "for_each_archetype"), 1, "exactly one for_each_archetype")
-        // first → min_by on the buffer (NOT top_n_by + index).
-        t |> equal(describe_count(body_expr, "min_by"), 1, "order_by+first emits min_by call")
-        t |> equal(describe_count(body_expr, "top_n_by"), 0, "order_by+first should NOT use top_n_by")
-        // Empty-buffer panic guard present.
+        // Streaming-min emit: best + seen state vars, no buffer + min_by/top_n.
+        t |> equal(describe_count(body_expr, "min_by"), 0, "streaming-min path does NOT call min_by")
+        t |> equal(describe_count(body_expr, "top_n_by"), 0, "streaming-min path does NOT call top_n_by")
+        t |> success(describe_count(body_expr, "decs_best") >= 2, "best state var declared + updated")
+        t |> success(describe_count(body_expr, "decs_seen") >= 2, "seen flag declared + updated")
+        // Empty-source panic guard preserved.
         t |> equal(describe_count(body_expr, "sequence contains no elements"), 1, "panic-on-empty guard for first()")
     }
 }

From 761f605371b2f647a93dddc55928c00b7010016c Mon Sep 17 00:00:00 2001
From: Boris Batkin <bbatkin@gmail.com>
Date: Sat, 23 May 2026 12:27:10 -0700
Subject: [PATCH 7/7] PR #2837 Copilot: drop reserve(takeN) on bounded-heap buf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

reserve(takeN) for a user-supplied take(N) on a decs source of unknown
cardinality risks a large upfront allocation when N >> actual source
size — same OOM trap that top_n_by_with_cmp's iterator variant already
documents (linq.das:482-484). The fill phase grows geometrically to
min(N, M) in O(log min(N, M)) reallocs anyway, and at our common N
(≤100) the bench delta is in the noise floor (sort_take_m4 30.6 → 29.8,
within measurement noise).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 daslib/linq_fold.das | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index f953efea3..ac98b43d0 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -4524,11 +4524,11 @@ def private plan_decs_order_family(var expr : Expression?) : Expression? {
         var forExprNode = build_decs_inner_for_pruned(bridge, tupName, perElement, at)
         var bhStmts : array<Expression?>
         bhStmts |> reserve(7)
+        // No `reserve(takeN)` on the bounded buf — matches the policy in linq.das top_n_by_with_cmp iterator variant. Caller may pass takeN >> actual source size, and the decs cardinality is unknown ahead of the walk; pre-reserving N slots would risk a large upfront allocation for no win (fill phase grows geometrically to min(N, M) in O(log) reallocs anyway).
         bhStmts |> push_from <| qmacro_block_to_array() {
             let $i(takeNName) = $e(takeExpr)
             var $i(bufName) : array<$t(elemType)>
             return <- $i(bufName) if ($i(takeNName) <= 0)
-            $i(bufName) |> reserve($i(takeNName))
             for_each_archetype($e(bridge.reqHashExpr), $e(bridge.erqExpr), $($i(archName) : Archetype) {
                 $e(forExprNode)
             })