From 9a9483697445b4ab52203a7d280500131466ea50 Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 6 May 2026 12:33:18 +0000 Subject: [PATCH 1/3] yeast: Add per-rule .repeated() flag to opt into iterative matching Previously, after a rule fired the engine would always re-try that same rule on the result root. A rule whose output matched its own query (intentionally or by accident) would loop until the global MAX_REWRITE_DEPTH safety net kicked in. Make the default behavior fire-once-per-node: after a rule fires on node N, the engine no longer tries that same rule on the result root. Other rules and child traversal are unaffected. Rules that intentionally rewrite iteratively can opt into the old behavior via the new Rule::repeated() builder method. Add two regression tests using a self-swapping assignment rule: - with .repeated(), the swap loops and trips the depth limit - without it (default), the swap fires once and terminates --- shared/yeast/doc/yeast.md | 16 +++++++++++ shared/yeast/src/lib.rs | 38 ++++++++++++++++++++++--- shared/yeast/tests/test.rs | 57 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 4 deletions(-) diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index d49ff96f11df..45e0006f909c 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -61,6 +61,22 @@ rule matches, the node is kept and its children are processed recursively. A rule can replace one node with zero nodes (deletion), one node (rewriting), or multiple nodes (expansion). +By default a rule fires **at most once on a given node**: after firing, the +engine will not re-try that same rule on the result root. Other rules may +still fire on the result, and the rule may still fire on different nodes +(including the result's children). To opt into iterative behaviour — when a +rule's output is intentionally re-matched by the same rule — call +`.repeated()` on the constructed `Rule`: + +```rust +let r = yeast::rule!((foo ...) => (foo ...)).repeated(); +``` + +Without `.repeated()`, a rule whose output happens to match its own query +simply fires once and stops. With `.repeated()`, the rule is allowed to +re-match indefinitely; the runner still enforces a global rewrite-depth +limit (currently 100) as a safety net against accidental cycles. + ## Query language Queries use a syntax inspired by diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 46629e198406..0c66a0c85210 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -471,11 +471,29 @@ pub type Transform = Box< pub struct Rule { query: QueryNode, transform: Transform, + /// If true, after this rule fires on a node the engine will try to + /// re-apply this same rule on the result root. Defaults to false: + /// each rule fires at most once on a given node, which prevents + /// accidental loops where a rule's output matches its own query. + repeated: bool, } impl Rule { pub fn new(query: QueryNode, transform: Transform) -> Self { - Self { query, transform } + Self { + query, + transform, + repeated: false, + } + } + + /// Mark this rule as allowed to fire multiple times on the same node. + /// Use when the rule is intentionally iterative (its output may match + /// its own query). Without this, a rule fires at most once per node; + /// other rules can still fire on the result. + pub fn repeated(mut self) -> Self { + self.repeated = true; + self } fn try_rule( @@ -537,7 +555,7 @@ fn apply_rules( fresh: &tree_builder::FreshScope, ) -> Result, String> { let index = RuleIndex::new(rules); - apply_rules_inner(&index, ast, id, fresh, 0) + apply_rules_inner(&index, ast, id, fresh, 0, None) } fn apply_rules_inner( @@ -546,6 +564,7 @@ fn apply_rules_inner( id: Id, fresh: &tree_builder::FreshScope, rewrite_depth: usize, + skip_rule: Option<*const Rule>, ) -> Result, String> { if rewrite_depth > MAX_REWRITE_DEPTH { return Err(format!( @@ -556,7 +575,16 @@ fn apply_rules_inner( let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); for rule in index.rules_for_kind(node_kind) { + let rule_ptr = *rule as *const Rule; + if Some(rule_ptr) == skip_rule { + continue; + } if let Some(result_node) = rule.try_rule(ast, id, fresh)? { + // For non-repeated rules, suppress further application of *this* + // rule on the result root, so a rule whose output matches its own + // query doesn't loop. Other rules and child traversal are + // unaffected. + let next_skip = if rule.repeated { None } else { Some(rule_ptr) }; let mut results = Vec::new(); for node in result_node { results.extend(apply_rules_inner( @@ -565,6 +593,7 @@ fn apply_rules_inner( node, fresh, rewrite_depth + 1, + next_skip, )?); } return Ok(results); @@ -579,13 +608,14 @@ fn apply_rules_inner( .collect(); // recursively descend into all the fields - // Child traversal does not increment rewrite depth + // Child traversal does not increment rewrite depth and starts fresh + // (no rule is skipped on child subtrees). let mut changed = false; let mut new_fields = BTreeMap::new(); for (field_id, children) in field_entries { let mut new_children = Vec::new(); for child_id in children { - let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?; + let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; if result.len() != 1 || result[0] != child_id { changed = true; } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index e4485857bff1..aebda65eae4d 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -22,6 +22,18 @@ fn run_and_dump(input: &str, rules: Vec) -> String { dump_ast(&ast, ast.get_root(), input) } +/// Helper: like `run_and_dump`, but returns the runner error (if any) +/// instead of unwrapping. +fn run_and_get_error(input: &str, rules: Vec) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let runner = Runner::with_schema(lang, &schema, &rules); + runner + .run(input) + .expect_err("expected runner to return an error") +} + /// Assert that a dump equals the expected string, treating the expected /// string as an indented multiline literal: leading/trailing blank lines /// are stripped, and the common leading indentation is removed from every @@ -382,6 +394,51 @@ fn test_chained_rules_output_only_kind() { ); } +// A rule that swaps `assignment.left` and `assignment.right`. Each +// application produces another `assignment` whose query the rule +// matches again, so without the once-per-node default it would loop. +fn swap_assignment_rule() -> Rule { + yeast::rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ) + ) +} + +#[test] +fn test_repeated_rule_hits_depth_limit() { + // With `.repeated()` the rule is allowed to fire on its own output, + // which cycles forever and trips the rewrite-depth safety net. + let err = run_and_get_error("x = 1", vec![swap_assignment_rule().repeated()]); + assert!( + err.contains("exceeded maximum rewrite depth"), + "expected depth-limit error, got: {err}" + ); +} + +#[test] +fn test_default_rule_fires_at_most_once_per_node() { + // Without `.repeated()` (the default), a rule fires at most once on a + // given node. The swap therefore happens exactly once and the desugaring + // terminates cleanly. + let dump = run_and_dump("x = 1", vec![swap_assignment_rule()]); + assert_dump_eq( + &dump, + r#" + program + assignment + left: integer "1" + right: identifier "x" + "#, + ); +} + // ---- Cursor tests ---- #[test] From 957c89b478ded5d3d753e7dff861b6b1584c582f Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 6 May 2026 15:30:54 +0000 Subject: [PATCH 2/3] yeast: Support multi-phase desugaring via DesugaringConfig::add_phase Extend the desugaring config from a single flat list of rules to an ordered sequence of named Phases. Each phase runs to completion (a full traversal applying its rules) before the next phase starts. Rules in different phases never compete for matches. The config is built via the new chainable API: DesugaringConfig::new() .add_phase("cleanup", cleanup_rules) .add_phase("desugar", desugar_rules) .with_output_node_types_yaml(yaml); Single-phase configs are just .add_phase(...) called once. A single FreshScope is shared across phases so generated identifier names (e.g. $tmp-N) are unique throughout the run. Phase names appear in error messages, e.g. "Phase `desugar`: exceeded maximum rewrite depth". Add two regression tests: one verifying basic two-phase chained desugaring, and one verifying that errors include the failing phase name. --- shared/yeast/doc/yeast.md | 14 ++++- shared/yeast/src/lib.rs | 109 +++++++++++++++++++++++++------------ shared/yeast/tests/test.rs | 76 +++++++++++++++++++++++++- 3 files changed, 158 insertions(+), 41 deletions(-) diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 45e0006f909c..bc243a4b274c 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -319,11 +319,16 @@ capture name to a field of the same name on the output node. ## Integration with the extractor A YEAST desugaring pass is configured with a [`DesugaringConfig`], which -carries the rules and an optional output node-types schema (in YAML -format). Attach it to a language spec to enable rewriting: +carries one or more named [`Phase`]s of rules and an optional output +node-types schema (in YAML format). Each phase is a complete traversal +that runs to completion before the next phase starts; rules in different +phases never compete for matches. Attach the config to a language spec +to enable rewriting: ```rust -let desugar = yeast::DesugaringConfig::new(my_rules) +let desugar = yeast::DesugaringConfig::new() + .add_phase("cleanup", cleanup_rules()) + .add_phase("desugar", desugar_rules()) .with_output_node_types_yaml(include_str!("output-node-types.yml")); let lang = simple::LanguageSpec { @@ -335,6 +340,9 @@ let lang = simple::LanguageSpec { }; ``` +A single-phase config is just `.add_phase(...)` called once. Phase names +appear in error messages so you can tell which phase failed. + The same YAML node-types is used for both the runtime yeast `Schema` (so rules can refer to output-only kinds and fields) and TRAP validation (it is converted to JSON internally). diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 0c66a0c85210..8c0aa6545b5e 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -635,16 +635,46 @@ fn apply_rules_inner( Ok(vec![ast.nodes.len() - 1]) } -/// Configuration for a desugaring pass: a set of rules and an optional -/// output node-types schema (in YAML format). +/// One phase of a desugaring pass: a named bundle of rules that runs to +/// completion (a full traversal applying its rules) before the next phase +/// starts. Rules within a phase compete for matches as usual; rules in +/// different phases never compete because they don't see each other's input. +pub struct Phase { + /// Name used in error messages. + pub name: String, + pub rules: Vec, +} + +impl Phase { + pub fn new(name: impl Into, rules: Vec) -> Self { + Self { + name: name.into(), + rules, + } + } +} + +/// Configuration for a desugaring pass: an ordered list of [`Phase`]s and +/// an optional output node-types schema (in YAML format). /// /// When attached to a `LanguageSpec` (in the shared tree-sitter extractor), /// enables yeast-based AST rewriting before TRAP extraction. The same YAML /// is used both to validate TRAP output (via JSON conversion) and to /// resolve output-only node kinds and fields at runtime. +/// +/// Construct with `DesugaringConfig::new()` and add phases via +/// `add_phase`: +/// +/// ```ignore +/// let config = yeast::DesugaringConfig::new() +/// .add_phase("cleanup", cleanup_rules) +/// .add_phase("desugar", desugar_rules) +/// .with_output_node_types_yaml(yaml); +/// ``` +#[derive(Default)] pub struct DesugaringConfig { - /// Rules to apply during desugaring. - pub rules: Vec, + /// Phases of rule application, applied in order. + pub phases: Vec, /// Output node-types in YAML format. If `None`, the input grammar's /// node types are used (i.e. the desugared AST has the same node types /// as the tree-sitter grammar). @@ -652,11 +682,16 @@ pub struct DesugaringConfig { } impl DesugaringConfig { - pub fn new(rules: Vec) -> Self { - Self { - rules, - output_node_types_yaml: None, - } + /// Create an empty configuration. Add phases via [`add_phase`] and an + /// optional output schema via [`with_output_node_types_yaml`]. + pub fn new() -> Self { + Self::default() + } + + /// Append a new phase with the given name and rules. + pub fn add_phase(mut self, name: impl Into, rules: Vec) -> Self { + self.phases.push(Phase::new(name, rules)); + self } pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self { @@ -678,17 +713,17 @@ impl DesugaringConfig { pub struct Runner<'a> { language: tree_sitter::Language, schema: schema::Schema, - rules: &'a [Rule], + phases: &'a [Phase], } impl<'a> Runner<'a> { /// Create a runner using the input grammar's schema for output. - pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self { + pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self { let schema = schema::Schema::from_language(&language); Self { language, schema, - rules, + phases, } } @@ -696,12 +731,12 @@ impl<'a> Runner<'a> { pub fn with_schema( language: tree_sitter::Language, schema: &schema::Schema, - rules: &'a [Rule], + phases: &'a [Phase], ) -> Self { Self { language, schema: schema.clone(), - rules, + phases, } } @@ -714,27 +749,17 @@ impl<'a> Runner<'a> { Ok(Self { language, schema, - rules: &config.rules, + phases: &config.phases, }) } pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result { - let fresh = tree_builder::FreshScope::new(); let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language); - let root = ast.get_root(); - let res = apply_rules(self.rules, &mut ast, root, &fresh)?; - if res.len() != 1 { - return Err(format!( - "Expected exactly one result node, got {}", - res.len() - )); - } - ast.set_root(res[0]); + self.run_phases(&mut ast)?; Ok(ast) } pub fn run(&self, input: &str) -> Result { - let fresh = tree_builder::FreshScope::new(); let mut parser = tree_sitter::Parser::new(); parser .set_language(&self.language) @@ -743,15 +768,29 @@ impl<'a> Runner<'a> { .parse(input, None) .ok_or_else(|| "Failed to parse input".to_string())?; let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language); - let root = ast.get_root(); - let res = apply_rules(self.rules, &mut ast, root, &fresh)?; - if res.len() != 1 { - return Err(format!( - "Expected exactly one result node, got {}", - res.len() - )); - } - ast.set_root(res[0]); + self.run_phases(&mut ast)?; Ok(ast) } + + /// Apply each phase in turn to the AST, threading the root through. + /// A single `FreshScope` is shared across phases so that fresh + /// identifiers generated in different phases don't collide. + fn run_phases(&self, ast: &mut Ast) -> Result<(), String> { + let fresh = tree_builder::FreshScope::new(); + let mut root = ast.get_root(); + for phase in self.phases { + let res = apply_rules(&phase.rules, ast, root, &fresh) + .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; + if res.len() != 1 { + return Err(format!( + "Phase `{}`: expected exactly one result node, got {}", + phase.name, + res.len() + )); + } + root = res[0]; + } + ast.set_root(root); + Ok(()) + } } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index aebda65eae4d..badfe9ba5cbc 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -12,12 +12,19 @@ fn parse_and_dump(input: &str) -> String { dump_ast(&ast, ast.get_root(), input) } -/// Helper: parse Ruby source with a custom output schema and rules, return dump. +/// Helper: parse Ruby source with a custom output schema and a single +/// phase of rules, return dump. fn run_and_dump(input: &str, rules: Vec) -> String { + run_phased_and_dump(input, vec![Phase::new("test", rules)]) +} + +/// Helper: parse Ruby source with a custom output schema and multiple +/// rule phases, return dump. +fn run_phased_and_dump(input: &str, phases: Vec) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let runner = Runner::with_schema(lang, &schema, &rules); + let runner = Runner::with_schema(lang, &schema, &phases); let ast = runner.run(input).unwrap(); dump_ast(&ast, ast.get_root(), input) } @@ -28,7 +35,8 @@ fn run_and_get_error(input: &str, rules: Vec) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let runner = Runner::with_schema(lang, &schema, &rules); + let phases = vec![Phase::new("test", rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); runner .run(input) .expect_err("expected runner to return an error") @@ -439,6 +447,68 @@ fn test_default_rule_fires_at_most_once_per_node() { ); } +// ---- Phase tests ---- + +#[test] +fn test_phased_desugaring() { + // Two phases that could equally have been a single one with chained + // rules. Splitting them makes the intent (cleanup, then desugar) + // explicit and provides per-phase error messages. + let cleanup = vec![yeast::rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => first_node + )]; + let desugar = vec![yeast::rule!( + (first_node + left: (_) @left + right: (_) @right + ) + => second_node + )]; + + let dump = run_phased_and_dump( + "x = 1", + vec![ + Phase::new("cleanup", cleanup), + Phase::new("desugar", desugar), + ], + ); + assert_dump_eq( + &dump, + r#" + program + second_node + left: identifier "x" + right: integer "1" + "#, + ); +} + +#[test] +fn test_phase_error_includes_phase_name() { + // A repeated rule that loops; the error message should identify the + // phase that tripped the depth limit. + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let phases = vec![Phase::new("buggy", vec![swap_assignment_rule().repeated()])]; + let runner = Runner::with_schema(lang, &schema, &phases); + let err = runner + .run("x = 1") + .expect_err("expected runner to return an error"); + assert!( + err.contains("Phase `buggy`"), + "error should mention the failing phase, got: {err}" + ); + assert!( + err.contains("exceeded maximum rewrite depth"), + "error should mention the depth limit, got: {err}" + ); +} + // ---- Cursor tests ---- #[test] From e0d663f79b93f9c7aac22253d1b1a8a6c956a42e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 12:35:46 +0000 Subject: [PATCH 3/3] yeast: address review wording in phase docs Agent-Logs-Url: https://github.com/github/codeql/sessions/6d23db05-a6e9-4de4-8951-b465980fd0ef Co-authored-by: tausbn <1104778+tausbn@users.noreply.github.com> --- shared/yeast/doc/yeast.md | 7 ++++--- shared/yeast/src/lib.rs | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index bc243a4b274c..130d722b96ea 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -321,8 +321,9 @@ capture name to a field of the same name on the output node. A YEAST desugaring pass is configured with a [`DesugaringConfig`], which carries one or more named [`Phase`]s of rules and an optional output node-types schema (in YAML format). Each phase is a complete traversal -that runs to completion before the next phase starts; rules in different -phases never compete for matches. Attach the config to a language spec +that runs to completion before the next phase starts; only the current +phase's rules are considered during that traversal. Attach the config to +a language spec to enable rewriting: ```rust @@ -349,5 +350,5 @@ is converted to JSON internally). For the dbscheme/QL code generator, set `Language::desugar` to a `DesugaringConfig` carrying the same YAML; the generator converts it to -JSON for downstream code generation. The `rules` field of the config is +JSON for downstream code generation. The `phases` field of the config is unused at code-generation time. diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 8c0aa6545b5e..0f0ea45a9857 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -638,7 +638,8 @@ fn apply_rules_inner( /// One phase of a desugaring pass: a named bundle of rules that runs to /// completion (a full traversal applying its rules) before the next phase /// starts. Rules within a phase compete for matches as usual; rules in -/// different phases never compete because they don't see each other's input. +/// different phases never compete because each traversal only considers the +/// current phase's rules. pub struct Phase { /// Name used in error messages. pub name: String,