diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index d49ff96f11df..130d722b96ea 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -61,6 +61,22 @@ rule matches, the node is kept and its children are processed recursively. A rule can replace one node with zero nodes (deletion), one node (rewriting), or multiple nodes (expansion). +By default a rule fires **at most once on a given node**: after firing, the +engine will not re-try that same rule on the result root. Other rules may +still fire on the result, and the rule may still fire on different nodes +(including the result's children). To opt into iterative behaviour — when a +rule's output is intentionally re-matched by the same rule — call +`.repeated()` on the constructed `Rule`: + +```rust +let r = yeast::rule!((foo ...) => (foo ...)).repeated(); +``` + +Without `.repeated()`, a rule whose output happens to match its own query +simply fires once and stops. With `.repeated()`, the rule is allowed to +re-match indefinitely; the runner still enforces a global rewrite-depth +limit (currently 100) as a safety net against accidental cycles. + ## Query language Queries use a syntax inspired by @@ -303,11 +319,17 @@ capture name to a field of the same name on the output node. ## Integration with the extractor A YEAST desugaring pass is configured with a [`DesugaringConfig`], which -carries the rules and an optional output node-types schema (in YAML -format). Attach it to a language spec to enable rewriting: +carries one or more named [`Phase`]s of rules and an optional output +node-types schema (in YAML format). Each phase is a complete traversal +that runs to completion before the next phase starts; only the current +phase's rules are considered during that traversal. 
Attach the config to +a language spec +to enable rewriting: ```rust -let desugar = yeast::DesugaringConfig::new(my_rules) +let desugar = yeast::DesugaringConfig::new() + .add_phase("cleanup", cleanup_rules()) + .add_phase("desugar", desugar_rules()) .with_output_node_types_yaml(include_str!("output-node-types.yml")); let lang = simple::LanguageSpec { @@ -319,11 +341,14 @@ let lang = simple::LanguageSpec { }; ``` +A single-phase config is just `.add_phase(...)` called once. Phase names +appear in error messages so you can tell which phase failed. + The same YAML node-types is used for both the runtime yeast `Schema` (so rules can refer to output-only kinds and fields) and TRAP validation (it is converted to JSON internally). For the dbscheme/QL code generator, set `Language::desugar` to a `DesugaringConfig` carrying the same YAML; the generator converts it to -JSON for downstream code generation. The `rules` field of the config is +JSON for downstream code generation. The `phases` field of the config is unused at code-generation time. diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 46629e198406..0f0ea45a9857 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -471,11 +471,29 @@ pub type Transform = Box< pub struct Rule { query: QueryNode, transform: Transform, + /// If true, after this rule fires on a node the engine will try to + /// re-apply this same rule on the result root. Defaults to false: + /// each rule fires at most once on a given node, which prevents + /// accidental loops where a rule's output matches its own query. + repeated: bool, } impl Rule { pub fn new(query: QueryNode, transform: Transform) -> Self { - Self { query, transform } + Self { + query, + transform, + repeated: false, + } + } + + /// Mark this rule as allowed to fire multiple times on the same node. + /// Use when the rule is intentionally iterative (its output may match + /// its own query). 
Without this, a rule fires at most once per node; + /// other rules can still fire on the result. + pub fn repeated(mut self) -> Self { + self.repeated = true; + self } fn try_rule( @@ -537,7 +555,7 @@ fn apply_rules( fresh: &tree_builder::FreshScope, ) -> Result, String> { let index = RuleIndex::new(rules); - apply_rules_inner(&index, ast, id, fresh, 0) + apply_rules_inner(&index, ast, id, fresh, 0, None) } fn apply_rules_inner( @@ -546,6 +564,7 @@ fn apply_rules_inner( id: Id, fresh: &tree_builder::FreshScope, rewrite_depth: usize, + skip_rule: Option<*const Rule>, ) -> Result, String> { if rewrite_depth > MAX_REWRITE_DEPTH { return Err(format!( @@ -556,7 +575,16 @@ fn apply_rules_inner( let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); for rule in index.rules_for_kind(node_kind) { + let rule_ptr = *rule as *const Rule; + if Some(rule_ptr) == skip_rule { + continue; + } if let Some(result_node) = rule.try_rule(ast, id, fresh)? { + // For non-repeated rules, suppress further application of *this* + // rule on the result root, so a rule whose output matches its own + // query doesn't loop. Other rules and child traversal are + // unaffected. + let next_skip = if rule.repeated { None } else { Some(rule_ptr) }; let mut results = Vec::new(); for node in result_node { results.extend(apply_rules_inner( @@ -565,6 +593,7 @@ fn apply_rules_inner( node, fresh, rewrite_depth + 1, + next_skip, )?); } return Ok(results); @@ -579,13 +608,14 @@ fn apply_rules_inner( .collect(); // recursively descend into all the fields - // Child traversal does not increment rewrite depth + // Child traversal does not increment rewrite depth and starts fresh + // (no rule is skipped on child subtrees). 
let mut changed = false; let mut new_fields = BTreeMap::new(); for (field_id, children) in field_entries { let mut new_children = Vec::new(); for child_id in children { - let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?; + let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth, None)?; if result.len() != 1 || result[0] != child_id { changed = true; } @@ -605,16 +635,47 @@ fn apply_rules_inner( Ok(vec![ast.nodes.len() - 1]) } -/// Configuration for a desugaring pass: a set of rules and an optional -/// output node-types schema (in YAML format). +/// One phase of a desugaring pass: a named bundle of rules that runs to +/// completion (a full traversal applying its rules) before the next phase +/// starts. Rules within a phase compete for matches as usual; rules in +/// different phases never compete because each traversal only considers the +/// current phase's rules. +pub struct Phase { + /// Name used in error messages. + pub name: String, + pub rules: Vec, +} + +impl Phase { + pub fn new(name: impl Into, rules: Vec) -> Self { + Self { + name: name.into(), + rules, + } + } +} + +/// Configuration for a desugaring pass: an ordered list of [`Phase`]s and +/// an optional output node-types schema (in YAML format). /// /// When attached to a `LanguageSpec` (in the shared tree-sitter extractor), /// enables yeast-based AST rewriting before TRAP extraction. The same YAML /// is used both to validate TRAP output (via JSON conversion) and to /// resolve output-only node kinds and fields at runtime. +/// +/// Construct with `DesugaringConfig::new()` and add phases via +/// `add_phase`: +/// +/// ```ignore +/// let config = yeast::DesugaringConfig::new() +/// .add_phase("cleanup", cleanup_rules) +/// .add_phase("desugar", desugar_rules) +/// .with_output_node_types_yaml(yaml); +/// ``` +#[derive(Default)] pub struct DesugaringConfig { - /// Rules to apply during desugaring. 
- pub rules: Vec, + /// Phases of rule application, applied in order. + pub phases: Vec, /// Output node-types in YAML format. If `None`, the input grammar's /// node types are used (i.e. the desugared AST has the same node types /// as the tree-sitter grammar). @@ -622,11 +683,16 @@ pub struct DesugaringConfig { } impl DesugaringConfig { - pub fn new(rules: Vec) -> Self { - Self { - rules, - output_node_types_yaml: None, - } + /// Create an empty configuration. Add phases via [`Self::add_phase`] and an + /// optional output schema via [`Self::with_output_node_types_yaml`]. + pub fn new() -> Self { + Self::default() + } + + /// Append a new phase with the given name and rules. + pub fn add_phase(mut self, name: impl Into, rules: Vec) -> Self { + self.phases.push(Phase::new(name, rules)); + self } pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self { @@ -648,17 +714,17 @@ impl DesugaringConfig { pub struct Runner<'a> { language: tree_sitter::Language, schema: schema::Schema, - rules: &'a [Rule], + phases: &'a [Phase], } impl<'a> Runner<'a> { /// Create a runner using the input grammar's schema for output. 
- pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self { + pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self { let schema = schema::Schema::from_language(&language); Self { language, schema, - rules, + phases, } } @@ -666,12 +732,12 @@ impl<'a> Runner<'a> { pub fn with_schema( language: tree_sitter::Language, schema: &schema::Schema, - rules: &'a [Rule], + phases: &'a [Phase], ) -> Self { Self { language, schema: schema.clone(), - rules, + phases, } } @@ -684,27 +750,17 @@ impl<'a> Runner<'a> { Ok(Self { language, schema, - rules: &config.rules, + phases: &config.phases, }) } pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result { - let fresh = tree_builder::FreshScope::new(); let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language); - let root = ast.get_root(); - let res = apply_rules(self.rules, &mut ast, root, &fresh)?; - if res.len() != 1 { - return Err(format!( - "Expected exactly one result node, got {}", - res.len() - )); - } - ast.set_root(res[0]); + self.run_phases(&mut ast)?; Ok(ast) } pub fn run(&self, input: &str) -> Result { - let fresh = tree_builder::FreshScope::new(); let mut parser = tree_sitter::Parser::new(); parser .set_language(&self.language) @@ -713,15 +769,29 @@ impl<'a> Runner<'a> { .parse(input, None) .ok_or_else(|| "Failed to parse input".to_string())?; let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language); - let root = ast.get_root(); - let res = apply_rules(self.rules, &mut ast, root, &fresh)?; - if res.len() != 1 { - return Err(format!( - "Expected exactly one result node, got {}", - res.len() - )); - } - ast.set_root(res[0]); + self.run_phases(&mut ast)?; Ok(ast) } + + /// Apply each phase in turn to the AST, threading the root through. + /// A single `FreshScope` is shared across phases so that fresh + /// identifiers generated in different phases don't collide. 
+ fn run_phases(&self, ast: &mut Ast) -> Result<(), String> { + let fresh = tree_builder::FreshScope::new(); + let mut root = ast.get_root(); + for phase in self.phases { + let res = apply_rules(&phase.rules, ast, root, &fresh) + .map_err(|e| format!("Phase `{}`: {e}", phase.name))?; + if res.len() != 1 { + return Err(format!( + "Phase `{}`: expected exactly one result node, got {}", + phase.name, + res.len() + )); + } + root = res[0]; + } + ast.set_root(root); + Ok(()) + } } diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index e4485857bff1..badfe9ba5cbc 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -12,16 +12,36 @@ fn parse_and_dump(input: &str) -> String { dump_ast(&ast, ast.get_root(), input) } -/// Helper: parse Ruby source with a custom output schema and rules, return dump. +/// Helper: parse Ruby source with a custom output schema and a single +/// phase of rules, return dump. fn run_and_dump(input: &str, rules: Vec) -> String { + run_phased_and_dump(input, vec![Phase::new("test", rules)]) +} + +/// Helper: parse Ruby source with a custom output schema and multiple +/// rule phases, return dump. +fn run_phased_and_dump(input: &str, phases: Vec) -> String { let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); let schema = yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); - let runner = Runner::with_schema(lang, &schema, &rules); + let runner = Runner::with_schema(lang, &schema, &phases); let ast = runner.run(input).unwrap(); dump_ast(&ast, ast.get_root(), input) } +/// Helper: like `run_and_dump`, but returns the runner error (if any) +/// instead of unwrapping. 
+fn run_and_get_error(input: &str, rules: Vec) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let phases = vec![Phase::new("test", rules)]; + let runner = Runner::with_schema(lang, &schema, &phases); + runner + .run(input) + .expect_err("expected runner to return an error") +} + /// Assert that a dump equals the expected string, treating the expected /// string as an indented multiline literal: leading/trailing blank lines /// are stripped, and the common leading indentation is removed from every @@ -382,6 +402,113 @@ fn test_chained_rules_output_only_kind() { ); } +// A rule that swaps `assignment.left` and `assignment.right`. Each +// application produces another `assignment` whose query the rule +// matches again, so without the once-per-node default it would loop. +fn swap_assignment_rule() -> Rule { + yeast::rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ) + ) +} + +#[test] +fn test_repeated_rule_hits_depth_limit() { + // With `.repeated()` the rule is allowed to fire on its own output, + // which cycles forever and trips the rewrite-depth safety net. + let err = run_and_get_error("x = 1", vec![swap_assignment_rule().repeated()]); + assert!( + err.contains("exceeded maximum rewrite depth"), + "expected depth-limit error, got: {err}" + ); +} + +#[test] +fn test_default_rule_fires_at_most_once_per_node() { + // Without `.repeated()` (the default), a rule fires at most once on a + // given node. The swap therefore happens exactly once and the desugaring + // terminates cleanly. 
+ let dump = run_and_dump("x = 1", vec![swap_assignment_rule()]); + assert_dump_eq( + &dump, + r#" + program + assignment + left: integer "1" + right: identifier "x" + "#, + ); +} + +// ---- Phase tests ---- + +#[test] +fn test_phased_desugaring() { + // Two phases that could equally have been a single one with chained + // rules. Splitting them makes the intent (cleanup, then desugar) + // explicit and provides per-phase error messages. + let cleanup = vec![yeast::rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => first_node + )]; + let desugar = vec![yeast::rule!( + (first_node + left: (_) @left + right: (_) @right + ) + => second_node + )]; + + let dump = run_phased_and_dump( + "x = 1", + vec![ + Phase::new("cleanup", cleanup), + Phase::new("desugar", desugar), + ], + ); + assert_dump_eq( + &dump, + r#" + program + second_node + left: identifier "x" + right: integer "1" + "#, + ); +} + +#[test] +fn test_phase_error_includes_phase_name() { + // A repeated rule that loops; the error message should identify the + // phase that tripped the depth limit. + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let phases = vec![Phase::new("buggy", vec![swap_assignment_rule().repeated()])]; + let runner = Runner::with_schema(lang, &schema, &phases); + let err = runner + .run("x = 1") + .expect_err("expected runner to return an error"); + assert!( + err.contains("Phase `buggy`"), + "error should mention the failing phase, got: {err}" + ); + assert!( + err.contains("exceeded maximum rewrite depth"), + "error should mention the depth limit, got: {err}" + ); +} + // ---- Cursor tests ---- #[test]