diff --git a/parsimonious/examples/__init__.py b/parsimonious/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parsimonious/examples/grammar_syntax_extension.py b/parsimonious/examples/grammar_syntax_extension.py
new file mode 100644
index 0000000..1f1692d
--- /dev/null
+++ b/parsimonious/examples/grammar_syntax_extension.py
@@ -0,0 +1,98 @@
+"""
+This example extends parsimonious's grammar syntax for a different approach to token grammars:
+* CAPITALIZED references refer to ``token.type`` names. They do not need to be explicitly
+  named elsewhere in the grammar.
+* lowercase references refer to other rules.
+* A token's attributes can be matched against rules, e.g. requiring that an attribute be a date
+  in a particular format, using a syntax modeled on XPath's ``node[@attr='value']`` predicates.
+"""
+
+from typing import Dict
+
+from parsimonious.grammar import Grammar, LazyReference
+from parsimonious.expressions import Expression
+from parsimonious.nodes import Node
+
+
+class TokenRef(Expression):
+    def __init__(self, ref, name=""):
+        super().__init__(name=name)
+        self.ref = ref
+
+    def __repr__(self):
+        if self.name:
+            return f"TokenRef({self.ref!r}, {self.name!r})"
+        else:
+            return f"TokenRef({self.ref!r})"
+
+    def __str__(self):
+        return self.ref
+
+    def _as_rhs(self):
+        return self.ref
+
+    def _uncached_match(self, token_list, pos, cache, error):
+        if pos < len(token_list) and self.ref == getattr(token_list[pos], "type", None):
+            return Node(self, token_list, pos, pos + 1, children=[])
+
+
+class AttrsPredicateExpression(Expression):
+    """
+    A predicate expression that matches a token with a given set of attributes.
+    """
+
+    def __init__(self, token_type, attrs: Dict[str, Expression], name=""):
+        self.attrs = attrs
+        self.token_type = token_type
+        super().__init__(name=name)
+
+    def __repr__(self) -> str:
+        return f"AttrsPredicateExpression({self.token_type}[{self.attrs}])"
+
+    def _uncached_match(self, token_list, pos, cache, error):
+
+        tok_match = self.token_type.match_core(token_list, pos, cache, error)
+        if tok_match:
+            tok = token_list[pos]
+            for k, v in self.attrs.items():
+                attr = getattr(tok, k, None)
+                if not isinstance(attr, str) or not v.parse(attr):
+                    return None
+            # TODO: should children have each of the attr matches?
+            return Node(self, token_list, pos, pos + 1, children=[tok_match])
+
+
+class AttrsTokenGrammar(Grammar):
+    rule_grammar = Grammar.rule_grammar.extend(r"""
+        # TODO: Support lexer natively?
+        term = attrs_predicate_expression / ^term
+
+        # Token names are required to be all-caps alphanumeric, with underscores.
+        reference = token_reference / ^reference
+        token_reference = ~r"[A-Z_][A-Z0-9_]*" _ !equals
+
+        attrs_predicate_expression = token_reference "[" _ attr_expressions "]" _
+        attr_expressions = ("@" label "=" _ expression _)+
+    """)
+
+    class visitor_cls(Grammar.visitor_cls):
+        def visit_token_reference(self, node, children) -> TokenRef:
+            ref, _, _ = children
+            return TokenRef(ref.text, name=ref.text)
+
+        def visit_reference(self, node, children):
+            if isinstance(children[0], TokenRef):
+                return children[0]
+            else:
+                return LazyReference(children[0])
+
+        def visit_attrs_predicate_expression(self, node, children):
+            token_ref, lbrac, _, attr_expressions, rbrac, _ = children
+            return AttrsPredicateExpression(token_ref, attr_expressions)
+
+        def visit_attr_expressions(self, node, children) -> Dict[str, Expression]:
+            predicates = {}
+            for at, label, equals, _, expression, _ in children:
+                assert isinstance(label, str)
+                predicates[label] = expression
+            return predicates
diff --git a/parsimonious/expressions.py b/parsimonious/expressions.py
index f93f2c6..aafa9f5 100644
--- a/parsimonious/expressions.py
+++ b/parsimonious/expressions.py
@@ -125,6 +125,9 @@ def resolve_refs(self, rule_map):
         # Nothing to do on the base expression.
         return self
 
+    def resolve_inherited_references(self, rule_map):
+        return self
+
     def parse(self, text, pos=0):
         """Return a parse tree of ``text``.
 
@@ -324,6 +327,10 @@ def resolve_refs(self, rule_map):
         self.members = tuple(m.resolve_refs(rule_map) for m in self.members)
         return self
 
+    def resolve_inherited_references(self, rule_map):
+        self.members = tuple(m.resolve_inherited_references(rule_map) for m in self.members)
+        return self
+
     def __hash__(self):
         # Note we leave members out of the hash computation, since compounds can get added to
         # sets, then have their members mutated. See RuleVisitor._resolve_refs.
diff --git a/parsimonious/grammar.py b/parsimonious/grammar.py
index 367f27e..a888e99 100644
--- a/parsimonious/grammar.py
+++ b/parsimonious/grammar.py
@@ -7,6 +7,7 @@
 """
 from collections import OrderedDict
 from textwrap import dedent
+from typing import Type
 
 from parsimonious.exceptions import BadGrammar, UndefinedLabel
 from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
@@ -44,6 +45,9 @@ class Grammar(OrderedDict):
        increase cache hit ratio. [Is this implemented yet?]
 
     """
+    visitor_cls: Type["RuleVisitor"]
+    rule_grammar: "Grammar"
+
 
     def __init__(self, rules='', **more_rules):
         """Construct a grammar.
@@ -58,15 +62,29 @@ def __init__(self, rules='', **more_rules):
            ``rules`` in case of naming conflicts.
 
         """
+        # Retain a copy of the arguments to allow grammar extensions
+        self.rule_definition = rules, more_rules
         decorated_custom_rules = {
             k: (expression(v, k, self) if is_callable(v) else v)
             for k, v in more_rules.items()}
 
-        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
+        exprs, first = self.expressions_from_rules(rules, decorated_custom_rules)
         super().__init__(exprs.items())
         self.default_rule = first  # may be None
 
+    def extend(self, rules: str, **more_rules) -> "Grammar":
+        """Return a new grammar with the given rules added.
+        Later definitions override earlier ones; ``^name`` refers to the inherited rule."""
+        new_rules = f"""
+            {self.rule_definition[0]}
+            =========================
+            {rules}
+        """
+        new_more_rules = self.rule_definition[1].copy()
+        new_more_rules.update(more_rules)
+        return Grammar(new_rules, **new_more_rules)
+
     def default(self, rule_name):
         """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
         new = self._copy()
@@ -86,7 +104,8 @@ def _copy(self):
         new.default_rule = self.default_rule
         return new
 
-    def _expressions_from_rules(self, rules, custom_rules):
+    @classmethod
+    def expressions_from_rules(cls, rules, custom_rules):
         """Return a 2-tuple: a dict of rule names pointing to their
         expressions, and then the first rule.
 
@@ -99,8 +118,8 @@
             Expressions
 
         """
-        tree = rule_grammar.parse(rules)
-        return RuleVisitor(custom_rules).visit(tree)
+        tree = cls.rule_grammar.parse(rules)
+        return cls.visitor_cls(custom_rules).visit(tree)
 
     def parse(self, text, pos=0):
         """Parse some text with the :term:`default rule`.
@@ -141,18 +160,6 @@ def __repr__(self):
         return "Grammar({!r})".format(str(self))
 
 
-class TokenGrammar(Grammar):
-    """A Grammar which takes a list of pre-lexed tokens instead of text
-
-    This is useful if you want to do the lexing yourself, as a separate pass:
-    for example, to implement indentation-based languages.
-
-    """
-    def _expressions_from_rules(self, rules, custom_rules):
-        tree = rule_grammar.parse(rules)
-        return TokenRuleVisitor(custom_rules).visit(tree)
-
-
 class BootstrappingGrammar(Grammar):
     """The grammar used to recognize the textual rules that describe other
     grammars
@@ -162,7 +169,7 @@
     grammar description syntax.
 
     """
-    def _expressions_from_rules(self, rule_syntax, custom_rules):
+    def expressions_from_rules(self, rule_syntax, custom_rules):
         """Return the rules for parsing the grammar definition syntax.
 
         Return a 2-tuple: a dict of rule names pointing to their expressions,
@@ -222,6 +229,7 @@
     # leafmost kinds of nodes. Literals like "/" count as leaves.
 
     rules = _ rule*
+
     rule = label equals expression
     equals = "=" _
     literal = spaceless_literal _
@@ -238,11 +246,12 @@
     lookahead_term = "&" term _
     term = not_term / lookahead_term / quantified / atom
     quantified = atom quantifier
-    atom = reference / literal / regex / parenthesized
+    atom = inherited_reference / reference / literal / regex / parenthesized
     regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
     parenthesized = "(" _ expression ")" _
     quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
     reference = label !equals
+    inherited_reference = "^" reference
 
     # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
@@ -251,8 +260,13 @@
 
     # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
     _ = meaninglessness*
-    meaninglessness = ~r"\s+" / comment
+    meaninglessness = ~r"\s+" / comment / divider
     comment = ~r"#[^\r\n]*"
+
+    # At least two dashes or equals signs. Used for separating grammars which inherit by
+    # concatenation. Currently has no semantic content, though it may later be used to make
+    # the syntax of the inherited/overridden rules explicit.
+    divider = ~r"={2,}|-{2,}"
     ''')
 
 
@@ -292,7 +306,10 @@ def resolve_refs(self, rule_map):
 
     # Just for debugging:
    def _as_rhs(self):
-        return '<LazyReference to %s>' % self
+        return f'<{self.__class__.__name__} to {self}>'
+
+    def resolve_inherited_references(self, rule_map):
+        return self
 
 
 class RuleVisitor(NodeVisitor):
@@ -389,7 +406,7 @@ def visit_reference(self, node, reference):
         We resolve them all later.
 
         """
-        label, not_equals = reference
+        label, *_ = reference
         return LazyReference(label)
 
     def visit_regex(self, node, regex):
@@ -459,7 +476,9 @@ def visit_rules(self, node, rules_list):
         # override earlier ones. This lets us define rules multiple times and
         # have the last declaration win, so you can extend grammars by
         # concatenation.
-        rule_map = OrderedDict((expr.name, expr) for expr in rules)
+        rule_map = OrderedDict()
+        for rule in rules:
+            rule_map[rule.name] = rule.resolve_inherited_references(rule_map)
 
         # And custom rules override string-based rules. This is the least
         # surprising choice when you compare the dict constructor:
@@ -479,6 +498,24 @@
         return rule_map, (rule_map[rules[0].name]
                           if isinstance(rules, list) and rules else None)
 
+    def visit_descendant_rules(self, node, visited_children):
+        divider, _, rules = visited_children
+        return rules
+
+    def visit_inherited_reference(self, node, visited_children):
+        caret, name = visited_children
+        return LazyInheritedReference(name)
+
+
+class LazyInheritedReference(LazyReference):
+    def resolve_refs(self, rule_map):
+        # If triggered, this indicates a bug in RuleVisitor.visit_rules.
+        raise AssertionError(
+            f"Inherited reference {self!r} should already have been resolved by resolve_inherited_references().")
+
+    def resolve_inherited_references(self, rule_map):
+        return rule_map[self]
+
 
 class TokenRuleVisitor(RuleVisitor):
     """A visitor which builds expression trees meant to work on sequences of
@@ -496,13 +533,24 @@ def visit_regex(self, node, regex):
                          'than characters.')
 
 
+class TokenGrammar(Grammar):
+    """A Grammar which takes a list of pre-lexed tokens instead of text
+
+    This is useful if you want to do the lexing yourself, as a separate pass:
+    for example, to implement indentation-based languages.
+
+    """
+    visitor_cls = TokenRuleVisitor
+
+
 # Bootstrap to level 1...
-rule_grammar = BootstrappingGrammar(rule_syntax)
+Grammar.visitor_cls = RuleVisitor
+rule_grammar = Grammar.rule_grammar = BootstrappingGrammar(rule_syntax)
 
 # ...and then to level 2. This establishes that the node tree of our rule
 # syntax is built by the same machinery that will build trees of our users'
 # grammars. And the correctness of that tree is tested, indirectly, in
 # test_grammar.
-rule_grammar = Grammar(rule_syntax)
+rule_grammar = Grammar.rule_grammar = Grammar(rule_syntax)
 
 # TODO: Teach Expression trees how to spit out Python representations of
diff --git a/parsimonious/nodes.py b/parsimonious/nodes.py
index 7839097..7d22cea 100644
--- a/parsimonious/nodes.py
+++ b/parsimonious/nodes.py
@@ -215,7 +215,7 @@ def visit(self, node):
             # Don't catch and re-wrap already-wrapped exceptions.
             raise
         except Exception as exc:
-            # implentors may define exception classes that should not be
+            # implementors may define exception classes that should not be
             # wrapped.
             if isinstance(exc, self.unwrapped_exceptions):
                 raise
diff --git a/parsimonious/tests/test_examples.py b/parsimonious/tests/test_examples.py
new file mode 100644
index 0000000..16fbf1f
--- /dev/null
+++ b/parsimonious/tests/test_examples.py
@@ -0,0 +1,40 @@
+from types import SimpleNamespace
+
+import pytest
+
+from parsimonious.exceptions import ParseError
+from parsimonious.examples.grammar_syntax_extension import AttrsTokenGrammar
+
+
+def noparse(grammar, text):
+    with pytest.raises(ParseError):
+        grammar.parse(text)
+
+
+def test_extended_grammar():
+    Tok = SimpleNamespace
+    g = AttrsTokenGrammar(r"""
+        a = B[@foo=("bar" / "baz") @baz=~"baz"+]
+    """)
+
+    assert g.parse([Tok(type="B", foo="bar", baz="bazbaz")])
+    assert g.parse([Tok(type="B", foo="baz", baz="bazbaz")])
+    noparse(g, [Tok(type="C", foo="bar", baz="baz")])
+    noparse(g, [Tok(type="B", foo="nope", baz="bazbaz")])
+
+    g2 = AttrsTokenGrammar(r"""
+        segment = TEXT (DATA_SEP TEXT)* SEG_TERM
+    """)
+    Tok2 = lambda t: SimpleNamespace(type=t)
+    tokens = [
+        Tok2("TEXT"),
+        *([Tok2("DATA_SEP"), Tok2("TEXT")] * 10),
+        Tok2("SEG_TERM"),
+    ]
+    assert g2.parse(tokens)
+    SEGMENT_GRAMMAR = AttrsTokenGrammar(r"""
+        x12 = segment+
+        segment = TEXT (DATA_SEP elem)* SEG_TERM
+        elem = value (REPEAT_SEP value)*
+        value = TEXT (COMPONENT_SEP TEXT)*
+    """)
diff --git a/parsimonious/tests/test_grammar.py b/parsimonious/tests/test_grammar.py
index 2f979f6..e3e016e 100644
--- a/parsimonious/tests/test_grammar.py
+++ b/parsimonious/tests/test_grammar.py
@@ -167,7 +167,7 @@ def test_expressions_from_rules(self):
 
         That the correct ``Expression`` tree is built is already tested in
         ``RuleGrammarTests``. This tests only that the ``Grammar`` base class's
-        ``_expressions_from_rules`` works.
+        ``expressions_from_rules`` works.
 
         """
         greeting_grammar = Grammar('greeting = "hi" / "howdy"')
@@ -620,7 +620,6 @@ def test_binary_grammar():
         body = ~b"[^\xFF]*"
         terminator = b"\xFF"
     """)
-    length = 22
     assert g.parse(b"\xff22~" + (b"a" * 22) + b"\xff") is not None
 
 
@@ -650,6 +649,40 @@ def test_inconsistent_string_types_in_grammar():
     """)
 
 
+def test_grammar_extend_method():
+    g = Grammar(r"""
+        a = (b / c)+
+        b = "b"
+        c = "c"
+    """)
+    g2 = g.extend(r"""
+        b = ^b / "B"
+        c = ^c / "C"
+    """)
+    assert g.parse("bc")
+    assert g2.parse("bBcC")
+    with pytest.raises(ParseError):
+        g.parse("bBcC")
+
+
+def test_grammar_extend_dsl():
+    g = Grammar(r"""
+        a = (b / c)+
+        b = "b"
+        c = "c"
+    """)
+    g2 = Grammar(fr"""
+        {g.rule_definition[0]}
+        ======================
+        b = ^b / "B"
+        c = ^c / "C"
+    """)
+    assert g.parse("bc")
+    assert g2.parse("bBcC")
+    with pytest.raises(ParseError):
+        g.parse("bBcC")
+
+
 def test_left_associative():
     # Regression test for https://github.com/erikrose/parsimonious/issues/209
     language_grammar = r"""