Grammar extensions #208

Open · wants to merge 9 commits into master · Changes from 4 commits
7 changes: 7 additions & 0 deletions parsimonious/expressions.py
@@ -122,6 +122,9 @@ def resolve_refs(self, rule_map):
# Nothing to do on the base expression.
return self

def resolve_inherited_references(self, rule_map):
return self

def parse(self, text, pos=0):
"""Return a parse tree of ``text``.

@@ -321,6 +324,10 @@ def resolve_refs(self, rule_map):
self.members = tuple(m.resolve_refs(rule_map) for m in self.members)
return self

def resolve_inherited_references(self, rule_map):
self.members = tuple(m.resolve_inherited_references(rule_map) for m in self.members)
return self

def __hash__(self):
# Note we leave members out of the hash computation, since compounds can get added to
# sets, then have their members mutated. See RuleVisitor._resolve_refs.
94 changes: 71 additions & 23 deletions parsimonious/grammar.py
@@ -7,6 +7,7 @@
"""
from collections import OrderedDict
from textwrap import dedent
from typing import Type

from parsimonious.exceptions import BadGrammar, UndefinedLabel
from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
@@ -44,6 +45,9 @@ class Grammar(OrderedDict):
increase cache hit ratio. [Is this implemented yet?]

"""
visitor_cls: Type["RuleVisitor"]
rule_grammar: "Grammar"

def __init__(self, rules='', **more_rules):
"""Construct a grammar.

@@ -58,15 +62,29 @@ def __init__(self, rules='', **more_rules):
``rules`` in case of naming conflicts.

"""
# Retain a copy of the arguments to allow grammar extensions
self.rule_definition = rules, more_rules

decorated_custom_rules = {
k: (expression(v, k, self) if is_callable(v) else v)
for k, v in more_rules.items()}

exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
exprs, first = self.expressions_from_rules(rules, decorated_custom_rules)
super().__init__(exprs.items())
self.default_rule = first # may be None

def extend(self, rules: str, **more_rules) -> "Grammar":
"""Return a new grammar with the given rules added.

New rules override same-named rules in this grammar; an inherited
reference like ``^name`` points back to the overridden definition.
"""
new_rules = f"""
{self.rule_definition[0]}
=========================
{rules}
"""
new_more_rules = self.rule_definition[1].copy()
new_more_rules.update(more_rules)
return Grammar(new_rules, **new_more_rules)
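# For example, a minimal sketch of ``extend`` (mirroring
# test_grammar_extend_method in parsimonious/tests/test_grammar.py below):
#
#     base = Grammar(r'''
#         a = (b / c)+
#         b = "b"
#         c = "c"
#     ''')
#     extended = base.extend(r'''
#         b = ^b / "B"
#         c = ^c / "C"
#     ''')
#     extended.parse("bBcC")  # the unextended grammar still rejects "B"/"C"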

def default(self, rule_name):
"""Return a new Grammar whose :term:`default rule` is ``rule_name``."""
new = self._copy()
@@ -86,7 +104,8 @@ def _copy(self):
new.default_rule = self.default_rule
return new

def _expressions_from_rules(self, rules, custom_rules):
@classmethod
def expressions_from_rules(cls, rules, custom_rules):
"""Return a 2-tuple: a dict of rule names pointing to their
expressions, and then the first rule.

@@ -99,8 +118,8 @@ def _expressions_from_rules(self, rules, custom_rules):
Expressions

"""
tree = rule_grammar.parse(rules)
return RuleVisitor(custom_rules).visit(tree)
tree = cls.rule_grammar.parse(rules)
return cls.visitor_cls(custom_rules).visit(tree)

def parse(self, text, pos=0):
"""Parse some text with the :term:`default rule`.
@@ -141,18 +160,6 @@ def __repr__(self):
return "Grammar({!r})".format(str(self))


class TokenGrammar(Grammar):
"""A Grammar which takes a list of pre-lexed tokens instead of text

This is useful if you want to do the lexing yourself, as a separate pass:
for example, to implement indentation-based languages.

"""
def _expressions_from_rules(self, rules, custom_rules):
tree = rule_grammar.parse(rules)
return TokenRuleVisitor(custom_rules).visit(tree)


class BootstrappingGrammar(Grammar):
"""The grammar used to recognize the textual rules that describe other
grammars
@@ -162,7 +169,7 @@ class BootstrappingGrammar(Grammar):
grammar description syntax.

"""
def _expressions_from_rules(self, rule_syntax, custom_rules):
def expressions_from_rules(self, rule_syntax, custom_rules):
"""Return the rules for parsing the grammar definition syntax.

Return a 2-tuple: a dict of rule names pointing to their expressions,
Expand Down Expand Up @@ -222,6 +229,7 @@ def _expressions_from_rules(self, rule_syntax, custom_rules):
# leafmost kinds of nodes. Literals like "/" count as leaves.

rules = _ rule*

rule = label equals expression
equals = "=" _
literal = spaceless_literal _
Expand All @@ -238,11 +246,12 @@ def _expressions_from_rules(self, rule_syntax, custom_rules):
lookahead_term = "&" term _
term = not_term / lookahead_term / quantified / atom
quantified = atom quantifier
atom = reference / literal / regex / parenthesized
atom = inherited_reference / reference / literal / regex / parenthesized
regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
parenthesized = "(" _ expression ")" _
quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
reference = label !equals
inherited_reference = "^" reference

# A subsequent equal sign is the only thing that distinguishes a label
# (which begins a new rule) from a reference (which is just a pointer to a
@@ -251,8 +260,13 @@ def _expressions_from_rules(self, rule_syntax, custom_rules):

# _ = ~r"\s*(?:#[^\r\n]*)?\s*"
_ = meaninglessness*
meaninglessness = ~r"\s+" / comment
meaninglessness = ~r"\s+" / comment / divider
comment = ~r"#[^\r\n]*"

# At least two dashes or equals signs. Used for separating grammars which inherit by
# concatenation. Currently has no semantic content, though may later be used to make
# the syntax of the inherited/overridden rules explicit.
divider = ~r"={2,}|-{2,}"
''')
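# Concretely, the divider and the "^" inherited reference combine to let a
# grammar be extended by plain concatenation (a sketch mirroring
# test_grammar_extend_dsl in parsimonious/tests/test_grammar.py below):
#
#     g2 = Grammar(r'''
#         a = (b / c)+
#         b = "b"
#         c = "c"
#         ======================
#         b = ^b / "B"
#         c = ^c / "C"
#     ''')
#     g2.parse("bBcC")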


@@ -292,7 +306,10 @@ def resolve_refs(self, rule_map):

# Just for debugging:
def _as_rhs(self):
return '<LazyReference to %s>' % self
return f'<{self.__class__.__name__} to {self}>'

def resolve_inherited_references(self, rule_map):
return self


class RuleVisitor(NodeVisitor):
@@ -459,7 +476,9 @@ def visit_rules(self, node, rules_list):
# override earlier ones. This lets us define rules multiple times and
# have the last declaration win, so you can extend grammars by
# concatenation.
rule_map = OrderedDict((expr.name, expr) for expr in rules)
rule_map = OrderedDict()
for rule in rules:
rule_map[rule.name] = rule.resolve_inherited_references(rule_map)

# And custom rules override string-based rules. This is the least
# surprising choice when you compare the dict constructor:
@@ -479,6 +498,24 @@ def visit_rules(self, node, rules_list):
return rule_map, (rule_map[rules[0].name]
if isinstance(rules, list) and rules else None)
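# To make the ordering concrete, a hypothetical trace of the loop above for
# the two rules ``b = "b"`` followed by ``b = ^b / "B"``:
#
# 1. rule_map is empty; Literal("b") contains no inherited references and
#    is stored unchanged: rule_map["b"] = Literal("b").
# 2. The second rule's OneOf contains LazyInheritedReference("b");
#    resolve_inherited_references swaps it for the *current* rule_map["b"]
#    (the old literal) before the name "b" is rebound to the new OneOf.
#    Ordinary LazyReferences are resolved in a later pass and therefore
#    always see the final definitions.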

def visit_descendant_rules(self, node, visited_children):
divider, _, rules = visited_children
return rules

def visit_inherited_reference(self, node, visited_children):
caret, name = visited_children
return LazyInheritedReference(name)


class LazyInheritedReference(LazyReference):
def resolve_refs(self, rule_map):
# Reaching this method means visit_rules failed to resolve an
# inherited reference before the ordinary reference pass; that is
# a bug in RuleVisitor.visit_rules.
raise AssertionError(
f"Inherited reference {self!r} should have been resolved, but was not.")

def resolve_inherited_references(self, rule_map):
return rule_map[self]


class TokenRuleVisitor(RuleVisitor):
"""A visitor which builds expression trees meant to work on sequences of
@@ -496,13 +533,24 @@ def visit_regex(self, node, regex):
'than characters.')


class TokenGrammar(Grammar):
"""A Grammar which takes a list of pre-lexed tokens instead of text

This is useful if you want to do the lexing yourself, as a separate pass:
for example, to implement indentation-based languages.

"""
visitor_cls = TokenRuleVisitor
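# A minimal usage sketch (``Token`` is the helper from parsimonious.utils;
# quoted literals in a TokenGrammar match token *types* rather than text):
#
#     g = TokenGrammar('''
#         outer = "bold_open" "text" "bold_close"
#     ''')
#     g.parse([Token("bold_open"), Token("text"), Token("bold_close")])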


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
Grammar.visitor_cls = RuleVisitor
rule_grammar = Grammar.rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)
rule_grammar = Grammar.rule_grammar = Grammar(rule_syntax)


# TODO: Teach Expression trees how to spit out Python representations of
Empty file.
78 changes: 78 additions & 0 deletions parsimonious/tests/examples/grammar_syntax_extension.py
@@ -0,0 +1,78 @@
"""
This example extends parsimonious's grammar syntax for a different approach to token grammars:
* CAPITALIZED references refer to ``token.type`` names. They do not need to be explicitly
named elsewhere in the grammar.
* lowercase references refer to other rules.
* A token's attributes can be matched against rules, e.g. requiring that an attribute be a
date in a particular format. This uses a predicate syntax similar to XPath's
``node[@attr='value']``.
"""

from typing import Dict

from parsimonious.exceptions import ParseError
from parsimonious.grammar import Grammar
from parsimonious.expressions import Expression
from parsimonious.nodes import Node


class TokenRef(Expression):
"""Matches a single token whose ``type`` attribute equals ``ref``."""

def __init__(self, ref, name=""):
super().__init__(name=name)
self.ref = ref

def _uncached_match(self, token_list, pos, cache, error):
# Guard against running off the end of the token list.
if pos < len(token_list) and self.ref == getattr(token_list[pos], "type", None):
return Node(self, token_list, pos, pos + 1, children=[])
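# Sanity check (hypothetical; SimpleNamespace comes from the types module):
#
#     TokenRef("B")._uncached_match([SimpleNamespace(type="B")], 0, {}, None)
#
# returns a Node spanning positions 0..1, and None on any mismatch.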


class AttrsPredicateExpression(Expression):
"""
A predicate expression that matches a node with a given set of attributes.
"""

def __init__(self, token_type, attrs: Dict[str, Expression]):
super().__init__()
self.attrs = attrs
self.token_type = token_type

def __repr__(self) -> str:
return f"AttrsPredicateExpression({self.token_type}[{self.attrs}])"

def _uncached_match(self, token_list, pos, cache, error):
tok_match = self.token_type.match_core(token_list, pos, cache, error)
if tok_match:
tok = token_list[pos]
for k, v in self.attrs.items():
attr = getattr(tok, k, None)
# Each attribute value is parsed with its sub-expression; a
# non-string attribute or a failed parse means no match.
if not isinstance(attr, str):
return None
try:
v.parse(attr)
except ParseError:
return None
# TODO: should children have each of the attr matches?
return Node(self, token_list, pos, pos + 1, children=[tok_match])


class AttrsTokenGrammar(Grammar):
rule_grammar = Grammar.rule_grammar.extend(r"""
# TODO: Support lexer natively?
term = attrs_predicate_expression / ^term

# Token names are required to be all-caps alphanumeric, with underscores.
reference = token_reference / ^reference
token_reference = ~r"[A-Z_][A-Z0-9_]*"

attrs_predicate_expression = token_reference "[" _ attr_expressions "]" _
attr_expressions = ("@" label "=" _ expression _)+
""")

class visitor_cls(Grammar.visitor_cls):
def visit_token_reference(self, node, children) -> str:
return TokenRef(node.text)

def visit_attrs_predicate_expression(self, node, children):
label, lbrac, _, attr_expressions, rbrac, _ = children
return AttrsPredicateExpression(label, attr_expressions)

def visit_attr_expressions(self, node, children) -> Dict[str, Expression]:
predicates = {}
for at, label, equals, _, expression, _ in children:
assert isinstance(label, str)
predicates[label] = expression
return predicates
22 changes: 22 additions & 0 deletions parsimonious/tests/test_examples.py
@@ -0,0 +1,22 @@
from types import SimpleNamespace

import pytest

from parsimonious.exceptions import ParseError
from parsimonious.tests.examples.grammar_syntax_extension import AttrsTokenGrammar


def noparse(grammar, text):
with pytest.raises(ParseError):
grammar.parse(text)


def test_extended_grammar():
g = AttrsTokenGrammar(r"""
a = B[@foo=("bar" / "baz") @baz=~"baz"+]
""")

assert g.parse([SimpleNamespace(type="B", foo="bar", baz="bazbaz")])
assert g.parse([SimpleNamespace(type="B", foo="baz", baz="bazbaz")])
noparse(g, [SimpleNamespace(type="C", foo="bar", baz="baz")])
noparse(g, [SimpleNamespace(type="C", foo="bar", baz="baz")])
37 changes: 35 additions & 2 deletions parsimonious/tests/test_grammar.py
@@ -168,7 +168,7 @@ def test_expressions_from_rules(self):

That the correct ``Expression`` tree is built is already tested in
``RuleGrammarTests``. This tests only that the ``Grammar`` base class's
``_expressions_from_rules`` works.
``expressions_from_rules`` works.

"""
greeting_grammar = Grammar('greeting = "hi" / "howdy"')
@@ -621,7 +621,6 @@ def test_binary_grammar():
body = ~b"[^\xFF]*"
terminator = b"\xFF"
""")
length = 22
assert g.parse(b"\xff22~" + (b"a" * 22) + b"\xff") is not None


@@ -649,3 +648,37 @@ def test_inconsistent_string_types_in_grammar():
foo = "foo"
bar = "bar"
""")


def test_grammar_extend_method():
g = Grammar(r"""
a = (b / c)+
b = "b"
c = "c"
""")
g2 = g.extend(r"""
b = ^b / "B"
c = ^c / "C"
""")
assert g.parse("bc")
assert g2.parse("bBcC")
with pytest.raises(ParseError):
g.parse("bBcC")


def test_grammar_extend_dsl():
g = Grammar(r"""
a = (b / c)+
b = "b"
c = "c"
""")
g2 = Grammar(fr"""
{g.rule_definition[0]}
======================
b = ^b / "B"
c = ^c / "C"
""")
assert g.parse("bc")
assert g2.parse("bBcC")
with pytest.raises(ParseError):
g.parse("bBcC")