
Regex and Sequence expressions are implemented. Caching is implemented but untested. Doesn't return a parse tree yet--just checks for match success and returns the length of the match--but it should be pretty straightforward to add.
erikrose committed Feb 3, 2012
commit 1ae9d2571313134337d7d45989572ee0a69eeaf0 (initial commit; 0 parents)
19 LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2012 Erik Rose
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
2 MANIFEST.in
@@ -0,0 +1,2 @@
+include README.rst
+include LICENSE
39 README.rst
@@ -0,0 +1,39 @@
+============
+Parsimonious
+============
+
+Parsimonious is the fastest PEG parser I could write in pure Python. It was
+designed to undergird a MediaWiki parser that wouldn't take 5 seconds or a GB
+of RAM to parse one page.
+
+Beyond speed, secondary goals include...
+
+* Frugal RAM use
+* Minimalistic, understandable code
+* Readable grammars
+* Extensible grammars
+* Complete test coverage
+
+Also nice to have...
+
+* Good error messages
+
+
+A Little About PEG Parsers
+==========================
+
+PEG parsers don't draw a distinction between lexing and parsing; everything's
+done at once. As a result, there is no lookahead limit, as there is with, for
+instance, Yacc. And, due to both of these properties, PEG grammars are easier
+to write: they're basically just EBNF.
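For a taste, rules in such a grammar might look something like this (illustrative only; the exact rule syntax is still settling, as the ``Grammar`` docstring further down shows)::

    Greeting       = Hi | Hello
    PoliteGreeting = Greeting, my good sir

There's no separate token section: ``PoliteGreeting`` references ``Greeting`` just as it would any other rule.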
+
+
+Optimizing Grammars
+===================
+
+Don't repeat stuff. If you need a ``Regex('such-and-such')`` at some point in
+your grammar, don't type it twice; make it a rule of its own, and reference it
+from wherever you need it. You'll get the most out of the caching this way,
+since cache lookups are by expression object identity (for speed). Even if you
+have an expression that's very simple, not repeating it will save RAM, as there
+can, at worst, be a cached int for every char in the text you're parsing.
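As a concrete sketch of that advice, using the ``Regex`` and ``Sequence`` expressions this commit ships (the grammar compiler itself is still a stub, so the expression tree is built by hand here)::

    from parsimonious.expressions import Regex, Sequence

    # One shared expression object. Cache entries are keyed on the
    # expression's id(), so every reference to ``ws`` below shares the same
    # cache rows instead of growing new ones.
    ws = Regex(' +')

    greeting = Sequence([Regex('hi'), ws, Regex('there'), ws, Regex('friend')])
    print greeting.parse('hi there friend')  # -> 15, the length of the match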
1 parsimonious/__init__.py
@@ -0,0 +1 @@
+from grammar import Grammar
94 parsimonious/expressions.py
@@ -0,0 +1,94 @@
+"""Subexpressions that make up a parsed grammar"""
+
+# TODO: Make sure all symbol refs are local--not class lookups or
+# anything--for speed. And kill all the dots.
+
+import re
+
+
+# Sentinel for packrat-cache misses: ``None`` is a legitimate cached value
+# (meaning "no match"), so we need a distinct marker.
+_MISSING = object()
+
+
+class Expression(object):
+    """A thing that can be matched against a piece of text"""
+
+    # Slots are about twice as fast as __dict__-based attributes:
+    # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why
+    __slots__ = []
+
+    def parse(self, text):
+        """Return a parse tree of ``text``.
+
+        Initialize the packrat cache and kick off the first ``match()`` call.
+
+        """
+        # The packrat cache: a map of (expression id, text position) pairs to
+        # the lengths matched there (or None for known failures).
+        cache = {}
+
+        # TODO: Freak out if the text didn't parse completely: if we didn't
+        # get all the way to the end.
+        return self.match(text, 0, cache)
+
+    # TODO: Make match() return a bit of the parse tree that the caller can
+    # stitch together.
+    def match(self, text, pos, cache):
+        """Return length of match, ``None`` if no match.
+
+        Check the cache first.
+
+        """
+        # TODO: Optimize. Probably a hot spot.
+        # Is there a way of looking up cached stuff that's faster than hashing
+        # this id-pos pair?
+        key = (id(self), pos)
+        cached = cache.get(key, _MISSING)
+        if cached is not _MISSING:
+            return cached
+        match = self._match(text, pos, cache)
+        cache[key] = match
+        return match
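The ``match()``/``_match()`` split means a new expression type only has to implement ``_match()``; the base class supplies the packrat caching. A minimal sketch of the pattern (this ``Literal`` class is hypothetical, not part of the commit)::

    from parsimonious.expressions import Expression

    class Literal(Expression):
        """An expression that matches an exact string (hypothetical)."""
        __slots__ = ['literal']

        def __init__(self, literal):
            self.literal = literal

        def _match(self, text, pos, cache):
            # Like Regex._match(): length on success, implicit None on failure.
            if text.startswith(self.literal, pos):
                return len(self.literal)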
+
+
+class Regex(Expression):
+    """An expression that matches what a regex does.
+
+    Use these as much as you can and jam as much into each one as you can;
+    they're fast.
+
+    """
+    __slots__ = ['re']
+
+    def __init__(self, pattern):
+        self.re = re.compile(pattern)
+
+    def _match(self, text, pos, cache):
+        """Return length of match, ``None`` if no match."""
+        m = self.re.match(text, pos)
+        if m is not None:
+            span = m.span()
+            return span[1] - span[0]
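A quick usage sketch; the expected values come straight from the tests further down. Note that in this commit ``parse()`` returns the matched length rather than a parse tree::

    from parsimonious.expressions import Regex

    print Regex('hello*').parse('hellooo')  # -> 7
    print Regex('hello*').parse('goodbye')  # -> None (no match)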
+
+
+class Sequence(Expression):
+    """A series of expressions that must match contiguous, ordered pieces
+    of the text
+
+    In other words, it's a concatenation operator: each piece has to match, one
+    after another.
+
+    """
+    __slots__ = ['members']
+
+    def __init__(self, members):
+        """``members`` is a sequence of expressions."""
+        self.members = members
+
+    def _match(self, text, pos, cache):
+        new_pos = pos
+        length_of_sequence = 0
+        for m in self.members:
+            length = m.match(text, new_pos, cache)
+            if length is None:
+                return None
+            new_pos += length
+            length_of_sequence += length
+        # Hooray! We got through all the members!
+        return length_of_sequence
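And ``Sequence`` chaining several ``Regex`` members, mirroring the test case below: each member must match where the previous one left off, and the result is the total length consumed::

    from parsimonious.expressions import Regex, Sequence

    seq = Sequence([Regex('hi*'), Regex('lo'), Regex('.ingo')])
    print seq.parse('hiiiilobingo1234')  # -> 12
    print seq.parse('hiiiilobing')       # -> None: '.ingo' fails at the end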
41 parsimonious/grammar.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""The fastest pure-Python PEG parser I could muster"""
+
+
+from parsimonious.expressions import *
+
+
+class Grammar(object):
+    """A collection of rules that describe a language
+
+    You can start parsing from any of them, just by grabbing one as if out of a
+    dict::
+
+        g = Grammar('Greeting = Hi | Hello\n'
+                    'PoliteGreeting = Greeting, my good sir')
+        g['Greeting'].parse('Hello')
+
+    """
+    def __init__(self, peg):
+        # Maybe we should keep the original PEG text around in case people
+        # want to extend already-compiled grammars. We can't rely on callers
+        # to nicely expose their PEG strings. We can either have extending
+        # callers pull the text off Grammar.peg, or we could get fancy and
+        # define __add__ on Grammars and strings.
+        self._rules = self._rules_from_peg(peg)
+
+    def _rules_from_peg(self, peg):
+        """Return a useful data structure derived from textual PEG rules.
+
+        It's a map of rules, all referencing each other. Typically, there's a
+        single root to the web of references, and that root is the starting
+        symbol for parsing, but there's nothing saying you can't have multiple
+        roots.
+
+        The grammar gets divided into (sub)expressions, which are arranged in
+        some kind of tree. The parser then traverses that tree, trying to find
+        matches to the expressions.
+
+        """
+        # TODO: Unstub.
+
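``_rules_from_peg`` is still a stub, but here is a hedged sketch of the kind of rule map it might return, built by hand from the expressions this commit does implement (the rule names are illustrative; a real compiler would also need choice, repetition, and lookahead expressions, which don't exist yet)::

    from parsimonious.expressions import Regex, Sequence

    rules = {'greeting': Regex('Hi|Hello')}
    # Rules reference each other by holding the same expression objects:
    rules['polite_greeting'] = Sequence([rules['greeting'],
                                         Regex(', my good sir')])
    print rules['polite_greeting'].parse('Hello, my good sir')  # -> 18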
26 parsimonious/tests/benchmarks.py
@@ -0,0 +1,26 @@
+"""Tests to show that the benchmarks we based our speed optimizations on are still valid"""
+
+from timeit import timeit
+
+
+def test_lists_vs_dicts():
+ """See what's faster at int key lookup: dicts or lists."""
+ list_time = timeit('item = l[9000]', 'l = [0] * 10000')
+ dict_time = timeit('item = d[9000]', 'd = dict((x, 0) for x in xrange(10000))')
+
+ # Dicts take about 1.6x as long as lists in Python 2.6 and 2.7.
+ print '%s < %s' % (list_time, dict_time)
+ assert list_time < dict_time
+
+
+def test_call_vs_inline():
+ """How bad is the calling penalty?"""
+ no_call = timeit('l[0] += 1', 'l = [0]')
+ call = timeit('add(); l[0] += 1', 'l = [0]\n'
+ 'def add():\n'
+ ' pass')
+
+ # Calling a function is pretty fast; it takes just 1.2x as long as the
+ # global var access and addition in l[0] += 1.
+ print '%s (no call) < %s (call)' % (no_call, call)
+ assert no_call < call
17 parsimonious/tests/test_expressions.py
@@ -0,0 +1,17 @@
+from nose.tools import eq_
+
+from parsimonious.expressions import Regex, Sequence
+
+
+def test_regex():
+    eq_(Regex('hello')._match('ehello', 1, {}), 5)  # simple
+    eq_(Regex('hello*')._match('hellooo', 0, {}), 7)  # *
+    eq_(Regex('hello*')._match('goodbye', 0, {}), None)  # no match
+
+
+def test_sequence():
+    eq_(Sequence([Regex('hi*'), Regex('lo'), Regex('.ingo')])._match(
+        'hiiiilobingo1234', 0, {}), 12)  # succeed
+    eq_(Sequence([Regex('hi*'), Regex('lo'), Regex('.ingo')])._match(
+        'hiiiilobing', 0, {}), None)  # don't
+    eq_(Sequence([Regex('hi*')])._match('>hiiii', 1, {}), 5)  # non-0 pos
7 parsimonious/tests/test_grammar.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+# from parsimonious import Grammar
+
+# If you want to extend a grammar, just prepend (or append?) your string to
+# its. Yours will take precedence.
+# def test_parsing():
+#     g = Grammar('S := A c* !. & a* C\n'
+#                 'A := a A b / ø\n'
+#                 'C := b C c / ø')
+#     p = g['S'].parse('aaabbbccc')
