Merge pull request #47 from goodmami/gh-44-optimize-char-class

Release v0.5.2: Fix optimizations regarding character classes and grammar mutation
goodmami · Mar 29, 2024 · 350e89d · 350e89d
2 parents ab684d7 + 4b8e405
commit 350e89d
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,17 @@
 ## [Unreleased][unreleased]
 
 
+## [v0.5.2][]
+
+**Release date: 2024-03-28**
+
+### Fixed
+
+* Optimization returns new grammar instead of mutating original ([#44])
+* Choice of character classes optimizes to the union of the classes ([#44])
+* `Flag.STRICT` now raises parsing errors in machine parser
+
+
 ## [v0.5.1][]
 
 **Release date: 2023-12-31**
@@ -171,6 +182,7 @@ descent parser and a work-in-progress state-machine parser.
 [v0.4.0]: ../../releases/tag/v0.4.0
 [v0.5.0]: ../../releases/tag/v0.5.0
 [v0.5.1]: ../../releases/tag/v0.5.1
+[v0.5.2]: ../../releases/tag/v0.5.2
 
 [#6]: https://github.com/goodmami/pe/issues/6
 [#7]: https://github.com/goodmami/pe/issues/7
@@ -186,3 +198,4 @@ descent parser and a work-in-progress state-machine parser.
 [#31]: https://github.com/goodmami/pe/issues/31
 [#36]: https://github.com/goodmami/pe/issues/36
 [#38]: https://github.com/goodmami/pe/issues/38
+[#44]: https://github.com/goodmami/pe/issues/44
diff --git a/pe/_cy_machine.pyx b/pe/_cy_machine.pyx
@@ -13,7 +13,7 @@ from enum import IntEnum
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 
 from pe._constants import Operator, Flag, FAIL as FAILURE
-from pe._errors import Error
+from pe._errors import Error, ParseError
 from pe._match import Match
 from pe._types import Memo
 from pe._definition import Definition
@@ -176,6 +176,8 @@ class MachineParser(Parser):
         idx = self._index[self.start]
         end = self._parser.match(idx, s, pos, args, kwargs, memo)
         if end < 0:
+            if flags & Flag.STRICT:
+                raise ParseError()
             return None
         else:
             return Match(

diff --git a/pe/_meta.py b/pe/_meta.py
@@ -2,4 +2,4 @@
 Meta-information about pe.
 """
 
-__version__ = '0.5.1'
+__version__ = '0.5.2'
diff --git a/pe/_optimize.py b/pe/_optimize.py
@@ -128,22 +128,18 @@ def _common(defn):
         if len(ranges) == 1 and ranges[0][1] is None and not negated:
             defn = Literal(ranges[0][0])
 
-    if op == SEQ:
-        _common_sequence(defn.args[0])
+    elif op == SEQ:
+        defn = _common_sequence(defn)
 
-    if op == CHC:
-        _common_choice(defn.args[0])
-
-    # Sequence(x)  ->  x  OR  Choice(x)  ->  x
-    if op in (SEQ, CHC) and len(defn.args[0]) == 1:
-        defn = defn.args[0][0]
-        op = defn.op
+    elif op == CHC:
+        defn = _common_choice(defn)
 
     return defn
 
 
-def _common_sequence(subdefs):
+def _common_sequence(defn):
     i = 0
+    subdefs = list(defn.args[0])
     while i < len(subdefs) - 1:
         d = subdefs[i]
         # ![...] .  ->  [^...]
@@ -163,16 +159,18 @@ def _common_sequence(subdefs):
             if j - i > 1:
                 subdefs[i:j] = [Literal(''.join(x.args[0] for x in subdefs[i:j]))]
         i += 1
+    return Sequence(*subdefs)
 
 
-def _common_choice(subdefs):
+def _common_choice(defn):
     i = 0
+    subdefs = list(defn.args[0])
     while i < len(subdefs) - 1:
         d = subdefs[i]
         # [..] / [..]  ->  [....]
         # [..] / "."   ->  [...]
         if (d.op == CLS and not d.args[1]) or (d.op == LIT and len(d.args[0]) == 1):
-            ranges = d.args[0] if d.op == CLS else [(d.args[0], None)]
+            ranges = list(d.args[0]) if d.op == CLS else [(d.args[0], None)]
             j = i + 1
             while j < len(subdefs):
                 d2 = subdefs[j]
@@ -184,8 +182,14 @@ def _common_choice(subdefs):
                     break
                 j += 1
             if j - i > 1:
-                subdefs[i:j] = [Class(ranges)]
+                subdefs[i:j] = [Class(sorted(set(ranges), key=_range_sort_key))]
         i += 1
+    return Choice(*subdefs)
+
+
+def _range_sort_key(range):
+    """Ensure single hyphen characters are the first."""
+    return (range != ("-", None), range)
 
 
 def _regex_dot(defn, defs, grpid):

diff --git a/pe/_py_machine.py b/pe/_py_machine.py
@@ -11,7 +11,7 @@
 import re
 
 from pe._constants import FAIL as FAILURE, Operator, Flag
-from pe._errors import Error
+from pe._errors import Error, ParseError
 from pe._match import Match
 from pe._types import Memo
 from pe._definition import Definition
@@ -132,6 +132,8 @@ def match(self,
         idx = self._index[self.start]
         end = _match(self.pi, idx, s, pos, args, kwargs, memo)
         if end < 0:
+            if flags & Flag.STRICT:
+                raise ParseError()
             return None
         else:
             return Match(

diff --git a/test/test__optimize.py b/test/test__optimize.py
@@ -15,11 +15,16 @@
 
 
 def gload(s, inline=False, common=False, regex=False):
+    _, original = loads(s)
     start, defmap = loads(s)
-    return optimize(Grammar(defmap, start=start),
-                    inline=inline,
-                    common=common,
-                    regex=regex)
+    optimized = optimize(
+        Grammar(defmap, start=start),
+        inline=inline,
+        common=common,
+        regex=regex
+    )
+    assert original == defmap
+    return optimized
 
 
 def iload(s):
@@ -67,12 +72,13 @@ def test_common():
             gload(r'A <- "a"'))
     assert (cload(r'A <- !"a"') ==
             gload(r'A <- !"a"'))
-    assert (cload(r'A <- !"a"') ==
-            gload(r'A <- !"a"'))
     # single-char classes to literals
     assert (cload(r'A <- [a]') ==
             gload(r'A <- "a"'))
-    # but not single-range
+    # but not multi-char class
+    assert (cload(r'A <- [ab]') ==
+            gload(r'A <- [ab]'))
+    # and not ranges
     assert (cload(r'A <- [a-c]') ==
             gload(r'A <- [a-c]'))
     # add "b" to avoid dropping the sequence
@@ -86,15 +92,24 @@ def test_common():
     # sequence of literals to literal
     assert (cload(r'A <- "a" "bc" "d"') ==
             gload(r'A <- "abcd"'))
-    # but not sequence with classes
+    # or sequence of literals or single-char classes
+    assert (cload(r'A <- "a" [b] "c"') ==
+            gload(r'A <- "abc"'))
+    # but not sequence with multi-char classes
     assert (cload(r'A <- "a" [bc] "d"') ==
             gload(r'A <- "a" [bc] "d"'))
-    # choice of classes or single-char literals
+    # choice of classes
+    assert (cload(r'A <- [ab] / [bc]') ==
+            gload(r'A <- [abc]'))
+    # or choice of classes or single-char literals
     assert (cload(r'A <- [ab] / "m" / [yz]') ==
             gload(r'A <- [abmyz]'))
     # not negated classes though
     assert (cload(r'A <- (![ab] .) / "m" / [yz]') ==
             grm({'A': Choice(Class('ab', negate=True), Class('myz'))}))
+    # hyphen characters are moved to start of class
+    assert (cload(r'A <- [(-,] / [-.]') ==
+            gload(r'A <- [-(-,.]'))
 
 
 def test_regex():