Skip to content

Commit

Permalink
Fix #20: Add additional common optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
goodmami committed Oct 5, 2021
1 parent cc92863 commit 44ecc0d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 17 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@

* Regex optimization avoids some superfluous groups ([#19])

### Changed

* Added more 'common' optimizations: ([#20])
- Single-character classes become literals
- Sequence of literals becomes one literal
- Choice of non-negated character classes becomes one class


## [v0.3.1][]

Expand Down Expand Up @@ -113,3 +120,4 @@ descent parser and a work-in-progress state-machine parser.
[#17]: https://github.com/goodmami/pe/issues/17
[#18]: https://github.com/goodmami/pe/issues/18
[#19]: https://github.com/goodmami/pe/issues/19
[#20]: https://github.com/goodmami/pe/issues/20
78 changes: 61 additions & 17 deletions pe/_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pe._definition import Definition
from pe._grammar import Grammar
from pe.operators import (
Literal,
Class,
Regex,
Choice,
Expand Down Expand Up @@ -120,30 +121,73 @@ def _common(defn):
elif make_op:
defn = make_op(_common(defn.args[0]), *defn.args[1:])

# ![...] . -> [^...]
# !"." . -> [^.]
# [.] -> "." (only 1-char class, not a range, not negated)
if op == CLS:
ranges = defn.args[0]
negated = defn.args[1]
if len(ranges) == 1 and ranges[0][1] is None and not negated:
defn = Literal(ranges[0][0])

if op == SEQ:
subdefs = defn.args[0]
i = 0
while i < len(subdefs) - 1:
d = subdefs[i]
if (d.op == NOT and subdefs[i+1].op == DOT):
notd = d.args[0]
if notd.op == CLS:
negated = not notd.args[1]
subdefs[i:i+2] = [Class(notd.args[0], negate=negated)]
elif notd.op == LIT and len(notd.args[0]) == 1:
subdefs[i:i+2] = [Class(notd.args[0], negate=True)]
i += 1

# Sequence(x) -> x
if op == SEQ and len(defn.args[0]) == 1:
_common_sequence(defn.args[0])

if op == CHC:
_common_choice(defn.args[0])

# Sequence(x) -> x OR Choice(x) -> x
if op in (SEQ, CHC) and len(defn.args[0]) == 1:
defn = defn.args[0][0]
op = defn.op

return defn


def _common_sequence(subdefs):
    """Simplify neighbouring items of a sequence, editing *subdefs* in place.

    Two rewrites are applied while scanning left to right:

    * a negative lookahead on a character class or on a one-character
      literal, immediately followed by a dot, collapses into a single
      character class with the negation flipped / set;
    * a run of two or more adjacent literals is fused into one literal.

    Nothing is returned; the list passed in is modified.
    """
    pos = 0
    while pos + 1 < len(subdefs):
        head = subdefs[pos]
        if head.op == LIT:
            # "." "."  ->  ".."
            end = pos + 1
            while end < len(subdefs) and subdefs[end].op == LIT:
                end += 1
            if end > pos + 1:
                text = ''.join(lit.args[0] for lit in subdefs[pos:end])
                subdefs[pos:end] = [Literal(text)]
        elif head.op == NOT and subdefs[pos + 1].op == DOT:
            # ![...] .  ->  [^...]
            # !"." .    ->  [^.]
            inner = head.args[0]
            if inner.op == CLS:
                flipped = not inner.args[1]
                subdefs[pos:pos + 2] = [Class(inner.args[0], negate=flipped)]
            elif inner.op == LIT and len(inner.args[0]) == 1:
                subdefs[pos:pos + 2] = [Class(inner.args[0], negate=True)]
        pos += 1


def _common_choice(subdefs):
    """Merge runs of class-like alternatives of a choice, in place.

    A run of two or more adjacent alternatives, each of which is either a
    non-negated character class or a one-character literal, is replaced by
    a single non-negated character class containing all of their ranges in
    order. Negated classes and longer literals end a run and are left
    untouched.

    Only the *subdefs* list itself is modified; the definitions it held
    are not. Nothing is returned.
    """
    i = 0
    while i < len(subdefs) - 1:
        d = subdefs[i]
        # [..] / [..]  ->  [....]
        # [..] / "."   ->  [...]
        if ((d.op == CLS and not d.args[1])
                or (d.op == LIT and len(d.args[0]) == 1)):
            if d.op == CLS:
                # Copy the ranges: extending d.args[0] directly would
                # alter the original class definition, which may still be
                # referenced elsewhere (e.g. by the unoptimized grammar).
                ranges = list(d.args[0])
            else:
                ranges = [(d.args[0], None)]
            j = i + 1
            while j < len(subdefs):
                d2 = subdefs[j]
                if d2.op == CLS and not d2.args[1]:
                    ranges.extend(d2.args[0])
                elif d2.op == LIT and len(d2.args[0]) == 1:
                    ranges.append((d2.args[0], None))
                else:
                    break
                j += 1
            # Only rewrite when at least two alternatives were merged.
            if j - i > 1:
                subdefs[i:j] = [Class(ranges)]
        i += 1


def _regex_dot(defn, defs, grpid):
    """Build the regex definition used for the "any character" operator.

    None of the arguments are consulted; the result is always the same
    pattern.
    """
    # In Python's `re` syntax, (?s:.) applies the DOTALL flag to this one
    # dot only, so it also matches a newline.
    any_char = '(?s:.)'
    return Regex(any_char)

Expand Down
18 changes: 18 additions & 0 deletions test/test__optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ def test_common():
gload(r'A <- !"a"'))
assert (cload(r'A <- !"a"') ==
gload(r'A <- !"a"'))
# single-char classes to literals
assert (cload(r'A <- [a]') ==
gload(r'A <- "a"'))
# but not single-range
assert (cload(r'A <- [a-c]') ==
gload(r'A <- [a-c]'))
# add "b" to avoid dropping the sequence
assert (cload(r'A <- !"a" . "b"') ==
cload(r'A <- ![a] . "b"') ==
Expand All @@ -77,6 +83,18 @@ def test_common():
assert (cload(r'A <- !"a" .') ==
cload(r'A <- ![a] .') ==
grm({'A': Class('a', negate=True)}))
# sequence of literals to literal
assert (cload(r'A <- "a" "bc" "d"') ==
gload(r'A <- "abcd"'))
# but not sequence with classes
assert (cload(r'A <- "a" [bc] "d"') ==
gload(r'A <- "a" [bc] "d"'))
# choice of classes or single-char literals
assert (cload(r'A <- [ab] / "m" / [yz]') ==
gload(r'A <- [abmyz]'))
# not negated classes though
assert (cload(r'A <- (![ab] .) / "m" / [yz]') ==
grm({'A': Choice(Class('ab', negate=True), Class('myz'))}))


def test_regex():
Expand Down

0 comments on commit 44ecc0d

Please sign in to comment.