import tokenizer
doc = path("doc:/chartparser")
# The upgraded chart parser is layout-sensitive without
# needing preprocessing in the token stream.
# Therefore the tokenizer no longer needs any feedback
# from the parser to parse such languages.
# This example is too simple to trigger any of the potential
# bugs in the grammar, but it is enough to give an idea of the usage.
main = ():
file = Nonterminal('file')
symbol = Terminal('symbol')
on_indent = (bb, cc):
return bb.indent < cc.start.col and cc.start.col == cc.indent
on_newline = (bb, cc):
return bb.indent == cc.start.col and cc.start.col == cc.indent
user_grammar = [
Rule(file, [symbol, Condition(symbol, [on_indent])]),
Rule(file, [symbol, Condition(symbol, [on_newline])]),
]
parsergen = preprocess(user_grammar, file)
parser = parsergen()
parser.step(symbol, "hello", (lno=0, col=0), (lno=0, col=5))
parser.step(symbol, "world", (lno=1, col=2), (lno=1, col=7))
print("accepted?", parser.accepted)
print(repr(parser.traverse(
((rule, tree, start, stop):
return tree),
(symbol):
return "")))
# The interface for constructing a grammar consists of
# rules, terminals, nonterminals and conditions.
# A rule has a nonterminal on the left side and a
# list of nonterminals and terminals on the right side.
# Rules may be annotated to help with reading parse trees.
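# For instance, a right-recursive sequence could be declared like
# this (a sketch; 'seq' and 'symbol' are hypothetical):
#   seq = Nonterminal('seq')
#   symbol = Terminal('symbol')
#   Rule(seq, [symbol])
#   Rule(seq, [symbol, seq])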
class Rule
+init = (self, lhs, rhs, annotation=null):
self.lhs = lhs
self.rhs = rhs
self.annotation = annotation
+repr = (self):
rhs = []
for item in self.rhs
rhs.append(repr(item))
out = repr(self.lhs) ++ " -> " ++ " ".join(rhs)
return out
# Earlier I did not separate terminals from
# non-terminals because it was not strictly
# necessary. That turned out to be confusing
# when designing grammars.
class Terminal
+init = (self, name):
self.name = name
+repr = (self):
if self.name
return "@" ++ self.name
return "<Terminal>"
# .getsym makes it easier to extract the symbol from an rhs item.
getsym = (self):
return self
class Nonterminal
+init = (self, name):
self.name = name
+repr = (self):
if self.name
return self.name
return "<Nonterminal>"
getsym = (self):
return self
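# A Condition wraps an rhs symbol together with layout constraints.
# Each constraint is a function of two SPPF nodes: the item on the
# left (bb) and the item being shifted in (cc), as in the on_indent
# and on_newline examples at the top of this file.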
class Condition
+init = (self, symbol, constraints):
self.symbol = symbol
self.constraints = constraints
getsym = (self):
return self.symbol
+repr = (self):
cons = []
for c in self.constraints
cons.append(repr(c))
return "{" ++ ", ".join(cons) ++ "}" ++ repr(self.symbol)
# To reduce the work during parsing, the grammar is preprocessed.
# The user calls preprocess(user_grammar, accept_symbol)
# and gets an initiator back. The initiator can be used to
# initialize parsers.
# The preprocessing transforms the grammar into NNF, which
# divides the grammar into null and non-null rules.
# The null rules are not used in the middle of parsing, but
# they are useful when interpreting the parse tree.
# TODO: It would be preferable to allow serialization of the
# preprocessed grammar and the annotations attached to it.
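# A usage sketch, with the names from the example at the top of
# this file:
#   parsergen = preprocess(user_grammar, file)
#   parser = parsergen()       # accept the default symbol
#   parser = parsergen(file)   # or pass an accept symbol explicitly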
preprocess = (user_grammar, default_accept):
nullable = find_nullable(user_grammar)
grammar = {}
blankset = {}
rules = build_nnf(user_grammar, nullable)
for rule in rules
if rule.rhs.length == 0
try
blankset[rule.lhs].append(rule)
except KeyError as _
blankset[rule.lhs] = [rule]
else
try
grammar[rule.lhs].append(rule)
except KeyError as _
grammar[rule.lhs] = [rule]
right_recursive = find_right_recursive(rules)
return Initiator(grammar, blankset,
right_recursive, default_accept)
# Earley-based parsing would suffer from nullable rules.
# The parsing step ends up being simple when the grammar
# does not contain any of them, so they are rewritten away.
# The result is a "nihilist normal form" (NNF).
# Further reasoning about this can be found in the paper
# "Practical Earley Parsing" by Aycock & Horspool.
find_nullable = (grammar):
nullable = set()
queue = []
new_nullable = (symbol):
if symbol not in nullable
nullable.add(symbol)
queue.append(symbol)
inverse_lookup = {}
new_lookup = (index, symbol):
try
inverse_lookup[symbol].append(index)
except KeyError as _
inverse_lookup[symbol] = [index]
nonterminals = []
nonnullables = []
for rule in grammar
if rule.rhs.length == 0
new_nullable(rule.lhs)
elif all_nonterminals(rule.rhs)
index = nonnullables.length
for x in rule.rhs
x = x.getsym()
if x != rule.lhs
new_lookup(index, x)
nonterminals.append(rule.lhs)
nonnullables.append(count_nonrec(rule))
for n in queue
for i in inverse_lookup.get(n, [])
nonnullables[i] -= 1
if nonnullables[i] == 0
new_nullable(nonterminals[i])
return nullable
all_nonterminals = (rhs):
for x in rhs
if not isinstance(x.getsym(), Nonterminal)
return false
return true
all_nullable = (rhs, nullable):
for x in rhs
if x.getsym() not in nullable
return false
return true
count_nonrec = (rule):
s = 0
for x in rule.rhs
s += int(x.getsym() != rule.lhs)
return s
# Stepping an n-bit counter through all of its values produces
# every combination in which each nullable field is present or absent.
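# For example, with two nullable fields the index runs through
# 00, 01, 10 and 11, choosing for each field whether it is kept
# (bit set) or erased (bit clear) in the generated NNF rule.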
build_nnf = (grammar, nullable):
result = []
for rule in grammar
order = 0
for x in rule.rhs
order += int(x.getsym() in nullable)
for i in range(1 << order)
result.append(nihilist_rule(rule, i, nullable))
return result
nihilist_rule = (rule, index, nullable):
present = []
rhs = []
for x in rule.rhs
shift = true
if x.getsym() in nullable
if index & 1 == 0
shift = false
index >>= 1
present.append(shift)
if shift
rhs.append(x)
return Rule(rule.lhs, rhs, NNF(rule, present))
# The nihilist normal form rules are annotated with NNF nodes.
class NNF
+init = (self, rule, present):
self.rule = rule # the original rule
self.present = present # tells which fields are present in this rule.
# Conditions for an item to be leo-eligible:
#   its rule is right recursive
#   it is quasi-complete
#   it is postdot-unique
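# For example, given a rule 'seq -> symbol seq' (a sketch), an item
# with the dot before the final 'seq' is quasi-complete, and if it
# is the only item in its chart column waiting for 'seq', it is
# postdot-unique and can be memoized as a leo item.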
find_right_recursive = (grammar):
edges = []
for rule in grammar
if rule.rhs.length > 0
right = rule.rhs[rule.rhs.length - 1]
row = []
for other in grammar
row.append(other.lhs == right.getsym())
edges.append(row)
else
row = []
for other in grammar
row.append(false)
edges.append(row)
warshall_transitive_closure(edges)
right_recursive = set()
i = 0
for rule in grammar
if edges[i][i] and rule.rhs.length >= 2 # Excluding rules that have only one rhs symbol.
right_recursive.add(rule) # Leo items caused problems if the rule was a prediction.
i += 1
return right_recursive
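# Warshall's algorithm computes the transitive closure of a boolean
# adjacency matrix in place: a[i][j] ends up true whenever there is
# a path from i to j. A small sketch:
#   a = [[false, true], [true, false]]  # edges 0 -> 1 and 1 -> 0
#   warshall_transitive_closure(a)      # every entry becomes true,
#                                       # so both rows lie on a cycle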
warshall_transitive_closure = (a):
n = a.length
for k in range(n)
for i in range(n)
if not a[i][k]
continue
for j in range(n)
if not a[k][j]
continue
a[i][j] = true
return a
# The nullable set represents the same information as the blankset,
# so we can discard it.
class Initiator
+init = (self, grammar, blankset, right_recursive, default_accept):
self.grammar = grammar
self.blankset = blankset
self.right_recursive = right_recursive
self.right_recursive = set() # disable LEO for now; this overrides the line above
self.default_accept = default_accept
# TODO: Fix up the performance issue in REPL startup by allowing the
# caching of a grammar.
+call = (self, accept=self.default_accept):
parser = Parser(self, accept, [])
# In an Earley parser that uses NNF, empty input is a special case that is taken care of here.
if accept in self.blankset
parser.output.append(SPPF(null, null, null, null, 0))
# The first chart column
nodes = {}
current = []
leims = {}
prediction(current, nodes, self.grammar, parser.chart, accept)
for eim in current
prediction(current, nodes, self.grammar, parser.chart, eim.postdot())
cache_transitions(parser.chart, eim, null, leims)
if isinstance(eim.postdot(), Nonterminal) and eim.postdot_constraint()
parser.must_check_layout = true
return parser
class Parser
+init = (self, init, accept, output):
self.chart = self.first = {}
self.init = init
self.accept = accept
self.output = output
self.lno = null # Previous lno
self.indent = 0
self.must_check_layout = false
step = (self, term, token, start=null, stop=null):
if self.lno != start.lno # Record indentation level
self.lno = start.lno
self.indent = start.col
init = self.init
current = []
leims = {}
transitions = {}
nodes = {}
output = []
bottom = SPPF(start, stop, token, null, self.indent)
# If a layout check is necessary, it happens here.
# Starting from the edges in self.chart[term], do a DFS
# through the chart, trying to find a position where the
# layout is satisfied.
# Completions proceed in a non-deterministic manner
# until everything has been completed.
edges = self.chart[term]
if self.must_check_layout
edges = filter_by_layout(edges, term, bottom)
assert edges.length > 0
"layout violation at line " ++ start.lno.to_string() ++ ", come up with better error message"
self.must_check_layout = false
shift_eims(current, nodes, edges, bottom, init.right_recursive, leims)
for eim in current
# reduction
cc = nodes[eim]
if eim.is_completed()
shift_eims(current, nodes, eim.origin.get(eim.rule.lhs, []), cc, init.right_recursive, leims)
if eim.rule.lhs == self.accept and eim.origin == self.first
output.append(cc)
prediction(current, nodes, init.grammar, transitions, eim.postdot())
cache_transitions(transitions, eim, cc, leims)
if isinstance(eim.postdot(), Nonterminal) and eim.postdot_constraint()
self.must_check_layout = true
self.chart = transitions
self.output = output
accepted = property();
get = (self):
return self.output.length > 0
expect = property();
get = (self):
return self.chart.keys()
expecting = (self, symbol):
return symbol in self.chart
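# A sketch of driving the parser defensively with these helpers
# (term, token, start, stop and source are hypothetical):
#   if parser.expecting(term)
#       parser.step(term, token, start, stop)
#   else
#       raise SyntaxErrorExpected(parser.expect, start, source)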
traverse = (self, postorder_cb,
blank_cb=make_default_blank(self, postorder_cb),
resolve_ambiguity=self.default_ambiguity_resolution):
if self.output.length > 1
# This is really weird in the current context. The whole
# ambiguity resolution scheme should probably be rethought.
sppf = resolve_ambiguity(null, self.output)
else
sppf = self.output[0]
if isinstance(sppf, SPPF) and sppf.cell == null
return blank_cb(self.accept.getsym())
res = traverse_sppf([sppf], postorder_cb, blank_cb, resolve_ambiguity)
assert res.length == 1, "broken parse traverse"
return res[0]
default_ambiguity_resolution = (self, sppf):
raise Ambiguity(sppf)
make_default_blank = (parser, postorder_cb):
blank_cb = (symbol):
blanks = parser.init.blankset[symbol]
if blanks.length != 1
raise Exception("default_blank ambiguity")
cell = blanks[0]
return postorder_cb(expand(null, null, cell, blank_cb, iter([]))...)
return blank_cb
prediction = (current, nodes, grammar, transitions, postdot):
if isinstance(postdot, Nonterminal)
for rule in grammar.get(postdot, [])
eim = EIM(rule, 0, transitions)
if eim not in nodes
nodes[eim] = null
current.append(eim)
cache_transitions = (transitions, eim, cc, leims):
if not eim.is_completed()
postdot = eim.postdot()
trans = object();
eim = eim
cc = cc
leo = null
try
transitions[postdot].append(trans)
except KeyError as _
if eim.rule in leims # If the item is not postdot unique, then
trans.leo = leims[eim.rule] # .leo is never read anyway.
transitions[postdot] = [trans]
filter_by_layout = (edges, symbol, cc):
out = []
for trans in edges
if check_layout(trans, cc, set([symbol]))
out.append(trans)
return out
check_layout = (trans, cc, visited):
if trans.eim.is_confirmed()
return trans.eim.check_postdot_constraint(trans.cc, cc)
else
lhs = trans.eim.rule.lhs
return false if lhs in visited
visited.add(lhs)
edges = trans.eim.origin.get(lhs, [])
for trans in edges
return true if check_layout(trans, cc, visited)
return edges.length == 0 # Covers the case where the transition has no edges.
# That should only happen when we reach the accept symbol.
# TODO: The ideas and the approach we take with LEO items should be
# reviewed again, so that we can confirm there are no bugs there.
shift_eims = (current, nodes, edges, cc, right_recursive, leims):
if is_leo_eligible(edges, right_recursive)
trans = edges[0]
if trans.leo
if trans.eim.check_postdot_constraint(trans.cc, cc)
link = LEOLink(trans.leo.link, trans.eim.rule, trans.cc)
leims[trans.eim.rule] = object();
trans = trans.leo.trans
link = link
eim = trans.leo.trans.eim.next()
assert eim not in nodes
"assumption that a postdot unique eim does not appear twice"
nodes[eim] = LEO(link, cc, trans.cc.start, trans.cc.indent)
current.append(eim)
else
leims[trans.eim.rule] = object();
trans = trans
link = LEOLink(null, trans.eim.rule, trans.cc)
shift_eim(current, nodes, trans.eim, trans.cc, cc)
else
for trans in edges
shift_eim(current, nodes, trans.eim, trans.cc, cc)
is_leo_eligible = (edges, right_recursive):
if edges.length != 1 # must be postdot-unique
return false
eim = edges[0].eim
return eim.rule in right_recursive and eim.pos == eim.rule.rhs.length - 1 #quasi-complete
shift_eim = (current, nodes, eim, bb, cc):
# We have to prevent Link buildup if the condition is unsatisfied.
return if not eim.check_postdot_constraint(bb, cc)
eim = eim.next()
try
sppf = nodes[eim]
if bb
start = bb.start
else
start = cc.start
assert start == sppf.start
"sppf tree corruption (parsing bug)"
sppf.insert(bb, cc)
except KeyError as _
if bb
start = bb.start
indent = bb.indent
else
start = cc.start
indent = cc.indent
nodes[eim] = sppf = SPPF(start, cc.stop, eim.rule, Link(bb, cc), indent)
current.append(eim)
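# An Earley item (EIM): a grammar rule, a dot position inside its
# rhs, and the chart column (origin) where recognition of the rule
# started. The item is completed once the dot reaches the end.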
class EIM
+init = (self, rule, pos, origin):
self.rule = rule
self.pos = pos
self.origin = origin
# assert 0 <= pos <= rule.rhs.length
postdot = (self):
if self.pos < self.rule.rhs.length
return self.rule.rhs[self.pos].getsym()
return null
postdot_constraint = (self):
if self.pos < self.rule.rhs.length
x = self.rule.rhs[self.pos]
if isinstance(x, Condition)
return x.constraints
return null
check_postdot_constraint = (self, bb, cc):
constraints = self.postdot_constraint()
if constraints
for fn in constraints
if not fn(bb, cc)
return false
return true
next = (self):
if self.postdot()
return EIM(self.rule, self.pos + 1, self.origin)
return null
penult = (self):
if self.pos + 1 == self.rule.rhs.length
return self.postdot()
is_predicted = (self):
return self.pos == 0
is_confirmed = (self):
return self.pos > 0
is_completed = (self):
return self.pos == self.rule.rhs.length
+hash = (self):
return hash([self.rule, self.pos, self.origin])
# Sometimes to resolve bugs, we need to see what's going on.
+repr = (self):
return repr(self.origin) ++ ":" ++
repr(self.pos) ++
":" ++ repr(self.rule)
# # TODO: String formatting
# # if isinstance(self.rule, Rule):
# # lhs = repr(self.rule.lhs)
# # pre = ' '.join(map(repr, self.rule.rhs[:self.pos]))
# # pos = ' '.join(map(repr, self.rule.rhs[self.pos:]))
# # return "{} -> {} * {} : {}".format(lhs, pre, pos, self.origin)
# # return object.__repr__(self)
#
%"=="[[EIM, EIM]] = (a, b):
if a.rule != b.rule
return false
if a.origin != b.origin
return false
if a.pos != b.pos
return false
return true
class LEO
+init = (self, left, cc, start, indent):
self.left = left
self.cc = cc
self.start = start
self.indent = indent
stop = property();
get = (self):
return self.cc.stop
to_sppf = (self):
left = self.left
cc = self.cc
while left
bb = left.sppf
if bb
start = bb.start
indent = bb.indent
else
start = cc.start
indent = cc.indent
cc = SPPF(start, cc.stop, left.rule, Link(bb, cc), indent)
left = left.left
return cc
class LEOLink
+init = (self, left, rule, sppf):
self.left = left
self.rule = rule
self.sppf = sppf
class SPPF # Shared packed parse forest
+init = (self, start, stop, cell, link, indent):
self.start = start
self.stop = stop
self.cell = cell
self.link = link
self.indent = indent # TODO: consider whether this belongs into the location data.
to_sppf = (self):
return self
is_leaf = (self):
return self.link == null
insert = (self, left, right):
if self.link == null
self.link = Link(left, right)
return self.link
link = self.link
while true
if link.left == left and link.right == right
return link
if link.link == null
link.link = Link(left, right)
return link.link
link = link.link
single = (self):
result = []
link = self.link
while link.left
if link.link
return null
result.append(link.right)
link = link.left.link
if link.link # Fixed the samples/grammar_bug_0
return null
result.append(link.right)
result.reverse()
return result
+iter = (self):
# TODO: should probably be incremental?
output = []
finger = []
# To produce all parses, the sppf is walked with a 'finger' of pending links.
link = self.link
while finger.length > 0 or link
while link.left
finger.append(link)
link = link.left.link
# Now the link contains the head, while the tail is in the finger list.
while link
result = [link.right]
for x in reversed(finger)
result.append(x.right)
output.append(result)
link = link.link
# Now some portion of the finger is already iterated, and should be removed.
while finger.length > 0 and not link
link = finger.pop().link
return iter(output)
## TODO: add string formatter to lever
## return "[{}:{}] {}".format(self.start, self.stop, self.cell)
class Link
+init = (self, left, right, link=null):
self.left = left
self.right = right
self.link = link
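# Iterative postorder traversal of the forest: 'stack' holds nodes
# still to visit, 'sstack' collects finished results and 'rstack'
# remembers partially reduced rules. postorder_cb is called once
# per completed rule with the expanded fields as its arguments.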
traverse_sppf = (stack, postorder_cb, blank_cb, resolve_ambiguity):
rcount = 1
sstack = []
rstack = []
while stack.length > 0
sppf = stack.pop().to_sppf()
if sppf.is_leaf()
sstack.append(sppf.cell)
rcount -= 1
else
result = sppf.single()
if result == null
result = resolve_ambiguity(sppf, ambiguity_traverser(sppf,
postorder_cb, blank_cb, resolve_ambiguity))
if isinstance(result, Resolve)
sstack.append(result.value)
rcount -= 1
else
rstack.append(object();
rcount = rcount - 1
rlen = result.length
sppf = sppf)
rcount = result.length
stack.extend(reversed(result))
while rcount == 0 and rstack.length > 0
s = rstack.pop()
rcount = s.rcount
rlen = s.rlen
sppf = s.sppf
a = []
for i in range(rlen)
a.append(sstack.pop(sstack.length+i-rlen))
# TODO: Here we do not really identify where the blank rule appears.
# That feature could be really useful sometimes.
# That information is available in the sppf.
sstack.append(postorder_cb(expand(
sppf.start, sppf.stop, sppf.cell, blank_cb, iter(a))...))
sstack.reverse() # won't hurt.
return sstack
ambiguity_traverser = (sppf, postorder_cb, blank_cb, resolve_ambiguity):
return (stack):
seq = traverse_sppf(stack,
postorder_cb,
blank_cb,
resolve_ambiguity)
return postorder_cb(expand(
sppf.start, sppf.stop, sppf.cell, blank_cb, iter(seq))...)
class Resolve
+init = (self, value):
self.value = value
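# expand reconstructs the original rhs of an NNF-annotated rule:
# fields marked present are drawn from 'seq', and the erased
# nullable fields are filled in by the blank_callback. For example,
# if the original rhs was [x, B, y] and B was erased, the results
# for x and y come from 'seq' and blank_callback(B) supplies the rest.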
expand = (start, stop, cell, blank_callback, seq):
if isinstance(cell.annotation, NNF)
nnf = cell.annotation
result = []
i = 0
for p in nnf.present
if p
result.append(seq.next())
else
result.append(blank_callback(nnf.rule.rhs[i].getsym()))
i += 1
return [nnf.rule, result, start, stop]
return [cell, list(seq), start, stop]
# TODO: 'format_origin' depends on the lno/col positioning information.
# We may want to change this, as parsing is not necessarily constrained
# to text files.
class SyntaxError extends Exception
+init = (self, message, location, source, at_eof=false):
self.message = message
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
return format_origin(self.source, self.location, self.message)
class Ambiguity extends SyntaxError
+init = (self, sppf):
self.sppf = sppf
self.traceback = null
+repr = (self):
start = self.sppf.start
stop = self.sppf.stop
msg = ["Ambiguity in " ++ format_loc(start) ++ "-" ++ format_loc(stop)]
sppf_group = set([self.sppf])
while sppf_group.length == 1
sppf = sppf_group.pop()
rows = list(sppf)
col_count = null
col_count = min(col_count, row.length) for row in rows
for j in range(col_count)
items = set()
items.add(rows[i][j]) for i in range(rows.length)
if items.length > 1
sppf_group = items
break
if sppf_group.length == 0
sppf_group.add(sppf)
for sppf in sppf_group
for row in sppf
cols = []
for item in row
item = item.to_sppf()
start = item.start
stop = item.stop
if isinstance(item.cell, Rule)
cols.append("(" ++ format_loc(start) ++ "-" ++ format_loc(stop) ++ ")")
cols.append(repr(item.cell.lhs))
else
cols.append(repr(item.cell))
msg.append(" ".join(cols))
return "\n".join(msg)
class SyntaxErrorExpected extends SyntaxError
+init = (self, expect, location, source, at_eof=false):
self.expect = list(expect)
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
msg = [format_origin(self.source, self.location, " expected some of:")]
expect = []
for e in self.expect
if e.name
expect.append(e)
expect.sort(symbol_lt)
for symbol in expect
msg.append(" " ++ symbol.name)
return "\n".join(msg)
class SyntaxErrorExpected2 extends SyntaxError
+init = (self, value, expect, location, source, at_eof=false):
self.value = value
self.expect = list(expect)
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
msg = [format_origin(self.source, self.location, " expected some of:")]
if self.value
msg.insert(0, "got:" ++ repr(self.value))
expect = []
for e in self.expect
if e.name
expect.append(e)
expect.sort(symbol_lt)
for symbol in expect
msg.append(" " ++ symbol.name)
return "\n".join(msg)
format_origin = (source, location, message=null):
loc = [repr(location.lno), repr(location.col)]
if message
loc.append(message)
if isinstance(source, path)
loc.insert(0, source.to_string())
elif source
loc.insert(0, source)
else
loc.insert(0, "")
return ":".join(loc)
format_loc = (location):
return repr(location.lno) ++ ":" ++ repr(location.col)
symbol_lt = multimethod(2)
symbol_lt[[Terminal, Nonterminal]] = (a, b):
return false
symbol_lt[[Nonterminal, Terminal]] = (a, b):
return true
symbol_lt[[Nonterminal, Nonterminal]] = (a, b):
return a.name < b.name
symbol_lt[[Terminal, Terminal]] = (a, b):
return a.name < b.name
# There used to be an indent parser here; it was removed
# because it was no longer needed.
# TODO: Remove this one as well, once grammar.lc doesn't need it anymore.
class IndentParser
+init = (self, pos=tokenizer.Position(0, 1), indent=null, dedent=null, newline=null):
self.stack = []
self.level = pos.col
self.line = pos.lno
self.indent = indent
self.dedent = dedent
self.newline = newline
step = (self, parser, pos, source):
if self.line < pos.lno
while pos.col < self.level and parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
if pos.col < self.level
raise SyntaxError("uneven indent", pos, source)
if pos.col == self.level and parser.expecting(self.newline)
parser.step(self.newline, null, pos, pos)
if pos.col > self.level and parser.expecting(self.indent)
parser.step(self.indent, null, pos, pos)
self.stack.append(self.level)
self.level = pos.col
self.line = pos.lno
# This can be used to force out dedents if the parsing cannot
# continue otherwise. Note, though, that this function, and to
# some extent the whole indent parser in its current form,
# biases the parse toward certain interpretations of the input.
slip = (self, parser, pos, source):
while parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
# For most languages it is a bug if this function returns false.
finish = (self, parser, pos):
while self.stack.length > 0 and parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
return self.stack.length == 0
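# A sketch of how the IndentParser might be driven from a token
# stream (the token fields and the indent/dedent/newline terminals
# are hypothetical):
#   ip = IndentParser(indent=indent_t, dedent=dedent_t, newline=newline_t)
#   for token in tokens
#       ip.step(parser, token.start, source)
#       parser.step(token.symbol, token.string, token.start, token.stop)
#   ip.finish(parser, eof_position)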