import tokenizer
doc = path("doc:/chartparser")
# The upgraded chart parser is layout-sensitive without
# needing preprocessing in the token stream.
# Therefore the tokenizer no longer needs any feedback
# from the parser to parse such languages.
# This example is too simple to trigger any of the potential
# bugs in the grammar, but it's enough to give an idea of the usage.
main = ():
file = Nonterminal('file')
symbol = Terminal('symbol')
on_indent = (bb, cc):
return bb.indent < cc.start.col and cc.start.col == cc.indent
on_newline = (bb, cc):
return bb.indent == cc.start.col and cc.start.col == cc.indent
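# For example, "hello" starting at (lno=0, col=0) followed by
# "world" at (lno=1, col=2) satisfies on_indent: the indent of
# the first token (0) is below the start column of the second
# (2), and "world" opens its own line, so its col equals its
# indent.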
user_grammar = [
Rule(file, [symbol, Condition(symbol, [on_indent])]),
Rule(file, [symbol, Condition(symbol, [on_newline])]),
]
parsergen = preprocess(user_grammar, file)
parser = parsergen()
parser.step(symbol, "hello", (lno=0, col=0), (lno=0, col=5))
parser.step(symbol, "world", (lno=1, col=2), (lno=1, col=7))
print("accepted?", parser.accepted)
print(repr(parser.traverse(
((x, a):
return a),
(x):
return "")))
# The interface for constructing a grammar consists of
# rules, terminals, nonterminals and conditions.
# Rules have a nonterminal on the left side and a
# list of nonterminals and terminals on the right side.
# The rules may be annotated to help with reading parse trees.
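# For example, Rule(file, [symbol, symbol]) prints as
# "file -> @symbol @symbol" under the +repr conventions below.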
class Rule
+init = (self, lhs, rhs, annotation=null):
self.lhs = lhs
self.rhs = rhs
self.annotation = annotation
+repr = (self):
rhs = []
for item in self.rhs
rhs.append(repr(item))
out = repr(self.lhs) ++ " -> " ++ " ".join(rhs)
return out
# Earlier I did not separate terminals from
# non-terminals because it was not strictly
# necessary. That turned out to be confusing
# when designing grammars.
class Terminal
+init = (self, name):
self.name = name
+repr = (self):
if self.name
return "@" ++ self.name
return "<Terminal>"
# .getsym makes it easier to extract the symbol from an rhs item.
getsym = (self):
return self
class Nonterminal
+init = (self, name):
self.name = name
+repr = (self):
if self.name
return self.name
return "<Nonterminal>"
getsym = (self):
return self
class Condition
+init = (self, symbol, constraints):
self.symbol = symbol
self.constraints = constraints
getsym = (self):
return self.symbol
+repr = (self):
cons = []
for c in self.constraints
cons.append(repr(c))
return "{" ++ ", ".join(cons) ++ "}" ++ repr(self.symbol)
# To reduce the work during parsing, the grammar is preprocessed:
# the user calls preprocess(user_grammar, accept_symbol)
# and gets an initiator back. The initiator can be used to
# initialize parsers.
# The preprocessing transforms the grammar into NNF, which
# divides the grammar into null and non-null rules.
# The null rules are not used in the middle of parsing, but
# they are useful when interpreting the parse tree.
# TODO: It would be preferable to allow serialization of the
# preprocessed grammar and the annotations attached to it.
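# For example, if b is nullable, the rule  a -> b c  expands into
# the two NNF rules  a -> b c  and  a -> c. The variant without b
# records through its NNF annotation which fields were dropped, so
# the parse tree can be rebuilt with the blanks filled back in.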
preprocess = (user_grammar, default_accept):
nullable = find_nullable(user_grammar)
grammar = {}
blankset = {}
rules = build_nnf(user_grammar, nullable)
for rule in rules
if rule.rhs.length == 0
try
blankset[rule.lhs].append(rule)
except KeyError as _
blankset[rule.lhs] = [rule]
else
try
grammar[rule.lhs].append(rule)
except KeyError as _
grammar[rule.lhs] = [rule]
right_recursive = find_right_recursive(rules)
return Initiator(grammar, blankset,
right_recursive, default_accept)
# Earley-based parsing would suffer from nullable rules.
# The parsing step ends up being simple when the grammar
# does not contain any of them, so they are rewritten away.
# The result is a "nihilist normal form" (NNF).
# Further reasoning about this can be found in the paper
# "Practical Earley Parsing" by Aycock & Horspool.
find_nullable = (grammar):
nullable = set()
queue = []
new_nullable = (symbol):
if symbol not in nullable
nullable.add(symbol)
queue.append(symbol)
inverse_lookup = {}
new_lookup = (index, symbol):
try
inverse_lookup[symbol].append(index)
except KeyError as _
inverse_lookup[symbol] = [index]
nonterminals = []
nonnullables = []
for rule in grammar
if rule.rhs.length == 0
new_nullable(rule.lhs)
elif all_nonterminals(rule.rhs)
index = nonnullables.length
for x in rule.rhs
x = x.getsym()
if x != rule.lhs
new_lookup(index, x)
nonterminals.append(rule.lhs)
nonnullables.append(count_nonrec(rule))
for n in queue
for i in inverse_lookup.get(n, [])
nonnullables[i] -= 1
if nonnullables[i] == 0
new_nullable(nonterminals[i])
return nullable
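# For example, given  a -> (empty)  and  b -> a a, the first rule
# marks a nullable directly, and the counting pass then drains b's
# nonnullable count from two to zero, marking b nullable as well.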
all_nonterminals = (rhs):
for x in rhs
if not isinstance(x.getsym(), Nonterminal)
return false
return true
all_nullable = (rhs, nullable):
for x in rhs
if x.getsym() not in nullable
return false
return true
count_nonrec = (rule):
s = 0
for x in rule.rhs
s += int(x.getsym() != rule.lhs)
return s
# Going through the n bits of a binary number produces every
# combination of a field being present or absent.
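# For example, with two nullable fields the indices 0..3
# (0b00..0b11) select every combination of keeping or dropping
# those two fields.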
build_nnf = (grammar, nullable):
result = []
for rule in grammar
order = 0
for x in rule.rhs
order += int(x.getsym() in nullable)
for i in range(1 << order)
result.append(nihilist_rule(rule, i, nullable))
return result
nihilist_rule = (rule, index, nullable):
present = []
rhs = []
for x in rule.rhs
shift = true
if x.getsym() in nullable
if index & 1 == 0
shift = false
index >>= 1
present.append(shift)
if shift
rhs.append(x)
return Rule(rule.lhs, rhs, NNF(rule, present))
# The nihilist normal form rules are annotated with NNF nodes.
class NNF
+init = (self, rule, present):
self.rule = rule # the original rule
self.present = present # tells which fields are present in this rule.
# Conditions on whether an item is leo-eligible:
# its rule is right recursive
# it is quasi-complete
# it is postdot-unique
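# find_right_recursive precomputes the first condition;
# is_leo_eligible further below checks postdot-uniqueness and
# quasi-completeness at parse time.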
find_right_recursive = (grammar):
edges = []
for rule in grammar
if rule.rhs.length > 0
right = rule.rhs[rule.rhs.length - 1]
row = []
for other in grammar
row.append(other.lhs == right.getsym())
edges.append(row)
else
row = []
for other in grammar
row.append(false)
edges.append(row)
warshall_transitive_closure(edges)
right_recursive = set()
i = 0
for rule in grammar
if edges[i][i] and rule.rhs.length >= 2 # Excluding rules that have only one rhs symbol.
right_recursive.add(rule) # Leo items caused problems if the rule was a prediction.
i += 1
return right_recursive
warshall_transitive_closure = (a):
n = a.length
for k in range(n)
for i in range(n)
if not a[i][k]
continue
for j in range(n)
if not a[k][j]
continue
a[i][j] = true
return a
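# For example, edges a -> b and b -> c produce a -> c in the
# closure. A true diagonal entry edges[i][i] then means rule i can
# reach itself through a chain of rightmost symbols, i.e. it is
# right recursive.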
# The nullable set presents the same information as the blankset,
# so we can discard it.
class Initiator
+init = (self, grammar, blankset, right_recursive, default_accept):
self.grammar = grammar
self.blankset = blankset
self.right_recursive = right_recursive
self.right_recursive = set() # disable LEO
self.default_accept = default_accept
# TODO: Fix up the performance issue in REPL startup by allowing the
# caching of a grammar.
+call = (self, accept=self.default_accept):
parser = Parser(self, accept, [])
# In an Earley parser that uses NNF, empty input is a special case that is taken care of here.
if accept in self.blankset
parser.output.append(SPPF(null, null, null, null, 0))
# The first chart column
nodes = {}
current = []
leims = {}
prediction(current, nodes, self.grammar, parser.chart, accept)
for eim in current
prediction(current, nodes, self.grammar, parser.chart, eim.postdot())
cache_transitions(parser.chart, eim, null, leims)
if isinstance(eim.postdot(), Nonterminal) and eim.postdot_constraint()
parser.must_check_layout = true
return parser
class Parser
+init = (self, init, accept, output):
self.chart = self.first = {}
self.init = init
self.accept = accept
self.output = output
self.lno = null # Previous lno
self.indent = 0
self.must_check_layout = false
step = (self, term, token, start=null, stop=null):
if self.lno != start.lno # Record indentation level
self.lno = start.lno
self.indent = start.col
init = self.init
current = []
leims = {}
transitions = {}
nodes = {}
output = []
bottom = SPPF(start, stop, token, null, self.indent)
# If a layout check is necessary, it happens here: starting
# from the edges in self.chart[term], do a DFS through the
# chart, trying to find a position where the layout is
# satisfied. Completions proceed in a non-deterministic
# manner until everything has been completed.
edges = self.chart[term]
if self.must_check_layout
edges = filter_by_layout(edges, term, bottom)
assert edges.length > 0
"layout violation at line " ++ start.lno.to_string() ++ ", come up with better error message"
self.must_check_layout = false
shift_eims(current, nodes, edges, bottom, init.right_recursive, leims)
for eim in current
# reduction
cc = nodes[eim]
if eim.is_completed()
shift_eims(current, nodes, eim.origin.get(eim.rule.lhs, []), cc, init.right_recursive, leims)
if eim.rule.lhs == self.accept and eim.origin == self.first
output.append(cc)
prediction(current, nodes, init.grammar, transitions, eim.postdot())
cache_transitions(transitions, eim, cc, leims)
if isinstance(eim.postdot(), Nonterminal) and eim.postdot_constraint()
self.must_check_layout = true
self.chart = transitions
self.output = output
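# In summary, a step consumes one terminal: scan the items waiting
# for it, then run completions and predictions until the next
# chart column (transitions) is fully built.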
accepted = property();
get = (self):
return self.output.length > 0
expect = property();
get = (self):
return self.chart.keys()
expecting = (self, symbol):
return symbol in self.chart
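# A typical pattern is to probe before stepping, as the indent
# parser at the bottom of this file does:
#     if parser.expecting(symbol)
#         parser.step(symbol, token, start, stop)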
traverse = (self, postorder_cb,
blank_cb=make_default_blank(self, postorder_cb),
resolve_ambiguity=self.default_ambiguity_resolution):
if self.output.length > 1
# This is really weird in the current context. I should probably
# rethink this whole ambiguity resolution thing.
sppf = resolve_ambiguity(null, self.output)
else
sppf = self.output[0]
if isinstance(sppf, SPPF) and sppf.cell == null
return blank_cb(self.accept.getsym())
res = traverse_sppf([sppf], postorder_cb, blank_cb, resolve_ambiguity)
assert res.length == 1, "broken parse traverse"
return res[0]
default_ambiguity_resolution = (self, sppf):
raise Ambiguity(sppf)
make_default_blank = (parser, postorder_cb):
blank_cb = (symbol):
blanks = parser.init.blankset[symbol]
if blanks.length != 1
raise Exception("default_blank ambiguity")
cell = blanks[0]
return postorder_cb(expand(null, null, cell, blank_cb, iter([]))...)
return blank_cb
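# The default blank callback synthesizes a value for a symbol that
# matched the empty string by expanding its unique null rule
# through the same postorder callback.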
prediction = (current, nodes, grammar, transitions, postdot):
if isinstance(postdot, Nonterminal)
for rule in grammar.get(postdot, [])
eim = EIM(rule, 0, transitions)
if eim not in nodes
nodes[eim] = null
current.append(eim)
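# cache_transitions indexes an incomplete item by its postdot
# symbol so that a later completion of that symbol can find it.
# Each transition records the waiting item (eim), the partial
# parse built so far (cc) and an optional leo item for the
# right-recursion optimization.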
cache_transitions = (transitions, eim, cc, leims):
if not eim.is_completed()
postdot = eim.postdot()
trans = object();
eim = eim
cc = cc
leo = null
try
transitions[postdot].append(trans)
except KeyError as _
if eim.rule in leims # If the item is not postdot unique, then
trans.leo = leims[eim.rule] # .leo is never read anyway.
transitions[postdot] = [trans]
filter_by_layout = (edges, symbol, cc):
out = []
for trans in edges
if check_layout(trans, cc, set([symbol]))
out.append(trans)
return out
check_layout = (trans, cc, visited):
if trans.eim.is_confirmed()
return trans.eim.check_postdot_constraint(trans.cc, cc)
else
lhs = trans.eim.rule.lhs
return false if lhs in visited
visited.add(lhs)
edges = trans.eim.origin.get(lhs, [])
for trans in edges
return true if check_layout(trans, cc, visited)
return edges.length == 0 # Covers the case where the transition has no edges.
# That should only happen when we reach the accept symbol.
# TODO: The ideas and the approach we take with LEO items should be
# reviewed again, so that we can confirm there are no bugs there.
shift_eims = (current, nodes, edges, cc, right_recursive, leims):
if is_leo_eligible(edges, right_recursive)
trans = edges[0]
if trans.leo
if trans.eim.check_postdot_constraint(trans.cc, cc)
link = LEOLink(trans.leo.link, trans.eim.rule, trans.cc)
leims[trans.eim.rule] = object();
trans = trans.leo.trans
link = link
eim = trans.leo.trans.eim.next()
assert eim not in nodes
"assumption that a postdot unique eim does not appear twice"
nodes[eim] = LEO(link, cc, trans.cc.start, trans.cc.indent)
current.append(eim)
else
leims[trans.eim.rule] = object();
trans = trans
link = LEOLink(null, trans.eim.rule, trans.cc)
shift_eim(current, nodes, trans.eim, trans.cc, cc)
else
for trans in edges
shift_eim(current, nodes, trans.eim, trans.cc, cc)
is_leo_eligible = (edges, right_recursive):
if edges.length != 1 # must be postdot-unique
return false
eim = edges[0].eim
return eim.rule in right_recursive and eim.pos == eim.rule.rhs.length - 1 # quasi-complete
shift_eim = (current, nodes, eim, bb, cc):
# We have to prevent Link buildup if the condition is unsatisfied.
return if not eim.check_postdot_constraint(bb, cc)
eim = eim.next()
try
sppf = nodes[eim]
if bb
start = bb.start
else
start = cc.start
assert start == sppf.start
"sppf tree corruption (parsing bug)"
sppf.insert(bb, cc)
except KeyError as _
if bb
start = bb.start
indent = bb.indent
else
start = cc.start
indent = cc.indent
nodes[eim] = sppf = SPPF(start, cc.stop, eim.rule, Link(bb, cc), indent)
current.append(eim)
class EIM
+init = (self, rule, pos, origin):
self.rule = rule
self.pos = pos
self.origin = origin
# assert 0 <= pos <= len(rule)
postdot = (self):
if self.pos < self.rule.rhs.length
return self.rule.rhs[self.pos].getsym()
return null
postdot_constraint = (self):
if self.pos < self.rule.rhs.length
x = self.rule.rhs[self.pos]
if isinstance(x, Condition)
return x.constraints
return null
check_postdot_constraint = (self, bb, cc):
constraints = self.postdot_constraint()
if constraints
for fn in constraints
if not fn(bb, cc)
return false
return true
next = (self):
if self.postdot()
return EIM(self.rule, self.pos + 1, self.origin)
return null
penult = (self):
if self.pos + 1 == self.rule.rhs.length
return self.postdot()
is_predicted = (self):
return self.pos == 0
is_confirmed = (self):
return self.pos > 0
is_completed = (self):
return self.pos == self.rule.rhs.length
+hash = (self):
return hash([self.rule, self.pos, self.origin])
# Sometimes to resolve bugs, we need to see what's going on.
+repr = (self):
return repr(self.origin) ++ ":" ++
repr(self.pos) ++
":" ++ repr(self.rule)
# # TODO: String formatting
# # if isinstance(self.rule, Rule):
# # lhs = repr(self.rule.lhs)
# # pre = ' '.join(map(repr, self.rule.rhs[:self.pos]))
# # pos = ' '.join(map(repr, self.rule.rhs[self.pos:]))
# # return "{} -> {} * {} : {}".format(lhs, pre, pos, self.origin)
# # return object.__repr__(self)
#
%"=="[[EIM, EIM]] = (a, b):
if a.rule != b.rule
return false
if a.origin != b.origin
return false
if a.pos != b.pos
return false
return true
class LEO
+init = (self, left, cc, start, indent):
self.left = left
self.cc = cc
self.start = start
self.indent = indent
stop = property();
get = (self):
return self.cc.stop
to_sppf = (self):
left = self.left
cc = self.cc
while left
bb = left.sppf
if bb
start = bb.start
indent = bb.indent
else
start = cc.start
indent = cc.indent
cc = SPPF(start, cc.stop, left.rule, Link(bb, cc), indent)
left = left.left
return cc
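# to_sppf unfolds the chain of leo links back into ordinary SPPF
# nodes, rebuilding the spine that the leo optimization skipped
# during parsing.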
class LEOLink
+init = (self, left, rule, sppf):
self.left = left
self.rule = rule
self.sppf = sppf
class SPPF # Shared packed parse forest
+init = (self, start, stop, cell, link, indent):
self.start = start
self.stop = stop
self.cell = cell
self.link = link
self.indent = indent # TODO: consider whether this belongs into the location data.
to_sppf = (self):
return self
is_leaf = (self):
return self.link == null
insert = (self, left, right):
if self.link == null
self.link = Link(left, right)
return self.link
link = self.link
while true
if link.left == left and link.right == right
return link
if link.link == null
link.link = Link(left, right)
return link.link
link = link.link
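# single returns the list of children when this node has exactly
# one derivation; null signals an ambiguity the caller must
# resolve.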
single = (self):
result = []
link = self.link
while link.left
if link.link
return null
result.append(link.right)
link = link.left.link
if link.link # Fixed the samples/grammar_bug_0
return null
result.append(link.right)
result.reverse()
return result
+iter = (self):
# TODO: should probably be incremental?
output = []
finger = []
# To produce all parses, the sppf is walked through with the finger list.
link = self.link
while finger.length > 0 or link
while link.left
finger.append(link)
link = link.left.link
# Now the link contains the head, while the tail is in the finger list.
while link
result = [link.right]
for x in reversed(finger)
result.append(x.right)
output.append(result)
link = link.link
# Now some portion of the finger is already iterated, and should be removed.
while finger.length > 0 and not link
link = finger.pop().link
return iter(output)
## TODO: add string formatter to lever
## return "[{}:{}] {}".format(self.start, self.stop, self.cell)
class Link
+init = (self, left, right, link=null):
self.left = left
self.right = right
self.link = link
traverse_sppf = (stack, postorder_cb, blank_cb, resolve_ambiguity):
rcount = 1
sstack = []
rstack = []
while stack.length > 0
sppf = stack.pop().to_sppf()
if sppf.is_leaf()
sstack.append(sppf.cell)
rcount -= 1
else
result = sppf.single()
if result == null
result = resolve_ambiguity(sppf, ambiguity_traverser(sppf,
postorder_cb, blank_cb, resolve_ambiguity))
if isinstance(result, Resolve)
sstack.append(result.value)
rcount -= 1
else
rstack.append(object();
rcount = rcount - 1
rlen = result.length
sppf = sppf)
rcount = result.length
stack.extend(reversed(result))
while rcount == 0 and rstack.length > 0
s = rstack.pop()
rcount = s.rcount
rlen = s.rlen
sppf = s.sppf
a = []
for i in range(rlen)
a.append(sstack.pop(sstack.length+i-rlen))
# TODO: Here we do not really identify where the blank rule appears.
# That feature could be really useful sometimes.
# That information is available in the sppf.
sstack.append(postorder_cb(expand(
sppf.start, sppf.stop, sppf.cell, blank_cb, iter(a))...))
sstack.reverse() # won't hurt.
return sstack
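# traverse_sppf drives postorder_cb bottom-up: for every internal
# node it calls postorder_cb(cell, children, start, stop),
# matching the argument list that expand below produces and
# spreads with "...".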
ambiguity_traverser = (sppf, postorder_cb, blank_cb, resolve_ambiguity):
return (stack):
seq = traverse_sppf(stack,
postorder_cb,
blank_cb,
resolve_ambiguity)
return postorder_cb(expand(
sppf.start, sppf.stop, sppf.cell, blank_cb, iter(seq))...)
class Resolve
+init = (self, value):
self.value = value
expand = (start, stop, cell, blank_callback, seq):
if isinstance(cell.annotation, NNF)
nnf = cell.annotation
result = []
i = 0
for p in nnf.present
if p
result.append(seq.next())
else
result.append(blank_callback(nnf.rule.rhs[i].getsym()))
i += 1
return [nnf.rule, result, start, stop]
return [cell, list(seq), start, stop]
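# For example, when the NNF rule  a -> c  was derived from
# a -> b c, present is [false, true], so the missing b is filled
# in with blank_callback(b) and the caller sees children for the
# original rule.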
# TODO: 'format_origin' depends on the lno/col positioning information.
# We may want to change this, as parsing is not necessarily constrained
# to text files.
class SyntaxError extends Exception
+init = (self, message, location, source, at_eof=false):
self.message = message
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
return format_origin(self.source, self.location, self.message)
class Ambiguity extends SyntaxError
+init = (self, sppf):
self.sppf = sppf
self.traceback = null
+repr = (self):
start = self.sppf.start
stop = self.sppf.stop
msg = ["Ambiguity in " ++ format_loc(start) ++ "-" ++ format_loc(stop)]
sppf_group = set([self.sppf])
while sppf_group.length == 1
sppf = sppf_group.pop()
rows = list(sppf)
col_count = null
col_count = min(col_count, row.length) for row in rows
for j in range(col_count)
items = set()
items.add(rows[i][j]) for i in range(rows.length)
if items.length > 1
sppf_group = items
break
if sppf_group.length == 0
sppf_group.add(sppf)
for sppf in sppf_group
for row in sppf
cols = []
for item in row
item = item.to_sppf()
start = item.start
stop = item.stop
if isinstance(item.cell, Rule)
cols.append("(" ++ format_loc(start) ++ "-" ++ format_loc(stop) ++ ")")
cols.append(repr(item.cell.lhs))
else
cols.append(repr(item.cell))
msg.append(" ".join(cols))
return "\n".join(msg)
class SyntaxErrorExpected extends SyntaxError
+init = (self, expect, location, source, at_eof=false):
self.expect = list(expect)
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
msg = [format_origin(self.source, self.location, " expected some of:")]
expect = []
for e in self.expect
if e.name
expect.append(e)
expect.sort(symbol_lt)
for symbol in expect
msg.append(" " ++ symbol.name)
return "\n".join(msg)
class SyntaxErrorExpected2 extends SyntaxError
+init = (self, value, expect, location, source, at_eof=false):
self.value = value
self.expect = list(expect)
self.location = location
self.source = source
self.traceback = null
self.at_eof = at_eof
+repr = (self):
msg = [format_origin(self.source, self.location, " expected some of:")]
if self.value
msg.insert(0, "got:" ++ repr(self.value))
expect = []
for e in self.expect
if e.name
expect.append(e)
expect.sort(symbol_lt)
for symbol in expect
msg.append(" " ++ symbol.name)
return "\n".join(msg)
format_origin = (source, location, message=null):
loc = [repr(location.lno), repr(location.col)]
if message
loc.append(message)
if isinstance(source, path)
loc.insert(0, source.to_string())
elif source
loc.insert(0, source)
else
loc.insert(0, "")
return ":".join(loc)
format_loc = (location):
return repr(location.lno) ++ ":" ++ repr(location.col)
symbol_lt = multimethod(2)
symbol_lt[[Terminal, Nonterminal]] = (a, b):
return false
symbol_lt[[Nonterminal, Terminal]] = (a, b):
return true
symbol_lt[[Nonterminal, Nonterminal]] = (a, b):
return a.name < b.name
symbol_lt[[Terminal, Terminal]] = (a, b):
return a.name < b.name
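# This ordering sorts nonterminals before terminals and
# alphabetically by name within each kind, giving the expected
# symbol listings a stable order.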
# There used to be an indent parser here; it was removed
# because it was no longer needed.
# TODO: Remove this one too once grammar.lc doesn't need it anymore.
class IndentParser
+init = (self, pos=tokenizer.Position(0, 1), indent=null, dedent=null, newline=null):
self.stack = []
self.level = pos.col
self.line = pos.lno
self.indent = indent
self.dedent = dedent
self.newline = newline
step = (self, parser, pos, source):
if self.line < pos.lno
while pos.col < self.level and parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
if pos.col < self.level
raise SyntaxError("uneven indent", pos, source)
if pos.col == self.level and parser.expecting(self.newline)
parser.step(self.newline, null, pos, pos)
if pos.col > self.level and parser.expecting(self.indent)
parser.step(self.indent, null, pos, pos)
self.stack.append(self.level)
self.level = pos.col
self.line = pos.lno
# This can be used to force pending dedents if the parsing cannot
# continue otherwise. Note that this function, and to
# some extent the whole indent parser in its current form,
# bias the parse toward certain interpretations of the input.
slip = (self, parser, pos, source):
while parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
# Most languages have a bug if this function returns false.
finish = (self, parser, pos):
while self.stack.length > 0 and parser.expecting(self.dedent)
parser.step(self.dedent, null, pos, pos)
self.level = self.stack.pop()
return self.stack.length == 0
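# A rough usage sketch; the token objects and the eof position are
# assumed here, they are not defined in this file:
#     ip = IndentParser(pos, indent, dedent, newline)
#     for token in tokens
#         ip.step(parser, token.start, source)
#         parser.step(token.symbol, token.value, token.start, token.stop)
#     ip.finish(parser, eof_position)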