-
Notifications
You must be signed in to change notification settings - Fork 0
/
peg.py
657 lines (602 loc) · 22.5 KB
/
peg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
import operator
def gen(root_peg):
"""Return C source code for a C-Python API module providing a
matcher against root_peg. Currently a bit of the code assumes
root_peg is a JSON grammar: it builds the whitespace production
(which happens to work out to the generated name of g2) into the
toplevel parse function. Needs generalizing! More seriously, we
also can't match the null byte, we don't memoize, and there's at
least one JSON-grammar-specific assumption I've let creep in,
something like the LL(1) grammar condition: that once a sub-peg
matches the leading character of its input, then its failure to
match overall implies that the whole top-level expression must
also fail to match. I don't think it'd be *hard* to relax this
assumption, but it might pervasively affect the C code scheme.
Also, the inlining heuristic will produce code-size blowup on some
grammars; to avoid this we'd need a more conservative definition
of triviality.
On the plus side, the generated code is really fast. I think all
of the optimizations I've included have been tested to actually
speed it up in practice."""
check_for_nulls(root_peg)
# 'names' assigns names to the pegs we won't inline. We inline
# all pegs that are either trivial or referenced just once.
names = gen_names(count_em(root_peg))
nameitems = (sorted(names.items(), key=operator.itemgetter(1))
+ [(root_peg, 'root')])
protos = '\n'.join(gen_prototype(name) for peg, name in nameitems)
char_tables = CharTables()
context = Context(names, char_tables, all_chars)
functions = '\n\n'.join(gen_function(context, peg, name)
for peg, name in nameitems)
return '\n\n'.join([prelude, char_tables.gen_tables(),
protos, functions, postlude])
all_chars = frozenset(map(chr, range(0, 256)))
class Context:
"""The context of code generation. It changes as we go. It
includes the generated names of functions implementing nontrivial
productions, the set of constant tables used by char-set
membership tests, and contextual knowledge of what the next input
character may possibly be at the current point in the code
(because prior tests rule some characters out)."""
def __init__(self, names, char_tables, charset):
self.names = names
self.char_tables = char_tables
self.charset = charset
def gen(self, peg):
if peg in self.names:
return gen_call(self.names[peg], peg)
return peg.gen(self)
def get_possible_leading_chars(self):
return self.charset
def sprout(self, charset):
return Context(self.names, self.char_tables, charset)
def gen_member_test(self, charset):
return gen_member_test(charset, self.charset, self.char_tables)
def gen_advance(self):
# N.B. The correctness of this depends on two facts:
# - We always exit immediately on error.
# - A test against the null-terminator of the input string is
# always an error. We know this because check_for_nulls(root_peg).
return 'c = *++z;'
def gen_member_test(charset, context_charset, char_tables):
"""Return a C expression that's true iff the variable 'c'
is a member of charset, given we already know c is a
member of context_charset."""
test_set = charset & context_charset
if not test_set:
return '0'
elif context_charset.issubset(test_set):
return '1'
elif 1 == len(test_set):
c = list(test_set)[0]
return 'c == %s' % c_char_literal(c)
elif 1 == len(context_charset - test_set):
c = list(context_charset - test_set)[0]
return 'c != %s' % c_char_literal(c)
elif test_set == frozenset('0123456789') & context_charset:
return 'isdigit (c)'
elif test_set == frozenset('0123456789abcdefABCDEF') & context_charset:
return 'isxdigit (c)'
# TODO: check for more 'isfoo()' macros from ctype.h
elif get_nonempty_range(test_set):
# A range test is not necessarily faster in itself than a
# table test, but it conserves space for the irregular
# sets that actually need tables. (When we have fewer
# tables, accessing each one is faster.)
return gen_range_test(get_nonempty_range(test_set))
else:
return char_tables.gen_test(test_set, context_charset)
def get_nonempty_range(charset):
"""Return x,y such that charset == set(map(chr, range(x, y))),
if possible (and nonempty), else None."""
if not charset:
return None
vs = map(ord, sorted(charset))
lo, hi = vs[0], vs[-1]
if hi + 1 - lo == len(charset):
return lo, hi + 1
return None
def gen_range_test((lo, hibound)):
# TODO: generate a > test when hibound == 256
return '(unsigned)(c - %u) < %u' % (lo, hibound - lo)
class CharTables:
"""Generates tests for character-set membership by lookup in a
constant table, which this object remembers and also generates."""
def __init__(self):
self.sets = [] # Those sets that need a table for membership tests.
def gen_test(self, charset, context_charset):
"""Return a C expression that's true iff the variable 'c'
is a member of charset, given we already know c is a
member of context_charset."""
set_index = self._enter_table(charset, context_charset)
return 'charset_table[c] & (1<<%d)' % set_index
def _enter_table(self, charset, context_charset):
test_set = charset & context_charset
for i, set_i in enumerate(self.sets):
if test_set == set_i & context_charset:
return i
assert all(ord(c) < 256 for c in charset) # XXX relax this restriction
self.sets.append(frozenset(charset))
return len(self.sets) - 1
def gen_tables(self):
"""Return C declarations of any tables needed by expressions
we've emitted for self.gen_test()."""
assert len(self.sets) <= 32 # XXX relax this restriction
if len(self.sets) <= 8:
type = 'unsigned char'
elif len(self.sets) <= 16:
type = 'unsigned short'
else:
type = 'unsigned'
bitmasks = map(self._gen_bitmask, range(256))
return """\
static const %s charset_table[256] = {
%s
};""" % (type, indent('\n'.join(bitmasks)))
def _gen_bitmask(self, char_index):
c = chr(char_index)
bitmask = 0
for set_index, charset in enumerate(self.sets):
if c in charset:
bitmask |= 1 << set_index
return '0x%0*x,' % ((len(self.sets) + 3) >> 2, bitmask)
prelude = """\
#include <Python.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
static PyObject *ReadError;
typedef unsigned char u8;
typedef struct {
const u8 *start;
const u8 *end;
} Scanner;
static const u8 *expected (Scanner *s, const u8 *z, char c) {
PyErr_Format (ReadError, "At byte-offset %d, expected '%c'",
z - s->start, c);
return NULL;
}
static const u8 *expected_one_of (Scanner *s, const u8 *z, const char *charset) {
PyErr_Format (ReadError, "At byte-offset %d, expected one of '%s'",
z - s->start, charset);
return NULL;
}"""
def gen_prototype(name):
return """\
static const u8 *%s (Scanner *s, const u8 *z);""" % name
def gen_function(context, peg, name):
return """\
static const u8 *%s (Scanner *s, const u8 *z) {
u8 c = *z;
%s
return z;
}""" % (name,
indent(peg.gen(context)))
postlude = """\
static int parse (const u8 *string, const u8 *end) {
Scanner scanner = { string, end };
const u8 *z = string;
z = g2 (&scanner, z); // XXX wired-in JSON grammar production
z = root (&scanner, z); if (!z) return 1;
z = g2 (&scanner, z); // XXX wired-in JSON grammar production
if (z != end) {
PyErr_Format (ReadError, "At byte offset %d, stuff left over",
z - scanner.start);
return 1;
}
return 0;
}
static PyObject *yajson_check (PyObject *self, PyObject *args) {
(void) self;
PyObject *string;
if (! PyArg_ParseTuple (args, "O", &string))
return NULL;
char *start;
Py_ssize_t size;
if (-1 == PyString_AsStringAndSize (string, &start, &size)) {
Py_DECREF (string);
return NULL;
}
if (parse ((const u8 *)start, (const u8 *)start + size))
return NULL;
return PyInt_FromLong (0);
}
static PyMethodDef YajsonMethods[] = {
{"check", yajson_check, METH_VARARGS, "XXX blah blah."},
{NULL, NULL, 0, NULL}
};
void inityajson (void) {
PyObject *module = Py_InitModule ("yajson", YajsonMethods);
if (module == NULL)
return;
ReadError = PyErr_NewException ("yajson.ReadError", PyExc_ValueError, NULL);
if (ReadError == NULL)
return;
Py_INCREF (ReadError);
PyModule_AddObject (module, "ReadError", ReadError);
}"""
def gen_names(pegs):
"Return a map from the nontrivial pegs to their generated names."
pegs = [peg for peg in pegs if not peg.is_trivial()]
return dict((peg, 'g%d' % i)
for (i, peg) in enumerate(sorted(pegs, key=str)))
def count_em(root_peg):
"""Return a set of those pegs referenced more than once in the
graph reachable from root_peg."""
seen = set()
multiples = set()
def counting(peg):
if peg in seen:
multiples.add(peg)
else:
seen.add(peg)
map(counting, peg.pegs)
counting(root_peg)
return frozenset(multiples)
def check_for_nulls(peg):
"""Crash if any reachable peg can match the null byte. Needed
because we generate code using the null byte as an end-of-input
sentinel, assuming such a null byte is never accepted. (This
happens to be a speed win with the C-Python API.)"""
assert chr(0) not in peg.firsts()
map(check_for_nulls, peg.pegs)
class Peg:
"A part of a Parsing Expression Grammar."
pegs = () # Sub-pegs this peg references (none by default)
def is_trivial(self):
"""A trivial peg has no * or any sub-pegs except literals.
It's equivalent to a set or a sequence of literal characters
or else a C code action."""
return all(isinstance(peg, Literal) for peg in self.pegs)
def gen(self, context):
"""Return C code (a sequence of statements) for this peg to
match the input."""
abstract
def has_null(self):
"Return true iff this peg matches the empty string."
# N.B. Use of this gets problematic in the presence of
# lookahead, which would imply a peg can sometimes match
# epsilon and sometimes not. Each use of has_null() will need
# to be reexamined if we add lookahead.
abstract
def firsts(self):
"""Return a set of all characters that can possibly appear as
the first character of input that this peg matches."""
# N.B. This also may need reviewing given lookahead.
abstract
def may_overcommit(self):
"""Return true unless matching this peg never needs to
backtrack. That is, false means we're conservatively sure that
once the first character is matched, this peg is certain to
match something."""
abstract
class Recur(Peg):
"""A PEG grammar may include recursive productions. Implemented
directly, this would mean cycles in our reference graph, plus the
need to solve simultaneous equations to compute the first-sets.
Instead, we make the user represent the references to the recurring
production as Recur nodes. The user must supply the solution to
the recursion equations. (Not onerous for a simple grammar like
JSON. Later on we can add code to compute it ourselves if this
library ever gets that far.)"""
def __init__(self, nullity, firstset, variables):
self.nullity = nullity
self.firstset = frozenset(firstset)
self.name = 'root' # XXX assumes just 1 recursive production
self.variables = tuple(variables)
def __str__(self):
return self.name
def gen(self, context):
return gen_call(self.name, self)
def has_null(self):
return self.nullity
def firsts(self):
return self.firstset
def may_overcommit(self):
return True
def gen_call(name, peg):
maybe_fail = '' if always_succeeds(peg) else ' if (!z) return z;'
return 'z = %s (s, z);%s c = *z;' % (name, maybe_fail)
def always_succeeds(peg):
return peg.has_null() and not peg.may_overcommit()
class Epsilon(Peg):
"A peg that matches just the empty string."
def __str__(self):
return '()'
def gen(self, context):
return ';'
def has_null(self):
return True
def firsts(self):
return frozenset()
def may_overcommit(self):
return False
class Code(Epsilon):
"A 'peg' that succeeds and performs some C-code action."
# XXX all this needs filling out with a way to return parse results
def __init__(self, body):
self.body = body
def __str__(self):
return '{%s}' % self.body
def gen(self, context):
return self.body
class Variable:
"A C-code variable, to be assigned to in Code pegs."
def __init__(self, name, type, initial_value):
self.name = name
self.type = type
self.initial_value = initial_value
def __str__(self):
return self.name
def gen_init(self):
return '%s %s = %s;' % (self.type, self.name, self.initial_value)
def gen_finalize(self):
if self.type == 'PyObject*':
return 'if (%s) { Py_DECREF (%s); }' % (self.name, self.name)
return ';'
class Scope(Peg):
"A peg that introduces local variables with scope enclosing the sub-peg."
def __init__(self, variables, peg):
self.variables = variables
self.peg = peg
self.pegs = (peg,)
def __str__(self):
return '[%s|%s]' % (','.join(map(str, self.variables)), self.peg)
def gen(self, context):
# XXX needs cleanup, etc.
return ('{\n %s\n %s\n %s\n}'
% (indent('\n'.join(v.gen_init() for v in self.variables)),
indent(self.peg.gen(context)),
indent('\n'.join(v.gen_finalize() for v in self.variables))))
def has_null(self):
return self.peg.has_null()
def firsts(self):
return self.peg.firsts()
def may_overcommit(self):
return self.peg.may_overcommit()
class Literal(Peg):
"A peg that matches a single literal character."
def __init__(self, c):
assert 1 == len(c)
self.c = c
def __str__(self):
return "'%s'" % self.c
def gen(self, context):
if frozenset(self.c) == context.get_possible_leading_chars():
return context.gen_advance()
c_lit = c_char_literal(self.c)
return ("if (c != %s) return expected (s, z, %s); %s"
% (c_lit, c_lit, context.gen_advance()))
def has_null(self):
return False
def firsts(self):
return frozenset(self.c)
def may_overcommit(self):
return False
class OneOf(Peg):
"A peg that matches if any of its sub-pegs matches."
def __init__(self, *pegs):
assert 0 < len(pegs)
self.pegs = map(parse, pegs)
def __str__(self):
return '(%s)' % '|'.join(map(str, self.pegs))
def gen(self, context):
return gen_dispatch(context, self._gen_branches(context))
def _gen_branches(self, context):
# context_set tracks the possible characters that have not
# already been checked for.
context_set = set(context.get_possible_leading_chars())
def gen_branch(peg):
cs = frozenset(context_set)
if peg.has_null():
# This branch is certain to be taken. (True only
# because we don't provide a lookahead PEG type.)
return (cs, cs, context.gen(peg))
f = peg.firsts()
branch = (cs, f, context.sprout(f & cs).gen(peg))
context_set.difference_update(f)
return branch
branches = map(gen_branch, self.pegs)
if context_set:
cs = frozenset(context_set)
expected = sorted(self.firsts())
branches.append((cs, cs, ('return expected_one_of (s, z, %s);'
% c_string_literal(expected))))
return branches
def has_null(self):
return any(peg.has_null() for peg in self.pegs)
def firsts(self):
return union(peg.firsts() for peg in self.pegs)
def may_overcommit(self):
return any(peg.may_overcommit() for peg in self.pegs)
def union(sets):
result = set()
for s in sets:
result |= s
return frozenset(result)
def gen_dispatch(context, branches):
"Generate an if...else-if chain."
# Precondition: the tests of all branches together are exhaustive
# in the given context.
branches = truncate(collapse(branches))
if len(branches) == 0:
return ';'
elif len(branches) <= 3:
return '\nelse '.join(
gen_if(context.sprout(context_set).gen_member_test(charset),
stmts)
for (context_set, charset, stmts) in branches)
else:
return """\
switch (c) {
%s
}""" % indent('\n'.join(gen_case(context, branch) for branch in branches))
def gen_case(context, (context_set, charset, stmts)):
if context_set.issubset(charset):
label = 'default:'
else:
label = '\n'.join('case %s:' % c_char_literal(c)
for c in sorted(charset))
return '%s\n %s\n break;' % (label, indent(stmts))
def truncate(branches):
"Remove any branches that are always preempted by earlier branches."
result = []
for context_set, charset, stmts in branches:
always_taken = context_set.issubset(charset)
if not always_taken or stmts != ';':
result.append((context_set, charset, stmts))
if always_taken:
break
return result
def collapse(branches):
"Combine adjacent branches that share the same body."
result = []
prev_stmts = None
for context_set, charset, stmts in branches:
if stmts == prev_stmts:
prev_context_set, prev_charset, _ = result[-1]
result[-1] = prev_context_set, prev_charset | charset, prev_stmts
else:
result.append((context_set, charset, stmts))
prev_stmts = stmts
return result
def gen_if(test, stmts):
stmts = embrace(stmts)
if test == '1':
return stmts
return 'if (%s) %s' % (test, stmts)
def embrace(stmts):
# XXX why is this a bug? --
#if stmts.startswith('{') and stmts.endswith('}'):
# return stmts
return '{\n %s\n}' % indent(stmts)
class Seq(Peg):
"""A peg that matches any concatenation of what each of its
sub-pegs matches, in order."""
def __init__(self, *pegs):
self.pegs = map(parse, pegs)
def __str__(self):
return '(%s)' % ';'.join(map(str, self.pegs))
def gen(self, context):
stmts = []
c = context
for peg in self.pegs:
stmts.append(c.gen(peg))
# TODO: follow-sets can be narrower than all_chars
c = context.sprout(all_chars)
return '\n'.join(stmts)
def has_null(self):
return all(peg.has_null() for peg in self.pegs)
def firsts(self):
f = set()
for peg in self.pegs:
f |= peg.firsts()
if not peg.has_null():
break
return frozenset(f)
def may_overcommit(self):
return True # (conservative)
class Star(Peg):
"""A peg that matches any concatenation of 0 or more matches
by its sub-peg. (Kleene star.)"""
def __init__(self, peg):
self.peg = parse(peg)
self.pegs = (self.peg,)
def __str__(self):
return '*(%s)' % self.peg
def is_trivial(self):
return False
def gen(self, context):
assert not self.peg.has_null()
f = self.peg.firsts()
return ("""\
while (%s) {
%s
}""" % (context.gen_member_test(f),
indent(context.sprout(f).gen(self.peg))))
def has_null(self):
return True
def firsts(self):
return self.peg.firsts()
def may_overcommit(self):
return self.peg.may_overcommit()
class StarSep(Peg):
"""A peg that matches like (p (sep p)*). That is,
StarSep(p, sep) is equivalent to Seq(p, Star(seq(sep, p))).
sep must not match the empty string."""
# I think I provided this class because it's easier to make this
# primitive than to generate good code for the combination. I
# don't remember for sure...
def __init__(self, peg, separator):
self.peg = parse(peg)
self.separator = parse(separator)
assert not self.peg.has_null()
assert not self.separator.has_null()
self.pegs = (self.peg, self.separator)
def __str__(self):
return '(%s@%s)' % self.pegs
def is_trivial(self):
return False
def gen(self, context):
context = context.sprout(all_chars)
sf = self.separator.firsts()
return ("""\
for (;;) {
%s
if (!(%s)) break;
%s
}""" % (indent(context.gen(self.peg)),
context.gen_member_test(sf),
indent(context.sprout(sf).gen(self.separator))))
def has_null(self):
return False
def firsts(self):
return Seq(self.peg, self.separator).firsts()
def may_overcommit(self):
return True # only slightly conservative
def Maybe(peg):
"Matching peg or the empty string."
return OneOf(peg, Epsilon())
def parse(surface_peg):
"Convert from a convenient 'surface syntax' to a peg object."
if isinstance(surface_peg, basestring):
return one_of(map(Literal, surface_peg))
return surface_peg
def one_of(pegs):
assert 0 < len(pegs)
if 1 == len(pegs):
return pegs[0]
return OneOf(*pegs)
def c_char_literal(c):
"Convert c to a C constant expression of type unsigned char."
esc = c_escape(c, "'")
if 0x80 <= ord(c):
return r"((u8)'%s')" % esc
return r"'%s'" % esc
def c_string_literal(s):
"Convert s to a C constant string."
return '"%s"' % ''.join(c_escape(c, '"') for c in s)
c_escape_table = {
'\\': '\\',
'\a': 'a',
'\b': 'b',
'\f': 'f',
'\n': 'n',
'\r': 'r',
'\t': 't',
'\v': 'v',
}
def c_escape(c, delimiter):
assert 1 == len(c)
if c == delimiter:
return '\\' + c
if c in c_escape_table:
return '\\' + c_escape_table[c]
elif ord(c) < 32 or 127 <= ord(c):
assert ord(c) <= 255
return '\\x%02x' % ord(c)
else:
return c
def indent(string):
return string.replace('\n', '\n ')