Skip to content

Commit

Permalink
Added parsita to json benchmark.
Browse files Browse the repository at this point in the history
  • Loading branch information
eerimoq committed Jul 23, 2018
1 parent 345df80 commit 9671a3b
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 193 deletions.
55 changes: 55 additions & 0 deletions examples/benchmarks/json/lark_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Based on
https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md.
"""

import timeit

from lark import Lark


# Lark grammar for JSON, used by both parse_lalr() and parse_earley().
# `value` is the rule those functions pass to Lark() as the start rule.
# ESCAPED_STRING, SIGNED_NUMBER and WS are imported from Lark's `common`
# grammar, and WS is ignored between tokens.
LARK_JSON_GRAMMAR = r"""
?value: dict
| list
| string
| SIGNED_NUMBER
| "true"
| "false"
| "null"
list : "[" [value ("," value)*] "]"
dict : "{" [pair ("," pair)*] "}"
pair : string ":" value
string : ESCAPED_STRING
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
"""


def parse_lalr(json_string, iterations):
    """Time Lark's LALR parser on a JSON document.

    The parser is constructed once, before timing starts, so only the
    calls to ``parse()`` are measured.

    Args:
        json_string: JSON text handed to the parser on every iteration.
        iterations: Number of times the parse is repeated.

    Returns:
        Total time in seconds for all iterations, as measured by timeit.
    """

    lalr_parser = Lark(LARK_JSON_GRAMMAR,
                       parser='lalr',
                       lexer='standard',
                       start='value')
    timer = timeit.Timer(lambda: lalr_parser.parse(json_string))

    return timer.timeit(number=iterations)


def parse_earley(json_string, iterations):
    """Time Lark's Earley parser on a JSON document.

    The parser is constructed once, before timing starts, so only the
    calls to ``parse()`` are measured.

    Args:
        json_string: JSON text handed to the parser on every iteration.
        iterations: Number of times the parse is repeated.

    Returns:
        Total time in seconds for all iterations, as measured by timeit.
    """

    earley_parser = Lark(LARK_JSON_GRAMMAR,
                         parser='earley',
                         lexer='standard',
                         start='value')
    timer = timeit.Timer(lambda: earley_parser.parse(json_string))

    return timer.timeit(number=iterations)
234 changes: 41 additions & 193 deletions examples/benchmarks/json/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@

"""A benchmark comparing the performance of five parsers.
Lark parsers based on
https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md.
Pyparsing parser based on
http://pyparsing.wikispaces.com/file/view/jsonParser.py.
Test data generated with https://www.json-generator.com.
Example execution:
Expand All @@ -18,23 +12,32 @@
Parsing 'data.json' 3 times took:
PACKAGE SECONDS
textparser 0.728505
lark (LALR) 1.330224
pyparsing 3.538572
lark (Earley) 9.209010
textparser 0.325401
lark (LALR) 0.764833
pyparsing 2.144760
lark (Earley) 5.644952
parsita 6.945121
$
"""

from __future__ import print_function

import sys
import os
import re
import timeit

import textparser as tp
from lark import Lark
import pyparsing as pp
import textparser_json
import lark_json
import pyparsing_json

if sys.version_info < (3,):
    class parsita_json(object):
        """Stand-in for the parsita_json module on Python 2.

        The real module is only imported on Python 3. This placeholder
        offers the same ``parse()`` entry point and reports an infinite
        time, which sorts after every real measurement.
        """

        @staticmethod
        def parse(_json_string, _iterations):
            return float('inf')
else:
    import parsita_json


SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
Expand All @@ -47,186 +50,31 @@
JSON_STRING = fin.read()


def tokenize(string):
    """Split a JSON document into a list of textparser tokens.

    Spaces, carriage returns and tabs are dropped, and newlines only
    advance the line counter. STRING tokens carry their text without
    the surrounding quotes. Punctuation tokens use the literal
    character as their kind. All other tokens keep the kind named in
    the token specification.

    Args:
        string: The text to tokenize.

    Returns:
        The list of ``tp.Token`` objects, in input order.

    Raises:
        tp.TokenizeError: If a character matches no token pattern.
    """

    # Token kinds that are reported as their literal character.
    punctuation = {
        'LPAREN': '(',
        'RPAREN': ')',
        'LBRACKET': '[',
        'RBRACKET': ']',
        'LBRACE': '{',
        'RBRACE': '}',
        'COMMA': ',',
        'COLON': ':'
    }

    # Order matters: earlier patterns win, and MISMATCH must stay last
    # because it matches any single character.
    token_spec = [
        ('SKIP', r'[ \r\t]+'),
        ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
        ('TRUE', r'true'),
        ('FALSE', r'false'),
        ('NULL', r'null'),
        ('NEWLINE', r'\n'),
        ('STRING', r'"(\\"|[^"])*?"'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACKET', r'\['),
        ('RBRACKET', r'\]'),
        ('LBRACE', r'\{'),
        ('RBRACE', r'\}'),
        ('COMMA', r','),
        ('COLON', r':'),
        ('MISMATCH', r'.')
    ]

    line, line_start, tokens, re_token = tp.tokenize_init(token_spec)

    for match in re.finditer(re_token, string, re.DOTALL):
        kind = match.lastgroup

        if kind == 'SKIP':
            continue

        if kind == 'NEWLINE':
            # Columns on the next line are counted from this newline.
            line_start = match.end() - 1
            line += 1
            continue

        column = match.start() - line_start

        if kind == 'MISMATCH':
            raise tp.TokenizeError(line, column, match.start(), string)

        text = match.group(kind)

        if kind == 'STRING':
            # Strip the surrounding double quotes.
            text = text[1:-1]
        else:
            kind = punctuation.get(kind, kind)

        tokens.append(tp.Token(kind, text, line, column))

    return tokens


def textparser_parse():
    """Time textparser on JSON_STRING.

    The grammar is built once, before timing starts. Each timed
    iteration tokenizes JSON_STRING and parses the resulting tokens.

    Returns:
        Total time in seconds for ITERATIONS iterations, as measured by
        timeit.
    """

    json_value = tp.Forward()
    array = tp.Sequence('[', tp.DelimitedList(json_value), ']')
    member = tp.Sequence('STRING', ':', json_value)
    obj = tp.Sequence('{', tp.DelimitedList(member), '}')
    json_value <<= tp.choice(array,
                             obj,
                             'STRING',
                             'NUMBER',
                             'TRUE',
                             'FALSE',
                             'NULL')
    grammar = tp.Grammar(json_value)
    timer = timeit.Timer(lambda: grammar.parse(tokenize(JSON_STRING)))

    return timer.timeit(number=ITERATIONS)


# Lark grammar for JSON, used by both lark_lalr_parse() and
# lark_earley_parse(). `value` is the rule those functions pass to Lark()
# as the start rule. ESCAPED_STRING, SIGNED_NUMBER and WS are imported from
# Lark's `common` grammar, and WS is ignored between tokens.
LARK_JSON_GRAMMAR = r"""
?value: dict
| list
| string
| SIGNED_NUMBER
| "true"
| "false"
| "null"
list : "[" [value ("," value)*] "]"
dict : "{" [pair ("," pair)*] "}"
pair : string ":" value
string : ESCAPED_STRING
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
"""


def lark_lalr_parse():
    """Time Lark's LALR parser on JSON_STRING.

    The parser is constructed once, before timing starts, so only the
    calls to ``parse()`` are measured.

    Returns:
        Total time in seconds for ITERATIONS iterations, as measured by
        timeit.
    """

    lalr_parser = Lark(LARK_JSON_GRAMMAR,
                       parser='lalr',
                       lexer='standard',
                       start='value')
    timer = timeit.Timer(lambda: lalr_parser.parse(JSON_STRING))

    return timer.timeit(number=ITERATIONS)


def lark_earley_parse():
    """Time Lark's Earley parser on JSON_STRING.

    The parser is constructed once, before timing starts, so only the
    calls to ``parse()`` are measured.

    Returns:
        Total time in seconds for ITERATIONS iterations, as measured by
        timeit.
    """

    earley_parser = Lark(LARK_JSON_GRAMMAR,
                         parser='earley',
                         lexer='standard',
                         start='value')
    timer = timeit.Timer(lambda: earley_parser.parse(JSON_STRING))

    return timer.timeit(number=ITERATIONS)


def pyparsing_parse():
    """Time pyparsing on JSON_STRING.

    The grammar is built once, before timing starts, so only the calls
    to ``parseString()`` are measured.

    Returns:
        Total time in seconds for ITERATIONS iterations, as measured by
        timeit.
    """

    # Punctuation is matched but suppressed from the parse results.
    lbrack, rbrack, lbrace, rbrace, colon = [
        pp.Suppress(char) for char in '[]{}:'
    ]

    true_kw = pp.Keyword('true')
    false_kw = pp.Keyword('false')
    null_kw = pp.Keyword('null')

    json_string = pp.dblQuotedString().setParseAction(pp.removeQuotes)
    json_number = pp.pyparsing_common.number()

    # Values and objects refer to each other, so both are declared
    # up front and filled in below.
    json_value = pp.Forward()
    json_object = pp.Forward()

    json_array = pp.Group(
        lbrack + pp.Optional(pp.delimitedList(json_value), []) + rbrack)
    json_value <<= (json_string
                    | json_number
                    | pp.Group(json_object)
                    | json_array
                    | true_kw
                    | false_kw
                    | null_kw)
    member = pp.Group(json_string + colon + json_value)
    json_object <<= pp.Dict(
        lbrace + pp.Optional(pp.delimitedList(member)) + rbrace)

    timer = timeit.Timer(lambda: json_value.parseString(JSON_STRING))

    return timer.timeit(number=ITERATIONS)


# Benchmark entry point: prints a notice, times each parser, sorts the
# (package, seconds) measurements by ascending time and prints them as a
# table.
# NOTE(review): this span comes from a diff view and interleaves the removed
# and the added lines of main(), so several statements appear twice (the
# notice print, the timing calls, the measurements list, the sort and the
# table output). Confirm against the checked-in file which set is current.
def main():
print(
"Parsing '{}' {} times per parser. This may take a few seconds.".format(
DATA_JSON,
ITERATIONS))

textparser_time = textparser_parse()
lark_lalr_time = lark_lalr_parse()
lark_earley_time = lark_earley_parse()
pyparsing_time = pyparsing_parse()
print("Parsing '{}' {} times per parser. This may take a few seconds.".format(
DATA_JSON,
ITERATIONS))

# Parse comparison output.
measurements = [
('textparser', textparser_time),
('lark (LALR)', lark_lalr_time),
('lark (Earley)', lark_earley_time),
('pyparsing', pyparsing_time)
]
textparser_time = textparser_json.parse(JSON_STRING, ITERATIONS)
lark_lalr_time = lark_json.parse_lalr(JSON_STRING, ITERATIONS)
lark_earley_time = lark_json.parse_earley(JSON_STRING, ITERATIONS)
pyparsing_time = pyparsing_json.parse(JSON_STRING, ITERATIONS)
parsita_time = parsita_json.parse(JSON_STRING, ITERATIONS)

measurements = sorted(measurements, key=lambda m: m[1])
# Parse comparison output.
measurements = [
('textparser', textparser_time),
('lark (LALR)', lark_lalr_time),
('lark (Earley)', lark_earley_time),
('pyparsing', pyparsing_time),
('parsita', parsita_time)
]

print()
print("Parsing '{}' {} times took:".format(DATA_JSON, ITERATIONS))
print()
print('PACKAGE SECONDS')
for package, seconds in measurements:
print('{:14s} {:f}'.format(package, seconds))
measurements = sorted(measurements, key=lambda m: m[1])

print()
print("Parsing '{}' {} times took:".format(DATA_JSON, ITERATIONS))
print()
print('PACKAGE SECONDS')

if __name__ == '__main__':
main()
for package, seconds in measurements:
print('{:14s} {:f}'.format(package, seconds))
60 changes: 60 additions & 0 deletions examples/benchmarks/json/parsita_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Based on
https://github.com/drhagen/parsita/blob/master/examples/json.py.
"""

import timeit

from parsita import TextParsers
from parsita import lit
from parsita import reg
from parsita import rep
from parsita import repsep


# Parsers for a JSON string literal, including its surrounding quotes.
# NOTE(review): whitespace=None presumably disables whitespace skipping so
# that spaces inside a string are kept; confirm against the parsita docs.
class JsonStringParsers(TextParsers, whitespace=None):
# Two-character escape sequences. The raw strings keep the backslash, so
# each literal matches the escape exactly as it is written in the JSON text.
quote = lit(r'\"')
reverse_solidus = lit(r'\\')
solidus = lit(r'\/')
backspace = lit(r'\b')
form_feed = lit(r'\f')
line_feed = lit(r'\n')
carriage_return = lit(r'\r')
tab = lit(r'\t')
# A backslash, 'u' and exactly four hexadecimal digits.
uni = reg(r'\\u([0-9a-fA-F]{4})')

escaped = (quote | reverse_solidus | solidus | backspace | form_feed |
line_feed | carriage_return | tab | uni)
# A run of characters from U+0020 upwards, excluding the double quote
# (U+0022) and the backslash (U+005C).
unescaped = reg(r'[\u0020-\u0021\u0023-\u005B\u005D-\U0010FFFF]+')

# Python precedence: `>>` and `<<` bind tighter than `>`, so this reads as
# (('"' >> rep(escaped | unescaped)) << '"') > ''.join.
string = '"' >> rep(escaped | unescaped) << '"' > ''.join


# Parsers for JSON values; `value` is the entry point used by parse().
# The whitespace pattern (spaces, tabs, newlines, carriage returns) is
# handed to TextParsers.
class JsonParsers(TextParsers, whitespace=r'[ \t\n\r]*'):
# Optional minus sign, an integer part without leading zeros, then an
# optional fraction and an optional exponent.
number = reg(r'-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?')

false = lit('false')
true = lit('true')
null = lit('null')

# Reuse the string parser defined in JsonStringParsers.
string = JsonStringParsers.string

# NOTE(review): `value` is used here and in `entry` before it is assigned
# below. This relies on parsita resolving forward references inside a
# parser class; confirm against the parsita docs.
array = '[' >> repsep(value, ',') << ']'

# Python precedence: `<<` binds tighter than `&`, so this reads as
# (string << ':') & value.
entry = string << ':' & value
obj = '{' >> repsep(entry, ',') << '}'

value = (number
| false
| true
| null
| string
| array
| obj)


def parse(json_string, iterations):
    """Time parsita's JSON parser on a JSON document.

    Args:
        json_string: JSON text handed to the parser on every iteration.
        iterations: Number of times the parse is repeated.

    Returns:
        Total time in seconds for all iterations, as measured by timeit.
    """

    timer = timeit.Timer(lambda: JsonParsers.value.parse(json_string))

    return timer.timeit(number=iterations)

0 comments on commit 9671a3b

Please sign in to comment.