Skip to content

Commit

Permalink
Exception properties and tests, and rename string to text.
Browse files Browse the repository at this point in the history
  • Loading branch information
eerimoq committed Jul 29, 2018
1 parent 990326e commit fb64cfb
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 45 deletions.
9 changes: 9 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,13 @@ Functions and classes
.. autoclass:: textparser.Pattern
:members:

.. autoclass:: textparser.ParseError
:members:

.. autoclass:: textparser.TokenizeError
:members:

.. autoclass:: textparser.GrammarError
:members:

.. autofunction:: textparser.tokenize_init
61 changes: 50 additions & 11 deletions tests/test_textparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,18 +729,38 @@ def test_grammar_no_match(self):

self.assertEqual(cm.exception.offset, 3)

def test_grammar_error(self):
    """A grammar that matches nothing raises GrammarError carrying the
    offset of the offending token and a formatted message.
    """

    grammar = Grammar(NoMatch())

    # Any single token must fail to match; offset 3 is propagated.
    datas = [
        [('NUMBER', '1', 3)],
        [('WORD', 'foo', 3)]
    ]

    for tokens in datas:
        tokens = tokenize(tokens)

        with self.assertRaises(textparser.GrammarError) as cm:
            grammar.parse(tokens)

        self.assertEqual(cm.exception.offset, 3)
        self.assertEqual(str(cm.exception),
                         'Invalid syntax at offset 3.')

def test_tokenize_error(self):
    """TokenizeError exposes the input text and failure offset, and its
    message marks the failure position with '>>!<<'.
    """

    # (offset, text, expected str(exception))
    datas = [
        (2, 'hej', 'Invalid syntax at line 1, column 3: "he>>!<<j"'),
        (0, 'a\nb\n', 'Invalid syntax at line 1, column 1: ">>!<<a"'),
        (1, 'a\nb\n', 'Invalid syntax at line 1, column 2: "a>>!<<"'),
        (2, 'a\nb\n', 'Invalid syntax at line 2, column 1: ">>!<<b"')
    ]

    for offset, text, message in datas:
        with self.assertRaises(TokenizeError) as cm:
            raise TokenizeError(text, offset)

        self.assertEqual(cm.exception.text, text)
        self.assertEqual(cm.exception.offset, offset)
        self.assertEqual(str(cm.exception), message)

def test_create_token_re(self):
Expand Down Expand Up @@ -824,17 +844,17 @@ def grammar(self):
)
]

for string, expected_tree, expected_token_tree in datas:
tree = Parser().parse(string)
for text, expected_tree, expected_token_tree in datas:
tree = Parser().parse(text)
self.assertEqual(tree, expected_tree)
tree = Parser().parse(string, token_tree=True)
tree = Parser().parse(text, token_tree=True)
self.assertEqual(tree, expected_token_tree)

def test_parser_tokenize_mismatch(self):
class Parser(textparser.Parser):

def tokenize(self, string):
raise TokenizeError(string, 5)
def tokenize(self, text):
raise TokenizeError(text, 5)

def grammar(self):
return Grammar(Sequence('NUMBER', 'WORD'))
Expand All @@ -851,7 +871,7 @@ def grammar(self):
def test_parser_grammar_mismatch(self):
class Parser(textparser.Parser):

def tokenize(self, _string):
def tokenize(self, _text):
return tokenize([
('NUMBER', '1.45', 0),
('NUMBER', '2', 5)
Expand All @@ -875,7 +895,7 @@ class Parser(textparser.Parser):
def __init__(self, tokens):
self._tokens = tokens

def tokenize(self, _string):
def tokenize(self, _text):
return tokenize(self._tokens, add_eof_token=False)

def grammar(self):
Expand Down Expand Up @@ -928,7 +948,7 @@ def grammar(self):
def test_parser_grammar_mismatch_zero_or_more_end_max(self):
class Parser(textparser.Parser):

def tokenize(self, _string):
def tokenize(self, _text):
return tokenize([('TEXT', 'foo', 0)], add_eof_token=False)

def grammar(self):
Expand All @@ -943,6 +963,25 @@ def grammar(self):
self.assertEqual(str(cm.exception),
'Invalid syntax at line 1, column 1: ">>!<<foo"')

def test_parse_error(self):
    """ParseError produced by a failing tokenizer reports text, offset,
    line and column of the failure position.
    """

    class Parser(textparser.Parser):

        def tokenize(self, text):
            raise TokenizeError(text, 5)

        def grammar(self):
            return Grammar(Sequence('NUMBER', 'WORD'))

    text = '12\n3456\n789'

    with self.assertRaises(textparser.ParseError) as ctx:
        Parser().parse(text)

    error = ctx.exception
    self.assertEqual(error.text, text)
    self.assertEqual(error.offset, 5)
    self.assertEqual(error.line, 2)
    self.assertEqual(error.column, 3)
    self.assertEqual(
        str(error),
        'Invalid syntax at line 2, column 3: "34>>!<<56"')


if __name__ == '__main__':
unittest.main()
135 changes: 101 additions & 34 deletions textparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,44 +35,111 @@ def _wrap_strings(items):
return [_wrap_string(item) for item in items]


def _format_invalid_syntax(text, offset):
    """Render a user-facing error message for a failure at `offset` in
    `text`, with 1-based line/column and the failing line marked up with
    '>>!<<' at the failure position.
    """

    return 'Invalid syntax at line {}, column {}: "{}"'.format(
        line(text, offset),
        column(text, offset),
        markup_line(text, offset))


class Error(Exception):
    """Base class of all textparser exceptions (TokenizeError,
    GrammarError and ParseError all derive from it).
    """

    pass


class TokenizeError(Error):
    """This exception is raised when the text cannot be converted into
    tokens.

    """

    def __init__(self, text, offset):
        self._text = text
        self._offset = offset
        # str(exception) is the formatted, marked-up message.
        message = _format_invalid_syntax(text, offset)
        super(TokenizeError, self).__init__(message)

    @property
    def text(self):
        """The input text to the tokenizer.

        """

        return self._text

    @property
    def offset(self):
        """Offset into the text where the tokenizer failed.

        """

        return self._offset


class GrammarError(Error):
    """This exception is raised when the tokens cannot be converted into a
    parse tree.

    """

    def __init__(self, offset):
        self._offset = offset
        # No text is available at grammar level, so only the offset is
        # reported.
        message = 'Invalid syntax at offset {}.'.format(offset)
        super(GrammarError, self).__init__(message)

    @property
    def offset(self):
        """Offset into the text where the parser failed.

        """

        return self._offset


class ParseError(Error):
    """This exception is raised when the parser fails to parse the text.

    """

    def __init__(self, text, offset):
        self._text = text
        self._offset = offset
        # Line and column are derived once from text and offset.
        self._line = line(text, offset)
        self._column = column(text, offset)
        message = _format_invalid_syntax(text, offset)
        super(ParseError, self).__init__(message)

    @property
    def text(self):
        """The input text to the parser.

        """

        return self._text

    @property
    def offset(self):
        """Offset into the text where the parser failed.

        """

        return self._offset

    @property
    def line(self):
        """Line where the parser failed.

        """

        return self._line

    @property
    def column(self):
        """Column where the parser failed.

        """

        return self._column


Token = namedtuple('Token', ['kind', 'value', 'offset'])

Expand Down Expand Up @@ -143,7 +210,7 @@ class Pattern(object):
"""Base class of all patterns.
"""

def match(self, tokens):
    """Match this pattern against `tokens`. To be overridden by
    subclasses; this base implementation always raises
    NotImplementedError.
    """

    raise NotImplementedError('To be implemented by subclasses.')

Expand Down Expand Up @@ -554,24 +621,24 @@ def choice(*patterns):
return Choice(*patterns)


def markup_line(text, offset):
    """Return the line of `text` containing `offset`, with the marker
    '>>!<<' inserted at the failure position.
    """

    # Start of the line containing offset (0 when on the first line).
    begin = text.rfind('\n', 0, offset)
    begin += 1

    # End of that line; the final line may lack a trailing newline.
    end = text.find('\n', offset)

    if end == -1:
        end = len(text)

    return text[begin:offset] + '>>!<<' + text[offset:end]


def line(text, offset):
    """Return the 1-based line number of `offset` in `text`."""

    return text[:offset].count('\n') + 1


def column(text, offset):
    """Return the 1-based column number of `offset` in `text`."""

    # rfind() returns -1 on the first line, which makes the arithmetic
    # below 1-based automatically.
    line_start = text.rfind('\n', 0, offset)

    return offset - line_start

Expand Down Expand Up @@ -638,8 +705,8 @@ def token_specs(self):

return []

def tokenize(self, string):
"""Tokenize given text `string`, and return a list of tokens.
def tokenize(self, text):
"""Tokenize given string `text`, and return a list of tokens.
This method should only be called by
:func:`~textparser.Parser.parse()`, but may very well be
Expand All @@ -652,7 +719,7 @@ def tokenize(self, string):
keywords = self.keywords()
tokens, re_token = tokenize_init(specs)

for mo in re.finditer(re_token, string, re.DOTALL):
for mo in re.finditer(re_token, text, re.DOTALL):
kind = mo.lastgroup

if kind == 'SKIP':
Expand All @@ -668,7 +735,7 @@ def tokenize(self, string):

tokens.append(Token(kind, value, mo.start()))
else:
raise TokenizeError(string, mo.start())
raise TokenizeError(text, mo.start())

return tokens

Expand All @@ -681,8 +748,8 @@ def grammar(self):

raise NotImplementedError('To be implemented by subclasses.')

def parse(self, string, token_tree=False):
"""Parse given string `string` and return the parse tree.
def parse(self, text, token_tree=False):
"""Parse given string `text` and return the parse tree.
Returns a parse tree of tokens if `token_tree` is ``True``.
Expand All @@ -701,11 +768,11 @@ def parse(self, string, token_tree=False):
"""

try:
tokens = self.tokenize(string)
tokens = self.tokenize(text)

if len(tokens) == 0 or tokens[-1].kind != '__EOF__':
tokens.append(Token('__EOF__', None, len(string)))
tokens.append(Token('__EOF__', None, len(text)))

return Grammar(self.grammar()).parse(tokens, token_tree)
except (TokenizeError, GrammarError) as e:
raise ParseError(string, e.offset)
raise ParseError(text, e.offset)

0 comments on commit fb64cfb

Please sign in to comment.