* Removed space_tabs token until it sees some use.
* Corrected token type of entity rules (to TEXT).
* Implemented <nowiki>, using a separate lexer state. Cool!
* Added some corner-case tests for headings for the future.
* Added catch-all rules to slurp up normal text.
* Stopped using introspection to enumerate token types. Multiple lexer states made it more trouble than it was worth.
* Running lexer.py now gives you a REPL to play around in.
erikrose committed Dec 28, 2010
1 parent 6984912 commit 77b54ca
Showing 4 changed files with 196 additions and 65 deletions.
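
To get a quick feel for the behavior described in the commit message, the new module-level lexer can be driven directly. This is a minimal sketch based on the tests added in this commit (it assumes PLY is installed and lexer.py is importable); Python 2, matching the project's code:

    from lexer import lexer

    lexer.input('<nowiki>&#8212;</nowiki>')
    print list(lexer)    # [T('TEXT', u'\u2014')] -- entity resolved to TEXT, <nowiki> tags swallowed

    lexer.input('&badentity;')
    print list(lexer)    # [T('TEXT', '&badentity;')] -- unknown entities fall through as plain TEXT

Running python lexer.py drops you into the same loop interactively, per the new REPL.
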
21 changes: 18 additions & 3 deletions constants.py
@@ -2,10 +2,25 @@
# supported with variations of case, (2) this lacks &apos;, for which there's a
# complicated discussion at http://www.mail-archive.com/mediawiki-
# cvs@lists.wikimedia.org/msg01907.html.
#
# These are current as of MW 1.16.0.
html_entities = {
u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
u'Aacute': 193,
u'aacute': 225,
u'Acirc': 194,
u'acirc': 226,
u'acute': 180,
u'AElig': 198,
u'aelig': 230,
u'Agrave': 192,
u'agrave': 224,
u'alefsym': 8501,
u'Alpha': 913,
u'alpha': 945,
u'amp': 38,
u'and': 8743,
u'ang': 8736,
u'Aring': 197,
u'aring': 229,
u'asymp': 8776,
u'Atilde': 195,
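
As a hedged illustration of how this table is consumed (it mirrors the symbolic-entity rule t_ANY_HTML_ENTITY_SYM in lexer.py below; resolve_entity is a made-up helper for the example, not part of the commit), in Python 2:

    from constants import html_entities

    def resolve_entity(name):
        # Hypothetical helper: return the unicode character for a symbolic
        # entity name such as 'mdash', or None if the name isn't in the table.
        codepoint = html_entities.get(name)
        return unichr(codepoint) if codepoint is not None else None

    print repr(resolve_entity('mdash'))      # u'\u2014'
    print repr(resolve_entity('badentity'))  # None
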
14 changes: 13 additions & 1 deletion design.txt
@@ -35,12 +35,17 @@ Not researched in depth.
* Claims to be nearly as fast as C
o Requires a C build step

ANTLR
-----
o Separate code generation step
o Slow because it generates a lot of function calls

Previous work
=============
* OCaml lexer implementation: http://www.mediawiki.org/wiki/MediaWiki_lexer
* Markup spec: http://www.mediawiki.org/wiki/Markup_spec
* BNF grammar: http://www.mediawiki.org/wiki/Markup_spec/BNF
* Corresponds closely to lex input format
* Corresponds closely to yacc input format
* Pretty comprehensive: lots of English describing corner cases and error recovery
* Also discusses render phase
* EBNF grammar: http://www.mediawiki.org/wiki/Markup_spec/EBNF
@@ -59,6 +64,13 @@ Milestones
* Get apostrophes working (to test ambiguity support).
* Implement productions, tag by tag

Units of estimation
===================
* Apostrophe jungle
* Tables
* Lists
* HTML outputter

Notes
=====
LR good. LALR even better.
140 changes: 114 additions & 26 deletions lexer.py 100644 → 100755
@@ -1,8 +1,11 @@
#!/usr/bin/env python
"""What will eventually become a MediaWiki lexer
Based on the work at http://www.mediawiki.org/wiki/Markup_spec/BNF
"""
import readline # Make raw_input() cool.

from ply import lex
from ply.lex import LexError as PlyLexError, lex

@@ -17,79 +20,164 @@ def __unicode__(self):
return u'%s: %s' % (self.args[0], self.text)


class Token(object):
"""LexToken-like class, initializable on construction
Equality with LexTokens is based on the type and value attrs, though value
comparison is skipped if T.value is None.
"""

def __init__(self, type, value=None):
self.type = type
self.value = value

def __eq__(self, other):
"""Compare type and, if it's specified, value."""
return (self.type == other.type and
(self.value is None or self.value == other.value))

def __ne__(self, other):
return not self == other

def __str__(self):
return 'T(%s, %s)' % (repr(self.type), repr(self.value))

__repr__ = __str__


class LexerBox(object):
"""A container to group token definitions, hold state, & afford subclassing
Don't instantiate these; that's expensive. Instead, use the module-level
instance in `lexer`.
"""
states = [('nowiki', 'exclusive')]

def __init__(self):
"""Combine the regexes and such. This is expensive."""
self.lexer = lex(module=self, debug=True)

# Remember, when defining tokens, not to couple any HTML-output-specific
# transformations to their t.values. That's for the parser to decide.
# transformations to their t.values. That's for the formatter to decide.

# The secret to lexer/parser division is: lexer recognizes only terminals
# (or else makes recursive calls to itself).

# Any line that does not start with one of the following is not a special
# block: " " | "{|" | "#" | ";" | ":" | "*" | "=".
# (http://www.mediawiki.org/wiki/Markup_spec/BNF/Article#Article)

# The secret to lexer/parser division is: lexer recognizes only terminals.
# TODO: Would using __slots__ in LexToken speed things up? token()
# instantiates a lot of them.

# How does PLY tell what order tokens are defined in? Allegedly, it adds
# the callable ones in definition order and then the string ones in
# ascending length order. [Ed: It looks at each function obj to get
# co_firstlineno. Thus, subclassing this might not work as well as I
# thought. TODO: Reconsider how to extend.]

# Fundamental elements
# (http://www.mediawiki.org/wiki/Markup_spec/BNF/Fundamental_elements):
def t_NOWIKI(self, t):
r'<[nN][oO][wW][iI][kK][iI]>'
t.lexer.push_state('nowiki') # Use stack in case inside a table or something.
# TODO: Optimize this state by making a special text token that'll chew
# up anything that's not </nowiki>.
return None

def t_nowiki_NOWIKI_END(self, t):
r'</[nN][oO][wW][iI][kK][iI]>'
t.lexer.pop_state()
return None

# def t_HEADING(self, t):
# r'^(?P<HEADING_LEVEL>={1,6})(.+)\g<HEADING_LEVEL>\s*' # TODO: Or do we just match the terminals and let the parser sort out the pairing of === spans? H2 :: =={text}=={whitespace}. Or do we match ^== and then throw the lexer into a 'header' state which tries to .... Can't just match the whole line in one regex, because then the lexer never gets a chance to parse the text of the header normally and resolve the entities.
# # Swallows trailing whitespace like MediaWiki
# t.type =

def t_NEWLINE(self, t):
r'(?:\r\n|\n\r|\r|\n)'
return t

#t_newlines: >=1 t_newline. In the BNF but possibly unneeded.
#t_bol: beginning of line. Should be able to fold into individual regexes.
#t_eol: same
#t_space = r'[ ]' # Brackets because PLY compiles regexes with re.VERBOSE
#t_spaces = r'[ ]+'
#t_space_tab = r'[\t ]'
# Add the rest of these as needed. They might be overly formal noise.

# def t_SPACE_TABS(self, t):
# r'[\t ]+'
# return t

def t_HTML_ENTITY_HEX(self, t):
def t_ANY_HTML_ENTITY_HEX(self, t):
r'&\#x(?P<HTML_ENTITY_HEX_NUM>[0-9a-fA-F]+);'
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_HEX_NUM'), 16))
t.type = 'TEXT'
return t

def t_HTML_ENTITY_DEC(self, t):
def t_ANY_HTML_ENTITY_DEC(self, t):
r'&\#(?P<HTML_ENTITY_DEC_NUM>[0-9]+);'
# Group indexes reference the combined, master regex: hard to predict.
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_DEC_NUM')))
t.type = 'TEXT'
return t

def t_HTML_ENTITY_SYM(self, t):
def t_ANY_HTML_ENTITY_SYM(self, t):
r'&(?P<HTML_ENTITY_SYM_NAME>[a-zA-Z1-4]+);'
sym = t.lexer.lexmatch.group('HTML_ENTITY_SYM_NAME')
if sym in html_entities:
t.value = unichr(html_entities[sym])
else:
t.type = 'text'
t.type = 'TEXT'
return t

def t_ANY_HARMLESS_TEXT(self, t):
r'[a-zA-Z0-9]+'
# Runs of stuff that can't possibly be part of another token. An
# optimization to avoid hitting t_ANY_TEXT
# TODO: Harmless Unicode chars are missing, so Japanese will go slow.
t.type = 'TEXT'
return t

def t_ANY_TEXT(self, t): # probably scarily inefficient
r'.'
return t

# <url-path> ::= <url-char> [<url-path>]
# <url-char> ::= LEGAL_URL_ENTITY # Not only "abc" and "%23" but "%ey", all of which should be preserved verbatim.

def t_error(self, t):
def t_ANY_error(self, t):
raise LexError('Illegal character', t.value[0])
#t.lexer.skip(1)

def __iter__(self):
return merged_text_tokens(iter(self.lexer))

def input(self, text):
return self.lexer.input(text)

# Everything after the t_ in anything that starts with t_:
tokens = ([k[2:] for k in vars().keys() if k.startswith('t_') and k != 't_error'] +
['text'])
tokens = ['NEWLINE', 'TEXT']

lexer = LexerBox().lexer
lexer = LexerBox()
# TODO: Since we might have multiple threads, have the class build the lexer
# once and stash it in a class var. Then clone from it on construction of
# future instances.


def merged_text_tokens(tokens):
"""Merge adjacent TEXT tokens in the given iterable of LexTokens."""
acc = []
for t in tokens:
if t.type == 'TEXT':
acc.append(t.value)
else:
if acc:
yield Token('TEXT', ''.join(acc))
acc = []
yield t
if acc: # in case last token is TEXT
yield Token('TEXT', ''.join(acc))


if __name__ == '__main__':
def repl():
while True:
try:
input = raw_input('lexer> ')
except EOFError:
break
try:
lexer.input(input)
print list(lexer)
except LexError, e:
print e
repl()
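
One consequence of the catch-all rules combined with merged_text_tokens() above: adjacent TEXT tokens coalesce, so a run of plain text comes out as a single token rather than one per character or per rule hit. A small hedged sketch, with the expected value inferred from the rules and tests in this commit (Python 2):

    from lexer import lexer

    # 'a' and 'b' match t_ANY_HARMLESS_TEXT, '&mdash;' matches t_ANY_HTML_ENTITY_SYM;
    # merged_text_tokens() folds the three resulting TEXT tokens into one.
    lexer.input('a&mdash;b')
    print list(lexer)    # [T('TEXT', u'a\u2014b')]
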
86 changes: 51 additions & 35 deletions tests.py
@@ -5,59 +5,75 @@

from ply.lex import LexToken

from lexer import lexer, LexError


class T(object):
"""LexToken-like class, initializable on construction
Equality with LexTokens is based on the type and value attrs, though value
comparison is skipped if T.value is None.
"""

def __init__(self, type_, value=None):
self.type_ = type_
self.value = value

def __eq__(self, other):
"""Compare type and, if it's specified, value."""
return (self.type_ == other.type and
(self.value is None or self.value == other.value))

def __ne__(self, other):
return not self == other

def __str__(self):
return 'T(%s, %s)' % (repr(self.type_), repr(self.value))

__repr__ = __str__
from lexer import lexer, LexError, Token as T


def lexed_eq(input, want):
"""Assert lexing `input` yields `want`."""
lexer.input(input)
got = list(lexer)
if want != got:
raise AssertionError('%s != %s' % (got, want))


def html_eq(input, want):
"""Assert the lexed, parsed, HTML-formatted input string equals `want`.
Lets differences in linebreaks slide.
"""


class LexerTests(TestCase):
def test_newline(self):
lexed_eq('\r\r\n\n\r\n', [T('NEWLINE', '\r'),
T('NEWLINE', '\r\n'),
T('NEWLINE', '\n\r'),
T('NEWLINE', '\n')])

def test_space_tabs(self):
lexed_eq(' ', [T('SPACE_TABS', ' ')])
lexed_eq('\t', [T('SPACE_TABS', '\t')])
# def test_space_tabs(self):
# lexed_eq(' ', [T('SPACE_TABS', ' ')])
# lexed_eq('\t', [T('SPACE_TABS', '\t')])

def test_html_entity(self):
lexed_eq('&#x2014;', [T('HTML_ENTITY_HEX', u'\u2014')])
lexed_eq('&#8212;', [T('HTML_ENTITY_DEC', u'\u2014')])
lexed_eq('&mdash;', [T('HTML_ENTITY_SYM', u'\u2014')])
lexed_eq('&badentity;', [T('text', '&badentity;')])

lexed_eq('&#x2014;', [T('TEXT', u'\u2014')])
lexed_eq('&#8212;', [T('TEXT', u'\u2014')])
lexed_eq('&mdash;', [T('TEXT', u'\u2014')])
lexed_eq('&badentity;', [T('TEXT', '&badentity;')])

def test_nowiki(self):
lexed_eq("<nowiki>''not bold''</nowiki>", [T('TEXT', "''not bold''")])

# HTML entities inside <nowiki> should be resolved.
lexed_eq("<nowiki>&#8212;</nowiki>", [T('TEXT', u'\u2014')])

lexed_eq('</nowiki>', [T('TEXT', '</nowiki>')])

# <nowiki>s aren't nestable. Uncomment when bold is implemented.
# lexed_eq("<nowiki><nowiki></nowiki>''hey''</nowiki>",
# [T('TEXT', '<nowiki>'),
# T('BOLD'),
# T('TEXT', 'hey'),
# T('BOLD_END'),
# T('TEXT', '</nowiki>')])

def test_text(self):
lexed_eq('hi', [T('TEXT', 'hi')])


class IntegrationTests(TestCase):
"""Tests of the whole stack, from lexer to HTML formatter"""

def test_h1(self):
html_eq('= h1 = trailer', '<p>= h1 = = there = boo</p>')
html_eq(' = h1 =', '<pre>= h1 =</pre>')
html_eq('= h1 ==', # An H1 containing a trailing equal sign
'<h1> <span class="mw-headline" id="h1_.3D"> h1 =</span></h1>')

# Some challenging test cases:
# <ref>[http://www.susanscott.net/Oceanwatch2002/mar1-02.html Seaweed also plays a role in the formation of sand<!-- Bot generated title -->]</ref>, from wikipedia:Sand
# [[File:Suesswasserstachelroche.jpg|thumb|A [[stingray]] about to bury itself in sand]]
# In MW, [[clay [[this [[thing]]]]]] links "thing". py-wikimarkup links the whole thing.

if __name__ == '__main__':
unittest.main()
