
* Removed space_tabs token until it sees some use.

* Corrected token type of entity rules (to TEXT).
* Implemented <nowiki>, using a separate lexer state. Cool! (A standalone sketch of the state mechanism follows this list.)
* Added some corner-case tests for headings for the future.
* Added catch-all rules to slurp up normal text.
* Stopped using introspection to enumerate token types. Multiple lexer states made it more trouble than it was worth.
* Running lexer.py now gives you a REPL to play around in.
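For readers new to PLY, here is a minimal, self-contained sketch of the exclusive-state trick the <nowiki> bullet refers to. This is not the commit's code: the grammar is pared down to two token types, and the names are illustrative.

    from ply import lex

    tokens = ['TEXT', 'NOWIKI', 'NOWIKI_END']
    states = (('nowiki', 'exclusive'),)  # exclusive: only t_nowiki_*/t_ANY_* rules apply inside

    def t_NOWIKI(t):
        r'<[nN][oO][wW][iI][kK][iI]>'
        t.lexer.push_state('nowiki')  # a stack, so enclosing contexts unwind correctly

    def t_nowiki_NOWIKI_END(t):
        r'</[nN][oO][wW][iI][kK][iI]>'
        t.lexer.pop_state()

    def t_ANY_TEXT(t):  # active in the default state and in 'nowiki'
        r'.'
        return t

    def t_ANY_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()
    lexer.input("<nowiki>''not bold''</nowiki>")
    print ''.join(tok.value for tok in lexer)  # prints ''not bold'' -- the quotes stay literal

Because the open- and close-tag rules return nothing, the tags themselves are swallowed, while everything between them falls through to the catch-all TEXT rule.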
1 parent 6984912 commit 77b54ca2c3e29ba1d71c86c5c3035f26758fa883 @erikrose committed Dec 26, 2010
Showing with 196 additions and 65 deletions.
  1. +18 −3 constants.py
  2. +13 −1 design.txt
  3. +114 −26 lexer.py
  4. +51 −35 tests.py
constants.py
@@ -2,10 +2,25 @@
# supported with variations of case, (2) this lacks &apos;, for which there's a
# complicated discussion at http://www.mail-archive.com/mediawiki-
# cvs@lists.wikimedia.org/msg01907.html.
+#
+# These are current as of MW 1.16.0.
html_entities = {
- u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
- u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
- u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
+ u'Aacute': 193,
+ u'aacute': 225,
+ u'Acirc': 194,
+ u'acirc': 226,
+ u'acute': 180,
+ u'AElig': 198,
+ u'aelig': 230,
+ u'Agrave': 192,
+ u'agrave': 224,
+ u'alefsym': 8501,
+ u'Alpha': 913,
+ u'alpha': 945,
+ u'amp': 38,
+ u'and': 8743,
+ u'ang': 8736,
+ u'Aring': 197,
u'aring': 229,
u'asymp': 8776,
u'Atilde': 195,
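An aside, not part of the diff: each entry maps an entity name to its Unicode code point, so resolving a named entity is just a dict lookup plus unichr() -- which is what the reworked entity rules in lexer.py below do.

    >>> from constants import html_entities
    >>> unichr(html_entities['AElig'])
    u'\xc6'
    >>> unichr(html_entities['mdash'])
    u'\u2014'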
design.txt
@@ -35,12 +35,17 @@ Not researched in depth.
* Claims to be nearly as fast as C
o Requires a C build step
+ANTLR
+-----
+o Separate code generation step
+o Slow because it generates a lot of function calls
+
Previous work
=============
* OCaml lexer implementation: http://www.mediawiki.org/wiki/MediaWiki_lexer
* Markup spec: http://www.mediawiki.org/wiki/Markup_spec
* BNF grammar: http://www.mediawiki.org/wiki/Markup_spec/BNF
- * Corresponds closely to lex input format
+ * Corresponds closely to yacc input format
* Pretty comprehensive: lots of English describing corner cases and error recovery
. Also discusses render phase
* EBNF grammar: http://www.mediawiki.org/wiki/Markup_spec/EBNF
@@ -59,6 +64,13 @@ Milestones
* Get apostrophes working (to test ambiguity support).
* Implement productions, tag by tag
+Units of estimation
+===================
+* Apostrophe jungle
+* Tables
+* Lists
+* HTML outputter
+
Notes
=====
LR good. LALR even better.
lexer.py 100644 → 100755
@@ -1,8 +1,11 @@
+#!/usr/bin/env python
"""What will eventually become a MediaWiki lexer
Based on the work at http://www.mediawiki.org/wiki/Markup_spec/BNF
"""
+import readline # Make raw_input() cool.
+
from ply import lex
from ply.lex import LexError as PlyLexError, lex
@@ -17,79 +20,164 @@ def __unicode__(self):
return u'%s: %s' % (self.args[0], self.text)
+class Token(object):
+ """LexToken-like class, initializable on construction
+
+ Equality with LexTokens is based on the type and value attrs, though value
+ comparison is skipped if T.value is None.
+
+ """
+
+ def __init__(self, type, value=None):
+ self.type = type
+ self.value = value
+
+ def __eq__(self, other):
+ """Compare type and, if it's specified, value."""
+ return (self.type == other.type and
+ (self.value is None or self.value == other.value))
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __str__(self):
+ return 'T(%s, %s)' % (repr(self.type), repr(self.value))
+
+ __repr__ = __str__
+
+
class LexerBox(object):
"""A container to group token definitions, hold state, & afford subclassing
Don't instantiate these; that's expensive. Instead, use the module-level
instance in `lexer`.
"""
+ states = [('nowiki', 'exclusive')]
+
def __init__(self):
"""Combine the regexes and such. This is expensive."""
self.lexer = lex(module=self, debug=True)
# Remember, when defining tokens, not to couple any HTML-output-specific
- # transformations to their t.values. That's for the parser to decide.
+ # transformations to their t.values. That's for the formatter to decide.
+
+ # The secret to lexer/parser division is: lexer recognizes only terminals
+ # (or else makes recursive calls to itself).
+
+ # Any line that does not start with one of the following is not a special
+ # block: " " | "{|" | "#" | ";" | ":" | "*" | "=".
+ # (http://www.mediawiki.org/wiki/Markup_spec/BNF/Article#Article)
- # The secret to lexer/parser division is: lexer recognizes only terminals.
+ # TODO: Would using __slots__ in LexToken speed things up? token()
+ # instantiates a lot of them.
# How does PLY tell what order tokens are defined in? Allegedly, it adds
# the callable ones in definition order and then the string ones in
# ascending length order. [Ed: It looks at each function obj to get
# co_firstlineno. Thus, subclassing this might not work as well as I
# thought. TODO: Reconsider how to extend.]
- # Fundamental elements
- # (http://www.mediawiki.org/wiki/Markup_spec/BNF/Fundamental_elements):
+ def t_NOWIKI(self, t):
+ r'<[nN][oO][wW][iI][kK][iI]>'
+ t.lexer.push_state('nowiki') # Use stack in case inside a table or something.
+ # TODO: Optimize this state by making a special text token that'll chew
+ # up anything that's not </nowiki>.
+ return None
+
+ def t_nowiki_NOWIKI_END(self, t):
+ r'</[nN][oO][wW][iI][kK][iI]>'
+ t.lexer.pop_state()
+ return None
+
+ # def t_HEADING(self, t):
+ # r'^(?P<HEADING_LEVEL>={1,6})(.+)\g<HEADING_LEVEL>\s*' # TODO: Or do we just match the terminals and let the parser sort out the pairing of === spans? H2 :: =={text}=={whitespace}. Or do we match ^== and then throw the lexer into a 'header' state which tries to .... Can't just match the whole line in one regex, because then the lexer never gets a chance to parse the text of the header normally and resolve the entities.
+ # # Swallows trailing whitespace like MediaWiki
+ # t.type =
def t_NEWLINE(self, t):
r'(?:\r\n|\n\r|\r|\n)'
return t
- #t_newlines: >=1 t_newline. In the BNF but possibly unneeded.
- #t_bol: beginning of line. Should be able to fold into individual regexes.
- #t_eol: same
- #t_space = r'[ ]' # Brackets because PLY compiles regexes with re.VERBOSE
- #t_spaces = r'[ ]+'
- #t_space_tab = r'[\t ]'
- # Add the rest of these as needed. They might be overly formal noise.
-
- # def t_SPACE_TABS(self, t):
- # r'[\t ]+'
- # return t
-
- def t_HTML_ENTITY_HEX(self, t):
+ def t_ANY_HTML_ENTITY_HEX(self, t):
r'&\#x(?P<HTML_ENTITY_HEX_NUM>[0-9a-fA-F]+);'
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_HEX_NUM'), 16))
+ t.type = 'TEXT'
return t
- def t_HTML_ENTITY_DEC(self, t):
+ def t_ANY_HTML_ENTITY_DEC(self, t):
r'&\#(?P<HTML_ENTITY_DEC_NUM>[0-9]+);'
# Group indexes reference the combined, master regex: hard to predict.
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_DEC_NUM')))
+ t.type = 'TEXT'
return t
- def t_HTML_ENTITY_SYM(self, t):
+ def t_ANY_HTML_ENTITY_SYM(self, t):
r'&(?P<HTML_ENTITY_SYM_NAME>[a-zA-Z1-4]+);'
sym = t.lexer.lexmatch.group('HTML_ENTITY_SYM_NAME')
if sym in html_entities:
t.value = unichr(html_entities[sym])
- else:
- t.type = 'text'
+ t.type = 'TEXT'
+ return t
+
+ def t_ANY_HARMLESS_TEXT(self, t):
+ r'[a-zA-Z0-9]+'
+ # Runs of stuff that can't possibly be part of another token. An
+ # optimization to avoid hitting t_ANY_TEXT
+ # TODO: Harmless Unicode chars are missing, so Japanese will go slow.
+ t.type = 'TEXT'
+ return t
+
+ def t_ANY_TEXT(self, t): # probably scarily inefficient
+ r'.'
return t
# <url-path> ::= <url-char> [<url-path>]
# <url-char> ::= LEGAL_URL_ENTITY # Not only "abc" and "%23" but "%ey", all of which should be preserved verbatim.
- def t_error(self, t):
+ def t_ANY_error(self, t):
raise LexError('Illegal character', t.value[0])
#t.lexer.skip(1)
+
+ def __iter__(self):
+ return merged_text_tokens(iter(self.lexer))
+
+ def input(self, text):
+ return self.lexer.input(text)
- # Everything after the t_ in anything that starts with t_:
- tokens = ([k[2:] for k in vars().keys() if k.startswith('t_') and k != 't_error'] +
- ['text'])
+ tokens = ['NEWLINE', 'TEXT']
-lexer = LexerBox().lexer
+lexer = LexerBox()
# TODO: Since we might have multiple threads, have the class build the lexer
# once and stash it in a class var. Then clone from it on construction of
# future instances.
+
+
+def merged_text_tokens(tokens):
+ """Merge adjacent TEXT tokens in the given iterable of LexTokens."""
+ acc = []
+ for t in tokens:
+ if t.type == 'TEXT':
+ acc.append(t.value)
+ else:
+ if acc:
+ yield Token('TEXT', ''.join(acc))
+ acc = []
+ yield t
+ if acc: # in case last token is TEXT
+ yield Token('TEXT', ''.join(acc))
+
+
+if __name__ == '__main__':
+ def repl():
+ while True:
+ try:
+ input = raw_input('lexer> ')
+ except EOFError:
+ break
+ try:
+ lexer.input(input)
+ print list(lexer)
+ except LexError, e:
+ print e
+ repl()
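A sample REPL session (output reconstructed from the rules above, so treat it as illustrative). Note how merged_text_tokens collapses the run of single-character tokens produced by t_ANY_TEXT into one TEXT token:

    $ python lexer.py
    lexer> hi &mdash; there
    [T('TEXT', u'hi \u2014 there')]
    lexer> <nowiki>''not bold''</nowiki>
    [T('TEXT', "''not bold''")]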
tests.py
@@ -5,59 +5,75 @@
from ply.lex import LexToken
-from lexer import lexer, LexError
-
-
-class T(object):
- """LexToken-like class, initializable on construction
-
- Equality with LexTokens is based on the type and value attrs, though value
- comparison is skipped if T.value is None.
-
- """
-
- def __init__(self, type_, value=None):
- self.type_ = type_
- self.value = value
-
- def __eq__(self, other):
- """Compare type and, if it's specified, value."""
- return (self.type_ == other.type and
- (self.value is None or self.value == other.value))
-
- def __ne__(self, other):
- return not self == other
-
- def __str__(self):
- return 'T(%s, %s)' % (repr(self.type_), repr(self.value))
-
- __repr__ = __str__
+from lexer import lexer, LexError, Token as T
def lexed_eq(input, want):
+ """Assert lexing `input` yields `want`."""
lexer.input(input)
got = list(lexer)
if want != got:
raise AssertionError('%s != %s' % (got, want))
+def html_eq(input, want):
+ """Assert the lexed, parsed, HTML-formatted input string equals `want`.
+
+ Lets differences in linebreaks slide.
+
+ """
+
+
class LexerTests(TestCase):
def test_newline(self):
lexed_eq('\r\r\n\n\r\n', [T('NEWLINE', '\r'),
T('NEWLINE', '\r\n'),
T('NEWLINE', '\n\r'),
T('NEWLINE', '\n')])
- def test_space_tabs(self):
- lexed_eq(' ', [T('SPACE_TABS', ' ')])
- lexed_eq('\t', [T('SPACE_TABS', '\t')])
+ # def test_space_tabs(self):
+ # lexed_eq(' ', [T('SPACE_TABS', ' ')])
+ # lexed_eq('\t', [T('SPACE_TABS', '\t')])
def test_html_entity(self):
- lexed_eq('&#x2014;', [T('HTML_ENTITY_HEX', u'\u2014')])
- lexed_eq('&#8212;', [T('HTML_ENTITY_DEC', u'\u2014')])
- lexed_eq('&mdash;', [T('HTML_ENTITY_SYM', u'\u2014')])
- lexed_eq('&badentity;', [T('text', '&badentity;')])
-
+ lexed_eq('&#x2014;', [T('TEXT', u'\u2014')])
+ lexed_eq('&#8212;', [T('TEXT', u'\u2014')])
+ lexed_eq('&mdash;', [T('TEXT', u'\u2014')])
+ lexed_eq('&badentity;', [T('TEXT', '&badentity;')])
+
+ def test_nowiki(self):
+ lexed_eq("<nowiki>''not bold''</nowiki>", [T('TEXT', "''not bold''")])
+
+ # HTML entities inside <nowiki> should be resolved.
+ lexed_eq("<nowiki>&#8212;</nowiki>", [T('TEXT', u'\u2014')])
+
+ lexed_eq('</nowiki>', [T('TEXT', '</nowiki>')])
+
+ # <nowiki>s aren't nestable. Uncomment when bold is implemented.
+ # lexed_eq("<nowiki><nowiki></nowiki>''hey''</nowiki>",
+ # [T('TEXT', '<nowiki>'),
+ # T('BOLD'),
+ # T('TEXT', 'hey'),
+ # T('BOLD_END'),
+ # T('TEXT', '</nowiki>')])
+
+ def test_text(self):
+ lexed_eq('hi', [T('TEXT', 'hi')])
+
+
+class IntegrationTests(TestCase):
+ """Tests of the whole stack, from lexer to HTML formatter"""
+
+ def test_h1(self):
+ html_eq('= h1 = trailer', '<p>= h1 = = there = boo</p>')
+ html_eq(' = h1 =', '<pre>= h1 =</pre>')
+ html_eq('= h1 ==', # An H1 containing a trailing equal sign
+ '<h1> <span class="mw-headline" id="h1_.3D"> h1 =</span></h1>')
+
+# Some challenging test cases:
+# <ref>[http://www.susanscott.net/Oceanwatch2002/mar1-02.html Seaweed also plays a role in the formation of sand<!-- Bot generated title -->]</ref>, from wikipedia:Sand
+# [[File:Suesswasserstachelroche.jpg|thumb|A [[stingray]] about to bury itself in sand]]
+# In MW, [[clay [[this [[thing]]]]]] links "thing". py-wikimarkup links the whole thing.
if __name__ == '__main__':
unittest.main()
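One subtlety worth illustrating (not in the diff): Token.__eq__ skips the value comparison when the left-hand value is None, so an expectation can pin down just the token type. That is what lets the commented-out bold test above write T('BOLD') with no value.

    >>> from lexer import Token as T
    >>> T('NEWLINE') == T('NEWLINE', '\r\n')  # value of None acts as a wildcard
    True
    >>> T('NEWLINE', '\r') == T('NEWLINE', '\n')
    False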
