* Removed space_tabs token until it sees some use.
* Corrected token type of entity rules (to TEXT).
* Implemented <nowiki>, using a separate lexer state. Cool!
* Added some corner-case tests for headings for the future.
* Added catch-all rules to slurp up normal text.
* Stopped using introspection to enumerate token types. Multiple lexer states made it more trouble than it was worth.
* Running lexer.py now gives you a REPL to play around in.
erikrose committed Dec 28, 2010
1 parent 6984912 commit 77b54ca
Showing 4 changed files with 196 additions and 65 deletions.
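
To get a quick feel for the behavior described in the commit message, the new module-level lexer can be driven directly. This is a minimal sketch based on the tests added in this commit (it assumes PLY is installed and lexer.py is importable); Python 2, matching the project's code:

    from lexer import lexer

    lexer.input('<nowiki>&#8212;</nowiki>')
    print list(lexer)    # [T('TEXT', u'\u2014')] -- entity resolved to TEXT, <nowiki> tags swallowed

    lexer.input('&badentity;')
    print list(lexer)    # [T('TEXT', '&badentity;')] -- unknown entities fall through as plain TEXT

Running python lexer.py drops you into the same loop interactively, per the new REPL.
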
21 changes: 18 additions & 3 deletions constants.py
@@ -2,10 +2,25 @@
# supported with variations of case, (2) this lacks &apos;, for which there's a
# complicated discussion at http://www.mail-archive.com/mediawiki-
# cvs@lists.wikimedia.org/msg01907.html.
#
# These are current as of MW 1.16.0.
html_entities = {
u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
u'Aacute': 193,
u'aacute': 225,
u'Acirc': 194,
u'acirc': 226,
u'acute': 180,
u'AElig': 198,
u'aelig': 230,
u'Agrave': 192,
u'agrave': 224,
u'alefsym': 8501,
u'Alpha': 913,
u'alpha': 945,
u'amp': 38,
u'and': 8743,
u'ang': 8736,
u'Aring': 197,
u'aring': 229,
u'asymp': 8776,
u'Atilde': 195,
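
As a hedged illustration of how this table is consumed (it mirrors the symbolic-entity rule t_ANY_HTML_ENTITY_SYM in lexer.py below; resolve_entity is a made-up helper for the example, not part of the commit), in Python 2:

    from constants import html_entities

    def resolve_entity(name):
        # Hypothetical helper: return the unicode character for a symbolic
        # entity name such as 'mdash', or None if the name isn't in the table.
        codepoint = html_entities.get(name)
        return unichr(codepoint) if codepoint is not None else None

    print repr(resolve_entity('mdash'))      # u'\u2014'
    print repr(resolve_entity('badentity'))  # None
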
14 changes: 13 additions & 1 deletion design.txt
@@ -35,12 +35,17 @@ Not researched in depth.
* Claims to be nearly as fast as C
o Requires a C build step

ANTLR
-----
o Separate code generation step
o Slow because it generates a lot of function calls

Previous work
=============
* OCaml lexer implementation: http://www.mediawiki.org/wiki/MediaWiki_lexer
* Markup spec: http://www.mediawiki.org/wiki/Markup_spec
* BNF grammar: http://www.mediawiki.org/wiki/Markup_spec/BNF
* Corresponds closely to lex input format
* Corresponds closely to yacc input format
* Pretty comprehensive: lots of English describing corner cases and error recovery
* Also discusses render phase
* EBNF grammar: http://www.mediawiki.org/wiki/Markup_spec/EBNF
@@ -59,6 +64,13 @@ Milestones
* Get apostrophes working (to test ambiguity support).
* Implement productions, tag by tag

Units of estimation
===================
* Apostrophe jungle
* Tables
* Lists
* HTML outputter

Notes
=====
LR good. LALR even better.
140 changes: 114 additions & 26 deletions lexer.py 100644 → 100755
@@ -1,8 +1,11 @@
#!/usr/bin/env python
"""What will eventually become a MediaWiki lexer
Based on the work at http://www.mediawiki.org/wiki/Markup_spec/BNF
"""
import readline # Make raw_input() cool.

from ply import lex
from ply.lex import LexError as PlyLexError, lex

@@ -17,79 +20,164 @@ def __unicode__(self):
return u'%s: %s' % (self.args[0], self.text)


class Token(object):
"""LexToken-like class, initializable on construction
Equality with LexTokens is based on the type and value attrs, though value
comparison is skipped if T.value is None.
"""

def __init__(self, type, value=None):
self.type = type
self.value = value

def __eq__(self, other):
"""Compare type and, if it's specified, value."""
return (self.type == other.type and
(self.value is None or self.value == other.value))

def __ne__(self, other):
return not self == other

def __str__(self):
return 'T(%s, %s)' % (repr(self.type), repr(self.value))

__repr__ = __str__


class LexerBox(object):
"""A container to group token definitions, hold state, & afford subclassing
Don't instantiate these; that's expensive. Instead, use the module-level
instance in `lexer`.
"""
states = [('nowiki', 'exclusive')]

def __init__(self):
"""Combine the regexes and such. This is expensive."""
self.lexer = lex(module=self, debug=True)

# Remember, when defining tokens, not to couple any HTML-output-specific
# transformations to their t.values. That's for the parser to decide.
# transformations to their t.values. That's for the formatter to decide.

# The secret to lexer/parser division is: lexer recognizes only terminals
# (or else makes recursive calls to itself).

# Any line that does not start with one of the following is not a special
# block: " " | "{|" | "#" | ";" | ":" | "*" | "=".
# (http://www.mediawiki.org/wiki/Markup_spec/BNF/Article#Article)

# The secret to lexer/parser division is: lexer recognizes only terminals.
# TODO: Would using __slots__ in LexToken speed things up? token()
# instantiates a lot of them.

# How does PLY tell what order tokens are defined in? Allegedly, it adds
# the callable ones in definition order and then the string ones in
# ascending length order. [Ed: It looks at each function obj to get
# co_firstlineno. Thus, subclassing this might not work as well as I
# thought. TODO: Reconsider how to extend.]

# Fundamental elements
# (http://www.mediawiki.org/wiki/Markup_spec/BNF/Fundamental_elements):
def t_NOWIKI(self, t):
r'<[nN][oO][wW][iI][kK][iI]>'
t.lexer.push_state('nowiki') # Use stack in case inside a table or something.
# TODO: Optimize this state by making a special text token that'll chew
# up anything that's not </nowiki>.
return None

def t_nowiki_NOWIKI_END(self, t):
r'</[nN][oO][wW][iI][kK][iI]>'
t.lexer.pop_state()
return None

# def t_HEADING(self, t):
# r'^(?P<HEADING_LEVEL>={1,6})(.+)\g<HEADING_LEVEL>\s*' # TODO: Or do we just match the terminals and let the parser sort out the pairing of === spans? H2 :: =={text}=={whitespace}. Or do we match ^== and then throw the lexer into a 'header' state which tries to .... Can't just match the whole line in one regex, because then the lexer never gets a chance to parse the text of the header normally and resolve the entities.
# # Swallows trailing whitespace like MediaWiki
# t.type =

def t_NEWLINE(self, t):
r'(?:\r\n|\n\r|\r|\n)'
return t

#t_newlines: >=1 t_newline. In the BNF but possibly unneeded.
#t_bol: beginning of line. Should be able to fold into individual regexes.
#t_eol: same
#t_space = r'[ ]' # Brackets because PLY compiles regexes with re.VERBOSE
#t_spaces = r'[ ]+'
#t_space_tab = r'[\t ]'
# Add the rest of these as needed. They might be overly formal noise.

# def t_SPACE_TABS(self, t):
# r'[\t ]+'
# return t

def t_HTML_ENTITY_HEX(self, t):
def t_ANY_HTML_ENTITY_HEX(self, t):
r'&\#x(?P<HTML_ENTITY_HEX_NUM>[0-9a-fA-F]+);'
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_HEX_NUM'), 16))
t.type = 'TEXT'
return t

def t_HTML_ENTITY_DEC(self, t):
def t_ANY_HTML_ENTITY_DEC(self, t):
r'&\#(?P<HTML_ENTITY_DEC_NUM>[0-9]+);'
# Group indexes reference the combined, master regex: hard to predict.
t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_DEC_NUM')))
t.type = 'TEXT'
return t

def t_HTML_ENTITY_SYM(self, t):
def t_ANY_HTML_ENTITY_SYM(self, t):
r'&(?P<HTML_ENTITY_SYM_NAME>[a-zA-Z1-4]+);'
sym = t.lexer.lexmatch.group('HTML_ENTITY_SYM_NAME')
if sym in html_entities:
t.value = unichr(html_entities[sym])
else:
t.type = 'text'
t.type = 'TEXT'
return t

def t_ANY_HARMLESS_TEXT(self, t):
r'[a-zA-Z0-9]+'
# Runs of stuff that can't possibly be part of another token. An
# optimization to avoid hitting t_ANY_TEXT
# TODO: Harmless Unicode chars are missing, so Japanese will go slow.
t.type = 'TEXT'
return t

def t_ANY_TEXT(self, t): # probably scarily inefficient
r'.'
return t

# <url-path> ::= <url-char> [<url-path>]
# <url-char> ::= LEGAL_URL_ENTITY # Not only "abc" and "%23" but "%ey", all of which should be preserved verbatim.

def t_error(self, t):
def t_ANY_error(self, t):
raise LexError('Illegal character', t.value[0])
#t.lexer.skip(1)

def __iter__(self):
return merged_text_tokens(iter(self.lexer))

def input(self, text):
return self.lexer.input(text)

# Everything after the t_ in anything that starts with t_:
tokens = ([k[2:] for k in vars().keys() if k.startswith('t_') and k != 't_error'] +
['text'])
tokens = ['NEWLINE', 'TEXT']

lexer = LexerBox().lexer
lexer = LexerBox()
# TODO: Since we might have multiple threads, have the class build the lexer
# once and stash it in a class var. Then clone from it on construction of
# future instances.


def merged_text_tokens(tokens):
"""Merge adjacent TEXT tokens in the given iterable of LexTokens."""
acc = []
for t in tokens:
if t.type == 'TEXT':
acc.append(t.value)
else:
if acc:
yield Token('TEXT', ''.join(acc))
acc = []
yield t
if acc: # in case last token is TEXT
yield Token('TEXT', ''.join(acc))


if __name__ == '__main__':
def repl():
while True:
try:
input = raw_input('lexer> ')
except EOFError:
break
try:
lexer.input(input)
print list(lexer)
except LexError, e:
print e
repl()
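
One consequence of the catch-all rules combined with merged_text_tokens() above: adjacent TEXT tokens coalesce, so a run of plain text comes out as a single token rather than one per character or per rule hit. A small hedged sketch, with the expected value inferred from the rules and tests in this commit (Python 2):

    from lexer import lexer

    # 'a' and 'b' match t_ANY_HARMLESS_TEXT, '&mdash;' matches t_ANY_HTML_ENTITY_SYM;
    # merged_text_tokens() folds the three resulting TEXT tokens into one.
    lexer.input('a&mdash;b')
    print list(lexer)    # [T('TEXT', u'a\u2014b')]
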
86 changes: 51 additions & 35 deletions tests.py
@@ -5,59 +5,75 @@

from ply.lex import LexToken

from lexer import lexer, LexError


class T(object):
"""LexToken-like class, initializable on construction
Equality with LexTokens is based on the type and value attrs, though value
comparison is skipped if T.value is None.
"""

def __init__(self, type_, value=None):
self.type_ = type_
self.value = value

def __eq__(self, other):
"""Compare type and, if it's specified, value."""
return (self.type_ == other.type and
(self.value is None or self.value == other.value))

def __ne__(self, other):
return not self == other

def __str__(self):
return 'T(%s, %s)' % (repr(self.type_), repr(self.value))

__repr__ = __str__
from lexer import lexer, LexError, Token as T


def lexed_eq(input, want):
"""Assert lexing `input` yields `want`."""
lexer.input(input)
got = list(lexer)
if want != got:
raise AssertionError('%s != %s' % (got, want))


def html_eq(input, want):
"""Assert the lexed, parsed, HTML-formatted input string equals `want`.
Lets differences in linebreaks slide.
"""


class LexerTests(TestCase):
def test_newline(self):
lexed_eq('\r\r\n\n\r\n', [T('NEWLINE', '\r'),
T('NEWLINE', '\r\n'),
T('NEWLINE', '\n\r'),
T('NEWLINE', '\n')])

def test_space_tabs(self):
lexed_eq(' ', [T('SPACE_TABS', ' ')])
lexed_eq('\t', [T('SPACE_TABS', '\t')])
# def test_space_tabs(self):
# lexed_eq(' ', [T('SPACE_TABS', ' ')])
# lexed_eq('\t', [T('SPACE_TABS', '\t')])

def test_html_entity(self):
lexed_eq('&#x2014;', [T('HTML_ENTITY_HEX', u'\u2014')])
lexed_eq('&#8212;', [T('HTML_ENTITY_DEC', u'\u2014')])
lexed_eq('&mdash;', [T('HTML_ENTITY_SYM', u'\u2014')])
lexed_eq('&badentity;', [T('text', '&badentity;')])

lexed_eq('&#x2014;', [T('TEXT', u'\u2014')])
lexed_eq('&#8212;', [T('TEXT', u'\u2014')])
lexed_eq('&mdash;', [T('TEXT', u'\u2014')])
lexed_eq('&badentity;', [T('TEXT', '&badentity;')])

def test_nowiki(self):
lexed_eq("<nowiki>''not bold''</nowiki>", [T('TEXT', "''not bold''")])

# HTML entities inside <nowiki> should be resolved.
lexed_eq("<nowiki>&#8212;</nowiki>", [T('TEXT', u'\u2014')])

lexed_eq('</nowiki>', [T('TEXT', '</nowiki>')])

# <nowiki>s aren't nestable. Uncomment when bold is implemented.
# lexed_eq("<nowiki><nowiki></nowiki>''hey''</nowiki>",
# [T('TEXT', '<nowiki>'),
# T('BOLD'),
# T('TEXT', 'hey'),
# T('BOLD_END'),
# T('TEXT', '</nowiki>')])

def test_text(self):
lexed_eq('hi', [T('TEXT', 'hi')])


class IntegrationTests(TestCase):
"""Tests of the whole stack, from lexer to HTML formatter"""

def test_h1(self):
html_eq('= h1 = trailer', '<p>= h1 = = there = boo</p>')
html_eq(' = h1 =', '<pre>= h1 =</pre>')
html_eq('= h1 ==', # An H1 containing a trailing equal sign
'<h1> <span class="mw-headline" id="h1_.3D"> h1 =</span></h1>')

# Some challenging test cases:
# <ref>[http://www.susanscott.net/Oceanwatch2002/mar1-02.html Seaweed also plays a role in the formation of sand<!-- Bot generated title -->]</ref>, from wikipedia:Sand
# [[File:Suesswasserstachelroche.jpg|thumb|A [[stingray]] about to bury itself in sand]]
# In MW, [[clay [[this [[thing]]]]]] links "thing". py-wikimarkup links the whole thing.

if __name__ == '__main__':
unittest.main()
