Reorganize the tree structure

commit f6697cd362ca81726750a1ec06de121da4c0dae8 1 parent 939415b
@peter17 peter17 authored
1  TODO.rst
@@ -24,6 +24,7 @@ Parsing functions .
Magic links (ISBN, RFC...) . Not needed for the moment
Galleries . Not needed for the moment
Behavior switches (__toc__, __notoc__...) . Not needed for the moment
+Signatures (~~~, ~~~~, ~~~~~) . Not needed for the moment
=================================================================== ============== =======================================================
======= ==============================
0  spikes/pijnu/apostropheParser.py → apostrophes.py
File renamed without changes
247 lexer.py
@@ -1,247 +0,0 @@
-#!/usr/bin/env python
-"""What will eventually become a MediaWiki lexer"""
-
-import re
-import readline # Make raw_input() cool.
-
-from ply.lex import LexError as PlyLexError, lex
-
-from mediawiki_parser.constants import html_entities
-
-
-class LexError(PlyLexError):
-    def __str__(self):
-        return str(self.__unicode__())
-
-    def __unicode__(self):
-        return u'%s: %s' % (self.args[0], self.text)
-
-
-class Token(object):
-    """LexToken-like class, initializable on construction
-
-    Equality with LexTokens is based on the type and value attrs, though value
-    comparison is skipped if T.value is None.
-
-    """
-
-    def __init__(self, type, value=None):
-        self.type = type
-        self.value = value
-
-    def __eq__(self, other):
-        """Compare type and, if it's specified, value."""
-        return (self.type == other.type and
-                (self.value is None or self.value == other.value))
-
-    def __ne__(self, other):
-        return not self == other
-
-    def __str__(self):
-        return 'T(%s, %s)' % (repr(self.type), repr(self.value))
-
-    __repr__ = __str__
-
-
-class LexerBox(object):
-    """A container to group token definitions, hold state, & afford subclassing
-
-    Don't instantiate these; that's expensive. Instead, use the module-level
-    instance in `lexer`.
-
-    """
-    states = [('nowiki', 'exclusive'),
-              ('heading', 'exclusive')]
-    TITLE_LEGAL_CHARS = """[ %!"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\x80-\xFF+\n\r]+"""
-
-    def __init__(self):
-        """Combine the regexes and such. This is expensive."""
-        self.lexer = lex(module=self, debug=True, reflags=re.M)
-        self.heading_level = 0 # number of =s in the start token of the heading we're currently scanning
-
-    # Remember, when defining tokens, not to couple any HTML-output-specific
-    # transformations to their t.values. That's for the formatter to decide.
-
-    # The secret to lexer/parser division is: lexer recognizes only terminals
-    # (or else makes recursive calls to itself).
-
-    # Any line that does not start with one of the following is not a special
-    # block: " " | "{|" | "#" | ";" | ":" | "*" | "=".
-    # (http://www.mediawiki.org/wiki/Markup_spec/BNF/Article#Article)
-
-    # TODO: Would using __slots__ in LexToken speed things up? token()
-    # instantiates a lot of them.
-
-    # How does PLY tell what order tokens are defined in? Allegedly, it adds
-    # the callable ones in definition order and then the string ones in
-    # ascending length order. [Ed: It looks at each function obj to get
-    # co_firstlineno. Thus, subclassing this might not work as well as I
-    # thought. TODO: Reconsider how to extend.]
-
-    def t_INITIAL_heading_NOWIKI(self, t):
-        r'<[nN][oO][wW][iI][kK][iI](?:\s[^>]*)?>'
-        t.lexer.push_state('nowiki') # Use stack in case inside a table or something.
-        # TODO: Optimize this state by making a special text token that'll chew
-        # up anything that's not </nowiki>.
-        return None
-
-    def t_nowiki_heading_NOWIKI_END(self, t):
-        r'</[nN][oO][wW][iI][kK][iI]>'
-        t.lexer.pop_state()
-        return None
-
-    def t_HEADING_START(self, t):
-        r'^(?P<HEADING_LEVEL>={1,6})(?=.+?(?P=HEADING_LEVEL)\s*$)'
-        # Hoping the non-greedy .+? makes this a bit more efficient
-        # TODO: If the lookahead-and-set-state thing becomes common, write a wrapper around the lexer we pass to the parser: let us push tokens onto a list from within a token handler, and make the lexer wrapper return those to the parser before finding another token handler. IOW, let us return multiple tokens from one regex match. Beware the implications for things that aren't atomic (which is most things--most things can contain entities, for example).
-        level = len(t.lexer.lexmatch.group('HEADING_LEVEL'))
-        t.type = 'H%i' % level
-        # t.value doesn't matter.
-        self.heading_level = level
-        t.lexer.push_state('heading')
-        return t
-
-    def t_heading_HEADING_END(self, t):
-        r'=+\s*$' # Swallows trailing whitespace like MediaWiki
-        # If we mistakenly match too early and catch more =s than needed in a
-        # heading like = hi ==, return one of the =s as a text token, and
-        # resume lexing at the next = to try again. It was either this or else
-        # face a profusion of states, one for each heading level (or
-        # dynamically add a regex to the master regex, which would be a cool
-        # feature for it to support). Headings that end in = should be a rare
-        # case, thankfully.
-        matched_level = len(t.value.rstrip())
-        if matched_level > self.heading_level:
-            t.type = 'TEXT'
-            t.value = '='
-            t.lexer.lexpos -= matched_level - 1
-        else:
-            t.type = 'H%i_END' % self.heading_level
-            # t.value doesn't matter.
-            self.heading_level = 0
-            t.lexer.pop_state()
-        return t
-
-    def t_HR(self, t):
-        r'^----+'
-        # t.value doesn't matter.
-        return t
-
-    def t_NEWLINE(self, t):
-        r'(?:\r\n|\n\r|\r|\n)'
-        return t
-
-    def t_ANY_HTML_ENTITY_HEX(self, t):
-        r'&\#x(?P<HTML_ENTITY_HEX_NUM>[0-9a-fA-F]+);'
-        t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_HEX_NUM'), 16))
-        t.type = 'TEXT'
-        return t
-
-    def t_ANY_HTML_ENTITY_DEC(self, t):
-        r'&\#(?P<HTML_ENTITY_DEC_NUM>[0-9]+);'
-        # Group indexes reference the combined, master regex: hard to predict.
-        t.value = unichr(int(t.lexer.lexmatch.group('HTML_ENTITY_DEC_NUM')))
-        t.type = 'TEXT'
-        return t
-
-    def t_ANY_HTML_ENTITY_SYM(self, t):
-        r'&(?P<HTML_ENTITY_SYM_NAME>[a-zA-Z1-4]+);'
-        sym = t.lexer.lexmatch.group('HTML_ENTITY_SYM_NAME')
-        if sym in html_entities:
-            t.value = unichr(html_entities[sym])
-        t.type = 'TEXT'
-        return t
-
-    def t_INITIAL_heading_INTERNAL_LINK(self, t):
-        # The canonical regex for internal links is $e1 in replaceInternalLinks2() in Parser.php. (TODO: See about accounting for $e1_img as well.) The MW internal link recognizer splits the whole page on "[[" and then runs a regex over each element. We attempt to emulate that behavior here by tweaking the MW pattern.
-        # If you want the lexer to state, without doubt, what a [[ token represents, you have to lookahead. Is there a way to disambiguate a [[ later, in the parser? Perhaps: just have the lexer call it a DOUBLE_BRACKET or something, and then have error fallbacks in the parser for if it can't pair them with anything. I suspect, though, that disambiguating gets more expensive as we go deeper in the stack.
-        return t
-    t_INITIAL_heading_INTERNAL_LINK.__doc__ = (
-        r'\[\[['
-        + self.TITLE_LEGAL_CHARS +
-        r']+'
-        r'(?:\|.+?)?]][a-zA-Z]*') # We might need a lexer state (and a token with a different pattern with a different lookahead) for each combination of present subgroups: title, alternate, and ending; title, alternate; title, ending. Otherwise, I'm not sure how we're going to backtrack to get the precise subgroup boundaries that this regex chose. Try to think of a better way. Hmm. I'm starting to think it would be better after all to embrace ambiguity at the lexer level and return DOUBLE_BRACKET or even just BRACKET. Next: Determine whether the parser can sort it out. If all the interesting productions fail, for example, just return a "[[" text node. Can we left-factorize things until the lookahead of 1 is sufficient?
-    # TODO: Perhaps add a post-parse pass that finds text after links and moves any leading [a-zA-Z] portion thereof onto the link's .ending attr. That way, we can support [[hey]]&#0065;, which MW doesn't.
-
-
-    def t_INITIAL_heading_INTERNAL_LINK_END(self, t):
-        r'\]\]'
-        return t
-
-    def t_INITIAL_heading_TITLE_LEGAL_CHARS(self, t):
-        # from $wgLegalTitleChars in DefaultSettings.php. Added \n\r to make up for not being able to set PCRE_DOTALL on the pattern.
-        return something
-    t_INITIAL_heading_TITLE_LEGAL_CHARS.__doc__ = TITLE_LEGAL_CHARS
-
-    def t_ANY_HARMLESS_TEXT(self, t):
-        r'[a-zA-Z0-9]+'
-        # Runs of stuff that can't possibly be part of another token. An
-        # optimization to avoid hitting t_ANY_TEXT
-        # TODO: Harmless Unicode chars are missing, so Japanese will go slow.
-        t.type = 'TEXT'
-        t.value = unicode(t.value)
-        return t
-
-    def t_ANY_TEXT(self, t): # probably scarily inefficient
-        r'.'
-        t.value = unicode(t.value)
-        return t
-
-    # <url-path> ::= <url-char> [<url-path>]
-    # <url-char> ::= LEGAL_URL_ENTITY # Not only "abc" and "%23" but "%ey", all of which should be preserved verbatim.
-
-    def t_ANY_error(self, t):
-        raise LexError('Illegal character', t.value[0])
-        #t.lexer.skip(1)
-
-    def __iter__(self):
-        return merged_text_tokens(iter(self.lexer))
-
-    def token(self):
-        return self.lexer.token()
-
-    def input(self, text):
-        """Break the given text into tokens.
-
-        Results are available by iterating over me.
-
-        """
-        return self.lexer.input(text)
-
-    tokens = ['NEWLINE', 'TEXT', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H1_END', 'H2_END', 'H3_END', 'H4_END', 'H5_END', 'H6_END', 'HR', 'INTERNAL_LINK', 'INTERNAL_LINK_END']
-
-lexer = LexerBox()
-# TODO: Since we might have multiple threads, have the class build the lexer
-# once and stash it in a class var. Then clone from it on construction of
-# future instances.
-
-
-def merged_text_tokens(tokens):
-    """Merge adjacent TEXT tokens in the given iterable of LexTokens."""
-    # I hope to make this unnecessary with clever definitions of tokens somehow.
-    acc = []
-    for t in tokens:
-        if t.type == 'TEXT':
-            acc.append(t.value)
-        else:
-            if acc:
-                yield Token('TEXT', ''.join(acc))
-                acc = []
-            yield t
-    if acc: # in case last token is TEXT
-        yield Token('TEXT', ''.join(acc))
-
-
-if __name__ == '__main__':
-    def repl():
-        while True:
-            try:
-                input = raw_input('lexer> ')
-            except EOFError:
-                break
-            try:
-                lexer.input(input)
-                print list(lexer)
-            except LexError, e:
-                print e
-    repl()
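
Note on the deleted lexer: it leans on PLY's exclusive lexer states, where push_state/pop_state maintain a stack so that a <nowiki> region suspends the normal token rules until the closing tag. A minimal, self-contained sketch of that technique follows; the token and state names are illustrative, not this project's.

# Sketch of the push_state/pop_state technique used for <nowiki> above.
# Requires PLY; Python 2 to match the surrounding code.
import ply.lex as lex

tokens = ['TEXT', 'NOWIKI', 'NOWIKI_END']
states = [('nowiki', 'exclusive')]  # 'exclusive': INITIAL rules don't apply inside

def t_NOWIKI(t):
    r'<nowiki>'
    t.lexer.push_state('nowiki')  # a stack, so nested contexts unwind in order

def t_nowiki_NOWIKI_END(t):
    r'</nowiki>'
    t.lexer.pop_state()

def t_ANY_TEXT(t):  # t_ANY_ rules participate in every state
    r'[^<]+'
    return t

t_ANY_ignore = ''

def t_ANY_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input("a<nowiki>''b''</nowiki>c")
print [tok.value for tok in lexer]  # ['a', "''b''", 'c']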
8 spikes/pijnu/mediawiki.pijnu → mediawiki.pijnu
@@ -3,7 +3,7 @@ wikitext
def setNullValue(node):
node.value = ''
def parseAllQuotes(node):
- from apostropheParser import parseQuotes
+ from apostrophes import parseQuotes
node.value = parseQuotes(node.value)
<definition>
# codes
@@ -51,7 +51,7 @@ def parseAllQuotes(node):
HTTP : "http://" : liftValue
FTP : "ftp://" : liftValue
- protocole : HTTP / FTP : liftValue
+ protocol : HTTP / FTP : liftValue
# tags
NOWIKI_BEGIN : "<nowiki>" : drop
@@ -67,7 +67,7 @@ def parseAllQuotes(node):
titleEnd : TITLE6_END/TITLE5_END/TITLE4_END/TITLE3_END/TITLE2_END/TITLE1_END
# character expression
- escChar : L_BRACKET/R_BRACKET/protocole/PIPE/L_BRACE/R_BRACE
+ escChar : L_BRACKET/R_BRACKET/protocol/PIPE/L_BRACE/R_BRACE
escSeq : escChar / tag / titleEnd
rawChar : !escSeq [\x20..\xff]
rawText : rawChar+ : join parseAllQuotes
@@ -78,7 +78,7 @@ def parseAllQuotes(node):
pageName : rawChar+ : join
templateName : rawChar+ : join
address : (!(SPACE/QUOTE) [\x21..\xff])+ : liftValue
- url : protocole address : join
+ url : protocol address : join
boldText : BOLD_BEGIN inline BOLD_END : liftValue
italicText : ITALIC_BEGIN inline ITALIC_END : liftValue
value : EQUAL cleanInline : liftValue
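
In pijnu's notation, `/` is ordered choice, so the renamed rule reads: `protocol` matches `HTTP`, or else `FTP`. Each grammar rule becomes an attribute of the generated parser (as wikitext_test.py below exercises), so a single rule can be tried on its own. A sketch in the style of this commit's own scripts, with an invented source string:

# Sketch following the pattern in parser.py / wikitext_test.py (Python 2, pijnu).
from pijnu import makeParser
mediawikiParser = makeParser(file("mediawiki.pijnu").read())

source = u"See http://www.mediawiki.org or ftp://example.org for details."
print mediawikiParser.url.findAll(source)  # url is protocol + address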
333 parse.py
@@ -1,333 +0,0 @@
-#!/usr/bin/env python
-# This is on its way out in favor of a PLY-based implementation.
-"""What will eventually become a MediaWiki parser.
-
-Based on the work at http://www.mediawiki.org/wiki/Markup_spec/BNF
-
-"""
-
-from pyparsing import *
-import string
-
-
-# Different from html5lib.constants.entities in that (1) some of these are supported in multiple cases, (2) this lacks &apos, for which there's a complicated discussion at http://www.mail-archive.com/mediawiki-cvs@lists.wikimedia.org/msg01907.html.
-_htmlEntities = {
- u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
- u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
- u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
- u'aring': 229,
- u'asymp': 8776,
- u'Atilde': 195,
- u'atilde': 227,
- u'Auml': 196,
- u'auml': 228,
- u'bdquo': 8222,
- u'Beta': 914,
- u'beta': 946,
- u'brvbar': 166,
- u'bull': 8226,
- u'cap': 8745,
- u'Ccedil': 199,
- u'ccedil': 231,
- u'cedil': 184,
- u'cent': 162,
- u'Chi': 935,
- u'chi': 967,
- u'circ': 710,
- u'clubs': 9827,
- u'cong': 8773,
- u'copy': 169,
- u'crarr': 8629,
- u'cup': 8746,
- u'curren': 164,
- u'dagger': 8224,
- u'Dagger': 8225,
- u'darr': 8595,
- u'dArr': 8659,
- u'deg': 176,
- u'Delta': 916,
- u'delta': 948,
- u'diams': 9830,
- u'divide': 247,
- u'Eacute': 201,
- u'eacute': 233,
- u'Ecirc': 202,
- u'ecirc': 234,
- u'Egrave': 200,
- u'egrave': 232,
- u'empty': 8709,
- u'emsp': 8195,
- u'ensp': 8194,
- u'Epsilon': 917,
- u'epsilon': 949,
- u'equiv': 8801,
- u'Eta': 919,
- u'eta': 951,
- u'ETH': 208,
- u'eth': 240,
- u'Euml': 203,
- u'euml': 235,
- u'euro': 8364,
- u'exist': 8707,
- u'fnof': 402,
- u'forall': 8704,
- u'frac12': 189,
- u'frac14': 188,
- u'frac34': 190,
- u'frasl': 8260,
- u'Gamma': 915,
- u'gamma': 947,
- u'ge': 8805,
- u'gt': 62,
- u'harr': 8596,
- u'hArr': 8660,
- u'hearts': 9829,
- u'hellip': 8230,
- u'Iacute': 205,
- u'iacute': 237,
- u'Icirc': 206,
- u'icirc': 238,
- u'iexcl': 161,
- u'Igrave': 204,
- u'igrave': 236,
- u'image': 8465,
- u'infin': 8734,
- u'int': 8747,
- u'Iota': 921,
- u'iota': 953,
- u'iquest': 191,
- u'isin': 8712,
- u'Iuml': 207,
- u'iuml': 239,
- u'Kappa': 922,
- u'kappa': 954,
- u'Lambda': 923,
- u'lambda': 955,
- u'lang': 9001,
- u'laquo': 171,
- u'larr': 8592,
- u'lArr': 8656,
- u'lceil': 8968,
- u'ldquo': 8220,
- u'le': 8804,
- u'lfloor': 8970,
- u'lowast': 8727,
- u'loz': 9674,
- u'lrm': 8206,
- u'lsaquo': 8249,
- u'lsquo': 8216,
- u'lt': 60,
- u'macr': 175,
- u'mdash': 8212,
- u'micro': 181,
- u'middot': 183,
- u'minus': 8722,
- u'Mu': 924,
- u'mu': 956,
- u'nabla': 8711,
- u'nbsp': 160,
- u'ndash': 8211,
- u'ne': 8800,
- u'ni': 8715,
- u'not': 172,
- u'notin': 8713,
- u'nsub': 8836,
- u'Ntilde': 209,
- u'ntilde': 241,
- u'Nu': 925,
- u'nu': 957,
- u'Oacute': 211,
- u'oacute': 243,
- u'Ocirc': 212,
- u'ocirc': 244,
- u'OElig': 338,
- u'oelig': 339,
- u'Ograve': 210,
- u'ograve': 242,
- u'oline': 8254,
- u'Omega': 937,
- u'omega': 969,
- u'Omicron': 927,
- u'omicron': 959,
- u'oplus': 8853,
- u'or': 8744,
- u'ordf': 170,
- u'ordm': 186,
- u'Oslash': 216,
- u'oslash': 248,
- u'Otilde': 213,
- u'otilde': 245,
- u'otimes': 8855,
- u'Ouml': 214,
- u'ouml': 246,
- u'para': 182,
- u'part': 8706,
- u'permil': 8240,
- u'perp': 8869,
- u'Phi': 934,
- u'phi': 966,
- u'Pi': 928,
- u'pi': 960,
- u'piv': 982,
- u'plusmn': 177,
- u'pound': 163,
- u'prime': 8242,
- u'Prime': 8243,
- u'prod': 8719,
- u'prop': 8733,
- u'Psi': 936,
- u'psi': 968,
- u'quot': 34,
- u'radic': 8730,
- u'rang': 9002,
- u'raquo': 187,
- u'rarr': 8594,
- u'rArr': 8658,
- u'rceil': 8969,
- u'rdquo': 8221,
- u'real': 8476,
- u'reg': 174,
- u'rfloor': 8971,
- u'Rho': 929,
- u'rho': 961,
- u'rlm': 8207,
- u'rsaquo': 8250,
- u'rsquo': 8217,
- u'sbquo': 8218,
- u'Scaron': 352,
- u'scaron': 353,
- u'sdot': 8901,
- u'sect': 167,
- u'shy': 173,
- u'Sigma': 931,
- u'sigma': 963,
- u'sigmaf': 962,
- u'sim': 8764,
- u'spades': 9824,
- u'sub': 8834,
- u'sube': 8838,
- u'sum': 8721,
- u'sup': 8835,
- u'sup1': 185,
- u'sup2': 178,
- u'sup3': 179,
- u'supe': 8839,
- u'szlig': 223,
- u'Tau': 932,
- u'tau': 964,
- u'there4': 8756,
- u'Theta': 920,
- u'theta': 952,
- u'thetasym': 977,
- u'thinsp': 8201,
- u'THORN': 222,
- u'thorn': 254,
- u'tilde': 732,
- u'times': 215,
- u'trade': 8482,
- u'Uacute': 218,
- u'uacute': 250,
- u'uarr': 8593,
- u'uArr': 8657,
- u'Ucirc': 219,
- u'ucirc': 251,
- u'Ugrave': 217,
- u'ugrave': 249,
- u'uml': 168,
- u'upsih': 978,
- u'Upsilon': 933,
- u'upsilon': 965,
- u'Uuml': 220,
- u'uuml': 252,
- u'weierp': 8472,
- u'Xi': 926,
- u'xi': 958,
- u'Yacute': 221,
- u'yacute': 253,
- u'yen': 165,
- u'Yuml': 376,
- u'yuml': 255,
- u'Zeta': 918,
- u'zeta': 950,
- u'zwj': 8205,
- u'zwnj': 8204
-}
-
-
-def parsed_eq(expr, text, want):
-    got = expr.parseString(text).asList()
-    if got != want:
-        raise AssertionError('%s != %s' % (got, want))
-
-
-ParserElement.setDefaultWhitespaceChars('') # Whitespace is significant.
-ParserElement.enablePackrat() # Enable memoizing.
-
-
-# Fundamental elements (http://www.mediawiki.org/wiki/Markup_spec/BNF/Fundamental_elements):
-# TODO: Put Group() around almost everything to shape the output into a parse tree. Assign setResultsName()s to everything so we can tell what kind of tokens they are.
-newline = Literal('\r\n') | '\n\r' | '\r' | '\n'
-newlines = Combine(OneOrMore(newline))
-#newlines.verbose_stacktrace = True
-bol = newline | StringStart()
-eol = newline | StringEnd()
-
-space = Literal(' ')
-spaces = Combine(OneOrMore(space))
-space_tab = (space | '\t').parseWithTabs()
-space_tabs = OneOrMore(space_tab)
-
-whitespace_char = (space_tab | newline).parseWithTabs()
-whitespace = Combine(OneOrMore(whitespace_char) + Optional(StringEnd())).parseWithTabs()
-
-hex_digit = oneOf(list(hexnums))
-hex_number = Combine(OneOrMore(hex_digit))
-
-decimal_digit = oneOf(list(nums))
-decimal_number = Combine(OneOrMore(decimal_digit))
-
-underscore = Literal('_')
-html_unsafe_symbol = oneOf(list('<>&')) # TODO: on output, escape
-symbol = Regex('[^0-9a-zA-Z]') # inferred from inadequate description
-lcase_letter = Regex('[a-z]')
-ucase_letter = Regex('[A-Z]')
-letter = Regex('[a-zA-Z]')
-non_whitespace_char = letter | decimal_digit | symbol # Optimize all such combinations; they'd probably benefit from being collapsed into single regex alternations.
-
-html_entity_char = letter | decimal_digit
-html_entity_chars = OneOrMore(html_entity_char)
-html_entity = (('&#x' + hex_number + ';') |
-               ('&#' + decimal_number + ';') |
-               ('&' + oneOf(_htmlEntities.keys()) + ';')).setResultsName('html_entity') #
-
-character = html_entity | whitespace_char | non_whitespace_char
-
-
-# (Temporary?) unit tests:
-parsed_eq(OneOrMore(newline), '\r\r\n\n\r\n', ['\r', '\r\n', '\n\r', '\n'])
-parsed_eq(newlines, '\r\r\n\n\r\n', ['\r\r\n\n\r\n'])
-parsed_eq(bol + 'hi', 'hi', ['hi'])
-parsed_eq(bol + 'hi', '\nhi', ['\n', 'hi'])
-parsed_eq('hi' + eol, 'hi', ['hi'])
-parsed_eq('hi' + eol, 'hi\n', ['hi', '\n'])
-parsed_eq(spaces, ' ', [' '])
-parsed_eq(space_tab, '\t', ['\t'])
-parsed_eq(whitespace_char, '\t', ['\t'])
-parsed_eq(whitespace, ' \t\r', [' \t\r'])
-parsed_eq(whitespace, ' hi', [' ']) # no StringEnd
-parsed_eq(hex_number, '123DECAFBAD', ['123DECAFBAD'])
-parsed_eq(decimal_number, '0123', ['0123'])
-assert character.parseString('&#xdeadbeef;').getName() == 'html_entity'
-assert character.parseString('&aring;').getName() == 'html_entity'
-assert character.parseString('&bozo;').getName() != 'html_entity'
-
-
-print "All's well!"
-
-# try:
-#     p = newlines.parseString(str)
-# except ParseException, e:
-#     print repr(e.msg)
-#     raise
-# else:
-#     print p
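
Once one of the three entity forms above is matched, decoding is plain arithmetic: the hex and decimal forms go through int() and unichr(), and named forms go through a lookup table, exactly as in the deleted lexer's entity handlers. A standalone sketch of that step (Python 2; the function name decode_entity and its tiny name table are illustrative, not from the project):

# Decoding the three entity forms recognized above.
import re

def decode_entity(entity, names={'mdash': 8212, 'amp': 38}):
    """Return the character for '&#xHH;', '&#NN;' or '&name;', else None."""
    m = re.match(r'&#x([0-9a-fA-F]+);$', entity)
    if m:
        return unichr(int(m.group(1), 16))  # hex form
    m = re.match(r'&#([0-9]+);$', entity)
    if m:
        return unichr(int(m.group(1)))      # decimal form
    m = re.match(r'&([a-zA-Z1-4]+);$', entity)
    if m and m.group(1) in names:
        return unichr(names[m.group(1)])    # named form, table-driven
    return None                             # e.g. '&bozo;' stays literal text

assert decode_entity('&#x2014;') == u'\u2014'  # 0x2014 == 8212
assert decode_entity('&#8212;') == u'\u2014'
assert decode_entity('&mdash;') == u'\u2014'
assert decode_entity('&bozo;') is None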
104 parser.py 100755 → 100644
@@ -1,95 +1,15 @@
-#!/usr/bin/env python
-"""A parser that builds an abstract syntax tree from MediaWiki syntax"""
+# -*- coding: utf8 -*-
+# get the parser
+from pijnu import makeParser
+mediawikiGrammar = file("mediawiki.pijnu").read()
+mediawikiParser = makeParser(mediawikiGrammar)
-# TODO: Probably yacc.yacc(write_tables=0) to keep from doing FS writes.
-# TODO: Return lightweight tuples or even chunks of CElementTree from parse rules. Then have a decoupled HTML writer walk them.
+# import the source in a utf-8 string for parseAllQuotes
+import codecs
+from apostropheParser import parseAllQuotes
+fileObj = codecs.open("wikitext.txt", "r", "utf-8")
+source = fileObj.read()
+#source = parseAllQuotes(source)
-# Shift/reduce conflicts happen when there are 2 or more productions that could be chosen, like "abcd: A B C D" or "abcx: A B empty C X". There, the conflict happens as C is seen.
+mediawikiParser.test(source)
-# Maybe use embedded rules (section 6.11) to keep track of apostrophe jungles?
-
-# Perhaps we can best recover from things that look like they're going to be productions but aren't by catching them in the error procedure and then just printing them out verbatim.
-
-from ply.yacc import yacc
-
-from mediawiki_parser.lexer import lexer
-
-
-class ParserBox(object):
-    def __init__(self, lexer=lexer):
-        self.tokens = lexer.tokens
-        self._lexer = lexer
-        self._parser = yacc(module=self, debug=True)
-
-    def p_inline(self, p): # Called "inline_text" in the BNF
-        """inline : inline inline_element
-                  | inline_element"""
-        if len(p) == 3: # TODO: Split into separate rules for speed.
-            p[0] = p[1] + Inline([p[2]])
-        else:
-            p[0] = Inline([p[1]])
-
-    def p_inline_element(self, p):
-        """inline_element : texts
-                          | internal_link"""
-        p[0] = p[1]
-
-    def p_internal_link(self, p):
-        """internal_link : INTERNAL_LINK texts INTERNAL_LINK_END
-                         | INTERNAL_LINK texts INTERNAL_LINK_END""" # "texts" should probably be "link-safe texts": that is, excluding "]]" (and "[["?)
-        p[0] = Link(p[2])
-
-    def p_texts(self, p): # Merge consecutive TEXT terminals.
-        """texts : texts TEXT"""
-        p[0] = p[1] + p[2]
-
-    def p_texts_from_brackets(self, p):
-        """texts : INTERNAL_LINK
-                 | INTERNAL_LINK_END
-                 | TEXT"""
-        # Failed brace matches can be treated as text.
-        p[0] = unicode(p[1])
-
-    def p_error(self, p):
-        print "yo: {%s}" % p
-
-    def parse(self, text):
-        return self._parser.parse(text, lexer=self._lexer)
-
-parser = ParserBox()
-
-
-class NodeWithAttributes(object):
-    """An abstract-syntax-tree node that has attrs and compares by them"""
-    # Will probably turn into something with a generic concept of children
-
-    def __str__(self):
-        return '%s(%s)' % (self.__class__.__name__, ', '.join('%s=%s' % (k, repr(v)) for k, v in self.__dict__.iteritems()))
-
-    __repr__ = __str__
-
-    def __eq__(self, other):
-        return self.__dict__ == other.__dict__
-
-    def __ne__(self, other):
-        return not self == other
-
-
-class Link(NodeWithAttributes):
-    def __init__(self, text):
-        self.text = text
-
-
-class Inline(list):
-    pass
-
-
-if __name__ == '__main__':
-    def repl():
-        while True:
-            try:
-                input = raw_input('parser> ')
-            except EOFError:
-                break
-            print parser.parse(input)
-    repl()
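
The deleted grammar merges consecutive TEXT terminals with the left-recursive production `texts : texts TEXT`, and one of its TODOs suggests yacc.yacc(write_tables=0) to avoid filesystem writes. A self-contained sketch of both, with a toy single-character lexer and illustrative names rather than mediawiki_parser's:

# Minimal PLY grammar in the style of the deleted ParserBox (Python 2).
import ply.lex as lex
import ply.yacc as yacc

tokens = ['TEXT']

def t_TEXT(t):
    r'.'
    return t

def t_error(t):
    t.lexer.skip(1)

def p_texts_merge(p):
    """texts : texts TEXT"""
    p[0] = p[1] + p[2]  # merge consecutive TEXT terminals

def p_texts_single(p):
    """texts : TEXT"""
    p[0] = p[1]

def p_error(p):
    print "parse error: %r" % p

lexer = lex.lex()
parser = yacc.yacc(write_tables=0)  # keep PLY from writing table files to disk
print parser.parse('hi there', lexer=lexer)  # -> 'hi there'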
23 spikes/pijnu/wikitext_test.py
@@ -1,23 +0,0 @@
-# get the parser
-from pijnu import makeParser
-mediawikiGrammar = file("mediawiki.pijnu").read()
-mediawikiParser = makeParser(mediawikiGrammar)
-
-# import the source in a utf-8 string for parseAllQuotes
-import codecs
-from apostropheParser import parseAllQuotes
-fileObj = codecs.open("wikitext.txt", "r", "utf-8")
-source = fileObj.read()
-#source = parseAllQuotes(source)
-
-mediawikiParser.test(source)
-
-
-print "\nLet's get all the external links of the article:"
-print mediawikiParser.url.findAll(source)
-
-print "\nLet's get all the internal links of the article:"
-print mediawikiParser.internalLink.findAll(source)
-
-print "\nLet's get all the templates of the article:" # Fails
-print mediawikiParser.templateName.findAll(source)
20 spikes/tree.py
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-
-from pyparsing import *
-
-
-# Spike to prove we can get a parse tree out of PyParsing:
-letters = Regex('[a-zA-Z]+').setName('letters').setDebug()
-bold_toggle = Literal("'''").setName('bold_toggle').setDebug()
-italic_toggle = Literal("''").setName('italic_toggle').setDebug()
-text_with_formatting = Forward()
-italic_span = Forward()
-bold_span = Group(bold_toggle + OneOrMore(italic_span | letters) + bold_toggle).setName('bold_span').setDebug()
-italic_span << Group(italic_toggle + OneOrMore(bold_span | letters) + italic_toggle).setName('italic_span').setDebug()
-text_with_formatting << OneOrMore(bold_span | italic_span | letters).setName('text_with_formatting').setDebug()
-text_with_formatting.verbose_stacktrace = True
-
-# 12345678901234567890123
-print text_with_formatting.parseString("'''bdasl''hide''seek'''") # This doesn't.
-
-# Next: traverse the tree. Make sure that doesn't hurt.
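
On the spike's closing note about traversing the tree: the Group()ed grammar yields nested ParseResults, which walk like nested lists. A sketch of such a traversal, assuming the spike's definitions above; walk is a made-up helper, and getName() returns a name only where setResultsName() was used, so plain groups print as a placeholder:

from pyparsing import ParseResults

def walk(node, depth=0):
    """Print the nested ParseResults as an indented outline."""
    if isinstance(node, ParseResults):
        print '  ' * depth + (node.getName() or '(group)')
        for child in node:
            walk(child, depth + 1)
    else:
        print '  ' * depth + repr(node)  # a string leaf like "'''" or 'bdasl'

walk(text_with_formatting.parseString("'''bdasl''hide''seek'''"))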
134 tests.py
@@ -1,134 +0,0 @@
-#!/usr/bin/env python
-
-import unittest
-from unittest import TestCase
-
-from ply.lex import LexToken
-
-from mediawiki_parser.lexer import lexer, LexError, Token as T
-from mediawiki_parser.parser import parser, Link, Inline
-
-
-def lexed_eq(input, want):
-    """Assert lexing `input` yields `want`."""
-    lexer.input(input)
-    got = list(lexer)
-    if want != got:
-        raise AssertionError('%s != %s' % (got, want))
-
-
-def parsed_eq(input, want):
-    """Assert parsing `input` yields `want`."""
-    got = parser.parse(input)
-    if want != got:
-        raise AssertionError('%s != %s' % (got, want))
-
-
-def html_eq(input, want):
-    """Assert the lexed, parsed, HTML-formatted input string equals `want`.
-
-    Lets differences in linebreaks slide.
-
-    """
-
-
-class LexerTests(TestCase):
-    def test_newline(self):
-        lexed_eq('\r\r\n\n\r\n', [T('NEWLINE', '\r'),
-                                  T('NEWLINE', '\r\n'),
-                                  T('NEWLINE', '\n\r'),
-                                  T('NEWLINE', '\n')])
-
-    # def test_space_tabs(self):
-    #     lexed_eq(' ', [T('SPACE_TABS', ' ')])
-    #     lexed_eq('\t', [T('SPACE_TABS', '\t')])
-
-    def test_html_entity(self):
-        lexed_eq('&#x2014;', [T('TEXT', u'\u2014')])
-        lexed_eq('&#8212;', [T('TEXT', u'\u2014')])
-        lexed_eq('&mdash;', [T('TEXT', u'\u2014')])
-        lexed_eq('&badentity;', [T('TEXT', '&badentity;')])
-
-    def test_nowiki(self):
-        lexed_eq("<nowiki>''not bold''</nowiki>", [T('TEXT', "''not bold''")])
-
-        # HTML entities inside <nowiki> should be resolved.
-        lexed_eq("<nowiki>&#8212;</nowiki>", [T('TEXT', u'\u2014')])
-
-        lexed_eq('</nowiki>', [T('TEXT', '</nowiki>')])
-
-        # Junk inside the opening tag should be tolerated:
-        lexed_eq("<nowiki dudes and froods>>''cool''</nowiki>",
-                 [T('TEXT', ">''cool''")])
-
-        # <nowiki>s aren't nestable. Uncomment when bold is implemented.
-        # lexed_eq("<nowiki><nowiki></nowiki>''hey''</nowiki>",
-        #          [T('TEXT', '<nowiki>'),
-        #           T('BOLD'),
-        #           T('TEXT', 'hey'),
-        #           T('BOLD_END'),
-        #           T('TEXT', '</nowiki>')])
-
-    def test_text(self):
-        lexed_eq('hi', [T('TEXT', 'hi')])
-
-    def test_heading(self):
-        lexed_eq('======', [T('H2'), T('TEXT', '=='), T('H2_END')])
-        lexed_eq('==', [T('TEXT', '==')]) # Headings must contain something.
-        lexed_eq('====== h6 ======', [T('H6'), T('TEXT', ' h6 '), T('H6_END')]) # maintain whitespace
-        lexed_eq('=h1= ', [T('H1'), T('TEXT', 'h1'), T('H1_END')]) # strip trailing whitespace
-        lexed_eq('=&amp;=', [T('H1'), T('TEXT', '&'), T('H1_END')]) # recognize contained lexemes
-
-    def test_hr(self):
-        lexed_eq('----one', [T('HR'), T('TEXT', 'one')])
-        lexed_eq('-------one', [T('HR'), T('TEXT', 'one')])
-        lexed_eq('one----two', [T('TEXT', 'one----two')])
-
-    def test_combination(self):
-        lexed_eq('===Llamas===\n'
-                 'Llamas are cute.\n'
-                 '----\n'
-                 'And poetic.',
-                 [T('H3'), T('TEXT', 'Llamas'), T('H3_END'), T('NEWLINE'),
-                  T('TEXT', 'Llamas are cute.'), T('NEWLINE'),
-                  T('HR'), T('NEWLINE'),
-                  T('TEXT', 'And poetic.')])
-
-
-class ParserTests(TestCase):
-    def test_text(self):
-        parsed_eq('Hi', Inline([u'Hi']))
-
-    def test_internal_link(self):
-        parsed_eq('[[Booga]]', Inline([Link('Booga')]))
-        parsed_eq('[[this [[thing]]]]', Inline([u'[[this ', Link('thing'), u']]']))
-
-    def test_inline(self):
-        """Make sure lists of inline elements parse."""
-        parsed_eq('The[[Booga]]Loo', Inline([u'The', Link('Booga'), u'Loo']))
-
-
-class IntegrationTests(TestCase):
-    """Tests of the whole stack, from lexer to HTML formatter"""
-
-    def test_h1(self):
-        html_eq('= h1 = trailer', '<p>= h1 = = there = boo</p>')
-        html_eq(' = h1 =', '<pre>= h1 =</pre>')
-        html_eq('= h1 ==', # An H1 containing a trailing equal sign
-                '<h1> <span class="mw-headline" id="h1_.3D"> h1 =</span></h1>')
-
-# Some challenging test cases:
-# <ref>[http://www.susanscott.net/Oceanwatch2002/mar1-02.html Seaweed also plays a role in the formation of sand<!-- Bot generated title -->]</ref>, from wikipedia:Sand
-# [[File:Suesswasserstachelroche.jpg|thumb|A [[stingray]] about to bury itself in sand]]
-# In MW, [[clay [[this [[thing]]]]]] links "thing". py-wikimarkup links the whole thing.
-# In endings, like [[somelink]]ending, only [a-zA-Z] should be captured as the ending: not even umlauts or numbers or entities representing ASCII letters get captured in MW.
-# [[h&#0069;llo]] is a link called "hEllo".
-# [[LimBo\n]] isn't a link, but [[LimBo|and\n]] is.
-# [[L'''i'''mB'''o|a'''''n''d]] is not a link; neither bold nor italics seems to be allowed in page names.
-# [[LimBo|a''n''d]] is a link with an italic n.
-# [[LimBo]]ohai9 uses "LimBoohai" as the linked text. (See "extra-description" on the Links page of the BNF.)
-# [[Limbo|and]]ohai9 gives "andohai" as the linked text.
-
-
-if __name__ == '__main__':
-    unittest.main()
13 spikes/pijnu/wikitext.txt → tests/test_findAll.py
@@ -1,4 +1,5 @@
-= Title 1 = this should be dropped
+# -*- coding: utf8 -*-
+source = u"""= Title 1 = this should be dropped
Normal paragraph.
This
@@ -148,3 +149,13 @@
== Others ==
---- the above line is an '''horizonal rule'''
+"""
+
+from mediawiki_parser import wikitextParser
+mediawikiParser = wikitextParser
+
+mediawikiParser.url.findAll(source)
+
+mediawikiParser.internalLink.findAll(source)
+
+mediawikiParser.templateName.findAll(source)
93 spikes/pijnu/wikitext_tests.py → tests/wikitext_tests.py
@@ -1,9 +1,7 @@
# -*- coding: utf8 -*-
-# get the parser
-from pijnu import makeParser
-mediawikiGrammar = file("mediawiki.pijnu").read()
-mediawikiParser = makeParser(mediawikiGrammar)
+from .. import wikitextParser
+mediawikiParser = wikitextParser.wikitextParser
print "\n\n== Testing titles =="
@@ -25,7 +23,8 @@
'== Title == title ==\n' : "[title2:[rawText:' Title == title ']]", # Allow =* in titles
}
-mediawikiParser.testSuite(test_suite_dict)
+def test():
+    mediawikiParser.testSuite(test_suite_dict)
print "\n\n== Testing nowiki sections =="
@@ -79,7 +78,7 @@
optionalValue:
rawText:parameter
parameter:
- parameterName:2
+ parameterName:2
optionalValue:
rawText: parameters """
source1 = """{{Template which
@@ -92,20 +91,20 @@
pageName:Template which
parameters:
parameter:
- parameterName:is
+ parameterName:is
optionalValue:
rawText: test
parameter:
- parameterName:multi
+ parameterName:multi
optionalValue:
rawText: test
parameter:
- parameterName:lines
+ parameterName:lines
optionalValue:
rawText: test"""
source2 = """A template {{Template with|1=parameter| 2 = parameters }} inside a text."""
result2 = """@inline@:
- rawText:A template
+ rawText:A template
advancedTemplate:
pageName:Template with
parameters:
@@ -114,13 +113,13 @@
optionalValue:
rawText:parameter
parameter:
- parameterName:2
+ parameterName:2
optionalValue:
- rawText: parameters
+ rawText: parameters
rawText: inside a text."""
source3 = """Formatted arguments in a template {{Template with|1='''parameter'''| 2 = ''parameters'' }}."""
result3 = """@inline@:
- rawText:Formatted arguments in a template
+ rawText:Formatted arguments in a template
advancedTemplate:
pageName:Template with
parameters:
@@ -129,27 +128,27 @@
optionalValue:
rawText:<strong>parameter</strong>
parameter:
- parameterName:2
+ parameterName:2
optionalValue:
- rawText: <em>parameters</em>
+ rawText: <em>parameters</em>
rawText:."""
source4 = """A {{Template with|{{other}} |1={{templates}}| 2 = {{nested|inside=1}} }}."""
result4 = """@inline@:
- rawText:A
+ rawText:A
advancedTemplate:
pageName:Template with
parameters:
parameter:
simpleTemplate:other
- rawText:
+ rawText:
parameter:
parameterName:1
optionalValue:
simpleTemplate:templates
parameter:
- parameterName:2
+ parameterName:2
optionalValue:
- rawText:
+ rawText:
advancedTemplate:
pageName:nested
parameters:
@@ -157,11 +156,11 @@
parameterName:inside
optionalValue:
rawText:1
- rawText:
+ rawText:
rawText:."""
source5 = """A '''template {{Template with|1=parameter| 2 = parameters }} inside formatted''' text."""
result5 = """@inline@:
- rawText:A <strong>template
+ rawText:A <strong>template
advancedTemplate:
pageName:Template with
parameters:
@@ -170,9 +169,9 @@
optionalValue:
rawText:parameter
parameter:
- parameterName:2
+ parameterName:2
optionalValue:
- rawText: parameters
+ rawText: parameters
rawText: inside formatted</strong> text.""" # Fails
sources = [source0, source1, source2, source3, source4, source5]
results = [result0, result1, result2, result3, result4, result5]
@@ -261,7 +260,7 @@
wikiTableLine:
wikiTableTitle:
@inline@:
- rawText: Table
+ rawText: Table
advancedTemplate:
pageName:title
parameters:
@@ -273,7 +272,7 @@
wikiTableLineCells:
wikiTableFirstCell:
@cleanInline@:
- rawText: cell 1
+ rawText: cell 1
<?>:
wikiTableOtherCell:
@cleanInline@:
@@ -284,7 +283,7 @@
wikiTableLineCells:
wikiTableFirstCell:
@cleanInline@:
- rawText: cell 3
+ rawText: cell 3
<?>:
wikiTableOtherCell:
@cleanInline@:
@@ -308,7 +307,7 @@
result2 = """@wikiTable@:
wikiTableBegin:
wikiTableParameters:
- CSS_text: class="wikitable"
+ CSS_text: class="wikitable"
@cleanInline@:
simpleTemplate:prettyTable
<?>:
@@ -318,10 +317,10 @@
wikiTableTitle:
<?>:
CSS_attributes:
- CSS_text: style="color:red"
+ CSS_text: style="color:red"
<?>:
@inline@:
- rawText: Table
+ rawText: Table
advancedTemplate:
pageName:title
parameters:
@@ -332,7 +331,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=col
+ CSS_text: scope=col
<?>:
@cleanInline@:
rawText: Title A
@@ -340,7 +339,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=col
+ CSS_text: scope=col
<?>:
@cleanInline@:
rawText: Title B
@@ -350,7 +349,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=row
+ CSS_text: scope=row
<?>:
@cleanInline@:
rawText: Line 1
@@ -368,7 +367,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=row
+ CSS_text: scope=row
<?>:
@cleanInline@:
rawText: Line 2
@@ -379,7 +378,7 @@
wikiTableLine:
wikiTableLineCells:
@cleanInline@:
- rawText:data
+ rawText:data
advancedTemplate:
pageName:template
parameters:
@@ -413,7 +412,7 @@
result3 = """@wikiTable@:
wikiTableBegin:
wikiTableParameters:
- CSS_text: class="wikitable"
+ CSS_text: class="wikitable"
@cleanInline@:
advancedTemplate:
pageName:prettyTable
@@ -429,10 +428,10 @@
wikiTableTitle:
<?>:
CSS_attributes:
- CSS_text: style="color:red"
+ CSS_text: style="color:red"
<?>:
@inline@:
- rawText: Table
+ rawText: Table
simpleTemplate:title
wikiTableLine:
wikiTableLineBreak:
@@ -440,7 +439,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=col
+ CSS_text: scope=col
<?>:
@cleanInline@:
rawText: First (mother)
@@ -448,14 +447,14 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=col
+ CSS_text: scope=col
<?>:
@cleanInline@:
rawText: table
@wikiTable@:
wikiTableBegin:
wikiTableParameters:
- CSS_text: class="wikitable"
+ CSS_text: class="wikitable"
@cleanInline@:
simpleTemplate:prettyTable
<?>:
@@ -467,7 +466,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=row
+ CSS_text: scope=row
<?>:
@cleanInline@:
rawText: Second (daughter) table
@@ -485,7 +484,7 @@
wikiTableLineHeader:
<?>:
CSS_attributes:
- CSS_text: scope=row
+ CSS_text: scope=row
<?>:
@cleanInline@:
rawText: in the first one
@@ -552,7 +551,7 @@
"## ''more text''\n" : "[list:[@numberSubList@:[numberListLeaf:[rawText:' <em>more text</em>']]]]",
"### ''other text''\n" : "[list:[@numberSubList@:[@numberSubList@:[numberListLeaf:[rawText:' <em>other text</em>']]]]]",
": '''more text'''\n" : "[list:[colonListLeaf:[rawText:' <strong>more text</strong>']]]",
- ":::: '''more text'''\n" : "[list:[@colonSubList@:[@colonSubList@:[@colonSubList@:[colonListLeaf:[rawText:' <strong>more text</strong>']]]]]]",
+ ":::: '''more text'''\n" : "[list:[@colonSubList@:[@colonSubList@:[@colonSubList@:[colonListLeaf:[rawText:' <strong>more text</strong>']]]]]]",
'; still more [[text]]\n' : "[list:[semiColonListLeaf:[rawText:' still more ' simpleInternalLink:'text']]]",
';; still more [[text]]\n' : "[list:[@semiColonSubList@:[semiColonListLeaf:[rawText:' still more ' simpleInternalLink:'text']]]]",
':* more complicated case\n' : "[list:[@colonSubList@:[bulletListLeaf:[rawText:' more complicated case']]]]",
@@ -635,7 +634,7 @@
result4 = """body:
paragraphs:
paragraph:
- rawText:Styled text such as <em>italic</em>, <strong>bold</strong>,
+ rawText:Styled text such as <em>italic</em>, <strong>bold</strong>,
simpleTemplate:templates
rawText: also work."""
@@ -676,7 +675,7 @@
result3 = """body:
preformattedLines:
preformattedLine:
- rawText:Styled text such as <em>italic</em>, <strong>bold</strong>,
+ rawText:Styled text such as <em>italic</em>, <strong>bold</strong>,
simpleTemplate:templates
rawText: also work."""
source4 = """<pre>
@@ -692,7 +691,7 @@
result5 = """body:
paragraphs:
paragraph:
- rawText:Normal paragraph
+ rawText:Normal paragraph
preformatted:
rawText:Preformatted one
rawText: Normal one."""
@@ -711,7 +710,7 @@
wikiTableLine:
wikiTableLineHeader:
@cleanInline@:
- rawText:
+ rawText:
preformatted:
rawText:Text"""