Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Propose a way to transform the AST into HTML and text strings

  • Loading branch information...
commit fc5d10c5c570ee100e808ae8addd90450e308734 1 parent 12780a4
Peter Potrowl authored July 06, 2011
1  config.py
... ...
@@ -0,0 +1 @@
  1
+output = 'html'
15  html.py
... ...
@@ -0,0 +1,15 @@
  1
+def render_title2(node):
  2
+    node.value = '<h2>%s</h2>\n' % node.leaf()
  3
+
  4
+def render_title6(node):
  5
+    node.value = '<h6>%s</h6>\n' % node.leaf()
  6
+
  7
+def render_raw_text(node):
  8
+    from apostrophes import parseQuotes
  9
+    node.value = "%s" % parseQuotes(node.leaf())
  10
+
  11
+def render_paragraph(node):
  12
+    node.value = '<p>%s</p>\n' % node.leaf()
  13
+
  14
+def render_body(node):
  15
+    node.value = '<body>\n%s</body>' % node.leaf()
24  mediawiki.pijnu
... ...
@@ -1,11 +1,14 @@
1 1
 wikitext
2 2
 <toolset>
3  
-def parse_all_quotes(node):
4  
-    from apostrophes import parseQuotes
5  
-    node.value = parseQuotes(node.value)
  3
+import config
  4
+
  5
+if config.output == 'html':
  6
+    from html import *
  7
+elif config.output == 'text':
  8
+    from text import *
  9
+else:
  10
+    from raw import *
6 11
 
7  
-def replace_by_space(node):
8  
-    node.value = ' '
9 12
 <definition>
10 13
 # Codes
11 14
 
@@ -80,7 +83,7 @@ def replace_by_space(node):
80 83
     titleEnd                : TITLE6_END/TITLE5_END/TITLE4_END/TITLE3_END/TITLE2_END/TITLE1_END
81 84
     escSeq                  : special_tag / escChar / titleEnd
82 85
     rawChar                 : !escSeq [\x20..\xff]
83  
-    rawText                 : rawChar+                                                              : join parse_all_quotes
  86
+    rawText                 : rawChar+                                                              : join render_raw_text
84 87
     alpha_num               : [a..zA..Z0..9]
85 88
     alpha_num_text          : alpha_num+                                                            : join
86 89
     anyChar                 : [\x20..\xff]
@@ -154,8 +157,7 @@ def replace_by_space(node):
154 157
     pre_text                : (!PRE_END anyChar)*                                                   : join
155 158
     preformatted            : PRE_BEGIN pre_text PRE_END                                            : liftValue
156 159
     # We allow any characters, without parsing them, as long as the tag is not closed
157  
-    eol_to_space            : EOL*                                                                  : replace_by_space
158  
-    nowiki_text             : (!NOWIKI_END (anyChar/eol_to_space))*                                 : join
  160
+    nowiki_text             : (!NOWIKI_END anyChar)*                                                : join
159 161
     nowiki                  : NOWIKI_BEGIN nowiki_text NOWIKI_END                                   : liftValue
160 162
 
161 163
 # Text types
@@ -172,7 +174,7 @@ def replace_by_space(node):
172 174
     special_line_begin      : SPACE/EQUAL/BULLET/HASH/COLON/DASH{4}/TABLE_BEGIN/SEMICOLON
173 175
     paragraph_line          : !special_line_begin inline EOL                                        : liftValue
174 176
     blank_paragraph         : EOL{2}                                                                : drop keep
175  
-    paragraph               : paragraph_line+                                                       : liftValue
  177
+    paragraph               : paragraph_line+                                                       : liftValue render_paragraph
176 178
     paragraphs              : (blank_paragraph/EOL/paragraph)+
177 179
 
178 180
 # Titles
@@ -181,7 +183,7 @@ def replace_by_space(node):
181 183
     title5                  : TITLE5_BEGIN inline TITLE5_END                                        : liftValue
182 184
     title4                  : TITLE4_BEGIN inline TITLE4_END                                        : liftValue
183 185
     title3                  : TITLE3_BEGIN inline TITLE3_END                                        : liftValue
184  
-    title2                  : TITLE2_BEGIN inline TITLE2_END                                        : liftValue
  186
+    title2                  : TITLE2_BEGIN inline TITLE2_END                                        : liftValue render_title2
185 187
     title1                  : TITLE1_BEGIN inline TITLE1_END                                        : liftValue
186 188
     title                   : title6 / title5 / title4 / title3 / title2 / title1
187 189
 
@@ -253,4 +255,4 @@ def replace_by_space(node):
253 255
 
254 256
 # Top pattern
255 257
 
256  
-    body                    : optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+ : liftValue
  258
+    body                    : optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+ : liftValue render_body
16  parser.py
... ...
@@ -1,15 +1,23 @@
1 1
 # -*- coding: utf8 -*-
  2
+import config
  3
+print "Output will be", config.output
  4
+
2 5
 # get the parser
3 6
 from pijnu import makeParser
4 7
 mediawikiGrammar = file("mediawiki.pijnu").read()
5 8
 mediawikiParser = makeParser(mediawikiGrammar)
6 9
 
7  
-# import the source in a utf-8 string for parseAllQuotes
  10
+# read the source file, decoding it from UTF-8
8 11
 import codecs
9  
-from apostropheParser import parseAllQuotes
  12
+from apostrophes import parseAllQuotes
10 13
 fileObj = codecs.open("wikitext.txt", "r", "utf-8")
11 14
 source = fileObj.read()
12  
-#source = parseAllQuotes(source)
13 15
 
14  
-mediawikiParser.test(source)
  16
+# The last line of the file will not be parsed correctly if
  17
+# there is no newline at the end of the file, so we add one.
  18
+if source[-1] != '\n':
  19
+  source += '\n'
  20
+
  21
+tree = mediawikiParser.parse(source)
15 22
 
  23
+print tree.leaves()
15  raw.py
... ...
@@ -0,0 +1,15 @@
  1
+def render_title2(node):
  2
+    pass
  3
+
  4
+def render_title6(node):
  5
+    pass
  6
+
  7
+def render_raw_text(node):
  8
+    from apostrophes import parseQuotes
  9
+    node.value = "%s" % parseQuotes(node.leaf())
  10
+
  11
+def render_paragraph(node):
  12
+    pass
  13
+
  14
+def render_body(node):
  15
+    pass
14  text.py
... ...
@@ -0,0 +1,14 @@
  1
+def render_title2(node):
  2
+    node.value += '\n'
  3
+
  4
+def render_title6(node):
  5
+    node.value += '\n'
  6
+
  7
+def render_raw_text(node):
  8
+    pass
  9
+
  10
+def render_paragraph(node):
  11
+    node.value += '\n'
  12
+
  13
+def render_body(node):
  14
+    pass
10  wikitext.txt
... ...
@@ -0,0 +1,10 @@
  1
+== Title ==
  2
+This is a paragraph.
  3
+
  4
+This is a [[link#Title|link test]], '''bold and ''italic'' texts'''.
  5
+
  6
+An http://www.mozilla.org URL.
  7
+
  8
+[[Image:Test.png|thumb|150px|Legend]]
  9
+
  10
+a [[Category:Tests|Text]]
35  wikitextParser.py
@@ -73,7 +73,7 @@
73 73
     titleEnd                : TITLE6_END/TITLE5_END/TITLE4_END/TITLE3_END/TITLE2_END/TITLE1_END
74 74
     escSeq                  : special_tag / escChar / titleEnd
75 75
     rawChar                 : !escSeq [\x20..\xff]
76  
-    rawText                 : rawChar+                                                              : join parse_all_quotes
  76
+    rawText                 : rawChar+                                                              : join render_raw_text
77 77
     alpha_num               : [a..zA..Z0..9]
78 78
     alpha_num_text          : alpha_num+                                                            : join
79 79
     anyChar                 : [\x20..\xff]
@@ -147,8 +147,7 @@
147 147
     pre_text                : (!PRE_END anyChar)*                                                   : join
148 148
     preformatted            : PRE_BEGIN pre_text PRE_END                                            : liftValue
149 149
     # We allow any characters, without parsing them, as long as the tag is not closed
150  
-    eol_to_space            : EOL*                                                                  : replace_by_space
151  
-    nowiki_text             : (!NOWIKI_END (anyChar/eol_to_space))*                                 : join
  150
+    nowiki_text             : (!NOWIKI_END anyChar)*                                                : join
152 151
     nowiki                  : NOWIKI_BEGIN nowiki_text NOWIKI_END                                   : liftValue
153 152
 
154 153
 # Text types
@@ -165,7 +164,7 @@
165 164
     special_line_begin      : SPACE/EQUAL/BULLET/HASH/COLON/DASH{4}/TABLE_BEGIN/SEMICOLON
166 165
     paragraph_line          : !special_line_begin inline EOL                                        : liftValue
167 166
     blank_paragraph         : EOL{2}                                                                : drop keep
168  
-    paragraph               : paragraph_line+                                                       : liftValue
  167
+    paragraph               : paragraph_line+                                                       : liftValue render_paragraph
169 168
     paragraphs              : (blank_paragraph/EOL/paragraph)+
170 169
 
171 170
 # Titles
@@ -174,7 +173,7 @@
174 173
     title5                  : TITLE5_BEGIN inline TITLE5_END                                        : liftValue
175 174
     title4                  : TITLE4_BEGIN inline TITLE4_END                                        : liftValue
176 175
     title3                  : TITLE3_BEGIN inline TITLE3_END                                        : liftValue
177  
-    title2                  : TITLE2_BEGIN inline TITLE2_END                                        : liftValue
  176
+    title2                  : TITLE2_BEGIN inline TITLE2_END                                        : liftValue render_title2
178 177
     title1                  : TITLE1_BEGIN inline TITLE1_END                                        : liftValue
179 178
     title                   : title6 / title5 / title4 / title3 / title2 / title1
180 179
 
@@ -246,7 +245,7 @@
246 245
 
247 246
 # Top pattern
248 247
 
249  
-    body                    : optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+ : liftValue
  248
+    body                    : optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+ : liftValue render_body
250 249
 
251 250
 """
252 251
 
@@ -263,12 +262,15 @@
263 262
 
264 263
 
265 264
 ###   <toolset>
266  
-def parse_all_quotes(node):
267  
-    from apostrophes import parseQuotes
268  
-    node.value = parseQuotes(node.value)
  265
+import config
  266
+
  267
+if config.output == 'html':
  268
+    from html import *
  269
+elif config.output == 'text':
  270
+    from text import *
  271
+else:
  272
+    from raw import *
269 273
 
270  
-def replace_by_space(node):
271  
-    node.value = ' '
272 274
 
273 275
 ###   <definition>
274 276
 # recursive pattern(s)
@@ -353,7 +355,7 @@ def replace_by_space(node):
353 355
 titleEnd = Choice([TITLE6_END, TITLE5_END, TITLE4_END, TITLE3_END, TITLE2_END, TITLE1_END], expression='TITLE6_END/TITLE5_END/TITLE4_END/TITLE3_END/TITLE2_END/TITLE1_END', name='titleEnd')
354 356
 escSeq = Choice([special_tag, escChar, titleEnd], expression='special_tag / escChar / titleEnd', name='escSeq')
355 357
 rawChar = Sequence([NextNot(escSeq, expression='!escSeq'), Klass(u' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', expression='[\\x20..\\xff]')], expression='!escSeq [\\x20..\\xff]', name='rawChar')
356  
-rawText = Repetition(rawChar, numMin=1, numMax=False, expression='rawChar+', name='rawText')(join, parse_all_quotes)
  358
+rawText = Repetition(rawChar, numMin=1, numMax=False, expression='rawChar+', name='rawText')(join, render_raw_text)
357 359
 alpha_num = Klass(u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', expression='[a..zA..Z0..9]', name='alpha_num')
358 360
 alpha_num_text = Repetition(alpha_num, numMin=1, numMax=False, expression='alpha_num+', name='alpha_num_text')(join)
359 361
 anyChar = Klass(u' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', expression='[\\x20..\\xff]', name='anyChar')
@@ -427,8 +429,7 @@ def replace_by_space(node):
427 429
 pre_text = Repetition(Sequence([NextNot(PRE_END, expression='!PRE_END'), anyChar], expression='!PRE_END anyChar'), numMin=False, numMax=False, expression='(!PRE_END anyChar)*', name='pre_text')(join)
428 430
 preformatted = Sequence([PRE_BEGIN, pre_text, PRE_END], expression='PRE_BEGIN pre_text PRE_END', name='preformatted')(liftValue)
429 431
     # We allow any characters, without parsing them, as long as the tag is not closed
430  
-eol_to_space = Repetition(EOL, numMin=False, numMax=False, expression='EOL*', name='eol_to_space')(replace_by_space)
431  
-nowiki_text = Repetition(Sequence([NextNot(NOWIKI_END, expression='!NOWIKI_END'), Choice([anyChar, eol_to_space], expression='anyChar/eol_to_space')], expression='!NOWIKI_END (anyChar/eol_to_space)'), numMin=False, numMax=False, expression='(!NOWIKI_END (anyChar/eol_to_space))*', name='nowiki_text')(join)
  432
+nowiki_text = Repetition(Sequence([NextNot(NOWIKI_END, expression='!NOWIKI_END'), anyChar], expression='!NOWIKI_END anyChar'), numMin=False, numMax=False, expression='(!NOWIKI_END anyChar)*', name='nowiki_text')(join)
432 433
 nowiki = Sequence([NOWIKI_BEGIN, nowiki_text, NOWIKI_END], expression='NOWIKI_BEGIN nowiki_text NOWIKI_END', name='nowiki')(liftValue)
433 434
 
434 435
 # Text types
@@ -445,7 +446,7 @@ def replace_by_space(node):
445 446
 special_line_begin = Choice([SPACE, EQUAL, BULLET, HASH, COLON, Repetition(DASH, numMin=4, numMax=4, expression='DASH{4}'), TABLE_BEGIN, SEMICOLON], expression='SPACE/EQUAL/BULLET/HASH/COLON/DASH{4}/TABLE_BEGIN/SEMICOLON', name='special_line_begin')
446 447
 paragraph_line = Sequence([NextNot(special_line_begin, expression='!special_line_begin'), inline, EOL], expression='!special_line_begin inline EOL', name='paragraph_line')(liftValue)
447 448
 blank_paragraph = Repetition(EOL, numMin=2, numMax=2, expression='EOL{2}', name='blank_paragraph')(drop, keep)
448  
-paragraph = Repetition(paragraph_line, numMin=1, numMax=False, expression='paragraph_line+', name='paragraph')(liftValue)
  449
+paragraph = Repetition(paragraph_line, numMin=1, numMax=False, expression='paragraph_line+', name='paragraph')(liftValue, render_paragraph)
449 450
 paragraphs = Repetition(Choice([blank_paragraph, EOL, paragraph], expression='blank_paragraph/EOL/paragraph'), numMin=1, numMax=False, expression='(blank_paragraph/EOL/paragraph)+', name='paragraphs')
450 451
 
451 452
 # Titles
@@ -454,7 +455,7 @@ def replace_by_space(node):
454 455
 title5 = Sequence([TITLE5_BEGIN, inline, TITLE5_END], expression='TITLE5_BEGIN inline TITLE5_END', name='title5')(liftValue)
455 456
 title4 = Sequence([TITLE4_BEGIN, inline, TITLE4_END], expression='TITLE4_BEGIN inline TITLE4_END', name='title4')(liftValue)
456 457
 title3 = Sequence([TITLE3_BEGIN, inline, TITLE3_END], expression='TITLE3_BEGIN inline TITLE3_END', name='title3')(liftValue)
457  
-title2 = Sequence([TITLE2_BEGIN, inline, TITLE2_END], expression='TITLE2_BEGIN inline TITLE2_END', name='title2')(liftValue)
  458
+title2 = Sequence([TITLE2_BEGIN, inline, TITLE2_END], expression='TITLE2_BEGIN inline TITLE2_END', name='title2')(liftValue, render_title2)
458 459
 title1 = Sequence([TITLE1_BEGIN, inline, TITLE1_END], expression='TITLE1_BEGIN inline TITLE1_END', name='title1')(liftValue)
459 460
 title = Choice([title6, title5, title4, title3, title2, title1], expression='title6 / title5 / title4 / title3 / title2 / title1', name='title')
460 461
 
@@ -526,7 +527,7 @@ def replace_by_space(node):
526 527
 
527 528
 # Top pattern
528 529
 
529  
-body = Sequence([optional_comment, Repetition(Choice([list, horizontal_rule, preformattedGroup, title, wikiTable, EOL, paragraphs, invalid_line, EOL], expression='list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL'), numMin=1, numMax=False, expression='(list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+')], expression='optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+', name='body')(liftValue)
  530
+body = Sequence([optional_comment, Repetition(Choice([list, horizontal_rule, preformattedGroup, title, wikiTable, EOL, paragraphs, invalid_line, EOL], expression='list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL'), numMin=1, numMax=False, expression='(list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+')], expression='optional_comment (list / horizontal_rule / preformattedGroup / title / wikiTable / EOL / paragraphs / invalid_line / EOL)+', name='body')(liftValue, render_body)
530 531
 
531 532
 
532 533
 

0 notes on commit fc5d10c

Please sign in to comment.
Something went wrong with that request. Please try again.