-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
/
readers.py
254 lines (197 loc) · 7.75 KB
/
readers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import six
import os
import re
try:
import docutils
import docutils.core
import docutils.io
from docutils.writers.html4css1 import HTMLTranslator
# import the directives to have pygments support
from pelican import rstdirectives # NOQA
except ImportError:
core = False
try:
from markdown import Markdown
except ImportError:
Markdown = False # NOQA
try:
from asciidocapi import AsciiDocAPI
asciidoc = True
except ImportError:
asciidoc = False
import re
from pelican.contents import Category, Tag, Author
from pelican.utils import get_date, pelican_open
# Maps a lowercase metadata name to a callable taking ``(value, settings)``
# and returning the processed value. Names without an entry here are returned
# unchanged by ``Reader.process_metadata``.
_METADATA_PROCESSORS = {
    # Comma-separated list -> one Tag object per item.
    'tags': lambda value, settings: [
        Tag(tag, settings) for tag in value.split(',')
    ],
    # The settings argument is accepted but not needed for these two.
    'date': lambda value, settings: get_date(value),
    'status': lambda value, settings: value.strip(),
    # These classes are called directly as ``cls(value, settings)``.
    'category': Category,
    'author': Author,
}
class Reader(object):
    """Base class of every source-file reader.

    Class attributes:
        enabled: ``read_file`` refuses to use a reader whose ``enabled``
            flag is false (it raises ``ValueError``).
        extensions: optional per-reader extension list; ``read_file``
            overwrites it from the ``<FMT>_EXTENSIONS`` setting when present.
    """

    enabled = True
    extensions = None

    def __init__(self, settings):
        """Keep a reference to the settings mapping for later lookups."""
        self.settings = settings

    def process_metadata(self, name, value):
        """Return ``value`` converted by the processor registered for ``name``.

        When ``_METADATA_PROCESSORS`` has no entry for ``name`` the value is
        handed back untouched.
        """
        processor = _METADATA_PROCESSORS.get(name)
        if processor is None:
            return value
        return processor(value, self.settings)
class _FieldBodyTranslator(HTMLTranslator):
    """HTML translator used to render the body of one docinfo field.

    The ``field_body`` node itself emits no markup; only its children do.
    """

    def __init__(self, document):
        HTMLTranslator.__init__(self, document)
        # NOTE(review): presumably disables the base translator's paragraph
        # compaction -- confirm against the docutils HTML writer.
        self.compact_p = None

    def visit_field_body(self, node):
        """Emit nothing when entering the field body."""

    def depart_field_body(self, node):
        """Emit nothing when leaving the field body."""

    def astext(self):
        """Return the accumulated body fragments joined into one string."""
        return "".join(self.body)
def render_node_to_html(document, node):
    """Walk ``node`` with a ``_FieldBodyTranslator`` and return its HTML text.

    ``document`` is only used to construct the translator.
    """
    translator = _FieldBodyTranslator(document)
    node.walkabout(translator)
    return translator.astext()
class PelicanHTMLTranslator(HTMLTranslator):
    """HTML translator that writes abbreviation nodes as ``<abbr>`` tags."""

    def visit_abbreviation(self, node):
        """Open ``<abbr>``; the node's explanation, if any, becomes ``title``."""
        if node.hasattr('explanation'):
            extra = {'title': node['explanation']}
        else:
            extra = {}
        opening = self.starttag(node, 'abbr', '', **extra)
        self.body.append(opening)

    def depart_abbreviation(self, node):
        """Close the ``<abbr>`` tag."""
        self.body.append('</abbr>')
class RstReader(Reader):
    """Reader for reStructuredText sources, rendered through docutils."""

    enabled = bool(docutils)
    file_extensions = ['rst']

    def _parse_metadata(self, document):
        """Return the dict containing document metadata.

        Every child of each ``docinfo`` node becomes one entry. Keys are
        lower-cased and values go through ``process_metadata``. The
        ``summary`` field is rendered to HTML; all other fields are taken as
        plain text.
        """
        output = {}
        for docinfo in document.traverse(docutils.nodes.docinfo):
            for element in docinfo.children:
                if element.tagname == 'field':  # custom fields (e.g. summary)
                    name_elem, body_elem = element.children
                    # Lower-case *before* testing for 'summary' so that
                    # ``:Summary:`` is rendered to HTML just like
                    # ``:summary:`` (MarkdownReader matches the name
                    # case-insensitively as well).
                    name = name_elem.astext().lower()
                    if name == 'summary':
                        value = render_node_to_html(document, body_elem)
                    else:
                        value = body_elem.astext()
                else:  # standard fields (e.g. address)
                    name = element.tagname.lower()
                    value = element.astext()

                output[name] = self.process_metadata(name, value)
        return output

    def _get_publisher(self, filename):
        """Publish ``filename`` with docutils and return the publisher.

        The HTML writer uses ``PelicanHTMLTranslator`` and is configured
        with ``initial_header_level`` set to ``'2'``.
        """
        extra_params = {'initial_header_level': '2'}
        pub = docutils.core.Publisher(
            destination_class=docutils.io.StringOutput)
        pub.set_components('standalone', 'restructuredtext', 'html')
        pub.writer.translator_class = PelicanHTMLTranslator
        pub.process_programmatic_settings(None, extra_params, None)
        pub.set_source(source_path=filename)
        pub.publish()
        return pub

    def read(self, filename):
        """Parse a reStructuredText file and return ``(content, metadata)``.

        ``content`` is the ``body`` part produced by the writer. When the
        docinfo carries no ``title`` entry, the writer's ``title`` part is
        used instead.
        """
        pub = self._get_publisher(filename)
        parts = pub.writer.parts
        content = parts.get('body')

        metadata = self._parse_metadata(pub.document)
        metadata.setdefault('title', parts.get('title'))

        return content, metadata
class MarkdownReader(Reader):
    """Reader for Markdown sources."""

    enabled = bool(Markdown)
    file_extensions = ['md', 'markdown', 'mkd']
    extensions = ['codehilite', 'extra']

    def _new_markdown(self):
        """Return a Markdown converter with this reader's extensions + meta."""
        # ``list()`` lets a tuple from the settings override be concatenated
        # with the mandatory 'meta' extension.
        return Markdown(extensions=set(list(self.extensions) + ['meta']))

    def _parse_metadata(self, meta):
        """Return the dict containing document metadata.

        ``meta`` maps each name to a list of lines. The ``summary`` entry is
        joined with newlines and rendered through Markdown; for every other
        entry only the first line is kept. Keys are lower-cased and values go
        through ``process_metadata``.
        """
        output = {}
        for name, value in meta.items():
            name = name.lower()
            if name == "summary":
                # ``six.text_type`` rather than ``str``: with
                # ``unicode_literals`` on Python 2, ``str()`` of a non-ASCII
                # unicode line raises UnicodeEncodeError.
                summary_values = "\n".join(
                    six.text_type(item) for item in value)
                # Build the extra converter only when a summary exists.
                summary = self._new_markdown().convert(summary_values)
                output[name] = self.process_metadata(name, summary)
            else:
                output[name] = self.process_metadata(name, value[0])
        return output

    def read(self, filename):
        """Parse content and metadata of markdown files.

        Returns ``(content, metadata)`` where ``content`` is the converted
        HTML and ``metadata`` comes from the converter's ``Meta`` attribute.
        """
        text = pelican_open(filename)
        md = self._new_markdown()
        content = md.convert(text)

        metadata = self._parse_metadata(md.Meta)
        return content, metadata
class HtmlReader(Reader):
    """Reader for (x)HTML sources.

    Metadata is taken from comments of the form ``<!--# name: value -->``.
    """

    file_extensions = ['html', 'htm']
    # Raw string so the backslashes reach ``re`` untouched. The original
    # ``\:s?`` (an optional literal "s") was a typo for ``\:\s?``.
    _re = re.compile(
        r'\<\!\-\-\#\s?[A-z0-9_-]*\s?\:\s?[A-z0-9\s_-]*\s?\-\-\>')

    def read(self, filename):
        """Parse content and metadata of (x)HTML files.

        Returns ``(content, metadata)``: ``content`` is the text of the file
        and ``metadata`` starts with ``{'title': 'unnamed'}`` before the
        metadata comments found in the text are applied.
        """
        # Read the text first: the regex must be applied to a string, and the
        # caller needs the content itself rather than a closed file object.
        # pelican_open is used the same way by the other readers in this file.
        content = pelican_open(filename)
        metadata = {'title': 'unnamed'}
        for comment in self._re.findall(content):
            # The pattern allows exactly one ':' per match.
            key, _, value = comment.partition(':')
            name = key[5:].strip().lower()   # drop the leading '<!--#'
            value = value[:-3].strip()       # drop the trailing '-->'
            metadata[name] = self.process_metadata(name, value)

        return content, metadata
class AsciiDocReader(Reader):
    """Reader for AsciiDoc sources, rendered through ``AsciiDocAPI``."""

    enabled = bool(asciidoc)
    file_extensions = ['asc']
    default_options = ["--no-header-footer", "-a newline=\\n"]

    def read(self, filename):
        """Parse content and metadata of asciidoc files.

        Extra command-line style options come from the ``ASCIIDOC_OPTIONS``
        setting, given either as a list or as one comma-separated string;
        they are appended to ``default_options``. Returns
        ``(content, metadata)``; metadata keys are the lower-cased document
        attributes, and ``doctitle`` is copied to ``title`` when present.
        """
        # cStringIO only exists on Python 2; fall back to io on Python 3.
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO
        text = StringIO(pelican_open(filename))
        content = StringIO()
        ad = AsciiDocAPI()

        options = self.settings.get('ASCIIDOC_OPTIONS', [])
        # ``six.string_types`` is (str, unicode) on Python 2 and (str,) on
        # Python 3, where the bare name ``unicode`` does not exist.
        if isinstance(options, six.string_types):
            options = [m.strip() for m in options.split(',')]
        # ``list()`` so a tuple from the settings can be concatenated too.
        options = self.default_options + list(options)
        for o in options:
            # Each option is "flag" or "flag value"; split it into arguments.
            ad.options(*o.split())

        ad.execute(text, content, backend="html4")
        content = content.getvalue()

        metadata = {}
        for name, value in ad.asciidoc.document.attributes.items():
            name = name.lower()
            metadata[name] = self.process_metadata(name, value)
        if 'doctitle' in metadata:
            metadata['title'] = metadata['doctitle']
        return content, metadata
# Registry mapping a file extension (without the leading dot) to the reader
# class that handles it. It is built once at import time from the direct
# subclasses of Reader; when two readers declare the same extension, the one
# visited later replaces the earlier entry.
_EXTENSIONS = {}
for cls in Reader.__subclasses__():
    for ext in cls.file_extensions:
        _EXTENSIONS[ext] = cls
def read_file(filename, fmt=None, settings=None):
    """Read ``filename`` and return its ``(content, metadata)`` pair.

    Parameters:
        filename: path of the source file.
        fmt: key into ``_EXTENSIONS``; defaults to the file's extension
            without the leading dot.
        settings: optional mapping. Keys read here are ``<FMT>_EXTENSIONS``
            (overrides the reader's ``extensions``), ``TYPOGRIFY`` (filter
            content and title through typogrify) and ``FILENAME_METADATA``
            (regex with named groups matched against the base name).

    Raises:
        TypeError: no reader is registered for ``fmt``.
        ValueError: the reader's dependencies are missing.
    """
    base, ext = os.path.splitext(os.path.basename(filename))
    if not fmt:
        fmt = ext[1:]

    if fmt not in _EXTENSIONS:
        raise TypeError('Pelican does not know how to parse %s' % filename)

    reader = _EXTENSIONS[fmt](settings)
    settings_key = '%s_EXTENSIONS' % fmt.upper()

    if settings and settings_key in settings:
        reader.extensions = settings[settings_key]

    if not reader.enabled:
        raise ValueError("Missing dependencies for %s" % fmt)

    content, metadata = reader.read(filename)

    # eventually filter the content with typogrify if asked so
    if settings and settings.get('TYPOGRIFY'):
        from typogrify.filters import typogrify
        content = typogrify(content)
        # Not every reader guarantees a title (it can be absent or None),
        # so only filter one that actually exists.
        if metadata.get('title'):
            metadata['title'] = typogrify(metadata['title'])

    filename_metadata = settings and settings.get('FILENAME_METADATA')
    if filename_metadata:
        match = re.match(filename_metadata, base)
        if match:
            for k, v in match.groupdict().items():
                # Metadata keys are lowercase, so normalise before checking
                # for an existing entry; otherwise a group named ``Date``
                # would overwrite the ``date`` read from the file itself.
                k = k.lower()
                # Groups that did not take part in the match yield None;
                # there is nothing to process for them.
                if v is not None and k not in metadata:
                    metadata[k] = reader.process_metadata(k, v)

    return content, metadata