/
html5lib.py
53 lines (41 loc) · 1.68 KB
/
html5lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import absolute_import
from django.utils.encoding import smart_unicode
from django.core.exceptions import ImproperlyConfigured
from compressor.exceptions import ParserError
from compressor.parser import ParserBase
from compressor.utils.decorators import cached_property
class Html5LibParser(ParserBase):
def _serialize(self, elem):
fragment = self.html5lib.treebuilders.simpletree.DocumentFragment()
fragment.appendChild(elem)
return self.html5lib.serialize(fragment,
quote_attr_values=True, omit_optional_tags=False)
def _find(self, *names):
for node in self.html.childNodes:
if node.type == 5 and node.name in names:
yield node
@cached_property
def html(self):
try:
import html5lib
self.html5lib = html5lib
return html5lib.parseFragment(self.content)
except ImportError, err:
raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
except Exception, err:
raise ParserError("Error while initializing Parser: %s" % err)
def css_elems(self):
return self._find('style', 'link')
def js_elems(self):
return self._find('script')
def elem_attribs(self, elem):
return elem.attributes
def elem_content(self, elem):
return elem.childNodes[0].value
def elem_name(self, elem):
return elem.name
def elem_str(self, elem):
# This method serializes HTML in a way that does not pass all tests.
# However, this method is only called in tests anyway, so it doesn't
# really matter.
return smart_unicode(self._serialize(elem))