-
Notifications
You must be signed in to change notification settings - Fork 12
/
html_decode.py
44 lines (38 loc) · 1.29 KB
/
html_decode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from htmlentitydefs import name2codepoint as n2cp
import re
def decode_htmlentities(string):
"""
Decode HTML entities–hex, decimal, or named–in a string
@see http://snippets.dzone.com/posts/show/4569
>>> u = u'E tu vivrai nel terrore - L'aldilà (1981)'
>>> print decode_htmlentities(u).encode('UTF-8')
E tu vivrai nel terrore - L'aldilà (1981)
>>> print decode_htmlentities("l'eau")
l'eau
>>> print decode_htmlentities("foo < bar")
foo < bar
"""
def substitute_entity(match):
ent = match.group(3)
if match.group(1) == "#":
# decoding by number
if match.group(2) == '':
# number is in decimal
return unichr(int(ent))
elif match.group(2) == 'x':
# number is in hex
return unichr(int('0x'+ent, 16))
else:
# they were using a name
cp = n2cp.get(ent)
if cp: return unichr(cp)
else: return match.group()
entity_re = re.compile(r'&(#?)(x?)(\w+);')
return entity_re.subn(substitute_entity, string)[0]
def _test():
import doctest, html_decode
return doctest.testmod(html_decode)
if __name__ == "__main__":
_test()