Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Fixes so that unit tests run under Python 3.1

Note, however, that while there is a Python 3 version of html5lib,
it appears to be unmaintained, so the worth of all this is
questionable.

References:
  http://code.google.com/p/html5lib/issues/detail?id=144
  http://code.google.com/p/html5lib/source/browse/#hg%2Fpython3
  • Loading branch information...
commit 4dc02d16f1a71048a1267851e3751d67a7ac2a14 1 parent 49a49e6
@dairiki authored
Showing with 18 additions and 8 deletions.
  1. +18 −8 src/lxml/html/html5parser.py
View
26 src/lxml/html/html5parser.py
@@ -2,8 +2,6 @@
An interface to html5lib that mimics the lxml.html interface.
"""
-import urllib
-
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
@@ -15,7 +13,14 @@
_strings = basestring
except NameError:
_strings = (bytes, str)
-
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+try:
+ from urlparse import urlparse
+except ImportError:
+ from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
@@ -104,11 +109,11 @@ def fragment_fromstring(html, create_parent=False,
no_leading_text=not accept_leading_text)
if create_parent:
- if not isinstance(create_parent, basestring):
+ if not isinstance(create_parent, _strings):
create_parent = 'div'
new_root = Element(create_parent)
if elements:
- if isinstance(elements[0], basestring):
+ if isinstance(elements[0], _strings):
new_root.text = elements[0]
del elements[0]
new_root.extend(elements)
@@ -174,11 +179,16 @@ def parse(filename_url_or_file, guess_charset=True, parser=None):
"""
if parser is None:
parser = html_parser
- if isinstance(filename_url_or_file, basestring):
- fp = urllib.urlopen(filename_url_or_file)
- else:
+ if not isinstance(filename_url_or_file, _strings):
fp = filename_url_or_file
+ elif _looks_like_url(filename_url_or_file):
+ fp = urlopen(filename_url_or_file)
+ else:
+ fp = open(filename_url_or_file, 'rb')
return parser.parse(fp, useChardet=guess_charset)
+def _looks_like_url(str):
+ scheme = urlparse(str)[0]
+ return scheme != ''
html_parser = HTMLParser()
Please sign in to comment.
Something went wrong with that request. Please try again.