Skip to content
Permalink
Browse files
Fix Python 3 bug with lxml and not-really-strings
Signed-off-by: Chris Warrick <kwpolska@gmail.com>
  • Loading branch information
Kwpolska committed Nov 10, 2016
1 parent 97b068b commit be697bd16ef44023ffd5df311420dc6668a7bbe4
Showing with 8 additions and 2 deletions.
  1. +8 −2 v7/import_page/import_page.py
@@ -34,6 +34,7 @@
libextract = None
import lxml.html
import requests
+import sys

from nikola.plugin_categories import Command
from nikola import utils
@@ -70,8 +71,13 @@ def _import_page(self, url):
if 199 < r.status_code < 300: # Got it
# Use the page's title
doc = lxml.html.fromstring(r.content)
-title = doc.find('*//title').text_content().decode('utf-8')
-slug = utils.slugify(title)
+title = doc.find('*//title').text
+if sys.version_info[0] == 2 and isinstance(title, str):
+title = title.decode('utf-8')
+try:
+slug = utils.slugify(title, lang='')
+except TypeError:
+slug = utils.slugify(title)
nodes = list(libextract.api.extract(r.content))
# Let's assume the node with more text is the good one
lengths = [len(n.text_content()) for n in nodes]

0 comments on commit be697bd

Please sign in to comment.