Skip to content
Permalink
Browse files

Fix #1842 -- convert to utf-8 already

Signed-off-by: Chris Warrick <kwpolska@gmail.com>
  • Loading branch information
Kwpolska committed Jun 21, 2015
1 parent 0ed7211 commit fd96b57987454dddfa1f0e4a4298c99739a5fdc3
Showing with 17 additions and 19 deletions.
  1. +1 −0 CHANGES.txt
  2. +16 −19 nikola/plugins/task/sitemap/__init__.py
@@ -19,6 +19,7 @@ Features
Bugfixes
--------

* Don’t crash on non-UTF-8 files during sitemap generation (Issue #1842)
* Unnecessary rebuilds of yearly archives (Issue #1833)
* Quietly ignore non-existent files in ``nikola check -l`` (Issue #1831)
* Don’t rebuild all tag or category pages when changing tag/category descriptions
@@ -164,31 +164,28 @@ def scan_locs():
if not robot_fetch(path):
continue

filehead = io.open(real_path, 'r', encoding='utf8').read(1024)
# read in binary mode to make ancient files work
fh = open(real_path, 'rb')
filehead = fh.read(1024)
fh.close()

if path.endswith('.html') or path.endswith('.htm'):
try:

""" ignores "html" files without doctype """
if u'<!doctype html' not in filehead.lower():
continue

""" ignores "html" files with noindex robot directives """
robots_directives = [u'<meta content="noindex" name="robots"',
u'<meta content="none" name="robots"',
u'<meta name="robots" content="noindex"',
u'<meta name="robots" content="none"']
if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
continue
""" ignores "html" files without doctype """
if b'<!doctype html' not in filehead.lower():
continue

except UnicodeDecodeError:
# ignore ancient files
# most non-utf8 files are worthless anyways
""" ignores "html" files with noindex robot directives """
robots_directives = [b'<meta content="noindex" name="robots"',
b'<meta content="none" name="robots"',
b'<meta name="robots" content="noindex"',
b'<meta name="robots" content="none"']
if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
continue

""" put Atom and RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
# put Atom and RSS in sitemapindex[] instead of in urlset[],
# sitemap_path is included after it is generated
if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
known_elm_roots = (u'<feed', u'<rss', u'<urlset')
known_elm_roots = (b'<feed', b'<rss', b'<urlset')
if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
path = path.replace(os.sep, '/')
lastmod = self.get_lastmod(real_path)

0 comments on commit fd96b57

Please sign in to comment.
You can’t perform that action at this time.