Exclude pages with HTML meta robots exclusion directives from sitemaps
da2x committed May 14, 2015
1 parent e89101f commit 3cf503948bf98513f6cb2ffe56cd51feb29a65ea
Showing 2 changed files with 18 additions and 5 deletions.
@@ -10,6 +10,7 @@ Features

* Exclude pages with `<meta content="noindex" name="robots">` from sitemaps
* new_post paths are now relative to CWD (Issue #1325)

New in v7.4.1
@@ -163,21 +163,33 @@ def scan_locs():
if not robot_fetch(path):

filehead = io.open(real_path, 'r', encoding='utf8').read(1024)

if path.endswith('.html') or path.endswith('.htm'):
if u'<!doctype html' not in io.open(real_path, 'r', encoding='utf8').read(1024).lower():
# ignores "html" files without doctype
# alexa-verify, google-site-verification, etc.

""" ignores "html" files without doctype """
if u'<!doctype html' not in filehead.lower():

""" ignores "html" files with noindex robot directives """
robots_directives = [u'<meta content="noindex" name="robots"',
u'<meta content="none" name="robots"',
u'<meta name="robots" content="noindex"',
u'<meta name="robots" content="none"']
if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):

except UnicodeDecodeError:
# ignore ancient files
# most non-utf8 files are worthless anyways

""" put Atom and RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
known_elm_roots = (u'<feed', u'<rss', u'<urlset')
filehead = io.open(real_path, 'r', encoding='utf8').read(512)
if any([elm_root in filehead for elm_root in known_elm_roots]) and path != sitemap_path:
if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
path = path.replace(os.sep, '/')
lastmod = self.get_lastmod(real_path)
loc = urljoin(base_url, base_path + path)

