Skip to content
Permalink
Browse files

Simplistic remote URL checker

  • Loading branch information
ralsina committed May 6, 2015
1 parent 9d3e79a commit 2103aa4c48aa5686475e9793f8356ce0b84e1d90
Showing with 25 additions and 10 deletions.
  1. +25 −10 nikola/plugins/command/check.py
@@ -35,6 +35,7 @@
from urllib.parse import unquote, urlparse, urljoin, urldefrag # NOQA

import lxml.html
import requests

from nikola.plugin_categories import Command
from nikola.utils import get_logger
@@ -137,6 +138,14 @@ class CommandCheck(Command):
'default': False,
'help': 'Be more verbose.',
},
{
'name': 'remote',
'long': 'remote',
'short': 'r',
'type': bool,
'default': False,
'help': 'Check that remote links work.',
},
]

def _execute(self, options, args):
@@ -152,7 +161,7 @@ def _execute(self, options, args):
else:
self.logger.level = 4
if options['links']:
failure = self.scan_links(options['find_sources'])
failure = self.scan_links(options['find_sources'], options['remote'])
if options['files']:
failure = self.scan_files()
if options['clean']:
@@ -162,7 +171,7 @@ def _execute(self, options, args):

existing_targets = set([])

def analyze(self, task, find_sources=False):
def analyze(self, task, find_sources=False, check_remote=False):
rv = False
self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
base_url = urlparse(self.site.config['BASE_URL'])
@@ -193,13 +202,19 @@ def analyze(self, task, find_sources=False):
if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))

# Absolute links when using only paths, skip.
if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'):
continue

# Absolute links to other domains, skip
if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc:
continue
# Absolute links to other domains, or any absolute link in path-only URL modes: skip unless remote checking is enabled.
if ((parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc) or \
((parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path')):
if not check_remote or parsed.scheme not in ["http", "https"]:
continue
if parsed.netloc == base_url.netloc:
continue
# Check the remote link works
resp = requests.head(target)
if resp.status_code > 399: # Error
self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code))
continue

if url_type == 'rel_path':
target_filename = os.path.abspath(
@@ -234,7 +249,7 @@ def analyze(self, task, find_sources=False):
self.logger.error("Error with: {0} {1}".format(filename, exc))
return rv

def scan_links(self, find_sources=False):
def scan_links(self, find_sources=False, check_remote=False):
self.logger.info("Checking Links:")
self.logger.info("===============\n")
self.logger.notice("{0} mode".format(self.site.config['URL_TYPE']))
@@ -246,7 +261,7 @@ def scan_links(self, find_sources=False):
'render_galleries', 'render_indexes',
'render_pages', 'render_posts',
'render_site') and '.html' in task:
if self.analyze(task, find_sources):
if self.analyze(task, find_sources, check_remote):
failure = True
if not failure:
self.logger.info("All links checked.")

0 comments on commit 2103aa4

Please sign in to comment.
You can’t perform that action at this time.