|
@@ -35,6 +35,7 @@ |
|
|
from urllib.parse import unquote, urlparse, urljoin, urldefrag # NOQA |
|
|
|
|
|
import lxml.html |
|
|
import requests |
|
|
|
|
|
from nikola.plugin_categories import Command |
|
|
from nikola.utils import get_logger |
|
@@ -137,6 +138,14 @@ class CommandCheck(Command): |
|
|
'default': False, |
|
|
'help': 'Be more verbose.', |
|
|
}, |
|
|
{ |
|
|
'name': 'remote', |
|
|
'long': 'remote', |
|
|
'short': 'r', |
|
|
'type': bool, |
|
|
'default': False, |
|
|
'help': 'Check that remote links work.', |
|
|
}, |
|
|
] |
|
|
|
|
|
def _execute(self, options, args): |
|
@@ -152,7 +161,7 @@ def _execute(self, options, args): |
|
|
else: |
|
|
self.logger.level = 4 |
|
|
if options['links']: |
|
|
failure = self.scan_links(options['find_sources']) |
|
|
failure = self.scan_links(options['find_sources'], options['remote']) |
|
|
if options['files']: |
|
|
failure = self.scan_files() |
|
|
if options['clean']: |
|
@@ -162,7 +171,7 @@ def _execute(self, options, args): |
|
|
|
|
|
existing_targets = set([]) |
|
|
|
|
|
def analyze(self, task, find_sources=False): |
|
|
def analyze(self, task, find_sources=False, check_remote=False): |
|
|
rv = False |
|
|
self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] |
|
|
base_url = urlparse(self.site.config['BASE_URL']) |
|
@@ -193,13 +202,19 @@ def analyze(self, task, find_sources=False): |
|
|
if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http": |
|
|
self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target)) |
|
|
|
|
|
# Absolute links when using only paths, skip. |
|
|
if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'): |
|
|
continue |
|
|
|
|
|
# Absolute links to other domains, skip |
|
|
if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc: |
|
|
continue |
|
|
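# Absolute links (to other domains, or any absolute link in path-only URL modes) are not checked locally; with check_remote, off-site http(s) links get a HEAD request instead. |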
|
|
if ((parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc) or \ |
|
|
((parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path')): |
|
|
if not check_remote or parsed.scheme not in ["http", "https"]: |
|
|
continue |
|
|
if parsed.netloc == base_url.netloc: |
|
|
continue |
|
|
# Check the remote link works |
|
|
resp = requests.head(target) |
|
|
if resp.status_code > 399: # Error |
|
|
self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code)) |
|
|
continue |
|
|
|
|
|
if url_type == 'rel_path': |
|
|
target_filename = os.path.abspath( |
|
@@ -234,7 +249,7 @@ def analyze(self, task, find_sources=False): |
|
|
self.logger.error("Error with: {0} {1}".format(filename, exc)) |
|
|
return rv |
|
|
|
|
|
def scan_links(self, find_sources=False): |
|
|
def scan_links(self, find_sources=False, check_remote=False): |
|
|
self.logger.info("Checking Links:") |
|
|
self.logger.info("===============\n") |
|
|
self.logger.notice("{0} mode".format(self.site.config['URL_TYPE'])) |
|
@@ -246,7 +261,7 @@ def scan_links(self, find_sources=False): |
|
|
'render_galleries', 'render_indexes', |
|
|
'render_pages', 'render_posts', |
|
|
'render_site') and '.html' in task: |
|
|
if self.analyze(task, find_sources): |
|
|
if self.analyze(task, find_sources, check_remote): |
|
|
failure = True |
|
|
if not failure: |
|
|
self.logger.info("All links checked.") |
|
|