|
@@ -25,6 +25,7 @@
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 from __future__ import print_function
+from collections import defaultdict
 import os
 import re
 import sys
@@ -34,6 +35,7 @@
 except ImportError:
     from urllib.parse import unquote, urlparse, urljoin, urldefrag  # NOQA
 
+from doit.loader import generate_tasks
 import lxml.html
 try:
     import requests
@@ -44,33 +46,28 @@
 from nikola.utils import get_logger, req_missing
 
 
-def _call_nikola_list(l, site, arguments):
-    class NotReallyAStream(object):
-        """A massive hack."""
-        out = []
+def _call_nikola_list(site):
+    files = []
+    deps = defaultdict(list)
+    for task in generate_tasks('render_site', site.gen_tasks('render_site', "Task", '')):
+        files.extend(task.targets)
+        for target in task.targets:
+            deps[target].extend(task.file_dep)
+    for task in generate_tasks('post_render', site.gen_tasks('render_site', "LateTask", '')):
+        files.extend(task.targets)
+        for target in task.targets:
+            deps[target].extend(task.file_dep)
+    return files, deps
 
-        def write(self, t):
-            self.out.append(t)
 
-    oldstream = l.outstream
-    newstream = NotReallyAStream()
-    try:
-        l.outstream = newstream
-        l.parse_execute(arguments)
-        return newstream.out
-    finally:
-        l.outstream = oldstream
-
-
-def real_scan_files(l, site):
+def real_scan_files(site):
     task_fnames = set([])
     real_fnames = set([])
     output_folder = site.config['OUTPUT_FOLDER']
     # First check that all targets are generated in the right places
-    for task in _call_nikola_list(l, site, ["--all"]):
-        task = task.strip()
-        if output_folder in task and ':' in task:
-            fname = task.split(':', 1)[-1]
+    for fname in _call_nikola_list(site)[0]:
+        fname = fname.strip()
+        if fname.startswith(output_folder):
             task_fnames.add(fname)
     # And now check that there are no non-target files
     for root, dirs, files in os.walk(output_folder, followlinks=True):
@@ -154,7 +151,6 @@ class CommandCheck(Command):
     def _execute(self, options, args):
         """Check the generated site."""
         self.logger = get_logger('check', self.site.loghandlers)
-        self.l = self._doitargs['cmds'].get_plugin('list')(config=self.config, **self._doitargs)
 
         if not options['links'] and not options['files'] and not options['clean']:
             print(self.help())
@@ -175,21 +171,25 @@ def _execute(self, options, args):
     existing_targets = set([])
     checked_remote_targets = {}
 
-    def analyze(self, task, find_sources=False, check_remote=False):
+    def analyze(self, fname, find_sources=False, check_remote=False):
         rv = False
         self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
         base_url = urlparse(self.site.config['BASE_URL'])
         self.existing_targets.add(self.site.config['SITE_URL'])
         self.existing_targets.add(self.site.config['BASE_URL'])
         url_type = self.site.config['URL_TYPE']
 
+        deps = {}
+        if find_sources:
+            deps = _call_nikola_list(self.site)[1]
+
         if check_remote and requests is None:
             req_missing(['requests'], 'check remote links')
 
         if url_type in ('absolute', 'full_path'):
             url_netloc_to_root = urlparse(self.site.config['BASE_URL']).path
         try:
-            filename = task.split(":")[-1]
+            filename = fname
 
             if filename.startswith(self.site.config['CACHE_FOLDER']):
                 # Do not look at links in the cache, which are not parsed by
@@ -262,7 +262,7 @@ def analyze(self, task, find_sources=False, check_remote=False):
                         self.logger.warn("Broken link in {0}: {1}".format(filename, target))
                         if find_sources:
                             self.logger.warn("Possible sources:")
-                            self.logger.warn("\n".join(_call_nikola_list(self.l, self.site, ["--deps", task])))
+                            self.logger.warn("\n".join(deps[filename]))
                             self.logger.warn("===============================\n")
         except Exception as exc:
             self.logger.error("Error with: {0} {1}".format(filename, exc))
@@ -273,14 +273,11 @@ def scan_links(self, find_sources=False, check_remote=False):
         self.logger.info("===============\n")
         self.logger.notice("{0} mode".format(self.site.config['URL_TYPE']))
         failure = False
-        for task in _call_nikola_list(self.l, self.site, ["--all"]):
-            task = task.strip()
-            if task.split(':')[0] in (
-                    'render_tags', 'render_archive',
-                    'render_galleries', 'render_indexes',
-                    'render_pages', 'render_posts',
-                    'render_site') and '.html' in task:
-                if self.analyze(task, find_sources, check_remote):
+        # Maybe we should just examine all HTML files
+        output_folder = self.site.config['OUTPUT_FOLDER']
+        for fname in _call_nikola_list(self.site)[0]:
+            if fname.startswith(output_folder) and '.html' == fname[-5:]:
+                if self.analyze(fname, find_sources, check_remote):
                     failure = True
         if not failure:
             self.logger.info("All links checked.")
@@ -290,7 +287,7 @@ def scan_files(self):
         failure = False
         self.logger.info("Checking Files:")
         self.logger.info("===============\n")
-        only_on_output, only_on_input = real_scan_files(self.l, self.site)
+        only_on_output, only_on_input = real_scan_files(self.site)
 
         # Ignore folders
         only_on_output = [p for p in only_on_output if not os.path.isdir(p)]
@@ -312,7 +309,7 @@ def scan_files(self):
         return failure
 
     def clean_files(self):
-        only_on_output, _ = real_scan_files(self.l, self.site)
+        only_on_output, _ = real_scan_files(self.site)
         for f in only_on_output:
             os.unlink(f)
         return True
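
A minimal usage sketch of the doit API that the new _call_nikola_list(site) helper builds on (not taken from the patch itself; the task dict, target, and dependency names below are made up for illustration, while doit.loader.generate_tasks and the Task attributes .targets and .file_dep are the ones the patch uses):

    from collections import defaultdict
    from doit.loader import generate_tasks

    # Hypothetical task dict standing in for what site.gen_tasks() would produce.
    task_dict = {
        'actions': [],
        'targets': ['output/index.html'],
        'file_dep': ['posts/welcome.rst'],
    }

    files = []
    deps = defaultdict(list)
    # generate_tasks() turns the dict into doit Task objects; collect every
    # target and map each target to the files it depends on, as the helper does.
    for task in generate_tasks('render_site', task_dict):
        files.extend(task.targets)
        for target in task.targets:
            deps[target].extend(task.file_dep)

    print(files)                      # ['output/index.html']
    print(deps['output/index.html'])  # ['posts/welcome.rst']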