getnikola · ralsina · May 22, 2015 · May 8, 2015 · May 8, 2015 · May 9, 2015
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -10,6 +10,7 @@ Features
 Bugfixes
 --------
 
+* Scanning of posts refactored out of core (Issue #1700)
 * Handle strange URLs, like ed2k:// (Issue #1695)
 * Fix very old metadata format support (Issue #1689)
 

diff --git a/docs/internals.txt b/docs/internals.txt
@@ -95,17 +95,14 @@ posts are added into RSS feeds and stories are not. All of them are in a list ca
 "the timeline" formed by objects of class ``Post``.
 
 When you are creating a task that needs the list of posts and/or stories (for example,
-the RSS creation plugin), your plugin should call ``self.site.scan_posts()`` to ensure
-the timeline is created and available in ``self.site.timeline``. You should not modify
-the timeline, because it will cause consistency issues.
+the RSS creation plugin) at task execution time, your plugin should declare a dependency
+on the ``scan_posts`` task to ensure the timeline is created and available in
+``self.site.timeline``. You should not modify the timeline, because doing so will cause consistency issues.
 
 .. sidebar:: scan_posts
 
-   The scan_posts function is what reads your site and creates the timeline.
-
-   I am considering moving scan_posts off the core and into its own plugin
-   so it can be replaced (for example, by a version that reads a database
-   instead of scanning a folder tree).
+   The ``Nikola.scan_posts`` function can be used in plugins to force the
+   timeline creation, for example, while creating the tasks.
 
 Your plugin can use the timeline to generate "stuff" (technical term). For example,
 Nikola comes with plugins that use the timeline to create a website (surprised?).

diff --git a/nikola/nikola.py b/nikola/nikola.py
@@ -1352,129 +1352,14 @@ def flatten(task):
             'task_dep': task_dep
         }
 
-    def scan_posts(self, really=False, ignore_quit=False, quiet=False):
+    def scan_posts(self, really=False, ignore_quit=False, quiet=False):
         """Scan all the posts."""
-        if self._scanned and not really:
+        # FIXME: temporary shim while scanning moves into PostScanner plugins.
+        # ignore_quit/quiet are kept for old callers but not forwarded to scan().
+        if self._scanned and not really:
             return
-
-        self.global_data = {}
-        self.posts = []
-        self.all_posts = []
-        self.posts_per_year = defaultdict(list)
-        self.posts_per_month = defaultdict(list)
-        self.posts_per_tag = defaultdict(list)
-        self.posts_per_category = defaultdict(list)
-        self.post_per_file = {}
-        self.timeline = []
-        self.pages = []
-
-        seen = set([])
-        if not self.quiet and not quiet:
-            print("Scanning posts", end='', file=sys.stderr)
-        slugged_tags = set([])
-        quit = False
-        for wildcard, destination, template_name, use_in_feeds in \
-                self.config['post_pages']:
-            if not self.quiet and not quiet:
-                print(".", end='', file=sys.stderr)
-            dirname = os.path.dirname(wildcard)
-            for dirpath, _, _ in os.walk(dirname, followlinks=True):
-                dest_dir = os.path.normpath(os.path.join(destination,
-                                            os.path.relpath(dirpath, dirname)))  # output/destination/foo/
-                # Get all the untranslated paths
-                dir_glob = os.path.join(dirpath, os.path.basename(wildcard))  # posts/foo/*.rst
-                untranslated = glob.glob(dir_glob)
-                # And now get all the translated paths
-                translated = set([])
-                for lang in self.config['TRANSLATIONS'].keys():
-                    if lang == self.config['DEFAULT_LANG']:
-                        continue
-                    lang_glob = utils.get_translation_candidate(self.config, dir_glob, lang)  # posts/foo/*.LANG.rst
-                    translated = translated.union(set(glob.glob(lang_glob)))
-                # untranslated globs like *.rst often match translated paths too, so remove them
-                # and ensure x.rst is not in the translated set
-                untranslated = set(untranslated) - translated
-
-                # also remove from translated paths that are translations of
-                # paths in untranslated_list, so x.es.rst is not in the untranslated set
-                for p in untranslated:
-                    translated = translated - set([utils.get_translation_candidate(self.config, p, l) for l in self.config['TRANSLATIONS'].keys()])
-
-                full_list = list(translated) + list(untranslated)
-                # We eliminate from the list the files inside any .ipynb folder
-                full_list = [p for p in full_list
-                             if not any([x.startswith('.')
-                                         for x in p.split(os.sep)])]
-
-                for base_path in full_list:
-                    if base_path in seen:
-                        continue
-                    else:
-                        seen.add(base_path)
-                    post = Post(
-                        base_path,
-                        self.config,
-                        dest_dir,
-                        use_in_feeds,
-                        self.MESSAGES,
-                        template_name,
-                        self.get_compiler(base_path)
-                    )
-                    self.timeline.append(post)
-                    self.global_data[post.source_path] = post
-                    if post.use_in_feeds:
-                        self.posts.append(post)
-                        self.posts_per_year[
-                            str(post.date.year)].append(post)
-                        self.posts_per_month[
-                            '{0}/{1:02d}'.format(post.date.year, post.date.month)].append(post)
-                        for tag in post.alltags:
-                            _tag_slugified = utils.slugify(tag)
-                            if _tag_slugified in slugged_tags:
-                                if tag not in self.posts_per_tag:
-                                    # Tags that differ only in case
-                                    other_tag = [existing for existing in self.posts_per_tag.keys() if utils.slugify(existing) == _tag_slugified][0]
-                                    utils.LOGGER.error('You have tags that are too similar: {0} and {1}'.format(tag, other_tag))
-                                    utils.LOGGER.error('Tag {0} is used in: {1}'.format(tag, post.source_path))
-                                    utils.LOGGER.error('Tag {0} is used in: {1}'.format(other_tag, ', '.join([p.source_path for p in self.posts_per_tag[other_tag]])))
-                                    quit = True
-                            else:
-                                slugged_tags.add(utils.slugify(tag, force=True))
-                            self.posts_per_tag[tag].append(post)
-                        self.posts_per_category[post.meta('category')].append(post)
-
-                    if post.is_post:
-                        # unpublished posts
-                        self.all_posts.append(post)
-                    else:
-                        self.pages.append(post)
-
-                    for lang in self.config['TRANSLATIONS'].keys():
-                        self.post_per_file[post.destination_path(lang=lang)] = post
-                        self.post_per_file[post.destination_path(lang=lang, extension=post.source_ext())] = post
-
-        # Sort everything.
-        self.timeline.sort(key=lambda p: p.date)
-        self.timeline.reverse()
-        self.posts.sort(key=lambda p: p.date)
-        self.posts.reverse()
-        self.all_posts.sort(key=lambda p: p.date)
-        self.all_posts.reverse()
-        self.pages.sort(key=lambda p: p.date)
-        self.pages.reverse()
-
-        for i, p in enumerate(self.posts[1:]):
-            p.next_post = self.posts[i]
-        for i, p in enumerate(self.posts[:-1]):
-            p.prev_post = self.posts[i + 1]
-        self._scanned = True
-        if not self.quiet and not quiet:
-            print("done!", file=sys.stderr)
-
-        signal('scanned').send(self)
-
-        if quit and not ignore_quit:
-            sys.exit(1)
+        for p in self.plugin_manager.getPluginsOfCategory('PostScanner'):
+            p.plugin_object.scan()
 
     def generic_page_renderer(self, lang, post, filters):
         """Render post fragments to final HTML pages."""

diff --git a/nikola/plugin_categories.py b/nikola/plugin_categories.py
@@ -79,6 +79,13 @@ def inject_dependency(self, target, dependency):
         """Add 'dependency' to the target task's task_deps"""
         self.site.injected_deps[target].append(dependency)
 
+class PostScanner(BasePlugin):
+    """Plugins that load posts; Nikola.scan_posts calls scan() on each one."""
+
+    def scan(self):
+        """Load posts into the timeline; subclasses must override this."""
+        raise NotImplementedError()
+
 
 class Command(BasePlugin, DoitCommand):
     """These plugins are exposed via the command line.

diff --git a/nikola/plugins/task/scan_posts.plugin b/nikola/plugins/task/scan_posts.plugin
@@ -0,0 +1,10 @@
+[Core]
+Name = scan_posts
+Module = scan_posts
+
+[Documentation]
+Author = Roberto Alsina
+Version = 1.0
+Website = http://getnikola.com
+Description = Scan posts and create timeline
+
diff --git a/nikola/plugins/task/scan_posts.py b/nikola/plugins/task/scan_posts.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2012-2015 Roberto Alsina and others.
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from __future__ import unicode_literals, print_function
+from collections import defaultdict
+import glob
+import os
+import sys
+
+from blinker import signal
+
+from nikola.plugin_categories import PostScanner
+from nikola import utils
+from nikola.post import Post
+
+
+class ScanPosts(PostScanner):
+    """Scan the folders listed in ``post_pages`` and build the site timeline."""
+
+    name = "scan_posts"
+
+    def scan(self):
+        """Create Post objects and populate the site's post collections.
+
+        Resets and refills ``timeline``, ``posts``, ``all_posts``, ``pages``
+        and the per-year/month/tag/category indexes on ``self.site``, sorts
+        them newest first, links previous/next posts, marks the site as
+        scanned and sends the ``scanned`` signal with the site as sender.
+
+        Exits with status 1 after scanning if too-similar tags were reported.
+        This is a plain method, not a generator: scan_posts calls it directly.
+        """
+        self.site.global_data = {}
+        self.site.posts = []
+        self.site.all_posts = []
+        self.site.posts_per_year = defaultdict(list)
+        self.site.posts_per_month = defaultdict(list)
+        self.site.posts_per_tag = defaultdict(list)
+        self.site.posts_per_category = defaultdict(list)
+        self.site.post_per_file = {}
+        self.site.timeline = []
+        self.site.pages = []
+
+        seen = set([])
+        if not self.site.quiet:
+            print("Scanning posts", end='', file=sys.stderr)
+
+        slugged_tags = set([])
+        similar_tags_found = False
+        for wildcard, destination, template_name, use_in_feeds in \
+                self.site.config['post_pages']:
+            if not self.site.quiet:
+                print(".", end='', file=sys.stderr)
+            dirname = os.path.dirname(wildcard)
+            for dirpath, _, _ in os.walk(dirname, followlinks=True):
+                dest_dir = os.path.normpath(os.path.join(destination,
+                                            os.path.relpath(dirpath, dirname)))  # output/destination/foo/
+                # Get all the untranslated paths
+                dir_glob = os.path.join(dirpath, os.path.basename(wildcard))  # posts/foo/*.rst
+                untranslated = glob.glob(dir_glob)
+                # And now get all the translated paths
+                translated = set([])
+                for lang in self.site.config['TRANSLATIONS'].keys():
+                    if lang == self.site.config['DEFAULT_LANG']:
+                        continue
+                    lang_glob = utils.get_translation_candidate(self.site.config, dir_glob, lang)  # posts/foo/*.LANG.rst
+                    translated = translated.union(set(glob.glob(lang_glob)))
+                # untranslated globs like *.rst often match translated paths too, so remove them
+                # and ensure x.rst is not in the translated set
+                untranslated = set(untranslated) - translated
+
+                # also remove from translated paths that are translations of
+                # paths in untranslated_list, so x.es.rst is not in the untranslated set
+                for p in untranslated:
+                    translated = translated - set([utils.get_translation_candidate(self.site.config, p, l) for l in self.site.config['TRANSLATIONS'].keys()])
+
+                full_list = list(translated) + list(untranslated)
+                # We eliminate from the list the files inside any .ipynb folder
+                full_list = [p for p in full_list
+                             if not any([x.startswith('.')
+                                         for x in p.split(os.sep)])]
+
+                for base_path in full_list:
+                    if base_path in seen:
+                        continue
+                    else:
+                        seen.add(base_path)
+                    post = Post(
+                        base_path,
+                        self.site.config,
+                        dest_dir,
+                        use_in_feeds,
+                        self.site.MESSAGES,
+                        template_name,
+                        self.site.get_compiler(base_path)
+                    )
+                    self.site.timeline.append(post)
+                    self.site.global_data[post.source_path] = post
+                    if post.use_in_feeds:
+                        self.site.posts.append(post)
+                        self.site.posts_per_year[
+                            str(post.date.year)].append(post)
+                        self.site.posts_per_month[
+                            '{0}/{1:02d}'.format(post.date.year, post.date.month)].append(post)
+                        for tag in post.alltags:
+                            _tag_slugified = utils.slugify(tag)
+                            if _tag_slugified in slugged_tags:
+                                if tag not in self.site.posts_per_tag:
+                                    # Tags that differ only in case
+                                    other_tag = [existing for existing in self.site.posts_per_tag.keys() if utils.slugify(existing) == _tag_slugified][0]
+                                    utils.LOGGER.error('You have tags that are too similar: {0} and {1}'.format(tag, other_tag))
+                                    utils.LOGGER.error('Tag {0} is used in: {1}'.format(tag, post.source_path))
+                                    utils.LOGGER.error('Tag {0} is used in: {1}'.format(other_tag, ', '.join([p.source_path for p in self.site.posts_per_tag[other_tag]])))
+                                    similar_tags_found = True
+                            else:
+                                slugged_tags.add(utils.slugify(tag, force=True))
+                            self.site.posts_per_tag[tag].append(post)
+                        self.site.posts_per_category[post.meta('category')].append(post)
+
+                    if post.is_post:
+                        # unpublished posts
+                        self.site.all_posts.append(post)
+                    else:
+                        self.site.pages.append(post)
+
+                    for lang in self.site.config['TRANSLATIONS'].keys():
+                        self.site.post_per_file[post.destination_path(lang=lang)] = post
+                        self.site.post_per_file[post.destination_path(lang=lang, extension=post.source_ext())] = post
+
+        # Sort everything.
+        self.site.timeline.sort(key=lambda p: p.date)
+        self.site.timeline.reverse()
+        self.site.posts.sort(key=lambda p: p.date)
+        self.site.posts.reverse()
+        self.site.all_posts.sort(key=lambda p: p.date)
+        self.site.all_posts.reverse()
+        self.site.pages.sort(key=lambda p: p.date)
+        self.site.pages.reverse()
+
+        for i, p in enumerate(self.site.posts[1:]):
+            p.next_post = self.site.posts[i]
+        for i, p in enumerate(self.site.posts[:-1]):
+            p.prev_post = self.site.posts[i + 1]
+        self.site._scanned = True
+        if not self.site.quiet:
+            print("done!", file=sys.stderr)
+
+        # The sender is the site, as it was when this code lived in Nikola.scan_posts.
+        signal('scanned').send(self.site)
+
+        if similar_tags_found:
+            sys.exit(1)