diff --git a/CHANGES.txt b/CHANGES.txt index af8ea602ac..bee2f42125 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,13 +4,14 @@ New in master Features -------- +* Add support for ``MetadataExtractor`` plugins that allow custom, + extensible metadata extraction from posts (Issue #2830) +* Support YAML and TOML metadata in 2-file posts (via Issue #2830) +* Renamed ``UNSLUGIFY_TITLES`` → ``FILE_METADATA_UNSLUGIFY_TITLES`` (Issue #2840) * Add ``NIKOLA_SHOW_TRACEBACKS`` environment variable that shows full tracebacks instead of one-line summaries * Use ``PRETTY_URLS`` by default on all sites (Issue #1838) * Feed link generation is completely refactored (Issue #2844) -* Added ``extract_metadata`` and ``split_metadata`` to the - ``utils`` module, which are used by the metadata extraction - facilities in the ``post`` module. Bugfixes -------- diff --git a/docs/extending.txt b/docs/extending.txt index 192f820a9a..e0e92039dc 100644 --- a/docs/extending.txt +++ b/docs/extending.txt @@ -383,8 +383,31 @@ If the compiler produces something other than HTML files, it should also impleme returns the preferred extension for the output file. These plugins can also be used to extract metadata from a file. To do so, the -plugin may implement ``read_metadata`` that will return a dict containing the -metadata contained in the file. +plugin must set ``supports_metadata`` to ``True`` and implement ``read_metadata`` that will return a dict containing the +metadata contained in the file. Optionally, it may list ``metadata_conditions`` (see `MetadataExtractor Plugins`_ below) + +MetadataExtractor Plugins +------------------------- + +Plugins that extract metadata from posts. If they are based on post content, +they must implement ``_extract_metadata_from_text`` (takes source of a post +returns a dict of metadata). They may also implement +``split_metadata_from_text``, ``extract_text``. If they are based on filenames, +they only need ``extract_filename``. 
If ``supports_write`` is set to True, +``write_metadata`` must be implemented. + +Every extractor must be configured properly. The ``name``, ``source`` (from the +``MetaSource`` enum in ``metadata_extractors``) and ``priority`` +(``MetaPriority``) fields are mandatory. There might also be a list of +``conditions`` (tuples of ``MetaCondition, arg``), used to check if an +extractor can provide metadata, a compiled regular expression used to split +metadata (``split_metadata_re``, may be ``None``, used by default +``split_metadata_from_text``), a list of ``requirements`` (3-tuples: import +name, pip name, friendly name), ``map_from`` (name of ``METADATA_MAPPING`` to +use, if any) and ``supports_write`` (whether the extractor supports writing +metadata in the desired format). + +For more details, see the definition in ``plugin_categories.py`` and default extractors in ``metadata_extractors.py``. RestExtension Plugins --------------------- diff --git a/docs/manual.txt b/docs/manual.txt index 91a13bcc78..110f2da661 100644 --- a/docs/manual.txt +++ b/docs/manual.txt @@ -408,7 +408,7 @@ Current Nikola versions experimentally supports other metadata formats that make other static site generators. The currently supported metadata formats are: * reST-style comments (``.. name: value`` — default format) -* Two-file format (reST-style comments or 7-line) +* Two-file format (reST-style, YAML, TOML) * Jupyter Notebook metadata * YAML, between ``---`` (Jekyll, Hugo) * TOML, between ``+++`` (Hugo) @@ -421,7 +421,7 @@ You can add arbitrary meta fields in any format. When you create new posts, by default the metadata will be created as reST style comments. 
If you prefer a different format, you can set the ``METADATA_FORMAT`` to one of these values: -* ``"Nikola"``: reST comments wrapped in a comment if needed (default) +* ``"Nikola"``: reST comments, wrapped in a HTML comment if needed (default) * ``"YAML"``: YAML wrapped in "---" * ``"TOML"``: TOML wrapped in "+++" * ``"Pelican"``: Native markdown metadata or reST docinfo fields. Nikola style for other formats. @@ -448,6 +448,8 @@ Meta information can also be specified in separate ``.meta`` files. Those suppor .. slug: how-to-make-money .. date: 2012-09-15 19:52:05 UTC +You can also use YAML or TOML metadata inside those (with the appropriate markers). + Jupyter Notebook metadata ````````````````````````` diff --git a/nikola/conf.py.in b/nikola/conf.py.in index d52944bf33..d273f5e798 100644 --- a/nikola/conf.py.in +++ b/nikola/conf.py.in @@ -206,7 +206,7 @@ COMPILERS = ${COMPILERS} # ONE_FILE_POSTS = True # Preferred metadata format for new posts -# "Nikola": reST comments wrapped in a comment if needed (default) +# "Nikola": reST comments, wrapped in a HTML comment if needed (default) # "YAML": YAML wrapped in "---" # "TOML": TOML wrapped in "+++" # "Pelican": Native markdown metadata or reST docinfo fields. Nikola style for other formats. @@ -1149,6 +1149,9 @@ MARKDOWN_EXTENSIONS = ['markdown.extensions.fenced_code', 'markdown.extensions.c # (Note the '.*\/' in the beginning -- matches source paths relative to conf.py) # FILE_METADATA_REGEXP = None +# Should titles fetched from file metadata be unslugified (made prettier?) +# FILE_METADATA_UNSLUGIFY_TITLES = True + # If enabled, extract metadata from docinfo fields in reST documents # USE_REST_DOCINFO_METADATA = False @@ -1166,10 +1169,6 @@ MARKDOWN_EXTENSIONS = ['markdown.extensions.fenced_code', 'markdown.extensions.c # } # Other examples: https://getnikola.com/handbook.html#mapping-metadata-from-other-formats -# If you hate "Filenames with Capital Letters and Spaces.md", you should -# set this to true. 
-UNSLUGIFY_TITLES = True - # Additional metadata that is added to a post when creating a new_post # ADDITIONAL_METADATA = {} diff --git a/nikola/metadata_extractors.py b/nikola/metadata_extractors.py new file mode 100644 index 0000000000..8bd3e9bd8e --- /dev/null +++ b/nikola/metadata_extractors.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2012-2017 Chris Warrick, Roberto Alsina and others. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Default metadata extractors and helper functions.""" + +import re +import natsort + +from enum import Enum +from nikola.plugin_categories import MetadataExtractor +from nikola.utils import unslugify + +__all__ = ('MetaCondition', 'MetaPriority', 'MetaSource', 'check_conditions') +_default_extractors = [] +DEFAULT_EXTRACTOR_NAME = 'nikola' +DEFAULT_EXTRACTOR = None + + +class MetaCondition(Enum): + """Conditions for extracting metadata.""" + + config_bool = 1 + config_present = 2 + extension = 3 + compiler = 4 + first_line = 5 + never = -1 + + +class MetaPriority(Enum): + """Priority of metadata. + + An extractor is used if and only if the higher-priority extractors returned nothing. + """ + + override = 1 + specialized = 2 + normal = 3 + fallback = 4 + + +class MetaSource(Enum): + """Source of metadata.""" + + text = 1 + filename = 2 + + +def check_conditions(post, filename: str, conditions: list, config: dict, source_text: str) -> bool: + """Check the conditions for a metadata extractor.""" + for ct, arg in conditions: + if any(( + ct == MetaCondition.config_bool and not config.get(arg, False), + ct == MetaCondition.config_present and arg not in config, + ct == MetaCondition.extension and not filename.endswith(arg), + ct == MetaCondition.compiler and post.compiler.name != arg, + ct == MetaCondition.never + )): + return False + elif ct == MetaCondition.first_line: + if not source_text or not source_text.startswith(arg + '\n'): + return False + return True + + +def classify_extractor(extractor: MetadataExtractor, metadata_extractors_by: dict): + """Classify an extractor and add it to the metadata_extractors_by dict.""" + global DEFAULT_EXTRACTOR + if extractor.name == DEFAULT_EXTRACTOR_NAME: + DEFAULT_EXTRACTOR = extractor + metadata_extractors_by['priority'][extractor.priority].append(extractor) + metadata_extractors_by['source'][extractor.source].append(extractor) + metadata_extractors_by['name'][extractor.name] = extractor + 
metadata_extractors_by['all'].append(extractor) + + +def load_defaults(site: 'nikola.nikola.Nikola', metadata_extractors_by: dict): + """Load default metadata extractors.""" + for extractor in _default_extractors: + extractor.site = site + classify_extractor(extractor, metadata_extractors_by) + + +def is_extractor(extractor) -> bool: + """Check if a given class is an extractor.""" + return isinstance(extractor, MetadataExtractor) + + +def default_metadata_extractors_by() -> dict: + """Return the default metadata_extractors_by dictionary.""" + d = { + 'priority': {}, + 'source': {}, + 'name': {}, + 'all': [] + } + + for i in MetaPriority: + d['priority'][i] = [] + for i in MetaSource: + d['source'][i] = [] + + return d + + +def _register_default(extractor: MetadataExtractor) -> MetadataExtractor: + """Register a default extractor.""" + _default_extractors.append(extractor()) + return extractor + + +@_register_default +class NikolaMetadata(MetadataExtractor): + """Extractor for Nikola-style metadata.""" + + name = 'nikola' + source = MetaSource.text + priority = MetaPriority.normal + supports_write = True + split_metadata_re = re.compile('\n\n') + nikola_re = re.compile(r'^\s*\.\. (.*?): (.*)') + + def _extract_metadata_from_text(self, source_text: str) -> dict: + """Extract metadata from text.""" + outdict = {} + for line in source_text.split('\n'): + match = self.nikola_re.match(line) + if match: + outdict[match.group(1)] = match.group(2) + return outdict + + def write_metadata(self, metadata: dict, comment_wrap=False) -> str: + """Write metadata in this extractor’s format.""" + metadata = metadata.copy() + order = ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type') + f = '.. {0}: {1}' + meta = [] + for k in order: + try: + meta.append(f.format(k, metadata.pop(k))) + except KeyError: + pass + # Leftover metadata (user-specified/non-default). 
+ for k in natsort.natsorted(list(metadata.keys()), alg=natsort.ns.F | natsort.ns.IC): + meta.append(f.format(k, metadata[k])) + data = '\n'.join(meta) + if comment_wrap is True: + comment_wrap = ('') + if comment_wrap: + return '\n'.join((comment_wrap[0], data, comment_wrap[1], '', '')) + else: + return data + '\n\n' + + +@_register_default +class YAMLMetadata(MetadataExtractor): + """Extractor for YAML metadata.""" + + name = 'yaml' + source = MetaSource.text + conditions = ((MetaCondition.first_line, '---'),) + requirements = [('yaml', 'PyYAML', 'YAML')] + supports_write = True + split_metadata_re = re.compile('\n---\n') + map_from = 'yaml' + priority = MetaPriority.specialized + + def _extract_metadata_from_text(self, source_text: str) -> dict: + """Extract metadata from text.""" + import yaml + meta = yaml.safe_load(source_text[4:]) + # We expect empty metadata to be '', not None + for k in meta: + if meta[k] is None: + meta[k] = '' + return meta + + def write_metadata(self, metadata: dict, comment_wrap=False) -> str: + """Write metadata in this extractor’s format.""" + import yaml + return '\n'.join(('---', yaml.safe_dump(metadata, default_flow_style=False).strip(), '---', '')) + + +@_register_default +class TOMLMetadata(MetadataExtractor): + """Extractor for TOML metadata.""" + + name = 'toml' + source = MetaSource.text + conditions = ((MetaCondition.first_line, '+++'),) + requirements = [('toml', 'toml', 'TOML')] + supports_write = True + split_metadata_re = re.compile('\n\\+\\+\\+\n') + map_from = 'toml' + priority = MetaPriority.specialized + + def _extract_metadata_from_text(self, source_text: str) -> dict: + """Extract metadata from text.""" + import toml + return toml.loads(source_text[4:]) + + def write_metadata(self, metadata: dict, comment_wrap=False) -> str: + """Write metadata in this extractor’s format.""" + import toml + return '\n'.join(('+++', toml.dumps(metadata).strip(), '+++', '')) + + +@_register_default +class 
FilenameRegexMetadata(MetadataExtractor): + """Extractor for filename metadata.""" + + name = 'filename_regex' + source = MetaSource.filename + priority = MetaPriority.fallback + conditions = [(MetaCondition.config_bool, 'FILE_METADATA_REGEXP')] + + def extract_filename(self, filename: str, lang: str) -> dict: + """Try to read the metadata from the filename based on the given re. + + This requires to use symbolic group names in the pattern. + The part to read the metadata from the filename based on a regular + expression is taken from Pelican - pelican/readers.py + """ + match = re.match(self.site.config['FILE_METADATA_REGEXP'], filename) + meta = {} + + if match: + for key, value in match.groupdict().items(): + k = key.lower().strip() # metadata must be lowercase + if k == 'title' and self.site.config['FILE_METADATA_UNSLUGIFY_TITLES']: + meta[k] = unslugify(value, lang, discard_numbers=False) + else: + meta[k] = value + + return meta diff --git a/nikola/nikola.py b/nikola/nikola.py index a3e4b3c78b..b630f935c1 100644 --- a/nikola/nikola.py +++ b/nikola/nikola.py @@ -68,6 +68,7 @@ CompilerExtension, MarkdownExtension, RestExtension, + MetadataExtractor, ShortcodePlugin, Task, TaskMultiplier, @@ -77,6 +78,8 @@ PostScanner, Taxonomy, ) +from . 
import metadata_extractors +from .metadata_extractors import default_metadata_extractors_by if DEBUG: logging.basicConfig(level=logging.DEBUG) @@ -423,6 +426,7 @@ def __init__(self, **config): self.configured = bool(config) self.injected_deps = defaultdict(list) self.shortcode_registry = {} + self.metadata_extractors_by = default_metadata_extractors_by() self.rst_transforms = [] self.template_hooks = { @@ -503,6 +507,7 @@ def __init__(self, **config): 'FAVICONS': (), 'FEED_LENGTH': 10, 'FILE_METADATA_REGEXP': None, + 'FILE_METADATA_UNSLUGIFY_TITLES': True, 'ADDITIONAL_METADATA': {}, 'FILES_FOLDERS': {'files': ''}, 'FILTERS': {}, @@ -600,7 +605,6 @@ def __init__(self, **config): 'THEME_COLOR': '#5670d4', # light "corporate blue" 'THUMBNAIL_SIZE': 180, 'TRANSLATIONS_PATTERN': '{path}.{lang}.{ext}', - 'UNSLUGIFY_TITLES': False, # WARNING: conf.py.in overrides this with True for backwards compatibility 'URL_TYPE': 'rel_path', 'USE_BASE_TAG': False, 'USE_BUNDLES': True, @@ -728,6 +732,10 @@ def __init__(self, **config): if self.config['PRESERVE_EXIF_DATA'] and not self.config['EXIF_WHITELIST']: utils.LOGGER.warn('You are setting PRESERVE_EXIF_DATA and not EXIF_WHITELIST so EXIF data is not really kept.') + if 'UNSLUGIFY_TITLES' in self.config: + utils.LOGGER.warn('The UNSLUGIFY_TITLES setting was renamed to FILE_METADATA_UNSLUGIFY_TITLES.') + self.config['FILE_METADATA_UNSLUGIFY_TITLES'] = self.config['UNSLUGIFY_TITLES'] + # Handle CONTENT_FOOTER and RSS_COPYRIGHT* properly. # We provide the arguments to format in CONTENT_FOOTER_FORMATS and RSS_COPYRIGHT_FORMATS. 
self.config['CONTENT_FOOTER'].langformat(self.config['CONTENT_FOOTER_FORMATS']) @@ -819,6 +827,13 @@ def __init__(self, **config): utils.LOGGER.error("Punycode of {}: {}".format(_bnl, _bnl.encode('idna'))) sys.exit(1) + # Load built-in metadata extractors + metadata_extractors.load_defaults(self, self.metadata_extractors_by) + if metadata_extractors.DEFAULT_EXTRACTOR is None: + utils.LOGGER.error("Could not find default meta extractor ({})".format( + metadata_extractors.DEFAULT_EXTRACTOR_NAME)) + sys.exit(1) + # The pelican metadata format requires a markdown extension if config.get('METADATA_FORMAT', 'nikola').lower() == 'pelican': if 'markdown.extensions.meta' not in config.get('MARKDOWN_EXTENSIONS', []) \ @@ -902,6 +917,7 @@ def init_plugins(self, commands_only=False, load_all=False): "CompilerExtension": CompilerExtension, "MarkdownExtension": MarkdownExtension, "RestExtension": RestExtension, + "MetadataExtractor": MetadataExtractor, "ShortcodePlugin": ShortcodePlugin, "SignalHandler": SignalHandler, "ConfigPlugin": ConfigPlugin, @@ -999,6 +1015,10 @@ def init_plugins(self, commands_only=False, load_all=False): if 'needs_ipython_css' not in self._GLOBAL_CONTEXT: self._GLOBAL_CONTEXT['needs_ipython_css'] = 'ipynb' in self.config['COMPILERS'] + # Activate metadata extractors and prepare them for use + for p in self._activate_plugins_of_category("MetadataExtractor"): + metadata_extractors.classify_extractor(p.plugin_object, self.metadata_extractors_by) + self._activate_plugins_of_category("Taxonomy") self.taxonomy_plugins = {} for taxonomy in [p.plugin_object for p in self.plugin_manager.getPluginsOfCategory('Taxonomy')]: diff --git a/nikola/plugin_categories.py b/nikola/plugin_categories.py index c352ab4d75..183878b3af 100644 --- a/nikola/plugin_categories.py +++ b/nikola/plugin_categories.py @@ -33,7 +33,7 @@ from yapsy.IPlugin import IPlugin from doit.cmd_base import Command as DoitCommand -from .utils import LOGGER, first_line, split_metadata +from .utils 
import LOGGER, first_line, req_missing __all__ = ( 'Command', @@ -41,6 +41,7 @@ 'PageCompiler', 'RestExtension', 'MarkdownExtension', + 'MetadataExtractor', 'Task', 'TaskMultiplier', 'TemplateSystem', @@ -255,6 +256,8 @@ class PageCompiler(BasePlugin): demote_headers = False supports_onefile = True use_dep_file = True # If set to false, the .dep file is never written and not automatically added as a target + supports_metadata = False + metadata_conditions = [] default_metadata = { 'title': '', 'slug': '', @@ -318,18 +321,22 @@ def extension(self): """Return the preferred extension for the output of this compiler.""" return ".html" - def read_metadata(self, post, file_metadata_regexp=None, unslugify_titles=False, lang=None): + def read_metadata(self, post, lang=None): """Read the metadata from a post, and return a metadata dict.""" return {} - def split_metadata(self, data): - """Split data from metadata in the raw post content. + def split_metadata(self, data, post=None, lang=None): + """Split data from metadata in the raw post content.""" + if lang and post: + extractor = post.used_extractor[lang] + else: + import nikola.metadata_extractors + extractor = nikola.metadata_extractors.DEFAULT_EXTRACTOR - This splits in the first empty line that is NOT at the beginning - of the document, or after YAML/TOML metadata without an empty line. - """ - meta, content, _ = split_metadata(data) - return meta, content + if isinstance(extractor, MetadataExtractor): + return extractor.split_metadata_from_text(data) + else: + return data, data def get_compiler_extensions(self): """Activate all the compiler extension plugins for a given compiler and return them.""" @@ -372,6 +379,74 @@ class MarkdownExtension(CompilerExtension): compiler_name = "markdown" +class MetadataExtractor(BasePlugin): + """Plugins that can extract meta information from post files.""" + + # Name of the extractor. (required) + name = "unknown" + # Where to get metadata from. 
(MetaSource; required) + source = None + # Priority of extractor. (MetaPriority; required) + priority = None + # List of tuples (MetaCondition, arg) with conditions used to select this extractor. + conditions = [] + # Regular expression used for splitting metadata, or None if not applicable. + split_metadata_re = None + # List of tuples (import name, pip name, friendly name) of Python packages required for this extractor. + requirements = [] + # Name of METADATA_MAPPING to use, if any. + map_from = None + # Whether or not the extractor supports writing metadata. + supports_write = False + + def _extract_metadata_from_text(self, source_text: str) -> dict: + """Extract metadata from text.""" + raise NotImplementedError() + + def split_metadata_from_text(self, source_text: str) -> (str, str): + """Split text into metadata and content (both strings). + + If splitting fails (there is no match), return source_text as both metadata and content. + (This behavior is required for 2-file posts.) + """ + if self.split_metadata_re is None: + return source_text, source_text + else: + split_result = self.split_metadata_re.split(source_text.lstrip(), maxsplit=1) + if len(split_result) == 1: + return source_text, source_text + else: + return split_result + + def extract_text(self, source_text: str) -> dict: + """Extract metadata from text (also calls ``split_metadata_from_text``).""" + split = self.split_metadata_from_text(source_text) + meta = self._extract_metadata_from_text(split[0]) + return meta + + def extract_filename(self, filename: str, lang: str) -> dict: + """Extract metadata from filename.""" + return {} + + def write_metadata(self, metadata: dict, comment_wrap=False) -> str: + """Write metadata in this extractor’s format. + + ``comment_wrap`` is either True, False, or a 2-tuple of comments to use for wrapping, if necessary. + If it’s set to True, defaulting to ``('')`` is recommended. 
+ + This function should insert comment markers (if applicable) and must insert trailing newlines. + """ + raise NotImplementedError() + + def check_requirements(self): + """Check if requirements for an extractor are satisfied.""" + for import_name, pip_name, friendly_name in self.requirements: + try: + __import__(import_name) + except ImportError: + req_missing([pip_name], "use {0} metadata".format(friendly_name), python=True, optional=False) + + class SignalHandler(BasePlugin): """Signal handlers.""" diff --git a/nikola/plugins/basic_import.py b/nikola/plugins/basic_import.py index de6ba6f17c..f3ef57ed1e 100644 --- a/nikola/plugins/basic_import.py +++ b/nikola/plugins/basic_import.py @@ -156,8 +156,7 @@ def write_post(cls, filename, content, headers, compiler, rewrite_html=True): onefile=True, **headers) - @staticmethod - def write_metadata(filename, title, slug, post_date, description, tags, **kwargs): + def write_metadata(self, filename, title, slug, post_date, description, tags, **kwargs): """Write metadata to meta file.""" if not description: description = "" @@ -166,7 +165,7 @@ def write_metadata(filename, title, slug, post_date, description, tags, **kwargs with io.open(filename, "w+", encoding="utf8") as fd: data = {'title': title, 'slug': slug, 'date': post_date, 'tags': ','.join(tags), 'description': description} data.update(kwargs) - fd.write(utils.write_metadata(data)) + fd.write(utils.write_metadata(data, site=self.site, comment_wrap=False)) @staticmethod def write_urlmap_csv(output_file, url_map): diff --git a/nikola/plugins/command/new_post.py b/nikola/plugins/command/new_post.py index bb837f37fd..0c0d5d4707 100644 --- a/nikola/plugins/command/new_post.py +++ b/nikola/plugins/command/new_post.py @@ -434,7 +434,7 @@ def _execute(self, options, args): if not onefile: # write metadata file with io.open(meta_path, "w+", encoding="utf8") as fd: - fd.write(utils.write_metadata(data)) + fd.write(utils.write_metadata(data, comment_wrap=False, 
site=self.site)) LOGGER.info("Your {0}'s metadata is at: {1}".format(content_type, meta_path)) event['meta_path'] = meta_path LOGGER.info("Your {0}'s text is at: {1}".format(content_type, txt_path)) diff --git a/nikola/plugins/compile/html.py b/nikola/plugins/compile/html.py index 4414a91d98..b8f7dc9aa6 100644 --- a/nikola/plugins/compile/html.py +++ b/nikola/plugins/compile/html.py @@ -42,11 +42,12 @@ class CompileHtml(PageCompiler): name = "html" friendly_name = "HTML" + supports_metadata = True def compile_string(self, data, source_path=None, is_two_file=True, post=None, lang=None): """Compile HTML into HTML strings, with shortcode support.""" if not is_two_file: - _, data = self.split_metadata(data) + _, data = self.split_metadata(data, post, lang) new_data, shortcodes = sc.extract_shortcodes(data) return self.site.apply_shortcodes_uuid(new_data, shortcodes, filename=source_path, extra_context={'post': post}) @@ -81,9 +82,7 @@ def create_post(self, path, **kw): content += '\n' with io.open(path, "w+", encoding="utf8") as fd: if onefile: - fd.write('\n\n') + fd.write(write_metadata(metadata, comment_wrap=True, site=self.site, compiler=self)) fd.write(content) def read_metadata(self, post, file_metadata_regexp=None, unslugify_titles=False, lang=None): diff --git a/nikola/plugins/compile/ipynb.py b/nikola/plugins/compile/ipynb.py index 3466dcf331..1b1ac0ab6d 100644 --- a/nikola/plugins/compile/ipynb.py +++ b/nikola/plugins/compile/ipynb.py @@ -52,6 +52,7 @@ class CompileIPynb(PageCompiler): friendly_name = "Jupyter Notebook" demote_headers = True default_kernel = 'python3' + supports_metadata = True def set_site(self, site): """Set Nikola site.""" @@ -97,10 +98,10 @@ def compile(self, source, dest, is_two_file=False, post=None, lang=None): else: post._depfile[dest] += shortcode_deps - def read_metadata(self, post, file_metadata_regexp=None, unslugify_titles=False, lang=None): + def read_metadata(self, post, lang=None): """Read metadata directly from ipynb file. 
- As ipynb file support arbitrary metadata as json, the metadata used by Nikola + As ipynb files support arbitrary metadata as json, the metadata used by Nikola will be assume to be in the 'nikola' subfield. """ self._req_missing_ipynb() diff --git a/nikola/plugins/compile/markdown/__init__.py b/nikola/plugins/compile/markdown/__init__.py index 8f88908528..759dd49003 100644 --- a/nikola/plugins/compile/markdown/__init__.py +++ b/nikola/plugins/compile/markdown/__init__.py @@ -74,6 +74,7 @@ class CompileMarkdown(PageCompiler): friendly_name = "Markdown" demote_headers = True site = None + supports_metadata = False def set_site(self, site): """Set Nikola site.""" @@ -90,14 +91,14 @@ def set_site(self, site): extensions.extend(site_extensions) if Markdown is not None: self.converter = ThreadLocalMarkdown(extensions) - self.support_metadata = 'markdown.extensions.meta' in extensions + self.supports_metadata = 'markdown.extensions.meta' in extensions def compile_string(self, data, source_path=None, is_two_file=True, post=None, lang=None): """Compile Markdown into HTML strings.""" if Markdown is None: req_missing(['markdown'], 'build this site (compile Markdown)') if not is_two_file: - _, data = self.split_metadata(data) + _, data = self.split_metadata(data, post, lang) new_data, shortcodes = sc.extract_shortcodes(data) output, _ = self.converter.convert(new_data) output, shortcode_deps = self.site.apply_shortcodes_uuid(output, shortcodes, filename=source_path, extra_context={'post': post}) @@ -136,18 +137,12 @@ def create_post(self, path, **kw): content += '\n' with io.open(path, "w+", encoding="utf8") as fd: if onefile: - _format = self.site.config.get('METADATA_FORMAT', 'nikola').lower() - if _format == 'pelican': - _format = 'pelican_md' - data = write_metadata(metadata, _format) - if _format == 'nikola': - data = '\n\n' - fd.write(data) + fd.write(write_metadata(metadata, comment_wrap=True, site=self.site, compiler=self)) fd.write(content) - def read_metadata(self, 
post, file_metadata_regexp=None, unslugify_titles=False, lang=None): + def read_metadata(self, post, lang=None): """Read the metadata from a post, and return a metadata dict.""" - if not self.support_metadata: + if not self.supports_metadata: return {} if Markdown is None: req_missing(['markdown'], 'build this site (compile Markdown)') diff --git a/nikola/plugins/compile/pandoc.py b/nikola/plugins/compile/pandoc.py index a9d24f2104..4888894c64 100644 --- a/nikola/plugins/compile/pandoc.py +++ b/nikola/plugins/compile/pandoc.py @@ -87,7 +87,5 @@ def create_post(self, path, **kw): content += '\n' with io.open(path, "w+", encoding="utf8") as fd: if onefile: - fd.write('\n\n') + fd.write(write_metadata(metadata, comment_wrap=True, site=self.site, compiler=self)) fd.write(content) diff --git a/nikola/plugins/compile/php.py b/nikola/plugins/compile/php.py index ff14e99b3c..c0221ffcd6 100644 --- a/nikola/plugins/compile/php.py +++ b/nikola/plugins/compile/php.py @@ -79,9 +79,7 @@ def create_post(self, path, **kw): content += '\n' with io.open(path, "w+", encoding="utf8") as fd: if onefile: - fd.write('\n\n') + fd.write(write_metadata(metadata, comment_wrap=True, site=self.site, compiler=self)) fd.write(content) def extension(self): diff --git a/nikola/plugins/compile/rest/__init__.py b/nikola/plugins/compile/rest/__init__.py index 3344efa435..7d9dada597 100644 --- a/nikola/plugins/compile/rest/__init__.py +++ b/nikola/plugins/compile/rest/__init__.py @@ -40,6 +40,7 @@ from docutils.parsers.rst import roles from nikola.nikola import LEGAL_VALUES +from nikola.metadata_extractors import MetaCondition from nikola.plugin_categories import PageCompiler from nikola.utils import ( unicode_str, @@ -58,11 +59,11 @@ class CompileRest(PageCompiler): friendly_name = "reStructuredText" demote_headers = True logger = None + supports_metadata = True + metadata_conditions = [(MetaCondition.config_bool, "USE_REST_DOCINFO_METADATA")] - def read_metadata(self, post, 
file_metadata_regexp=None, unslugify_titles=False, lang=None): + def read_metadata(self, post, lang=None): """Read the metadata from a post, and return a metadata dict.""" - if not self.site.config.get('USE_REST_DOCINFO_METADATA'): - return {} if lang is None: lang = LocaleBorg().current_lang source_path = post.translated_source_path(lang) @@ -104,7 +105,7 @@ def compile_string(self, data, source_path=None, is_two_file=True, post=None, la # 7 with default metadata, could be more or less depending on the post). add_ln = 0 if not is_two_file: - m_data, data = self.split_metadata(data) + m_data, data = self.split_metadata(data, post, lang) add_ln = len(m_data.splitlines()) + 1 default_template_path = os.path.join(os.path.dirname(__file__), 'template.txt') @@ -170,11 +171,7 @@ def create_post(self, path, **kw): content += '\n' with io.open(path, "w+", encoding="utf8") as fd: if onefile: - _format = self.site.config.get('METADATA_FORMAT', 'nikola').lower() - if _format == 'pelican': - _format = 'pelican_rest' - fd.write(write_metadata(metadata, _format)) - fd.write('\n') + fd.write(write_metadata(metadata, comment_wrap=False, site=self.site, compiler=self)) fd.write(content) def set_site(self, site): diff --git a/nikola/plugins/misc/scan_posts.py b/nikola/plugins/misc/scan_posts.py index 68c88cdf96..ad57b8d819 100644 --- a/nikola/plugins/misc/scan_posts.py +++ b/nikola/plugins/misc/scan_posts.py @@ -97,7 +97,8 @@ def scan(self): self.site.MESSAGES, template_name, self.site.get_compiler(base_path), - destination_base=destination_translatable + destination_base=destination_translatable, + metadata_extractors_by=self.site.metadata_extractors_by ) timeline.append(post) except Exception: diff --git a/nikola/plugins/task/galleries.py b/nikola/plugins/task/galleries.py index 7e14e8980f..77e404606a 100644 --- a/nikola/plugins/task/galleries.py +++ b/nikola/plugins/task/galleries.py @@ -409,7 +409,9 @@ def parse_index(self, gallery, input_folder, output_folder): False, 
self.site.MESSAGES, 'story.tmpl', - self.site.get_compiler(index_path) + self.site.get_compiler(index_path), + None, + self.site.metadata_extractors_by ) # If this did not exist, galleries without a title in the # index.txt file would be errorneously named `index` diff --git a/nikola/post.py b/nikola/post.py index 5640b7d249..0512005044 100644 --- a/nikola/post.py +++ b/nikola/post.py @@ -34,7 +34,6 @@ import json import os import re -import string try: from urlparse import urljoin except ImportError: @@ -73,9 +72,9 @@ unicode_str, demote_headers, get_translation_candidate, - unslugify, map_metadata ) +from nikola import metadata_extractors __all__ = ('Post',) @@ -94,7 +93,8 @@ def __init__( messages, template_name, compiler, - destination_base=None + destination_base=None, + metadata_extractors_by=None ): """Initialize post. @@ -107,6 +107,7 @@ def __init__( """ self.config = config self.compiler = compiler + self.compiler_contexts = {} self.compile_html = self.compiler.compile self.demote_headers = self.compiler.demote_headers and self.config['DEMOTE_HEADERS'] tzinfo = self.config['__tzinfo__'] @@ -150,24 +151,31 @@ def __init__( self._dependency_uptodate_fragment = defaultdict(list) self._dependency_uptodate_page = defaultdict(list) self._depfile = defaultdict(list) + if metadata_extractors_by is None: + self.metadata_extractors_by = {'priority': {}, 'source': {}} + else: + self.metadata_extractors_by = metadata_extractors_by # Load internationalized metadata for lang in self.translations: if os.path.isfile(get_translation_candidate(self.config, self.source_path, lang)): self.translated_to.add(lang) - default_metadata = get_meta(self, self.config['FILE_METADATA_REGEXP'], self.config['UNSLUGIFY_TITLES']) + default_metadata, default_used_extractor = get_meta(self, lang=None) self.meta = Functionary(lambda: None, self.default_lang) + self.used_extractor = Functionary(lambda: None, self.default_lang) self.meta[self.default_lang] = default_metadata + 
self.used_extractor[self.default_lang] = default_used_extractor for lang in self.translations: if lang != self.default_lang: meta = defaultdict(lambda: '') meta.update(default_metadata) - _meta = get_meta(self, self.config['FILE_METADATA_REGEXP'], self.config['UNSLUGIFY_TITLES'], lang) + _meta, _extractors = get_meta(self, lang) meta.update(_meta) self.meta[lang] = meta + self.used_extractor[lang] = _extractors if not self.is_translation_available(self.default_lang): # Special case! (Issue #373) @@ -934,36 +942,8 @@ def source_ext(self, prefix=False): else: return ext -# Code that fetches metadata from different places - - -# For backwards compatibility -re_meta = utils.re_meta - - -def _get_metadata_from_filename_by_regex(filename, metadata_regexp, unslugify_titles, lang): - """Try to reed the metadata from the filename based on the given re. - - This requires to use symbolic group names in the pattern. - The part to read the metadata from the filename based on a regular - expression is taken from Pelican - pelican/readers.py - """ - match = re.match(metadata_regexp, filename) - meta = {} - if match: - # .items() for py3k compat. - for key, value in match.groupdict().items(): - k = key.lower().strip() # metadata must be lowercase - if k == 'title' and unslugify_titles: - meta[k] = unslugify(value, lang, discard_numbers=False) - else: - meta[k] = value - - return meta - - -def get_metadata_from_file(source_path, config=None, lang=None): +def get_metadata_from_file(source_path, post, config, lang, metadata_extractors_by): """Extract metadata from the file itself, by parsing contents.""" try: if lang and config: @@ -971,33 +951,38 @@ def get_metadata_from_file(source_path, config=None, lang=None): elif lang: source_path += '.' 
+ lang with io.open(source_path, "r", encoding="utf-8-sig") as meta_file: - file_lines = [x.strip() for x in meta_file.readlines()] - return _get_metadata_from_file(file_lines, config) + source_text = meta_file.read() + + meta = {} + used_extractor = None + for priority in metadata_extractors.MetaPriority: + found_in_priority = False + for extractor in metadata_extractors_by['priority'].get(priority, []): + if not metadata_extractors.check_conditions(post, source_path, extractor.conditions, config, source_text): + continue + extractor.check_requirements() + new_meta = extractor.extract_text(source_text) + if new_meta: + found_in_priority = True + used_extractor = extractor + # Map metadata from other platforms to names Nikola expects (Issue #2817) + map_metadata(new_meta, extractor.map_from, config) + + meta.update(new_meta) + break + + if found_in_priority: + break + return meta, used_extractor except (UnicodeDecodeError, UnicodeEncodeError): msg = 'Error reading {0}: Nikola only supports UTF-8 files'.format(source_path) LOGGER.error(msg) raise ValueError(msg) except Exception: # The file may not exist, for multilingual sites - return {} + return {}, None -re_md_title = re.compile(r'^{0}([^{0}].*)'.format(re.escape('#'))) -# Assuming rst titles are going to be at least 4 chars long -# otherwise this detects things like ''' wich breaks other markups. 
-re_rst_title = re.compile(r'^([{0}]{{4,}})'.format(re.escape( - string.punctuation))) - - -def _get_metadata_from_file(file_lines, config=None): - """Extract metadata from a post's source file.""" - meta, metadata_type = utils.extract_metadata(file_lines) - if metadata_type in ('toml', 'yaml'): - # Map metadata from other platforms to names Nikola expects (Issue #2817) - map_metadata(meta, metadata_type, config) - return meta - - -def get_metadata_from_meta_file(path, config=None, lang=None): +def get_metadata_from_meta_file(path, post, config, lang, metadata_extractors_by=None): """Take a post path, and gets data from a matching .meta file.""" meta_path = os.path.splitext(path)[0] + '.meta' if lang and config: @@ -1005,58 +990,57 @@ def get_metadata_from_meta_file(path, config=None, lang=None): elif lang: meta_path += '.' + lang if os.path.isfile(meta_path): - return get_metadata_from_file(path, config, lang) - + return get_metadata_from_file(meta_path, post, config, lang, metadata_extractors_by)[0] elif lang: # Metadata file doesn't exist, but not default language, # So, if default language metadata exists, return that. # This makes the 2-file format detection more reliable (Issue #525) - return get_metadata_from_meta_file(path, config, lang=None) + return get_metadata_from_meta_file(meta_path, post, config, None, metadata_extractors_by) else: # No 2-file metadata return {} -def get_meta(post, file_metadata_regexp=None, unslugify_titles=False, lang=None): - """Get post's meta from source. - - If ``file_metadata_regexp`` is given it will be tried to read - metadata from the filename. - If ``unslugify_titles`` is True, the extracted title (if any) will be unslugified, as is - done in galleries. If any metadata is then found inside the file the metadata from the - file will override previous findings. 
- """ +def get_meta(post, lang): + """Get post meta from compiler or source file.""" meta = defaultdict(lambda: '') + used_extractor = None - try: - config = post.config - except AttributeError: - config = None + config = getattr(post, 'config', None) + metadata_extractors_by = getattr(post, 'metadata_extractors_by') + if metadata_extractors_by is None: + metadata_extractors_by = metadata_extractors.default_metadata_extractors_by() - meta.update(get_metadata_from_meta_file(post.metadata_path, config, lang)) + # If meta file exists, use it + meta.update(get_metadata_from_meta_file(post.metadata_path, post, config, lang, metadata_extractors_by)) if not meta: post.is_two_file = False - if file_metadata_regexp is not None: - meta.update(_get_metadata_from_filename_by_regex(post.source_path, - file_metadata_regexp, - unslugify_titles, - post.default_lang)) - + # Fetch compiler metadata. compiler_meta = {} - if getattr(post, 'compiler', None): - compiler_meta = post.compiler.read_metadata(post, file_metadata_regexp, unslugify_titles, lang) + if (getattr(post, 'compiler', None) and post.compiler.supports_metadata and + metadata_extractors.check_conditions(post, post.source_path, post.compiler.metadata_conditions, config, None)): + compiler_meta = post.compiler.read_metadata(post, lang=lang) + used_extractor = post.compiler meta.update(compiler_meta) if not post.is_two_file and not compiler_meta: # Meta file has precedence over file, which can contain garbage. # Moreover, we should not read the file if we have compiler meta. 
- meta.update(get_metadata_from_file(post.source_path, config, lang)) + new_meta, used_extractor = get_metadata_from_file(post.source_path, post, config, lang, metadata_extractors_by) + meta.update(new_meta) + + # Filename-based metadata extractors (fallback only) + if not meta: + extractors = metadata_extractors_by['source'].get(metadata_extractors.MetaSource.filename, []) + for extractor in extractors: + if not metadata_extractors.check_conditions(post, post.source_path, extractor.conditions, config, None): + continue + meta.update(extractor.extract_filename(post.source_path, lang)) if lang is None: # Only perform these checks for the default language - if 'slug' not in meta: # If no slug is found in the metadata use the filename meta['slug'] = slugify(unicode_str(os.path.splitext( @@ -1067,7 +1051,7 @@ def get_meta(post, file_metadata_regexp=None, unslugify_titles=False, lang=None) meta['title'] = os.path.splitext( os.path.basename(post.source_path))[0] - return meta + return meta, used_extractor def hyphenate(dom, _lang): diff --git a/nikola/utils.py b/nikola/utils.py index 57dd9afc94..4f03989496 100644 --- a/nikola/utils.py +++ b/nikola/utils.py @@ -34,7 +34,6 @@ import io import locale import logging -import natsort import operator import os import re @@ -97,7 +96,7 @@ 'adjust_name_for_index_path', 'adjust_name_for_index_link', 'NikolaPygmentsHTML', 'create_redirect', 'clean_before_deployment', 'sort_posts', 'indent', 'load_data', 'html_unescape', 'rss_writer', - 'map_metadata', 're_meta', 'extract_metadata', 'split_metadata', + 'map_metadata', # Deprecated, moved to hierarchy_utils: 'TreeNode', 'clone_treenode', 'flatten_tree_structure', 'sort_classifications', 'join_hierarchical_category_path', @@ -1437,23 +1436,45 @@ def get_translation_candidate(config, path, lang): return config['TRANSLATIONS_PATTERN'].format(path=p, ext=e, lang=lang) -def write_metadata(data, _format='nikola'): - """Write metadata.""" - _format = _format.lower() - if _format not in 
['nikola', 'yaml', 'toml', 'pelican_rest', 'pelican_md']: - LOGGER.warn('Unknown METADATA_FORMAT %s, using "nikola" format', _format) +def write_metadata(data, metadata_format=None, comment_wrap=False, site=None, compiler=None): + """Write metadata. - if _format == 'yaml': - if yaml is None: - req_missing('pyyaml', 'use YAML metadata', optional=False) - return '\n'.join(('---', yaml.safe_dump(data, default_flow_style=False).strip(), '---', '', '')) + Recommended usage: pass `site`, `comment_wrap`, and optionally `compiler`. Other options are for backwards compatibility. + """ + # API compatibility + if metadata_format is None and site is not None: + metadata_format = site.config.get('METADATA_FORMAT', 'nikola').lower() + if metadata_format is None: + metadata_format = 'nikola' + + if site is None: + import nikola.metadata_extractors + metadata_extractors_by = nikola.metadata_extractors.default_metadata_extractors_by() + nikola.metadata_extractors.load_defaults(site, metadata_extractors_by) + else: + metadata_extractors_by = site.metadata_extractors_by + + # Pelican is mapped to rest_docinfo, markdown_meta, or nikola. + if metadata_format == 'pelican': + if compiler and compiler.name == 'rest': + metadata_format = 'rest_docinfo' + elif compiler and compiler.name == 'markdown': + metadata_format = 'markdown_meta' + else: + # Quiet fallback. 
+ metadata_format = 'nikola' - elif _format == 'toml': - if toml is None: - req_missing('toml', 'use TOML metadata', optional=False) - return '\n'.join(('+++', toml.dumps(data).strip(), '+++', '', '')) + extractor = metadata_extractors_by['name'].get(metadata_format) + if extractor and extractor.supports_write: + extractor.check_requirements() + return extractor.write_metadata(data, comment_wrap) + else: + LOGGER.warn('Writing METADATA_FORMAT %s is not supported, using "nikola" format', metadata_format) + + if metadata_format not in ('nikola', 'rest_docinfo', 'markdown_meta'): + LOGGER.warn('Unknown METADATA_FORMAT %s, using "nikola" format', metadata_format) - elif _format == 'pelican_rest': + if metadata_format == 'rest_docinfo': title = data.pop('title') results = [ '=' * len(title), @@ -1462,25 +1483,12 @@ def write_metadata(data, _format='nikola'): '' ] + [':{0}: {1}'.format(k, v) for k, v in data.items() if v] + [''] return '\n'.join(results) - - elif _format == 'pelican_md': + elif metadata_format == 'markdown_meta': results = ['{0}: {1}'.format(k, v) for k, v in data.items() if v] + ['', ''] return '\n'.join(results) - else: # Nikola, default - order = ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type') - f = '.. {0}: {1}' - meta = [] - for k in order: - try: - meta.append(f.format(k, data.pop(k))) - except KeyError: - pass - # Leftover metadata (user-specified/non-default). 
- for k in natsort.natsorted(list(data.keys()), alg=natsort.ns.F | natsort.ns.IC): - meta.append(f.format(k, data[k])) - meta.append('') - return '\n'.join(meta) + from nikola.metadata_extractors import DEFAULT_EXTRACTOR + return DEFAULT_EXTRACTOR.write_metadata(data, comment_wrap) def ask(query, default=None): @@ -2028,111 +2036,3 @@ def read_from_config(self, site, basename, posts_per_classification_per_language args = {'translation_manager': self, 'site': site, 'posts_per_classification_per_language': posts_per_classification_per_language} signal('{}_translations_config'.format(basename.lower())).send(args) - - -# Moved to global variable to avoid recompilation -# of regex every time re_meta() is called. -_DEFAULT_REST_METADATA_PARSING = re.compile('^\.\. (.*?): (.*)') - - -def re_meta(line, match=None): - """Find metadata using regular expressions.""" - if match: - reStr = re.compile('^\.\. {0}: (.*)'.format(re.escape(match))) - else: - reStr = _DEFAULT_REST_METADATA_PARSING - result = reStr.findall(line.strip()) - if match and result: - return (match, result[0]) - elif not match and result: - return result[0][0], result[0][1].strip() - else: - return None, None - - -def extract_metadata(file_lines): - """Extract metadata from the lines of a file. - - Returns a pair ``(meta, metadata_type)``, where ``meta`` is the - metadata dictionary and ``metadata_type`` the metadata format. 
- - Valid values for ``metadata_type`` are: - * ``'none'``: no metadata was found (file was empty) - * ``'yaml'``: metadata in YAML format - * ``'toml'``: metadata in TOML format - * ``'rest'``: metadata in reST format (the standard Nikola - reST-like metadata format) - """ - meta = {} - if not file_lines: - return meta, 'none' - - # Skip up to one empty line at the beginning (for txt2tags) - if not file_lines[0]: - file_lines = file_lines[1:] - - # If 1st line is '---', then it's YAML metadata - if file_lines[0] == '---': - if yaml is None: - req_missing('pyyaml', 'use YAML metadata', optional=True) - raise ValueError('Error parsing metadata') - idx = file_lines.index('---', 1) - meta = yaml.safe_load('\n'.join(file_lines[1:idx])) - # We expect empty metadata to be '', not None - for k in meta: - if meta[k] is None: - meta[k] = '' - return meta, 'yaml' - - # If 1st line is '+++', then it's TOML metadata - if file_lines[0] == '+++': - if toml is None: - req_missing('toml', 'use TOML metadata', optional=True) - raise ValueError('Error parsing metadata') - idx = file_lines.index('+++', 1) - meta = toml.loads('\n'.join(file_lines[1:idx])) - return meta, 'toml' - - # First, get metadata from the beginning of the file, - # up to first empty line - - for line in file_lines: - if not line: - break - match = re_meta(line) - if match[0]: - meta[match[0]] = match[1] - - return meta, 'nikola' - - -def split_metadata(data): - """Split data from metadata in the raw post content. - - This splits in the first empty line that is NOT at the beginning - of the document, or after YAML/TOML metadata without an empty line. - - Returns a tuple ``(meta, content, metadata_type)`` where ``meta`` and - ``content`` are parts of ``data``, and ``metadata_type`` is the metadata - format. 
- - Valid values for ``metadata_type`` are: - * ``'none'``: no metadata was found (file was empty) - * ``'yaml'``: metadata in YAML format - * ``'toml'``: metadata in TOML format - * ``'rest'``: metadata in reST format (the standard Nikola - reST-like metadata format) - """ - if data.startswith('---'): # YAML metadata - split_result = re.split('(\n---\n|\r\n---\r\n)', data.lstrip(), maxsplit=1) - metadata_type = 'yaml' - elif data.startswith('+++'): # TOML metadata - split_result = re.split('(\n\\+\\+\\+\n|\r\n\\+\\+\\+\r\n)', data.lstrip(), maxsplit=1) - metadata_type = 'toml' - else: - split_result = re.split('(\n\n|\r\n\r\n)', data.lstrip(), maxsplit=1) - metadata_type = 'nikola' - if len(split_result) == 1: - return '', split_result[0], 'none' - # ['metadata', '\n\n', 'post content'] - return split_result[0], split_result[-1], metadata_type diff --git a/setup.py b/setup.py index a2219df301..22677f77d7 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def run_tests(self): with open('requirements.txt', 'r') as fh: dependencies = [l.strip() for l in fh] -extras = {} +extras = {':python_version == "3.3"': ['enum34']} with open('requirements-extras.txt', 'r') as fh: extras['extras'] = [l.strip() for l in fh][1:] diff --git a/tests/data/metadata_extractors/f-html-1-compiler.html b/tests/data/metadata_extractors/f-html-1-compiler.html new file mode 100644 index 0000000000..5e95c4736c --- /dev/null +++ b/tests/data/metadata_extractors/f-html-1-compiler.html @@ -0,0 +1,7 @@ + + + + + +Content line 1. +Content line 2. \ No newline at end of file diff --git a/tests/data/metadata_extractors/f-ipynb-1-compiler.ipynb b/tests/data/metadata_extractors/f-ipynb-1-compiler.ipynb new file mode 100644 index 0000000000..3f6d18f5da --- /dev/null +++ b/tests/data/metadata_extractors/f-ipynb-1-compiler.ipynb @@ -0,0 +1,31 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Content line 1.\nContent line 2." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "env": {}, + "language": "python", + "name": "python3" + }, + "nikola": { + "category": "", + "date": "2017-07-01 00:00:00 UTC", + "description": "", + "link": "", + "slug": "s-ipynb-1-compiler", + "tags": "meta,Jupyter Notebook,onefile,compiler", + "title": "T: Jupyter Notebook, 1, compiler", + "type": "text" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/data/metadata_extractors/f-markdown-1-compiler.md b/tests/data/metadata_extractors/f-markdown-1-compiler.md new file mode 100644 index 0000000000..689bb0f181 --- /dev/null +++ b/tests/data/metadata_extractors/f-markdown-1-compiler.md @@ -0,0 +1,7 @@ +title: T: Markdown, 1, compiler +slug: s-markdown-1-compiler +date: 2017-07-01 00:00:00 UTC +tags: meta,Markdown,onefile,compiler + +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-markdown-1-nikola.md b/tests/data/metadata_extractors/f-markdown-1-nikola.md new file mode 100644 index 0000000000..5b38c2075c --- /dev/null +++ b/tests/data/metadata_extractors/f-markdown-1-nikola.md @@ -0,0 +1,13 @@ + + +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-markdown-2-nikola.md b/tests/data/metadata_extractors/f-markdown-2-nikola.md new file mode 100644 index 0000000000..0bd667b6a0 --- /dev/null +++ b/tests/data/metadata_extractors/f-markdown-2-nikola.md @@ -0,0 +1,2 @@ +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-markdown-2-nikola.meta b/tests/data/metadata_extractors/f-markdown-2-nikola.meta new file mode 100644 index 0000000000..c068ba66cb --- /dev/null +++ b/tests/data/metadata_extractors/f-markdown-2-nikola.meta @@ -0,0 +1,7 @@ +.. title: T: Markdown, 2, Nikola +.. slug: s-markdown-2-nikola +.. date: 2017-07-01 00:00:00 UTC +.. tags: meta,Markdown,twofile,Nikola +.. link: +.. description: +.. 
type: text diff --git a/tests/data/metadata_extractors/f-rest-1-compiler.rst b/tests/data/metadata_extractors/f-rest-1-compiler.rst new file mode 100644 index 0000000000..3b21c3f363 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-1-compiler.rst @@ -0,0 +1,9 @@ +T: reST, 1, compiler +==================== + +:slug: s-rest-1-compiler +:Date: 2017-07-01 00:00:00 UTC +:tags: meta,reST,onefile,compiler + +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-rest-1-nikola.rst b/tests/data/metadata_extractors/f-rest-1-nikola.rst new file mode 100644 index 0000000000..14dede0bf0 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-1-nikola.rst @@ -0,0 +1,11 @@ +.. title: T: reST, 1, Nikola +.. slug: s-rest-1-nikola +.. date: 2017-07-01 00:00:00 UTC +.. tags: meta,reST,onefile,Nikola +.. category: +.. link: +.. description: +.. type: text + +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-rest-1-toml.rst b/tests/data/metadata_extractors/f-rest-1-toml.rst new file mode 100644 index 0000000000..0e2c4eb453 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-1-toml.rst @@ -0,0 +1,8 @@ ++++ +title = "T: reST, 1, TOML" +slug = "s-rest-1-toml" +date = "2017-07-01 00:00:00 UTC" +tags = "meta,reST,onefile,TOML" ++++ +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-rest-1-yaml.rst b/tests/data/metadata_extractors/f-rest-1-yaml.rst new file mode 100644 index 0000000000..b904b35bf2 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-1-yaml.rst @@ -0,0 +1,8 @@ +--- +title: "T: reST, 1, YAML" +slug: s-rest-1-yaml +date: "2017-07-01 00:00:00 UTC" +tags: ["meta", "reST", "onefile", "YAML"] +--- +Content line 1. +Content line 2. 
diff --git a/tests/data/metadata_extractors/f-rest-2-nikola.meta b/tests/data/metadata_extractors/f-rest-2-nikola.meta new file mode 100644 index 0000000000..aeb6f494f8 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-nikola.meta @@ -0,0 +1,7 @@ +.. title: T: reST, 2, Nikola +.. slug: s-rest-2-nikola +.. date: 2017-07-01 00:00:00 UTC +.. tags: meta,reST,twofile,Nikola +.. link: +.. description: +.. type: text diff --git a/tests/data/metadata_extractors/f-rest-2-nikola.rst b/tests/data/metadata_extractors/f-rest-2-nikola.rst new file mode 100644 index 0000000000..0bd667b6a0 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-nikola.rst @@ -0,0 +1,2 @@ +Content line 1. +Content line 2. diff --git a/tests/data/metadata_extractors/f-rest-2-toml.meta b/tests/data/metadata_extractors/f-rest-2-toml.meta new file mode 100644 index 0000000000..4235df9cd7 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-toml.meta @@ -0,0 +1,6 @@ ++++ +title = "T: reST, 2, TOML" +slug = "s-rest-2-toml" +date = "2017-07-01 00:00:00 UTC" +tags = "meta,reST,twofile,TOML" ++++ diff --git a/tests/data/metadata_extractors/f-rest-2-toml.rst b/tests/data/metadata_extractors/f-rest-2-toml.rst new file mode 100644 index 0000000000..0bd667b6a0 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-toml.rst @@ -0,0 +1,2 @@ +Content line 1. +Content line 2. 
diff --git a/tests/data/metadata_extractors/f-rest-2-yaml.meta b/tests/data/metadata_extractors/f-rest-2-yaml.meta new file mode 100644 index 0000000000..87d83bcc89 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-yaml.meta @@ -0,0 +1,6 @@ +--- +title: "T: reST, 2, YAML" +slug: s-rest-2-yaml +date: "2017-07-01 00:00:00 UTC" +tags: ["meta", "reST", "twofile", "YAML"] +--- diff --git a/tests/data/metadata_extractors/f-rest-2-yaml.rst b/tests/data/metadata_extractors/f-rest-2-yaml.rst new file mode 100644 index 0000000000..0bd667b6a0 --- /dev/null +++ b/tests/data/metadata_extractors/f-rest-2-yaml.rst @@ -0,0 +1,2 @@ +Content line 1. +Content line 2. diff --git a/tests/test_metadata_extractors.py b/tests/test_metadata_extractors.py new file mode 100644 index 0000000000..20b9b67422 --- /dev/null +++ b/tests/test_metadata_extractors.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +"""Test metadata extractors.""" + +import mock +import os +import pytest +from .base import FakeSite +from nikola.metadata_extractors import default_metadata_extractors_by, load_defaults +from nikola.post import get_meta +from nikola.plugins.compile.rest import CompileRest +from nikola.plugins.compile.markdown import CompileMarkdown +from nikola.plugins.compile.ipynb import CompileIPynb +from nikola.plugins.compile.html import CompileHtml + + +@pytest.fixture(name='metadata_extractors_by') +def f__metadata_extractors_by(): + m = default_metadata_extractors_by() + load_defaults(None, m) + return m + + +class FakePost(): + def __init__(self, source_path, metadata_path, config, compiler, metadata_extractors_by): + self.source_path = source_path + self.metadata_path = metadata_path + self.is_two_file = True + self.config = { + 'TRANSLATIONS': {'en': './'}, + 'DEFAULT_LANG': 'en' + } + self.config.update(config) + self.default_lang = self.config['DEFAULT_LANG'] + self.metadata_extractors_by = metadata_extractors_by + if compiler: + self.compiler = compiler + + def 
translated_source_path(self, _): + return self.source_path + + +@pytest.mark.parametrize("meta_twofile", [(1, "onefile", "twofile"), (2, "twofile", "onefile")]) +@pytest.mark.parametrize("meta_format", [('nikola', 'Nikola'), ('toml', 'TOML'), ('yaml', 'YAML')]) +def test_builtin_extractors_rest(metadata_extractors_by, meta_twofile, meta_format): + twofile_number, twofile_expected, twofile_unexpected = meta_twofile + twofile = twofile_number == 2 + format_lc, format_friendly = meta_format + + source_filename = "f-rest-{0}-{1}.rst".format(twofile_number, format_lc) + metadata_filename = "f-rest-{0}-{1}.meta".format(twofile_number, format_lc) + title = 'T: reST, {0}, {1}'.format(twofile_number, format_friendly) + slug = "s-rest-{0}-{1}".format(twofile_number, format_lc) + source_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', source_filename)) + metadata_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', metadata_filename)) + post = FakePost(source_path, metadata_path, {}, None, metadata_extractors_by) + + assert os.path.exists(source_path) + if twofile: + assert os.path.exists(metadata_path) + + meta, extractor = get_meta(post, None) + + assert meta + if twofile: + assert extractor is None + else: + assert extractor is metadata_extractors_by['name'][format_lc] + + assert meta['title'] == title + assert meta['slug'] == slug + assert twofile_expected in meta['tags'] + assert twofile_unexpected not in meta['tags'] + assert 'meta' in meta['tags'] + assert format_friendly in meta['tags'] + assert 'reST' in meta['tags'] + assert meta['date'] == '2017-07-01 00:00:00 UTC' + + +@pytest.mark.parametrize("meta_twofile", [(1, "onefile", "twofile"), (2, "twofile", "onefile")]) +def test_nikola_meta_markdown(metadata_extractors_by, meta_twofile): + twofile_number, twofile_expected, twofile_unexpected = meta_twofile + twofile = twofile_number == 2 + + source_filename = 
"f-markdown-{0}-nikola.md".format(twofile_number) + metadata_filename = "f-markdown-{0}-nikola.meta".format(twofile_number) + title = 'T: Markdown, {0}, Nikola'.format(twofile_number) + slug = "s-markdown-{0}-nikola".format(twofile_number) + source_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', source_filename)) + metadata_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', metadata_filename)) + post = FakePost(source_path, metadata_path, {}, None, metadata_extractors_by) + + assert os.path.exists(source_path) + if twofile: + assert os.path.exists(metadata_path) + + meta, extractor = get_meta(post, None) + if twofile: + assert extractor is None + else: + assert extractor is metadata_extractors_by['name']['nikola'] + + assert meta['title'] == title + assert meta['slug'] == slug + assert twofile_expected in meta['tags'] + assert twofile_unexpected not in meta['tags'] + assert 'meta' in meta['tags'] + assert 'Nikola' in meta['tags'] + assert 'Markdown' in meta['tags'] + assert meta['date'] == '2017-07-01 00:00:00 UTC' + + +@pytest.mark.parametrize("compiler_data", [ + (CompileRest, 'rst', 'rest', 'reST'), + (CompileMarkdown, 'md', 'markdown', 'Markdown'), + (CompileIPynb, 'ipynb', 'ipynb', 'Jupyter Notebook'), + (CompileHtml, 'html', 'html', 'HTML'), +]) +def test_compiler_metadata(metadata_extractors_by, compiler_data): + compiler_cls, compiler_ext, compiler_lc, compiler_name = compiler_data + source_filename = "f-{0}-1-compiler.{1}".format(compiler_lc, compiler_ext) + metadata_filename = "f-{0}-1-compiler.meta".format(compiler_lc) + title = 'T: {0}, 1, compiler'.format(compiler_name) + slug = "s-{0}-1-compiler".format(compiler_lc) + source_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', source_filename)) + metadata_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'metadata_extractors', metadata_filename)) + + 
config = {'USE_REST_DOCINFO_METADATA': True, 'MARKDOWN_EXTENSIONS': ['markdown.extensions.meta']} + site = FakeSite() + site.config.update(config) + compiler_obj = compiler_cls() + compiler_obj.set_site(site) + + post = FakePost(source_path, metadata_path, config, compiler_obj, metadata_extractors_by) + + class FakeBorg(): + current_lang = 'en' + + def __call__(self): + return self + + with mock.patch('nikola.plugins.compile.' + compiler_lc + '.LocaleBorg', FakeBorg): + meta, extractor = get_meta(post, None) + + assert meta['title'] == title + assert meta['slug'] == slug + assert 'meta' in meta['tags'] + assert 'onefile' in meta['tags'] + assert 'compiler' in meta['tags'] + assert compiler_name in meta['tags'] + assert meta['date'] == '2017-07-01 00:00:00 UTC' diff --git a/tests/test_rss_feeds.py b/tests/test_rss_feeds.py index 389bb7dda9..17117138e4 100644 --- a/tests/test_rss_feeds.py +++ b/tests/test_rss_feeds.py @@ -48,7 +48,7 @@ def setUp(self): self.blog_url = "http://some.blog" with mock.patch('nikola.post.get_meta', - mock.Mock(return_value=defaultdict(str, { + mock.Mock(return_value=(defaultdict(str, { 'title': 'post title', 'slug': 'awesome_article', 'date': '2012-10-01 22:41', @@ -57,7 +57,7 @@ def setUp(self): 'link': 'link', 'description': 'description', 'enclosure': 'http://www.example.org/foo.mp3', - 'enclosure_length': '5'}))): + 'enclosure_length': '5'}), None))): with mock.patch('nikola.nikola.utils.os.path.isdir', mock.Mock(return_value=True)): with mock.patch('nikola.nikola.Post.text', diff --git a/tests/test_utils.py b/tests/test_utils.py index 5ff14a64e2..5adcf470b3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,7 @@ import mock import os import lxml.html +from nikola import metadata_extractors from nikola.post import get_meta from nikola.utils import ( demote_headers, TranslatableSetting, get_crumbs, TemplateHookRegistry, @@ -12,6 +13,13 @@ class dummy(object): default_lang = 'en' + metadata_extractors_by = 
metadata_extractors.default_metadata_extractors_by() + config = {'TRANSLATIONS_PATTERN': '{path}.{lang}.{ext}', + 'TRANSLATIONS': {'en': './'}, + 'DEFAULT_LANG': 'en'} + + def __init__(self): + metadata_extractors.load_defaults(self, self.metadata_extractors_by) class GetMetaTest(unittest.TestCase): @@ -21,7 +29,7 @@ def test_getting_metadata_from_content(self): ".. date: 2012/09/15 19:52:05\n"\ ".. tags:\n"\ ".. link:\n"\ - ".. description:\n"\ + ".. description:\n\n"\ "Post content\n" opener_mock = mock.mock_open(read_data=file_metadata) @@ -31,7 +39,7 @@ def test_getting_metadata_from_content(self): post.metadata_path = 'file_with_metadata.meta' with mock.patch('nikola.post.io.open', opener_mock, create=True): - meta = get_meta(post) + meta = get_meta(post, None)[0] self.assertEqual('Nikola needs more tests!', meta['title']) self.assertEqual('write-tests-now', meta['slug']) @@ -54,7 +62,7 @@ def test_get_title_from_fname(self): post.metadata_path = 'file_with_metadata.meta' with mock.patch('nikola.post.io.open', opener_mock, create=True): - meta = get_meta(post, 'file_with_metadata') + meta = get_meta(post, None)[0] self.assertEqual('file_with_metadata', meta['title']) self.assertEqual('write-tests-now', meta['slug']) @@ -78,7 +86,7 @@ def test_use_filename_as_slug_fallback(self): post.metadata_path = 'Slugify this.meta' with mock.patch('nikola.post.io.open', opener_mock, create=True): - meta = get_meta(post, 'Slugify this') + meta = get_meta(post, None)[0] self.assertEqual('Nikola needs more tests!', meta['title']) self.assertEqual('slugify-this', meta['slug']) self.assertEqual('2012/09/15 19:52:05', meta['date']) @@ -90,10 +98,10 @@ def test_extracting_metadata_from_filename(self): post = dummy() post.source_path = '2013-01-23-the_slug-dubdubtitle.md' post.metadata_path = '2013-01-23-the_slug-dubdubtitle.meta' + post.config['FILE_METADATA_REGEXP'] = r'(?P\d{4}-\d{2}-\d{2})-(?P.*)-(?P.*)\.md' + post.config['FILE_METADATA_UNSLUGIFY_TITLES'] = False with 
mock.patch('nikola.post.io.open', create=True): - meta = get_meta( - post, - r'(?P<date>\d{4}-\d{2}-\d{2})-(?P<slug>.*)-(?P<title>.*)\.md') + meta = get_meta(post, None)[0] self.assertEqual('dubdubtitle', meta['title']) self.assertEqual('the_slug', meta['slug']) @@ -104,7 +112,7 @@ def test_get_meta_slug_only_from_filename(self): post.source_path = 'some/path/the_slug.md' post.metadata_path = 'some/path/the_slug.meta' with mock.patch('nikola.post.io.open', create=True): - meta = get_meta(post) + meta = get_meta(post, None)[0] self.assertEqual('the_slug', meta['slug']) @@ -276,27 +284,13 @@ def test_dict_input_lang(self): # locale settings returned by LocaleBorg! Use with care! S.lang = 'zz' - try: - u = unicode(S) - except NameError: # Python 3 - u = str(S) - + u = str(S) cn = S() self.assertEqual(inp['zz'], u) self.assertEqual(inp['zz'], cn) -def test_get_metadata_from_file(): - # These were doctests and not running :-P - from nikola.post import _get_metadata_from_file - g = _get_metadata_from_file - assert list(g([]).values()) == [] - assert str(g([".. title: FooBar"])["title"]) == 'FooBar' - assert 'title' not in g(["", "", ".. title: FooBar"]) - assert 'title' in g(["", ".. title: FooBar"]) - - def test_get_asset_path(): assert get_asset_path('assets/css/nikola_rst.css', get_theme_chain('bootstrap3', ['themes'])).replace(