nikola/metadata_extractors.py

# -*- coding: utf-8 -*-

# Copyright © 2012-2017 Chris Warrick, Roberto Alsina and others.

# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""Default metadata extractors and helper functions."""

import re
import natsort

from enum import Enum
from nikola.plugin_categories import MetadataExtractor
from nikola.utils import unslugify

# Names exported by ``from nikola.metadata_extractors import *``.
__all__ = ('MetaCondition', 'MetaPriority', 'MetaSource', 'check_conditions')
# Instances of the built-in extractors; filled in by the ``_register_default`` decorator.
_default_extractors = []
# Name of the extractor that ``classify_extractor`` records as ``DEFAULT_EXTRACTOR``.
DEFAULT_EXTRACTOR_NAME = 'nikola'
# Stays None until ``classify_extractor`` sees an extractor named ``DEFAULT_EXTRACTOR_NAME``.
DEFAULT_EXTRACTOR = None


class MetaCondition(Enum):
    """Conditions for extracting metadata.

    Each condition is paired with an argument and evaluated by ``check_conditions``.
    """

    # Passes when ``config.get(arg)`` is truthy.
    config_bool = 1
    # Passes when ``arg`` is a key of the config.
    config_present = 2
    # Passes when the filename ends with ``arg``.
    extension = 3
    # Passes when a post is given and its compiler's name equals ``arg``.
    compiler = 4
    # Passes when the source text starts with ``arg`` followed by a newline.
    first_line = 5
    # Never passes.
    never = -1


class MetaPriority(Enum):
    """Priority of metadata.

    An extractor is used if and only if the higher-priority extractors returned nothing.
    """

    # NOTE(review): members appear to be listed from highest (override) to lowest
    # (fallback) priority; confirm against the code that consumes these values.
    override = 1
    specialized = 2  # used by the YAML and TOML extractors in this module
    normal = 3  # used by the Nikola-style extractor in this module
    fallback = 4  # used by the filename-regex extractor in this module


class MetaSource(Enum):
    """Source of metadata."""

    # Metadata is read from the text of the source file
    # (Nikola-style, YAML and TOML extractors in this module).
    text = 1
    # Metadata is derived from the file name (filename-regex extractor in this module).
    filename = 2


def check_conditions(post, filename: str, conditions: list, config: dict, source_text: str) -> bool:
    """Tell whether every condition of a metadata extractor is satisfied.

    ``conditions`` is an iterable of ``(condition type, argument)`` pairs.
    The function returns False as soon as one pair fails and True when all
    of them pass (which includes the case of no conditions at all).
    Condition types that are not handled below are treated as satisfied.
    """
    for kind, param in conditions:
        if kind == MetaCondition.config_bool:
            # The config value must be truthy.
            if not config.get(param, False):
                return False
        elif kind == MetaCondition.config_present:
            # The key only has to exist; its value is irrelevant.
            if param not in config:
                return False
        elif kind == MetaCondition.extension:
            if not filename.endswith(param):
                return False
        elif kind == MetaCondition.compiler:
            # Without a post there is no compiler to compare against.
            if post is None or post.compiler.name != param:
                return False
        elif kind == MetaCondition.never:
            return False
        elif kind == MetaCondition.first_line:
            # The marker must be the complete first line of the text.
            if not (source_text and source_text.startswith(param + '\n')):
                return False
    return True


def classify_extractor(extractor: MetadataExtractor, metadata_extractors_by: dict):
    """Register an extractor in every index of ``metadata_extractors_by``.

    The extractor is appended to the list for its priority, to the list for
    its source and to the ``'all'`` list, and it is stored under its name.
    If its name equals ``DEFAULT_EXTRACTOR_NAME``, it also becomes the
    module-level ``DEFAULT_EXTRACTOR``.
    """
    global DEFAULT_EXTRACTOR
    if DEFAULT_EXTRACTOR_NAME == extractor.name:
        DEFAULT_EXTRACTOR = extractor

    by_priority = metadata_extractors_by['priority']
    by_priority[extractor.priority].append(extractor)

    by_source = metadata_extractors_by['source']
    by_source[extractor.source].append(extractor)

    by_name = metadata_extractors_by['name']
    by_name[extractor.name] = extractor

    everything = metadata_extractors_by['all']
    everything.append(extractor)


def load_defaults(site: 'nikola.nikola.Nikola', metadata_extractors_by: dict):
    """Attach ``site`` to each default extractor and classify it.

    Every instance in ``_default_extractors`` gets its ``site`` attribute
    set and is then passed to ``classify_extractor`` together with
    ``metadata_extractors_by``.
    """
    for default_extractor in _default_extractors:
        setattr(default_extractor, 'site', site)
        classify_extractor(default_extractor, metadata_extractors_by)


def is_extractor(extractor) -> bool:  # pragma: no cover
    """Check if a given object is a metadata extractor.

    Returns True when ``extractor`` is an instance of ``MetadataExtractor``
    (or of one of its subclasses). The check is made on instances, not on
    classes.
    """
    return isinstance(extractor, MetadataExtractor)


def default_metadata_extractors_by() -> dict:
    """Build a fresh, empty ``metadata_extractors_by`` dictionary.

    The result has an empty list for every ``MetaPriority`` member under
    ``'priority'``, an empty list for every ``MetaSource`` member under
    ``'source'``, an empty dict under ``'name'`` and an empty list under
    ``'all'``.
    """
    return {
        'priority': {priority: [] for priority in MetaPriority},
        'source': {source: [] for source in MetaSource},
        'name': {},
        'all': [],
    }


def _register_default(extractor: MetadataExtractor) -> MetadataExtractor:
    """Instantiate ``extractor`` and remember the instance as a default.

    Used as a class decorator: the argument is called without arguments,
    the result is added to ``_default_extractors``, and the argument itself
    is returned unchanged.
    """
    instance = extractor()
    _default_extractors.append(instance)
    return extractor


@_register_default
class NikolaMetadata(MetadataExtractor):
    """Extractor for Nikola-style (``.. key: value``) metadata."""

    name = 'nikola'
    source = MetaSource.text
    priority = MetaPriority.normal
    supports_write = True
    # A blank line separates the metadata from what follows it.
    split_metadata_re = re.compile('\n\n')
    # One metadata line: optional leading whitespace, '.. ', key, ': ', value.
    nikola_re = re.compile(r'^\s*\.\. (.*?): (.*)')

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Collect ``.. key: value`` pairs from the given text.

        Lines that do not match the pattern are skipped, and so are keys
        whose value is empty. A key that occurs several times keeps its
        last non-empty value.
        """
        found = {}
        for row in source_text.split('\n'):
            hit = self.nikola_re.match(row)
            if not hit:
                continue
            key, value = hit.groups()
            if value:
                found[key] = value
        return found

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Serialize ``metadata`` as Nikola-style metadata lines.

        Well-known fields are written first, in a fixed order; all other
        fields follow, sorted with natsort (``ns.F | ns.IC``). The input
        dictionary is not modified.

        ``comment_wrap`` may be False (no wrapping), True (wrap in an HTML
        comment) or a pair of opening/closing strings to wrap the lines in.
        """
        remaining = metadata.copy()
        template = '.. {0}: {1}'
        lines = []
        # Well-known fields first, skipping those that are absent.
        for key in ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type'):
            try:
                value = remaining.pop(key)
            except KeyError:
                continue
            lines.append(template.format(key, value))
        # Whatever is left is user-specified/non-default metadata.
        leftovers = natsort.natsorted(list(remaining.keys()), alg=natsort.ns.F | natsort.ns.IC)
        lines.extend(template.format(key, remaining[key]) for key in leftovers)
        body = '\n'.join(lines)

        if not comment_wrap:
            return body + '\n\n'
        if comment_wrap is True:
            opener, closer = '<!--', '-->'
        else:
            opener, closer = comment_wrap[0], comment_wrap[1]
        return '\n'.join((opener, body, closer, '', ''))


@_register_default
class YAMLMetadata(MetadataExtractor):
    """Extractor for YAML metadata (front matter delimited by ``---`` lines)."""

    name = 'yaml'
    source = MetaSource.text
    # Only applies to text whose first line is exactly '---'.
    conditions = ((MetaCondition.first_line, '---'),)
    requirements = [('yaml', 'PyYAML', 'YAML')]
    supports_write = True
    # The closing '---' line ends the metadata block.
    split_metadata_re = re.compile('\n---\n')
    map_from = 'yaml'
    priority = MetaPriority.specialized

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Extract metadata from text.

        The first four characters (the opening ``---`` line and its
        newline) are skipped and the rest is parsed with
        ``yaml.safe_load``. Keys whose value is None are normalized to
        the empty string.

        Returns an empty dict when the front matter is empty, because
        ``yaml.safe_load`` yields None for an empty document.
        """
        import yaml
        meta = yaml.safe_load(source_text[4:])
        if meta is None:
            # Empty front matter: there is nothing to iterate over.
            return {}
        # We expect empty metadata to be '', not None
        for k in meta:
            if meta[k] is None:
                meta[k] = ''
        return meta

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Write metadata in this extractor's format.

        The result is the block-style YAML dump of ``metadata`` between two
        ``---`` lines, followed by a newline. ``comment_wrap`` is accepted
        for signature compatibility and is not used here.
        """
        import yaml
        return '\n'.join(('---', yaml.safe_dump(metadata, default_flow_style=False).strip(), '---', ''))


@_register_default
class TOMLMetadata(MetadataExtractor):
    """Extractor for TOML metadata (front matter delimited by ``+++`` lines)."""

    name = 'toml'
    source = MetaSource.text
    # Only applies to text whose first line is exactly '+++'.
    conditions = ((MetaCondition.first_line, '+++'),)
    requirements = [('toml', 'toml', 'TOML')]
    supports_write = True
    # The closing '+++' line ends the metadata block.
    split_metadata_re = re.compile('\n\\+\\+\\+\n')
    map_from = 'toml'
    priority = MetaPriority.specialized

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Parse the TOML that follows the opening ``+++`` line.

        The first four characters of ``source_text`` are skipped and the
        remainder is handed to ``toml.loads``.
        """
        import toml
        body = source_text[4:]
        return toml.loads(body)

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Serialize ``metadata`` as TOML between two ``+++`` lines.

        The result ends with a newline. ``comment_wrap`` is accepted for
        signature compatibility and is not used here.
        """
        import toml
        serialized = toml.dumps(metadata).strip()
        return '\n'.join(['+++', serialized, '+++', ''])


@_register_default
class FilenameRegexMetadata(MetadataExtractor):
    """Extractor for filename metadata."""

    name = 'filename_regex'
    source = MetaSource.filename
    priority = MetaPriority.fallback
    # Only used when the FILE_METADATA_REGEXP config value is truthy.
    conditions = [(MetaCondition.config_bool, 'FILE_METADATA_REGEXP')]

    def extract_filename(self, filename: str, lang: str) -> dict:
        """Try to read the metadata from the filename based on the given re.

        This requires to use symbolic group names in the pattern.
        The part to read the metadata from the filename based on a regular
        expression is taken from Pelican - pelican/readers.py

        The pattern comes from the ``FILE_METADATA_REGEXP`` config value and
        is matched at the start of ``filename``. Every named group becomes a
        metadata key (lowercased and stripped). When the
        ``FILE_METADATA_UNSLUGIFY_TITLES`` config value is truthy, a matched
        ``title`` is passed through ``unslugify`` with ``lang``.

        Returns an empty dict when the pattern does not match. Named groups
        that did not take part in the match are stored with the value None.
        """
        match = re.match(self.site.config['FILE_METADATA_REGEXP'], filename)
        meta = {}

        if match:
            for key, value in match.groupdict().items():
                k = key.lower().strip()  # metadata must be lowercase
                # An optional group that did not participate yields None;
                # that is not a string and must not be handed to unslugify.
                if (k == 'title' and value is not None and
                        self.site.config['FILE_METADATA_UNSLUGIFY_TITLES']):
                    meta[k] = unslugify(value, lang, discard_numbers=False)
                else:
                    meta[k] = value

        return meta