Skip to content
Permalink
Browse files

Extracting raw metadata extraction/split functions to utils package.

  • Loading branch information
felixfontein committed Jul 1, 2017
1 parent c94951d commit 2c090ae146b3adaded0b8d9df7f8dcf1e641f1ab
Showing with 115 additions and 69 deletions.
  1. +3 −12 nikola/plugin_categories.py
  2. +8 −56 nikola/post.py
  3. +104 −1 nikola/utils.py
@@ -28,13 +28,12 @@

import sys
import os
import re
import io

from yapsy.IPlugin import IPlugin
from doit.cmd_base import Command as DoitCommand

from .utils import LOGGER, first_line
from .utils import LOGGER, first_line, split_metadata

__all__ = (
'Command',
@@ -329,16 +328,8 @@ def split_metadata(self, data):
This splits in the first empty line that is NOT at the beginning
of the document, or after YAML/TOML metadata without an empty line.
"""
if data.startswith('---'): # YAML metadata
split_result = re.split('(\n---\n|\r\n---\r\n)', data.lstrip(), maxsplit=1)
elif data.startswith('+++'): # TOML metadata
split_result = re.split('(\n\\+\\+\\+\n|\r\n\\+\\+\\+\r\n)', data.lstrip(), maxsplit=1)
else:
split_result = re.split('(\n\n|\r\n\r\n)', data.lstrip(), maxsplit=1)
if len(split_result) == 1:
return '', split_result[0]
# ['metadata', '\n\n', 'post content']
return split_result[0], split_result[-1]
meta, content, _ = split_metadata(data)
return meta, content

def get_compiler_extensions(self):
"""Activate all the compiler extension plugins for a given compiler and return them."""
@@ -937,19 +937,8 @@ def source_ext(self, prefix=False):
# Code that fetches metadata from different places


def re_meta(line, match=None):
"""Find metadata using regular expressions."""
if match:
reStr = re.compile('^\.\. {0}: (.*)'.format(re.escape(match)))
else:
reStr = re.compile('^\.\. (.*?): (.*)')
result = reStr.findall(line.strip())
if match and result:
return (match, result[0])
elif not match and result:
return (result[0][0], result[0][1].strip())
else:
return (None,)
# For backwards compatibility
re_meta = utils.re_meta


def _get_metadata_from_filename_by_regex(filename, metadata_regexp, unslugify_titles, lang):
@@ -982,8 +971,8 @@ def get_metadata_from_file(source_path, config=None, lang=None):
elif lang:
source_path += '.' + lang
with io.open(source_path, "r", encoding="utf-8-sig") as meta_file:
meta_data = [x.strip() for x in meta_file.readlines()]
return _get_metadata_from_file(meta_data, config)
file_lines = [x.strip() for x in meta_file.readlines()]
return _get_metadata_from_file(file_lines, config)
except (UnicodeDecodeError, UnicodeEncodeError):
msg = 'Error reading {0}: Nikola only supports UTF-8 files'.format(source_path)
LOGGER.error(msg)
@@ -999,52 +988,15 @@ def get_metadata_from_file(source_path, config=None, lang=None):
string.punctuation)))


def _get_metadata_from_file(meta_data, config=None):
def _get_metadata_from_file(file_lines, config=None):
"""Extract metadata from a post's source file."""
meta = {}
if not meta_data:
return meta

# Skip up to one empty line at the beginning (for txt2tags)
if not meta_data[0]:
meta_data = meta_data[1:]

# If 1st line is '---', then it's YAML metadata
if meta_data[0] == '---':
if yaml is None:
utils.req_missing('pyyaml', 'use YAML metadata', optional=True)
raise ValueError('Error parsing metadata')
idx = meta_data.index('---', 1)
meta = yaml.safe_load('\n'.join(meta_data[1:idx]))
# We expect empty metadata to be '', not None
for k in meta:
if meta[k] is None:
meta[k] = ''
meta, type = utils.extract_metadata(file_lines)
if type == 'yaml':
# Map metadata from other platforms to names Nikola expects (Issue #2817)
map_metadata(meta, 'yaml', config)
return meta

# If 1st line is '+++', then it's TOML metadata
if meta_data[0] == '+++':
if toml is None:
utils.req_missing('toml', 'use TOML metadata', optional=True)
raise ValueError('Error parsing metadata')
idx = meta_data.index('+++', 1)
meta = toml.loads('\n'.join(meta_data[1:idx]))
if type == 'toml':
# Map metadata from other platforms to names Nikola expects (Issue #2817)
map_metadata(meta, 'toml', config)
return meta

# First, get metadata from the beginning of the file,
# up to first empty line

for i, line in enumerate(meta_data):
if not line:
break
match = re_meta(line)
if match[0]:
meta[match[0]] = match[1]

return meta


@@ -97,7 +97,7 @@
'adjust_name_for_index_path', 'adjust_name_for_index_link',
'NikolaPygmentsHTML', 'create_redirect', 'clean_before_deployment',
'sort_posts', 'indent', 'load_data', 'html_unescape', 'rss_writer',
'map_metadata',
'map_metadata', 're_meta', 'extract_metadata', 'split_metadata',
# Deprecated, moved to hierarchy_utils:
'TreeNode', 'clone_treenode', 'flatten_tree_structure',
'sort_classifications', 'join_hierarchical_category_path',
@@ -2028,3 +2028,106 @@ def read_from_config(self, site, basename, posts_per_classification_per_language
args = {'translation_manager': self, 'site': site,
'posts_per_classification_per_language': posts_per_classification_per_language}
signal('{}_translations_config'.format(basename.lower())).send(args)


def re_meta(line, match=None):
    """Find metadata using regular expressions.

    Recognizes lines of the form ``.. name: value`` (after stripping
    surrounding whitespace from ``line``).

    If ``match`` is given, only a field with exactly that name is accepted
    and ``(match, value)`` is returned. Otherwise any field name is accepted
    and ``(name, value)`` is returned, with ``value`` stripped.

    Returns the one-element tuple ``(None,)`` when the line is not a
    matching metadata line.
    """
    # Raw strings: '\.' is an invalid escape sequence in a normal literal.
    if match:
        pattern = r'^\.\. {0}: (.*)'.format(re.escape(match))
    else:
        pattern = r'^\.\. (.*?): (.*)'

    found = re.match(pattern, line.strip())
    if found is None:
        return (None,)
    if match:
        return (match, found.group(1))
    return (found.group(1), found.group(2).strip())


def extract_metadata(file_lines):
    """Extract metadata from the lines of a file.

    ``file_lines`` is a list of lines. Delimiters and blank lines are
    detected by comparing whole list elements, so the lines are expected
    to be stripped of surrounding whitespace already.

    Returns a pair ``(meta, type)``, where ``meta`` is the
    metadata dictionary and ``type`` the metadata format.

    Valid values for ``type`` are:

    * ``'none'``: no metadata was found (file was empty)
    * ``'yaml'``: metadata in YAML format
    * ``'toml'``: metadata in TOML format
    * ``'nikola'``: metadata in the standard Nikola reST-like
      format (``.. name: value`` lines)

    Raises ``ValueError`` if YAML/TOML metadata is detected but the
    corresponding parser module is not available, or if the closing
    ``---``/``+++`` delimiter line is missing.
    """
    meta = {}
    if not file_lines:
        return meta, 'none'

    # Skip up to one empty line at the beginning (for txt2tags)
    if not file_lines[0]:
        file_lines = file_lines[1:]
        # The file consisted of that single empty line only.
        if not file_lines:
            return meta, 'none'

    # If 1st line is '---', then it's YAML metadata
    if file_lines[0] == '---':
        if yaml is None:
            req_missing('pyyaml', 'use YAML metadata', optional=True)
            raise ValueError('Error parsing metadata')
        idx = file_lines.index('---', 1)
        # safe_load returns None for an empty document; normalize to a dict.
        meta = yaml.safe_load('\n'.join(file_lines[1:idx])) or {}
        # We expect empty metadata to be '', not None
        for k in meta:
            if meta[k] is None:
                meta[k] = ''
        return meta, 'yaml'

    # If 1st line is '+++', then it's TOML metadata
    if file_lines[0] == '+++':
        if toml is None:
            req_missing('toml', 'use TOML metadata', optional=True)
            raise ValueError('Error parsing metadata')
        idx = file_lines.index('+++', 1)
        meta = toml.loads('\n'.join(file_lines[1:idx]))
        return meta, 'toml'

    # Otherwise, collect ``.. name: value`` fields from the beginning of
    # the file, up to the first empty line.
    for line in file_lines:
        if not line:
            break
        match = re_meta(line)
        if match[0]:
            meta[match[0]] = match[1]

    return meta, 'nikola'


def split_metadata(data):
    """Split data from metadata in the raw post content.

    Leading whitespace is ignored. The split happens at the first empty
    line that is NOT at the beginning of the document, or, for YAML/TOML
    metadata, at the closing ``---``/``+++`` delimiter line (no empty line
    is required after it).

    Returns a tuple ``(meta, content, type)`` where ``meta`` and
    ``content`` are parts of ``data``, and ``type`` is the metadata
    format.

    Valid values for ``type`` are:

    * ``'none'``: no separator was found; ``meta`` is ``''`` and
      ``content`` is the whole (left-stripped) document
    * ``'yaml'``: metadata in YAML format
    * ``'toml'``: metadata in TOML format
    * ``'nikola'``: metadata in the standard Nikola reST-like format
    """
    # Strip once, and sniff the format on the same text that is split, so
    # that a leading blank line does not hide a '---'/'+++' opener.
    text = data.lstrip()

    if text.startswith('---'):  # YAML metadata
        separator = r'(\n---\n|\r\n---\r\n)'
        meta_type = 'yaml'
    elif text.startswith('+++'):  # TOML metadata
        separator = r'(\n\+\+\+\n|\r\n\+\+\+\r\n)'
        meta_type = 'toml'
    else:
        separator = r'(\n\n|\r\n\r\n)'
        meta_type = 'nikola'

    split_result = re.split(separator, text, maxsplit=1)
    if len(split_result) == 1:
        return '', split_result[0], 'none'
    # ['metadata', '\n\n', 'post content']
    return split_result[0], split_result[-1], meta_type

0 comments on commit 2c090ae

Please sign in to comment.