Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use shortcodes in a more robust manner #2737

Merged
merged 24 commits into from May 10, 2017
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -26,6 +26,8 @@ Features
Bugfixes
--------

* More robust shortcodes: no need to escape URLs in reST, works better
with LaTeX, etc.
* No longer creates empty subarchive pages, and no longer creates broken
archive navigation links on day level (Issue #2734)
* Fixes post scanner plugin order (Issue #2720)
Expand Down
17 changes: 16 additions & 1 deletion nikola/nikola.py
Expand Up @@ -1686,12 +1686,27 @@ def register_shortcode(self, name, f):
self.shortcode_registry[name] = f

# XXX in v8, get rid of with_dependencies
def apply_shortcodes(self, data, filename=None, lang=None, with_dependencies=False, extra_context={}):
def apply_shortcodes(self, data, filename=None, lang=None, with_dependencies=False, extra_context=None):
    """Expand the shortcodes found in data using this site's registry.

    A missing ``extra_context`` becomes a fresh empty dict and a missing
    ``lang`` falls back to the current language of ``utils.LocaleBorg()``.
    Everything else is forwarded to ``shortcodes.apply_shortcodes`` and its
    return value is handed back unchanged.
    """
    # A new dict per call, so no state is shared between invocations.
    context = {} if extra_context is None else extra_context
    language = utils.LocaleBorg().current_lang if lang is None else lang
    return shortcodes.apply_shortcodes(
        data,
        self.shortcode_registry,
        self,
        filename,
        lang=language,
        with_dependencies=with_dependencies,
        extra_context=context
    )

def apply_shortcodes_uuid(self, data, _shortcodes, filename=None, lang=None, with_dependencies=False, extra_context=None):
    """Replace shortcode placeholders in data with their rendered output.

    ``_shortcodes`` maps placeholder strings to raw shortcode text.  Each
    value is rendered with ``shortcodes.apply_shortcodes`` and every
    occurrence of its key in ``data`` is replaced by the rendered text.

    Returns a tuple ``(data, deps)`` where ``deps`` is the concatenation of
    the dependency lists reported for the individual shortcodes.
    """
    language = utils.LocaleBorg().current_lang if lang is None else lang
    context = {} if extra_context is None else extra_context
    collected_deps = []
    for placeholder, raw_shortcode in _shortcodes.items():
        # NOTE(review): the two-value unpacking below presumably only works
        # when with_dependencies is true (the only way this method is called
        # in the reST compiler) -- confirm against shortcodes.apply_shortcodes
        # and consider always passing with_dependencies=True here.
        rendered, shortcode_deps = shortcodes.apply_shortcodes(
            raw_shortcode,
            self.shortcode_registry,
            self,
            filename,
            lang=language,
            with_dependencies=with_dependencies,
            extra_context=context
        )
        data = data.replace(placeholder, rendered)
        collected_deps.extend(shortcode_deps)
    return data, collected_deps

def _get_rss_copyright(self, lang, rss_plain):
if rss_plain:
return (
Expand Down
7 changes: 5 additions & 2 deletions nikola/plugins/compile/rest/__init__.py
Expand Up @@ -81,14 +81,17 @@ def compile_string(self, data, source_path=None, is_two_file=True, post=None, la
'language_code': LEGAL_VALUES['DOCUTILS_LOCALES'].get(LocaleBorg().current_lang, 'en')
}

from nikola import shortcodes as sc
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about other compilers? Also, I think it might be a slightly better-looking API to put extract_shortcodes in the Nikola object as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kwpolska Since the old API still works, later branches can port each one to the new API.

I did not put extract_shortcodes in the Nikola object mostly because there's nothing site-specific in it, and I am half convinced to use a real tokenizer in shortcodes.py which would make things awkward if things need refactoring.

new_data, shortcodes = sc.extract_shortcodes(data)
output, error_level, deps = rst2html(
data, settings_overrides=settings_overrides, logger=self.logger, source_path=source_path, l_add_ln=add_ln, transforms=self.site.rst_transforms,
new_data, settings_overrides=settings_overrides, logger=self.logger, source_path=source_path, l_add_ln=add_ln, transforms=self.site.rst_transforms,
no_title_transform=self.site.config.get('NO_DOCUTILS_TITLE_TRANSFORM', False))
if not isinstance(output, unicode_str):
# To prevent some weird bugs here or there.
# Original issue: empty files. `output` became a bytestring.
output = output.decode('utf-8')
output, shortcode_deps = self.site.apply_shortcodes(output, filename=source_path, with_dependencies=True, extra_context=dict(post=post))

output, shortcode_deps = self.site.apply_shortcodes_uuid(output, shortcodes, filename=source_path, with_dependencies=True, extra_context=dict(post=post))
return output, error_level, deps, shortcode_deps

# TODO remove in v8
Expand Down
92 changes: 70 additions & 22 deletions nikola/shortcodes.py
Expand Up @@ -27,14 +27,11 @@
"""Support for Hugo-style shortcodes."""

from __future__ import unicode_literals
from .utils import LOGGER
import sys

import uuid

# Constants
_TEXT = 1
_SHORTCODE_START = 2
_SHORTCODE_END = 3
from .utils import LOGGER
import sys


class ParsingError(Exception):
Expand Down Expand Up @@ -83,11 +80,10 @@ def _skip_whitespace(data, pos, must_be_nontrivial=False):

def _skip_nonwhitespace(data, pos):
"""Return first position not before pos which contains a non-whitespace character."""
while pos < len(data):
if data[pos].isspace():
break
pos += 1
return pos
for i, x in enumerate(data[pos:]):
if x.isspace():
return pos + i
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now you are iterating over the rest of the string twice in case no space is found.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No I am not? Sorry, it's late and I don't see it :-)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The for loop tries to find the first whitespace character. If it doesn't find anything, the while loop (which is still there below) does the same thing afterwards, and it also won't find anything, so finally len(data) will be returned. You can replace the while loop with return len(data).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I forgot to delete all that. Will fix.

return len(data)


def _parse_quoted_string(data, start):
Expand Down Expand Up @@ -209,14 +205,66 @@ def _parse_shortcode_args(data, start, shortcode_name, start_pos):
raise ParsingError("Shortcode '{0}' starting at {1} is not terminated correctly with '%}}}}'!".format(shortcode_name, _format_position(data, start_pos)))


def _new_sc_id():
return str('SHORTCODE{0}REPLACEMENT'.format(str(uuid.uuid4()).replace('-', '')))


def extract_shortcodes(data):
    """Replace every shortcode in data with a unique placeholder.

    Returns a tuple ``(text, shortcodes)``:

    * ``text`` is ``data`` with each shortcode replaced by a placeholder
      obtained from ``_new_sc_id()``.
    * ``shortcodes`` is a dictionary mapping each placeholder to the raw
      shortcode text it stands for, ready to be processed later.

    An opening shortcode is extracted together with everything up to and
    including the first following closing tag of the same name, so nested
    shortcodes stay inside their parent's raw text.  An opening shortcode
    that is never closed is extracted on its own.

    Raises ParsingError when a closing tag is found that does not close
    any open shortcode; errors raised by ``_split_shortcodes`` propagate.
    """
    tokens = _split_shortcodes(data)
    total = len(tokens)
    shortcodes = {}
    text = []
    pos = 0
    # Walk the token list by index: no tail lists are copied on every step
    # and an empty token list simply yields ('', {}).
    while pos < total:
        token = tokens[pos]
        kind = token[0]
        if kind == 'TEXT':
            text.append(token[1])
            pos += 1
        elif kind == 'SHORTCODE_START':
            name = token[3]
            sc_id = _new_sc_id()
            text.append(sc_id)
            # Look for the first closing tag with the same name.
            end = next(
                (j for j in range(pos + 1, total)
                 if tokens[j][0] == 'SHORTCODE_END' and tokens[j][3] == name),
                None)
            if end is None:
                # Doesn't close: the opening tag alone is the shortcode.
                shortcodes[sc_id] = token[1]
                pos += 1
            else:
                # Keep the raw text of everything from opening to closing tag.
                shortcodes[sc_id] = ''.join(t[1] for t in tokens[pos:end + 1])
                pos = end + 1
        else:
            # The only remaining token kind is SHORTCODE_END, and a closing
            # tag reached here was not consumed by any opening shortcode.
            raise ParsingError('Closing unopened shortcode {}'.format(token[3]))
    return ''.join(text), shortcodes


def _split_shortcodes(data):
"""Given input data, splits it into a sequence of texts, shortcode starts and shortcode ends.

Returns a list of tuples of the following forms:

1. (_TEXT, text)
2. (_SHORTCODE_START, text, start, name, args)
3. (_SHORTCODE_END, text, start, name)
1. ("TEXT", text)
2. ("SHORTCODE_START", text, start, name, args)
3. ("SHORTCODE_END", text, start, name)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you do these replacements? Before, you'd get a runtime error in case you made a typo (because the symbol with the typo doesn't exist). Now you won't notice anything until something behaves strangely, and then you have to do some debugging to find out that you made a typo.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because reading the parsed code made my head hurt.


Here, text is the raw text represented by the token; start is the starting position in data
of the token; name is the name of the shortcode; and args is a tuple (args, kw) as returned
Expand All @@ -228,9 +276,9 @@ def _split_shortcodes(data):
# Search for shortcode start
start = data.find('{{%', pos)
if start < 0:
result.append((_TEXT, data[pos:]))
result.append(("TEXT", data[pos:]))
break
result.append((_TEXT, data[pos:start]))
result.append(("TEXT", data[pos:start]))
# Extract name
name_start = _skip_whitespace(data, start + 3)
name_end = _skip_nonwhitespace(data, name_start)
Expand All @@ -246,13 +294,13 @@ def _split_shortcodes(data):
# Must be followed by '%}}'
if pos > len(data) or data[end_start:pos] != '%}}':
raise ParsingError("Syntax error: '{{{{% /{0}' must be followed by ' %}}}}' ({1})!".format(name, _format_position(data, end_start)))
result.append((_SHORTCODE_END, data[start:pos], start, name))
result.append(("SHORTCODE_END", data[start:pos], start, name))
elif name == '%}}':
raise ParsingError("Syntax error: '{{{{%' must be followed by shortcode name ({0})!".format(_format_position(data, start)))
else:
# This is an opening shortcode
pos, args = _parse_shortcode_args(data, name_end, shortcode_name=name, start_pos=start)
result.append((_SHORTCODE_START, data[start:pos], start, name, args))
result.append(("SHORTCODE_START", data[start:pos], start, name, args))
return result


Expand Down Expand Up @@ -284,17 +332,17 @@ def apply_shortcodes(data, registry, site=None, filename=None, raise_exceptions=
pos = 0
while pos < len(sc_data):
current = sc_data[pos]
if current[0] == _TEXT:
if current[0] == "TEXT":
result.append(current[1])
pos += 1
elif current[0] == _SHORTCODE_END:
elif current[0] == "SHORTCODE_END":
raise ParsingError("Found shortcode ending '{{{{% /{0} %}}}}' which isn't closing a started shortcode ({1})!".format(current[3], _format_position(data, current[2])))
elif current[0] == _SHORTCODE_START:
elif current[0] == "SHORTCODE_START":
name = current[3]
# Check if we can find corresponding ending
found = None
for p in range(pos + 1, len(sc_data)):
if sc_data[p][0] == _SHORTCODE_END and sc_data[p][3] == name:
if sc_data[p][0] == "SHORTCODE_END" and sc_data[p][3] == name:
found = p
break
if found:
Expand Down
4 changes: 4 additions & 0 deletions tests/base.py
Expand Up @@ -230,3 +230,7 @@ def register_shortcode(self, name, f):
def apply_shortcodes(self, data, *a, **kw):
    """Run data through nikola.shortcodes.apply_shortcodes with this fake site's registry.

    Extra positional arguments are accepted but ignored; keyword arguments
    are forwarded unchanged.
    """
    registry = self.shortcode_registry
    return nikola.shortcodes.apply_shortcodes(data, registry, **kw)

def apply_shortcodes_uuid(self, data, shortcodes, *a, **kw):
    """Run data through nikola.shortcodes.apply_shortcodes with this fake site's registry.

    The ``shortcodes`` mapping and any extra positional arguments are
    accepted but not used; keyword arguments are forwarded unchanged.
    """
    # NOTE(review): since ``shortcodes`` is ignored, placeholders present in
    # ``data`` are presumably left unreplaced by this test double -- confirm
    # that is what the tests relying on it expect.
    registry = self.shortcode_registry
    return nikola.shortcodes.apply_shortcodes(data, registry, **kw)
20 changes: 20 additions & 0 deletions tests/test_shortcodes.py
Expand Up @@ -74,3 +74,23 @@ def test_errors(self):
self.assertRaisesRegexp(shortcodes.ParsingError, "^Found shortcode ending '{{% / %}}' which isn't closing a started shortcode", shortcodes.apply_shortcodes, '{{% / %}}', self.fakesite.shortcode_registry, raise_exceptions=True)
self.assertRaisesRegexp(shortcodes.ParsingError, "^Syntax error: '{{% /' must be followed by ' %}}'", shortcodes.apply_shortcodes, '{{% / a %}}', self.fakesite.shortcode_registry, raise_exceptions=True)
self.assertRaisesRegexp(shortcodes.ParsingError, "^Shortcode '<==' starting at .* is not terminated correctly with '%}}'!", shortcodes.apply_shortcodes, '==> {{% <==', self.fakesite.shortcode_registry, raise_exceptions=True)


@pytest.mark.parametrize("input, expected", [
('{{% foo %}}', (u'SC1', {u'SC1': u'{{% foo %}}'})),
('{{% foo %}} bar {{% /foo %}}', (u'SC1', {u'SC1': u'{{% foo %}} bar {{% /foo %}}'})),
('AAA{{% foo %}} bar {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} bar {{% /foo %}}'})),
('AAA{{% foo %}} {{% bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% bar %}} {{% /foo %}}'})),
('AAA{{% foo %}} {{% /bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% /bar %}} {{% /foo %}}'})),
('AAA{{% foo %}} {{% bar %}} quux {{% /bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% bar %}} quux {{% /bar %}} {{% /foo %}}'})),
('AAA{{% foo %}} BBB {{% bar %}} quux {{% /bar %}} CCC', (u'AAASC1 BBB SC2 CCC', {u'SC1': u'{{% foo %}}', u'SC2': u'{{% bar %}} quux {{% /bar %}}'})),
])
def test_extract_shortcodes(input, expected, monkeypatch):

i = iter('SC%d' % i for i in range(1, 100))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should change this to use itertools.count anyway

if sys.version[0] < "3":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if sys.version_info[0] < 3:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, that's what it was and it failed :-P

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you did sys.version[0] < 3.

According to the documentation of sys.version: "Do not extract version information out of it, rather, use version_info [...]" (https://docs.python.org/2/library/sys.html#sys.version_info, https://docs.python.org/3/library/sys.html#sys.version)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhhh right.

monkeypatch.setattr(shortcodes, '_new_sc_id', i.next)
else:
monkeypatch.setattr(shortcodes, '_new_sc_id', i.__next__)
extracted = shortcodes.extract_shortcodes(input)
assert extracted == expected