getnikola · ralsina · May 10, 2017 · Apr 19, 2017 · Apr 28, 2017 · Apr 28, 2017
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -26,6 +26,8 @@ Features
 Bugfixes
 --------
 
+* More robust shortcodes: no need to escape URLs in reST, and they work
+  better with LaTeX, etc.
 * No longer creates empty subarchive pages, and no longer create broken
   archive navigation links on day level (Issue #2734)
 * Fixes post scanner plugin order (Issue #2720)

diff --git a/nikola/nikola.py b/nikola/nikola.py
@@ -1686,12 +1686,27 @@ def register_shortcode(self, name, f):
         self.shortcode_registry[name] = f
 
     # XXX in v8, get rid of with_dependencies
-    def apply_shortcodes(self, data, filename=None, lang=None, with_dependencies=False, extra_context={}):
+    def apply_shortcodes(self, data, filename=None, lang=None, with_dependencies=False, extra_context=None):
         """Apply shortcodes from the registry on data."""
+        if extra_context is None:  # None means "no extra context": build a fresh dict per call instead of sharing a mutable default
+            extra_context = {}
         if lang is None:
             lang = utils.LocaleBorg().current_lang
         return shortcodes.apply_shortcodes(data, self.shortcode_registry, self, filename, lang=lang, with_dependencies=with_dependencies, extra_context=extra_context)
 
+    def apply_shortcodes_uuid(self, data, _shortcodes, filename=None, lang=None, with_dependencies=False, extra_context=None):
+        """Replace each placeholder key of _shortcodes found in data by its rendered shortcode; return (data, deps)."""
+        lang = utils.LocaleBorg().current_lang if lang is None else lang
+        extra_context = {} if extra_context is None else extra_context
+        deps = []
+        for placeholder, raw_shortcode in _shortcodes.items():
+            # with_dependencies is forced on: the (text, deps) pair is unpacked below no matter what the
+            # caller passed, and this method always returns dependencies anyway.
+            replacement, _deps = shortcodes.apply_shortcodes(raw_shortcode, self.shortcode_registry, self, filename, lang=lang, with_dependencies=True, extra_context=extra_context)
+            data = data.replace(placeholder, replacement)
+            deps.extend(_deps)
+        return data, deps
+
     def _get_rss_copyright(self, lang, rss_plain):
         if rss_plain:
             return (

diff --git a/nikola/plugins/compile/rest/__init__.py b/nikola/plugins/compile/rest/__init__.py
@@ -81,14 +81,17 @@ def compile_string(self, data, source_path=None, is_two_file=True, post=None, la
             'language_code': LEGAL_VALUES['DOCUTILS_LOCALES'].get(LocaleBorg().current_lang, 'en')
         }
 
+        from nikola import shortcodes as sc
+        new_data, shortcodes = sc.extract_shortcodes(data)
         output, error_level, deps = rst2html(
-            data, settings_overrides=settings_overrides, logger=self.logger, source_path=source_path, l_add_ln=add_ln, transforms=self.site.rst_transforms,
+            new_data, settings_overrides=settings_overrides, logger=self.logger, source_path=source_path, l_add_ln=add_ln, transforms=self.site.rst_transforms,
             no_title_transform=self.site.config.get('NO_DOCUTILS_TITLE_TRANSFORM', False))
         if not isinstance(output, unicode_str):
             # To prevent some weird bugs here or there.
             # Original issue: empty files.  `output` became a bytestring.
             output = output.decode('utf-8')
-        output, shortcode_deps = self.site.apply_shortcodes(output, filename=source_path, with_dependencies=True, extra_context=dict(post=post))
+
+        output, shortcode_deps = self.site.apply_shortcodes_uuid(output, shortcodes, filename=source_path, with_dependencies=True, extra_context=dict(post=post))
         return output, error_level, deps, shortcode_deps
 
     # TODO remove in v8

diff --git a/nikola/shortcodes.py b/nikola/shortcodes.py
@@ -27,14 +27,11 @@
 """Support for Hugo-style shortcodes."""
 
 from __future__ import unicode_literals
-from .utils import LOGGER
-import sys
 
+import uuid
 
-# Constants
-_TEXT = 1
-_SHORTCODE_START = 2
-_SHORTCODE_END = 3
+from .utils import LOGGER
+import sys
 
 
 class ParsingError(Exception):
@@ -83,11 +80,10 @@ def _skip_whitespace(data, pos, must_be_nontrivial=False):
 
 def _skip_nonwhitespace(data, pos):
     """Return first position not before pos which contains a non-whitespace character."""
-    while pos < len(data):
-        if data[pos].isspace():
-            break
-        pos += 1
-    return pos
+    remainder = data[pos:]
+    # Offset inside remainder of the first whitespace character; None when there is none.
+    offset = next((i for i, ch in enumerate(remainder) if ch.isspace()), None)
+    return len(data) if offset is None else pos + offset
 
 
 def _parse_quoted_string(data, start):
@@ -209,14 +205,66 @@ def _parse_shortcode_args(data, start, shortcode_name, start_pos):
     raise ParsingError("Shortcode '{0}' starting at {1} is not terminated correctly with '%}}}}'!".format(shortcode_name, _format_position(data, start_pos)))
 
 
+def _new_sc_id():  # Random placeholder token that stands in for one extracted shortcode.
+    return str('SHORTCODE{0}REPLACEMENT'.format(uuid.uuid4().hex))
+
+
+def extract_shortcodes(data):
+    """
+    Return data with its top-level shortcodes replaced by ids, and the shortcodes.
+
+    The result is a tuple (text, shortcodes). text is data with every
+    top-level shortcode replaced by an id obtained from _new_sc_id().
+    shortcodes is a dictionary mapping each id to the raw shortcode it
+    replaced, ready to process.
+
+    A shortcode with a matching closing tag is extracted whole (opening
+    tag, contents including nested shortcodes, closing tag); one without
+    is extracted as the opening tag alone. Empty data gives ('', {}).
+
+    Raises ParsingError for a closing tag met outside any open shortcode.
+    """
+    tokens = _split_shortcodes(data)
+    shortcodes = {}
+    text = []
+    pos = 0
+
+    # Not a do-while: with no tokens at all there is nothing to consume,
+    # and the result must be ('', {}) rather than a crash.
+    while pos < len(tokens):
+        token = tokens[pos]
+        if token[0] == 'TEXT':
+            text.append(token[1])
+            pos += 1
+            continue
+        if token[0] == 'SHORTCODE_END':  # This is malformed
+            raise ParsingError('Closing unopened shortcode {}'.format(token[3]))
+
+        # Opening tag: the shortcode runs up to the first closing tag with
+        # the same name, or is just this token when it never closes.
+        name = token[3]
+        last = pos
+        for j in range(pos + 1, len(tokens)):
+            if tokens[j][0] == 'SHORTCODE_END' and tokens[j][3] == name:
+                last = j
+                break
+        # The id takes the shortcode's place in the text; the raw source is kept for rendering.
+        sc_id = _new_sc_id()
+        text.append(sc_id)
+        shortcodes[sc_id] = ''.join(t[1] for t in tokens[pos:last + 1])
+        pos = last + 1
+
+    return ''.join(text), shortcodes
+
+
 def _split_shortcodes(data):
     """Given input data, splits it into a sequence of texts, shortcode starts and shortcode ends.
 
     Returns a list of tuples of the following forms:
 
-        1. (_TEXT, text)
-        2. (_SHORTCODE_START, text, start, name, args)
-        3. (_SHORTCODE_END, text, start, name)
+        1. ("TEXT", text)
+        2. ("SHORTCODE_START", text, start, name, args)
+        3. ("SHORTCODE_END", text, start, name)
 
     Here, text is the raw text represented by the token; start is the starting position in data
     of the token; name is the name of the shortcode; and args is a tuple (args, kw) as returned
@@ -228,9 +276,9 @@ def _split_shortcodes(data):
         # Search for shortcode start
         start = data.find('{{%', pos)
         if start < 0:
-            result.append((_TEXT, data[pos:]))
+            result.append(("TEXT", data[pos:]))
             break
-        result.append((_TEXT, data[pos:start]))
+        result.append(("TEXT", data[pos:start]))
         # Extract name
         name_start = _skip_whitespace(data, start + 3)
         name_end = _skip_nonwhitespace(data, name_start)
@@ -246,13 +294,13 @@ def _split_shortcodes(data):
             # Must be followed by '%}}'
             if pos > len(data) or data[end_start:pos] != '%}}':
                 raise ParsingError("Syntax error: '{{{{% /{0}' must be followed by ' %}}}}' ({1})!".format(name, _format_position(data, end_start)))
-            result.append((_SHORTCODE_END, data[start:pos], start, name))
+            result.append(("SHORTCODE_END", data[start:pos], start, name))
         elif name == '%}}':
             raise ParsingError("Syntax error: '{{{{%' must be followed by shortcode name ({0})!".format(_format_position(data, start)))
         else:
             # This is an opening shortcode
             pos, args = _parse_shortcode_args(data, name_end, shortcode_name=name, start_pos=start)
-            result.append((_SHORTCODE_START, data[start:pos], start, name, args))
+            result.append(("SHORTCODE_START", data[start:pos], start, name, args))
     return result
 
 
@@ -284,17 +332,17 @@ def apply_shortcodes(data, registry, site=None, filename=None, raise_exceptions=
         pos = 0
         while pos < len(sc_data):
             current = sc_data[pos]
-            if current[0] == _TEXT:
+            if current[0] == "TEXT":
                 result.append(current[1])
                 pos += 1
-            elif current[0] == _SHORTCODE_END:
+            elif current[0] == "SHORTCODE_END":
                 raise ParsingError("Found shortcode ending '{{{{% /{0} %}}}}' which isn't closing a started shortcode ({1})!".format(current[3], _format_position(data, current[2])))
-            elif current[0] == _SHORTCODE_START:
+            elif current[0] == "SHORTCODE_START":
                 name = current[3]
                 # Check if we can find corresponding ending
                 found = None
                 for p in range(pos + 1, len(sc_data)):
-                    if sc_data[p][0] == _SHORTCODE_END and sc_data[p][3] == name:
+                    if sc_data[p][0] == "SHORTCODE_END" and sc_data[p][3] == name:
                         found = p
                         break
                 if found:

diff --git a/tests/base.py b/tests/base.py
@@ -230,3 +230,7 @@ def register_shortcode(self, name, f):
     def apply_shortcodes(self, data, *a, **kw):
         """Apply shortcodes from the registry on data."""
         return nikola.shortcodes.apply_shortcodes(data, self.shortcode_registry, **kw)
+
+    def apply_shortcodes_uuid(self, data, shortcodes, *a, **kw):
+        """Apply shortcodes from the registry on data; the shortcodes mapping and *a are accepted but not used."""
+        return nikola.shortcodes.apply_shortcodes(data, self.shortcode_registry, **kw)  # NOTE(review): placeholder keys in data are not substituted here - confirm this is intended for the fake site
diff --git a/tests/test_shortcodes.py b/tests/test_shortcodes.py
@@ -74,3 +74,23 @@ def test_errors(self):
         self.assertRaisesRegexp(shortcodes.ParsingError, "^Found shortcode ending '{{% / %}}' which isn't closing a started shortcode", shortcodes.apply_shortcodes, '{{% / %}}', self.fakesite.shortcode_registry, raise_exceptions=True)
         self.assertRaisesRegexp(shortcodes.ParsingError, "^Syntax error: '{{% /' must be followed by ' %}}'", shortcodes.apply_shortcodes, '{{% / a %}}', self.fakesite.shortcode_registry, raise_exceptions=True)
         self.assertRaisesRegexp(shortcodes.ParsingError, "^Shortcode '<==' starting at .* is not terminated correctly with '%}}'!", shortcodes.apply_shortcodes, '==> {{% <==', self.fakesite.shortcode_registry, raise_exceptions=True)
+
+
+@pytest.mark.parametrize("input, expected", [
+    ('{{% foo %}}', (u'SC1', {u'SC1': u'{{% foo %}}'})),
+    ('{{% foo %}} bar {{% /foo %}}', (u'SC1', {u'SC1': u'{{% foo %}} bar {{% /foo %}}'})),
+    ('AAA{{% foo %}} bar {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} bar {{% /foo %}}'})),
+    ('AAA{{% foo %}} {{% bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% bar %}} {{% /foo %}}'})),
+    ('AAA{{% foo %}} {{% /bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% /bar %}} {{% /foo %}}'})),
+    ('AAA{{% foo %}} {{% bar %}} quux {{% /bar %}} {{% /foo %}}BBB', (u'AAASC1BBB', {u'SC1': u'{{% foo %}} {{% bar %}} quux {{% /bar %}} {{% /foo %}}'})),
+    ('AAA{{% foo %}} BBB {{% bar %}} quux {{% /bar %}} CCC', (u'AAASC1 BBB SC2 CCC', {u'SC1': u'{{% foo %}}', u'SC2': u'{{% bar %}} quux {{% /bar %}}'})),
+])
+def test_extract_shortcodes(input, expected, monkeypatch):
+    """Check extract_shortcodes output, with ids made predictable (SC1, SC2, ...)."""
+    ids = ('SC%d' % n for n in range(1, 100))
+    # The builtin next() works on both Python 2 and 3, so no version switch
+    # between .next and .__next__ (and no use of sys) is needed.
+    monkeypatch.setattr(shortcodes, '_new_sc_id', lambda: next(ids))
+
+    extracted = shortcodes.extract_shortcodes(input)
+    assert extracted == expected