fix: improve the performance parsing link labels (#124)

frostming · Sep 21, 2022 · 4e87fe5 · 4e87fe5
1 parent f904b65
commit 4e87fe5
Show file tree

Hide file tree

Showing 4 changed files with 194 additions and 85 deletions.
diff --git a/marko/block.py b/marko/block.py
@@ -6,9 +6,9 @@
 from typing import List as _List
 from typing import Match, Optional, Tuple, Union, cast
 
-from . import inline, patterns
+from . import inline, inline_parser, patterns
 from .element import Element
-from .helpers import Source, is_paired, normalize_label, partition_by_spaces
+from .helpers import Source, normalize_label, partition_by_spaces, find_next
 from .parser import Parser
 
 __all__ = (
@@ -569,42 +569,55 @@ class LinkRefDef(BlockElement):
     [label]: destination "title"
     """
 
-    pattern = re.compile(
-        r" {,3}%s:(?P<s1>\s*)%s(?P<s2>\s*)(?:(?<=\s)%s)?[^\n\S]*$\n?"
-        % (patterns.link_label, patterns.link_dest, patterns.link_title),
-        flags=re.M,
+    pattern = re.compile(r" {,3}(\[[\s\S]*?)(?=\n\n|\Z)", flags=re.M)
+    _parse_info = (
+        inline_parser._EMPTY_GROUP,
+        inline_parser._EMPTY_GROUP,
+        inline_parser._EMPTY_GROUP,
+        -1,
     )
-    _parse_info = ("", "", "")
 
     @classmethod
     def match(cls, source: Source) -> bool:
         m = source.expect_re(cls.pattern)
         if not m:
             return False
-        rv = m.groupdict()
-        if rv["s1"].count("\n") > 1 or rv["s1"].count("\n") > 1:
+        text = source._buffer
+        link_label = inline_parser._parse_link_label(text, m.start(1))
+        if not link_label:  # no ending bracket
             return False
-        label = rv["label"]
-        if rv["dest"][0] == "<" and rv["dest"][-1] == ">":
-            dest = rv["dest"]
-        elif is_paired(rv["dest"], "(", ")"):
-            dest = rv["dest"]
-        else:
+        if link_label.end >= len(text) or text[link_label.end] != ":":
+            # no colon after the ending bracket
             return False
-        title = rv["title"]
-        if title and re.search(r"^$", title, re.M):
+        i = inline_parser._parse_link_separator(text, link_label.end + 1)
+        try:
+            link_dest, link_title = inline_parser._parse_link_dest_title(text, i)
+        except inline_parser.ParseError:
             return False
-        cls._parse_info = label, dest, title
-        return m is not None
+        i = max(link_dest.end, link_title.end)
+        end = find_next(text, "\n", i)
+        if end >= 0:
+            end += 1
+        else:
+            end = i
+        if text[i:end].strip():
+            if link_title.text and "\n" in text[link_dest.end : link_title.start]:
+                link_title = inline_parser._EMPTY_GROUP
+                end = find_next(text, "\n", link_dest.end) + 1
+            else:
+                # There is content after the link title
+                return False
+        cls._parse_info = (link_label, link_dest, link_title, end)
+        return True
 
     @classmethod
     def parse(cls, source: Source) -> "LinkRefDef":
-        label, dest, title = cls._parse_info
-        normalized_label = normalize_label(label[1:-1])
+        label, dest, title, pos = cls._parse_info
+        normalized_label = normalize_label(label.text[1:-1])
         assert isinstance(source.root, Document)
         if normalized_label not in source.root.link_ref_defs:
-            source.root.link_ref_defs[normalized_label] = (dest, title)
-        source.consume()
+            source.root.link_ref_defs[normalized_label] = (dest.text, title.text)
+        source.pos = pos
         return cls()
 
 

diff --git a/marko/helpers.py b/marko/helpers.py
@@ -8,7 +8,9 @@
 from typing import (
     TYPE_CHECKING,
     Callable,
+    Container,
     Generator,
+    Iterable,
     List,
     Match,
     Optional,
@@ -28,7 +30,7 @@ def camel_to_snake_case(name: str) -> str:
     return "_".join(map(str.lower, re.findall(pattern, name)))
 
 
-def is_paired(text: str, open: str = "(", close: str = ")") -> bool:
+def is_paired(text: Iterable[str], open: str = "(", close: str = ")") -> bool:
     """Check if the text only contains:
     1. blackslash escaped parentheses, or
     2. parentheses paired.
@@ -188,14 +190,44 @@ def normalize_label(label: str) -> str:
     return re.sub(r"\s+", " ", label).strip().casefold()
 
 
-def partition_by_spaces(text: str) -> Tuple[str, str, str]:
+def find_next(
+    text: str,
+    target: Container[str],
+    start: int = 0,
+    end: Optional[int] = None,
+    disallowed: Container[str] = (),
+) -> int:
+    """Find the next occurrence of target in text, and return the index
+    Characters are escaped by backslash.
+    Optional disallowed characters can be specified, if found, the search
+    will fail with -2 returned. Otherwise, -1 is returned if not found.
+    """
+    if end is None:
+        end = len(text)
+    i = start
+    escaped = False
+    while i < end:
+        c = text[i]
+        if escaped:
+            escaped = False
+        elif c in target:
+            return i
+        elif c in disallowed:
+            return -2
+        elif c == "\\":
+            escaped = True
+        i += 1
+    return -1
+
+
+def partition_by_spaces(text: str, spaces: str = " \t") -> Tuple[str, str, str]:
     """Split the given text by spaces or tabs, and return a tuple of
     (start, delimiter, remaining). If spaces are not found, the latter
     two elements will be empty.
     """
     start = end = -1
     for i, c in enumerate(text):
-        if c in " \t":
+        if c in spaces:
             if start >= 0:
                 continue
             start = i

diff --git a/marko/inline_parser.py b/marko/inline_parser.py
@@ -1,21 +1,29 @@
 """
 Parse inline elements
 """
+import collections
 import re
-import string
 from typing import TYPE_CHECKING, List, Match, Optional, Tuple, Type, Union
 
 from . import patterns
-from .helpers import is_paired, normalize_label
+from .helpers import is_paired, normalize_label, find_next
 
 if TYPE_CHECKING:
     from .block import Document
     from .inline import InlineElement
 
     ElementType = Type[InlineElement]
-    Group = Tuple[int, int, Optional[str]]
     _Match = Union[Match[str], "MatchObj"]
 
+Group = collections.namedtuple("Group", "start end text")
+_EMPTY_GROUP = Group(-1, -1, None)
+WHITESPACE = " \n\t"
+ASCII_CONTROL = "".join(chr(i) for i in range(0, 32)) + chr(127)
+
+
+class ParseError(ValueError):
+    """Raised when parsing fails."""
+
 
 def parse(
     text: str, elements: List["ElementType"], fallback: "ElementType"
@@ -203,7 +211,7 @@ def look_for_image_or_link(
             break  # break to remove the delimiter and return None
         if not _is_legal_link_text(text[d.end : close]):
             break
-        link_text = (d.end, close, text[d.end : close])
+        link_text = Group(d.end, close, text[d.end : close])
         etype = "Image" if d.content == "![" else "Link"
         match = _expect_inline_link(text, close + 1) or _expect_reference_link(
             text, close + 1, link_text[2], root_node
@@ -231,68 +239,127 @@ def _is_legal_link_text(text: str) -> bool:
     return is_paired(text, "[", "]")
 
 
-def _expect_inline_link(
-    text: str, start: int
-) -> Optional[Tuple["Group", "Group", int]]:
-    """(link_dest "link_title")"""
-    if start >= len(text) - 1 or text[start] != "(":
+def _parse_link_separator(text: str, start: int) -> int:
+    i = start
+    has_newline = False
+    while i < len(text):
+        if text[i] == "\n":
+            if has_newline:
+                break
+            has_newline = True
+        elif text[i] not in WHITESPACE:
+            break
+        i += 1
+    return i
+
+
+def _parse_link_label(text: str, start: int) -> Optional[Group]:
+    if text[start : start + 1] != "[":
+        return None
+    i = find_next(text, "]", start + 1, disallowed="[")
+    if i < 0:
         return None
-    i = start + 1
-    m = patterns.whitespace.match(text, i)
-    if m:
-        i = m.end()
-    m = patterns.link_dest_1.match(text, i)
-    if m:
-        link_dest = m.start(), m.end(), m.group()
-        i = m.end()
+    label = text[start + 1 : i]
+    if not label.strip() or len(label) > 999:
+        return None
+    return Group(start, i + 1, text[start : i + 1])
+
+
+def _parse_link_dest_title(
+    link_text: str, start: int = 0, is_inline: bool = False
+) -> Tuple[Group, Group]:
+    if start >= len(link_text):
+        raise ParseError()
+    if link_text[start] == "<":
+        right_bracket = find_next(link_text, ">", start + 1, disallowed="<\n")
+        if right_bracket < 0:
+            raise ParseError()
+        i = right_bracket + 1
+        link_dest = Group(start, i, link_text[start:i])
     else:
-        if text[i] == "<":
-            return None
-        open_num = 0
         escaped = False
-        prev = i
-        while i < len(text):
-            c = text[i]
+        pairs = 0
+        for i, c in enumerate(link_text[start:], start):
             if escaped:
                 escaped = False
             elif c == "\\":
                 escaped = True
-            elif c == "(":
-                open_num += 1
-            elif c in string.whitespace:
+            elif c in WHITESPACE:
                 break
+            elif c in ASCII_CONTROL:
+                raise ParseError("Invalid character in link destination")
+            elif c == "(":
+                pairs += 1
             elif c == ")":
-                if open_num > 0:
-                    open_num -= 1
+                if pairs > 0:
+                    pairs -= 1
+                elif is_inline:
+                    link_dest = Group(start, i, link_text[start:i])
+                    return link_dest, _EMPTY_GROUP
                 else:
-                    break
-            i += 1
-        if open_num != 0:
-            return None
-        link_dest = prev, i, text[prev:i]
-    link_title = i, i, None
-    tail_re = re.compile(r"(?:\s+%s)?\s*\)" % patterns.link_title, flags=re.UNICODE)
-    m = tail_re.match(text, i)
-    if not m:
+                    raise ParseError("unmatched parenthesis")
+        else:
+            if is_inline:
+                raise ParseError("No right parenthesis is found")
+        link_dest = Group(start, i, link_text[start:i])
+        if not link_dest.text:
+            raise ParseError("Empty link destination")
+    prev = i
+    i = _parse_link_separator(link_text, i)
+    if i >= len(link_text) or link_text[i] == "\n" or link_text[i] == ")" and is_inline:
+        return link_dest, _EMPTY_GROUP
+    if link_text[i] == '"':
+        end = find_next(link_text, '"', i + 1)
+    elif link_text[i] == "'":
+        end = find_next(link_text, "'", i + 1)
+    elif link_text[i] == "(":
+        end = find_next(link_text, ")", i + 1, disallowed="(")
+    elif "\n" in link_text[prev:i]:
+        return link_dest, _EMPTY_GROUP
+    else:
+        raise ParseError()
+    if 0 < i < len(link_text) and link_text[i - 1] not in WHITESPACE:
+        raise ParseError()
+    if end < 0:
+        raise ParseError()
+    if "\n\n" in link_text[i:end]:
+        raise ParseError()
+    link_title = Group(i, end + 1, link_text[i : end + 1])
+    return link_dest, link_title
+
+
+def _expect_inline_link(text: str, start: int) -> Optional[Tuple[Group, Group, int]]:
+    """(link_dest "link_title")"""
+    if start >= len(text) - 1 or text[start] != "(":
+        return None
+    i = _parse_link_separator(text, start + 1)
+
+    try:
+        link_dest, link_title = _parse_link_dest_title(text, i, is_inline=True)
+    except ParseError:
+        return None
+    end = max(link_dest.end, link_title.end)
+    end = _parse_link_separator(text, end)
+    if end >= len(text) or text[end] != ")":
         return None
-    if m.group("title"):
-        link_title = m.start("title"), m.end("title"), m.group("title")  # type: ignore
-    return (link_dest, link_title, m.end())
+    return link_dest, link_title, end + 1
 
 
 def _expect_reference_link(
     text: str, start: int, link_text: str, root_node: "Document"
-) -> Optional[Tuple["Group", "Group", int]]:
-    match = patterns.optional_label.match(text, start)
-    link_label = link_text
-    if match and match.group()[1:-1]:
-        link_label = match.group()[1:-1]
-    result = _get_reference_link(link_label, root_node)
+) -> Optional[Tuple[Group, Group, int]]:
+    link_label = _parse_link_label(text, start)
+    label = link_text
+    if link_label is not None:
+        label = link_label.text[1:-1] or link_text
+    elif text[start : start + 2] == "[]":
+        link_label = Group(start, start + 2, "[]")
+    result = _get_reference_link(label, root_node)
     if not result:
         return None
-    link_dest = start, start, result[0]
-    link_title = start, start, result[1]
-    return (link_dest, link_title, match and match.end() or start)
+    link_dest = Group(start, start, result[0])
+    link_title = Group(start, start, result[1])
+    return (link_dest, link_title, link_label.end if link_label else start)
 
 
 def _get_reference_link(
@@ -322,7 +389,9 @@ def process_emphasis(
                 text,
                 d_opener.end - n,
                 d_closer.start + n,
-                (d_opener.end, d_closer.start, text[d_opener.end : d_closer.start]),
+                Group(
+                    d_opener.end, d_closer.start, text[d_opener.end : d_closer.start]
+                ),
             )
             matches.append(match)
             del delimiters[opener + 1 : cur]
@@ -458,7 +527,7 @@ class MatchObj:
     """A fake match object that memes re.match methods"""
 
     def __init__(
-        self, etype: str, text: str, start: int, end: int, *groups: "Group"
+        self, etype: str, text: str, start: int, end: int, *groups: Group
     ) -> None:
         self._text = text
         self._start = start
@@ -480,3 +549,6 @@ def end(self, n: int = 0) -> int:
         if n == 0:
             return self._end
         return self._groups[n - 1][1]
+
+    def span(self, n: int = 0) -> Tuple[int, int]:
+        return (self.start(n), self.end(n))