Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve the performance of parsing link labels #124

Merged
merged 1 commit into from
Sep 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 36 additions & 23 deletions marko/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from typing import List as _List
from typing import Match, Optional, Tuple, Union, cast

from . import inline, patterns
from . import inline, inline_parser, patterns
from .element import Element
from .helpers import Source, is_paired, normalize_label, partition_by_spaces
from .helpers import Source, normalize_label, partition_by_spaces, find_next
from .parser import Parser

__all__ = (
Expand Down Expand Up @@ -569,42 +569,55 @@ class LinkRefDef(BlockElement):
[label]: destination "title"
"""

pattern = re.compile(
r" {,3}%s:(?P<s1>\s*)%s(?P<s2>\s*)(?:(?<=\s)%s)?[^\n\S]*$\n?"
% (patterns.link_label, patterns.link_dest, patterns.link_title),
flags=re.M,
pattern = re.compile(r" {,3}(\[[\s\S]*?)(?=\n\n|\Z)", flags=re.M)
_parse_info = (
inline_parser._EMPTY_GROUP,
inline_parser._EMPTY_GROUP,
inline_parser._EMPTY_GROUP,
-1,
)
_parse_info = ("", "", "")

@classmethod
def match(cls, source: Source) -> bool:
m = source.expect_re(cls.pattern)
if not m:
return False
rv = m.groupdict()
if rv["s1"].count("\n") > 1 or rv["s1"].count("\n") > 1:
text = source._buffer
link_label = inline_parser._parse_link_label(text, m.start(1))
if not link_label: # no ending bracket
return False
label = rv["label"]
if rv["dest"][0] == "<" and rv["dest"][-1] == ">":
dest = rv["dest"]
elif is_paired(rv["dest"], "(", ")"):
dest = rv["dest"]
else:
if link_label.end >= len(text) or text[link_label.end] != ":":
# no colon after the ending bracket
return False
title = rv["title"]
if title and re.search(r"^$", title, re.M):
i = inline_parser._parse_link_separator(text, link_label.end + 1)
try:
link_dest, link_title = inline_parser._parse_link_dest_title(text, i)
except inline_parser.ParseError:
return False
cls._parse_info = label, dest, title
return m is not None
i = max(link_dest.end, link_title.end)
end = find_next(text, "\n", i)
if end >= 0:
end += 1
else:
end = i
if text[i:end].strip():
if link_title.text and "\n" in text[link_dest.end : link_title.start]:
link_title = inline_parser._EMPTY_GROUP
end = find_next(text, "\n", link_dest.end) + 1
else:
# There is content after the link title
return False
cls._parse_info = (link_label, link_dest, link_title, end)
return True

@classmethod
def parse(cls, source: Source) -> "LinkRefDef":
label, dest, title = cls._parse_info
normalized_label = normalize_label(label[1:-1])
label, dest, title, pos = cls._parse_info
normalized_label = normalize_label(label.text[1:-1])
assert isinstance(source.root, Document)
if normalized_label not in source.root.link_ref_defs:
source.root.link_ref_defs[normalized_label] = (dest, title)
source.consume()
source.root.link_ref_defs[normalized_label] = (dest.text, title.text)
source.pos = pos
return cls()


Expand Down
38 changes: 35 additions & 3 deletions marko/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from typing import (
TYPE_CHECKING,
Callable,
Container,
Generator,
Iterable,
List,
Match,
Optional,
Expand All @@ -28,7 +30,7 @@ def camel_to_snake_case(name: str) -> str:
return "_".join(map(str.lower, re.findall(pattern, name)))


def is_paired(text: str, open: str = "(", close: str = ")") -> bool:
def is_paired(text: Iterable[str], open: str = "(", close: str = ")") -> bool:
"""Check if the text only contains:
1. blackslash escaped parentheses, or
2. parentheses paired.
Expand Down Expand Up @@ -188,14 +190,44 @@ def normalize_label(label: str) -> str:
return re.sub(r"\s+", " ", label).strip().casefold()


def partition_by_spaces(text: str) -> Tuple[str, str, str]:
def find_next(
    text: str,
    target: Container[str],
    start: int = 0,
    end: Optional[int] = None,
    disallowed: Container[str] = (),
) -> int:
    """Return the index of the first unescaped occurrence of a *target* char.

    Scans ``text[start:end]`` (``end`` defaults to the end of the string);
    a backslash escapes the character that follows it, so escaped characters
    never match. Returns -2 as soon as an unescaped character from
    *disallowed* is seen, and -1 when no target character is found.
    """
    stop = len(text) if end is None else end
    skip_next = False
    for idx in range(start, stop):
        ch = text[idx]
        if skip_next:
            # Previous character was a backslash: this one is escaped.
            skip_next = False
        elif ch in target:
            return idx
        elif ch in disallowed:
            return -2
        elif ch == "\\":
            skip_next = True
    return -1


def partition_by_spaces(text: str, spaces: str = " \t") -> Tuple[str, str, str]:
"""Split the given text by spaces or tabs, and return a tuple of
(start, delimiter, remaining). If spaces are not found, the latter
two elements will be empty.
"""
start = end = -1
for i, c in enumerate(text):
if c in " \t":
if c in spaces:
if start >= 0:
continue
start = i
Expand Down
174 changes: 123 additions & 51 deletions marko/inline_parser.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
"""
Parse inline elements
"""
import collections
import re
import string
from typing import TYPE_CHECKING, List, Match, Optional, Tuple, Type, Union

from . import patterns
from .helpers import is_paired, normalize_label
from .helpers import is_paired, normalize_label, find_next

if TYPE_CHECKING:
from .block import Document
from .inline import InlineElement

ElementType = Type[InlineElement]
Group = Tuple[int, int, Optional[str]]
_Match = Union[Match[str], "MatchObj"]

Group = collections.namedtuple("Group", "start end text")
_EMPTY_GROUP = Group(-1, -1, None)
WHITESPACE = " \n\t"
ASCII_CONTROL = "".join(chr(i) for i in range(0, 32)) + chr(127)


class ParseError(ValueError):
    """Error raised when an inline construct cannot be parsed."""


def parse(
text: str, elements: List["ElementType"], fallback: "ElementType"
Expand Down Expand Up @@ -203,7 +211,7 @@ def look_for_image_or_link(
break # break to remove the delimiter and return None
if not _is_legal_link_text(text[d.end : close]):
break
link_text = (d.end, close, text[d.end : close])
link_text = Group(d.end, close, text[d.end : close])
etype = "Image" if d.content == "![" else "Link"
match = _expect_inline_link(text, close + 1) or _expect_reference_link(
text, close + 1, link_text[2], root_node
Expand Down Expand Up @@ -231,68 +239,127 @@ def _is_legal_link_text(text: str) -> bool:
return is_paired(text, "[", "]")


def _expect_inline_link(
text: str, start: int
) -> Optional[Tuple["Group", "Group", int]]:
"""(link_dest "link_title")"""
if start >= len(text) - 1 or text[start] != "(":
def _parse_link_separator(text: str, start: int) -> int:
i = start
has_newline = False
while i < len(text):
if text[i] == "\n":
if has_newline:
break
has_newline = True
elif text[i] not in WHITESPACE:
break
i += 1
return i


def _parse_link_label(text: str, start: int) -> Optional[Group]:
if text[start : start + 1] != "[":
return None
i = find_next(text, "]", start + 1, disallowed="[")
if i < 0:
return None
i = start + 1
m = patterns.whitespace.match(text, i)
if m:
i = m.end()
m = patterns.link_dest_1.match(text, i)
if m:
link_dest = m.start(), m.end(), m.group()
i = m.end()
label = text[start + 1 : i]
if not label.strip() or len(label) > 999:
return None
return Group(start, i + 1, text[start : i + 1])


def _parse_link_dest_title(
link_text: str, start: int = 0, is_inline: bool = False
) -> Tuple[Group, Group]:
if start >= len(link_text):
raise ParseError()
if link_text[start] == "<":
right_bracket = find_next(link_text, ">", start + 1, disallowed="<\n")
if right_bracket < 0:
raise ParseError()
i = right_bracket + 1
link_dest = Group(start, i, link_text[start:i])
else:
if text[i] == "<":
return None
open_num = 0
escaped = False
prev = i
while i < len(text):
c = text[i]
pairs = 0
for i, c in enumerate(link_text[start:], start):
if escaped:
escaped = False
elif c == "\\":
escaped = True
elif c == "(":
open_num += 1
elif c in string.whitespace:
elif c in WHITESPACE:
break
elif c in ASCII_CONTROL:
raise ParseError("Invalid character in link destination")
elif c == "(":
pairs += 1
elif c == ")":
if open_num > 0:
open_num -= 1
if pairs > 0:
pairs -= 1
elif is_inline:
link_dest = Group(start, i, link_text[start:i])
return link_dest, _EMPTY_GROUP
else:
break
i += 1
if open_num != 0:
return None
link_dest = prev, i, text[prev:i]
link_title = i, i, None
tail_re = re.compile(r"(?:\s+%s)?\s*\)" % patterns.link_title, flags=re.UNICODE)
m = tail_re.match(text, i)
if not m:
raise ParseError("unmatched parenthesis")
else:
if is_inline:
raise ParseError("No right parenthesis is found")
link_dest = Group(start, i, link_text[start:i])
if not link_dest.text:
raise ParseError("Empty link destination")
prev = i
i = _parse_link_separator(link_text, i)
if i >= len(link_text) or link_text[i] == "\n" or link_text[i] == ")" and is_inline:
return link_dest, _EMPTY_GROUP
if link_text[i] == '"':
end = find_next(link_text, '"', i + 1)
elif link_text[i] == "'":
end = find_next(link_text, "'", i + 1)
elif link_text[i] == "(":
end = find_next(link_text, ")", i + 1, disallowed="(")
elif "\n" in link_text[prev:i]:
return link_dest, _EMPTY_GROUP
else:
raise ParseError()
if 0 < i < len(link_text) and link_text[i - 1] not in WHITESPACE:
raise ParseError()
if end < 0:
raise ParseError()
if "\n\n" in link_text[i:end]:
raise ParseError()
link_title = Group(i, end + 1, link_text[i : end + 1])
return link_dest, link_title


def _expect_inline_link(text: str, start: int) -> Optional[Tuple[Group, Group, int]]:
"""(link_dest "link_title")"""
if start >= len(text) - 1 or text[start] != "(":
return None
i = _parse_link_separator(text, start + 1)

try:
link_dest, link_title = _parse_link_dest_title(text, i, is_inline=True)
except ParseError:
return None
end = max(link_dest.end, link_title.end)
end = _parse_link_separator(text, end)
if end >= len(text) or text[end] != ")":
return None
if m.group("title"):
link_title = m.start("title"), m.end("title"), m.group("title") # type: ignore
return (link_dest, link_title, m.end())
return link_dest, link_title, end + 1


def _expect_reference_link(
text: str, start: int, link_text: str, root_node: "Document"
) -> Optional[Tuple["Group", "Group", int]]:
match = patterns.optional_label.match(text, start)
link_label = link_text
if match and match.group()[1:-1]:
link_label = match.group()[1:-1]
result = _get_reference_link(link_label, root_node)
) -> Optional[Tuple[Group, Group, int]]:
link_label = _parse_link_label(text, start)
label = link_text
if link_label is not None:
label = link_label.text[1:-1] or link_text
elif text[start : start + 2] == "[]":
link_label = Group(start, start + 2, "[]")
result = _get_reference_link(label, root_node)
if not result:
return None
link_dest = start, start, result[0]
link_title = start, start, result[1]
return (link_dest, link_title, match and match.end() or start)
link_dest = Group(start, start, result[0])
link_title = Group(start, start, result[1])
return (link_dest, link_title, link_label.end if link_label else start)


def _get_reference_link(
Expand Down Expand Up @@ -322,7 +389,9 @@ def process_emphasis(
text,
d_opener.end - n,
d_closer.start + n,
(d_opener.end, d_closer.start, text[d_opener.end : d_closer.start]),
Group(
d_opener.end, d_closer.start, text[d_opener.end : d_closer.start]
),
)
matches.append(match)
del delimiters[opener + 1 : cur]
Expand Down Expand Up @@ -458,7 +527,7 @@ class MatchObj:
"""A fake match object that memes re.match methods"""

def __init__(
self, etype: str, text: str, start: int, end: int, *groups: "Group"
self, etype: str, text: str, start: int, end: int, *groups: Group
) -> None:
self._text = text
self._start = start
Expand All @@ -480,3 +549,6 @@ def end(self, n: int = 0) -> int:
if n == 0:
return self._end
return self._groups[n - 1][1]

def span(self, n: int = 0) -> Tuple[int, int]:
    """Return the ``(start, end)`` pair for group *n*, mimicking ``re.Match.span``."""
    begin = self.start(n)
    finish = self.end(n)
    return (begin, finish)
Loading