diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
index 6bf9a36f..0d11e3e3 100644
--- a/markdown_it/common/utils.py
+++ b/markdown_it/common/utils.py
@@ -304,3 +304,15 @@ def normalizeReference(string: str) -> str:
     #   most notably, `__proto__`)
     #
     return string.lower().upper()
+
+
+LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
+LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
+
+
+def isLinkOpen(string: str) -> bool:
+    return bool(LINK_OPEN_RE.search(string))
+
+
+def isLinkClose(string: str) -> bool:
+    return bool(LINK_CLOSE_RE.search(string))
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
index 88140d3d..febe4e6e 100644
--- a/markdown_it/parser_inline.py
+++ b/markdown_it/parser_inline.py
@@ -16,6 +16,7 @@
 # Parser rules
 _rules: list[tuple[str, RuleFunc]] = [
     ("text", rules_inline.text),
+    ("linkify", rules_inline.linkify),
     ("newline", rules_inline.newline),
     ("escape", rules_inline.escape),
     ("backticks", rules_inline.backtick),
diff --git a/markdown_it/presets/__init__.py b/markdown_it/presets/__init__.py
index f1cb0507..1e6796a2 100644
--- a/markdown_it/presets/__init__.py
+++ b/markdown_it/presets/__init__.py
@@ -21,7 +21,7 @@ def make() -> PresetType:
         config = commonmark.make()
         config["components"]["core"]["rules"].append("linkify")
         config["components"]["block"]["rules"].append("table")
-        config["components"]["inline"]["rules"].append("strikethrough")
+        config["components"]["inline"]["rules"].extend(["strikethrough", "linkify"])
         config["components"]["inline"]["rules2"].append("strikethrough")
         config["options"]["linkify"] = True
         config["options"]["html"] = True
diff --git a/markdown_it/rules_core/linkify.py b/markdown_it/rules_core/linkify.py
index 49bb4ef3..efbc9d4c 100644
--- a/markdown_it/rules_core/linkify.py
+++ b/markdown_it/rules_core/linkify.py
@@ -1,41 +1,32 @@
+from __future__ import annotations
+
 import re
+from typing import Protocol
 
-from ..common.utils import arrayReplaceAt
+from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
 from ..token import Token
 from .state_core import StateCore
 
-LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
-LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
-
 HTTP_RE = re.compile(r"^http://")
 MAILTO_RE = re.compile(r"^mailto:")
 TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
 
 
-def isLinkOpen(string: str) -> bool:
-    return bool(LINK_OPEN_RE.search(string))
-
-
-def isLinkClose(string: str) -> bool:
-    return bool(LINK_CLOSE_RE.search(string))
-
-
 def linkify(state: StateCore) -> None:
-    blockTokens = state.tokens
-
+    """Rule for identifying plain-text links."""
     if not state.md.options.linkify:
         return
 
     if not state.md.linkify:
         raise ModuleNotFoundError("Linkify enabled but not installed.")
 
-    for j in range(len(blockTokens)):
-        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
-            blockTokens[j].content
+    for inline_token in state.tokens:
+        if inline_token.type != "inline" or not state.md.linkify.pretest(
+            inline_token.content
         ):
             continue
 
-        tokens = blockTokens[j].children
+        tokens = inline_token.children
 
         htmlLinkLevel = 0
 
@@ -71,38 +62,47 @@ def linkify(state: StateCore) -> None:
                 currentToken.content
             ):
                 text = currentToken.content
-                links = state.md.linkify.match(text)
+                links: list[_LinkType] = state.md.linkify.match(text) or []
 
                 # Now split string to nodes
                 nodes = []
                 level = currentToken.level
                 lastPos = 0
 
-                for ln in range(len(links)):
-                    url = links[ln].url
+                # forbid escape sequence at the start of the string,
+                # this avoids http\://example.com/ from being linkified as
+                # http://example.com/
+                if (
+                    links
+                    and links[0].index == 0
+                    and i > 0
+                    and tokens[i - 1].type == "text_special"
+                ):
+                    links = links[1:]
+
+                for link in links:
+                    url = link.url
                     fullUrl = state.md.normalizeLink(url)
                     if not state.md.validateLink(fullUrl):
                         continue
 
-                    urlText = links[ln].text
+                    urlText = link.text
 
                     # Linkifier might send raw hostnames like "example.com", where url
                     # starts with domain name. So we prepend http:// in those cases,
                     # and remove it afterwards.
-                    if not links[ln].schema:
+                    if not link.schema:
                         urlText = HTTP_RE.sub(
                             "", state.md.normalizeLinkText("http://" + urlText)
                         )
-                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
-                        urlText
-                    ):
+                    elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
                         urlText = MAILTO_RE.sub(
                             "", state.md.normalizeLinkText("mailto:" + urlText)
                         )
                     else:
                         urlText = state.md.normalizeLinkText(urlText)
 
-                    pos = links[ln].index
+                    pos = link.index
 
                     if pos > lastPos:
                         token = Token("text", "", 0)
@@ -130,7 +130,7 @@ def linkify(state: StateCore) -> None:
                     token.info = "auto"
                     nodes.append(token)
 
-                    lastPos = links[ln].last_index
+                    lastPos = link.last_index
 
                 if lastPos < len(text):
                     token = Token("text", "", 0)
@@ -138,4 +138,12 @@ def linkify(state: StateCore) -> None:
                     token.content = text[lastPos:]
                     token.level = level
                     nodes.append(token)
-                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
+                inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)
+
+
+class _LinkType(Protocol):
+    url: str
+    text: str
+    index: int
+    last_index: int
+    schema: str | None
diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
index dde97d34..3a8026ec 100644
--- a/markdown_it/rules_inline/__init__.py
+++ b/markdown_it/rules_inline/__init__.py
@@ -3,6 +3,7 @@
     "text",
     "fragments_join",
     "link_pairs",
+    "linkify",
     "escape",
     "newline",
     "backtick",
@@ -24,6 +25,7 @@
 from .html_inline import html_inline
 from .image import image
 from .link import link
+from .linkify import linkify
 from .newline import newline
 from .state_inline import StateInline
 from .text import text
diff --git a/markdown_it/rules_inline/html_inline.py b/markdown_it/rules_inline/html_inline.py
index 3c8b5331..9065e1d0 100644
--- a/markdown_it/rules_inline/html_inline.py
+++ b/markdown_it/rules_inline/html_inline.py
@@ -1,5 +1,6 @@
 # Process html tags
 from ..common.html_re import HTML_TAG_RE
+from ..common.utils import isLinkClose, isLinkOpen
 from .state_inline import StateInline
 
 
@@ -33,5 +34,10 @@ def html_inline(state: StateInline, silent: bool) -> bool:
     token = state.push("html_inline", "", 0)
     token.content = state.src[pos : pos + len(match.group(0))]
 
+    if isLinkOpen(token.content):
+        state.linkLevel += 1
+    if isLinkClose(token.content):
+        state.linkLevel -= 1
+
     state.pos += len(match.group(0))
     return True
diff --git a/markdown_it/rules_inline/link.py b/markdown_it/rules_inline/link.py
index 18c0736c..78cf9122 100644
--- a/markdown_it/rules_inline/link.py
+++ b/markdown_it/rules_inline/link.py
@@ -140,7 +140,9 @@ def link(state: StateInline, silent: bool) -> bool:
         if label and state.md.options.get("store_labels", False):
             token.meta["label"] = label
 
+        state.linkLevel += 1
         state.md.inline.tokenize(state)
+        state.linkLevel -= 1
 
         token = state.push("link_close", "a", -1)
diff --git a/markdown_it/rules_inline/linkify.py b/markdown_it/rules_inline/linkify.py
new file mode 100644
index 00000000..a8a18153
--- /dev/null
+++ b/markdown_it/rules_inline/linkify.py
@@ -0,0 +1,61 @@
+"""Process links like https://example.org/"""
+import re
+
+from .state_inline import StateInline
+
+# RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+SCHEME_RE = re.compile(r"(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$", re.IGNORECASE)
+
+
+def linkify(state: StateInline, silent: bool) -> bool:
+    """Rule for identifying plain-text links."""
+    if not state.md.options.linkify:
+        return False
+    if state.linkLevel > 0:
+        return False
+    if not state.md.linkify:
+        raise ModuleNotFoundError("Linkify enabled but not installed.")
+
+    pos = state.pos
+    maximum = state.posMax
+
+    if (
+        (pos + 3) > maximum
+        or state.src[pos] != ":"
+        or state.src[pos + 1] != "/"
+        or state.src[pos + 2] != "/"
+    ):
+        return False
+
+    if not (match := SCHEME_RE.match(state.pending)):
+        return False
+
+    proto = match.group(1)
+    if not (link := state.md.linkify.match_at_start(state.src[pos - len(proto) :])):
+        return False
+    url: str = link.url
+
+    # disallow '*' at the end of the link (conflicts with emphasis)
+    url = url.rstrip("*")
+
+    full_url = state.md.normalizeLink(url)
+    if not state.md.validateLink(full_url):
+        return False
+
+    if not silent:
+        state.pending = state.pending[: -len(proto)]
+
+        token = state.push("link_open", "a", 1)
+        token.attrs = {"href": full_url}
+        token.markup = "linkify"
+        token.info = "auto"
+
+        token = state.push("text", "", 0)
+        token.content = state.md.normalizeLinkText(url)
+
+        token = state.push("link_close", "a", -1)
+        token.markup = "linkify"
+        token.info = "auto"
+
+    state.pos += len(url) - len(proto)
+    return True
diff --git a/markdown_it/rules_inline/state_inline.py b/markdown_it/rules_inline/state_inline.py
index 143ab33e..c0c491c4 100644
--- a/markdown_it/rules_inline/state_inline.py
+++ b/markdown_it/rules_inline/state_inline.py
@@ -70,6 +70,10 @@ def __init__(
         self.backticks: dict[int, int] = {}
         self.backticksScanned = False
 
+        # Counter used to disable inline linkify-it execution
+        # inside <a> and markdown links
+        self.linkLevel = 0
+
     def __repr__(self) -> str:
         return (
             f"{self.__class__.__name__}"
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
index 64a2bbe8..178d717e 100644
--- a/tests/test_api/test_main.py
+++ b/tests/test_api/test_main.py
@@ -30,6 +30,7 @@ def test_get_rules():
         ],
         "inline": [
             "text",
+            "linkify",
             "newline",
             "escape",
             "backticks",
diff --git a/tests/test_port/fixtures/linkify.md b/tests/test_port/fixtures/linkify.md
index c9755c03..f51bb6b9 100644
--- a/tests/test_port/fixtures/linkify.md
+++ b/tests/test_port/fixtures/linkify.md
@@ -29,6 +29,84 @@ don't touch text in html tags
 <p><a href="https://example.com">https://example.com</a></p>
 .
 
+entities inside raw links
+.
+https://example.com/foo&bar
+.
+<p><a href="https://example.com/foo&amp;bar">https://example.com/foo&amp;bar</a></p>
+.
+
+
+emphasis inside raw links (asterisk, can happen in links with params)
+.
+https://example.com/foo*bar*baz
+.
+<p><a href="https://example.com/foo*bar*baz">https://example.com/foo*bar*baz</a></p>
+.
+
+
+emphasis inside raw links (underscore)
+.
+http://example.org/foo._bar_-_baz
+.
+<p><a href="http://example.org/foo._bar_-_baz">http://example.org/foo._bar_-_baz</a></p>
+.
+
+
+backticks inside raw links
+.
+https://example.com/foo`bar`baz
+.
+<p><a href="https://example.com/foo%60bar%60baz">https://example.com/foo`bar`baz</a></p>
+.
+
+
+links inside raw links
+.
+https://example.com/foo[123](456)bar
+.
+<p><a href="https://example.com/foo%5B123%5D(456)bar">https://example.com/foo[123](456)bar</a></p>
+.
+
+
+escapes not allowed at the start
+.
+\https://example.com
+.
+<p>\https://example.com</p>
+.
+
+
+escapes not allowed at comma
+.
+https\://example.com
+.
+<p>https://example.com</p>
+.
+
+
+escapes not allowed at slashes
+.
+https:\//aa.org https://bb.org
+.
+<p>https://aa.org <a href="https://bb.org">https://bb.org</a></p>
+.
+
+
+fuzzy link shouldn't match cc.org
+.
+https:/\/cc.org
+.
+<p>https://cc.org</p>
+.
+
+
+bold links (exclude markup of pairs from link tail)
+.
+**http://example.com/foobar**
+.
+<p><strong><a href="http://example.com/foobar">http://example.com/foobar</a></strong></p>
+.
 
 match links without protocol
 .
@@ -37,6 +115,35 @@ www.example.org
 <p><a href="http://www.example.org">www.example.org</a></p>
 .
 
+coverage, prefix not valid
+.
+http:/example.com/
+.
+<p>http:/example.com/</p>
+.
+
+
+coverage, negative link level
+.
+[https://example.com](https://example.com)
+.
+<p><a href="https://example.com">https://example.com</a></p>
+.
+
+
+emphasis with '*', real link:
+.
+http://cdecl.ridiculousfish.com/?q=int+%28*f%29+%28float+*%29%3B
+.
+<p><a href="http://cdecl.ridiculousfish.com/?q=int+%28*f%29+%28float+*%29%3B">http://cdecl.ridiculousfish.com/?q=int+(*f)+(float+*)%3B</a></p>
+.
+
+emphasis with '_', real link:
+.
+https://www.sell.fi/sites/default/files/elainlaakarilehti/tieteelliset_artikkelit/kahkonen_t._et_al.canine_pancreatitis-_review.pdf
+.
+<p><a href="https://www.sell.fi/sites/default/files/elainlaakarilehti/tieteelliset_artikkelit/kahkonen_t._et_al.canine_pancreatitis-_review.pdf">https://www.sell.fi/sites/default/files/elainlaakarilehti/tieteelliset_artikkelit/kahkonen_t._et_al.canine_pancreatitis-_review.pdf</a></p>
+.
 
 emails
 .