Skip to content

Commit

Permalink
♻️ Refactor: Add linkifier rule to inline chain for full links (#279)
Browse files Browse the repository at this point in the history
Fixes collision of emphasis and linkifier
(so `http://example.org/foo._bar_-_baz` is now a single link, not emphasized).
Emails and fuzzy links are not affected by this.

Implements upstream: markdown-it/markdown-it@6b58ec4
  • Loading branch information
chrisjsewell committed Jun 2, 2023
1 parent ba96f34 commit ea27cc8
Show file tree
Hide file tree
Showing 11 changed files with 234 additions and 30 deletions.
12 changes: 12 additions & 0 deletions markdown_it/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,15 @@ def normalizeReference(string: str) -> str:
# most notably, `__proto__`)
#
return string.lower().upper()


# Matches an opening anchor tag at the very start of a string: ``<a`` followed
# by either ``>`` or whitespace (so ``<abbr>`` is not mistaken for a link).
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# Matches a closing anchor tag at the very start of a string, allowing
# whitespace before the final ``>``.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return True if ``string`` begins with an opening ``<a ...>`` tag.

    The comparison is case-insensitive.
    """
    # The pattern is anchored with ``^`` and compiled without MULTILINE, so
    # matching from the start of the string is all that is required.
    return LINK_OPEN_RE.match(string) is not None


def isLinkClose(string: str) -> bool:
    """Return True if ``string`` begins with a closing ``</a>`` tag.

    The comparison is case-insensitive.
    """
    return LINK_CLOSE_RE.match(string) is not None
1 change: 1 addition & 0 deletions markdown_it/parser_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# Parser rules
_rules: list[tuple[str, RuleFunc]] = [
("text", rules_inline.text),
("linkify", rules_inline.linkify),
("newline", rules_inline.newline),
("escape", rules_inline.escape),
("backticks", rules_inline.backtick),
Expand Down
2 changes: 1 addition & 1 deletion markdown_it/presets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def make() -> PresetType:
config = commonmark.make()
config["components"]["core"]["rules"].append("linkify")
config["components"]["block"]["rules"].append("table")
config["components"]["inline"]["rules"].append("strikethrough")
config["components"]["inline"]["rules"].extend(["strikethrough", "linkify"])
config["components"]["inline"]["rules2"].append("strikethrough")
config["options"]["linkify"] = True
config["options"]["html"] = True
Expand Down
66 changes: 37 additions & 29 deletions markdown_it/rules_core/linkify.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,32 @@
from __future__ import annotations

import re
from typing import Protocol

from ..common.utils import arrayReplaceAt
from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
from ..token import Token
from .state_core import StateCore

LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)

HTTP_RE = re.compile(r"^http://")
MAILTO_RE = re.compile(r"^mailto:")
TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
return bool(LINK_OPEN_RE.search(string))


def isLinkClose(string: str) -> bool:
return bool(LINK_CLOSE_RE.search(string))


def linkify(state: StateCore) -> None:
blockTokens = state.tokens

"""Rule for identifying plain-text links."""
if not state.md.options.linkify:
return

if not state.md.linkify:
raise ModuleNotFoundError("Linkify enabled but not installed.")

for j in range(len(blockTokens)):
if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
blockTokens[j].content
for inline_token in state.tokens:
if inline_token.type != "inline" or not state.md.linkify.pretest(
inline_token.content
):
continue

tokens = blockTokens[j].children
tokens = inline_token.children

htmlLinkLevel = 0

Expand Down Expand Up @@ -71,38 +62,47 @@ def linkify(state: StateCore) -> None:
currentToken.content
):
text = currentToken.content
links = state.md.linkify.match(text)
links: list[_LinkType] = state.md.linkify.match(text) or []

# Now split string to nodes
nodes = []
level = currentToken.level
lastPos = 0

for ln in range(len(links)):
url = links[ln].url
# forbid escape sequence at the start of the string,
# this avoids http\://example.com/ from being linkified as
# http:<a href="//example.com/">//example.com/</a>
if (
links
and links[0].index == 0
and i > 0
and tokens[i - 1].type == "text_special"
):
links = links[1:]

for link in links:
url = link.url
fullUrl = state.md.normalizeLink(url)
if not state.md.validateLink(fullUrl):
continue

urlText = links[ln].text
urlText = link.text

# Linkifier might send raw hostnames like "example.com", where url
# starts with domain name. So we prepend http:// in those cases,
# and remove it afterwards.
if not links[ln].schema:
if not link.schema:
urlText = HTTP_RE.sub(
"", state.md.normalizeLinkText("http://" + urlText)
)
elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
urlText
):
elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
urlText = MAILTO_RE.sub(
"", state.md.normalizeLinkText("mailto:" + urlText)
)
else:
urlText = state.md.normalizeLinkText(urlText)

pos = links[ln].index
pos = link.index

if pos > lastPos:
token = Token("text", "", 0)
Expand Down Expand Up @@ -130,12 +130,20 @@ def linkify(state: StateCore) -> None:
token.info = "auto"
nodes.append(token)

lastPos = links[ln].last_index
lastPos = link.last_index

if lastPos < len(text):
token = Token("text", "", 0)
token.content = text[lastPos:]
token.level = level
nodes.append(token)

blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)


class _LinkType(Protocol):
    """Structural type of a single match returned by ``state.md.linkify.match``.

    Only the attributes read by the core ``linkify`` rule are declared here.
    """

    # Link target; passed through ``normalizeLink`` and ``validateLink``.
    url: str
    # Matched text; normalised before being used as the link's visible content.
    text: str
    # Offset in the scanned text where the match begins.
    index: int
    # Offset in the scanned text where scanning resumes after the match.
    last_index: int
    # Scheme of the match (compared against ``"mailto:"``); falsy when the
    # match has no scheme, in which case ``http://`` is temporarily prepended.
    schema: str | None
2 changes: 2 additions & 0 deletions markdown_it/rules_inline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"text",
"fragments_join",
"link_pairs",
"linkify",
"escape",
"newline",
"backtick",
Expand All @@ -24,6 +25,7 @@
from .html_inline import html_inline
from .image import image
from .link import link
from .linkify import linkify
from .newline import newline
from .state_inline import StateInline
from .text import text
6 changes: 6 additions & 0 deletions markdown_it/rules_inline/html_inline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Process html tags
from ..common.html_re import HTML_TAG_RE
from ..common.utils import isLinkClose, isLinkOpen
from .state_inline import StateInline


Expand Down Expand Up @@ -33,5 +34,10 @@ def html_inline(state: StateInline, silent: bool) -> bool:
token = state.push("html_inline", "", 0)
token.content = state.src[pos : pos + len(match.group(0))]

if isLinkOpen(token.content):
state.linkLevel += 1
if isLinkClose(token.content):
state.linkLevel -= 1

state.pos += len(match.group(0))
return True
2 changes: 2 additions & 0 deletions markdown_it/rules_inline/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ def link(state: StateInline, silent: bool) -> bool:
if label and state.md.options.get("store_labels", False):
token.meta["label"] = label

state.linkLevel += 1
state.md.inline.tokenize(state)
state.linkLevel -= 1

token = state.push("link_close", "a", -1)

Expand Down
61 changes: 61 additions & 0 deletions markdown_it/rules_inline/linkify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Process links like https://example.org/"""
import re

from .state_inline import StateInline

# RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
# The pattern is anchored at the END of the searched text: it captures a scheme
# that finishes the text and is either at its very start or preceded by a
# character that cannot itself belong to a scheme.
SCHEME_RE = re.compile(r"(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$", re.IGNORECASE)


def linkify(state: StateInline, silent: bool) -> bool:
    """Rule for identifying plain-text links.

    The rule fires when the current position is at ``://``. The scheme is then
    expected at the end of ``state.pending`` (the text accumulated so far), and
    the full link is matched from the start of that scheme.

    :param state: inline parser state; ``pending`` and ``pos`` are updated and
        tokens are pushed when a link is recognised.
    :param silent: when True, only validate and advance ``state.pos`` without
        touching ``state.pending`` or pushing tokens.
    :return: True if a link was recognised at the current position.
    :raises ModuleNotFoundError: if the ``linkify`` option is enabled but no
        linkifier is available on ``state.md``.
    """
    if not state.md.options.linkify:
        return False
    # Do not auto-link inside an existing link.
    if state.linkLevel > 0:
        return False
    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")

    pos = state.pos
    maximum = state.posMax

    if (pos + 3) > maximum or state.src[pos : pos + 3] != "://":
        return False

    # ``search`` (not ``match``) is required here: the scheme sits at the END
    # of the pending text and is usually preceded by other characters, whereas
    # ``match`` would force the pattern to begin at the start of ``pending``.
    scheme_match = SCHEME_RE.search(state.pending)
    if scheme_match is None:
        return False

    proto = scheme_match.group(1)
    link = state.md.linkify.match_at_start(state.src[pos - len(proto) :])
    if not link:
        return False
    url: str = link.url

    # disallow '*' at the end of the link (conflicts with emphasis)
    url = url.rstrip("*")

    # Reject a link that is not longer than its scheme; otherwise the position
    # update below would not move forward.
    if len(url) <= len(proto):
        return False

    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    if not silent:
        # The scheme is already in the pending text; remove it so that it
        # becomes part of the link instead of preceding it.
        state.pending = state.pending[: -len(proto)]

        token = state.push("link_open", "a", 1)
        token.attrs = {"href": full_url}
        token.markup = "linkify"
        token.info = "auto"

        token = state.push("text", "", 0)
        token.content = state.md.normalizeLinkText(url)

        token = state.push("link_close", "a", -1)
        token.markup = "linkify"
        token.info = "auto"

    # ``pos`` points just after the scheme, so skip only the rest of the URL.
    state.pos += len(url) - len(proto)
    return True
4 changes: 4 additions & 0 deletions markdown_it/rules_inline/state_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ def __init__(
self.backticks: dict[int, int] = {}
self.backticksScanned = False

# Counter used to disable inline linkify-it execution
# inside <a> and markdown links
self.linkLevel = 0

def __repr__(self) -> str:
return (
f"{self.__class__.__name__}"
Expand Down
1 change: 1 addition & 0 deletions tests/test_api/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_get_rules():
],
"inline": [
"text",
"linkify",
"newline",
"escape",
"backticks",
Expand Down

0 comments on commit ea27cc8

Please sign in to comment.