Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion mdformat/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def run(cli_args: Sequence[str]) -> int: # noqa: C901
if not is_md_equal(
original_str,
formatted_str,
ignore_codeclasses=enabled_codeformatter_langs,
enabled_extensions=enabled_parserplugins,
enabled_codeformatters=enabled_codeformatter_langs,
):
sys.stderr.write(
f'Error: Could not format "{path_str}"\n'
Expand Down
85 changes: 72 additions & 13 deletions mdformat/_util.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,80 @@
import re
from typing import Iterable
from html.parser import HTMLParser
from typing import Iterable, List, Optional, Set

from markdown_it import MarkdownIt

import mdformat.plugins

def is_md_equal(md1: str, md2: str, *, ignore_codeclasses: Iterable[str] = ()) -> bool:

def is_md_equal(
md1: str,
md2: str,
*,
enabled_extensions: Iterable[str] = (),
enabled_codeformatters: Iterable[str] = (),
) -> bool:
"""Check if two Markdown produce the same HTML.

Renders HTML from both Markdown strings, strips whitespace and
checks equality. Note that this is not a perfect solution, as there
can be meaningful whitespace in HTML, e.g. in a <code> block.
Renders HTML from both Markdown strings, strip content of tags with
specified classes, and checks equality of the generated ASTs.
"""
html1 = MarkdownIt().render(md1)
html2 = MarkdownIt().render(md2)
html1 = re.sub(r"\s+", "", html1)
html2 = re.sub(r"\s+", "", html2)
for codeclass in ignore_codeclasses:
html1 = re.sub(rf'<codeclass="language-{codeclass}">.*</pre>', "", html1)
html2 = re.sub(rf'<codeclass="language-{codeclass}">.*</pre>', "", html2)
ignore_classes = [f"language-{lang}" for lang in enabled_codeformatters]
mdit = MarkdownIt()
for name in enabled_extensions:
plugin = mdformat.plugins.PARSER_EXTENSIONS[name]
plugin.update_mdit(mdit)
ignore_classes.extend(getattr(plugin, "ignore_classes", []))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is new plugin API and should be documented if we leave it in. What would this be used for though? There aren't many classes in HTML generated from Markdown.

I've been thinking to make a TOC generator plugin where I would need to ignore everything in the HTML that's in between certain types of HTML comments. A regex based ignore system could work there, or a flag that disables equality check altogether, but not sure if a class based system like this would.

Copy link
Contributor Author

@chrisjsewell chrisjsewell Sep 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why inbetween HTML comments? what would the ToC HTML structure look like?
If we drop handle_comment (above), then comments would be ignored anyway?

well you could have something like the amsmath plugin that generates:

<section class="amsmath">
<eqn>
\begin{equation*}
a = 1
\end{equation*}
</eqn>
</section>

and so if you wanted to format the TeX in some way, you would specify ignore_classes = ["amsmath"]. You could even take it one step further and specify CSS selector like syntax, e.g. section.amsmath (this is available in beautifulsoup)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add it to the ParserExtensionInterface though

Copy link
Owner

@hukkin hukkin Sep 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what would the ToC HTML structure look like?

Something like https://github.com/thlorenz/doctoc#specifying-location-of-toc

So you leave these comments in the Markdown and the tool will generate a TOC in between them.

EDIT: ah sry, didn't really answer the question. So the ToC will be a nested Markdown list, and its HTML is the same but rendered to HTML

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I'm thinking maybe just a boolean switch to turn off the Markdown equality check altogether might be the cleanest solution that would work for every possible use case, and with very little effort...?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could the equality check be toggled to only check the equality of the HTML AST structure, rather than its full content, i.e. def handle_data would not store anything in this mode.
This would still check that the formatting does not change the overall tag structure of the HTML (which a plugin should not generally do), but would not worry about its content (which mdformat core and plugins literally all change).

e.g. it would just check that both docs have the structure

<div a="1">
  <other b="2">
  </other>
</div>

rather than check that they are exactly:

<div a="1">
  a b
  c d
  <other b="2">
  x y z
  </other>
</div>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The formatter in theory should never alter AST/rendered markdown

and yeh just to emphasise; in some sense the formatter always alters the HTML AST, if you are taking into account the content of each tag, e.g. stripping whitespace in this context is altering the AST.

what the formatter should not do (in theory) is alter the HTML AST structure

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could the equality check be toggled to only check the equality of the HTML AST structure, rather than its full content

This would also require an addition to the API, which I'd rather not do. I've thought about possibly even removing the equality check once mdformat proves to be stable and bug free (since it makes the tool render 3 docs instead of just 1). I'd prefer the API to not be littered with stuff related.

This would still check that the formatting does not change the overall tag structure of the HTML (which a plugin should not generally do), but would not worry about its content (which mdformat core and plugins literally all change).

stripping whitespace in this context is altering the AST.

I mean CommonMark AST, which AFAIK should completely ignore leading or trailing whitespace. The HTML rendering is just a tool used to check that.
Besides leading/trailing whitespace the changes mdformat does are mostly just adding escape characters and converting characters to their numeric representation (which AFAIK should not change HTML in any way).

what the formatter should not do (in theory) is alter the HTML AST structure

It's more strict than this. All ASTs aside, what I really want is the rendered HTML to look precisely the same before and after running mdformat (core). That means that the changes mentioned above are fine (when applied to paragraphs), but most other changes to content (e.g. any changes to code blocks) are not. Plugins are free to do what they want of course.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've thought about possibly even removing the equality check

I don't know, I quite like it. You probably already know, but black has a similar equivalence check
https://github.com/psf/black/blob/172c0a78facd8f938629f216c95507bbfeddfe5e/src/black/__init__.py#L944

I mean CommonMark AST, which AFAIK should completely ignore leading or trailing whitespace.

I'm not sure if this is correct. I mean there's not really a CommonMark AST anyway, just a specification of what HTML it should produce.

It would just be nice if you didn't have to completely throw away this check, due to the use of one plugin.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this is correct. I mean there's not really a CommonMark AST anyway, just a specification of what HTML it should produce.

I'm pretty sure every language can be parsed into a syntax tree, even if there's not an established format how it should look like. According to CommonMark spec The paragraph’s raw content is formed by concatenating the lines and removing initial and final whitespace., so a parser will definitely remove leading/trailing whitespace and it won't be in this syntax tree.

It would just be nice if you didn't have to completely throw away this check, due to the use of one plugin.

With the CHANGES_AST flag, you don't have throw it away, unless the plugin makes changes to how the rendered Markdown looks like. That seems like simple and reasonable spec to me.

I think at this point we should just agree to disagree? 😄 ☮️

html1 = HTML2AST().parse(mdit.render(md1), ignore_classes)
html2 = HTML2AST().parse(mdit.render(md2), ignore_classes)

return html1 == html2


class HTML2AST(HTMLParser):
"""Parser HTML to AST."""

def parse(self, text: str, strip_classes: Iterable[str] = ()) -> List[dict]:
self.tree: List[dict] = []
self.current: Optional[dict] = None
self.feed(text)
self.strip_classes(self.tree, set(strip_classes))
return self.tree

def strip_classes(self, tree: List[dict], classes: Set[str]) -> List[dict]:
"""Strip content from tags with certain classes."""
items = []
for item in tree:
if set(item["attrs"].get("class", "").split()).intersection(classes):
items.append({"tag": item["tag"], "attrs": item["attrs"]})
continue
items.append(item)
item["children"] = self.strip_classes(item.get("children", []), classes)
if not item["children"]:
item.pop("children")

return items

def handle_starttag(self, tag: str, attrs: list) -> None:
tag_item = {"tag": tag, "attrs": dict(attrs), "parent": self.current}
if self.current is None:
self.tree.append(tag_item)
else:
children = self.current.setdefault("children", [])
children.append(tag_item)
self.current = tag_item

def handle_endtag(self, tag: str) -> None:
# walk up the tree to the tag's parent
while self.current is not None:
if self.current["tag"] == tag:
self.current = self.current.pop("parent")
break
self.current = self.current.pop("parent")

def handle_data(self, data: str) -> None:
# ignore data outside tabs
if self.current is not None:
# ignore empty lines and trailing whitespace
self.current["data"] = [
li.rstrip() for li in data.splitlines() if li.strip()
]
5 changes: 5 additions & 0 deletions mdformat/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ def render_token(
"""
return None

def ignore_classes(self) -> List[str]:
"""Return CSS classes to ignore when comparing the input/output HTML
equality."""
return []


def _load_parser_extensions() -> Dict[str, ParserExtensionInterface]:
parser_extension_entrypoints = importlib_metadata.entry_points().get(
Expand Down
7 changes: 5 additions & 2 deletions mdformat/renderer/_token_renderers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""A namespace for functions that render the Markdown of tokens from markdown-
it-py."""
import logging
from typing import List, Optional

from markdown_it.token import Token
Expand All @@ -12,6 +13,8 @@
longest_consecutive_sequence,
)

LOGGER = logging.getLogger(__name__)


def default(tokens: List[Token], idx: int, options: dict, env: dict) -> str:
"""Default formatter for tokens that don't have one implemented."""
Expand Down Expand Up @@ -101,10 +104,10 @@ def fence(tokens: List[Token], idx: int, options: dict, env: dict) -> str:
fmt_func = options["codeformatters"][lang]
try:
code_block = fmt_func(code_block, info_str)
except Exception:
except Exception as err:
# Swallow exceptions so that formatter errors (e.g. due to
# invalid code) do not crash mdformat.
pass
LOGGER.warning(f"Code formatting of '{lang}' failed: {err}")

# The code block must not include as long or longer sequence of `fence_char`s
# as the fence string itself
Expand Down
13 changes: 13 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys

from mdformat._cli import run
from mdformat.plugins import CODEFORMATTERS

UNFORMATTED_MARKDOWN = "\n\n# A header\n\n"
FORMATTED_MARKDOWN = "# A header\n"
Expand Down Expand Up @@ -63,6 +64,18 @@ def test_check__multi_fail(capsys, tmp_path):
assert str(file_path2) in captured.err


def example_formatter(code, info):
return "dummy\n"


def test_formatter_plugin(tmp_path, monkeypatch):
monkeypatch.setitem(CODEFORMATTERS, "lang", example_formatter)
file_path = tmp_path / "test_markdown.md"
file_path.write_text("```lang\nother\n```\n")
assert run((str(file_path),)) == 0
assert file_path.read_text() == "```lang\ndummy\n```\n"


def test_dash_stdin(capsys, monkeypatch):
monkeypatch.setattr(sys, "stdin", StringIO(UNFORMATTED_MARKDOWN))
run(("-",))
Expand Down
27 changes: 26 additions & 1 deletion tests/test_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,35 @@
import yaml

import mdformat
from mdformat.plugins import PARSER_EXTENSIONS
from mdformat.plugins import CODEFORMATTERS, PARSER_EXTENSIONS
from mdformat.renderer import MARKERS, MDRenderer


def example_formatter(code, info):
return "dummy\n"


def test_code_formatter(monkeypatch):
monkeypatch.setitem(CODEFORMATTERS, "lang", example_formatter)
text = mdformat.text(
dedent(
"""\
```lang
a
```
"""
),
codeformatters={"lang"},
)
assert text == dedent(
"""\
```lang
dummy
```
"""
)


class ExampleFrontMatterPlugin:
"""A class for extending the base parser."""

Expand Down
56 changes: 56 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from mdformat._util import HTML2AST


def test_html2ast():
data = HTML2AST().parse('<div><p class="x">a<s>j</s></p></div><a>b</a>')
assert data == [
{
"tag": "div",
"attrs": {},
"children": [
{
"tag": "p",
"attrs": {"class": "x"},
"data": ["a"],
"children": [{"tag": "s", "attrs": {}, "data": ["j"]}],
}
],
},
{"tag": "a", "attrs": {}, "data": ["b"]},
]


def test_html2ast_multiline():
data = HTML2AST().parse("<div>a\nb \nc \n\n</div>")
assert data == [{"tag": "div", "attrs": {}, "data": ["a", "b", "c"]}]


def test_html2ast_nested():
data = HTML2AST().parse("<a d=1>b<a d=2>c<a d=3>e</a></a></a>")
assert data == [
{
"tag": "a",
"attrs": {"d": "1"},
"data": ["b"],
"children": [
{
"tag": "a",
"attrs": {"d": "2"},
"data": ["c"],
"children": [{"tag": "a", "attrs": {"d": "3"}, "data": ["e"]}],
}
],
}
]


def test_html2ast_strip():
data = HTML2AST().parse('<div><p class="x y">a<s>j</s></p></div><a>b</a>', {"x"})
assert data == [
{
"tag": "div",
"attrs": {},
"children": [{"tag": "p", "attrs": {"class": "x y"}}],
},
{"tag": "a", "attrs": {}, "data": ["b"]},
]