From 248044b00e486403c0e5c12fa14a1b53e22a5f92 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 3 Mar 2021 18:20:50 +0100 Subject: [PATCH 1/4] =?UTF-8?q?=F0=9F=91=8C=20IMPROVE:=20MarkdownIt=20conf?= =?UTF-8?q?ig=20and=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add additional configuration presets, allow for options tp be overriden in the `MarkdownIt` initialisation, Add convenience methods to `SyntaxTreeNode` --- .pre-commit-config.yaml | 1 + .readthedocs.yml | 1 + docs/conf.py | 8 ++- docs/using.md | 110 ++++++++++++++++++++++++++---- markdown_it/main.py | 74 ++++++++++++-------- markdown_it/port.yaml | 5 ++ markdown_it/presets/commonmark.py | 9 ++- markdown_it/renderer.py | 2 +- markdown_it/tree.py | 29 ++++++++ tests/test_api/test_main.py | 7 ++ tests/test_tree.py | 18 +++++ tests/test_tree/test_pretty.xml | 30 ++++++++ tox.ini | 2 +- 13 files changed, 251 insertions(+), 45 deletions(-) create mode 100644 tests/test_tree/test_pretty.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0a91d16f..fe1d48ce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ exclude: > test.*\.md| test.*\.txt| test.*\.html| + test.*\.xml| .*commonmark\.json| benchmark/.*\.md| .*/spec\.md diff --git a/.readthedocs.yml b/.readthedocs.yml index a2bb8862..cb2ff6ae 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -6,6 +6,7 @@ python: - method: pip path: . extra_requirements: + - linkify - rtd sphinx: diff --git a/docs/conf.py b/docs/conf.py index a401f841..f3f7befb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,7 +45,13 @@ # This pattern also affects html_static_path and html_extra_path. 
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] -nitpick_ignore = [("py:class", "Match"), ("py:class", "x in the interval [0, 1).")] +nitpick_ignore = [ + ("py:class", "Match"), + ("py:class", "x in the interval [0, 1)."), + ("py:class", "markdown_it.helpers.parse_link_destination._Result"), + ("py:class", "markdown_it.helpers.parse_link_title._Result"), + ("py:class", "MarkdownIt"), +] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/using.md b/docs/using.md index 70dcf71b..fa61fd44 100644 --- a/docs/using.md +++ b/docs/using.md @@ -28,6 +28,7 @@ then these are converted to other formats using 'renderers'. The simplest way to understand how text will be parsed is using: ```{code-cell} +from pprint import pprint from markdown_it import MarkdownIt ``` @@ -48,8 +49,15 @@ for token in md.parse("some *text*"): The `MarkdownIt` class is instantiated with parsing configuration options, dictating the syntax rules and additional options for the parser and renderer. -You can define this configuration *via* a preset name (`'zero'`, `'commonmark'` or `'default'`), -or by directly supplying a dictionary. +You can define this configuration *via* directly supplying a dictionary or a preset name: + +- `zero`: This configures the minimum components to parse text (i.e. just paragraphs and text) +- `commonmark` (default): This configures the parser to strictly comply with the [CommonMark specification](http://spec.commonmark.org/). +- `js-default`: This is the default in the JavaScript version. + Compared to `commonmark`, it disables HTML parsing and enables the table and strikethrough components. +- `gfm-like`: This configures the parser to approximately comply with the [GitHub Flavored Markdown specification](https://github.github.com/gfm/). + Compared to `commonmark`, it disables HTML parsing and enables the table, strikethrough and linkify components. 
+ **Important**, to use this configuration you must have `linkify-it-py` installed. ```{code-cell} from markdown_it.presets import zero @@ -61,18 +69,26 @@ md = MarkdownIt("zero") md.options ``` +You can also override specific options: + ```{code-cell} -print(md.get_active_rules()) +md = MarkdownIt("zero", {"maxNesting": 99}) +md.options ``` ```{code-cell} -print(md.get_all_rules()) +pprint(md.get_active_rules()) ``` You can find all the parsing rules in the source code: `parser_core.py`, `parser_block.py`, `parser_inline.py`. -Any of the parsing rules can be enabled/disabled, and these methods are chainable: + +```{code-cell} +pprint(md.get_all_rules()) +``` + +Any of the parsing rules can be enabled/disabled, and these methods are "chainable": ```{code-cell} md.render("- __*emphasise this*__") @@ -97,6 +113,50 @@ Additionally `renderInline` runs the parser with all block syntax rules disabled md.renderInline("__*emphasise this*__") ``` +### Typographic components + +The `smartquotes` and `replacements` components are intended to improve typography: + +`smartquotes` will convert basic quote marks to their opening and closing variants: + +- 'single quotes' -> ‘single quotes’ +- "double quotes" -> “double quotes” + +`replacements` will replace particular text constructs: + +- ``(c)``, ``(C)`` → © +- ``(tm)``, ``(TM)`` → ™ +- ``(r)``, ``(R)`` → ® +- ``(p)``, ``(P)`` → § +- ``+-`` → ± +- ``...`` → … +- ``?....`` → ?.. +- ``!....`` → !.. +- ``????????`` → ??? +- ``!!!!!`` → !!! +- ``,,,`` → , +- ``--`` → &ndash +- ``---`` → &mdash + +Both of these components require typography to be turned on, as well as the components enabled: + +```{code-cell} +md = MarkdownIt("commonmark", {"typographer": True}) +md.enable(["replacements", "smartquotes"]) +md.render("'single quotes' (c)") +``` + +### Linkify + +The `linkify` component requires that [linkify-it-py](https://github.com/tsutsu3/linkify-it-py) be installed (e.g. *via* `pip install markdown-it-py[linkify]`). 
+This allows URL links to be identified, without the need for enclosing in `<>` brackets: + +```{code-cell} +md = MarkdownIt("commonmark", {"linkify": True}) +md.enable(["linkify"]) +md.render("github.com") +``` + ### Plugins load Plugins load collections of additional syntax rules and render methods into the parser @@ -130,7 +190,6 @@ md.render(text) ## The Token Stream - +++ Before rendering, the text is parsed to a flat token stream of block level syntax elements, with nesting defined by opening (1) and closing (-1) attributes: @@ -183,22 +242,45 @@ This dictionary can also be deserialized: Token.from_dict(tokens[1].as_dict()) ``` -In some use cases `nest_tokens` may be useful, to collapse the opening/closing tokens into single tokens: +### Creating a syntax tree + +```{versionchanged} 0.7.0 +`nest_tokens` and `NestedTokens` are deprecated and replaced by `SyntaxTreeNode`. +``` + +In some use cases it may be useful to convert the token stream into a syntax tree, +with opening/closing tokens collapsed into a single token that contains children. ```{code-cell} -from markdown_it.token import nest_tokens -nested_tokens = nest_tokens(tokens) -[t.type for t in nested_tokens] +from markdown_it.tree import SyntaxTreeNode + +md = MarkdownIt("commonmark") +tokens = md.parse(""" +# Header + +Here's some text and an image ![title](image.png) + +1. a **list** + +> a *quote* +""") + +node = SyntaxTreeNode.from_tokens(tokens) +print(node.pretty(indent=2, show_text=True)) ``` -This introduces a single additional class `NestedTokens`, -containing an `opening`, `closing` and `children`, which can be a list of mixed -`Token` and `NestedTokens`. 
+You can then use methods to traverse the tree ```{code-cell} -nested_tokens[0] +node.children ``` +```{code-cell} +print(node[0]) +node[0].next_sibling +``` + + ## Renderers +++ diff --git a/markdown_it/main.py b/markdown_it/main.py index b96d52b1..acf29bac 100644 --- a/markdown_it/main.py +++ b/markdown_it/main.py @@ -27,36 +27,51 @@ linkify_it = None -_PRESETS = AttrDict( - { - "default": presets.default.make(), - "zero": presets.zero.make(), - "commonmark": presets.commonmark.make(), - } -) +_PRESETS = { + "default": presets.default.make(), + "zero": presets.zero.make(), + "commonmark": presets.commonmark.make(), +} +_PRESETS["js-default"] = _PRESETS["default"] +_PRESETS["gfm-like"] = presets.default.make() +_PRESETS["gfm-like"]["options"]["linkify"] = True class MarkdownIt: def __init__( - self, config: Union[str, Mapping] = "commonmark", renderer_cls=RendererHTML + self, + config: Union[str, Mapping] = "commonmark", + options_update: Optional[Mapping] = None, + *, + renderer_cls=RendererHTML, ): """Main parser class :param config: name of configuration to load or a pre-defined dictionary + :param options_update: dictionary that will be merged into ``config["options"]`` :param renderer_cls: the class to load as the renderer: ``self.renderer = renderer_cls(self) """ + # add modules + self.utils = utils + self.helpers: Any = helpers + + # initialise classes self.inline = ParserInline() self.block = ParserBlock() self.core = ParserCore() self.renderer = renderer_cls(self) + self.linkify = linkify_it.LinkifyIt() if linkify_it else None - self.utils = utils - self.helpers: Any = helpers + # set the configuration + if options_update and not isinstance(options_update, Mapping): + # catch signature change where renderer_cls was not used as a key-word + raise TypeError( + f"options_update should be a mapping: {options_update}" + "\n(Perhaps you intended this to be the renderer_cls?)" + ) self.options = AttrDict() - self.configure(config) - - self.linkify = 
linkify_it.LinkifyIt() if linkify_it else None + self.configure(config, options_update=options_update) def __repr__(self) -> str: return f"{self.__class__.__module__}.{self.__class__.__name__}()" @@ -79,7 +94,9 @@ def set(self, options: AttrDict) -> None: """ self.options = options - def configure(self, presets: Union[str, Mapping]) -> "MarkdownIt": + def configure( + self, presets: Union[str, Mapping], options_update: Optional[Mapping] = None + ) -> "MarkdownIt": """Batch load of all options and component settings. This is an internal method, and you probably will not need it. But if you will - see available presets and data structure @@ -89,21 +106,24 @@ def configure(self, presets: Union[str, Mapping]) -> "MarkdownIt": That will give better compatibility with next versions. """ if isinstance(presets, str): - presetName = presets - presets = _PRESETS.get(presetName, None) - if not presets: - raise KeyError( - 'Wrong `markdown-it` preset "' + presetName + '", check name' - ) - if not presets: - raise ValueError("Wrong `markdown-it` preset, can't be empty") - config = AttrDict(presets) - - if "options" in config: - self.set(config.options) + config = _PRESETS.get(presets, None) + if not config: + raise KeyError(f"Wrong `markdown-it` preset '{presets}', check name") + else: + config = presets + + if not config: + raise ValueError("Wrong `markdown-it` config, can't be empty") + + options = config.get("options", {}) or {} + if options_update: + options = {**options, **options_update} + + if options: + self.set(AttrDict(options)) if "components" in config: - for name, component in config.components.items(): + for name, component in config["components"].items(): rules = component.get("rules", None) if rules: self[name].ruler.enableOnly(rules) diff --git a/markdown_it/port.yaml b/markdown_it/port.yaml index 5cfb82a7..dc932797 100644 --- a/markdown_it/port.yaml +++ b/markdown_it/port.yaml @@ -23,6 +23,11 @@ In markdown_it/rules_block/reference.py, record line range in 
state.env["references"] and add state.env["duplicate_refs"] This is to allow renderers to report on issues regarding references + - | + The `MarkdownIt.__init__` signature is slightly different for updating options, + since you must always specify the config first, e.g. + use `MarkdownIt("commonmark", {"html": False})` instead of `MarkdownIt({"html": False})` + - The default configuration preset for `MarkdownIt` if "commonmark" not "default" - Allow custom renderer to be passed to `MarkdownIt` - | change render method signatures diff --git a/markdown_it/presets/commonmark.py b/markdown_it/presets/commonmark.py index 603393cc..c17fdd14 100644 --- a/markdown_it/presets/commonmark.py +++ b/markdown_it/presets/commonmark.py @@ -1,4 +1,11 @@ -"""Commonmark default options.""" +"""Commonmark default options. + +This differs to presets.default, +primarily in that it allows HTML and does not enable components: + +- block: table +- inline: strikethrough +""" def make(): diff --git a/markdown_it/renderer.py b/markdown_it/renderer.py index 24b51b14..aa24f08b 100644 --- a/markdown_it/renderer.py +++ b/markdown_it/renderer.py @@ -34,7 +34,7 @@ def strong_open(self, tokens, idx, options, env): def strong_close(self, tokens, idx, options, env): return '' - md = MarkdownIt(renderer=CustomRenderer) + md = MarkdownIt(renderer_cls=CustomRenderer) result = md.render(...) diff --git a/markdown_it/tree.py b/markdown_it/tree.py index c0036139..0a8fc82e 100644 --- a/markdown_it/tree.py +++ b/markdown_it/tree.py @@ -2,6 +2,7 @@ This module is not part of upstream JavaScript markdown-it. """ +import textwrap from typing import NamedTuple, Sequence, Tuple, Dict, List, Optional, Any from .token import Token @@ -44,6 +45,12 @@ def __init__(self) -> None: # children (i.e. 
inline or img) self.children: List["SyntaxTreeNode"] = [] + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.type})" + + def __getitem__(self, item: int) -> "SyntaxTreeNode": + return self.children[item] + @classmethod def from_tokens(cls, tokens: Sequence[Token]) -> "SyntaxTreeNode": """Instantiate a `SyntaxTreeNode` from a token stream. @@ -76,6 +83,11 @@ def recursive_collect_tokens( recursive_collect_tokens(self, tokens) return tokens + @property + def is_root(self) -> bool: + """Is the node a special root node?""" + return not (self.token or self.nester_tokens) + @property def is_nested(self) -> bool: """Is this node nested?. @@ -183,6 +195,23 @@ def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None: ) child._set_children_from_tokens(nested_tokens[1:-1]) + def pretty( + self, *, indent: int = 2, show_text: bool = False, _current: int = 0 + ) -> str: + """Create an XML style string of the tree.""" + prefix = " " * _current + text = prefix + f"<{self.type}" + if not self.is_root and self.attrs: + text += " " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items()) + text += ">" + if show_text and not self.is_root and self.type == "text" and self.content: + text += "\n" + textwrap.indent(self.content, prefix + " " * indent) + for child in self.children: + text += "\n" + child.pretty( + indent=indent, show_text=show_text, _current=_current + indent + ) + return text + # NOTE: # The values of the properties defined below directly map to properties # of the underlying `Token`s. 
A root node does not translate to a `Token` diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py index ad220cd3..d6995c39 100644 --- a/tests/test_api/test_main.py +++ b/tests/test_api/test_main.py @@ -85,6 +85,13 @@ def test_load_presets(): } +def test_override_options(): + md = MarkdownIt("zero") + assert md.options["maxNesting"] == 20 + md = MarkdownIt("zero", {"maxNesting": 99}) + assert md.options["maxNesting"] == 99 + + def test_enable(): md = MarkdownIt("zero").enable("heading") assert md.get_active_rules() == { diff --git a/tests/test_tree.py b/tests/test_tree.py index d6dadec9..bd04527e 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -37,6 +37,7 @@ def test_type(): assert tree.type == "root" # "_open" suffix must be stripped from nested token type assert tree.children[0].type == "heading" + assert tree[0].type == "heading" # For unnested tokens, node type must remain same as token type assert tree.children[0].children[0].type == "inline" @@ -54,3 +55,20 @@ def test_sibling_traverse(): assert another_text_node.next_sibling is None assert another_text_node.previous_sibling.previous_sibling == text_node assert text_node.previous_sibling is None + + +def test_pretty(file_regression): + md = MarkdownIt("commonmark") + tokens = md.parse( + """ +# Header + +Here's some text and an image ![title](image.png) + +1. a **list** + +> a *quote* + """ + ) + node = SyntaxTreeNode.from_tokens(tokens) + file_regression.check(node.pretty(indent=2, show_text=True), extension=".xml") diff --git a/tests/test_tree/test_pretty.xml b/tests/test_tree/test_pretty.xml new file mode 100644 index 00000000..41d399ef --- /dev/null +++ b/tests/test_tree/test_pretty.xml @@ -0,0 +1,30 @@ + + + + + Header + + + + Here's some text and an image + + + title + + + + + + a + + + list + +
+ + + + a + + + quote \ No newline at end of file diff --git a/tox.ini b/tox.ini index 855aa8e6..fd9a80e6 100644 --- a/tox.ini +++ b/tox.ini @@ -28,7 +28,7 @@ extras = testing commands = pytest benchmarking/bench_plugins.py {posargs} [testenv:docs-{update,clean}] -extras = rtd +extras = linkify,rtd whitelist_externals = rm setenv = update: SKIP_APIDOC = true From 2b5bce24f6f990724d4302cc7a4ab84bda50bbad Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 4 Mar 2021 00:12:36 +0100 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Taneli Hukkinen --- docs/using.md | 2 +- markdown_it/main.py | 4 ++-- markdown_it/port.yaml | 2 +- markdown_it/tree.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/using.md b/docs/using.md index fa61fd44..2928f688 100644 --- a/docs/using.md +++ b/docs/using.md @@ -149,7 +149,7 @@ md.render("'single quotes' (c)") ### Linkify The `linkify` component requires that [linkify-it-py](https://github.com/tsutsu3/linkify-it-py) be installed (e.g. *via* `pip install markdown-it-py[linkify]`). -This allows URL links to be identified, without the need for enclosing in `<>` brackets: +This allows URI autolinks to be identified, without the need for enclosing in `<>` brackets: ```{code-cell} md = MarkdownIt("commonmark", {"linkify": True}) diff --git a/markdown_it/main.py b/markdown_it/main.py index acf29bac..4579bb90 100644 --- a/markdown_it/main.py +++ b/markdown_it/main.py @@ -106,9 +106,9 @@ def configure( That will give better compatibility with next versions. 
""" if isinstance(presets, str): - config = _PRESETS.get(presets, None) - if not config: + if presets not in _PRESETS: raise KeyError(f"Wrong `markdown-it` preset '{presets}', check name") + config = _PRESETS[presets] else: config = presets diff --git a/markdown_it/port.yaml b/markdown_it/port.yaml index dc932797..aa0ace83 100644 --- a/markdown_it/port.yaml +++ b/markdown_it/port.yaml @@ -27,7 +27,7 @@ The `MarkdownIt.__init__` signature is slightly different for updating options, since you must always specify the config first, e.g. use `MarkdownIt("commonmark", {"html": False})` instead of `MarkdownIt({"html": False})` - - The default configuration preset for `MarkdownIt` if "commonmark" not "default" + - The default configuration preset for `MarkdownIt` is "commonmark" not "default" - Allow custom renderer to be passed to `MarkdownIt` - | change render method signatures diff --git a/markdown_it/tree.py b/markdown_it/tree.py index 0a8fc82e..e531cb28 100644 --- a/markdown_it/tree.py +++ b/markdown_it/tree.py @@ -46,7 +46,7 @@ def __init__(self) -> None: self.children: List["SyntaxTreeNode"] = [] def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.type})" + return f"{type(self).__name__}({self.type})" def __getitem__(self, item: int) -> "SyntaxTreeNode": return self.children[item] From bc3a2efae6660a1a27ff28b540d6a7ee2bd7c430 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 4 Mar 2021 00:41:32 +0100 Subject: [PATCH 3/4] add review suggestions --- .pre-commit-config.yaml | 1 + docs/using.md | 3 +-- markdown_it/main.py | 5 ++--- markdown_it/presets/__init__.py | 24 ++++++++++++++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fe1d48ce..a77e4eae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,7 @@ repos: rev: 3.8.4 hooks: - id: flake8 + additional_dependencies: [flake8-bugbear==21.3.1] - repo: https://github.com/psf/black rev: 20.8b1 
diff --git a/docs/using.md b/docs/using.md index 2928f688..ec21b6bc 100644 --- a/docs/using.md +++ b/docs/using.md @@ -56,7 +56,7 @@ You can define this configuration *via* directly supplying a dictionary or a pre - `js-default`: This is the default in the JavaScript version. Compared to `commonmark`, it disables HTML parsing and enables the table and strikethrough components. - `gfm-like`: This configures the parser to approximately comply with the [GitHub Flavored Markdown specification](https://github.github.com/gfm/). - Compared to `commonmark`, it disables HTML parsing and enables the table, strikethrough and linkify components. + Compared to `commonmark`, it enables the table, strikethrough and linkify components. **Important**, to use this configuration you must have `linkify-it-py` installed. ```{code-cell} @@ -280,7 +280,6 @@ print(node[0]) node[0].next_sibling ``` - ## Renderers +++ diff --git a/markdown_it/main.py b/markdown_it/main.py index 4579bb90..c95465e2 100644 --- a/markdown_it/main.py +++ b/markdown_it/main.py @@ -29,12 +29,11 @@ _PRESETS = { "default": presets.default.make(), + "js-default": presets.js_default.make(), "zero": presets.zero.make(), "commonmark": presets.commonmark.make(), + "gfm-like": presets.gfm_like.make(), } -_PRESETS["js-default"] = _PRESETS["default"] -_PRESETS["gfm-like"] = presets.default.make() -_PRESETS["gfm-like"]["options"]["linkify"] = True class MarkdownIt: diff --git a/markdown_it/presets/__init__.py b/markdown_it/presets/__init__.py index 43d11484..78f26934 100644 --- a/markdown_it/presets/__init__.py +++ b/markdown_it/presets/__init__.py @@ -1 +1,25 @@ from . import commonmark, default, zero # noqa: F401 + +js_default = default + + +class gfm_like: + """GitHub Flavoured Markdown (GFM) like. + + This adds the linkify, table and strikethrough components to CommonMark.
+ + Note, it lacks task-list items and raw HTML filtering, + to meet the full GFM specification + (see https://github.github.com/gfm/#autolinks-extension-). + """ + + @staticmethod + def make(): + config = commonmark.make() + config["components"]["core"]["rules"].append("linkify") + config["components"]["block"]["rules"].append("table") + config["components"]["inline"]["rules"].append("strikethrough") + config["components"]["inline"]["rules2"].append("strikethrough") + config["options"]["linkify"] = True + config["options"]["html"] = True + return config From 3b57e5b555a1967b893c70bb68fe281574faed82 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 4 Mar 2021 00:46:35 +0100 Subject: [PATCH 4/4] add review suggestion --- markdown_it/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown_it/tree.py b/markdown_it/tree.py index e531cb28..dcc46132 100644 --- a/markdown_it/tree.py +++ b/markdown_it/tree.py @@ -117,7 +117,7 @@ def type(self) -> str: - `Token.type` of the opening token, with "_open" suffix stripped, if the node represents a nester token pair """ - if not self.token and not self.nester_tokens: + if self.is_root: return "root" if self.token: return self.token.type