Re-work markdown fenced codeblock lexing
This primarily makes lexing of nested fenced code blocks work, but it also adds support for tilde-delimited blocks and corrects the end offset for blocks.
cjw296 committed Mar 18, 2024
1 parent d627202 commit 813e5a0
Showing 9 changed files with 407 additions and 52 deletions.
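For orientation, here is a minimal sketch (not part of the commit) of the behaviour this change enables, driving the new lexer from the diff below directly; the `Document(text, path)` constructor is assumed from Sybil's public API:

from sybil import Document
from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer

# A four-tilde fence containing a shorter three-tilde fence.
text = (
    "~~~~\n"
    "~~~\n"
    "aaa\n"
    "~~~\n"
    "~~~~\n"
)
document = Document(text, 'nested.md')  # assumed constructor signature

lexer = RawFencedCodeBlockLexer()
regions = list(lexer(document))

# The inner three-tilde block is lexed first; the outer four-tilde block
# is then lexed as a whole, with the inner fence preserved verbatim.
assert [r.lexemes['source'] for r in regions] == [
    'aaa\n',
    '~~~\naaa\n~~~\n',
]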
2 changes: 2 additions & 0 deletions docs/api.rst
@@ -90,6 +90,8 @@ ReST Parsing and Lexing
Markdown Parsing and Lexing
---------------------------

.. autoclass:: sybil.parsers.markdown.lexers.RawFencedCodeBlockLexer

.. autoclass:: sybil.parsers.markdown.lexers.FencedCodeBlockLexer

.. autoclass:: sybil.parsers.markdown.lexers.DirectiveInHTMLCommentLexer
108 changes: 97 additions & 11 deletions sybil/parsers/markdown/lexers.py
@@ -1,15 +1,106 @@
import re
from typing import Optional, Dict
import textwrap
from typing import Optional, Dict, Pattern, Iterable, Match, List

from sybil import Document, Region, Lexeme
from sybil.parsers.abstract.lexers import BlockLexer

CODEBLOCK_START_TEMPLATE = r"^(?P<prefix>[ \t]*)```(?P<language>{language})$\n"
CODEBLOCK_END_TEMPLATE = r"(?<=\n){prefix}```(:?\n|\Z)"
FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)


class FencedCodeBlockLexer(BlockLexer):
class RawFencedCodeBlockLexer:
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for Markdown fenced code blocks.
A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks allowing flexible lexing
of the whole `info` line along with more complicated prefixes.

The following lexemes are extracted:

- ``source`` as a :class:`~sybil.Lexeme`.
- any other named groups specified in ``info_pattern`` as :class:`strings <str>`.

:param info_pattern:
a :class:`re.Pattern` to match the `info` line and any required prefix that follows it.

:param mapping:
If provided, this is used to rename lexemes from the keys in the mapping to their
values. Only mapped lexemes will be returned in any :class:`~sybil.Region` objects.
"""


def __init__(
self,
info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
mapping: Optional[Dict[str, str]] = None,
) -> None:
self.info_pattern = info_pattern
self.mapping = mapping

@staticmethod
def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
current_fence = current.group('fence')
existing_fence = existing.group('fence')
same_type = current_fence[0] == existing_fence[0]
okay_length = len(current_fence) >= len(existing_fence)
same_prefix = len(current.group('prefix')) == len(existing.group('prefix'))
return same_type and okay_length and same_prefix

def make_region(
self, opening: Match[str], document: Document, closing: Optional[Match[str]]
) -> Optional[Region]:
if closing is None:
content_end = region_end = len(document.text)
else:
content_end = closing.start()
region_end = closing.end()
content = document.text[opening.end(): content_end]
info = self.info_pattern.match(content)
if info is None:
return None
lexemes = info.groupdict()
lines = content[info.end():].splitlines(keepends=True)
stripped = ''.join(line[len(opening.group('prefix')):] for line in lines)
lexemes['source'] = Lexeme(
textwrap.dedent(stripped),
offset=len(opening.group(0))+info.end(),
line_offset=0,
)
if self.mapping:
lexemes = {dest: lexemes[source] for source, dest in self.mapping.items()}
return Region(opening.start(), region_end, lexemes=lexemes)

def __call__(self, document: Document) -> Iterable[Region]:
open_blocks: List[Match[str]] = []
index = 0
while True:
match = FENCE.search(document.text, index)
if match is None:
break
else:
index = match.end()
# does this fence close any open block?
for i in range(len(open_blocks)):
existing = open_blocks[i]
if self.match_closes_existing(match, existing):
maybe_region = self.make_region(existing, document, match)
if maybe_region is not None:
yield maybe_region
open_blocks = open_blocks[:i]
break
else:
open_blocks.append(match)
if open_blocks:
maybe_region = self.make_region(open_blocks[0], document, closing=None)
if maybe_region is not None:
yield maybe_region


class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks where a language is specified.
:class:`RawFencedCodeBlockLexer` can be used if the whole `info` line, or a more complicated
prefix, is required.
The following lexemes are extracted:
@@ -28,14 +119,9 @@ class FencedCodeBlockLexer(BlockLexer):

def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
super().__init__(
start_pattern=re.compile(CODEBLOCK_START_TEMPLATE.format(language=language)),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
mapping=mapping,
)
self.start_pattern = re.compile(
CODEBLOCK_START_TEMPLATE.format(language=language),
re.MULTILINE
)


DIRECTIVE_IN_HTML_COMMENT_START = (
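A rough usage sketch of the reworked FencedCodeBlockLexer (not part of the commit; it again assumes the Document(text, path) constructor):

from sybil import Document
from sybil.parsers.markdown.lexers import FencedCodeBlockLexer

text = (
    "```python\n"
    "print(1 + 1)\n"
    "```\n"
)
document = Document(text, 'example.md')  # assumed constructor signature

lexer = FencedCodeBlockLexer(language='python')
(region,) = lexer(document)

# The language captured from the info line and the dedented body are both
# available as lexemes; a mapping could rename or drop either of them.
assert region.lexemes['language'] == 'python'
assert region.lexemes['source'] == 'print(1 + 1)\n'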
21 changes: 10 additions & 11 deletions sybil/parsers/myst/lexers.py
@@ -3,13 +3,13 @@

from sybil import Document, Region
from sybil.parsers.abstract.lexers import BlockLexer
from sybil.parsers.markdown.lexers import CODEBLOCK_END_TEMPLATE
from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.parsers.rest.lexers import parse_options_and_source

DIRECTIVE_START_TEMPLATE = (
r"^(?P<prefix>[ \t]*)```\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n"
r'(?P<options>(?:\1[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"(\1---\n(?P<yaml_options>(?:.+\n)*)\1---\n)?"
INFO_PATTERN = (
r'\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n'
r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?"
)


@@ -23,9 +23,9 @@ def parse_yaml_options(lexed: Region) -> None:
lexemes['options'].update(options)


class DirectiveLexer(BlockLexer):
class DirectiveLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for MyST directives such as:
A :class:`~sybil.typing.Lexer` for MyST directives such as:
.. code-block:: markdown
@@ -60,11 +60,10 @@ def __init__(
self, directive: str, arguments: str = '.*', mapping: Optional[Dict[str, str]] = None
) -> None:
super().__init__(
start_pattern=re.compile(
DIRECTIVE_START_TEMPLATE.format(directive=directive, arguments=arguments),
re.MULTILINE
info_pattern=re.compile(
INFO_PATTERN.format(directive=directive, arguments=arguments),
re.MULTILINE,
),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
mapping=mapping,
)

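To make the shape of the new INFO_PATTERN concrete, here is a standalone sketch of what it matches against the text that follows an opening fence (the code-block directive and :lineno-start: option are illustrative only, not taken from the commit):

import re

# Mirrors INFO_PATTERN above, formatted for a 'code-block' directive
# with free-form arguments.
info_pattern = re.compile(
    r'\{(?P<directive>code-block)} ?(?P<arguments>.*)$\n'
    r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
    r'([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?',
    re.MULTILINE,
)

# Everything after the opening ``` fence characters, as the lexer sees it.
info = info_pattern.match(
    '{code-block} python\n'
    ':lineno-start: 10\n'
    'print("hi")\n'
)
assert info is not None
assert info.group('directive') == 'code-block'
assert info.group('arguments') == 'python'
assert info.group('options') == ':lineno-start: 10\n'
# The source handed on for parsing starts where the info match ends:
assert info.string[info.end():] == 'print("hi")\n'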
55 changes: 55 additions & 0 deletions tests/samples/markdown-fenced-code-block.md
@@ -0,0 +1,55 @@
backticks:

```
<
>
```

tildes:
~~~
<
>
~~~

Fewer than three backticks is not enough:
``
foo
``


The closing code fence must use the same character as the opening fence:


```
aaa
~~~
```


The closing code fence must be at least as long as the opening fence:

````
aaa
```
``````

Nested:

~~~~
~~~
aaa
~~~
~~~~


Can't mix chars:

~`~
foo
~`~


This one gets closed by the end of document:
```
some stuff here
~~~
53 changes: 53 additions & 0 deletions tests/samples/myst-complicated-nesting.md
@@ -0,0 +1,53 @@
# {py:mod}`bytewax.connectors.demo`

```{py:module} bytewax.connectors.demo
```

```{autodoc2-docstring} bytewax.connectors.demo
:parser: myst
:allowtitles:
```

## Data

````{py:data} X
:canonical: bytewax.connectors.demo.X
:type: typing.TypeVar
```{autodoc2-docstring} bytewax.connectors.demo.X
:parser: myst
```
````


## Classes

`````{py:class} RandomMetricSource(metric_name: str, interval: datetime.timedelta = timedelta(seconds=0.7), count: int = sys.maxsize, next_random: typing.Callable[[], float] = lambda: random.randrange(0, 10))
:canonical: bytewax.connectors.demo.RandomMetricSource
:Bases:
- {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], `{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource
:parser: myst
```
```{rubric} Initialization
```
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource.__init__
:parser: myst
```
````{py:method} list_parts() -> typing.List[str]
:canonical: bytewax.connectors.demo.RandomMetricSource.list_parts
````
````{py:method} build_part(now: datetime.datetime, for_part: str, resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])
:canonical: bytewax.connectors.demo.RandomMetricSource.build_part
````
`````
13 changes: 13 additions & 0 deletions tests/samples/myst-directive-nested.md
@@ -0,0 +1,13 @@
````{note}
The warning block will be properly-parsed
```{warning}
Here's my warning
```
But the next block will be parsed as raw text
```{warning}
Here's my raw text warning that isn't parsed...
```
````
18 changes: 18 additions & 0 deletions tests/test_markdown_lexers.py
@@ -0,0 +1,18 @@
from testfixtures import compare

from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.region import Region
from .helpers import region_details, check_lexed_regions


def test_fenced_code_block():
lexer = RawFencedCodeBlockLexer()
check_lexed_regions('markdown-fenced-code-block.md', lexer, expected = [
Region(12, 24, lexemes={'source': '<\n >\n'}),
Region(34, 46, lexemes={'source': '<\n >\n'}),
Region(177, 192, lexemes={'source': 'aaa\n~~~\n'}),
Region(266, 285, lexemes={'source': 'aaa\n```\n'}),
Region(301, 312, lexemes={'source': 'aaa\n'}),
Region(296, 317, lexemes={'source': '~~~\naaa\n~~~\n'}),
Region(397, 421, lexemes={'source': 'some stuff here\n~~~\n'}),
])
8 changes: 6 additions & 2 deletions tests/test_myst_codeblock.py
@@ -3,8 +3,7 @@
import pytest
from testfixtures import compare

from sybil import Example, Region
from sybil.evaluators.python import PythonEvaluator
from sybil import Example
from sybil.parsers.myst import PythonCodeBlockParser, CodeBlockParser
from .helpers import check_excinfo, parse

@@ -35,6 +34,11 @@ def test_basic():
assert '__builtins__' not in namespace


def test_complicated_nesting():
# This has no code blocks, but should still parse fine:
parse('myst-complicated-nesting.md', PythonCodeBlockParser(), expected=0)


def test_doctest_at_end_of_fenced_codeblock():
examples, namespace = parse('myst-codeblock-doctests-end-of-fenced-codeblocks.md',
PythonCodeBlockParser(), expected=2)