This repository has been archived by the owner on Aug 7, 2020. It is now read-only.

Various Updates #12

Merged · 8 commits · Mar 12, 2020
1 change: 0 additions & 1 deletion docs/conf.py
@@ -116,5 +116,4 @@ def run_apidoc(app):

def setup(app):
"""Add functions to the Sphinx setup."""
# TODO run apidoc
# app.connect("builder-inited", run_apidoc)
9 changes: 5 additions & 4 deletions docs/using/intro.md
@@ -244,8 +244,9 @@ This process is illustrated in the following example, using the lower level pars
{py:func}`~mistletoe.block_tokenizer.tokenize_main`:

```python
>> from mistletoe.block_tokenizer import tokenize_main
>> paragraph = tokenize_main(["a [text][key]\n", "\n", '[key]: link "target"\n'], expand_spans=False)[0]
>> from mistletoe.block_tokenizer import tokenize_main, SourceLines
>> lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
>> paragraph = tokenize_main(lines, expand_spans=False)[0]
>> paragraph.children
SpanContainer('a [text][key]')
```
@@ -265,8 +266,8 @@ ParseContext(block_cls=11,span_cls=9,link_defs=1,footnotes=0)

````{important}
If directly using {py:func}`~mistletoe.block_tokenizer.tokenize_main`,
you should (a) ensure all lines are terminated with `\n`, and
(b) ensure that the global context is reset (if you don't want to use previously read definitions):
you should ensure that the global context is reset,
if you don't want to use previously read definitions:

```python
>> get_parse_context(reset=True)
2 changes: 1 addition & 1 deletion mistletoe/__init__.py
@@ -2,7 +2,7 @@
Make mistletoe easier to import.
"""

__version__ = "0.10.0a2"
__version__ = "0.10.0a3"
__all__ = [
"renderers",
"base_elements",
35 changes: 31 additions & 4 deletions mistletoe/base_elements.py
@@ -1,6 +1,7 @@
from collections import namedtuple, OrderedDict
import json
from typing import List, Optional, Pattern, Tuple
import re
from typing import List, Optional, Pattern, Tuple, Union

import attr

@@ -90,6 +91,12 @@ def _get_children(_parent):

next_children = new_children

def expand_spans(self):
"""Walk through children and process any ``SpanContainer``."""
for result in self.walk(include_self=True):
if isinstance(result.node.children, SpanContainer):
result.node.children = result.node.children.expand()


class TokenEncoder(json.JSONEncoder):
"""A JSON encoder for mistletoe tokens."""
@@ -150,14 +157,34 @@ class SourceLines:
"""A class for storing source lines and tracking current line index.

:param lines: the source lines
:param start_line: the position of the lines with the full source text.
:param start_line: the position of the initial line within the full source text.
:param standardize_ends: standardize all lines to end with ``\\n``
:param metadata: any metadata associated with the lines
"""

def __init__(self, lines: List[str], start_line=0):
self.lines = lines if isinstance(lines, list) else list(lines)
line_end_pattern = re.compile(".*(\n|\r)$")

def __init__(
self,
lines: Union[str, List[str]],
start_line: int = 0,
standardize_ends: bool = False,
metadata: Optional[dict] = None,
):

if isinstance(lines, str):
lines = lines.splitlines(keepends=True)
if standardize_ends:
lines = [
"{}\n".format(l[:-1] if self.line_end_pattern.match(l) else l)
for l in lines
]

self.lines = lines
self._index = -1
self._anchor = 0
self.start_line = start_line
self.metadata = metadata or {}

@property
def lineno(self):
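A minimal sketch of the new constructor behaviour, based on the changes above (the example string mirrors the updated `docs/using/intro.md`; `metadata` defaults to an empty dict):

```python
from mistletoe.base_elements import SourceLines

# a plain string is split with str.splitlines(keepends=True);
# standardize_ends=True ensures every line ends with "\n"
lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
print(lines.lines)     # ['a [text][key]\n', '\n', '[key]: link "target"\n']
print(lines.metadata)  # {}
```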
24 changes: 9 additions & 15 deletions mistletoe/block_tokenizer.py
@@ -1,20 +1,19 @@
"""
Block-level tokenizer for mistletoe.
"""
from mistletoe.base_elements import SpanContainer, SourceLines
from mistletoe.base_elements import SourceLines
from mistletoe.parse_context import get_parse_context


def tokenize_main(
iterable,
lines: SourceLines,
token_types=None,
start_line: int = 0,
expand_spans: bool = True,
skip_tokens: list = ("LinkDefinition", "Footnote"),
):
"""Searches for token_types in an iterable.

:param iterable: list of strings (each line must end with a newline `\\n`!).
:param lines: the source lines
:param token_types: override block-level tokens set in global context
:param start_line: the source line number corresponding to `iterable[0]`
:param expand_spans: After the initial parse the span text is not yet tokenized,
@@ -26,29 +25,24 @@

:returns: list of block-level token instances.
"""
if not isinstance(lines, SourceLines):
lines = SourceLines(lines)
if token_types is None:
token_types = get_parse_context().block_tokens
tokens = tokenize_block(
iterable,
token_types=token_types,
start_line=start_line,
skip_tokens=skip_tokens,
)
tokens = tokenize_block(lines, token_types=token_types, skip_tokens=skip_tokens)
if expand_spans:
for token in tokens + list(get_parse_context().foot_definitions.values()):
for result in list(token.walk(include_self=True)):
if isinstance(result.node.children, SpanContainer):
result.node.children = result.node.children.expand()
token.expand_spans()
return tokens


def tokenize_block(
iterable, token_types=None, start_line=0, skip_tokens=("LinkDefinition", "Footnote")
lines: SourceLines, token_types=None, skip_tokens=("LinkDefinition", "Footnote")
):
"""Returns a list of parsed tokens."""
assert isinstance(lines, SourceLines), "lines must be `SourceLines` instance"
if token_types is None:
token_types = get_parse_context().block_tokens
lines = SourceLines(iterable, start_line)
parsed_tokens = ParseBuffer()
line = lines.peek()
while line is not None:
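Tying the pieces together, a sketch of the updated entry point, mirroring the `docs/using/intro.md` example above: with `expand_spans=False` span parsing is deferred, and the new `Token.expand_spans()` method processes the `SpanContainer` children in place:

```python
from mistletoe.block_tokenizer import tokenize_main, SourceLines
from mistletoe.parse_context import get_parse_context

get_parse_context(reset=True)  # drop any previously read definitions
lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
paragraph = tokenize_main(lines, expand_spans=False)[0]
print(paragraph.children)  # SpanContainer('a [text][key]')
paragraph.expand_spans()   # tokenize the deferred span text in place
```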
90 changes: 52 additions & 38 deletions mistletoe/block_tokens.py
@@ -2,7 +2,7 @@
Built-in block-level token classes.
"""
import re
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, Tuple, Union
from typing import List as ListType

import attr
@@ -17,7 +17,7 @@
normalize_label,
)
from mistletoe.parse_context import get_parse_context
from mistletoe.base_elements import Token, BlockToken, SpanContainer
from mistletoe.base_elements import Token, BlockToken, SpanContainer, SourceLines
from mistletoe.attr_doc import autodoc


@@ -58,30 +58,44 @@ class FrontMatter(BlockToken):
if `front_matter=True`, and stored on `Document.front_matter` in the syntax tree.
"""

content: str = attr.ib(
content: Union[str, dict] = attr.ib(
repr=False, metadata={"doc": "Source text (should be valid YAML)"}
)
position: Tuple[int, int] = attr.ib(
metadata={"doc": "Line position in source text (start, end)"}
)

def get_data(self) -> dict:
"""Return the de-serialized front matter data (requires pyyaml)."""
if isinstance(self.content, str):
import yaml

return yaml.safe_load(self.content) or {}
return self.content

@classmethod
def start(cls, line: str) -> bool:
# handled by Document
return False

@classmethod
def read(cls, lines):
assert lines and lines[0].startswith("---")
end_line = None
for i, line in enumerate(lines[1:]):
if line.startswith("---"):
end_line = i + 2
break
# TODO raise/report error if closing block not found
if end_line is None:
end_line = len(lines)
def read(cls, lines: SourceLines):
start_line = lines.lineno + 1

next(lines) # skip first ``---``
line_buffer = []
next_line = lines.peek()
while not (next_line is None or next_line.startswith("---")):
line_buffer.append(next(lines))
next_line = lines.peek()
if next_line is not None:
next(lines) # move past closing ``---``
else:
get_parse_context().logger.warning(
"No closing --- was found for initial metadata block."
)

return cls(content="".join(lines[1 : end_line - 1]), position=(0, end_line))
return cls(content="".join(line_buffer), position=(start_line, lines.lineno))

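A usage sketch of the streaming `FrontMatter.read`, driven through `Document.read` (see the change below); the document string is illustrative, and `get_data` assumes pyyaml is installed:

```python
from mistletoe import block_tokens

doc = block_tokens.Document.read(
    '---\ntitle: My page\n---\nSome *content*\n',
    front_matter=True,
)
print(doc.front_matter.position)    # (start, end) line position, e.g. (1, 3)
print(doc.front_matter.get_data())  # {'title': 'My page'}
```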

@autodoc
@@ -121,41 +135,35 @@ class Document(BlockToken):
@classmethod
def read(
cls,
lines,
start_line: int = 0,
lines: Union[str, ListType[str], SourceLines],
reset_definitions: bool = True,
skip_tokens: list = ("LinkDefinition", "Footnote"),
front_matter: bool = False,
):
"""Read a document

:param lines: Lines or string to parse
:param start_line: The initial line (used for nested parsing)
:param lines: Lines to parse
:param reset_definitions: remove any previously stored definitions
in the global context (see ``ParseContext.reset_definitions()``).
:param skip_tokens: do not store these ``token.name`` in the syntax tree.
These are usually tokens that store themselves in the global context.
:param front_matter: search for an initial YAML front matter block
(note this is not strictly CommonMark compliant)
"""
if isinstance(lines, str):
lines = lines.splitlines(keepends=True)
lines = [line if line.endswith("\n") else "{}\n".format(line) for line in lines]
if reset_definitions:
get_parse_context().reset_definitions()

if not isinstance(lines, SourceLines):
lines = SourceLines(lines, standardize_ends=True)

# TODO can we do this in a way where we are checking
# FrontMatter in get_parse_context().block_tokens?
# then it would be easier to add/remove it in the renderers
front_matter_token = None
if front_matter and lines and lines[0].startswith("---"):
if front_matter and lines.peek() and lines.peek().startswith("---"):
front_matter_token = FrontMatter.read(lines)
start_line += front_matter_token.position[1]
lines = lines[front_matter_token.position[1] :]

children = tokenizer.tokenize_main(
lines, start_line=start_line, skip_tokens=skip_tokens
)
children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
foot_defs = get_parse_context().foot_definitions
return cls(
children=children,
@@ -305,7 +313,9 @@ def read(cls, lines):
# in quotes can be recognized before span-level tokenizing.
Paragraph.parse_setext = False
try:
child_tokens = tokenizer.tokenize_block(line_buffer, start_line=start_line)
child_tokens = tokenizer.tokenize_block(
SourceLines(line_buffer, start_line=start_line)
)
finally:
Paragraph.parse_setext = True
return cls(children=child_tokens, position=(start_line, lines.lineno))
@@ -688,7 +698,7 @@ def read(cls, lines, prev_marker=None):
next_line = lines.peek()
if empty_first_line and next_line is not None and next_line.strip() == "":
child_tokens = tokenizer.tokenize_block(
[next(lines)], start_line=lines.lineno
SourceLines([next(lines)], start_line=lines.lineno)
)
next_line = lines.peek()
if next_line is not None:
@@ -743,7 +753,9 @@ def read(cls, lines, prev_marker=None):
newline = newline + 1 if next_line.strip() == "" else 0
next_line = lines.peek()

child_tokens = tokenizer.tokenize_block(line_buffer, start_line=start_line)
child_tokens = tokenizer.tokenize_block(
SourceLines(line_buffer, start_line=start_line)
)

return cls(
children=child_tokens,
@@ -794,12 +806,9 @@ def read(cls, lines):
break
offset, match = match_info
matches.append(match)
cls.append_link_definitions(matches)
return (
cls(position=(start_line, lines.lineno), definitions=matches)
if matches
else None
)
position = (start_line, lines.lineno)
cls.append_link_definitions(matches, position)
return cls(position=position, definitions=matches) if matches else None

@classmethod
def match_reference(cls, lines, string, offset):
@@ -922,15 +931,20 @@ def match_link_title(cls, string, offset):
return None

@staticmethod
def append_link_definitions(matches):
def append_link_definitions(matches, position):
for key, dest, title in matches:
key = normalize_label(key)
dest = span_tokens.EscapeSequence.strip(dest.strip())
title = span_tokens.EscapeSequence.strip(title)
link_definitions = get_parse_context().link_definitions
if key not in link_definitions:
# TODO store/emit warning if duplicate
link_definitions[key] = dest, title
else:
get_parse_context().logger.warning(
"ignoring duplicate link definition '{}' at: {}".format(
key, position
)
)

@staticmethod
def backtrack(lines, string, offset):
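A small sketch of the new duplicate handling: the second definition of `key` is ignored, and the warning added above is emitted via `get_parse_context().logger`:

```python
from mistletoe import block_tokens

# logs: ignoring duplicate link definition 'key' at: <position>
block_tokens.Document.read(
    ['[key]: https://first.example\n', '[key]: https://second.example\n']
)
```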
7 changes: 6 additions & 1 deletion mistletoe/block_tokens_ext.py
@@ -67,8 +67,13 @@ def read(cls, lines: SourceLines):
position=(start_line, lines.lineno),
)
if target not in get_parse_context().foot_definitions:
# TODO store/emit warning if duplicate
get_parse_context().foot_definitions[target] = token
else:
get_parse_context().logger.warning(
"ignoring duplicate footnote definition '{}' at: {}".format(
target, token.position
)
)
return token


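And similarly for footnote definitions (a sketch assuming the `[^label]: content` syntax handled by `block_tokens_ext.Footnote` and enabled in the default parse context):

```python
from mistletoe import block_tokens

# logs: ignoring duplicate footnote definition 'note' at: <position>
block_tokens.Document.read(['[^note]: first\n', '\n', '[^note]: second\n'])
```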