This repository has been archived by the owner on Aug 7, 2020. It is now read-only.

Various Updates #12

Merged · 8 commits · Mar 12, 2020
1 change: 0 additions & 1 deletion docs/conf.py
@@ -116,5 +116,4 @@ def run_apidoc(app):

def setup(app):
"""Add functions to the Sphinx setup."""
# TODO run apidoc
# app.connect("builder-inited", run_apidoc)
9 changes: 5 additions & 4 deletions docs/using/intro.md
@@ -244,8 +244,9 @@ This process is illustrated in the following example, using the lower level pars
{py:func}`~mistletoe.block_tokenizer.tokenize_main`:

```python
>> from mistletoe.block_tokenizer import tokenize_main
>> paragraph = tokenize_main(["a [text][key]\n", "\n", '[key]: link "target"\n'], expand_spans=False)[0]
>> from mistletoe.block_tokenizer import tokenize_main, SourceLines
>> lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
>> paragraph = tokenize_main(lines, expand_spans=False)[0]
>> paragraph.children
SpanContainer('a [text][key]')
```
@@ -265,8 +266,8 @@ ParseContext(block_cls=11,span_cls=9,link_defs=1,footnotes=0)

````{important}
If directly using {py:func}`~mistletoe.block_tokenizer.tokenize_main`,
you should (a) ensure all lines are terminated with `\n`, and
(b) ensure that the global context is reset (if you don't want to use previously read definitions):
you should ensure that the global context is reset,
if you don't want to use previously read definitions:

```python
>> get_parse_context(reset=True)
2 changes: 1 addition & 1 deletion mistletoe/__init__.py
@@ -2,7 +2,7 @@
Make mistletoe easier to import.
"""

__version__ = "0.10.0a2"
__version__ = "0.10.0a3"
__all__ = [
"renderers",
"base_elements",
35 changes: 31 additions & 4 deletions mistletoe/base_elements.py
@@ -1,6 +1,7 @@
from collections import namedtuple, OrderedDict
import json
from typing import List, Optional, Pattern, Tuple
import re
from typing import List, Optional, Pattern, Tuple, Union

import attr

@@ -90,6 +91,12 @@ def _get_children(_parent):

next_children = new_children

def expand_spans(self):
"""Walk through children and process any ``SpanContainer``."""
for result in self.walk(include_self=True):
if isinstance(result.node.children, SpanContainer):
result.node.children = result.node.children.expand()


class TokenEncoder(json.JSONEncoder):
"""A JSON encoder for mistletoe tokens."""
@@ -150,14 +157,34 @@ class SourceLines:
"""A class for storing source lines and tracking current line index.

:param lines: the source lines
:param start_line: the position of the lines with the full source text.
:param start_line: the position of the initial line within the full source text.
:param standardize_ends: standardize all lines to end with ``\\n``
:param metadata: any metadata associated with the lines
"""

def __init__(self, lines: List[str], start_line=0):
self.lines = lines if isinstance(lines, list) else list(lines)
line_end_pattern = re.compile(".*(\n|\r)$")

def __init__(
self,
lines: Union[str, List[str]],
start_line: int = 0,
standardize_ends: bool = False,
metadata: Optional[dict] = None,
):

if isinstance(lines, str):
lines = lines.splitlines(keepends=True)
if standardize_ends:
lines = [
"{}\n".format(l[:-1] if self.line_end_pattern.match(l) else l)
for l in lines
]

self.lines = lines
self._index = -1
self._anchor = 0
self.start_line = start_line
self.metadata = metadata or {}

@property
def lineno(self):
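A minimal sketch of the new constructor behaviour, based on the changes above (the example string mirrors the updated `docs/using/intro.md`; `metadata` defaults to an empty dict):

```python
from mistletoe.base_elements import SourceLines

# a plain string is split with str.splitlines(keepends=True);
# standardize_ends=True ensures every line ends with "\n"
lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
print(lines.lines)     # ['a [text][key]\n', '\n', '[key]: link "target"\n']
print(lines.metadata)  # {}
```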
24 changes: 9 additions & 15 deletions mistletoe/block_tokenizer.py
@@ -1,20 +1,19 @@
"""
Block-level tokenizer for mistletoe.
"""
from mistletoe.base_elements import SpanContainer, SourceLines
from mistletoe.base_elements import SourceLines
from mistletoe.parse_context import get_parse_context


def tokenize_main(
iterable,
lines: SourceLines,
token_types=None,
start_line: int = 0,
expand_spans: bool = True,
skip_tokens: list = ("LinkDefinition", "Footnote"),
):
"""Searches for token_types in an iterable.

:param iterable: list of strings (each line must end with a newline `\\n`!).
:param lines: the source lines
:param token_types: override block-level tokens set in global context
:param start_line: the source line number corresponding to `iterable[0]`
:param expand_spans: After the initial parse the span text is not yet tokenized,
@@ -26,29 +25,24 @@

:returns: list of block-level token instances.
"""
if not isinstance(lines, SourceLines):
lines = SourceLines(lines)
if token_types is None:
token_types = get_parse_context().block_tokens
tokens = tokenize_block(
iterable,
token_types=token_types,
start_line=start_line,
skip_tokens=skip_tokens,
)
tokens = tokenize_block(lines, token_types=token_types, skip_tokens=skip_tokens)
if expand_spans:
for token in tokens + list(get_parse_context().foot_definitions.values()):
for result in list(token.walk(include_self=True)):
if isinstance(result.node.children, SpanContainer):
result.node.children = result.node.children.expand()
token.expand_spans()
return tokens


def tokenize_block(
iterable, token_types=None, start_line=0, skip_tokens=("LinkDefinition", "Footnote")
lines: SourceLines, token_types=None, skip_tokens=("LinkDefinition", "Footnote")
):
"""Returns a list of parsed tokens."""
assert isinstance(lines, SourceLines), "lines must be `SourceLines` instance"
if token_types is None:
token_types = get_parse_context().block_tokens
lines = SourceLines(iterable, start_line)
parsed_tokens = ParseBuffer()
line = lines.peek()
while line is not None:
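Tying the pieces together, a sketch of the updated entry point, mirroring the `docs/using/intro.md` example above: with `expand_spans=False` span parsing is deferred, and the new `Token.expand_spans()` method processes the `SpanContainer` children in place:

```python
from mistletoe.block_tokenizer import tokenize_main, SourceLines
from mistletoe.parse_context import get_parse_context

get_parse_context(reset=True)  # drop any previously read definitions
lines = SourceLines('a [text][key]\n\n[key]: link "target"', standardize_ends=True)
paragraph = tokenize_main(lines, expand_spans=False)[0]
print(paragraph.children)  # SpanContainer('a [text][key]')
paragraph.expand_spans()   # tokenize the deferred span text in place
```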
90 changes: 52 additions & 38 deletions mistletoe/block_tokens.py
@@ -2,7 +2,7 @@
Built-in block-level token classes.
"""
import re
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, Tuple, Union
from typing import List as ListType

import attr
@@ -17,7 +17,7 @@
normalize_label,
)
from mistletoe.parse_context import get_parse_context
from mistletoe.base_elements import Token, BlockToken, SpanContainer
from mistletoe.base_elements import Token, BlockToken, SpanContainer, SourceLines
from mistletoe.attr_doc import autodoc


@@ -58,30 +58,44 @@ class FrontMatter(BlockToken):
if `front_matter=True`, and stored on `Document.front_matter` in the syntax tree.
"""

content: str = attr.ib(
content: Union[str, dict] = attr.ib(
repr=False, metadata={"doc": "Source text (should be valid YAML)"}
)
position: Tuple[int, int] = attr.ib(
metadata={"doc": "Line position in source text (start, end)"}
)

def get_data(self) -> dict:
"""Return the de-serialized front matter data (requires pyyaml)."""
if isinstance(self.content, str):
import yaml

return yaml.safe_load(self.content) or {}
return self.content

@classmethod
def start(cls, line: str) -> bool:
# handled by Document
return False

@classmethod
def read(cls, lines):
assert lines and lines[0].startswith("---")
end_line = None
for i, line in enumerate(lines[1:]):
if line.startswith("---"):
end_line = i + 2
break
# TODO raise/report error if closing block not found
if end_line is None:
end_line = len(lines)
def read(cls, lines: SourceLines):
start_line = lines.lineno + 1

next(lines) # skip first ``---``
line_buffer = []
next_line = lines.peek()
while not (next_line is None or next_line.startswith("---")):
line_buffer.append(next(lines))
next_line = lines.peek()
if next_line is not None:
next(lines) # move past closing ``---``
else:
get_parse_context().logger.warning(
"No closing --- was found for initial metadata block."
)

return cls(content="".join(lines[1 : end_line - 1]), position=(0, end_line))
return cls(content="".join(line_buffer), position=(start_line, lines.lineno))

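A usage sketch of the streaming `FrontMatter.read`, driven through `Document.read` (see the change below); the document string is illustrative, and `get_data` assumes pyyaml is installed:

```python
from mistletoe import block_tokens

doc = block_tokens.Document.read(
    '---\ntitle: My page\n---\nSome *content*\n',
    front_matter=True,
)
print(doc.front_matter.position)    # (start, end) line position, e.g. (1, 3)
print(doc.front_matter.get_data())  # {'title': 'My page'}
```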

@autodoc
@@ -121,41 +135,35 @@ class Document(BlockToken):
@classmethod
def read(
cls,
lines,
start_line: int = 0,
lines: Union[str, ListType[str], SourceLines],
reset_definitions: bool = True,
skip_tokens: list = ("LinkDefinition", "Footnote"),
front_matter: bool = False,
):
"""Read a document

:param lines: Lines or string to parse
:param start_line: The initial line (used for nested parsing)
:param lines: Lines to parse
:param reset_definitions: remove any previously stored definitions
in the global context (see ``ParseContext.reset_definitions()``).
:param skip_tokens: do not store these ``token.name`` in the syntax tree.
These are usually tokens that store themselves in the global context.
:param front_matter: search for an initial YAML front matter block
(note this is not strictly CommonMark compliant)
"""
if isinstance(lines, str):
lines = lines.splitlines(keepends=True)
lines = [line if line.endswith("\n") else "{}\n".format(line) for line in lines]
if reset_definitions:
get_parse_context().reset_definitions()

if not isinstance(lines, SourceLines):
lines = SourceLines(lines, standardize_ends=True)

# TODO can we do this in a way where we are checking
# FrontMatter in get_parse_context().block_tokens?
# then it would be easier to add/remove it in the renderers
front_matter_token = None
if front_matter and lines and lines[0].startswith("---"):
if front_matter and lines.peek() and lines.peek().startswith("---"):
front_matter_token = FrontMatter.read(lines)
start_line += front_matter_token.position[1]
lines = lines[front_matter_token.position[1] :]

children = tokenizer.tokenize_main(
lines, start_line=start_line, skip_tokens=skip_tokens
)
children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
foot_defs = get_parse_context().foot_definitions
return cls(
children=children,
@@ -305,7 +313,9 @@ def read(cls, lines):
# in quotes can be recognized before span-level tokenizing.
Paragraph.parse_setext = False
try:
child_tokens = tokenizer.tokenize_block(line_buffer, start_line=start_line)
child_tokens = tokenizer.tokenize_block(
SourceLines(line_buffer, start_line=start_line)
)
finally:
Paragraph.parse_setext = True
return cls(children=child_tokens, position=(start_line, lines.lineno))
@@ -688,7 +698,7 @@ def read(cls, lines, prev_marker=None):
next_line = lines.peek()
if empty_first_line and next_line is not None and next_line.strip() == "":
child_tokens = tokenizer.tokenize_block(
[next(lines)], start_line=lines.lineno
SourceLines([next(lines)], start_line=lines.lineno)
)
next_line = lines.peek()
if next_line is not None:
@@ -743,7 +753,9 @@ def read(cls, lines, prev_marker=None):
newline = newline + 1 if next_line.strip() == "" else 0
next_line = lines.peek()

child_tokens = tokenizer.tokenize_block(line_buffer, start_line=start_line)
child_tokens = tokenizer.tokenize_block(
SourceLines(line_buffer, start_line=start_line)
)

return cls(
children=child_tokens,
@@ -794,12 +806,9 @@ def read(cls, lines):
break
offset, match = match_info
matches.append(match)
cls.append_link_definitions(matches)
return (
cls(position=(start_line, lines.lineno), definitions=matches)
if matches
else None
)
position = (start_line, lines.lineno)
cls.append_link_definitions(matches, position)
return cls(position=position, definitions=matches) if matches else None

@classmethod
def match_reference(cls, lines, string, offset):
@@ -922,15 +931,20 @@ def match_link_title(cls, string, offset):
return None

@staticmethod
def append_link_definitions(matches):
def append_link_definitions(matches, position):
for key, dest, title in matches:
key = normalize_label(key)
dest = span_tokens.EscapeSequence.strip(dest.strip())
title = span_tokens.EscapeSequence.strip(title)
link_definitions = get_parse_context().link_definitions
if key not in link_definitions:
# TODO store/emit warning if duplicate
link_definitions[key] = dest, title
else:
get_parse_context().logger.warning(
"ignoring duplicate link definition '{}' at: {}".format(
key, position
)
)

@staticmethod
def backtrack(lines, string, offset):
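A small sketch of the new duplicate handling: the second definition of `key` is ignored, and the warning added above is emitted via `get_parse_context().logger`:

```python
from mistletoe import block_tokens

# logs: ignoring duplicate link definition 'key' at: <position>
block_tokens.Document.read(
    ['[key]: https://first.example\n', '[key]: https://second.example\n']
)
```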
7 changes: 6 additions & 1 deletion mistletoe/block_tokens_ext.py
@@ -67,8 +67,13 @@ def read(cls, lines: SourceLines):
position=(start_line, lines.lineno),
)
if target not in get_parse_context().foot_definitions:
# TODO store/emit warning if duplicate
get_parse_context().foot_definitions[target] = token
else:
get_parse_context().logger.warning(
"ignoring duplicate footnote definition '{}' at: {}".format(
target, token.position
)
)
return token


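And similarly for footnote definitions (a sketch assuming the `[^label]: content` syntax handled by `block_tokens_ext.Footnote` and enabled in the default parse context):

```python
from mistletoe import block_tokens

# logs: ignoring duplicate footnote definition 'note' at: <position>
block_tokens.Document.read(['[^note]: first\n', '\n', '[^note]: second\n'])
```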