# Tangling [Markdown] to [Python]

The `tangle` step is the keystone of `pidgy`: it defines the
heuristics that translate [Markdown] to [Python] and execute
blocks of narrative as interactive code and as entire programs.
A key constraint in the translation is a line-for-line mapping
between representations; with this we benefit from reusable
tracebacks for [Markdown] source.

There are many ways to translate [Markdown] to other formats, notably with tools
like `"pandoc"`.  Those formats are document formatting languages, not programs.
The [Markdown] to [Python] translation adds a computable dimension to the document.
`pidgy` is one implementation, and it should be possible to apply different heuristics to target other
programming languages.

In [1]:
    
    import IPython, typing as τ, mistune as markdown, IPython, importnb as _import_, textwrap, ast, doctest, typing, re, dataclasses
    # Only when this document runs as `__main__`: import `pidgy` and keep a
    # handle on whatever `IPython.get_ipython()` returns as `shell`.
    if __name__ == '__main__':
        import pidgy
        shell = IPython.get_ipython()

<!--
    
    import IPython, typing as τ, mistune as markdown, IPython, importnb as _import_, textwrap, ast, doctest, typing, re, dataclasses
    # Only when this document runs as `__main__`: import `pidgy` and keep a
    # handle on whatever `IPython.get_ipython()` returns as `shell`.
    if __name__ == '__main__':
        import pidgy
        shell = IPython.get_ipython()

-->

The `pidgyTransformer` manages the high level API the `IPython.InteractiveShell` interacts with for `pidgy`.
The `IPython.core.inputtransformer2.TransformerManager` is a configurable class for modifying
input source before it passes to the compiler.  It is the object that introduces `IPython`'s line
and cell magics.

    >>> assert isinstance(shell.input_transformer_manager, IPython.core.inputtransformer2.TransformerManager)
    
This configurable class has three different flavors of transformations.

* `shell.input_transformer_manager.cleanup_transforms`
* `shell.input_transformer_manager.line_transforms`
* `shell.input_transformer_manager.token_transformers`

In [2]:
    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager):
        """Transformer manager that runs a cell through a `Tokenizer` before the
        inherited `transform_cell` sees it."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # each manager instance owns its own tokenizer
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse `cell` into block tokens and untokenize them back into one source string."""
            tokens = self.tokenizer.parse(''.join(cell))
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell):
            """Apply `pidgy_transform` first, then delegate to the parent `transform_cell`."""
            tangled = self.pidgy_transform(cell)
            return super().transform_cell(tangled)

        def pidgy_magic(self, *text):
            """Join `text`, transform it, and return the result wrapped in `IPython.display.Code`."""
            tangled = self.pidgy_transform(''.join(text))
            return IPython.display.Code(tangled, language='python')

    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager):
        """Transformer manager that runs a cell through a `Tokenizer` before the
        inherited `transform_cell` sees it."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # each manager instance owns its own tokenizer
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse `cell` into block tokens and untokenize them back into one source string."""
            tokens = self.tokenizer.parse(''.join(cell))
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell):
            """Apply `pidgy_transform` first, then delegate to the parent `transform_cell`."""
            tangled = self.pidgy_transform(cell)
            return super().transform_cell(tangled)

        def pidgy_magic(self, *text):
            """Join `text`, transform it, and return the result wrapped in `IPython.display.Code`."""
            tangled = self.pidgy_transform(''.join(text))
            return IPython.display.Code(tangled, language='python')

## Block level lexical analysis.

Translating [Markdown] to [Python] relies only on block level objects in the [Markdown]
grammar.  The `BlockLexer` is a modified analyzer that adds logic to include `doctest`
blocks in the grammar.

In [3]:
    class BlockLexer(markdown.BlockLexer):
        """Block-level lexer extended with a `doctest` rule and an optional normalization pass."""

        class grammar_class(markdown.BlockGrammar):
            # doctest examples are matched with the standard library's own example pattern
            doctest = doctest.DocTestParser._EXAMPLE_RE
            # runs of 4-space indented lines that do not begin with a `>>>` prompt
            block_code = re.compile(r'^((?!\s+>>>\s) {4}[^\n]+\n*)+')
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # nesting counter, adjusted by `__enter__` / `__exit__` around each `parse` call
        depth = 0

        def __enter__(self):
            self.depth += 1

        def __exit__(self, *e):
            self.depth -= 1

        def parse_doctest(self, m):
            """Record the whole doctest match as a paragraph token."""
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_fences(self, m):
            """Keep a fence with a non-empty group 2 as a paragraph; otherwise defer to the parent.

            NOTE(review): group 2 is presumably the fence's language tag in the
            parent grammar -- confirm against the installed mistune version.
            """
            if not m.group(2):
                super().parse_fences(m)
                return
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_hrule(self, m):
            """Emit an `hrule` token that also carries the matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def parse_def_links(self, m):
            """Let the parent handle the link definition, then append a `def_link` token with its text."""
            super().parse_def_links(m)
            self.tokens.append({'type': 'def_link', 'text': m.group(0)})

        def parse(self, text: str, default_rules=None, normalize=True) -> typing.List[dict]:
            """Lex the whitened `text`; at the outermost call optionally normalize the tokens.

            `self.normalize` is not defined on this class; a subclass has to supply it
            when `normalize` is true.
            """
            if not self.depth:
                # a fresh token list is started only for the outermost call
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            if normalize and not self.depth:
                tokens = self.normalize(text, tokens)
            return tokens


    class BlockLexer(markdown.BlockLexer):
        """Block-level lexer extended with a `doctest` rule and an optional normalization pass."""

        class grammar_class(markdown.BlockGrammar):
            # doctest examples are matched with the standard library's own example pattern
            doctest = doctest.DocTestParser._EXAMPLE_RE
            # runs of 4-space indented lines that do not begin with a `>>>` prompt
            block_code = re.compile(r'^((?!\s+>>>\s) {4}[^\n]+\n*)+')
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # nesting counter, adjusted by `__enter__` / `__exit__` around each `parse` call
        depth = 0

        def __enter__(self):
            self.depth += 1

        def __exit__(self, *e):
            self.depth -= 1

        def parse_doctest(self, m):
            """Record the whole doctest match as a paragraph token."""
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_fences(self, m):
            """Keep a fence with a non-empty group 2 as a paragraph; otherwise defer to the parent.

            NOTE(review): group 2 is presumably the fence's language tag in the
            parent grammar -- confirm against the installed mistune version.
            """
            if not m.group(2):
                super().parse_fences(m)
                return
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_hrule(self, m):
            """Emit an `hrule` token that also carries the matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def parse_def_links(self, m):
            """Let the parent handle the link definition, then append a `def_link` token with its text."""
            super().parse_def_links(m)
            self.tokens.append({'type': 'def_link', 'text': m.group(0)})

        def parse(self, text: str, default_rules=None, normalize=True) -> typing.List[dict]:
            """Lex the whitened `text`; at the outermost call optionally normalize the tokens.

            `self.normalize` is not defined on this class; a subclass has to supply it
            when `normalize` is true.
            """
            if not self.depth:
                # a fresh token list is started only for the outermost call
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            if normalize and not self.depth:
                tokens = self.normalize(text, tokens)
            return tokens

The `doctest` token is identified before the block code.

In [4]:
    
    # Register `doctest` immediately before `block_code` in each rule list and
    # drop `block_html`.  The insert is guarded so that running this cell more
    # than once does not stack duplicate `doctest` entries.
    for x in "default_rules footnote_rules list_rules".split():
        # rebind the attribute to a new list before mutating it
        setattr(BlockLexer, x, list(getattr(BlockLexer, x)))
        if 'doctest' not in getattr(BlockLexer, x):
            getattr(BlockLexer, x).insert(getattr(BlockLexer, x).index('block_code'), 'doctest')
        if 'block_html' in getattr(BlockLexer, x):
            getattr(BlockLexer, x).remove('block_html')


<!--
    
    # Register `doctest` immediately before `block_code` in each rule list and
    # drop `block_html`.  The insert is guarded so that running this cell more
    # than once does not stack duplicate `doctest` entries.
    for x in "default_rules footnote_rules list_rules".split():
        # rebind the attribute to a new list before mutating it
        setattr(BlockLexer, x, list(getattr(BlockLexer, x)))
        if 'doctest' not in getattr(BlockLexer, x):
            getattr(BlockLexer, x).insert(getattr(BlockLexer, x).index('block_code'), 'doctest')
        if 'block_html' in getattr(BlockLexer, x):
            getattr(BlockLexer, x).remove('block_html')


-->

Our translation creates tokens specific to each [Markdown] rule,
but for code it is only necessary to distinguish code and paragraph tokens.
The normalizer compacts the tokens into just those kinds.

In [5]:
    class Normalizer(BlockLexer):
        """Lexer that compacts block tokens into `code`, `paragraph` and `front_matter` tokens."""

        def normalize(self, text, tokens):
            """Combine non-code tokens into contiguous blocks.

            `tokens` is consumed (emptied) in place.  Each token's lines are located
            in the original `text` so that the emitted blocks carry the source text,
            including whatever lay between the matched lines.  Tokens without a
            `text` key, or with blank text, are skipped.

            Returns a list of dicts with `type` and `text` keys (`code` tokens also
            carry `lang: None`).  An input that yields no blocks returns `[]`.
            """
            compacted = []
            while tokens:
                token = tokens.pop(0)
                if 'text' not in token or not token['text'].strip():
                    continue
                body = ""
                for line in token['text'].splitlines():
                    if line:
                        # consume `text` up to and including this line; if the line
                        # is not found, `partition` hands back all remaining text
                        before, found, text = text.partition(line)
                        body += before + found
                if token['type'] == 'code':
                    compacted.append({'type': 'code', 'lang': None, 'text': body})
                elif compacted and compacted[-1]['type'] == 'paragraph':
                    # adjacent non-code blocks are merged into one paragraph
                    compacted[-1]['text'] += body
                else:
                    compacted.append({'type': 'paragraph', 'text': body})
            # whatever source text is left over belongs to a trailing paragraph
            if compacted and compacted[-1]['type'] == 'paragraph':
                compacted[-1]['text'] += text
            elif text.strip():
                compacted.append({'type': 'paragraph', 'text': text})
            # Deal with front matter.  `compacted` may be empty (blank input), so it
            # is checked before indexing.
            if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
                token = compacted.pop(0)
                front_matter, sep, paragraph = token['text'][4:].partition('---')
                compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                            {'type': 'paragraph', 'text': paragraph}] + compacted
            return compacted

    class Normalizer(BlockLexer):
        """Lexer that compacts block tokens into `code`, `paragraph` and `front_matter` tokens."""

        def normalize(self, text, tokens):
            """Combine non-code tokens into contiguous blocks.

            `tokens` is consumed (emptied) in place.  Each token's lines are located
            in the original `text` so that the emitted blocks carry the source text,
            including whatever lay between the matched lines.  Tokens without a
            `text` key, or with blank text, are skipped.

            Returns a list of dicts with `type` and `text` keys (`code` tokens also
            carry `lang: None`).  An input that yields no blocks returns `[]`.
            """
            compacted = []
            while tokens:
                token = tokens.pop(0)
                if 'text' not in token or not token['text'].strip():
                    continue
                body = ""
                for line in token['text'].splitlines():
                    if line:
                        # consume `text` up to and including this line; if the line
                        # is not found, `partition` hands back all remaining text
                        before, found, text = text.partition(line)
                        body += before + found
                if token['type'] == 'code':
                    compacted.append({'type': 'code', 'lang': None, 'text': body})
                elif compacted and compacted[-1]['type'] == 'paragraph':
                    # adjacent non-code blocks are merged into one paragraph
                    compacted[-1]['text'] += body
                else:
                    compacted.append({'type': 'paragraph', 'text': body})
            # whatever source text is left over belongs to a trailing paragraph
            if compacted and compacted[-1]['type'] == 'paragraph':
                compacted[-1]['text'] += text
            elif text.strip():
                compacted.append({'type': 'paragraph', 'text': text})
            # Deal with front matter.  `compacted` may be empty (blank input), so it
            # is checked before indexing.
            if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
                token = compacted.pop(0)
                front_matter, sep, paragraph = token['text'][4:].partition('---')
                compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                            {'type': 'paragraph', 'text': paragraph}] + compacted
            return compacted

## Tokenizer logic

The tokenizer controls the translation of markdown strings to python strings.  Our major constraint is that the Markdown input should retain line numbers.

In [6]:
    class Tokenizer(Normalizer):
        """Turns normalized block tokens back into a single Python source string."""

        def untokenize(self, tokens: τ.List[dict], source: str = """""", last: int =0) -> str:
            """Concatenate `tokens` into Python source.

            Code tokens are emitted as code (fenced code has its fence markers
            removed and is indented), front matter becomes a
            `locals().update(yaml.safe_load(...))` statement, and every other
            non-empty token is indented and, unless the source so far already ends
            in a triple quote, wrapped in quotes so it becomes a string.

            `source` is the text to append to.  `last` is accepted but not used by
            this implementation.  Returns the accumulated source.
            """
            INDENT = indent = base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                block, kind = token['text'], token['type']
                if kind == 'code':
                    if block.lstrip().startswith(FENCE):
                        # remove the first and the last fence marker, keeping all other text
                        opening, _, rest = block.partition(FENCE)
                        body, _, closing = (opening + rest).rpartition(FENCE)
                        block = body + closing
                        indent = INDENT + num_first_indent(block)
                        block = textwrap.indent(block, INDENT*SPACE)

                    # code starting with the magic marker leaves `indent` alone; all
                    # other code sets it from its last non-blank line
                    if not block.lstrip().startswith(MAGIC):
                        indent = num_last_indent(block)
                elif kind == 'front_matter':
                    block = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({quote(block)}))\n", indent*SPACE)
                elif block:
                    block = textwrap.indent(block, SPACE*max(indent-num_first_indent(block), 0))
                    # look ahead: the next code token decides how deep this text sits
                    for following in tokens[i+1:]:
                        if following['type'] == 'code':
                            next_indent = num_first_indent(following['text'])
                            break
                    else:
                        next_indent = indent
                    Δ = max(next_indent-indent, 0)

                    # text directly after a line ending in a colon is pushed one level deeper
                    if not Δ and source.rstrip().rstrip(CONTINUATION).endswith(COLON):
                        Δ += 4

                    spaces = num_whitespace(block)
                    # NOTE(original author): what if the spaces are long enough?
                    block = block[:spaces] + Δ*SPACE + block[spaces:]
                    # an open triple quote at the end of `source` already quotes this text
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES):
                        block = quote(block)
                source += block

            # Append a semicolon when the last non-blank token is NOT code.
            # NOTE(review): presumably this keeps a trailing string expression from
            # being displayed as the cell result -- confirm.
            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code':
                        source = source.rstrip() + SEMI
                    break

            return source
            
    # Module-level `pidgyTransformer` instance, bound to the name `pidgy`.
    pidgy = pidgyTransformer()

    class Tokenizer(Normalizer):
        """Turns normalized block tokens back into a single Python source string."""

        def untokenize(self, tokens: τ.List[dict], source: str = """""", last: int =0) -> str:
            """Concatenate `tokens` into Python source.

            Code tokens are emitted as code (fenced code has its fence markers
            removed and is indented), front matter becomes a
            `locals().update(yaml.safe_load(...))` statement, and every other
            non-empty token is indented and, unless the source so far already ends
            in a triple quote, wrapped in quotes so it becomes a string.

            `source` is the text to append to.  `last` is accepted but not used by
            this implementation.  Returns the accumulated source.
            """
            INDENT = indent = base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                block, kind = token['text'], token['type']
                if kind == 'code':
                    if block.lstrip().startswith(FENCE):
                        # remove the first and the last fence marker, keeping all other text
                        opening, _, rest = block.partition(FENCE)
                        body, _, closing = (opening + rest).rpartition(FENCE)
                        block = body + closing
                        indent = INDENT + num_first_indent(block)
                        block = textwrap.indent(block, INDENT*SPACE)

                    # code starting with the magic marker leaves `indent` alone; all
                    # other code sets it from its last non-blank line
                    if not block.lstrip().startswith(MAGIC):
                        indent = num_last_indent(block)
                elif kind == 'front_matter':
                    block = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({quote(block)}))\n", indent*SPACE)
                elif block:
                    block = textwrap.indent(block, SPACE*max(indent-num_first_indent(block), 0))
                    # look ahead: the next code token decides how deep this text sits
                    for following in tokens[i+1:]:
                        if following['type'] == 'code':
                            next_indent = num_first_indent(following['text'])
                            break
                    else:
                        next_indent = indent
                    Δ = max(next_indent-indent, 0)

                    # text directly after a line ending in a colon is pushed one level deeper
                    if not Δ and source.rstrip().rstrip(CONTINUATION).endswith(COLON):
                        Δ += 4

                    spaces = num_whitespace(block)
                    # NOTE(original author): what if the spaces are long enough?
                    block = block[:spaces] + Δ*SPACE + block[spaces:]
                    # an open triple quote at the end of `source` already quotes this text
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES):
                        block = quote(block)
                source += block

            # Append a semicolon when the last non-blank token is NOT code.
            # NOTE(review): presumably this keeps a trailing string expression from
            # being displayed as the cell result -- confirm.
            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code':
                        source = source.rstrip() + SEMI
                    break

            return source
            
    # Module-level `pidgyTransformer` instance, bound to the name `pidgy`.
    pidgy = pidgyTransformer()

<details><summary>Utility functions for the tangle module</summary>

In [7]:
    def load_ipython_extension(shell):
        """Install one new `pidgyTransformer` as both `shell.input_transformer_manager` and `shell.tangle`."""
        transformer = pidgyTransformer()
        shell.input_transformer_manager = transformer
        shell.tangle = transformer
    
    def unload_ipython_extension(shell):
        """Replace the shell's input transformer manager with a new stock `TransformerManager`."""
        ipython = __import__('IPython')
        manager_class = ipython.core.inputtransformer2.TransformerManager
        shell.input_transformer_manager = manager_class()
    
    # Literal markers the tangle step looks for in the source text.
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    # The two triple-quote delimiters, and the single space used to build indents.
    QUOTES, SPACE = ('"""', "'''"), ' '
    # Leading whitespace of every line.  Raw string: `\s` inside a plain literal
    # is an invalid escape sequence and triggers a warning on modern Python.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def num_first_indent(text):
        """Return the count of leading whitespace characters on the first non-blank line of `text`.

        Returns 0 when `text` has no non-blank line.
        """
        non_blank = (line for line in text.splitlines() if line.strip())
        first = next(non_blank, None)
        if first is None:
            return 0
        return len(first) - len(first.lstrip())
    
    def num_last_indent(text):
        """Return the count of leading whitespace characters on the last non-blank line of `text`.

        Returns 0 when `text` has no non-blank line.
        """
        non_blank = (line for line in reversed(text.splitlines()) if line.strip())
        final = next(non_blank, None)
        if final is None:
            return 0
        return len(final) - len(final.lstrip())

    def base_indent(tokens):
        """Look ahead for the base indent.

        The indent of the first `code` token that does not start with `FENCE`
        wins; if there is no such token the result is 4.
        """
        for token in tokens:
            if token['type'] != 'code':
                continue
            code = token['text']
            if not code.lstrip().startswith(FENCE):
                return num_first_indent(code)
        return 4

    def quote(text):
        """Wrap the non-whitespace core of `text` in one of the `QUOTES` delimiters.

        Leading and trailing whitespace stays outside the delimiters.  The second
        delimiter is chosen when the core ends in a character of the first one or
        contains the first one; blank text is returned untouched.
        """
        if not text.strip():
            return text
        start = len(text) - len(text.lstrip())
        stop = len(text.rstrip())
        use_second = (text[stop-1] in QUOTES[0]) or (QUOTES[0] in text)
        delimiter = QUOTES[1] if use_second else QUOTES[0]
        return ''.join((text[:start], delimiter, text[start:stop], delimiter, text[stop:]))

    def num_whitespace(text):
        """Return how many whitespace characters `text` starts with."""
        stripped = text.lstrip()
        return len(text) - len(stripped)
    
    def whiten(text: str) -> str:
        """Strip trailing whitespace from every line of `text` and rejoin the lines with `\\n`.

        Whitespace-only lines become empty lines, which is what the
        `markdown.BlockLexer` is fed.  A final line terminator is not kept.
        """
        trimmed = [line.rstrip() for line in text.splitlines()]
        return '\n'.join(trimmed)

    def load_ipython_extension(shell):
        """Install one new `pidgyTransformer` as both `shell.input_transformer_manager` and `shell.tangle`."""
        transformer = pidgyTransformer()
        shell.input_transformer_manager = transformer
        shell.tangle = transformer
    
    def unload_ipython_extension(shell):
        """Replace the shell's input transformer manager with a new stock `TransformerManager`."""
        ipython = __import__('IPython')
        manager_class = ipython.core.inputtransformer2.TransformerManager
        shell.input_transformer_manager = manager_class()
    
    # Literal markers the tangle step looks for in the source text.
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    # The two triple-quote delimiters, and the single space used to build indents.
    QUOTES, SPACE = ('"""', "'''"), ' '
    # Leading whitespace of every line.  Raw string: `\s` inside a plain literal
    # is an invalid escape sequence and triggers a warning on modern Python.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def num_first_indent(text):
        """Return the count of leading whitespace characters on the first non-blank line of `text`.

        Returns 0 when `text` has no non-blank line.
        """
        non_blank = (line for line in text.splitlines() if line.strip())
        first = next(non_blank, None)
        if first is None:
            return 0
        return len(first) - len(first.lstrip())
    
    def num_last_indent(text):
        """Return the count of leading whitespace characters on the last non-blank line of `text`.

        Returns 0 when `text` has no non-blank line.
        """
        non_blank = (line for line in reversed(text.splitlines()) if line.strip())
        final = next(non_blank, None)
        if final is None:
            return 0
        return len(final) - len(final.lstrip())

    def base_indent(tokens):
        """Look ahead for the base indent.

        The indent of the first `code` token that does not start with `FENCE`
        wins; if there is no such token the result is 4.
        """
        for token in tokens:
            if token['type'] != 'code':
                continue
            code = token['text']
            if not code.lstrip().startswith(FENCE):
                return num_first_indent(code)
        return 4

    def quote(text):
        """Wrap the non-whitespace core of `text` in one of the `QUOTES` delimiters.

        Leading and trailing whitespace stays outside the delimiters.  The second
        delimiter is chosen when the core ends in a character of the first one or
        contains the first one; blank text is returned untouched.
        """
        if not text.strip():
            return text
        start = len(text) - len(text.lstrip())
        stop = len(text.rstrip())
        use_second = (text[stop-1] in QUOTES[0]) or (QUOTES[0] in text)
        delimiter = QUOTES[1] if use_second else QUOTES[0]
        return ''.join((text[:start], delimiter, text[start:stop], delimiter, text[stop:]))

    def num_whitespace(text):
        """Return how many whitespace characters `text` starts with."""
        stripped = text.lstrip()
        return len(text) - len(stripped)
    
    def whiten(text: str) -> str:
        """Strip trailing whitespace from every line of `text` and rejoin the lines with `\\n`.

        Whitespace-only lines become empty lines, which is what the
        `markdown.BlockLexer` is fed.  A final line terminator is not kept.
        """
        trimmed = [line.rstrip() for line in text.splitlines()]
        return '\n'.join(trimmed)

</details>