# Tangling [Markdown] to [Python]



In [2]:
    
    import IPython, typing, mistune as markdown, IPython, textwrap, ast, doctest, re
    try: from . import base
    except: import base

<!--
    
    import IPython, typing, mistune as markdown, IPython, textwrap, ast, doctest, re
    try: from . import base
    except: import base

-->

The `pidgyTransformer` uses the existing `IPython.core.inputtransformer2.TransformerManager` to configure the [Markdown] language features, and it is the public API for manipulating `pidgy` strings.  It implements the heuristics applied to create predictable [Python] from [Markdown].

In [3]:
    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager, base.Extension):
        """Transformer manager that tangles pidgy Markdown to Python before the parent transforms run."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Each transformer owns the lexer it uses to parse and untokenize cells.
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse ``cell`` into tokens and flatten the tokens back into one source string."""
            markdown_source = ''.join(cell)
            tokens = self.tokenizer.parse(markdown_source)
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell: str) -> str:
            """Apply ``pidgy_transform`` first, then the parent class's ``transform_cell``."""
            python_source = self.pidgy_transform(cell)
            return super().transform_cell(python_source)

        # ``transform`` is an alias of ``transform_cell``.
        transform = transform_cell

        def pidgy_magic(self, *text):
            """Return the tangled form of the joined ``text`` as an ``IPython.display.Code`` object."""
            joined = ''.join(text)
            return IPython.display.Code(self.pidgy_transform(joined), language='python')

    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager, base.Extension):
        """Transformer manager that tangles pidgy Markdown to Python before the parent transforms run."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Each transformer owns the lexer it uses to parse and untokenize cells.
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse ``cell`` into tokens and flatten the tokens back into one source string."""
            markdown_source = ''.join(cell)
            tokens = self.tokenizer.parse(markdown_source)
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell: str) -> str:
            """Apply ``pidgy_transform`` first, then the parent class's ``transform_cell``."""
            python_source = self.pidgy_transform(cell)
            return super().transform_cell(python_source)

        # ``transform`` is an alias of ``transform_cell``.
        transform = transform_cell

        def pidgy_magic(self, *text):
            """Return the tangled form of the joined ``text`` as an ``IPython.display.Code`` object."""
            joined = ''.join(text)
            return IPython.display.Code(self.pidgy_transform(joined), language='python')

## The translation process

A convenient consideration when tangling pidgy documents is that we are only concerned with the placement of block code objects relative to the `not "code"` blocks.  `pidgy` customizes `mistune` language features for the purposes of a [Literate Computing] experience. The conversion to [Python] uses:
1. lexical analysis to tokenize the markdown.
2. the tokens are normalized to block `"code" and not "code"` objects.
3. the tokens are translated to a string using heuristics that maintain line numbers between the representations.

### Block level lexical analysis.

The block lexer converts a string into tokens that represent blocks of markdown in a text.  `pidgy` establishes a modified mistune block lexer that patches some needed features. It includes `doctest` syntax as a language feature. `doctest`s are tested interactively, but they are considered `not "code"` objects in the [Markdown] to [Python] translation.  `doctest` is added because it is a common documentation approach in [Python], and it is an example of [Literate Programming].

<details><summary><code>BlockLexer</code></summary>

In [10]:
    class BlockLexer(markdown.BlockLexer, util.ContextDepth):
        """mistune block lexer with a doctest rule and patched fence, hrule and def-link handlers."""

        class grammar_class(markdown.BlockGrammar):
            # The stdlib doctest example pattern doubles as a block-level rule.
            doctest = doctest.DocTestParser._EXAMPLE_RE
            # Runs of 4-space indented lines, excluding lines whose indentation leads into a ``>>> `` prompt.
            block_code = re.compile(r'^((?!\s+>>>\s) {4}[^\n]+\n*)+')
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # Nesting level of ``parse`` calls; maintained by ``__enter__``/``__exit__``.
        depth = 0

        def __enter__(self):
            self.depth += 1

        def __exit__(self, *e):
            self.depth -= 1

        def parse_doctest(self, m):
            """Store a doctest match as a ``paragraph`` token holding the matched text."""
            self.tokens.append(dict(type='paragraph', text=m.group(0)))

        def parse_fences(self, m):
            """Keep fences with a truthy group 2 as paragraphs; defer all others to the parent."""
            # NOTE(review): group 2 is presumably the fence's language/info string - confirm against mistune.
            if not m.group(2):
                super().parse_fences(m)
                return
            self.tokens.append(dict(type='paragraph', text=m.group(0)))

        def parse_hrule(self, m):
            """Store the rule as an ``hrule`` token that carries its matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def parse_def_links(self, m):
            """Run the parent handler, then also record the raw definition as a ``def_link`` token."""
            super().parse_def_links(m)
            self.tokens.append({'type': 'def_link', 'text': m.group(0)})

        def parse(self, text: str, default_rules=None, normalize=True) -> typing.List[dict]:
            """Lex ``text``; the outermost call resets the token list and (optionally) normalizes."""
            outermost = not self.depth
            if outermost:
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            # Nested parse calls return the raw tokens; only the outermost call normalizes.
            if normalize and outermost:
                tokens = normalizer(text, tokens)
            return tokens


    class BlockLexer(markdown.BlockLexer, util.ContextDepth):
        """mistune block lexer with a doctest rule and patched fence, hrule and def-link handlers."""

        class grammar_class(markdown.BlockGrammar):
            # The stdlib doctest example pattern doubles as a block-level rule.
            doctest = doctest.DocTestParser._EXAMPLE_RE
            # Runs of 4-space indented lines, excluding lines whose indentation leads into a ``>>> `` prompt.
            block_code = re.compile(r'^((?!\s+>>>\s) {4}[^\n]+\n*)+')
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # Nesting level of ``parse`` calls; maintained by ``__enter__``/``__exit__``.
        depth = 0

        def __enter__(self):
            self.depth += 1

        def __exit__(self, *e):
            self.depth -= 1

        def parse_doctest(self, m):
            """Store a doctest match as a ``paragraph`` token holding the matched text."""
            self.tokens.append(dict(type='paragraph', text=m.group(0)))

        def parse_fences(self, m):
            """Keep fences with a truthy group 2 as paragraphs; defer all others to the parent."""
            # NOTE(review): group 2 is presumably the fence's language/info string - confirm against mistune.
            if not m.group(2):
                super().parse_fences(m)
                return
            self.tokens.append(dict(type='paragraph', text=m.group(0)))

        def parse_hrule(self, m):
            """Store the rule as an ``hrule`` token that carries its matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def parse_def_links(self, m):
            """Run the parent handler, then also record the raw definition as a ``def_link`` token."""
            super().parse_def_links(m)
            self.tokens.append({'type': 'def_link', 'text': m.group(0)})

        def parse(self, text: str, default_rules=None, normalize=True) -> typing.List[dict]:
            """Lex ``text``; the outermost call resets the token list and (optionally) normalizes."""
            outermost = not self.depth
            if outermost:
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            # Nested parse calls return the raw tokens; only the outermost call normalizes.
            if normalize and outermost:
                tokens = normalizer(text, tokens)
            return tokens

In [7]:
    
    for x in "default_rules footnote_rules list_rules".split():
        # Install a fresh copy so the list the attribute previously pointed at is not mutated.
        rules = list(getattr(BlockLexer, x))
        setattr(BlockLexer, x, rules)
        # 'doctest' is placed directly in front of 'block_code'.
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')


<!--
    
    for x in "default_rules footnote_rules list_rules".split():
        # Install a fresh copy so the list the attribute previously pointed at is not mutated.
        rules = list(getattr(BlockLexer, x))
        setattr(BlockLexer, x, rules)
        # 'doctest' is placed directly in front of 'block_code'.
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')


-->

</details>

### Normalizing the tokens

This extra step flattens the canonical mistune token representation to the collection of `"code" and not "code"` tokens.

<details><summary><code>normalizer</code></summary>

In [11]:
    def normalizer(text, tokens):
        """Combine non-code tokens into contiguous blocks.

        Each token's lines are located again inside ``text`` so that the
        returned blocks carry the source verbatim, including whatever sits
        between two tokens.

        Args:
            text: the source string the blocks are sliced from.
            tokens: token dicts; entries without a non-blank ``'text'`` are
                ignored.  The list is drained as a side effect.

        Returns:
            A list of ``{'type': 'code', 'lang': None, 'text': ...}`` and
            ``{'type': 'paragraph', 'text': ...}`` dicts.  When the first block
            starts with a ``---`` fenced header it is split into a
            ``front_matter`` block followed by a ``paragraph``.  The list is
            empty when no block survives (e.g. an empty cell).
        """
        remaining = text
        compacted = []
        for token in tokens:
            if 'text' not in token or not token['text'].strip():
                continue
            body = ""
            for line in token['text'].splitlines():
                if not line:
                    continue
                # Consume the source up to and including this line; if the line
                # is not found, partition hands back everything that is left.
                before, found, remaining = remaining.partition(line)
                body += before + found
            if token['type'] == 'code':
                compacted.append({'type': 'code', 'lang': None, 'text': body})
            elif compacted and compacted[-1]['type'] == 'paragraph':
                # Adjacent non-code tokens are merged into one paragraph.
                compacted[-1]['text'] += body
            else:
                compacted.append({'type': 'paragraph', 'text': body})
        # Drain the caller's list in one step instead of repeated pop(0) calls.
        tokens.clear()

        # Whatever source is left joins the trailing paragraph or becomes one.
        if compacted and compacted[-1]['type'] == 'paragraph':
            compacted[-1]['text'] += remaining
        elif remaining.strip():
            compacted.append({'type': 'paragraph', 'text': remaining})

        # Deal with front matter (guarded: there may be no blocks at all).
        if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
            token = compacted.pop(0)
            front_matter, sep, paragraph = token['text'][4:].partition('---')
            compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                        {'type': 'paragraph', 'text': paragraph}] + compacted
        return compacted

    def normalizer(text, tokens):
        """Combine non-code tokens into contiguous blocks.

        Each token's lines are located again inside ``text`` so that the
        returned blocks carry the source verbatim, including whatever sits
        between two tokens.

        Args:
            text: the source string the blocks are sliced from.
            tokens: token dicts; entries without a non-blank ``'text'`` are
                ignored.  The list is drained as a side effect.

        Returns:
            A list of ``{'type': 'code', 'lang': None, 'text': ...}`` and
            ``{'type': 'paragraph', 'text': ...}`` dicts.  When the first block
            starts with a ``---`` fenced header it is split into a
            ``front_matter`` block followed by a ``paragraph``.  The list is
            empty when no block survives (e.g. an empty cell).
        """
        remaining = text
        compacted = []
        for token in tokens:
            if 'text' not in token or not token['text'].strip():
                continue
            body = ""
            for line in token['text'].splitlines():
                if not line:
                    continue
                # Consume the source up to and including this line; if the line
                # is not found, partition hands back everything that is left.
                before, found, remaining = remaining.partition(line)
                body += before + found
            if token['type'] == 'code':
                compacted.append({'type': 'code', 'lang': None, 'text': body})
            elif compacted and compacted[-1]['type'] == 'paragraph':
                # Adjacent non-code tokens are merged into one paragraph.
                compacted[-1]['text'] += body
            else:
                compacted.append({'type': 'paragraph', 'text': body})
        # Drain the caller's list in one step instead of repeated pop(0) calls.
        tokens.clear()

        # Whatever source is left joins the trailing paragraph or becomes one.
        if compacted and compacted[-1]['type'] == 'paragraph':
            compacted[-1]['text'] += remaining
        elif remaining.strip():
            compacted.append({'type': 'paragraph', 'text': remaining})

        # Deal with front matter (guarded: there may be no blocks at all).
        if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
            token = compacted.pop(0)
            front_matter, sep, paragraph = token['text'][4:].partition('---')
            compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                        {'type': 'paragraph', 'text': paragraph}] + compacted
        return compacted

</details>

### Flattening the tokens to a [Python] string.

The tokenizer controls the translation of markdown strings to python strings.  Our major constraint is that the Markdown input should retain line numbers.

<details><summary><code>Flatten</code></summary>

In [4]:
    class Tokenizer(BlockLexer):
        """Block lexer that can also flatten its normalized tokens back into one source string."""

        def untokenize(self, tokens: typing.List[dict], source: str = """""", last: int =0) -> str:
            """Translate normalized tokens into a single source string.

            * ``code`` tokens are emitted as code; a fenced block loses its first
              and last fence marker and is indented by the base indent.
            * ``front_matter`` becomes a ``yaml.safe_load`` call whose result
              updates ``locals()``.
            * every other non-empty token is re-indented to line up with the
              surrounding code and, unless the source so far ends with a quote
              marker, passed through ``util.quote``.

            Args:
                tokens: ``code`` / ``paragraph`` / ``front_matter`` token dicts.
                source: text the translated blocks are appended to.
                last: not used by this method.

            Returns:
                ``source`` with every translated block appended; ``SEMI`` is
                appended when the last non-blank token is not ``code``.
            """
            # Reached through ``util`` like every other helper used in this method.
            INDENT = indent = util.base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                body = token['text']
                if token['type'] == 'code':
                    if body.lstrip().startswith(FENCE):
                        # Drop the first and the last fence marker, keep what is between them.
                        body = ''.join(''.join(body.partition(FENCE)[::2]).rpartition(FENCE)[::2])
                        indent = INDENT + util.num_first_indent(body)
                        body = textwrap.indent(body, INDENT*SPACE)

                    # Code that opens with a cell magic leaves the running indent alone.
                    if not body.lstrip().startswith(MAGIC):
                        indent = util.num_last_indent(body)
                elif token['type'] == 'front_matter':
                    body = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({util.quote(body)}))\n", indent*SPACE)
                elif not body:
                    pass
                else:
                    body = textwrap.indent(body, SPACE*max(indent-util.num_first_indent(body), 0))
                    # Look ahead: the next code block decides how deep this string must sit.
                    for following in tokens[i+1:]:
                        if following['type'] == 'code':
                            next_indent = util.num_first_indent(following['text'])
                            break
                    else:
                        next_indent = indent
                    delta = max(next_indent-indent, 0)

                    # The source so far ends with a colon: the string has to open an indented suite.
                    if not delta and source.rstrip().rstrip(CONTINUATION).endswith(COLON):
                        delta += 4

                    spaces = util.num_whitespace(body)
                    # TODO: what if the existing leading whitespace is already long enough?
                    body = body[:spaces] + delta*SPACE + body[spaces:]
                    # Already inside an open quote: do not wrap the text a second time.
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES):
                        body = util.quote(body)
                source += body

            # Append a semicolon when the last non-blank block is NOT code.
            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code':
                        source = source.rstrip() + SEMI
                    break

            return source

    class Tokenizer(BlockLexer):
        """Block lexer that can also flatten its normalized tokens back into one source string."""

        def untokenize(self, tokens: typing.List[dict], source: str = """""", last: int =0) -> str:
            """Translate normalized tokens into a single source string.

            * ``code`` tokens are emitted as code; a fenced block loses its first
              and last fence marker and is indented by the base indent.
            * ``front_matter`` becomes a ``yaml.safe_load`` call whose result
              updates ``locals()``.
            * every other non-empty token is re-indented to line up with the
              surrounding code and, unless the source so far ends with a quote
              marker, passed through ``util.quote``.

            ``source`` is the text the blocks are appended to; ``last`` is not
            used by this method.  Returns the accumulated source, with ``SEMI``
            appended when the last non-blank token is not ``code``.
            """
            # Base indent comes from the tokens; falls back to 4 when the helper returns a falsy value.
            INDENT = indent = util.base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                object = token['text']
                if token and token['type'] == 'code':
                    if object.lstrip().startswith(FENCE):

                        # Drop the first and the last fence marker, keep what is between them.
                        object = ''.join(''.join(object.partition(FENCE)[::2]).rpartition(FENCE)[::2])
                        indent = INDENT + util.num_first_indent(object)
                        object = textwrap.indent(object, INDENT*SPACE)

                    # Code that opens with a cell magic leaves the running indent alone;
                    # otherwise the running indent follows the block's last non-blank line.
                    if object.lstrip().startswith(MAGIC):  ...
                    else: indent = util.num_last_indent(object)
                elif token and token['type'] == 'front_matter': 
                    # NOTE(review): util.quote presumably wraps the text in a string literal - confirm.
                    object = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({util.quote(object)}))\n", indent*SPACE)

                elif not object: ...
                else:
                    object = textwrap.indent(object, SPACE*max(indent-util.num_first_indent(object), 0))
                    # Look ahead: ``next`` ends up as the first indent of the next code token,
                    # or as the current indent when no code follows.
                    for next in tokens[i+1:]:
                        if next['type'] == 'code':
                            next = util.num_first_indent(next['text'])
                            break
                    else: next = indent       
                    Δ = max(next-indent, 0)

                    # The source so far ends with a colon: add four spaces of extra indentation.
                    if not Δ and source.rstrip().rstrip(CONTINUATION).endswith(COLON): 
                        Δ += 4

                    # NOTE(review): num_whitespace presumably counts the leading whitespace - confirm.
                    spaces = util.num_whitespace(object)
                    "what if the spaces are ling enough"
                    object = object[:spaces] + Δ*SPACE+ object[spaces:]
                    # Skip quoting when the source so far already ends with a quote marker.
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES): 
                        object = util.quote(object)
                source += object

            # Append a semicolon when the last non-blank block is NOT code.
            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code': 
                        source = source.rstrip() + SEMI
                    break

            return source

</details>

### Normalizing the tokens

This step may be superfluous, but it assisted in considering the logic necessary to compose the resulting python.  This extra step flattens the canonical mistune token representation so that it is reduced to one of `"paragraph code front_matter"` tokens.

</details>

<details><summary>Utility functions for the tangle module</summary>

In [5]:
    def normalizer(text: str, tokens: typing.List[dict]):
        """Combine non-code tokens into contiguous blocks.

        Each token's lines are located again inside ``text`` so that the
        returned blocks carry the source verbatim, including whatever sits
        between two tokens.

        Args:
            text: the source string the blocks are sliced from.
            tokens: token dicts; entries without a non-blank ``'text'`` are
                ignored.  The list is drained as a side effect.

        Returns:
            A list of ``{'type': 'code', 'lang': None, 'text': ...}`` and
            ``{'type': 'paragraph', 'text': ...}`` dicts.  When the first block
            starts with a ``---`` fenced header it is split into a
            ``front_matter`` block followed by a ``paragraph``.  The list is
            empty when no block survives (e.g. an empty cell).
        """
        remaining = text
        compacted = []
        for token in tokens:
            if 'text' not in token or not token['text'].strip():
                continue
            body = ""
            for line in token['text'].splitlines():
                if not line:
                    continue
                # Consume the source up to and including this line; if the line
                # is not found, partition hands back everything that is left.
                before, found, remaining = remaining.partition(line)
                body += before + found
            if token['type'] == 'code':
                compacted.append({'type': 'code', 'lang': None, 'text': body})
            elif compacted and compacted[-1]['type'] == 'paragraph':
                # Adjacent non-code tokens are merged into one paragraph.
                compacted[-1]['text'] += body
            else:
                compacted.append({'type': 'paragraph', 'text': body})
        # Drain the caller's list in one step instead of repeated pop(0) calls.
        tokens.clear()

        # Whatever source is left joins the trailing paragraph or becomes one.
        if compacted and compacted[-1]['type'] == 'paragraph':
            compacted[-1]['text'] += remaining
        elif remaining.strip():
            compacted.append({'type': 'paragraph', 'text': remaining})

        # Split off front matter (guarded: there may be no blocks at all).
        if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
            token = compacted.pop(0)
            front_matter, sep, paragraph = token['text'][4:].partition('---')
            compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                        {'type': 'paragraph', 'text': paragraph}] + compacted
        return compacted

    def normalizer(text: str, tokens: typing.List[dict]):
        """Combine non-code tokens into contiguous blocks.

        Each token's lines are located again inside ``text`` so that the
        returned blocks carry the source verbatim, including whatever sits
        between two tokens.

        Args:
            text: the source string the blocks are sliced from.
            tokens: token dicts; entries without a non-blank ``'text'`` are
                ignored.  The list is drained as a side effect.

        Returns:
            A list of ``{'type': 'code', 'lang': None, 'text': ...}`` and
            ``{'type': 'paragraph', 'text': ...}`` dicts.  When the first block
            starts with a ``---`` fenced header it is split into a
            ``front_matter`` block followed by a ``paragraph``.  The list is
            empty when no block survives (e.g. an empty cell).
        """
        remaining = text
        compacted = []
        for token in tokens:
            if 'text' not in token or not token['text'].strip():
                continue
            body = ""
            for line in token['text'].splitlines():
                if not line:
                    continue
                # Consume the source up to and including this line; if the line
                # is not found, partition hands back everything that is left.
                before, found, remaining = remaining.partition(line)
                body += before + found
            if token['type'] == 'code':
                compacted.append({'type': 'code', 'lang': None, 'text': body})
            elif compacted and compacted[-1]['type'] == 'paragraph':
                # Adjacent non-code tokens are merged into one paragraph.
                compacted[-1]['text'] += body
            else:
                compacted.append({'type': 'paragraph', 'text': body})
        # Drain the caller's list in one step instead of repeated pop(0) calls.
        tokens.clear()

        # Whatever source is left joins the trailing paragraph or becomes one.
        if compacted and compacted[-1]['type'] == 'paragraph':
            compacted[-1]['text'] += remaining
        elif remaining.strip():
            compacted.append({'type': 'paragraph', 'text': remaining})

        # Split off front matter (guarded: there may be no blocks at all).
        if compacted and compacted[0]['text'].startswith('---\n') and '\n---' in compacted[0]['text'][4:]:
            token = compacted.pop(0)
            front_matter, sep, paragraph = token['text'][4:].partition('---')
            compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                        {'type': 'paragraph', 'text': paragraph}] + compacted
        return compacted

</details>

In [6]:
    def load_ipython_extension(shell):
        """Build a ``pidgyTransformer``, register it, and store the result on ``shell.tangle``."""
        transformer = pidgyTransformer()
        shell.tangle = transformer.register()
    
    def unload_ipython_extension(shell):
        """Unregister ``shell.tangle`` from ``shell``; do nothing when the attribute is missing."""
        if not hasattr(shell, 'tangle'):
            return
        shell.tangle.unregister(shell)
    
    # Marker strings used when flattening tokens back into source.
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    QUOTES = ('"""', "'''")
    SPACE = ' '
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on Python 3.12+).  The pattern is unchanged.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def num_first_indent(text):
        """Return the leading-whitespace width of the first non-blank line of ``text`` (0 if none)."""
        widths = (
            len(line) - len(line.lstrip())
            for line in text.splitlines()
            if line.strip()
        )
        return next(widths, 0)
    
    def num_last_indent(text):
        """Return the leading-whitespace width of the last non-blank line of ``text`` (0 if none)."""
        widths = (
            len(line) - len(line.lstrip())
            for line in reversed(text.splitlines())
            if line.strip()
        )
        return next(widths, 0)

    def unload_ipython_extension(shell):
        """Unregister ``shell.tangle`` from ``shell``; do nothing when the attribute is missing."""
        if not hasattr(shell, 'tangle'):
            return
        shell.tangle.unregister(shell)

    # Marker strings used when flattening tokens back into source.
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    QUOTES = ('"""', "'''")
    SPACE = ' '
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on Python 3.12+).  The pattern is unchanged.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def load_ipython_extension(shell):
        """Build a ``pidgyTransformer``, register it, and store the result on ``shell.tangle``."""
        transformer = pidgyTransformer()
        shell.tangle = transformer.register()

    def unload_ipython_extension(shell):
        """Unregister ``shell.tangle`` from ``shell``; do nothing when the attribute is missing."""
        if not hasattr(shell, 'tangle'):
            return
        shell.tangle.unregister(shell)

    # Marker strings used when flattening tokens back into source.
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    QUOTES = ('"""', "'''")
    SPACE = ' '
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on Python 3.12+).  The pattern is unchanged.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

In [7]:
    
    for x in "default_rules footnote_rules list_rules".split():
        # Install a fresh copy so the list the attribute previously pointed at is not mutated.
        rules = list(getattr(BlockLexer, x))
        setattr(BlockLexer, x, rules)
        # 'doctest' is placed directly in front of 'block_code'.
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')


<!--
    
    for x in "default_rules footnote_rules list_rules".split():
        # Install a fresh copy so the list the attribute previously pointed at is not mutated.
        rules = list(getattr(BlockLexer, x))
        setattr(BlockLexer, x, rules)
        # 'doctest' is placed directly in front of 'block_code'.
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')


-->

</summary></details>