# Tangling [Markdown] to [Python]

The tangle step in literate programming converts the input document
into source code in the target programming language.

In [1]:
    
    import IPython, typing as τ, mistune as markdown, IPython, importnb as _import_, textwrap, ast, doctest, typing, re

<!--
    
    import IPython, typing as τ, mistune as markdown, IPython, importnb as _import_, textwrap, ast, doctest, typing, re

-->

The `pidgyTransformer` provides the high-level API that the `IPython.InteractiveShell` uses to tangle `pidgy` input.

In [2]:
    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager):
        """Transformer manager that tangles a cell with `Tokenizer` before
        handing it to the stock IPython transformations."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # One tokenizer per manager; every transform goes through it.
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse `cell` into tokens and untokenize them into source text."""
            tokens = self.tokenizer.parse(''.join(cell))
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell):
            """Tangle `cell`, then apply the parent class's `transform_cell`."""
            tangled = self.pidgy_transform(cell)
            return super().transform_cell(tangled)

        def pidgy_magic(self, *text):
            """Join `text`, tangle it and wrap the result in an
            `IPython.display.Code` object with ``language='python'``."""
            tangled = self.pidgy_transform(''.join(text))
            return IPython.display.Code(tangled, language='python')

    class pidgyTransformer(IPython.core.inputtransformer2.TransformerManager):
        """Transformer manager that tangles a cell with `Tokenizer` before
        handing it to the stock IPython transformations."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # One tokenizer per manager; every transform goes through it.
            self.tokenizer = Tokenizer()

        def pidgy_transform(self, cell: str) -> str:
            """Parse `cell` into tokens and untokenize them into source text."""
            tokens = self.tokenizer.parse(''.join(cell))
            return self.tokenizer.untokenize(tokens)

        def transform_cell(self, cell):
            """Tangle `cell`, then apply the parent class's `transform_cell`."""
            tangled = self.pidgy_transform(cell)
            return super().transform_cell(tangled)

        def pidgy_magic(self, *text):
            """Join `text`, tangle it and wrap the result in an
            `IPython.display.Code` object with ``language='python'``."""
            tangled = self.pidgy_transform(''.join(text))
            return IPython.display.Code(tangled, language='python')

## Tokenizer logic

The tokenizer controls the translation of Markdown strings to Python strings.  Our major constraint is that the tangled Python must keep the same line numbers as the Markdown input.

In [10]:
    class Tokenizer(markdown.BlockLexer):
        """Block lexer that splits a document into code and non-code tokens.

        `parse` returns a compacted token list made of `code` and `paragraph`
        tokens, optionally preceded by a `front_matter` token.  `untokenize`
        concatenates such a list back into one source string, re-indenting the
        non-code text and (normally) wrapping it in quotes.
        """

        class grammar_class(markdown.BlockGrammar):
            # Reuse the standard library's doctest example pattern as a block rule.
            doctest = doctest.DocTestParser._EXAMPLE_RE
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # Nesting level of `parse` calls, maintained by `__enter__`/`__exit__`.
        depth = 0

        def __enter__(self): self.depth += 1
        def __exit__(self, *e): self.depth -= 1

        def parse(self, text: str, default_rules=None) -> typing.List[dict]:
            """Tokenize `text`; only the outermost call compacts the result.

            A call made while another `parse` is still running (`depth` > 0)
            neither resets `self.tokens` nor runs `normalize`.
            """
            outermost = not self.depth
            if outermost:
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            if outermost:
                tokens = self.normalize(text, tokens)
            return tokens

        def parse_doctest(self, m):
            """Record a doctest match as a `paragraph` token (i.e. non-code)."""
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_fences(self, m):
            """Treat a fence whose group 2 is set as a paragraph; defer otherwise.

            NOTE(review): group 2 is presumably the fence's info/language string
            in the parent grammar -- confirm against the lexer library.
            """
            if m.group(2):
                self.tokens.append({'type': 'paragraph', 'text': m.group(0)})
            else:
                super().parse_fences(m)

        def parse_hrule(self, m):
            """Record a horizontal rule together with its matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def normalize(self, text, tokens):
            """Combine non-code tokens into contiguous blocks.

            Each token's lines are located in the original `text` (the lexer
            saw a whitespace-stripped copy), so the returned token texts are
            slices of the original input.  `tokens` is consumed in the process.

            Returns a list of `code`/`paragraph` dicts; when the first block
            starts with a ``---`` delimited header it is split into a
            `front_matter` token followed by a `paragraph` token.  Empty input
            yields an empty list.
            """
            compacted = []
            while tokens:
                token = tokens.pop(0)
                if 'text' not in token or not token['text'].strip():
                    continue
                body = ""
                for line in token['text'].splitlines():
                    if line:
                        # Consume the original text up to and including this line.
                        before, line, text = text.partition(line)
                        body += before + line
                if token['type'] == 'code':
                    compacted.append({'type': 'code', 'lang': None, 'text': body})
                elif compacted and compacted[-1]['type'] == 'paragraph':
                    compacted[-1]['text'] += body
                else:
                    compacted.append({'type': 'paragraph', 'text': body})
            # Whatever is left of the original text belongs to trailing prose.
            if compacted and compacted[-1]['type'] == 'paragraph':
                compacted[-1]['text'] += text
            elif text.strip():
                compacted.append({'type': 'paragraph', 'text': text})
            # Deal with front matter.  The `compacted and` guard prevents an
            # IndexError when the input produced no tokens at all.
            if (compacted and compacted[0]['text'].startswith('---\n')
                    and '\n---' in compacted[0]['text'][4:]):
                token = compacted.pop(0)
                front_matter, sep, paragraph = token['text'][4:].partition('---')
                compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                            {'type': 'paragraph', 'text': paragraph}] + compacted
            return compacted

        def untokenize(self, tokens: τ.List[dict], source: str = """""", last: int =0) -> str:
            """Concatenate `tokens` into a single source string.

            Parameters:
                tokens: token dicts as produced by `normalize`.
                source: text the output is appended to.
                last: unused; kept for interface compatibility.

            Code tokens are emitted as they are (fenced code has its fence
            markers removed and is indented).  A `front_matter` token becomes a
            statement loading it with `yaml.safe_load`.  Any other non-empty
            token is indented to line up with the surrounding code and passed
            through `quote`, unless the source so far already ends with an
            opening quote.  If the last non-blank token is not code, the result
            is right-stripped and terminated with `SEMI`.
            """
            INDENT = indent = base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                body, kind = token['text'], token['type']
                if kind == 'code':
                    if body.lstrip().startswith(FENCE):
                        # Remove the first and the last fence marker.
                        body = ''.join(''.join(body.partition(FENCE)[::2]).rpartition(FENCE)[::2])
                        indent = INDENT + num_first_indent(body)
                        body = textwrap.indent(body, INDENT*SPACE)
                    if not body.lstrip().startswith(MAGIC):
                        # Following prose aligns with the last line of this code.
                        indent = num_last_indent(body)
                elif kind == 'front_matter':
                    body = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({quote(body)}))\n", indent*SPACE)
                elif body:
                    body = textwrap.indent(body, indent*SPACE)
                    # Indent of the next code token, or the current indent if none.
                    upcoming = next(
                        (num_first_indent(later['text']) for later in tokens[i+1:]
                         if later['type'] == 'code'),
                        indent)
                    delta = max(upcoming - indent, 0)

                    # Presumably pushes the string into the block opened by a
                    # trailing colon when the next code does not indent further.
                    if not delta and source.rstrip().rstrip(CONTINUATION).endswith(COLON):
                        delta += 4

                    spaces = num_whitespace(body)
                    # TODO: what if the leading whitespace is already long enough?
                    body = body[:spaces] + delta*SPACE + body[spaces:]
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES):
                        body = quote(body)
                source += body

            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code':
                        source = source.rstrip() + SEMI
                    break

            return source
            
    # Give `Tokenizer` its own mutable copy of each rule list, put the
    # `doctest` rule right before `block_code`, and drop `block_html` if present.
    for x in "default_rules footnote_rules list_rules".split():
        rules = list(getattr(Tokenizer, x))
        setattr(Tokenizer, x, rules)
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')
    del rules

    # Module-level transformer instance.
    pidgy = pidgyTransformer()

    class Tokenizer(markdown.BlockLexer):
        """Block lexer that splits a document into code and non-code tokens.

        `parse` returns a compacted token list made of `code` and `paragraph`
        tokens, optionally preceded by a `front_matter` token.  `untokenize`
        concatenates such a list back into one source string, re-indenting the
        non-code text and (normally) wrapping it in quotes.
        """

        class grammar_class(markdown.BlockGrammar):
            # Reuse the standard library's doctest example pattern as a block rule.
            doctest = doctest.DocTestParser._EXAMPLE_RE
            default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

        # Nesting level of `parse` calls, maintained by `__enter__`/`__exit__`.
        depth = 0

        def __enter__(self): self.depth += 1
        def __exit__(self, *e): self.depth -= 1

        def parse(self, text: str, default_rules=None) -> typing.List[dict]:
            """Tokenize `text`; only the outermost call compacts the result.

            A call made while another `parse` is still running (`depth` > 0)
            neither resets `self.tokens` nor runs `normalize`.
            """
            outermost = not self.depth
            if outermost:
                self.tokens = []
            with self:
                tokens = super().parse(whiten(text), default_rules)
            if outermost:
                tokens = self.normalize(text, tokens)
            return tokens

        def parse_doctest(self, m):
            """Record a doctest match as a `paragraph` token (i.e. non-code)."""
            self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

        def parse_fences(self, m):
            """Treat a fence whose group 2 is set as a paragraph; defer otherwise.

            NOTE(review): group 2 is presumably the fence's info/language string
            in the parent grammar -- confirm against the lexer library.
            """
            if m.group(2):
                self.tokens.append({'type': 'paragraph', 'text': m.group(0)})
            else:
                super().parse_fences(m)

        def parse_hrule(self, m):
            """Record a horizontal rule together with its matched text."""
            self.tokens.append({'type': 'hrule', 'text': m.group(0)})

        def normalize(self, text, tokens):
            """Combine non-code tokens into contiguous blocks.

            Each token's lines are located in the original `text` (the lexer
            saw a whitespace-stripped copy), so the returned token texts are
            slices of the original input.  `tokens` is consumed in the process.

            Returns a list of `code`/`paragraph` dicts; when the first block
            starts with a ``---`` delimited header it is split into a
            `front_matter` token followed by a `paragraph` token.  Empty input
            yields an empty list.
            """
            compacted = []
            while tokens:
                token = tokens.pop(0)
                if 'text' not in token or not token['text'].strip():
                    continue
                body = ""
                for line in token['text'].splitlines():
                    if line:
                        # Consume the original text up to and including this line.
                        before, line, text = text.partition(line)
                        body += before + line
                if token['type'] == 'code':
                    compacted.append({'type': 'code', 'lang': None, 'text': body})
                elif compacted and compacted[-1]['type'] == 'paragraph':
                    compacted[-1]['text'] += body
                else:
                    compacted.append({'type': 'paragraph', 'text': body})
            # Whatever is left of the original text belongs to trailing prose.
            if compacted and compacted[-1]['type'] == 'paragraph':
                compacted[-1]['text'] += text
            elif text.strip():
                compacted.append({'type': 'paragraph', 'text': text})
            # Deal with front matter.  The `compacted and` guard prevents an
            # IndexError when the input produced no tokens at all.
            if (compacted and compacted[0]['text'].startswith('---\n')
                    and '\n---' in compacted[0]['text'][4:]):
                token = compacted.pop(0)
                front_matter, sep, paragraph = token['text'][4:].partition('---')
                compacted = [{'type': 'front_matter', 'text': F"\n{front_matter}"},
                            {'type': 'paragraph', 'text': paragraph}] + compacted
            return compacted

        def untokenize(self, tokens: τ.List[dict], source: str = """""", last: int =0) -> str:
            """Concatenate `tokens` into a single source string.

            Parameters:
                tokens: token dicts as produced by `normalize`.
                source: text the output is appended to.
                last: unused; kept for interface compatibility.

            Code tokens are emitted as they are (fenced code has its fence
            markers removed and is indented).  A `front_matter` token becomes a
            statement loading it with `yaml.safe_load`.  Any other non-empty
            token is indented to line up with the surrounding code and passed
            through `quote`, unless the source so far already ends with an
            opening quote.  If the last non-blank token is not code, the result
            is right-stripped and terminated with `SEMI`.
            """
            INDENT = indent = base_indent(tokens) or 4
            for i, token in enumerate(tokens):
                body, kind = token['text'], token['type']
                if kind == 'code':
                    if body.lstrip().startswith(FENCE):
                        # Remove the first and the last fence marker.
                        body = ''.join(''.join(body.partition(FENCE)[::2]).rpartition(FENCE)[::2])
                        indent = INDENT + num_first_indent(body)
                        body = textwrap.indent(body, INDENT*SPACE)
                    if not body.lstrip().startswith(MAGIC):
                        # Following prose aligns with the last line of this code.
                        indent = num_last_indent(body)
                elif kind == 'front_matter':
                    body = textwrap.indent(
                        F"locals().update(__import__('yaml').safe_load({quote(body)}))\n", indent*SPACE)
                elif body:
                    body = textwrap.indent(body, indent*SPACE)
                    # Indent of the next code token, or the current indent if none.
                    upcoming = next(
                        (num_first_indent(later['text']) for later in tokens[i+1:]
                         if later['type'] == 'code'),
                        indent)
                    delta = max(upcoming - indent, 0)

                    # Presumably pushes the string into the block opened by a
                    # trailing colon when the next code does not indent further.
                    if not delta and source.rstrip().rstrip(CONTINUATION).endswith(COLON):
                        delta += 4

                    spaces = num_whitespace(body)
                    # TODO: what if the leading whitespace is already long enough?
                    body = body[:spaces] + delta*SPACE + body[spaces:]
                    if not source.rstrip().rstrip(CONTINUATION).endswith(QUOTES):
                        body = quote(body)
                source += body

            for token in reversed(tokens):
                if token['text'].strip():
                    if token['type'] != 'code':
                        source = source.rstrip() + SEMI
                    break

            return source
            
    # Give `Tokenizer` its own mutable copy of each rule list, put the
    # `doctest` rule right before `block_code`, and drop `block_html` if present.
    for x in "default_rules footnote_rules list_rules".split():
        rules = list(getattr(Tokenizer, x))
        setattr(Tokenizer, x, rules)
        rules.insert(rules.index('block_code'), 'doctest')
        if 'block_html' in rules:
            rules.remove('block_html')
    del rules

    # Module-level transformer instance.
    pidgy = pidgyTransformer()

In [11]:
    
    # This has to be in a separate cell because the tests go crazy.
    
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    QUOTES = ('"""', "'''")
    SPACE = ' '
    

<!--
    
    # This has to be in a separate cell because the tests go crazy.
    
    FENCE, CONTINUATION, SEMI, COLON, MAGIC, DOCTEST = '``` \\ ; : %% >>>'.split()
    QUOTES = ('"""', "'''")
    SPACE = ' '
    

-->

In [12]:
    
    # Matches the leading whitespace of every line.  Raw string: `\s` in a
    # plain literal is an invalid escape sequence and triggers a warning.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def num_first_indent(text):
        """Return the leading-whitespace width of the first non-blank line of
        `text`, or 0 when every line is blank."""
        for line in text.splitlines():
            stripped = line.lstrip()
            if stripped:
                return len(line) - len(stripped)
        return 0
    
    def num_last_indent(text):
        """Return the leading-whitespace width of the last non-blank line of
        `text`, or 0 when every line is blank."""
        for line in reversed(text.splitlines()):
            stripped = line.lstrip()
            if stripped:
                return len(line) - len(stripped)
        return 0

    def base_indent(tokens):
        """Return the first-line indent of the first code token that does not
        start with `FENCE`; return 4 when there is no such token."""
        for token in tokens:
            if token['type'] != 'code':
                continue
            code = token['text']
            if code.lstrip().startswith(FENCE):
                # Fenced code does not determine the base indent.
                continue
            return num_first_indent(code)
        return 4

    def quote(text):
        """Wrap the non-whitespace core of `text` in one of `QUOTES`.

        Leading and trailing whitespace stay outside the quotes.  The second
        entry of `QUOTES` is used when the core ends with a character found in
        the first entry, or when `text` contains the first entry; otherwise the
        first entry is used.  Whitespace-only text is returned unchanged.
        """
        if not text.strip():
            return text
        left = len(text) - len(text.lstrip())
        right = len(text.rstrip())
        # A bool indexes the tuple: False -> QUOTES[0], True -> QUOTES[1].
        use_alternate = (text[right-1] in QUOTES[0]) or (QUOTES[0] in text)
        delimiter = QUOTES[use_alternate]
        return text[:left] + delimiter + text[left:right] + delimiter + text[right:]

    def num_whitespace(text):
        """Return how many whitespace characters `text` starts with."""
        stripped = text.lstrip()
        return len(text) - len(stripped)
    
    def whiten(text: str) -> str:
        """Strip trailing whitespace from every line of `text`.

        Whitespace-only lines become empty lines, which is the form the
        `markdown.BlockLexer` expects.  Lines are re-joined with ``\\n``; a
        trailing newline is not kept.
        """
        lines = [line.rstrip() for line in text.splitlines()]
        return '\n'.join(lines)

<!--
    
    # Matches the leading whitespace of every line.  Raw string: `\s` in a
    # plain literal is an invalid escape sequence and triggers a warning.
    WHITESPACE = re.compile(r'^\s*', re.MULTILINE)

    def num_first_indent(text):
        """Return the leading-whitespace width of the first non-blank line of
        `text`, or 0 when every line is blank."""
        for line in text.splitlines():
            stripped = line.lstrip()
            if stripped:
                return len(line) - len(stripped)
        return 0
    
    def num_last_indent(text):
        """Return the leading-whitespace width of the last non-blank line of
        `text`, or 0 when every line is blank."""
        for line in reversed(text.splitlines()):
            stripped = line.lstrip()
            if stripped:
                return len(line) - len(stripped)
        return 0

    def base_indent(tokens):
        """Return the first-line indent of the first code token that does not
        start with `FENCE`; return 4 when there is no such token."""
        for token in tokens:
            if token['type'] != 'code':
                continue
            code = token['text']
            if code.lstrip().startswith(FENCE):
                # Fenced code does not determine the base indent.
                continue
            return num_first_indent(code)
        return 4

    def quote(text):
        """Wrap the non-whitespace core of `text` in one of `QUOTES`.

        Leading and trailing whitespace stay outside the quotes.  The second
        entry of `QUOTES` is used when the core ends with a character found in
        the first entry, or when `text` contains the first entry; otherwise the
        first entry is used.  Whitespace-only text is returned unchanged.
        """
        if not text.strip():
            return text
        left = len(text) - len(text.lstrip())
        right = len(text.rstrip())
        # A bool indexes the tuple: False -> QUOTES[0], True -> QUOTES[1].
        use_alternate = (text[right-1] in QUOTES[0]) or (QUOTES[0] in text)
        delimiter = QUOTES[use_alternate]
        return text[:left] + delimiter + text[left:right] + delimiter + text[right:]

    def num_whitespace(text):
        """Return how many whitespace characters `text` starts with."""
        stripped = text.lstrip()
        return len(text) - len(stripped)
    
    def whiten(text: str) -> str:
        """Strip trailing whitespace from every line of `text`.

        Whitespace-only lines become empty lines, which is the form the
        `markdown.BlockLexer` expects.  Lines are re-joined with ``\\n``; a
        trailing newline is not kept.
        """
        lines = [line.rstrip() for line in text.splitlines()]
        return '\n'.join(lines)

-->

In [13]:
    
    def load_ipython_extension(shell):
        """Install a fresh `pidgyTransformer` on `shell`, both as its input
        transformer manager and as `shell.tangle`."""
        transformer = pidgyTransformer()
        shell.input_transformer_manager = transformer
        shell.tangle = transformer
    
    def unload_ipython_extension(shell):
        """Replace the input transformer manager of `shell` with a stock
        IPython `TransformerManager`."""
        inputtransformer2 = __import__('IPython').core.inputtransformer2
        shell.input_transformer_manager = inputtransformer2.TransformerManager()

<!--
    
    def load_ipython_extension(shell):
        """Install a fresh `pidgyTransformer` on `shell` and register a
        `ReturnYield` AST transformer unless one is already registered.

        NOTE(review): `ReturnYield` is not defined in this part of the
        document -- confirm it is in scope where this runs.
        """
        shell.tangle = pidgy_transformer = pidgyTransformer()
        shell.input_transformer_manager = pidgy_transformer
        # Test the isinstance result itself, not the truthiness of the
        # matching transformer objects.
        if not any(isinstance(x, ReturnYield) for x in shell.ast_transformers):
            shell.ast_transformers.append(ReturnYield())
    
    def unload_ipython_extension(shell):
        """Restore a stock IPython `TransformerManager` on `shell` and remove
        every `ReturnYield` instance from its AST transformers."""
        inputtransformer2 = __import__('IPython').core.inputtransformer2
        shell.input_transformer_manager = inputtransformer2.TransformerManager()
        kept = [
            transformer for transformer in shell.ast_transformers
            if not isinstance(transformer, ReturnYield)
        ]
        shell.ast_transformers = kept

-->