In [None]:
#default_exp mdx

# Preprocessors For MDX

> Custom preprocessors that help convert notebook content into MDX

In [None]:
# export
import re, uuid

from nbprocess.read import get_config
from nbprocess.processor import *
from nbconvert.preprocessors import ExtractOutputPreprocessor

from fastcore.basics import *
from fastcore.foundation import *
from traitlets.config import Config
from pathlib import Path
from html.parser import HTMLParser

In [None]:
#hide
from nbprocess.export import read_nb
from nbconvert import NotebookExporter
import json

# `_mdx_exporter` looks up its templates relative to `Path(__file__).parent`, so point `__file__` at the package module
__file__ = '../nbprocess/export.py'
# notebook used as the fixture by the examples and tests below
_test_file = '../tests/docs_test.ipynb'

This module defines [custom nbconvert preprocessors](https://nbconvert.readthedocs.io/en/latest/nbconvert_library.html#Custom-Preprocessors) that facilitate transforming notebook content into MDX, which is a variation of markdown.

In [None]:
#export
def default_pp_cfg():
    "Build the default `Config` for MDX export: the cell tags that hide whole cells, outputs or inputs."
    cfg = Config()
    tag_rm = cfg.TagRemovePreprocessor
    # tags that remove the entire cell
    tag_rm.remove_cell_tags = ("remove_cell", "hide")
    # tags that remove only the outputs of a cell
    tag_rm.remove_all_outputs_tags = ("remove_output", "remove_outputs", "hide_output", "hide_outputs")
    # tags that remove only the input (source) of a cell
    tag_rm.remove_input_tags = ('remove_input', 'remove_inputs', "hide_input", "hide_inputs")
    return cfg

In [None]:
#export
def _mdx_exporter(pps, cfg=None, tpl_file='ob.tpl'):
    "Create a `MarkdownExporter` that runs the preprocessors `pps` and renders with the template `tpl_file`."
    # note: a `cfg` supplied by the caller is modified in place below
    if not cfg: cfg = default_pp_cfg()
    cfg.MarkdownExporter.preprocessors = pps if pps else []
    # templates live in the `templates` folder next to this module
    tmp_dir = Path(__file__).parent/'templates/'
    tpl_file = tmp_dir/str(tpl_file)
    if not tpl_file.exists():
        raise ValueError(f"{tpl_file} does not exist in {tmp_dir}")
    cfg.MarkdownExporter.template_file = str(tpl_file)
    return MarkdownExporter(config=cfg)

In [None]:
#export
def _run_preprocessor(pps, fname, display=False):
    "Export the notebook `fname` through the preprocessors `pps`; return the exporter's result and optionally print its first element."
    result = _mdx_exporter(pps).from_filename(fname)
    if display:
        print(result[0])
    return result

## Cell Tag Cheatsheet

These preprocessors allow you to make special comments to enable/disable them.  Here is a list of all special comments:

All comments start with `#meta` or `#cell_meta`, which are both aliases for the same thing.  For brevity, we will use `#meta` in this cheatsheet.

### Show/Hide Cells

1. Remove entire cells:  `#meta:tag=remove_cell` or `#meta:tag=hide`
2. Remove output: `#meta:tag=remove_output` or `#meta:tag=remove_outputs` or `#meta:tag=hide_output` or `#meta:tag=hide_outputs`
3. Remove input: same as above, except `input` instead of `output`.

## Injecting Metadata Into Cells -

In [None]:
#export
# NOTE: the pattern requires a line terminator after the comment, so a meta comment on the
# very last line of a cell (no trailing newline) is not matched.
_re_meta= r'^\s*#(?:cell_meta|meta):\S+\s*[\n\r]'

@preprocess_cell
def InjectMeta(cell):
    """Inject metadata into a cell for further preprocessing with a comment.

    Every `#meta:key=value` / `#cell_meta:key=value` comment line found in a code cell is
    stored as `cell.metadata['nbprocess'][key] = value`. Entries without an `=` are
    reported with a printed warning and skipped. Cells without such comments are untouched.
    """
    if cell.cell_type != 'code': return
    # same pattern as `_re_meta`, with a capture group around the `key=value` payload
    _pattern = r'^\s*#(?:cell_meta|meta):(\S+)\s*[\n\r]'
    payloads = re.findall(_pattern, cell.source, flags=re.MULTILINE)
    if not payloads: return
    d = cell.metadata.get('nbprocess', {})
    for m in payloads:
        # split on the first '=' only, so a value may itself contain '='
        k,sep,v = m.partition('=')
        if sep: d[k] = v
        else: print(f"Warning cell_meta:{m} does not have '=' will be ignored.")
    cell.metadata['nbprocess'] = d

To inject metadata, add a comment to a cell with the following pattern: `#cell_meta:key=value` (no spaces inside `key=value`). Note that `#meta` is an alias for `#cell_meta`.

For example, consider the following code:

In [None]:
# Read the fixture notebook and print the source of the cell at index 2
_test_file = '../tests/docs_test.ipynb'
first_cell = read_nb(_test_file)['cells'][2]
print(first_cell['source'])

At the moment, this cell has no metadata:

In [None]:
# metadata of the cell as read from disk, before any preprocessing
first_cell.metadata

However, after we process this notebook with `InjectMeta`, the appropriate metadata will be injected:

In [None]:
# Run only `InjectMeta` through a plain `NotebookExporter`, then look at the same cell (index 2) in the JSON it produces
c = Config()
c.NotebookExporter.preprocessors = [InjectMeta]
exp = NotebookExporter(config=c)
cells, _ = exp.from_filename(_test_file)
first_cell = json.loads(cells)['cells'][2]

# the meta comment in the cell source has become an entry under the `nbprocess` metadata key
assert first_cell['metadata'] == {'nbprocess': {'show_steps': 'start,train'}}

## Strip Ansi Characters From Output -

In [None]:
#export
_re_ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

@preprocess_cell
def StripAnsi(cell):
    "Remove ANSI escape sequences from the text of every `stdout` output of `cell`."
    stdout_outs = [o for o in cell.get('outputs', []) if o.get('name') == 'stdout']
    for out in stdout_outs:
        out['text'] = _re_ansi_escape.sub('', out.text)

Gets rid of colors that are streamed from standard out, which can interfere with static site generators:

In [None]:
# no ANSI escape sequence may be left anywhere in the exported markdown
c, _ = _run_preprocessor([StripAnsi], _test_file)
assert not _re_ansi_escape.findall(c)

## Insert Warning Into Markdown -

In [None]:
# export
@preprocess
def InsertWarning(nb):
    "Add a markdown cell holding an 'autogenerated' HTML comment at index 1 of `nb.cells`, i.e. right after the first cell."
    warning_cell = AttrDict(
        cell_type='markdown',
        id=uuid.uuid4().hex[:36],
        metadata={},
        source="<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->")
    nb.cells.insert(1, warning_cell)

This preprocessor inserts a warning in the markdown destination that the file is autogenerated.  This warning is inserted in the second cell so we do not interfere with front matter.

In [None]:
# the warning comment must show up in the exported markdown
c, _ = _run_preprocessor([InsertWarning], _test_file)
assert "<!-- WARNING: THIS FILE WAS AUTOGENERATED!" in c

## Remove Empty Code Cells -

In [None]:
# export
def _keepCell(cell): return cell['cell_type'] != 'code' or cell.source.strip()

@preprocess
def RmEmptyCode(nb):
    "Remove empty code cells."
    # Build a real list: a lazy `filter` object can be iterated only once and supports
    # neither indexing nor `insert`, which later processing of `nb.cells` may rely on.
    nb.cells = [c for c in nb.cells if _keepCell(c)]

In [None]:
# an empty code cell would be rendered as an empty python code fence
c, _ = _run_preprocessor([RmEmptyCode], _test_file)
assert '```python\n\n```' not in c

## Turn Metadata into Cell Tags -

In [None]:
#export
@preprocess_cell
def UpdateTags(cell):
    "Append the comma-separated `tags` (or `tag`) entry of `cell.metadata['nbprocess']` to `cell.metadata['tags']`."
    nbp_meta = cell.metadata.get('nbprocess', {})
    tags = nbp_meta.get('tags', nbp_meta.get('tag')) # allow the singular also
    if not tags: return
    existing = cell.metadata.get('tags', [])
    cell.metadata['tags'] = existing + tags.split(',')

`UpdateTags` is meant to be used with `InjectMeta` and `TagRemovePreprocessor` to configure the visibility of cells in rendered docs.

In [None]:
# Configure an exporter from scratch
c = Config()
c.TagRemovePreprocessor.remove_cell_tags = ("remove_cell",)
c.TagRemovePreprocessor.remove_all_outputs_tags = ('remove_output',)
c.TagRemovePreprocessor.remove_input_tags = ('remove_input',)
# order matters: meta comments -> nbprocess metadata -> cell tags -> tag-based removal
c.MarkdownExporter.preprocessors = [InjectMeta, UpdateTags, TagRemovePreprocessor]
exp = MarkdownExporter(config=c)
result = exp.from_filename(_test_file)[0]

# the text of the cell that should have been removed must be absent from the rendered markdown
assert 'you will not be able to see this cell at all either' not in result

## Hide Specific Lines of Code -

In [None]:
#export
@preprocess_cell
def HideInputLines(cell):
    "Drop every line of a code cell whose (stripped) text ends with the comment `#meta_hide_line`."
    tok = '#meta_hide_line'
    if cell.cell_type != 'code' or tok not in cell.source: return
    kept = []
    for line in cell.source.splitlines():
        if not line.strip().endswith(tok): kept.append(line)
    cell.source = '\n'.join(kept)

In [None]:
c, _ = _run_preprocessor([InjectMeta, HideInputLines], _test_file)

# expected rendering of the fixture cell once its hidden lines have been dropped
_res = """```python
def show():
    a = 2
```"""
assert _res in c

## Hide Specific Lines of Output With Keywords -

In [None]:
#export
@preprocess_cell
def FilterOutput(cell):
    """Drop the lines of `stdout` outputs that contain any of the cell's filter words.

    The words are read from the comma-separated `filter_words` (or singular `filter_word`)
    entry of `cell.metadata['nbprocess']`. They are joined into a regular expression as
    alternatives, so regex metacharacters inside a word keep their special meaning.
    Cells without outputs or without filter words are left untouched.
    """
    root = cell.metadata.get('nbprocess', {})
    words = root.get('filter_words', root.get('filter_word'))
    if 'outputs' not in cell or not words: return
    # negative lookahead: the pattern matches a line only if none of the words occurs in it
    keep = re.compile(f"^(?!.*({'|'.join(words.split(','))}))")
    for o in cell.outputs:
        # use `.get`: in the nbformat schema only stream outputs have a `name` field,
        # so attribute access would raise on e.g. display or execute-result outputs
        if o.get('name') != 'stdout': continue
        o['text'] = '\n'.join(l for l in o['text'].splitlines() if keep.search(l))

If we want to exclude output with certain keywords, we can use the `#meta:filter_words` comment.  For example, if we wanted to ignore all output that contains the text `FutureWarning` or `MultiIndex` we can use the comment:

`#meta:filter_words=FutureWarning,MultiIndex`

In [None]:
c, _ = _run_preprocessor([InjectMeta, FilterOutput], _test_file)
# expected output block of the fixture cell: only the lines without a filter word remain
_res="""<CodeOutputBlock lang="python">

    A line
    Another line.

</CodeOutputBlock>"""
assert _res in c

## Clean Flags and Magics -

In [None]:
#export
_tst_flags = get_config()['tst_flags'].split('|')

@preprocess_cell
def CleanFlags(cell):
    "Strip `#<flag>` comments, for each `|`-separated flag in the `tst_flags` config entry, from the start of lines in code cells."
    if cell.cell_type != 'code': return
    flag_res = []
    for flag in _tst_flags:
        flag_res.append(re.compile(r'^#\s*{0}\s*'.format(flag), re.MULTILINE))
    # the source is re-stripped after each flag, exactly one substitution pass per flag
    for flag_re in flag_res:
        cell.source = flag_re.sub('', cell.source).strip()

In [None]:
# the `#notest` comment of the fixture notebook must not survive the export
c, _ = _run_preprocessor([CleanFlags], _test_file)
assert '#notest' not in c

In [None]:
#export
@preprocess_cell
def CleanMagics(cell):
    "Strip magic lines (`%` / `%%`) and `#meta:` / `#cell_meta:` comment lines from the source of code cells."
    if cell.cell_type != 'code': return
    # first alternative: a line starting with % or %%; second alternative: a meta comment line
    magic_or_meta = r'(^\s*(%%|%).+?[\n\r])|({0})'.format(_re_meta)
    cleaned = re.sub(magic_or_meta, '', cell.source, flags=re.MULTILINE)
    cell.source = cleaned.strip()

`CleanMagics` strips magic commands (both line magics `%` and cell magics `%%`) as well as `#meta:`/`#cell_meta:` comments, so they do not appear in rendered markdown files:

In [None]:
# no cell magic marker may be left in the exported markdown
c, _ = _run_preprocessor([CleanMagics], _test_file)
assert '%%' not in c

## Format Shell Commands -

In [None]:
#export
@preprocess_cell
def BashIdentify(cell):
    """A preprocessor to identify bash commands and mark them appropriately.

    If any line of a code cell starts (after optional whitespace) with `!`, the cell gets
    `metadata.magics_language = 'bash'` and every such leading `!` (with the whitespace
    before it) is removed from the source.
    """
    # raw string: `\s` inside a plain string literal is an invalid escape sequence
    pattern = re.compile(r'^\s*!', flags=re.MULTILINE)
    if cell.cell_type == 'code' and pattern.search(cell.source):
        cell.metadata.magics_language = 'bash'
        cell.source = pattern.sub('', cell.source).strip()

When we issue a shell command in a notebook with `!`, we need to change the code-fence from `python` to `bash` and remove the `!`:

In [None]:
# the shell-command cell of the fixture must be fenced as bash
c, _ = _run_preprocessor([BashIdentify], _test_file)
assert "```bash" in c

## Remove `ShowDoc` Input Cells -

In [None]:
#export
_re_showdoc = re.compile(r'^ShowDoc', re.MULTILINE)

def _isShowDoc(cell):
    "Return True if cell contains ShowDoc."
    return cell['cell_type'] == 'code' and _re_showdoc.search(cell.source)

@preprocess_cell
def CleanShowDoc(cell):
    """Ensure that ShowDoc output gets cleaned in the associated notebook.

    For a ShowDoc code cell with exactly one `text/html` output, returns a new raw cell
    (same id and metadata) whose source is that HTML with everything between
    `<HTMLRemove>` and `</HTMLRemove>` removed. In every other case returns None.
    """
    if not _isShowDoc(cell): return
    # DOTALL + greedy `.*`: removes from the first opening tag through the last closing tag
    _re_html = re.compile(r'<HTMLRemove>.*</HTMLRemove>', re.DOTALL)
    all_outs = [o['data'] for o in cell.outputs if 'data' in o]
    html_outs = [o['text/html'] for o in all_outs if 'text/html' in o]
    if len(html_outs) != 1: return
    # `_re_html` is a local; there is no `self` inside this plain function
    cleaned_html = _re_html.sub('', html_outs[0])
    return AttrDict({'cell_type':'raw', 'id':cell.id, 'metadata':cell.metadata, 'source':cleaned_html})

In [None]:
# the removable section of the ShowDoc HTML must be gone from the export
_result, _ = _run_preprocessor([CleanShowDoc], _test_file)
assert '<HTMLRemove>' not in _result

## Escaping HTML

In [None]:
#export
class _HTMLdf(HTMLParser):
    "HTML Parser that finds a dataframe."
    df,scoped = False,False
    def handle_starttag(self, tag, attrs):
        if tag == 'style' and 'scoped' in dict(attrs): self.scoped=True
    def handle_data(self, data):
        if '.dataframe' in data and self.scoped: self.df=True
    def handle_endtag(self, tag):
        if tag == 'style': self.scoped=False
                
    @classmethod
    def search(cls, x):
        parser = cls()
        parser.feed(x)
        return parser.df

In [None]:
#export
@preprocess_cell
def HTMLEscape(cell):
    "Wrap each `text/html` output of a code cell in an html code fence and flag the cell via `html_output` / `html_center` metadata."
    if cell.cell_type !='code': return
    for out in cell.outputs:
        if not nested_idx(out, 'data', 'text/html'): continue
        html = out['data']['text/html']
        cell.metadata.html_output = True
        # centre everything except what `_HTMLdf` recognises as a dataframe
        cell.metadata.html_center = not _HTMLdf.search(html)
        out['data']['text/html'] = '```html\n'+html.strip()+'\n```'

By default, HTML is incompatible with MDX.  We place HTML in a code block and wrap it with a custom component so that the static site generator can render it.

In [None]:
c, _ = _run_preprocessor([HTMLEscape], '../tests/docs_test.ipynb')
# the HTML output is wrapped in the custom component, and nothing is marked as centred
assert '<HTMLOutputBlock' in c and '</HTMLOutputBlock>' in c and 'center' not in c
# inside the component the raw HTML sits in an html code fence
assert '```html\n<div>' in c and '</div>\n```' in c

## Removing Headers Ending In A Dash `-`

In [None]:
#export
_re_hdr_dash = re.compile(r'^#+\s+.*\s+-\s*$', re.MULTILINE)

@preprocess_cell
def RmHeaderDash(cell):
    """Remove headings that end with a dash -

    Only markdown cells are touched; every line matching `_re_hdr_dash` is dropped
    from the cell source and the remaining lines are re-joined with newlines.
    """
    if cell.cell_type != 'markdown': return
    exclude = {l.strip() for l in _re_hdr_dash.findall(cell.source)}
    if not exclude: return
    # Compare the right-stripped line: entries in `exclude` are stored stripped, so a heading
    # with trailing whitespace after the dash would otherwise never be equal to its own match.
    cell.source = '\n'.join(l for l in cell.source.splitlines() if l.rstrip() not in exclude)

In [None]:
c, _ = _run_preprocessor([RmHeaderDash], '../tests/docs_test.ipynb')
# ordinary text is kept ...
assert 'some words' in c
# ... while the headings of the fixture that end in a dash are removed
assert 'A heading to Hide' not in c and 'Another Heading' not in c and 'Yet another heading to hide' not in c

## Composing Preprocessors Into A Pipeline

Let's see how you can compose all of these preprocessors together to process notebooks appropriately:

In [None]:
#export
def default_pps(c):
    "Return the ordered list of preprocessors applied by default for MDX export (`c` is accepted but not used)."
    pps = [InjectMeta, CleanMagics, BashIdentify, UpdateTags, InsertWarning, TagRemovePreprocessor]
    pps += [CleanFlags, CleanShowDoc, RmEmptyCode, StripAnsi, HideInputLines, RmHeaderDash]
    pps += [ExtractAttachmentsPreprocessor, ExtractOutputPreprocessor, HTMLEscape]
    return pps

In [None]:
#export
def mdx_exporter(tpl_file, cfg=None, pps=None):
    "Build the MDX exporter for the template `tpl_file`, falling back to `default_pps` when no preprocessors are given."
    if not pps: pps = default_pps(cfg)
    return _mdx_exporter(pps, cfg, tpl_file=tpl_file)

In [None]:
#export
def nb2md(fname, exp=None, dest=None, cfg=None, pps=None, tpl_file='ob.tpl'):
    """Convert notebook to markdown and export attached/output files.

    `fname` must be an existing `.ipynb` file (checked with assertions). `exp` is the
    exporter to use; when omitted one is built with `mdx_exporter` from `cfg`, `pps` and
    `tpl_file`. `dest` is the build directory handed to the `FilesWriter`; when omitted the
    writer's own default is used. Returns whatever `FilesWriter.write` returns. Conversion
    is best-effort: an exception raised while exporting or writing is printed and None is
    returned.
    """
    # keep the whole path: `dest.name` would drop every parent directory of `dest`
    if isinstance(dest,Path): dest = str(dest)
    file = Path(fname)
    assert file.name.endswith('.ipynb'), f'{fname} is not a notebook.'
    assert file.is_file(), f'file {fname} not found.'
    print(f"converting: {file}")
    # honour an exporter supplied by the caller instead of always building a new one
    if exp is None: exp = mdx_exporter(cfg=cfg, pps=pps, tpl_file=tpl_file)
    # https://gitlab.kwant-project.org/solidstate/lectures/-/blob/master/execute.py
    fw = FilesWriter()

    try:
        # extracted outputs are keyed by, and written to a folder named after, the notebook stem
        md = exp.from_filename(fname, resources=dict(unique_key=file.stem, output_files_dir=file.stem))
        if dest: fw.build_directory = dest
        return fw.write(*md, notebook_name=file.stem)
    except Exception as e: print(e)

In [None]:
#When No argument for dest is passed, it will build things in the same directory as the notebook
_test_fname = Path('../tests/docs_test.ipynb')

# expected output: a markdown file next to the notebook; delete any leftover from a previous run first
_dest_file = Path(_test_fname).with_suffix('.md')
_dest_file.unlink(missing_ok=True)
nb2md(_test_fname)
assert _dest_file.exists(), f'{_dest_file} does not exist.'

In [None]:
#When an argument for dest is passed, it will build things in the dest directory
_test_dest = Path('build')
_test_outp = _test_dest/'docs_test'
_test_file = _test_dest/'docs_test.md'
# Start from a clean slate. The output folder only exists after a previous run, so guard
# the cleanup: calling `ls()`/`rmdir()` on a missing folder would fail on a fresh checkout.
_test_file.unlink(missing_ok=True)
if _test_outp.exists():
    for p in _test_outp.ls(): p.unlink(missing_ok=True)
    _test_outp.rmdir()

nb2md(_test_fname, dest=_test_dest)
assert _test_file.exists() # make sure the markdown file does exist
assert _test_outp.exists() # make sure the images folder exists
assert len(_test_file.readlines()) > 10
assert _test_outp.ls()

_test_file.unlink(missing_ok=True)

## Export -

In [None]:
#skip
from nbprocess.export import nbs_export
nbs_export()