In [1]:
#default_exp mdx

# Preprocessors For MDX

> Custom preprocessors that help convert notebook content into MDX

This module defines [nbconvert.Custom Preprocessors](https://nbconvert.readthedocs.io/en/latest/nbconvert_library.html#Custom-Preprocessors) that facilitate transforming notebook content into MDX, which is a variation of markdown.

## Cell Tag Cheatsheet

These preprocessors allow you to make special comments to enable/disable them.  Here is a list of all special comments:

All comments start with `#meta` or `#cell_meta`, which are both aliases for the same thing.  For brevity, we will use `#meta` in this cheatsheet.

### Show/Hide Cells

1. Remove entire cells:  `#meta:tag=remove_cell` or `#meta:tag=hide`
2. Remove output: `#meta:tag=remove_output` or `#meta:tag=remove_outputs` or `#meta:tag=hide_outputs` or `#meta:tag=hide_output`
3. Remove input: same as above, except `input` instead of `output`.

In [2]:
# export
from nbconvert.preprocessors import Preprocessor
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import TagRemovePreprocessor

from nbprocess.read import get_config

from fastcore.basics import *
from fastcore.foundation import *
from traitlets.config import Config
from pathlib import Path
import re, uuid
from nbprocess.media import ImagePath, ImageSave, HTMLEscape
from functools import wraps

In [3]:
#hide
from nbprocess.export import read_nb
from nbconvert import NotebookExporter
from nbprocess.test_utils import run_preprocessor, show_plain_md
# from nbdoc.run import _gen_nb
import json

# Set `__file__` by hand so code that builds paths from `Path(__file__)`
# (e.g. `get_mdx_exporter`'s templates directory) can run inside this notebook.
# NOTE(review): presumably needed because the kernel defines no `__file__` - confirm.
__file__ = '../nbprocess/export.py'
# Sample notebook that the example/test cells below feed through the preprocessors.
_test_file = '../tests/docs_test.ipynb'

In [4]:
#export
def preprocess_cell(func):
    """Turn `func(cell)` into an nbconvert `Preprocessor` subclass.

    The generated class calls `func` on each cell. A truthy return value replaces
    the cell; otherwise the (possibly mutated in place) original cell is kept.
    The class takes its name and docstring from `func`.
    """
    class _CellPreprocessor(Preprocessor):
        def preprocess_cell(self, cell, resources, index):
            replacement = func(cell)
            return (replacement or cell), resources
    # `updated=()` because a class `__dict__` cannot be updated the way a function's can.
    return wraps(func, updated=())(_CellPreprocessor)

In [5]:
#export
def preprocess(func):
    """Turn `func(nb)` into an nbconvert `Preprocessor` subclass.

    The generated class calls `func` on the whole notebook. A truthy return value
    replaces the notebook; otherwise the (possibly mutated in place) original is kept.
    `nb.cells` is always converted to a list afterwards, so `func` may assign any iterable to it.
    """
    class _NotebookPreprocessor(Preprocessor):
        def preprocess(self, nb, resources):
            nb = func(nb) or nb
            nb.cells = list(nb.cells)
            return nb, resources
    # `updated=()` because a class `__dict__` cannot be updated the way a function's can.
    return wraps(func, updated=())(_NotebookPreprocessor)

## Injecting Metadata Into Cells -

In [6]:
#export
_re_meta= r'^\s*#(?:cell_meta|meta):\S+\s*[\n\r]'

@preprocess_cell
def InjectMeta(cell):
    """Inject metadata into a cell for further preprocessing with a comment.

    Every `#meta:key=value` (or `#cell_meta:key=value`) comment line in a code cell
    is stored as `cell.metadata['nbdoc'][key] = value`. Directives without an `=`
    are reported with a printed warning and skipped. The cell source is not modified.
    """
    if cell.cell_type != 'code': return
    # Same expression as `_re_meta`, with a capture group around the `key=value` part.
    _pattern = r'^\s*#(?:cell_meta|meta):(\S+)\s*[\n\r]'
    # The pattern requires a line terminator after the comment; append one so a
    # directive on the final line of the cell (which has no trailing newline) is found too.
    cell_meta = re.findall(_pattern, cell.source + '\n', re.MULTILINE)
    if not cell_meta: return
    d = cell.metadata.get('nbdoc', {})
    for m in cell_meta:
        if '=' in m:
            # Split on the first `=` only, so the value itself may contain `=`.
            k,v = m.split('=', 1)
            d[k] = v
        else: print(f"Warning cell_meta:{m} does not have '=' will be ignored.")
    cell.metadata['nbdoc'] = d

To inject metadata, make a comment in a cell with the following pattern: `#cell_meta:{key}={value}`. Note that `#meta` is an alias for `#cell_meta`.

For example, consider the following code:

In [7]:
# Read the sample notebook and print the source of the cell at index 2,
# which carries a `#meta:` comment.
_test_file = '../tests/docs_test.ipynb'
first_cell = read_nb(_test_file)['cells'][2]
print(first_cell['source'])

#meta:show_steps=start,train
import re


At the moment, this cell has no metadata:

In [8]:
# Metadata of the raw cell, before any preprocessor has run.
first_cell.metadata



However, after we process this notebook with `InjectMeta`, the appropriate metadata will be injected:

In [9]:
# Re-export the notebook as notebook JSON with `InjectMeta` as the only preprocessor.
c = Config()
c.NotebookExporter.preprocessors = [InjectMeta]
exp = NotebookExporter(config=c)
cells, _ = exp.from_filename(_test_file)
# The export is a JSON string; parse it and pick the same cell (index 2) as before.
first_cell = json.loads(cells)['cells'][2]

# The `#meta:show_steps=start,train` comment is now stored under the `nbdoc` metadata key.
assert first_cell['metadata'] == {'nbdoc': {'show_steps': 'start,train'}}

## Strip Ansi Characters From Output -

In [10]:
#export
_re_ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

@preprocess_cell
def StripAnsi(cell):
    "Remove ANSI escape sequences from the text of a cell's `stdout` outputs."
    # Cells without an `outputs` entry (e.g. markdown cells) yield nothing here.
    stdout_outputs = (out for out in cell.get('outputs', []) if out.get('name') == 'stdout')
    for out in stdout_outputs:
        out['text'] = _re_ansi_escape.sub('', out.text)

Gets rid of colors that are streamed from standard out, which can interfere with static site generators:

In [11]:
# After `StripAnsi`, the processed notebook text must contain no ANSI escape sequences.
c, _ = run_preprocessor([StripAnsi], _test_file)
assert not _re_ansi_escape.findall(c)

## Insert Warning Into Markdown -

In [12]:
# export
@preprocess
def InsertWarning(nb):
    """Insert Autogenerated Warning Into Notebook after the first cell.

    A markdown cell holding an HTML comment is inserted at index 1 of `nb.cells`,
    leaving the first cell in place. The notebook is modified in place.
    """
    # The comment must be terminated with `-->`: an unterminated HTML comment in
    # Markdown extends to the end of the document and would hide everything after it.
    content = "<!--- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->"
    # `uuid4().hex` is already 32 characters long, so no slicing is needed for the cell id.
    mdcell = AttrDict(cell_type='markdown', id=uuid.uuid4().hex, metadata={}, source=content)
    nb.cells.insert(1, mdcell)

This preprocessor inserts a warning in the markdown destination that the file is autogenerated.  This warning is inserted in the second cell so we do not interfere with front matter.

In [13]:
# The autogenerated-file warning must be present in the processed notebook.
c, _ = run_preprocessor([InsertWarning], _test_file)
assert "<!--- WARNING: THIS FILE WAS AUTOGENERATED!" in c

## Remove Empty Code Cells -

In [14]:
# export
def _keepCell(cell):
    "Truthy for every non-code cell, and for code cells whose source is not only whitespace."
    if cell['cell_type'] != 'code': return True
    return cell.source.strip()

@preprocess
def RmEmptyCode(nb):
    "Drop code cells whose source is empty or whitespace-only."
    # A lazy iterable is enough here: the `preprocess` wrapper turns `nb.cells` back into a list.
    nb.cells = (cell for cell in nb.cells if _keepCell(cell))

In [15]:
# An empty python code fence would mean an empty code cell survived `RmEmptyCode`.
c, _ = run_preprocessor([RmEmptyCode], _test_file)
assert '```python\n\n```' not in c

## Turn Metadata into Cell Tags -

In [16]:
#export
@preprocess_cell
def UpdateTags(cell):
    """Merge tags declared in `cell.metadata['nbdoc']` into `cell.metadata['tags']`.

    The comma-separated value of the `tags` key (or `tag`, if `tags` is absent) is
    split and appended to the cell's existing tags. Empty entries and tags that are
    already present are skipped, because the notebook format requires cell tags to
    be unique, non-empty strings.
    """
    root = cell.metadata.get('nbdoc', {})
    tags = root.get('tags', root.get('tag')) # allow the singular also
    if not tags: return
    merged = list(cell.metadata.get('tags', []))
    for t in tags.split(','):
        t = t.strip()
        if t and t not in merged: merged.append(t)
    cell.metadata['tags'] = merged

`UpdateTags` is meant to be used with `InjectMeta` and `TagRemovePreprocessor` to configure the visibility of cells in rendered docs.

In [17]:
# Configure an exporter from scratch: tell `TagRemovePreprocessor` which tags hide
# whole cells, outputs, or inputs, then run it after `InjectMeta` and `UpdateTags`
# so that tags written as `#meta:tag=...` comments are honoured.
c = Config()
c.TagRemovePreprocessor.remove_cell_tags = ("remove_cell",)
c.TagRemovePreprocessor.remove_all_outputs_tags = ('remove_output',)
c.TagRemovePreprocessor.remove_input_tags = ('remove_input',)
c.MarkdownExporter.preprocessors = [InjectMeta, UpdateTags, TagRemovePreprocessor]
exp = MarkdownExporter(config=c)
result = exp.from_filename(_test_file)[0]

# check the results: this text must not survive in the rendered markdown
# (presumably it lives in a cell of the test notebook tagged for removal)
assert 'you will not be able to see this cell at all either' not in result

## Hide Specific Lines of Code -

In [18]:
#export
@preprocess_cell
def HideInputLines(cell):
    "Drop every line of a code cell that ends with the comment `#meta_hide_line`."
    marker = '#meta_hide_line'
    # Leave non-code cells, and code cells that never mention the marker, untouched.
    if cell.cell_type != 'code' or marker not in cell.source: return
    visible = filter(lambda line: not line.strip().endswith(marker), cell.source.splitlines())
    cell.source = '\n'.join(visible)

In [19]:
# Run `HideInputLines` (after `InjectMeta`) over the sample notebook.
c, _ = run_preprocessor([InjectMeta, HideInputLines], _test_file)

# Expected rendering of the `show` cell once its hidden lines have been dropped.
_res = """```python
def show():
    a = 2
```"""
assert _res in c

## Clean Flags and Magics -

In [20]:
#export
_tst_flags = get_config()['tst_flags'].split('|')

@preprocess_cell
def CleanFlags(cell):
    """A preprocessor to remove Flags

    For each flag named in the `tst_flags` setting (a `|`-separated list), comments
    of the form `#flag` at the start of a line are removed from code cells, and the
    source is stripped of surrounding whitespace. Other cell types are left untouched.
    """
    if cell.cell_type != 'code': return
    src = cell.source
    for f in _tst_flags:
        # An empty `tst_flags` setting splits to `['']`. A blank flag would produce the
        # pattern `^#\s*\s*`, which strips the leading `#` from every comment line.
        if not f.strip(): continue
        p = re.compile(r'^#\s*{0}\s*'.format(f), re.MULTILINE)
        src = p.sub('', src).strip()
    # Strip even when no flag is configured, so the result is always trimmed.
    cell.source = src.strip()

In [21]:
# The `#notest` flag comment must be gone after `CleanFlags`
# (this presumes `notest` is listed in the project's `tst_flags` setting).
c, _ = run_preprocessor([CleanFlags], _test_file)
assert '#notest' not in c

In [22]:
#export
@preprocess_cell
def CleanMagics(cell):
    """A preprocessor to remove cell magic commands and #cell_meta: comments

    Lines of a code cell that start with `%` or `%%`, and `#meta:` / `#cell_meta:`
    comment lines, are deleted; the remaining source is stripped of surrounding
    whitespace. Other cell types are left untouched.
    """
    if cell.cell_type != 'code': return
    pattern = re.compile(r'(^\s*(%%|%).+?[\n\r])|({0})'.format(_re_meta), re.MULTILINE)
    # Both alternatives only match a line that is followed by a line terminator, so a
    # magic or meta comment on the last line (or in a one-line cell) would be missed.
    # Append a newline before substituting; the final `strip()` removes it again.
    cell.source = pattern.sub('', cell.source + '\n').strip()

`CleanMagics` strips magic cell commands `%%` so they do not appear in rendered markdown files:

In [23]:
# No cell magic marker may remain after `CleanMagics`.
c, _ = run_preprocessor([CleanMagics], _test_file)
assert '%%' not in c

## Format Shell Commands -

In [24]:
#export
@preprocess_cell
def BashIdentify(cell):
    """A preprocessor to identify bash commands and mark them appropriately

    If any line of a code cell starts with `!` (optionally preceded by whitespace),
    `cell.metadata.magics_language` is set to `'bash'` and the leading `!` markers
    are removed from the source, which is then stripped of surrounding whitespace.
    """
    # Raw string: `\s` inside a normal string literal is an invalid escape sequence,
    # which Python reports with a warning at compile time.
    pattern = re.compile(r'^\s*!', flags=re.MULTILINE)
    if cell.cell_type == 'code' and pattern.search(cell.source):
        cell.metadata.magics_language = 'bash'
        cell.source = pattern.sub('', cell.source).strip()

When we issue a shell command in a notebook with `!`, we need to change the code-fence from `python` to `bash` and remove the `!`:

In [25]:
# A `!` shell command in the sample notebook should be rendered inside a bash code fence.
c, _ = run_preprocessor([BashIdentify], _test_file)
assert "```bash" in c

## Remove `ShowDoc` Input Cells -

In [33]:
#export
_re_showdoc = re.compile(r'^ShowDoc', re.MULTILINE)

def _isShowDoc(cell):
    "Return a truthy value if `cell` is a code cell with a line starting with `ShowDoc`."
    return cell['cell_type'] == 'code' and _re_showdoc.search(cell.source)

@preprocess_cell
def CleanShowDoc(cell):
    """Ensure that ShowDoc output gets cleaned in the associated notebook.

    A ShowDoc code cell with exactly one `text/html` output is replaced by a raw
    cell (same id and metadata) whose source is that HTML with everything from
    `<HTMLRemove>` through `</HTMLRemove>` deleted. Any other cell is left unchanged.
    """
    _re_html = re.compile(r'<HTMLRemove>.*</HTMLRemove>', re.DOTALL)
    if not _isShowDoc(cell): return
    all_outs = [o['data'] for o in cell.outputs if 'data' in o]
    html_outs = [o['text/html'] for o in all_outs if 'text/html' in o]
    # Only rewrite the cell when there is exactly one HTML output to use as its body.
    if len(html_outs) != 1: return
    # `_re_html` is a local variable: this is a plain function, so there is no `self`
    # (referring to `self._re_html` here raised NameError).
    cleaned_html = _re_html.sub('', html_outs[0])
    return AttrDict({'cell_type':'raw', 'id':cell.id, 'metadata':cell.metadata, 'source':cleaned_html})

In [35]:
# After `CleanShowDoc`, no `<HTMLRemove>` marker may remain in the processed notebook.
_result, _ = run_preprocessor([CleanShowDoc], _test_file)
assert '<HTMLRemove>' not in _result

## Composing Preprocessors Into A Pipeline

Let's see how you can compose all of these preprocessors together to process notebooks appropriately:

In [39]:
#export
def get_mdx_exporter(template_file='ob.tpl'):
    """Build a `MarkdownExporter` that runs the full MDX preprocessing pipeline.

    `template_file` is the name of a template inside the `templates/` directory that
    sits next to this module. Raises `ValueError` if that template does not exist.
    """
    # Locate the template relative to this module and fail early if it is missing.
    tmp_dir = Path(__file__).parent/'templates/'
    tmp_file = tmp_dir/f"{template_file}"
    if not tmp_file.exists(): raise ValueError(f"{tmp_file} does not exist in {tmp_dir}")

    cfg = Config()
    # Tags (set by hand or via `#meta:tag=...`) that hide whole cells, outputs, or inputs.
    cfg.TagRemovePreprocessor.remove_cell_tags = ("remove_cell", "hide")
    cfg.TagRemovePreprocessor.remove_all_outputs_tags = ("remove_output", "remove_outputs", "hide_output", "hide_outputs")
    cfg.TagRemovePreprocessor.remove_input_tags = ('remove_input', 'remove_inputs', "hide_input", "hide_inputs")
    # The preprocessors are listed in the order they are handed to the exporter.
    pipeline = [
        InjectMeta, CleanMagics, BashIdentify, UpdateTags, InsertWarning, TagRemovePreprocessor,
        CleanFlags, CleanShowDoc, RmEmptyCode, StripAnsi, HideInputLines, ImageSave, ImagePath, HTMLEscape,
    ]
    cfg.MarkdownExporter.preprocessors = pipeline
    cfg.MarkdownExporter.template_file = str(tmp_file)
    return MarkdownExporter(config=cfg)

`get_mdx_exporter` combines all of the previous preprocessors, along with the built-in `TagRemovePreprocessor`, to allow for hiding cell inputs/outputs based on cell tags.

In [40]:
# Render the sample notebook through the complete MDX pipeline and print the resulting markdown.
exp = get_mdx_exporter()
print(exp.from_filename(_test_file)[0])

## a title


some md


```python
import re
```


```python
print('\033[94mhello')
```

<CodeOutputBlock lang="python">

    hello


</CodeOutputBlock>


```python
1+1
```

<CodeOutputBlock lang="python">




    2



</CodeOutputBlock>


```python
import pandas as pd
pd.DataFrame(dict(a=[1,2]))
```
    
<HTMLOutputBlock >




```html
<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>a</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2</td>
    </tr>
  </tbody>
</table>
</div>
```



</HTMLOutputBlock>


```python
import matplotlib.pyplot as plt
plt.figure(figsize=(2,1))
plt.plot([1,2]);
```

<CodeOutputBlock lang="python">