In [None]:
#default_exp clean

In [None]:
#export
from fastcore.script import *
from fastcore.utils import *
from fastcore.imports import *

from nbprocess.imports import *
from nbprocess.read import *
from nbprocess.sync import *

# from pathlib import Path
# import io,sys,json,glob,re

# Clean notebooks

> Strip superfluous metadata from notebooks

To avoid pointless conflicts while working with jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (done automatically if you install the git hooks with `nbdev_install_git_hooks`). The following functions are used to do that.

## Utils

In [None]:
#export
def _clean_cell_output(cell):
    "Remove execution count in `cell`"
    if 'outputs' in cell:
        for o in cell['outputs']:
            if 'execution_count' in o: o['execution_count'] = None
            o.get('data',{}).pop("application/vnd.google.colaboratory.intrinsic+json", None)
            o.get('metadata', {}).pop('tags', None)

In [None]:
#export
def _clean_cell(cell, clear_all=False):
    "Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`"
    if 'execution_count' in cell: cell['execution_count'] = None
    if 'outputs' in cell:
        if clear_all: cell['outputs'] = []
        else:         _clean_cell_output(cell)
    if cell['source'] == ['']: cell['source'] = []
    cell['metadata'] = {} if clear_all else {
        k:v for k,v in cell['metadata'].items() if k=="hide_input"}

In [None]:
from copy import deepcopy

# Fixture: a code cell carrying an execution count, extra metadata and a Colab-specific output payload
tst = {'cell_type': 'code', 'execution_count': 26,
       'metadata': {'hide_input': True, 'meta': 23},
       'outputs': [{'execution_count': 2,
                    'data': {
                        'application/vnd.google.colaboratory.intrinsic+json': {'type': 'string'},
                        'plain/text': ['sample output',]
                    }, 'output': 'super'}],
       'source': 'awesome_code'}
# Deep copy: `_clean_cell` mutates the nested output dicts in place, so a shallow `.copy()`
# would hand the `clear_all` check below an already-cleaned `outputs` list
tst1 = deepcopy(tst)

# Default clean: counts reset, only `hide_input` metadata kept, Colab payload dropped
_clean_cell(tst)
test_eq(tst, {'cell_type': 'code',
              'execution_count': None,
              'metadata': {'hide_input': True},
              'outputs': [{'execution_count': None,
                           'data': {'plain/text': ['sample output',]},
                           'output': 'super'}],
              'source': 'awesome_code'})

# `clear_all=True`: outputs and metadata are emptied entirely
_clean_cell(tst1, clear_all=True)
test_eq(tst1, {'cell_type': 'code', 'execution_count': None, 'metadata': {},
               'outputs': [], 'source': 'awesome_code'})

In [None]:
# A `['']` source becomes `[]`, and `tags` are removed from both the cell metadata and the output metadata
tst2 = {'metadata': {'tags':[]},
        'outputs': [{'metadata': {'tags':[]}}],
        "source": [""]}
_clean_cell(tst2, clear_all=False)
test_eq(tst2, {'metadata': {}, 'outputs': [{'metadata':{}}], 'source': []})

In [None]:
#export
def clean_nb(nb, clear_all=False):
    "Clean every cell of `nb` (forwarding `clear_all` to `_clean_cell`), then keep only whitelisted notebook metadata"
    keep = ("kernelspec", "jekyll", "jupytext", "doc")
    for cell in nb['cells']:
        _clean_cell(cell, clear_all=clear_all)
    meta = nb['metadata']
    # Rebuild rather than delete in place so `nb` ends up with a fresh metadata dict
    nb['metadata'] = {key: meta[key] for key in meta if key in keep}

In [None]:
# Fixture: one code cell with run-specific state, wrapped in a notebook that has extra top-level metadata
tst = {'cell_type': 'code', 'execution_count': 26,
       'metadata': {'hide_input': True, 'meta': 23},
       'outputs': [{'execution_count': 2,
                    'data': {
                        'application/vnd.google.colaboratory.intrinsic+json': {'type': 'string'},
                        'plain/text': ['sample output',]
                    }, 'output': 'super'}],
       'source': 'awesome_code'}
nb = {'metadata': {'kernelspec': 'some_spec', 'jekyll': 'some_meta', 'meta': 37}, 'cells': [tst]}

# `clean_nb` mutates `nb` in place: the cell is cleaned as by `_clean_cell` ...
clean_nb(nb)
test_eq(nb['cells'][0], {'cell_type': 'code', 'execution_count': None,
              'metadata': {'hide_input': True},
              'outputs': [{'execution_count': None, 
                           'data': { 'plain/text': ['sample output',]},
                           'output': 'super'}],
              'source': 'awesome_code'})
# ... and the non-whitelisted notebook-level key `meta` is dropped
test_eq(nb['metadata'], {'kernelspec': 'some_spec', 'jekyll': 'some_meta'})

## Main function

In [None]:
#export
def _wrapio(strm): return io.TextIOWrapper(strm, encoding='utf-8', line_buffering=True)

In [None]:
#export
@call_parse
def nbdev_clean_nbs(
    fname:str=None, # A notebook name or glob to convert
    clear_all:bool_arg=False, # Clean all metadata and outputs
    disp:bool_arg=False, # Print the cleaned outputs
    read_input_stream:bool_arg=False # Read input stream and not nb folder
):
    "Clean all notebooks in `fname` to avoid merge conflicts"
    # Git hooks pass the notebook on stdin; the cleaned notebook is written to stdout
    if read_input_stream:
        # Hand `_wrapio` the binary layer of the std streams: `io.TextIOWrapper` cannot wrap a text stream
        nb = json.load(_wrapio(sys.stdin.buffer))
        clean_nb(nb, clear_all=clear_all)
        write_nb(nb, _wrapio(sys.stdout.buffer))
        return

    # Without an explicit `fname`, fall back to the configured `nbs_path`
    target = get_config().path("nbs_path") if fname is None else fname
    for nb_path in nbglob(fname=target):
        if not str(nb_path).endswith('.ipynb'): continue
        # Context manager so the read handle is closed even if parsing fails
        with open(nb_path, 'r', encoding='utf-8') as src: nb = json.load(src)
        clean_nb(nb, clear_all=clear_all)
        cleaned = json.dumps(nb, sort_keys=True, indent=1, ensure_ascii=False) + "\n"
        if disp:
            # Emit UTF-8 bytes directly so the printed notebook does not depend on the console encoding;
            # flush the text layer first to keep the output ordered
            sys.stdout.flush()
            sys.stdout.buffer.write(cleaned.encode('utf-8'))
            sys.stdout.buffer.flush()
        else:
            with open(nb_path, 'w', encoding='utf-8') as dst: dst.write(cleaned)

By default (`fname` left to `None`), all the notebooks in `nbs_path` are cleaned. You can opt in to fully clean the notebooks by passing `clear_all=True`, which removes every bit of metadata and all the cell outputs. `disp` is only for internal use with git hooks: it prints the cleaned notebook instead of saving it. The same goes for `read_input_stream`, which reads the notebook from the input stream instead of from the file names.

## Export -

In [None]:
#hide
from nbprocess.export import nbs_export
# NOTE(review): presumably regenerates the library modules from the notebooks' `#export` cells — confirm against `nbprocess.export`
nbs_export()