In [None]:
#|default_exp clean

In [None]:
#|export
import warnings,stat

from execnb.nbio import *
from fastcore.script import *
from fastcore.basics import *
from fastcore.imports import *

from nbdev.imports import *
from nbdev.read import *
from nbdev.sync import *
from nbdev.process import first_code_ln

In [None]:
#|hide
from fastcore.test import *

To avoid pointless conflicts while working with Jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (this is done automatically if you install the git hooks with `nbdev_install_hooks`). The following functions are used to do that.

In [None]:
#|export
@call_parse
def nbdev_trust(
    fname:str=None,  # A notebook name or glob to trust
    force_all:bool=False  # Also trust notebooks that haven't changed
):
    "Trust notebooks matching `fname`"
    # Only a missing dependency is tolerated here; a bare `except` would also hide Ctrl-C and real errors
    try: from nbformat.sign import NotebookNotary
    except ImportError:
        import warnings
        warnings.warn("Please install jupyter and try again")
        return

    # Fall back to the configured `nbs_path` when no `fname` is given
    fname = Path(fname if fname else config_key("nbs_path", '.'))
    path = fname if fname.is_dir() else fname.parent
    # Marker file in `path`; its mtime is compared against each notebook's mtime below
    check_fname = path/".last_checked"
    last_checked = os.path.getmtime(check_fname) if check_fname.exists() else None
    nbs = globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]') if fname.is_dir() else [fname]
    # One notary shared by every notebook, instead of building two new ones per notebook
    notary = NotebookNotary()
    for fn in nbs:
        if last_checked and not force_all:
            # Skip notebooks last modified before the marker file was
            last_changed = os.path.getmtime(fn)
            if last_changed < last_checked: continue
        nb = read_nb(fn)
        # Sign only the notebooks whose signature does not check out
        if not notary.check_signature(nb): notary.sign(nb)
    # Refresh (or create) the marker so the next run can skip unchanged notebooks
    check_fname.touch(exist_ok=True)

## Utils

In [None]:
#|export
def _clean_cell_output(cell):
    "Remove execution count in `cell`"
    if 'outputs' in cell:
        for o in cell['outputs']:
            if 'execution_count' in o: o['execution_count'] = None
            o.get('data',{}).pop("application/vnd.google.colaboratory.intrinsic+json", None)
            o.get('metadata', {}).pop('tags', None)

In [None]:
#|export
def _clean_cell(cell, clear_all=False, allowed_metadata_keys=None):
    "Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`"
    if 'execution_count' in cell: cell['execution_count'] = None
    if 'outputs' in cell:
        if clear_all: cell['outputs'] = []
        else:         _clean_cell_output(cell)
    if cell['source'] == ['']: cell['source'] = []
    cell['metadata'] = {} if clear_all else {
        k:v for k,v in cell['metadata'].items() if k in allowed_metadata_keys}

In [None]:
#|export
def clean_nb(
    nb, # The notebook to clean
    clear_all=False, # Remove all cell metadata and cell outputs
    allowed_metadata_keys:list=None, # Preserve the list of keys in the main notebook metadata
    allowed_cell_metadata_keys:list=None # Preserve the list of keys in cell level metadata
):
    "Strip `nb` in place of notebook and cell metadata other than the default and explicitly allowed keys"
    # Keys always kept, extended by whatever the caller allows on top
    keep_nb = {"kernelspec", "jekyll", "jupytext", "doc"}.union(allowed_metadata_keys or ())
    keep_cell = {"hide_input"}.union(allowed_cell_metadata_keys or ())
    for cell in nb['cells']:
        _clean_cell(cell, clear_all=clear_all, allowed_metadata_keys=keep_cell)
    nb['metadata'] = {key:val for key,val in nb['metadata'].items() if key in keep_nb}

The test notebook has metadata both in the main (notebook-level) metadata section and at the cell level, in its second cell:

In [None]:
test_nb = read_nb('../tests/metadata.ipynb')

# Notebook-level metadata contains at least these keys; the second cell contains exactly these
assert {'meta', 'jekyll', 'my_extra_key', 'my_removed_key'}.issubset(test_nb.metadata.keys())
assert set(test_nb.cells[1].metadata.keys()) == {'meta', 'hide_input', 'my_extra_cell_key', 'my_removed_cell_key'}

After cleaning the notebook, all extra metadata is removed; only a few keys are kept by default:

In [None]:
clean_nb(test_nb)

# Only the default keys survive at both levels
assert set(test_nb.metadata.keys()) == {'jekyll', 'kernelspec'}
assert set(test_nb.cells[1].metadata.keys()) == {'hide_input'}

We can preserve some additional keys at the notebook or cell levels:

In [None]:
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, allowed_metadata_keys={'my_extra_key'}, allowed_cell_metadata_keys={'my_extra_cell_key'})

# The explicitly allowed keys are kept in addition to the defaults
assert set(test_nb.metadata.keys()) == {'jekyll', 'kernelspec', 'my_extra_key'}
assert set(test_nb.cells[1].metadata.keys()) == {'hide_input', 'my_extra_cell_key'}

Passing the `clear_all=True` keyword removes everything from the cell metadata:

In [None]:
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, clear_all=True)

# Notebook-level defaults are still kept, but the cell metadata is emptied entirely
assert set(test_nb.metadata.keys()) == {'jekyll', 'kernelspec'}
test_eq(test_nb.cells[1].metadata, {})

## clean_nbs -

In [None]:
#|export
def _reconfigure(*strms):
    for s in strms:
        if hasattr(s,'reconfigure'): s.reconfigure(encoding='utf-8')

In [None]:
#|export
def process_write(warn_msg, proc_nb, f_in, f_out=None, disp=False):
    "Read a notebook from `f_in`, apply `proc_nb` to it in place, then write it to `f_out`"
    # Destination defaults to stdout if `disp`, else back to the input (still a path or stream at this point)
    if not f_out: f_out = sys.stdout if disp else f_in
    # Only a handle opened here is closed here; caller-supplied streams (e.g. stdin) are left open
    opened = isinstance(f_in, (str,Path))
    if opened: f_in = Path(f_in).open()
    try:
        try:
            _reconfigure(f_in, f_out)
            nb = loads(f_in.read())
        finally:
            # Close before writing so the source file is not still open when it is overwritten
            if opened: f_in.close()
        proc_nb(nb)
        write_nb(nb, f_out)
    except Exception as e:
        # Failures while reading, processing or writing are reported through `warn` instead of propagating
        warn(f'{warn_msg}')
        warn(e)

In [None]:
#|export
@call_parse
def nbdev_clean(
    fname:str=None, # A notebook name or glob to clean
    clear_all:bool=False, # Clean all metadata and outputs
    disp:bool=False,  # Print the cleaned outputs
    stdin:bool=False # Read notebook from input stream
):
    "Clean all notebooks in `fname` to avoid merge conflicts"
    # Extra metadata keys to keep are read from the config as whitespace-separated lists
    def _cfg_keys(name): return config_key(name, default='', missing_ok=True, path=False).split()
    nb_keys = _cfg_keys("allowed_metadata_keys")
    cell_keys = _cfg_keys("allowed_cell_metadata_keys")

    def _clean(nb):
        return clean_nb(nb, clear_all=clear_all, allowed_metadata_keys=nb_keys,
                        allowed_cell_metadata_keys=cell_keys)
    def _write(f_in, **kwargs):
        return process_write(warn_msg='Failed to clean notebook', proc_nb=_clean, f_in=f_in, **kwargs)

    # Git hooks will pass the notebooks in stdin
    if stdin: return _write(sys.stdin, f_out=sys.stdout)

    nbs_root = config_key("nbs_path", '.', missing_ok=True) if fname is None else fname
    for nb_file in globtastic(nbs_root, file_glob='*.ipynb', skip_folder_re='^[_.]'):
        _write(nb_file, disp=disp)

By default (`fname` left to `None`), all the notebooks in `nbs_path` are cleaned. You can opt in to fully clean the notebooks by removing every bit of metadata and the cell outputs by passing `clear_all=True`.

If you want to keep some keys in the main notebook metadata you can set `allowed_metadata_keys` in `settings.ini`.
Similarly, for cell level metadata use `allowed_cell_metadata_keys`. For example, to preserve both `k1` and `k2` at both the notebook and cell level, add the following to `settings.ini`:
```
...
allowed_metadata_keys = k1 k2
allowed_cell_metadata_keys = k1 k2
...
```

In [None]:
#|export
@call_parse
def nbdev_install_hooks():
    "Install git hooks to clean and trust notebooks automatically"
    nbs_dir = config_key("nbs_path", '.')
    repo_root = get_config().config_path
    hooks_dir = repo_root/'.git'/'hooks'
    post_merge = hooks_dir/'post-merge'
    hooks_dir.mkdir(parents=True, exist_ok=True)
    # Post-merge hook that runs `nbdev_trust`; it gets the executable bit added to its current mode
    post_merge.write_text("#!/bin/bash\nnbdev_trust")
    current_mode = os.stat(post_merge).st_mode
    os.chmod(post_merge, current_mode | stat.S_IEXEC)
    #Clean notebooks on commit/diff
    gitconfig = """# Generated by nbdev_install_git_hooks
#
# If you need to disable this instrumentation do:
#   git config --local --unset include.path
#
# To restore the filter
#   git config --local include.path .gitconfig
#
# If you see notebooks not stripped, checked the filters are applied in .gitattributes
#
[filter "clean-nbs"]
        clean = nbdev_clean --stdin
        smudge = cat
        required = true
[diff "ipynb"]
        textconv = nbdev_clean --disp --fname
"""
    (repo_root/'.gitconfig').write_text(gitconfig)
    # Point the repo's local git config at the `.gitconfig` file written above
    run("git config --local include.path ../.gitconfig")
    print("Hooks are installed and repo's .gitconfig is now trusted")
    # Attach the filter and diff driver defined above to every notebook under the notebooks folder
    gitattributes = nbs_dir/'.gitattributes'
    gitattributes.write_text("**/*.ipynb filter=clean-nbs\n**/*.ipynb diff=ipynb\n")

## Export -

In [None]:
#|hide
#|eval:false
# Calls nbdev's `nbdev_export` with its defaults; presumably this regenerates the library
# modules from the `#|export` cells of the notebooks — confirm against the nbdev docs
from nbdev.doclinks import nbdev_export
nbdev_export()