In [None]:
#| default_exp extract

# Squeeze your notebooks 

> How it works 

Here is a small example to show how you can run `nbsqueeze` from a notebook. Let's first create an in-line plot with `matplotlib`. 

In [None]:
import matplotlib.pyplot as plt
import numpy as np 

In [None]:
# Sample a cosine at 40 integer positions.
x = np.arange(40)
y = np.cos(x)

# Draw the curve in green on a 2 x 3 inch figure. The trailing `;` suppresses
# the `[<Line2D ...>]` repr, so the figure image is the cell's only output.
fig, ax = plt.subplots(figsize=[2, 3])
ax.plot(x, y, 'g');

The plot shown above is stored inside this notebook file as a long base64-encoded png string. If you open the notebook file with a text editor, you will find the following JSON for the output of the cell above: 

    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAK...A.VERY.LONG.STRING.HERE...",
      "text/plain": [
       "<Figure size 144x216 with 1 Axes>"
      ]
     },

We can use the `squeeze_this_nb` function to extract the very long base64 png string as above, save the image as a png file in the `./images/` folder, and replace the long string with a short link to the png file.  

In [None]:
from nbsqueeze import squeeze_this_nb  

In [None]:
# Called without arguments: `nb_path` defaults to None, so the path of the
# current notebook is looked up, and `overwrite=True` rewrites that file in place.
squeeze_this_nb()

If we now look at the JSON of this notebook in a text editor, we find the following updated output: 

    {
     "data": {
      "text/html": [
       "..",
       "<img src=\"./images/00_extract_361824cd7e.png\">"
      ]
     },

Much shorter! This means my notebook file size is much smaller. If we check the file system, the corresponding png image file is present in the `./images/` sub folder: 

    .
    ├── 00_extract.ipynb
    └── images
        └── 00_extract_361824cd7e.png
    

In [None]:
#|export 

import nbformat 
import re 
import hashlib 
import ipynb_path
import base64 
import os 
import glob

In [None]:
#|export 

def _png_string_from_output(output):
    '''Return the base64 png string carried by a single cell `output`, or None.

    Handles a plain inline image (`image/png` entry) and an html fragment
    (`text/html` entry) that embeds the image as a `base64,...` data URI.'''

    data = output.get('data')

    # stream and error outputs have no mime bundle at all
    if not isinstance(data, dict):
        return None

    # simple inline image
    png = data.get('image/png')
    if png is not None:
        # tolerate a multi-line value that is stored as a list of strings
        return ''.join(png) if isinstance(png, (list, tuple)) else png

    # old school interactive matplotlib figure
    html = data.get('text/html')
    if isinstance(html, (list, tuple)):
        html = ''.join(html)
    if not isinstance(html, str):
        return None

    # `^` + MULTILINE tries every line of the html, not only the first one.
    # For html whose first line holds the image this yields exactly what a
    # plain `re.match` would.
    m = re.search(r'^.*base64,\s*([^"]+)', html, re.MULTILINE)

    return m.group(1) if m else None


def find_png_strings(nb_path):
    '''Find code cells with inline base64 image/png strings in notebook `nb_path`.

    Only one png string per cell is reported; if the outputs of a cell contain
    several, a notice is printed and the first one is used.

    Returns: [[cell_idx, md5, base64_string], ...]
    where `md5` is the first 10 hex digits of the md5 of the base64 string.'''

    nb = nbformat.read(nb_path, as_version=nbformat.NO_CONVERT)

    cell_pngs = []

    for i, cell in enumerate(nb['cells']):

        # markdown and raw cells have no outputs
        if cell['cell_type'] != 'code':
            continue

        png_strings = []

        for output in cell.get('outputs', []):
            base64_string = _png_string_from_output(output)
            if base64_string is not None:
                png_strings.append(base64_string)

        if not png_strings:
            continue

        if len(png_strings) > 1:
            print(f'Found multiple png strings in output of cell {i}.')
            print(' Extracting only first png string!')
        png_string = png_strings[0]

        # a short content hash keeps the exported file names unique and stable
        md5 = hashlib.md5(png_string.encode()).hexdigest()[:10]
        cell_pngs.append([i, md5, png_string])

    return cell_pngs


def make_imdir(nb_path):
    '''Ensure an 'images' subdirectory exists next to notebook file `nb_path`.

    The directory is created when missing and left untouched otherwise.

    Returns: imdir_path (absolute path of the 'images' directory)'''

    notebook_abspath = os.path.abspath(nb_path)
    parent_dir, _ = os.path.split(notebook_abspath)
    target = os.path.join(parent_dir, 'images')

    # exist_ok: repeated calls for the same notebook are harmless
    os.makedirs(target, exist_ok=True)

    return target
   

def export_pngs(nb_path, cell_pngs, imdir_path):
    '''Save base64 png strings from `cell_pngs` as png images in `imdir_path`.

    `cell_pngs` is a list of [cell_idx, md5, base64_string] entries. Each image
    is written to `<imdir_path>/<notebook name>_<md5>.png`.

    Returns: [[cell_idx, url], ...] with `url` of the form
    './<image dir name>/<notebook name>_<md5>.png'.'''

    # file name prefix: notebook base name without its trailing '.ipynb'
    prefix = re.sub(r'\.ipynb$', '', os.path.basename(nb_path))

    # only the last path component of the image directory ends up in the link
    imdir = os.path.basename(imdir_path)

    url_list = []

    for idx, md5, base64_string in cell_pngs:

        filename = f'{prefix}_{md5}.png'
        decoded = base64.b64decode(base64_string)

        # the `with` block closes the file, no explicit close() needed
        with open(os.path.join(imdir_path, filename), 'wb') as fh:
            fh.write(decoded)

        url_list.append([idx, f'./{imdir}/{filename}'])

    return url_list
                          

def replace_strings(nb_path, url_list, overwrite=True, verbose=False):
    '''Replace base64 png strings in notebook `nb_path` with links.

    `url_list` is a list of [cell_idx, url] entries. All outputs of each listed
    cell are replaced by a single html output holding `<img src="url">`.

    With `overwrite=True` the notebook file itself is rewritten; otherwise the
    result is written next to it with '_sqz' inserted before the file extension.
    Nothing is written when `url_list` is empty. Progress messages are printed
    when `verbose` is True.'''

    nb = nbformat.read(nb_path, as_version=nbformat.NO_CONVERT)

    n_strings = len(url_list)

    # replace pngs strings in specific cell outputs with links
    for i, url in url_list:
        nb['cells'][i].outputs = [nbformat.from_dict({'output_type': 'execute_result',
                                   'data': {'text/html': ['..', f'<img src="{url}">']},
                                   'execution_count': None, 'metadata': {}})]

    if n_strings == 0:
        if verbose:
            print(f'  (No png strings found)')
        return

    old_size = os.path.getsize(nb_path)

    if not overwrite:
        # Only touch the file name: splitting off the extension keeps directory
        # names intact and guarantees the target differs from the source.
        root, ext = os.path.splitext(nb_path)
        nb_path = f'{root}_sqz{ext}'

    if verbose:
        print(f'-- Replacing {n_strings} png string(s) with link(s)')

    # Notebook json may contain non-ascii characters; be explicit about utf-8
    # instead of relying on the platform default encoding.
    with open(nb_path, 'w', encoding='utf-8') as fh:
        nbformat.write(nb, fh)

    new_size = os.path.getsize(nb_path)

    if verbose:
        print(f'   File size: {old_size//1e3} Kb -> {new_size//1e3} Kb')
        
    

def squeeze_this_nb(nb_path=None, overwrite=True, verbose=False):
    '''Move the inline figure png strings of notebook `nb_path` into image files.

    The png strings are located, written to the notebook's image directory and
    then replaced in the notebook by links to those files.

    When `nb_path` is None the path is obtained from `ipynb_path.get()`.
    `overwrite` and `verbose` are passed on to `replace_strings`.'''

    target = ipynb_path.get() if nb_path is None else nb_path

    found = find_png_strings(target)
    links = export_pngs(target, found, make_imdir(target))

    replace_strings(target, links, overwrite=overwrite, verbose=verbose)
    
    