# Parameterizing and executing the notebook with papermill
#### Authors: 
* Srishti Yadav (https://github.com/copperwiring)
* Akshit Arora (https://github.com/aroraakshit)

--------------

If you find a bug in the code, please raise an issue or send a PR at: https://github.com/copperwiring/scalable-hpo-pybay

### Install dependencies

In [None]:
!pip3 install papermill ipywidgets jupyter-client==6.1.5

### Import libraries

In [None]:
import os
from glob import glob
from shutil import rmtree
import papermill as pm
import time
import multiprocessing as mp
from sklearn.model_selection import ParameterGrid
import numpy as np
import os
import nest_asyncio
nest_asyncio.apply()

### Delete output folder if it exists

In [None]:
# Clear out output folders left behind by earlier runs: every directory in the
# current working directory whose name starts with "xgboost" is removed.
path = os.getcwd()
pattern = os.path.join(path, "xgboost*")

# Plain files that happen to match the pattern are skipped; only directories
# are deleted (recursively).
for stale_dir in filter(os.path.isdir, glob(pattern)):
    rmtree(stale_dir)

### Set parallel processing

In [None]:
# Execution mode switch read by the launch loop further down: True starts one
# multiprocessing.Process per experiment, False calls run_papermill for each
# experiment one after another in this process.
parallel = True

### Define variables and hyperparameter ranges
- `max_experiments`: max number of experiments to run
- `output_dir`: name of the directory where final output files will be saved
- `eta`: learning rate

In [None]:
# Experiment configuration.
max_experiments = 5                    # number of parameter sets to sample
input_notebook = 'XGBoost_Demo.ipynb'  # notebook that gets parameterized
# Timestamped folder name, e.g. xgboost_20240131_235959.
output_dir = f"xgboost_{time.strftime('%Y%m%d_%H%M%S')}"

# Search space: five random draws per hyperparameter, stored as plain Python
# numbers rather than NumPy scalars.
hyperparameter_ranges = {
    'max_depth': np.random.randint(5, 16, size=5).tolist(),   # ints in [5, 15]
    'num_rounds': np.random.randint(2, 11, size=5).tolist(),  # ints in [2, 10]
    'eta': np.random.uniform(0.1, 1.0, size=5).tolist(),      # floats from 0.1 up to 1.0
}


### Set hyperparameters for different jobs
- Hyperparameters are sampled randomly from the search space above
- `max_experiments` determines how many sets of hyperparameters are sampled, one for each experiment

In [None]:
# Sample up to `max_experiments` distinct parameter sets from the search space.
# The range lists are drawn at random and may repeat a value, and sampling with
# replacement could pick the same combination twice. Either case would make
# two experiments write to the same output notebook, so repeated values are
# dropped from each range and the grid is sampled without replacement.
unique_ranges = {
    name: sorted(set(values))
    for name, values in hyperparameter_ranges.items()
}
param_grid = list(ParameterGrid(unique_ranges))

# Never ask for more candidates than there are distinct combinations.
n_candidates = min(max_experiments, len(param_grid))
candidate_params = np.random.choice(param_grid, n_candidates, replace=False)

# print a sample parameter (skipped when nothing was sampled)
if len(candidate_params) > 0:
    print(candidate_params[0])

### Parameterize and execute the notebook using Papermill
Papermill execution using the Python API: https://papermill.readthedocs.io/en/latest/usage-execute.html

In [None]:
def run_papermill(cand, inp, outd):
    """Execute the notebook `inp` with one set of hyperparameters via papermill.

    Args:
        cand: mapping with the keys 'max_depth', 'num_rounds' and 'eta'.
        inp: path of the notebook to parameterize and execute.
        outd: directory that receives the executed notebook; created if
            it does not exist yet.

    The executed copy is written to
    ``<outd>/<input stem><max_depth>_<num_rounds>_<int(eta * 100)>.ipynb``.

    Failures during execution are reported on stdout instead of being raised,
    so one failing experiment does not abort the remaining ones. Nothing is
    returned.
    """
    # exist_ok avoids the check-then-create race when several worker
    # processes start at the same time and share one output directory.
    os.makedirs(outd, exist_ok=True)

    # Derive the stem from the file name only, so inputs such as
    # "./nb.ipynb" or "dir/nb.ipynb" still produce a valid output name.
    stem = os.path.splitext(os.path.basename(inp))[0]
    output_name = (
        f"{stem}{cand['max_depth']}_{cand['num_rounds']}"
        f"_{int(cand['eta'] * 100)}.ipynb"
    )
    output_notebook = os.path.join(outd, output_name)

    # Drop a stale result from an earlier run with the same parameters.
    if os.path.exists(output_notebook):
        os.remove(output_notebook)

    print(cand, "Starting process..")

    parameters = dict(
        max_depth=cand['max_depth'],
        eta=cand['eta'],
        num_rounds=cand['num_rounds'],
    )
    try:
        pm.execute_notebook(inp, output_notebook, parameters)
    except Exception as exc:
        # Best-effort run: report the failure rather than hiding it, and do
        # not claim success below.
        print(output_notebook, "Process failed:", repr(exc))
        return

    print(output_notebook, "Process finished..")

### Parallelize execution of the parameterized notebook
- set `parallel` = True for parallel processing
- set `parallel` = False for sequential processing

In [None]:
# Launch one papermill run per sampled parameter set.
if parallel:
    # Create and start every worker before waiting on any of them, so the
    # notebooks execute concurrently.
    workers = [
        mp.Process(
            target=run_papermill,
            args=(candidate, input_notebook, output_dir),
        )
        for candidate in candidate_params
    ]
    for worker in workers:
        worker.start()

    # Join all workers so this cell only completes once every run has ended
    # and no child process is left unreaped; surface abnormal exits.
    for worker in workers:
        worker.join()
        if worker.exitcode != 0:
            print("Worker", worker.pid, "exited with code", worker.exitcode)
else:
    # Sequential mode: run the experiments one after another in this process.
    for candidate in candidate_params:
        run_papermill(candidate, input_notebook, output_dir)

### Papermill CLI command
Papermill execution using CLI: https://papermill.readthedocs.io/en/latest/usage-execute.html

In [None]:
# !LC_ALL=C.UTF-8 LANG=C.UTF-8 papermill XGBoost_Demo.ipynb XGBoost_Demo4.ipynb -p max_depth 13 -p eta 0.19026762501776856 -p num_rounds 22