# Optimize GradientBoostingClassifier
Using the previously determined variable list, let's see how much we can gain by optimizing the training settings.

# Initialization

In [None]:
from bdt_training_scikit_tools import load_trimmed_sample, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

## Load data

In [None]:
%%time
# Load the trimmed input sample used by every training below. 106 is the argument
# handed to load_trimmed_sample (presumably a sample/job identifier -- confirm
# against bdt_training_scikit_tools).
input_events = load_trimmed_sample(106)

## Training
We want to be able to run multiple trainings in parallel (using a multiprocessing pool), so we have to be a little tricky here by writing the training code out to a file that can be imported as a module.

In [7]:
%%writefile grad_perf_training.py
from bdt_training_scikit_tools import default_training_variable_list, \
    test_train_samples, prep_samples, calc_performance
from sklearn.ensemble import GradientBoostingClassifier
    
def do_training(vlist):
    '''Run a single training described by one job tuple.

    vlist is a 3-item sequence: (all_events, leaf_depth, estimators). The work is
    packed into one argument so this function can be used with a map-style call
    that passes exactly one item per invocation.

    Returns whatever get_training_performance returns for those settings.
    '''
    events, depth, n_estimators = vlist
    return get_training_performance(events, depth, n_estimators)
    
def get_training_performance(all_events, leaf_depth, estimators):
    '''Train one GradientBoostingClassifier and evaluate it on the test sample.

    all_events  - the input events; handed to test_train_samples to be split
    leaf_depth  - used as the classifier's max_depth
    estimators  - used as the classifier's n_estimators

    Returns a one-entry dict mapping (leaf_depth, estimators) to the result of
    calc_performance for the trained classifier on the test sample.
    '''
    # Hold part of the events back for testing
    train, test = test_train_samples(all_events)

    # Convert the three training inputs into what fit() needs: the events, their
    # class labels and the per-event training weight. The evaluation weight comes
    # back as well but is not used for fitting.
    prepped = prep_samples(train[0], train[1], train[2])
    features, labels, fit_weight, _ = prepped

    # Build and fit the classifier with the requested settings
    classifier = GradientBoostingClassifier(n_estimators=estimators, max_depth=leaf_depth)
    classifier.fit(features, labels, sample_weight=fit_weight)

    # Key the result by the settings so results from many jobs can be merged
    performance = calc_performance(classifier, test)
    return {(leaf_depth, estimators): performance}

Overwriting grad_perf_training.py


In [4]:
# Import the module written out by the %%writefile cell so its functions can be
# referenced as grad_perf_training.<name>, then start a pool of 4 worker processes.
# The pool is kept at notebook level because later cells submit work to it.
import grad_perf_training
pool = mp.Pool(processes=4)

In [5]:
def scan_performance(all_events, leaf_depth_range=(3,), estimators=(10,)):
    '''Train one classifier per (leaf depth, estimator count) combination, in parallel.

    all_events        - the input events, handed unchanged to every training job
    leaf_depth_range  - iterable of leaf depths to try
    estimators        - iterable of estimator counts to try

    Uses the notebook-level `pool` to call grad_perf_training.do_training once per
    combination. Returns a pandas DataFrame built from the merged per-job result
    dicts and transposed, so each (leaf depth, estimators) key labels a row.
    '''
    # itertools.product reads each input iterable exactly once, so every
    # combination is produced even when a one-shot iterator (e.g. a generator) is
    # passed in. The order matches nested loops: leaf depth outermost, estimators
    # innermost.
    jobs = [(all_events, depth, n_est)
            for depth, n_est in itertools.product(leaf_depth_range, estimators)]
    results = pool.map(grad_perf_training.do_training, jobs)

    # Each job hands back a dict keyed by its settings; fold them into one dict.
    merged = {}
    for partial in results:
        merged.update(partial)

    return pd.DataFrame(merged).T

## Leaf Depth and Estimators
These two have to be connected, right?

In [None]:
%%time
# Scan a 3x3 grid of leaf depths and estimator counts (9 trainings in total).
lde = scan_performance(input_events, leaf_depth_range=[3,10,50], estimators=[10,50,100])

In [None]:
# Show only the selected performance columns for each scanned setting.
lde[['HSSSsqrtB','HSSEff','BIBEff','MJEff']]