# Gradient Boost
It turns out the CMS translation software (`mlglue`) we are using to convert a scikit-learn BDT back to a TMVA one was designed for gradient boosting only. While we did translate the AdaBoost BDT we had been using as an experiment, we should also try gradient boosting to see what happens and whether it is "better".

## Initialization

In [1]:
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools
from sklearn.ensemble import GradientBoostingClassifier
import mlglue.tree

In [2]:
# Default thresholds, compared against mc_Lxy*1000 and mc_Lz*1000 respectively.
# (The factor of 1000 suggests a unit conversion; units are not stated here - confirm.)
default_cut_Lxy = 1250
default_cut_Lz = 3500
# |JetEta| boundary: strictly above it the Lz cut is used, at or below it the Lxy cut.
eta_seperator_cut = 1.4


def trim_sample(sample, cut_Lxy = default_cut_Lxy, cut_Lz = default_cut_Lz):
    '''Keep only the rows of a sample that pass the Lxy / Lz cut for their eta region.

    Args:
        sample - table with JetEta, mc_Lxy and mc_Lz columns (accessed as attributes)
        cut_Lxy - threshold that mc_Lxy*1000 must exceed when |JetEta| <= eta_seperator_cut
        cut_Lz - threshold that mc_Lz*1000 must exceed when |JetEta| > eta_seperator_cut

    Returns
        The rows of sample selected by the combined boolean mask
    '''
    abs_eta = abs(sample.JetEta)

    # NOTE(review): mc_Lz is compared with its sign; a negative mc_Lz can never
    # pass a positive cut_Lz - confirm that this is intended.
    passes_high_eta = (abs_eta > eta_seperator_cut) & (sample.mc_Lz*1000 > cut_Lz)
    passes_low_eta = (abs_eta <= eta_seperator_cut) & (sample.mc_Lxy*1000 > cut_Lxy)

    return sample[passes_high_eta | passes_low_eta]

def trim_samples(all_events):
    '''Apply the default Lxy / Lz trimming to the third sample only.

    Args:
        all_events - indexable group of three samples; the original note labels
                     them (mj, bib, signal). NOTE(review): the loader printout
                     lists BIB before Multijet - confirm the order of the first two.

    Returns
        A 3-tuple: elements 0 and 1 unchanged, element 2 passed through
        trim_sample with its default cuts
    '''
    first_sample = all_events[0]
    second_sample = all_events[1]
    trimmed_third = trim_sample(all_events[2])
    return (first_sample, second_sample, trimmed_third)

def load_trimmed_sample(jobNo):
    '''Load the default samples of a job, trim them, and report their sizes.

    Args:
        jobNo - job identifier handed to load_default_samples

    Returns
        The tuple produced by trim_samples for the loaded samples

    Prints a "Job N:" header before loading and, after trimming, the number of
    rows in each returned sample. Nothing is stored beyond the return value.
    '''
    print ('Job {0}:'.format(jobNo))
    trimmed = trim_samples(load_default_samples(jobNo))

    # One row count per sample, in the order the samples are returned
    row_counts = [len(frame.index) for frame in trimmed]
    print (" ", row_counts)

    return trimmed

## Load the data

In [3]:
%%time
# Load the samples of job 106; the third sample has the default Lxy/Lz trimming applied
input_events = load_trimmed_sample(106)

Job 106:
  BIB: 800000 events
  Multijet: 800000 events
  Signal: 800000 events
  [800000, 800000, 504190]
Wall time: 33.1 s


## Training Setup

In [4]:
# Names of the input variables passed to prep_samples as training_variable_list
training_variables = [
    'EnergyDensity',
    'BIBDeltaTimingM',
    'JetPt',
    'HadronicLayer1Fraction',
    'ShowerCenter',
    'JetLat',
    'FirstClusterRadius',
    'JetLong',
    'MaxTrackPt',
    'PredictedLxy',
    'BIBDeltaTimingP',
    'PredictedLz',
    'SumPtOfAllTracks',
]

In [5]:
%%time
# Divide the loaded events into a training part and a testing part
train, test = test_train_samples(input_events)

# Build the training inputs from the three training samples, restricted to
# the selected training variables
all_events, all_events_class, training_weight, evaluation_weight = prep_samples(
    train[0],
    train[1],
    train[2],
    training_variable_list=training_variables,
)

Wall time: 4.11 s


## Training
First, a simple training with the default `GradientBoostingClassifier` settings to see how this works.

In [6]:
def gradient_training(events, events_weight, events_class):
    '''Fit a GradientBoostingClassifier with default settings on prepared samples.

    Args:
        events - A DF with an entry for every event, with all columns to be trained on
        events_weight - weight assigned to each event (None if no weight is to be used)
        events_class - object whose .Class attribute holds the training class of
                       each event (0, 1, 2 for bib, mj, and signal)

    Returns
        The fitted GradientBoostingClassifier
    '''
    classifier = GradientBoostingClassifier()

    # The labels live in the Class column / attribute of events_class
    labels = events_class.Class
    classifier.fit(events, labels, sample_weight=events_weight)

    return classifier

In [7]:
%%time
# Train on the prepared training inputs using the training weights
# (this is the slow step: about 45 minutes of wall time in the recorded run)
bdt = gradient_training(all_events, training_weight, all_events_class)

Wall time: 45min 42s


In [8]:
# Evaluate the trained BDT on the testing part of the split; the returned
# metrics are displayed as the cell output
calc_performance(bdt, test, training_variables=training_variables)

{'BIBBack': 191943104.5314537,
 'BIBEff': 0.5587998577498269,
 'BIBSsqrtB': 10.774591717872303,
 'BIBTotalCount': 267135,
 'BIBTotalWeight': 267135.0,
 'BIBinBIB': 149275.0,
 'BIBinHSS': 12081.0,
 'BIBinMJ': 105779.0,
 'HSSBack': 28370528.605109442,
 'HSSEff': 0.9842896378335547,
 'HSSSsqrtB': 31.017952900099445,
 'HSSTotalCount': 167851,
 'HSSTotalWeight': 167851.0,
 'HSSinBIB': 1165.0,
 'HSSinHSS': 165214.0,
 'HSSinMJ': 1472.0,
 'MJBack': 107251.0,
 'MJEff': 0.7347801486425272,
 'MJSsqrtB': 1863656.9468193203,
 'MJTotalCount': 266785,
 'MJTotalWeight': 830633099.3287433,
 'MJinBIB': 191941939.5314537,
 'MJinHSS': 28358447.605109442,
 'MJinMJ': 610332712.192177}

In [9]:
# Wrap the trained scikit-learn BDT for mlglue, giving it the feature names
# (columns of the training frame) and the class labels.
# NOTE(review): the label order is assumed to match class ids 0, 1, 2 - confirm.
bdtGeneral = mlglue.tree.BDTsklearn(
    bdt,
    list(all_events.columns),
    ['BIB', 'MJ', 'Signal'],
)

# Write the converted model out as a TMVA XML file
bdtGeneral.to_tmva("training_106_test.xml")