Which variables should be used for training? We compare the performance of the default training with the performance of trainings in which each variable is removed in turn. If the performance improves when a variable is removed, we know that variable is making the training worse. Further, by looking at the change in performance we can rank the variables to first order.

# Initialization

In [1]:
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

# Load Data Samples

In [2]:
# Default thresholds compared against mc_Lxy*1000 and mc_Lz*1000 in trim_sample.
# NOTE(review): the factor of 1000 presumably converts metres to millimetres - confirm.
default_cut_Lxy = 1250
default_cut_Lz = 3500
# |JetEta| value that decides which of the two cuts applies to a row.
eta_seperator_cut = 1.4
def trim_sample(sample, cut_Lxy = default_cut_Lxy, cut_Lz = default_cut_Lz):
    '''Keep only the rows of a sample that pass the cut for their |JetEta| region.

    Rows with |JetEta| above eta_seperator_cut must satisfy mc_Lz*1000 > cut_Lz.
    Rows with |JetEta| at or below eta_seperator_cut must satisfy mc_Lxy*1000 > cut_Lxy.

    Arguments:
        sample - table with JetEta, mc_Lxy and mc_Lz columns
        cut_Lxy - threshold applied to mc_Lxy*1000 for the low |JetEta| rows
        cut_Lz - threshold applied to mc_Lz*1000 for the high |JetEta| rows

    Returns:
        The rows of sample that pass the selection.
    '''
    abs_eta = abs(sample.JetEta)
    passes_high_eta = (abs_eta > eta_seperator_cut) & (sample.mc_Lz*1000 > cut_Lz)
    passes_low_eta = (abs_eta <= eta_seperator_cut) & (sample.mc_Lxy*1000 > cut_Lxy)
    return sample[passes_high_eta | passes_low_eta]

def trim_samples(all_events):
    '''Apply the default lxy/lz cuts to the signal entry of a (mj, bib, signal) tuple.

    The first two entries are passed through unchanged; only the third one
    goes through trim_sample.
    '''
    multijet = all_events[0]
    bib = all_events[1]
    trimmed_signal = trim_sample(all_events[2])
    return (multijet, bib, trimmed_signal)

In [3]:
# Load the default samples. NOTE(review): "106" is presumably a job/version id - confirm against load_default_samples.
all_events_all = load_default_samples("106")

BIB: 800000 events
Multijet: 800000 events
Signal: 800000 events


In [4]:
# Apply the default Lxy/Lz cuts (signal sample only), then show how many events each sample has.
all_events = trim_samples(all_events_all)
print ([len(e.index) for e in all_events])

[800000, 800000, 504190]


# Drop First Variable

In [5]:
%%writefile get_training_performance.py
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
    
def do_training (vlist):
    '''Run get_training_performance on a single (all_events, training_list) pair.

    Taking one packed argument lets this be handed to map-style calls that
    pass exactly one item per invocation.
    '''
    events, variables = vlist
    return get_training_performance(events, variables)
    
def get_training_performance (all_events, training_list):
    '''Run a training with the set of variables given. Return a performance table.

    Arguments:
        all_events - the samples handed to test_train_samples for splitting
        training_list - names of the variables to train with

    Returns:
        A one-entry dict mapping tuple(training_list) to the result of
        calc_performance evaluated on the testing part.
    '''

    # Divide the events into a training part and a testing part
    train_part, test_part = test_train_samples(all_events)

    # Build the training inputs from the three training samples
    prepared = prep_samples(train_part[0], train_part[1], train_part[2], training_variable_list=training_list)
    # The evaluation weight is returned by prep_samples but not needed here
    features, classes, train_weight, _evaluation_weight = prepared

    # Train the BDT
    trained_bdt = default_training(features, train_weight, classes)

    # Key the result by the variables used so callers can merge several results
    used_variables = tuple(training_list)
    performance = calc_performance(trained_bdt, test_part, training_variables = training_list)
    return {used_variables: performance}

Overwriting get_training_performance.py


In [6]:
def all_but_one (vlist, number_to_drop = 1):
    '''Return every way of dropping number_to_drop items from vlist, plus vlist itself.

    Arguments:
        vlist - the source list, tuple or other iterable
        number_to_drop - how many items to leave out of each combination

    Returns:
        A list of tuples. First come all combinations with
        len(vlist)-number_to_drop items (in itertools.combinations order),
        then the full tuple(vlist) as the baseline entry. When number_to_drop
        is 0 the only combination already is the baseline, so it is returned
        once rather than twice.

    Raises:
        ValueError - from itertools.combinations if number_to_drop is larger
        than len(vlist).
    '''
    # Materialize once so generators and other non-sized iterables work too
    items = tuple(vlist)
    var_training_list = list(itertools.combinations(items, len(items) - number_to_drop))
    # Dropping nothing already yields the full tuple; don't schedule it twice
    if number_to_drop != 0:
        var_training_list.append(items)
    return var_training_list

In [7]:
import get_training_performance
# Pool of 4 worker processes; calc_var_removal maps the trainings over it.
pool = mp.Pool(processes=4)

def unused_var(original_list, used_list):
    '''Return the items of original_list that do not appear in used_list.

    Arguments:
        original_list - the full sequence of names
        used_list - the names that were actually used

    Returns:
        A tuple of the left-out names in their original order, or ('None',)
        when nothing was left out.
    '''
    left_out = tuple(name for name in original_list if name not in used_list)
    if not left_out:
        return ('None',)
    return left_out

def calc_var_removal(all_events, training_list = default_training_variable_list):
    '''Train once per variable subset from all_but_one and tabulate the results.

    Arguments:
        all_events - the samples passed on to every training
        training_list - the variables to start from

    Returns:
        A DataFrame with one row per training, labelled by the variable(s)
        left out of that training ('None' for the training that uses all of
        training_list).
    '''
    # One job per subset; each job is the packed argument do_training expects
    jobs = [(all_events, subset) for subset in all_but_one(training_list)]
    results = pool.map(get_training_performance.do_training, jobs)

    # Each result is a dict keyed by the variables used; merge them into one
    by_used = {}
    for result in results:
        for used, performance in result.items():
            by_used[used] = performance

    # Re-key by what was left out so the table reads as "dropped variable"
    by_dropped = {unused_var(training_list, used): performance
                  for used, performance in by_used.items()}
    return pd.DataFrame(by_dropped).T


In [8]:
# First pass: one training with all default variables plus one training per variable left out.
r_drop_first = calc_var_removal(all_events)

In [9]:
# Rank the trainings by HSSSsqrtB (ascending). Each row is labelled by the variable that was left out; 'None' is the training with nothing removed.
r_drop_first.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     31.571177
BIBDeltaTimingP           33.345183
FirstClusterRadius        34.084652
MaxTrackPt                34.439625
BIBDeltaTimingM           34.821558
ShowerCenter              35.157016
EnergyDensity             35.220922
JetLong                   36.636057
None                      37.622429
JetWidth                  38.342294
HadronicLayer1Fraction    38.619610
SumPtOfAllTracks          39.090312
NTracks                   39.607152
JetLat                    39.976526
PredictedLxy              41.687032
PredictedLz               42.200492
CalRatio                  42.631388
Name: HSSSsqrtB, dtype: float64

# Drop Second Variable

In [10]:
# Remove CalRatio, the variable whose removal gave the highest HSSSsqrtB in the first pass.
# NOTE: going through set() does not preserve the order of default_training_variable_list.
down_one_list = list(set(default_training_variable_list) - set(["CalRatio"]))
down_one_list

['NTracks',
 'JetPt',
 'ShowerCenter',
 'JetLong',
 'SumPtOfAllTracks',
 'PredictedLz',
 'EnergyDensity',
 'JetWidth',
 'BIBDeltaTimingM',
 'HadronicLayer1Fraction',
 'JetLat',
 'FirstClusterRadius',
 'MaxTrackPt',
 'PredictedLxy',
 'BIBDeltaTimingP']

In [11]:
# Second pass: repeat the leave-one-out scan starting from the list without CalRatio.
r_drop_second = calc_var_removal(all_events, down_one_list)

In [12]:
# Ranking for the second pass; 'None' is now the training on down_one_list itself.
r_drop_second.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     33.560652
FirstClusterRadius        35.318773
EnergyDensity             35.756891
MaxTrackPt                36.731971
ShowerCenter              37.015712
BIBDeltaTimingP           37.628097
HadronicLayer1Fraction    37.944158
PredictedLz               38.863548
BIBDeltaTimingM           40.418985
NTracks                   40.636561
PredictedLxy              40.825376
JetLong                   41.256371
SumPtOfAllTracks          41.398124
None                      42.631388
JetLat                    42.849140
JetWidth                  43.121987
Name: HSSSsqrtB, dtype: float64

# Drop NTracks

In [13]:
# Remove JetWidth, the variable whose removal gave the highest HSSSsqrtB in the second pass.
# NOTE: going through set() does not preserve the order of down_one_list.
down_two_list = list(set(down_one_list) - set(["JetWidth"]))
down_two_list

['EnergyDensity',
 'NTracks',
 'BIBDeltaTimingM',
 'JetPt',
 'HadronicLayer1Fraction',
 'ShowerCenter',
 'JetLat',
 'FirstClusterRadius',
 'BIBDeltaTimingP',
 'JetLong',
 'MaxTrackPt',
 'PredictedLxy',
 'SumPtOfAllTracks',
 'PredictedLz']

In [14]:
# Third pass: repeat the leave-one-out scan starting from the list without CalRatio and JetWidth.
r_drop_third = calc_var_removal(all_events, down_two_list)

In [15]:
# Ranking for the third pass; 'None' is now the training on down_two_list itself.
r_drop_third.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     33.519731
BIBDeltaTimingM           35.923559
BIBDeltaTimingP           36.454895
JetLong                   38.038265
MaxTrackPt                38.789296
EnergyDensity             39.070243
ShowerCenter              39.447733
PredictedLz               39.746835
JetLat                    40.528748
HadronicLayer1Fraction    41.187335
FirstClusterRadius        41.811634
PredictedLxy              42.050360
SumPtOfAllTracks          42.644607
None                      43.121987
NTracks                   43.675357
Name: HSSSsqrtB, dtype: float64

In [19]:
# Remove NTracks, the variable whose removal gave the highest HSSSsqrtB in the third pass.
# NOTE: going through set() does not preserve the order of down_two_list.
down_three_list = list(set(down_two_list) - set(["NTracks"]))
down_three_list

['EnergyDensity',
 'BIBDeltaTimingM',
 'JetPt',
 'HadronicLayer1Fraction',
 'ShowerCenter',
 'JetLat',
 'FirstClusterRadius',
 'JetLong',
 'MaxTrackPt',
 'PredictedLxy',
 'BIBDeltaTimingP',
 'PredictedLz',
 'SumPtOfAllTracks']

In [20]:
# Fourth pass: repeat the leave-one-out scan starting from the list without CalRatio, JetWidth and NTracks.
r_drop_forth = calc_var_removal(all_events, down_three_list)

In [40]:
# Ranking for the fourth pass, showing HSSSsqrtB alongside the HSSinHSS, MJinHSS and BIBinHSS columns.
r_drop_forth.sort_values("HSSSsqrtB")[['HSSSsqrtB', 'HSSinHSS', 'MJinHSS', 'BIBinHSS']]

Unnamed: 0,HSSSsqrtB,HSSinHSS,MJinHSS,BIBinHSS
JetPt,33.441069,161973.0,23448800.0,11034.0
FirstClusterRadius,35.885192,162550.0,20509280.0,9115.0
ShowerCenter,36.262187,157932.0,18959160.0,9313.0
BIBDeltaTimingP,36.510289,161893.0,19642810.0,19091.0
HadronicLayer1Fraction,36.783127,161026.0,19155410.0,8965.0
PredictedLxy,36.793246,161843.0,19339520.0,9174.0
JetLat,37.81671,160183.0,17933350.0,8425.0
JetLong,37.82336,161054.0,18122520.0,8526.0
EnergyDensity,38.584012,162664.0,17764160.0,9139.0
BIBDeltaTimingM,38.722234,160869.0,17239440.0,19889.0
