Which variables should be used for training? We compare the performance of the default training with the performance of a training in which one variable has been removed, repeating this for each variable in turn. If the performance improves when a variable is removed, that variable is making the training worse. Further, by looking at the size of the change in performance we can rank the variables to first order.

# Initialization

In [3]:
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

# Load Data Samples

In [8]:
# Default decay-length thresholds. They are compared against mc_Lxy / mc_Lz
# multiplied by 1000 (presumably a unit conversion - confirm against the ntuple).
default_cut_Lxy = 1250
default_cut_Lz = 3500
# |JetEta| boundary: strictly above it the Lz cut is used, otherwise the Lxy cut.
eta_seperator_cut = 1.4
def trim_sample(sample, cut_Lxy = default_cut_Lxy, cut_Lz = default_cut_Lz):
    '''Return the rows of `sample` that pass the decay-length cut for their eta region.

    Arguments:
        sample - object with JetEta, mc_Lz and mc_Lxy columns that supports boolean-mask indexing
        cut_Lxy - threshold for mc_Lxy*1000, applied where |JetEta| <= eta_seperator_cut
        cut_Lz - threshold for mc_Lz*1000, applied where |JetEta| > eta_seperator_cut

    Returns:
        `sample` indexed by the combined mask; both cuts are strict (>).
    '''
    abs_eta = abs(sample.JetEta)
    passes_high_eta = (abs_eta > eta_seperator_cut) & (sample.mc_Lz*1000 > cut_Lz)
    passes_low_eta = (abs_eta <= eta_seperator_cut) & (sample.mc_Lxy*1000 > cut_Lxy)
    return sample[passes_high_eta | passes_low_eta]

def trim_samples(all_events):
    '''Apply the default lxy/lz cuts to the signal entry of a (mj, bib, signal) tuple.

    The first two entries are passed through untouched; only the third one
    goes through trim_sample (with its default cut values).
    '''
    first_background = all_events[0]
    second_background = all_events[1]
    trimmed_signal = trim_sample(all_events[2])
    return (first_background, second_background, trimmed_signal)

In [6]:
# Load the full, untrimmed default samples. The "92" argument is passed straight to
# load_default_samples; its meaning is defined in bdt_training_scikit_tools.
all_events_all = load_default_samples("92")

BIB: 800000 events
Multijet: 800000 events
Signal: 800000 events


In [11]:
# Apply the default Lxy/Lz cuts; trim_samples only trims the third (signal) sample.
all_events = trim_samples(all_events_all)
# Show how many rows each sample has after trimming.
print ([len(e.index) for e in all_events])

[800000, 800000, 432189]


# Drop First Variable

In [12]:
%%writefile get_training_performance.py
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
    
def do_training (vlist):
    '''Run get_training_performance on one packed (all_events, training_list) pair.

    Taking a single argument makes this usable as the worker function of a
    one-argument mapper (the notebook hands it to Pool.map).
    '''
    samples, variables = vlist
    return get_training_performance(samples, variables)
    
def get_training_performance (all_events, training_list):
    '''Run a training with the given set of variables and return its performance.

    Arguments:
        all_events - samples handed to test_train_samples for the train/test split
        training_list - training variables; forwarded to prep_samples and calc_performance

    Returns:
        A one-entry dict mapping tuple(training_list) to whatever calc_performance
        returns for the trained BDT evaluated on the test part of the split.
    '''
    # Hold back a test set; only the training part is used to build the BDT.
    training_part, testing_part = test_train_samples(all_events)

    # Convert the first three training samples into the inputs for default_training.
    # prep_samples also returns an evaluation weight, which is not needed here.
    features, classes, train_weight, _evaluation_weight = prep_samples(
        training_part[0], training_part[1], training_part[2],
        training_variable_list=training_list)

    bdt = default_training(features, train_weight, classes)

    # Key the result by the variable tuple so results from several trainings can be merged.
    result_key = tuple(training_list)
    performance = calc_performance(bdt, testing_part, training_variables = training_list)
    return {result_key: performance}

Overwriting get_training_performance.py


In [13]:
def all_but_one (vlist, number_to_drop = 1):
    '''Return every way of dropping `number_to_drop` items from vlist, plus vlist itself.

    Arguments:
        vlist - the source list, tuple or other iterable (it is materialised once)
        number_to_drop - how many items to leave out of each combination

    Returns:
        A list of tuples. First come all combinations with
        len(vlist)-number_to_drop items, in itertools.combinations order; the
        complete list, as a tuple, is appended last as the baseline. When
        number_to_drop is 0 the only combination already is the complete list,
        so it is not appended a second time.

    Raises:
        ValueError - from itertools.combinations when number_to_drop > len(vlist)
    '''
    # Materialise once so that len() also works for iterators/generators.
    full = tuple(vlist)
    var_training_list = list(itertools.combinations(full, len(full) - number_to_drop))
    # Dropping nothing already yields the full tuple; avoid a duplicate entry
    # (which would otherwise trigger a second, identical training).
    if number_to_drop != 0:
        var_training_list.append(full)
    return var_training_list

In [16]:
# Import the module written out by the %%writefile cell so its do_training
# function can be handed to the worker pool.
import get_training_performance
# Shared pool of 4 worker processes, used by calc_var_removal below.
# NOTE(review): the pool is never closed in the visible cells.
pool = mp.Pool(processes=4)

def unused_var(original_list, used_list):
    '''Return the items of original_list that are missing from used_list.

    The result is a tuple in original_list order. If nothing is missing, the
    placeholder tuple ('None',) is returned instead of an empty tuple.
    '''
    leftover = tuple(item for item in original_list if item not in used_list)
    return leftover or ('None',)

def calc_var_removal(all_events, training_list = default_training_variable_list):
    '''Train with each variable of training_list removed in turn and tabulate the results.

    One training job is built per entry of all_but_one(training_list) (every
    drop-one list plus the full list) and the jobs are run on the module-level
    worker pool.

    Returns:
        A DataFrame with one row per training, labelled by the variable(s)
        left out of that training ('None' for the full list).
    '''
    jobs = [(all_events, candidate) for candidate in all_but_one(training_list)]
    job_results = pool.map(get_training_performance.do_training, jobs)

    # Each job returns a one-entry dict; fold them into a single mapping.
    performance_by_used = {}
    for single_result in job_results:
        performance_by_used.update(single_result)

    # Re-key from "variables used" to "variables left out", then put trainings on rows.
    performance_by_dropped = {
        unused_var(training_list, used): performance
        for used, performance in performance_by_used.items()
    }
    return pd.DataFrame(performance_by_dropped).T


In [17]:
# Drop-one scan over the default variable list (the full list is included as the baseline).
r_drop_first = calc_var_removal(all_events)

In [18]:
# Rank the trainings by HSSSsqrtB, ascending. Each row is labelled by the variable
# that was left out; 'None' is the training with nothing removed.
r_drop_first.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     42.237424
PredictedLz               45.222097
BIBDeltaTimingP           45.695701
SumPtOfAllTracks          47.307187
HadronicLayer1Fraction    49.157968
JetLat                    49.218267
JetLong                   51.890276
BIBDeltaTimingM           51.984148
CalRatio                  52.336483
NTracks                   53.186099
ShowerCenter              54.064282
JetWidth                  54.382896
MaxTrackPt                54.782898
PredictedLxy              54.930037
None                      55.580290
FirstClusterRadius        59.610414
EnergyDensity             60.989457
Name: HSSSsqrtB, dtype: float64

# Drop EnergyDensity

In [20]:
# Remove EnergyDensity from the default variable list.
# NOTE(review): going through set() does not preserve the original variable order,
# so the order of the resulting list can differ from default_training_variable_list.
down_one_list = list(set(default_training_variable_list) - set(["EnergyDensity"]))
down_one_list

['CalRatio',
 'SumPtOfAllTracks',
 'JetWidth',
 'JetPt',
 'FirstClusterRadius',
 'BIBDeltaTimingM',
 'PredictedLz',
 'JetLat',
 'ShowerCenter',
 'BIBDeltaTimingP',
 'PredictedLxy',
 'JetLong',
 'MaxTrackPt',
 'NTracks',
 'HadronicLayer1Fraction']

In [21]:
# Repeat the drop-one scan, starting from the list without EnergyDensity.
r_drop_second = calc_var_removal(all_events, down_one_list)

In [22]:
# Rank by HSSSsqrtB, ascending; 'None' is the training using all of down_one_list.
r_drop_second.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     40.340262
PredictedLz               46.544526
PredictedLxy              49.913065
HadronicLayer1Fraction    50.644301
ShowerCenter              51.487157
BIBDeltaTimingP           51.860469
JetLat                    51.896442
SumPtOfAllTracks          52.256830
JetWidth                  53.725551
BIBDeltaTimingM           54.989699
MaxTrackPt                55.649745
JetLong                   55.893929
CalRatio                  58.136389
FirstClusterRadius        58.404335
None                      60.989457
NTracks                   62.250806
Name: HSSSsqrtB, dtype: float64

# Drop NTracks

In [23]:
# Additionally remove NTracks from the already reduced list.
# NOTE(review): going through set() does not preserve the order of down_one_list.
down_two_list = list(set(down_one_list) - set(["NTracks"]))
down_two_list

['CalRatio',
 'JetLat',
 'ShowerCenter',
 'SumPtOfAllTracks',
 'JetWidth',
 'JetPt',
 'BIBDeltaTimingP',
 'PredictedLxy',
 'JetLong',
 'MaxTrackPt',
 'FirstClusterRadius',
 'BIBDeltaTimingM',
 'PredictedLz',
 'HadronicLayer1Fraction']

In [24]:
# Repeat the drop-one scan, starting from the list without EnergyDensity and NTracks.
r_drop_third = calc_var_removal(all_events, down_two_list)

In [28]:
# Rank by HSSSsqrtB, ascending; 'None' is the training using all of down_two_list.
r_drop_third.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     41.108144
BIBDeltaTimingP           47.157923
PredictedLz               48.596636
SumPtOfAllTracks          48.674995
JetLong                   50.933884
FirstClusterRadius        51.432953
MaxTrackPt                51.839051
JetLat                    53.836807
HadronicLayer1Fraction    55.085178
ShowerCenter              55.194859
JetWidth                  55.590546
PredictedLxy              56.320394
CalRatio                  56.790429
BIBDeltaTimingM           58.105604
None                      62.250806
Name: HSSSsqrtB, dtype: float64