What variables should be used for training. We look at the performance of the default training vs the training with each variable removed. If the performance improves when we remove a variable, we know that variable is making the training worse. Further, by looking at the change in performance we can rank the variables to first order.

# Initalization

In [1]:
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

# Load Data Samples

In [2]:
all_events = load_default_samples("92")

BIB: 800000 events
Multijet: 800000 events
Signal: 800000 events


# Drop First Variable

In [28]:
%%writefile get_training_performance.py
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
    
def do_training (vlist):
    all_events, training_list = vlist
    return get_training_performance (all_events, training_list)
    
def get_training_performance (all_events, training_list):
    '''Run a training with the set of varaibles given. Return a performance table.'''
    
    # Split into testing and training samples
    train, test = test_train_samples(all_events)
        
    # Prep samples for training
    all_events, all_events_class, training_weight, evaluation_weight = prep_samples(train[0], train[1], train[2], training_variable_list=training_list)
    
    # Run training
    bdt = default_training(all_events, training_weight, all_events_class)
    
    # Create a thing of all the results
    return {tuple(training_list): calc_performance(bdt, test, training_variables = training_list)}

Overwriting get_training_performance.py


In [29]:
def all_but_one (vlist, number_to_drop = 1):
    '''Return vlist and vlist with each item removed
    
    Arguments:
        vlist - the source list or tuple
        number_to_drop - return a list that is len(vlist)-number_to_drop - all possible combinations
        
    Returns:
        A list of tuples that have len(vlist)-number_to_drop items.
    
    '''
    var_training_list = itertools.combinations(vlist, len(vlist)-number_to_drop)
    var_training_list = list(var_training_list) + [tuple(vlist)]
    return var_training_list

In [47]:
import get_training_performance
pool = mp.Pool(processes=4)

def unused_var(original_list, used_list):
    r = tuple(i for i in original_list if i not in used_list)
    r = r if len(r) != 0 else ('None',)
    return r

def calc_var_removal(all_events, trainging_list = default_training_variable_list):
    r_drop_first = pool.map(get_training_performance.do_training,
             [(all_events, tvar_list) for tvar_list in all_but_one(trainging_list)])

    one_dict = {}
    for kp in r_drop_first:
        one_dict.update(kp)

    return pd.DataFrame({unused_var(training_list, k):one_dict[k] for k in one_dict}).T


In [31]:
r_drop_first = calc_var_removal(all_events)

In [38]:
r_drop_first.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt                     48.207755
HadronicLayer1Fraction    58.711762
ShowerCenter              58.880425
BIBDeltaTimingP           58.932652
PredictedLxy              59.021461
NTracks                   59.478448
CalRatio                  60.245566
JetLong                   60.251028
PredictedLz               60.478484
MaxTrackPt                61.059570
JetLat                    61.130377
EnergyDensity             61.292645
BIBDeltaTimingM           62.149705
JetWidth                  62.374182
FirstClusterRadius        62.574145
None                      62.586100
SumPtOfAllTracks          65.347513
Name: HSSSsqrtB, dtype: float64

# Drop SumPtOfAllTracks

In [42]:
down_one_list = list(set(default_training_variable_list) - set(["SumPtOfAllTracks"]))

In [43]:
r_drop_second = calc_var_removal(all_events, down_one_list)

In [48]:
r_drop_second.sort_values("HSSSsqrtB").HSSSsqrtB

JetPt             SumPtOfAllTracks          52.275475
SumPtOfAllTracks  MaxTrackPt                57.703929
                  JetLat                    60.803398
                  BIBDeltaTimingP           61.264952
CalRatio          SumPtOfAllTracks          61.434637
SumPtOfAllTracks  HadronicLayer1Fraction    61.503080
                  PredictedLz               61.614564
                  JetWidth                  61.897495
                  ShowerCenter              62.899632
NTracks           SumPtOfAllTracks          63.463857
SumPtOfAllTracks  FirstClusterRadius        63.471797
                  PredictedLxy              63.596035
                  EnergyDensity             63.873272
                  BIBDeltaTimingM           63.983250
                  JetLong                   64.423945
                  NaN                       65.347513
Name: HSSSsqrtB, dtype: float64