Which variables should be used for training? We compare the performance of the default training with that of a training in which one variable has been removed, repeating this for each variable in turn. If the performance improves when a variable is removed, we know that variable is making the training worse. Further, by looking at the size of the change in performance we can rank the variables to first order.

# Initialization

In [1]:
from bdt_training_scikit_tools import load_trimmed_sample, \
    test_train_samples, prep_samples, default_training, calc_performance, get_fraction_of_events, \
    default_training_variable_list
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

# Load Data Samples

In [2]:
# Load the trimmed samples for job "133". The result is indexed like a sequence
# (all_events_all[0].columns is inspected further down).
# NOTE(review): presumably one table each for BIB, multijet and signal - confirm
# against load_trimmed_sample.
all_events_all = load_trimmed_sample("133")

Job 133:
  BIB: 800000 events
  Multijet: 800000 events
  Signal: 800000 events
  [800000, 800000, 473600]


In [3]:
# Take a subset of the full samples, requesting 200000 events.
# NOTE(review): the exact meaning of the second argument is defined by
# get_fraction_of_events; the lengths printed below are close to, but not
# exactly, 200000.
all_events = get_fraction_of_events(all_events_all, 200000)
# Sanity check: number of rows in each of the returned samples.
print ([len(e.index) for e in all_events])

[200387, 199675, 196736]


# Determine Full Variable List
We have to determine what variable list we want to start with before we start removing them according to their performance. Reasons for removal:

- Weight variables are MC-only quantities
- Variables that have nothing to do with physics (like run #)
- DR to closest track because it is used later in the analysis
- JetPhi because physics *shouldn't* depend on that

In [4]:
all_events_all[0].columns

Index(['RunNumber', 'Weight', 'WeightMCEvent', 'WeightXSection',
       'WeightFlatten', 'mc_Lxy', 'mc_Lz', 'MHTOverHT', 'JetPt', 'JetPhi',
       'CalRatio', 'JetEta', 'NTracks', 'SumPtOfAllTracks', 'MaxTrackPt',
       'EventNumber', 'JetET', 'JetWidth', 'JetDRTo2GeVTrack', 'EnergyDensity',
       'HadronicLayer1Fraction', 'JetLat', 'JetLong', 'FirstClusterRadius',
       'NumberOfClusters', 'ShowerCenter', 'BIBDeltaTimingM',
       'BIBDeltaTimingP', 'FirstCellTiming', 'InteractionsPerCrossing',
       'RPredictedLxy', 'RPredictedLz', 'PredictedLxy', 'PredictedLz',
       'PredictedLxyHighEta', 'PredictedLxyLowEta', 'PredictedLzHighEta',
       'PredictedLzLowEta'],
      dtype='object')

In [5]:
# Candidate training variables = every column of the first sample minus the ones
# excluded up front: run/event numbers, the weight columns, JetPhi and
# JetDRTo2GeVTrack, plus mc_Lxy, mc_Lz, MHTOverHT, InteractionsPerCrossing and JetET.
# NOTE(review): the reasons for excluding the last group (mc_Lxy ... JetET) are not
# spelled out in this notebook - confirm and document them.
variable_list = set(all_events_all[0].columns) - set(['RunNumber', 'Weight', 'WeightMCEvent', 'WeightXSection', 'WeightFlatten', 'mc_Lxy', 'mc_Lz', 'MHTOverHT', 'JetPhi', 'EventNumber', 'InteractionsPerCrossing', 'JetDRTo2GeVTrack', 'JetET'])
# Display the resulting set.
variable_list

{'BIBDeltaTimingM',
 'BIBDeltaTimingP',
 'CalRatio',
 'EnergyDensity',
 'FirstCellTiming',
 'FirstClusterRadius',
 'HadronicLayer1Fraction',
 'JetEta',
 'JetLat',
 'JetLong',
 'JetPt',
 'JetWidth',
 'MaxTrackPt',
 'NTracks',
 'NumberOfClusters',
 'PredictedLxy',
 'PredictedLxyHighEta',
 'PredictedLxyLowEta',
 'PredictedLz',
 'PredictedLzHighEta',
 'PredictedLzLowEta',
 'RPredictedLxy',
 'RPredictedLz',
 'ShowerCenter',
 'SumPtOfAllTracks'}

# Drop First Variable

In [6]:
%%writefile get_training_performance.py
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
    
def do_training (vlist):
    '''Run one training from a single packed argument.

    Takes exactly one parameter so that it can be handed to a map-style call.

    Arguments:
        vlist - a two-item sequence: (all_events, training_list)

    Returns:
        Whatever get_training_performance returns for that pair.
    '''
    events, variables = vlist
    return get_training_performance (events, variables)
    
def get_training_performance (all_events, training_list):
    '''Train with the given set of variables and return the resulting performance.

    Arguments:
        all_events - the samples; handed to test_train_samples to be split
        training_list - the variables to train on

    Returns:
        A one-entry dict mapping tuple(training_list) to the result of
        calc_performance evaluated on the test part of the split.
    '''

    # Separate the events used for fitting from the ones used for evaluation
    train_part, test_part = test_train_samples(all_events)

    # Build the training inputs from the three training samples
    features, classes, train_weight, _eval_weight = prep_samples(
        train_part[0], train_part[1], train_part[2],
        training_variable_list=training_list)

    # Fit using 200 estimators
    trained = default_training(features, train_weight, classes, estimators=200)

    # Key the result by the variable list so that many results can be merged later
    key = tuple(training_list)
    performance = calc_performance(trained, test_part, training_variables = training_list)
    return {key: performance}

Overwriting get_training_performance.py


In [7]:
def all_but_one (vlist, number_to_drop = 1):
    '''Return every way of dropping number_to_drop items from vlist, plus vlist itself.

    Arguments:
        vlist - the source variables; any iterable (list, tuple, set, generator, ...)
        number_to_drop - how many items each reduced combination leaves out

    Returns:
        A list of tuples: all combinations holding len(vlist)-number_to_drop items,
        followed by one final tuple holding every item of vlist.

    Raises:
        ValueError - if number_to_drop is larger than the number of items
        (raised by itertools.combinations for a negative combination length).

    '''
    # Materialize once: one-shot iterables have no len() and can only be walked a
    # single time, and this guarantees the combinations and the trailing full
    # tuple are built from the same snapshot in the same order.
    items = tuple(vlist)
    reduced = list(itertools.combinations(items, len(items) - number_to_drop))
    return reduced + [items]

In [8]:
# Import the module written out by the %%writefile cell above, so the worker
# function is reachable as get_training_performance.do_training.
import get_training_performance
# Pool of 10 worker processes; calc_var_removal (below) maps the trainings over it.
pool = mp.Pool(processes=10)

def unused_var(original_list, used_list):
    '''Return the items of original_list that do not appear in used_list.

    Arguments:
        original_list - the complete collection of items
        used_list - the items that were actually used

    Returns:
        A tuple of the left-out items in original_list order, or ('None',)
        when nothing was left out.
    '''
    leftover = tuple(name for name in original_list if name not in used_list)
    # An empty tuple is falsy, so fall back to the 'None' marker in that case.
    return leftover or ('None',)

def calc_var_removal(all_events, training_list = default_training_variable_list):
    '''Train once per variable list produced by all_but_one and tabulate the results.

    Arguments:
        all_events - the samples handed to every training
        training_list - the starting variables; all_but_one expands this into the
                        reduced lists plus the full list

    Returns:
        A transposed pandas DataFrame with one row per training. Each row is
        labelled by unused_var, i.e. the variables left out of that training
        (('None',) for the training that used everything).
    '''
    # One job per candidate variable list, run on the shared process pool
    jobs = [(all_events, candidate) for candidate in all_but_one(training_list)]
    per_job = pool.map(get_training_performance.do_training, jobs)

    # Fold the per-job dicts into a single lookup (later entries win on a clash)
    merged = {used: perf for partial in per_job for used, perf in partial.items()}

    # Relabel each entry by what was left out rather than by what was used
    relabelled = {unused_var(training_list, used): perf for used, perf in merged.items()}
    return pd.DataFrame(relabelled).T

In [9]:
def remove_all_variables(all_events, training_list = variable_list):
    '''Repeatedly drop the variable whose removal gives the highest HSSSsqrtB.

    Each pass runs calc_var_removal on the current variables and sorts the
    result by "HSSSsqrtB" (ascending). The last row is the best-scoring
    training: if it is the one with nothing removed ('None') the loop stops,
    otherwise that row's variable is removed and the pass is repeated.

    Arguments:
        all_events - the samples handed to every training
        training_list - a set of variable names (set difference is used to
                        drop a variable)

    Returns:
        A list with the sorted result table of every pass, in order.
    '''
    history = []
    iteration = 0
    while True:
        iteration += 1
        print ("Iteration #{0}".format(iteration))
        print ('  Training with variable list:')
        print ('  ' + str(training_list))

        ranked = calc_var_removal(all_events, training_list=training_list).sort_values("HSSSsqrtB")
        history.append(ranked)
        print (ranked.HSSSsqrtB)

        # Row labels are tuples; the first entry of the last one is the best candidate
        best_to_drop = ranked.index[-1][0]
        if best_to_drop == 'None':
            return history
        training_list = training_list - {best_to_drop}

In [10]:
%%time
#r_drop_first = calc_var_removal(all_events)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


In [11]:
#print (r_drop_first.sort_values("HSSSsqrtB").HSSSsqrtB)

In [14]:
%%time
# Run the iterative variable elimination starting from the full candidate variable_list.
r = remove_all_variables(all_events, variable_list)

Iteration #1
  Training with variable list:
  {'MaxTrackPt', 'RPredictedLxy', 'FirstClusterRadius', 'BIBDeltaTimingM', 'JetPt', 'JetLong', 'ShowerCenter', 'SumPtOfAllTracks', 'JetLat', 'EnergyDensity', 'NTracks', 'RPredictedLz', 'PredictedLxyLowEta', 'CalRatio', 'NumberOfClusters', 'PredictedLzLowEta', 'JetWidth', 'PredictedLxy', 'JetEta', 'PredictedLxyHighEta', 'BIBDeltaTimingP', 'HadronicLayer1Fraction', 'FirstCellTiming', 'PredictedLzHighEta', 'PredictedLz'}
JetPt                     155.048408
NTracks                   161.871547
FirstCellTiming           188.374472
BIBDeltaTimingP           198.933120
PredictedLxyHighEta       206.857576
MaxTrackPt                210.106238
NumberOfClusters          210.112357
FirstClusterRadius        210.242549
ShowerCenter              210.420620
RPredictedLz              212.728996
JetLat                    216.114443
CalRatio                  216.120086
BIBDeltaTimingM           216.167874
PredictedLxyLowEta        218.249039
PredictedLzHighE

In [1]:
len(r)

NameError: name 'r' is not defined

# Look at high $\eta$ and low $\eta$ list

In [15]:
%%time
# Variable elimination starting from a set that uses the separate high-eta / low-eta
# predicted Lxy and Lz variables and leaves out PredictedLxy, PredictedLz,
# RPredictedLxy and RPredictedLz.
r_low_high = remove_all_variables(all_events, {'PredictedLxyLowEta', \
                                              'CalRatio', 'HadronicLayer1Fraction', 'EnergyDensity', \
                                              'JetEta', 'SumPtOfAllTracks', 'ShowerCenter', \
                                              'JetPt', 'NTracks', 'NumberOfClusters', \
                                              'FirstCellTiming', 'BIBDeltaTimingM', 'JetLat', 'JetWidth', 'FirstClusterRadius', \
                                              'PredictedLzLowEta', 'BIBDeltaTimingP', 'PredictedLzHighEta', \
                                              'PredictedLxyHighEta', 'JetLong', 'MaxTrackPt'})

Iteration #1
  Training with variable list:
  {'PredictedLzHighEta', 'JetEta', 'FirstClusterRadius', 'JetLong', 'NumberOfClusters', 'JetLat', 'JetPt', 'BIBDeltaTimingP', 'PredictedLxyLowEta', 'JetWidth', 'PredictedLxyHighEta', 'FirstCellTiming', 'NTracks', 'EnergyDensity', 'CalRatio', 'ShowerCenter', 'SumPtOfAllTracks', 'MaxTrackPt', 'HadronicLayer1Fraction', 'BIBDeltaTimingM', 'PredictedLzLowEta'}
JetPt                     126.369315
ShowerCenter              152.716034
CalRatio                  162.728084
JetLat                    165.758028
NTracks                   165.848169
SumPtOfAllTracks          165.963986
JetWidth                  166.003901
JetLong                   166.407032
PredictedLzLowEta         166.690105
NumberOfClusters          166.942878
EnergyDensity             167.070414
FirstClusterRadius        167.171239
None                      167.254931
BIBDeltaTimingP           167.401277
HadronicLayer1Fraction    167.464332
MaxTrackPt                167.504917
BIBDel

In [13]:
r_low_high[-1]

NameError: name 'r_low_high' is not defined

# Look at full $\eta$ range predictions

In [10]:
# Variable elimination starting from a set that uses the full-eta-range PredictedLxy and
# PredictedLz and leaves out the high-eta / low-eta variants and the RPredicted* variables.
r_full = remove_all_variables(all_events, {'PredictedLxy', \
                                              'CalRatio', 'HadronicLayer1Fraction', 'EnergyDensity', \
                                              'JetEta', 'SumPtOfAllTracks', 'ShowerCenter', \
                                              'JetPt', 'NTracks', 'NumberOfClusters', \
                                              'FirstCellTiming', 'BIBDeltaTimingM', 'JetLat', 'JetWidth', 'FirstClusterRadius', \
                                              'PredictedLz', 'BIBDeltaTimingP', \
                                              'JetLong', 'MaxTrackPt'})

Iteration #1
  Training with variable list:
  {'JetPt', 'JetEta', 'ShowerCenter', 'JetLong', 'BIBDeltaTimingP', 'MaxTrackPt', 'FirstClusterRadius', 'HadronicLayer1Fraction', 'CalRatio', 'SumPtOfAllTracks', 'JetLat', 'EnergyDensity', 'FirstCellTiming', 'BIBDeltaTimingM', 'NumberOfClusters', 'JetWidth', 'NTracks', 'PredictedLxy', 'PredictedLz'}
JetPt                     124.243520
NTracks                   151.788922
ShowerCenter              160.238450
PredictedLxy              167.303601
HadronicLayer1Fraction    168.973488
SumPtOfAllTracks          169.653653
FirstCellTiming           175.155385
JetEta                    199.640914
JetLat                    200.369754
BIBDeltaTimingM           200.471899
None                      200.491215
BIBDeltaTimingP           209.554493
FirstClusterRadius        211.061324
MaxTrackPt                212.019362
EnergyDensity             214.733850
NumberOfClusters          214.933687
PredictedLz               215.276793
JetLong                   

In [11]:
r_full[-1]

Unnamed: 0,BIBBack,BIBEff,BIBSsqrtB,BIBTotalCount,BIBTotalWeight,BIBinBIB,BIBinHSS,BIBinMJ,HSSBack,HSSEff,...,HSSinHSS,HSSinMJ,MJBack,MJEff,MJSsqrtB,MJTotalCount,MJTotalWeight,MJinBIB,MJinHSS,MJinMJ
JetPt,1564366.0,0.958132,51.267801,66925.0,66925.0,64123.0,2567.0,235.0,1508995.0,0.935401,...,147364.0,2542.0,2777.0,0.952175,1157304.0,66589.0,64049880.0,1556731.0,1506428.0,60986720.0
CalRatio,3113406.0,0.965364,36.615226,66925.0,66925.0,64607.0,2033.0,285.0,732428.7,0.892314,...,140576.0,2877.0,3162.0,0.940207,1070930.0,66589.0,64049880.0,3099318.0,730395.7,60220160.0
HadronicLayer1Fraction,2633140.0,0.967127,39.887344,66925.0,66925.0,64725.0,1970.0,230.0,702205.4,0.894529,...,140925.0,3020.0,3250.0,0.948169,1065276.0,66589.0,64049880.0,2619544.0,700235.4,60730100.0
NTracks,2343136.0,0.970893,42.448357,66925.0,66925.0,64977.0,1693.0,255.0,717990.2,0.909573,...,143295.0,2616.0,2871.0,0.952415,1138487.0,66589.0,64049880.0,2331506.0,716297.2,61002070.0
FirstCellTiming,3057008.0,0.963601,36.883942,66925.0,66925.0,64489.0,2173.0,263.0,640838.1,0.865667,...,136378.0,3255.0,3518.0,0.94258,1017861.0,66589.0,64049880.0,3039100.0,638665.1,60372110.0
BIBDeltaTimingM,2518625.0,0.967561,40.802322,66925.0,66925.0,64754.0,1933.0,238.0,662039.3,0.896319,...,141207.0,2984.0,3222.0,0.950579,1072615.0,66589.0,64049880.0,2505275.0,660106.3,60884500.0
PredictedLxy,3192326.0,0.966754,36.211848,66925.0,66925.0,64700.0,1976.0,249.0,566314.0,0.88695,...,139731.0,3401.0,3650.0,0.941573,998218.9,66589.0,64049880.0,3177917.0,564338.0,60307620.0
ShowerCenter,2296682.0,0.966829,42.696019,66925.0,66925.0,64705.0,1978.0,242.0,482103.5,0.894878,...,140980.0,2869.0,3111.0,0.95686,1098795.0,66589.0,64049880.0,2282990.0,480125.5,61286760.0
MaxTrackPt,2898710.0,0.966993,38.010998,66925.0,66925.0,64716.0,1961.0,248.0,444850.4,0.89225,...,140566.0,3197.0,3445.0,0.948043,1034551.0,66589.0,64049880.0,2884932.0,442889.4,60722060.0
JetLat,2568494.0,0.966754,40.370584,66925.0,66925.0,64700.0,1996.0,229.0,397894.0,0.89505,...,141007.0,2951.0,3180.0,0.95393,1083480.0,66589.0,64049880.0,2554911.0,395898.0,61099070.0


In [12]:
r_full[-2]

Unnamed: 0,BIBBack,BIBEff,BIBSsqrtB,BIBTotalCount,BIBTotalWeight,BIBinBIB,BIBinHSS,BIBinMJ,HSSBack,HSSEff,...,HSSinHSS,HSSinMJ,MJBack,MJEff,MJSsqrtB,MJTotalCount,MJTotalWeight,MJinBIB,MJinHSS,MJinMJ
JetPt,1694852.0,0.958879,49.293132,66925.0,66925.0,64173.0,2526.0,226.0,1517354.0,0.934849,...,147277.0,2630.0,2856.0,0.950007,1138586.0,66589.0,64049880.0,1687218.0,1514828.0,60847830.0
CalRatio,3332552.0,0.965738,35.404553,66925.0,66925.0,64632.0,2025.0,268.0,746298.4,0.894758,...,140961.0,2887.0,3155.0,0.936563,1067962.0,66589.0,64049880.0,3318859.0,744273.4,59986740.0
NTracks,2606825.0,0.971281,40.260339,66925.0,66925.0,65003.0,1677.0,245.0,760600.7,0.911902,...,143662.0,2539.0,2784.0,0.947628,1150328.0,66589.0,64049880.0,2595485.0,758923.7,60695470.0
HadronicLayer1Fraction,2334795.0,0.966933,42.350681,66925.0,66925.0,64712.0,1983.0,230.0,707155.9,0.894726,...,140956.0,3021.0,3251.0,0.952749,1070258.0,66589.0,64049880.0,2321231.0,705172.9,61023470.0
BIBDeltaTimingM,1953951.0,0.967008,46.297923,66925.0,66925.0,64717.0,1979.0,229.0,656295.3,0.897017,...,141317.0,3068.0,3297.0,0.959483,1070277.0,66589.0,64049880.0,1940795.0,654316.3,61454770.0
ShowerCenter,2384022.0,0.966858,41.907918,66925.0,66925.0,64707.0,2000.0,218.0,620848.4,0.896744,...,141274.0,2982.0,3200.0,0.953324,1079404.0,66589.0,64049880.0,2370737.0,618848.4,61060290.0
PredictedLxy,2956706.0,0.96656,37.619492,66925.0,66925.0,64687.0,2017.0,221.0,561755.4,0.88888,...,140035.0,3380.0,3601.0,0.945319,1008986.0,66589.0,64049880.0,2942580.0,559738.4,60547560.0
FirstCellTiming,3300950.0,0.963601,35.494917,66925.0,66925.0,64489.0,2188.0,248.0,433295.9,0.870434,...,137129.0,3342.0,3590.0,0.941999,1006981.0,66589.0,64049880.0,3283880.0,431107.9,60334890.0
MaxTrackPt,2878073.0,0.966784,38.138779,66925.0,66925.0,64702.0,1983.0,240.0,426949.2,0.891285,...,140414.0,3290.0,3530.0,0.948646,1022669.0,66589.0,64049880.0,2864236.0,424966.2,60760670.0
BIBDeltaTimingP,1960273.0,0.966769,46.211784,66925.0,66925.0,64701.0,2001.0,223.0,388123.6,0.894313,...,140891.0,3152.0,3375.0,0.963577,1062350.0,66589.0,64049880.0,1946775.0,386122.6,61716980.0
