Which variables should be used for training? We compare the performance of the default training with the performance of trainings in which one variable at a time has been removed. If the performance improves when a variable is removed, we know that variable is making the training worse. Further, by looking at the size of the change in performance we can rank the variables to first order.

# Initialization

In [1]:
from bdt_training_scikit_tools import load_trimmed_sample, \
    test_train_samples, prep_samples, default_training, calc_performance, get_fraction_of_events, \
    default_training_variable_list
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

# Load Data Samples

In [2]:
# Load the trimmed samples for job "133"; the output below lists the BIB,
# Multijet and Signal event counts that were read.
all_events_all = load_trimmed_sample("133")

Job 133:
  BIB: 800000 events
  Multijet: 800000 events
  Signal: 800000 events
  [800000, 800000, 473600]


In [3]:
# Reduce every sample to about 200000 events and print the resulting sizes.
# NOTE(review): the printed sizes are close to, but not exactly, 200000 --
# presumably get_fraction_of_events selects events randomly; confirm.
all_events = get_fraction_of_events(all_events_all, 200000)
print ([len(e.index) for e in all_events])

[200387, 199675, 196736]


# Determine Full Variable List
We have to decide which variables to start with before we begin removing variables according to their effect on performance. Reasons for excluding a variable up front:

- Weight variables are MC only things
- Variables that have nothing to do with physics (like run #)
- DR to closest track because it is used later in the analysis
- JetPhi because physics *shouldn't* depend on that

In [4]:
# Show every column available in the first of the loaded samples.
all_events_all[0].columns

Index(['RunNumber', 'Weight', 'WeightMCEvent', 'WeightXSection',
       'WeightFlatten', 'mc_Lxy', 'mc_Lz', 'MHTOverHT', 'JetPt', 'JetPhi',
       'CalRatio', 'JetEta', 'NTracks', 'SumPtOfAllTracks', 'MaxTrackPt',
       'EventNumber', 'JetET', 'JetWidth', 'JetDRTo2GeVTrack', 'EnergyDensity',
       'HadronicLayer1Fraction', 'JetLat', 'JetLong', 'FirstClusterRadius',
       'NumberOfClusters', 'ShowerCenter', 'BIBDeltaTimingM',
       'BIBDeltaTimingP', 'FirstCellTiming', 'InteractionsPerCrossing',
       'RPredictedLxy', 'RPredictedLz', 'PredictedLxy', 'PredictedLz',
       'PredictedLxyHighEta', 'PredictedLxyLowEta', 'PredictedLzHighEta',
       'PredictedLzLowEta'],
      dtype='object')

In [5]:
# Starting variables = every column of the first sample except the ones
# listed here.
variable_list = set(all_events_all[0].columns) - {
    # Run number: nothing to do with physics.
    'RunNumber',
    # Weight variables: MC-only quantities.
    'Weight',
    'WeightMCEvent',
    'WeightXSection',
    'WeightFlatten',
    # NOTE(review): no reason is given in the notes for these three.
    'mc_Lxy',
    'mc_Lz',
    'MHTOverHT',
    # Physics should not depend on the jet phi.
    'JetPhi',
    # NOTE(review): no reason is given in the notes for these two.
    'EventNumber',
    'InteractionsPerCrossing',
    # DR to the closest track: used later in the analysis.
    'JetDRTo2GeVTrack',
    # NOTE(review): no reason is given in the notes for this one.
    'JetET',
}
variable_list

{'BIBDeltaTimingM',
 'BIBDeltaTimingP',
 'CalRatio',
 'EnergyDensity',
 'FirstCellTiming',
 'FirstClusterRadius',
 'HadronicLayer1Fraction',
 'JetEta',
 'JetLat',
 'JetLong',
 'JetPt',
 'JetWidth',
 'MaxTrackPt',
 'NTracks',
 'NumberOfClusters',
 'PredictedLxy',
 'PredictedLxyHighEta',
 'PredictedLxyLowEta',
 'PredictedLz',
 'PredictedLzHighEta',
 'PredictedLzLowEta',
 'RPredictedLxy',
 'RPredictedLz',
 'ShowerCenter',
 'SumPtOfAllTracks'}

# Drop First Variable

In [6]:
%%writefile get_training_performance.py
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
    
def do_training(vlist):
    '''Unpack an (all_events, training_list) pair and run one training on it.

    Everything arrives in a single argument, so the function can be handed
    directly to a map-style call. Returns the result of
    get_training_performance unchanged.
    '''
    events, variables = vlist
    outcome = get_training_performance(events, variables)
    return outcome
    
def get_training_performance(all_events, training_list):
    '''Run one training with the given variables and return its performance.

    Arguments:
        all_events - the samples; they are split by test_train_samples
        training_list - the variables to train on

    Returns:
        A one-entry dict: tuple(training_list) mapped to the result of
        calc_performance evaluated on the test part of the split.
    '''
    # Separate the events into a training part and a testing part.
    train, test = test_train_samples(all_events)

    # Build the training inputs from the three training samples. The fourth
    # value returned by prep_samples is not needed here.
    prepared = prep_samples(train[0], train[1], train[2], training_variable_list=training_list)
    train_events, train_classes, train_weights, _ = prepared

    # Train with 400 estimators.
    bdt = default_training(train_events, train_weights, train_classes, estimators=400)

    # Key the result by the variables used so results from many trainings
    # can be merged into one dict.
    key = tuple(training_list)
    performance = calc_performance(bdt, test, training_variables = training_list)
    return {key: performance}

Overwriting get_training_performance.py


In [7]:
def all_but_one (vlist, number_to_drop = 1):
    '''Return every version of vlist with number_to_drop items removed, plus vlist itself.

    Arguments:
        vlist - the source variables; any iterable (list, tuple, set, generator, ...)
        number_to_drop - how many items are left out of each reduced tuple

    Returns:
        A list of tuples: all combinations of len(vlist)-number_to_drop items,
        followed by one tuple that holds every item of vlist. With
        number_to_drop=0 the full tuple therefore appears twice.

    Raises:
        ValueError - if number_to_drop is larger than the number of items
        (itertools.combinations rejects a negative length).
    '''
    # Take one snapshot of the input. A one-shot iterable has no len() and
    # would be exhausted after the first pass, and a single snapshot guarantees
    # that the reduced tuples and the full tuple list the items in one order.
    items = tuple(vlist)
    reduced = list(itertools.combinations(items, len(items) - number_to_drop))
    return reduced + [items]

In [8]:
# Import the module written by the %%writefile cell so that the worker
# function can be referenced as get_training_performance.do_training.
import get_training_performance
# Pool of 10 worker processes used by calc_var_removal. It is created once
# here and is not closed anywhere in the visible part of this notebook.
pool = mp.Pool(processes=10)

def unused_var(original_list, used_list):
    '''Return the items of original_list that do not appear in used_list.

    The result is a tuple in the iteration order of original_list. When
    nothing is missing, ('None',) is returned instead of an empty tuple.
    '''
    missing = [name for name in original_list if name not in used_list]
    if missing:
        return tuple(missing)
    return ('None',)

def calc_var_removal(all_events, training_list = default_training_variable_list):
    '''Train with each variable left out in turn and tabulate the performance.

    One training is run in the worker pool for every variable list produced
    by all_but_one(training_list), i.e. each list with one variable removed
    plus the complete list.

    Arguments:
        all_events - the samples handed to every training
        training_list - the variables to start from

    Returns:
        A DataFrame with one row per training. Each row is labelled with the
        variable(s) that were left out of that training, as given by
        unused_var (('None',) for the training that used every variable).
    '''
    jobs = [(all_events, candidate) for candidate in all_but_one(training_list)]
    per_training = pool.map(get_training_performance.do_training, jobs)

    # Every training returns a one-entry dict; fold them into a single dict
    # keyed by the variables that were used.
    by_used = {}
    for outcome in per_training:
        by_used.update(outcome)

    # Re-key by the variables that were NOT used, then transpose so that each
    # training becomes a row.
    by_dropped = {unused_var(training_list, used): table for used, table in by_used.items()}
    return pd.DataFrame(by_dropped).T

In [9]:
def remove_all_variables(all_events, training_list = variable_list):
    '''Iteratively remove the variable whose removal scores the highest HSSSsqrtB.

    Every iteration runs calc_var_removal on the current variables, sorts the
    result by the "HSSSsqrtB" column in ascending order and looks at the last
    row. If that row is the 'None' entry (no variable removed), the loop ends;
    otherwise the variable named by that row is removed and the loop repeats.

    Arguments:
        all_events - the samples handed to calc_var_removal
        training_list - the starting variables; must support set subtraction

    Returns:
        A list holding the sorted result table of each iteration, in order.
    '''
    history = []
    iteration = 0
    while True:
        iteration += 1
        print ("Iteration #{0}".format(iteration))
        print ('  Training with variable list:')
        print ('  ' + str(training_list))

        ranked = calc_var_removal(all_events, training_list=training_list).sort_values("HSSSsqrtB")
        history.append(ranked)
        print (ranked.HSSSsqrtB)

        # Row labels are the tuples built by unused_var; the first element is
        # the name of the removed variable (or 'None').
        best_to_drop = ranked.index[-1][0]
        if best_to_drop == 'None':
            break
        training_list = training_list - {best_to_drop}
    return history

In [10]:
%%time
#r_drop_first = calc_var_removal(all_events)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.29 µs


In [11]:
#print (r_drop_first.sort_values("HSSSsqrtB").HSSSsqrtB)

In [12]:
%%time
# Run the iterative elimination starting from the full variable list; r holds
# one sorted result table per iteration.
r = remove_all_variables(all_events, variable_list)

Iteration #1
  Training with variable list:
  {'NumberOfClusters', 'PredictedLz', 'JetLong', 'JetLat', 'FirstCellTiming', 'BIBDeltaTimingM', 'ShowerCenter', 'NTracks', 'JetEta', 'PredictedLzHighEta', 'PredictedLxyHighEta', 'FirstClusterRadius', 'SumPtOfAllTracks', 'JetPt', 'PredictedLxy', 'RPredictedLz', 'BIBDeltaTimingP', 'CalRatio', 'JetWidth', 'MaxTrackPt', 'HadronicLayer1Fraction', 'PredictedLxyLowEta', 'PredictedLzLowEta', 'EnergyDensity', 'RPredictedLxy'}
JetPt                     154.983054
FirstCellTiming           163.264202
NTracks                   167.000372
JetWidth                  169.560500
CalRatio                  173.497509
PredictedLxyLowEta        176.480392
SumPtOfAllTracks          177.051209
PredictedLxy              179.295146
HadronicLayer1Fraction    179.843643
MaxTrackPt                209.509431
RPredictedLxy             210.916151
RPredictedLz              213.121358
BIBDeltaTimingP           216.279491
JetLat                    218.425479
ShowerCenter    

In [13]:
# Number of elimination iterations that were run.
len(r)

5

# Look at high $\eta$ and low $\eta$ list

In [14]:
%%time
# Same elimination, starting from the set that uses the separate low-eta and
# high-eta predicted Lxy/Lz variables.
r_low_high = remove_all_variables(
    all_events,
    {
        'PredictedLxyLowEta',
        'CalRatio',
        'HadronicLayer1Fraction',
        'EnergyDensity',
        'JetEta',
        'SumPtOfAllTracks',
        'ShowerCenter',
        'JetPt',
        'NTracks',
        'NumberOfClusters',
        'FirstCellTiming',
        'BIBDeltaTimingM',
        'JetLat',
        'JetWidth',
        'FirstClusterRadius',
        'PredictedLzLowEta',
        'BIBDeltaTimingP',
        'PredictedLzHighEta',
        'PredictedLxyHighEta',
        'JetLong',
        'MaxTrackPt',
    },
)

Iteration #1
  Training with variable list:
  {'NumberOfClusters', 'JetLong', 'JetLat', 'FirstCellTiming', 'BIBDeltaTimingM', 'ShowerCenter', 'NTracks', 'JetEta', 'PredictedLzHighEta', 'PredictedLxyHighEta', 'FirstClusterRadius', 'SumPtOfAllTracks', 'JetPt', 'BIBDeltaTimingP', 'CalRatio', 'JetWidth', 'MaxTrackPt', 'HadronicLayer1Fraction', 'PredictedLxyLowEta', 'PredictedLzLowEta', 'EnergyDensity'}
JetPt                     125.881300
FirstCellTiming           138.533159
NTracks                   156.256465
ShowerCenter              157.751995
JetWidth                  160.882288
HadronicLayer1Fraction    161.234574
MaxTrackPt                162.306230
PredictedLxyHighEta       162.335597
JetLat                    163.699608
PredictedLzHighEta        163.756510
SumPtOfAllTracks          163.868787
BIBDeltaTimingM           164.477833
EnergyDensity             165.401708
PredictedLzLowEta         168.538381
FirstClusterRadius        169.387613
NumberOfClusters          169.417190
JetLon

In [15]:
# Result table of the final iteration of the low/high eta run.
r_low_high[-1]

Unnamed: 0,BIBBack,BIBEff,BIBSsqrtB,BIBTotalCount,BIBTotalWeight,BIBinBIB,BIBinHSS,BIBinMJ,HSSBack,HSSEff,...,HSSinHSS,HSSinMJ,MJBack,MJEff,MJSsqrtB,MJTotalCount,MJTotalWeight,MJinBIB,MJinHSS,MJinMJ
JetPt,3650411.0,0.961375,33.6752,66925.0,66925.0,64340.0,2357.0,228.0,1275672.0,0.941901,...,148388.0,2017.0,2245.0,0.923238,1248027.0,66589.0,64049880.0,3643275.0,1273315.0,59133290.0
FirstCellTiming,4088568.0,0.945028,31.278611,66925.0,66925.0,63246.0,3414.0,265.0,586110.2,0.665027,...,104769.0,2112.0,2377.0,0.927859,1218950.0,66589.0,64049880.0,4037908.0,582696.2,59429270.0
MaxTrackPt,3900876.0,0.968293,32.810589,66925.0,66925.0,64803.0,1878.0,244.0,767260.5,0.907986,...,143045.0,2752.0,2996.0,0.92733,1085130.0,66589.0,64049880.0,3889132.0,765382.5,59395360.0
PredictedLxyLowEta,3522038.0,0.968383,34.533318,66925.0,66925.0,64809.0,1870.0,246.0,729745.5,0.898109,...,141489.0,2644.0,2890.0,0.933856,1112626.0,66589.0,64049880.0,3508630.0,727875.5,59813370.0
PredictedLxyHighEta,3971642.0,0.967979,32.506431,66925.0,66925.0,64782.0,1893.0,250.0,694555.1,0.905396,...,142637.0,2747.0,2997.0,0.927367,1084992.0,66589.0,64049880.0,3959485.0,692662.1,59397730.0
PredictedLzHighEta,3871927.0,0.968846,32.951824,66925.0,66925.0,64840.0,1828.0,257.0,666625.4,0.905542,...,142660.0,2746.0,3003.0,0.929358,1086235.0,66589.0,64049880.0,3859792.0,664797.4,59525290.0
NumberOfClusters,3973148.0,0.969399,32.54793,66925.0,66925.0,64877.0,1810.0,238.0,633601.0,0.908494,...,143125.0,2738.0,2976.0,0.928286,1089893.0,66589.0,64049880.0,3961470.0,631791.0,59456620.0
NTracks,4425954.0,0.972043,30.922213,66925.0,66925.0,65054.0,1653.0,218.0,639838.2,0.921106,...,145112.0,2090.0,2308.0,0.921096,1228019.0,66589.0,64049880.0,4415615.0,638185.2,58996080.0
SumPtOfAllTracks,3768041.0,0.969234,33.416374,66925.0,66925.0,64866.0,1817.0,242.0,569667.5,0.908145,...,143070.0,2899.0,3141.0,0.932485,1065679.0,66589.0,64049880.0,3756469.0,567850.5,59725560.0
HadronicLayer1Fraction,3432266.0,0.966769,34.923721,66925.0,66925.0,64701.0,1966.0,258.0,503534.6,0.900972,...,141940.0,2734.0,2992.0,0.938783,1099265.0,66589.0,64049880.0,3419399.0,501568.6,60128910.0


# Look at full $\eta$ range predictions

In [16]:
# Same elimination, starting from the set that uses the full-eta-range
# PredictedLxy and PredictedLz variables.
r_full = remove_all_variables(
    all_events,
    {
        'PredictedLxy',
        'CalRatio',
        'HadronicLayer1Fraction',
        'EnergyDensity',
        'JetEta',
        'SumPtOfAllTracks',
        'ShowerCenter',
        'JetPt',
        'NTracks',
        'NumberOfClusters',
        'FirstCellTiming',
        'BIBDeltaTimingM',
        'JetLat',
        'JetWidth',
        'FirstClusterRadius',
        'PredictedLz',
        'BIBDeltaTimingP',
        'JetLong',
        'MaxTrackPt',
    },
)

Iteration #1
  Training with variable list:
  {'FirstClusterRadius', 'CalRatio', 'JetWidth', 'NumberOfClusters', 'PredictedLz', 'PredictedLxy', 'HadronicLayer1Fraction', 'MaxTrackPt', 'JetLong', 'JetLat', 'SumPtOfAllTracks', 'JetPt', 'FirstCellTiming', 'BIBDeltaTimingM', 'ShowerCenter', 'NTracks', 'EnergyDensity', 'JetEta', 'BIBDeltaTimingP'}
NTracks                   133.338624
JetPt                     137.131194
BIBDeltaTimingP           164.614306
JetEta                    165.096024
HadronicLayer1Fraction    165.927582
NumberOfClusters          166.092621
JetLat                    166.102498
SumPtOfAllTracks          167.442485
MaxTrackPt                167.628694
ShowerCenter              168.108997
BIBDeltaTimingM           169.284440
JetWidth                  170.267896
EnergyDensity             170.319773
PredictedLz               170.510132
None                      171.448484
FirstClusterRadius        172.010419
FirstCellTiming           173.718744
JetLong                   

In [17]:
# Result table of the final iteration of the full eta range run.
r_full[-1]

Unnamed: 0,BIBBack,BIBEff,BIBSsqrtB,BIBTotalCount,BIBTotalWeight,BIBinBIB,BIBinHSS,BIBinMJ,HSSBack,HSSEff,...,HSSinHSS,HSSinMJ,MJBack,MJEff,MJSsqrtB,MJTotalCount,MJTotalWeight,MJinBIB,MJinHSS,MJinMJ
JetPt,3475050.0,0.951498,34.159831,66925.0,66925.0,63679.0,2989.0,257.0,1183978.0,0.934493,...,147221.0,2605.0,2862.0,0.927426,1110358.0,66589.0,64049880.0,3467335.0,1180989.0,59401550.0
NTracks,4291913.0,0.968368,31.282622,66925.0,66925.0,64808.0,1854.0,263.0,560995.9,0.896636,...,141257.0,2095.0,2358.0,0.924483,1219397.0,66589.0,64049880.0,4277724.0,559141.9,59213010.0
PredictedLz,4562992.0,0.962555,30.157069,66925.0,66925.0,64419.0,2229.0,277.0,474068.9,0.876464,...,138079.0,2398.0,2675.0,0.921658,1141370.0,66589.0,64049880.0,4545928.0,471839.9,59032110.0
BIBDeltaTimingP,4650886.0,0.960837,29.817424,66925.0,66925.0,64304.0,2355.0,266.0,259027.0,0.801049,...,126198.0,2377.0,2643.0,0.923831,1150966.0,66589.0,64049880.0,4621920.0,256672.0,59171280.0
PredictedLxy,4823888.0,0.961121,29.286513,66925.0,66925.0,64323.0,2373.0,229.0,247577.3,0.863445,...,136028.0,2440.0,2669.0,0.921155,1142028.0,66589.0,64049880.0,4804815.0,245204.3,58999860.0
JetLat,4263511.0,0.96375,31.236996,66925.0,66925.0,64499.0,2163.0,263.0,230060.9,0.875086,...,137862.0,2515.0,2778.0,0.930144,1130323.0,66589.0,64049880.0,4246347.0,227897.9,59575630.0
BIBDeltaTimingM,5100670.0,0.959597,28.435668,66925.0,66925.0,64221.0,2423.0,281.0,195250.9,0.813928,...,128227.0,2321.0,2602.0,0.917775,1152394.0,66589.0,64049880.0,5073677.0,192827.9,58783370.0
MaxTrackPt,4109071.0,0.963437,31.808248,66925.0,66925.0,64478.0,2202.0,245.0,179983.0,0.867907,...,136731.0,2769.0,3014.0,0.933352,1088910.0,66589.0,64049880.0,4091030.0,177781.0,59781070.0
JetWidth,4075474.0,0.963526,31.942058,66925.0,66925.0,64484.0,2173.0,268.0,173267.0,0.872998,...,137533.0,2479.0,2747.0,0.933973,1141361.0,66589.0,64049880.0,4057945.0,171094.0,59820840.0
JetLong,4311835.0,0.963482,31.052797,66925.0,66925.0,64481.0,2180.0,264.0,168537.2,0.875867,...,137985.0,2422.0,2686.0,0.93035,1149772.0,66589.0,64049880.0,4294701.0,166357.2,59588820.0


In [18]:
# Result table of the second-to-last iteration of the full eta range run.
r_full[-2]

Unnamed: 0,BIBBack,BIBEff,BIBSsqrtB,BIBTotalCount,BIBTotalWeight,BIBinBIB,BIBinHSS,BIBinMJ,HSSBack,HSSEff,...,HSSinHSS,HSSinMJ,MJBack,MJEff,MJSsqrtB,MJTotalCount,MJTotalWeight,MJinBIB,MJinHSS,MJinMJ
JetPt,2455247.0,0.953814,40.738445,66925.0,66925.0,63834.0,2850.0,241.0,1339510.0,0.9345,...,147222.0,2622.0,2863.0,0.940918,1126314.0,66589.0,64049880.0,2447550.0,1336660.0,60265670.0
NTracks,4266382.0,0.970026,31.429823,66925.0,66925.0,64919.0,1771.0,235.0,609243.2,0.908252,...,143087.0,2347.0,2582.0,0.924094,1164814.0,66589.0,64049880.0,4254275.0,607472.2,59188130.0
BIBDeltaTimingM,4034996.0,0.963407,32.097895,66925.0,66925.0,64476.0,2196.0,253.0,410662.7,0.82979,...,130726.0,2556.0,2809.0,0.931004,1125107.0,66589.0,64049880.0,4010737.0,408466.7,59630670.0
MaxTrackPt,4234284.0,0.964318,31.363086,66925.0,66925.0,64537.0,2160.0,228.0,326086.3,0.871551,...,137305.0,3203.0,3431.0,0.929099,1015945.0,66589.0,64049880.0,4217251.0,323926.3,59508700.0
BIBDeltaTimingP,4058382.0,0.962884,31.987904,66925.0,66925.0,64441.0,2230.0,254.0,269116.9,0.813477,...,128156.0,2655.0,2909.0,0.932888,1107837.0,66589.0,64049880.0,4031652.0,266886.9,59751340.0
HadronicLayer1Fraction,4159082.0,0.963332,31.612997,66925.0,66925.0,64471.0,2209.0,245.0,273921.3,0.880952,...,138786.0,2844.0,3089.0,0.931071,1072981.0,66589.0,64049880.0,4143171.0,271712.3,59634990.0
PredictedLz,3770046.0,0.963765,33.218986,66925.0,66925.0,64500.0,2182.0,243.0,274281.9,0.881847,...,138927.0,2802.0,3045.0,0.937138,1087747.0,66589.0,64049880.0,3754234.0,272099.9,60023540.0
JetLat,3841950.0,0.964811,32.942379,66925.0,66925.0,64570.0,2118.0,237.0,259225.0,0.880837,...,138768.0,2912.0,3149.0,0.93625,1068621.0,66589.0,64049880.0,3826089.0,257107.0,59966680.0
PredictedLxy,4159097.0,0.963377,31.614409,66925.0,66925.0,64474.0,2240.0,211.0,243838.1,0.872992,...,137532.0,2881.0,3092.0,0.93156,1073024.0,66589.0,64049880.0,4141969.0,241598.1,59666310.0
JetWidth,3875000.0,0.964273,32.783305,66925.0,66925.0,64534.0,2149.0,242.0,246057.1,0.882037,...,138957.0,2787.0,3029.0,0.935939,1089221.0,66589.0,64049880.0,3859203.0,243908.1,59946770.0
