# Final Training
This notebook contains the final training that is used in the analysis.

## Initialization

In [2]:
from bdt_training_scikit_tools import load_trimmed_sample, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance, get_fraction_of_events
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

## Load data

In [4]:
# Job number of the trimmed sample to load (the loader echoes it as "Job 106:").
# NOTE: the output file names further down embed this same number literally.
JOB_NUMBER = 106
input_events = load_trimmed_sample(JOB_NUMBER)

Job 106:
  BIB: 800000 events
  Multijet: 800000 events
  Signal: 800000 events
  [800000, 800000, 504190]


In [5]:
# Take the 0.25 fraction of the loaded events, then print the number of
# rows left in each of the returned samples.
events_25 = get_fraction_of_events(input_events, 0.25)
print([len(sample.index) for sample in events_25])

[201018, 200123, 126005]


## Training
Default training variables. These were most likely arrived at through the analysis in the Training Variables workbook.

In [7]:
# Variables handed to prep_samples as `training_variable_list`.
# The order is kept exactly as in the original list; downstream column order
# may depend on it.
training_variables = [
    "EnergyDensity",
    "BIBDeltaTimingM",
    "JetPt",
    "HadronicLayer1Fraction",
    "ShowerCenter",
    "JetLat",
    "FirstClusterRadius",
    "JetLong",
    "MaxTrackPt",
    "PredictedLxy",
    "BIBDeltaTimingP",
    "PredictedLz",
    "SumPtOfAllTracks",
]

Run the training to get a BDT back.

In [8]:
%%time
# Split into testing and training samples
train, test = test_train_samples(events_25)

# Prep samples for training
all_events, all_events_class, training_weight, evaluation_weight = prep_samples(train[0], train[1], train[2], training_variable_list=training_variables)

CPU times: user 136 ms, sys: 81.1 ms, total: 217 ms
Wall time: 213 ms


In [None]:
# Run training
# Calls default_training with the outputs of the prep step (all_events,
# training_weight, all_events_class) and keeps the result as `bdt`.
# NOTE(review): this cell carries no execution count in the saved notebook,
# so the `bdt` consumed by the later cells was not produced by a recorded run
# of this cell -- restart the kernel and run all cells to confirm the results.
bdt = default_training(all_events, training_weight, all_events_class)

Calculate the performance of this training.

In [6]:
%%time
# Evaluate the trained BDT on the full loaded sample, using the same variable
# list that was used for training. The returned value is displayed as the
# cell output.
# NOTE(review): `input_events` is the sample that `events_25` (and therefore
# `train`) was drawn from, so these figures presumably include events the BDT
# was trained on -- confirm whether the held-out `test` split should be used.
calc_performance(bdt, input_events, training_variables=training_variables)

CPU times: user 50.8 s, sys: 820 ms, total: 51.7 s
Wall time: 51.4 s


{'BIBBack': 688666572.509355,
 'BIBEff': 0.6489275,
 'BIBSsqrtB': 19.782522318259858,
 'BIBTotalCount': 800000,
 'BIBTotalWeight': 800000.0,
 'BIBinBIB': 519142.0,
 'BIBinHSS': 19031.0,
 'BIBinMJ': 261827.0,
 'HSSBack': 29459870.69691914,
 'HSSEff': 0.98546778000357,
 'HSSSsqrtB': 91.54217757840817,
 'HSSTotalCount': 504190,
 'HSSTotalWeight': 504190.0,
 'HSSinBIB': 3902.0,
 'HSSinHSS': 496863.0,
 'HSSinMJ': 3425.0,
 'MJBack': 265252.0,
 'MJEff': 0.7117782877217131,
 'MJSsqrtB': 3443307.1006705863,
 'MJTotalCount': 800000,
 'MJTotalWeight': 2491496926.202851,
 'MJinBIB': 688662670.509355,
 'MJinHSS': 29440839.69691914,
 'MJinMJ': 1773393415.9965768}

In [12]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained BDT to disk. `dump` returns the list of file names it
# wrote, which is what the cell displays.
joblib.dump(bdt, 'test_106_Grad1000.pkl')

['test_106_Grad1000.pkl']

## Conversion to TMVA format

In [10]:
import mlglue
import mlglue.tree

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Reload the BDT that was saved earlier in this notebook.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# you produced yourself or otherwise trust.
bdt1 = joblib.load('test_106_Grad1000.pkl')

In [11]:
# Wrap the reloaded BDT in mlglue's BDTsklearn, passing the column names of
# the prepared training table and the three class labels.
# NOTE(review): the label order presumably has to match the order in which the
# samples were handed to prep_samples (train[0], train[1], train[2]) -- confirm.
bdtGeneral = mlglue.tree.BDTsklearn(bdt1, list(all_events.columns), ['BIB', 'MJ', 'Signal'])

In [12]:
# Export the wrapped BDT via to_tmva to the given XML file (the TMVA-format
# output this section is about).
bdtGeneral.to_tmva("training_106_Grad1000.xml")