# Final Training
This notebook contains the final training that is used in the analysis.

## Initialization

In [1]:
from bdt_training_scikit_tools import load_default_samples, default_training_variable_list, \
    test_train_samples, prep_samples, default_training, calc_performance
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

In [2]:
# Thresholds consumed by trim_sample. The mc_Lxy / mc_Lz columns are multiplied
# by 1000 before being compared against these values (presumably a unit
# conversion -- TODO confirm the units of the input columns).
default_cut_Lxy = 1250
default_cut_Lz = 3500
# |JetEta| boundary deciding which of the two cuts applies to a row.
eta_seperator_cut = 1.4


def trim_sample(sample, cut_Lxy = default_cut_Lxy, cut_Lz = default_cut_Lz):
    '''Select the rows of a sample that pass the eta-dependent Lxy / Lz cut.

    A row is kept when either
      * |JetEta| >  eta_seperator_cut and mc_Lz*1000  > cut_Lz, or
      * |JetEta| <= eta_seperator_cut and mc_Lxy*1000 > cut_Lxy.

    Arguments:
      sample  - table exposing JetEta, mc_Lz and mc_Lxy columns as attributes
      cut_Lxy - threshold applied to mc_Lxy*1000 for rows at or below the eta boundary
      cut_Lz  - threshold applied to mc_Lz*1000 for rows above the eta boundary

    Returns:
      sample indexed by the resulting boolean mask.
    '''
    abs_eta = abs(sample.JetEta)

    # Two complementary eta regions; both comparisons are spelled out so that
    # rows with an undefined eta fall into neither region.
    above_boundary = abs_eta > eta_seperator_cut
    at_or_below_boundary = abs_eta <= eta_seperator_cut

    lz_passes = sample.mc_Lz*1000 > cut_Lz
    lxy_passes = sample.mc_Lxy*1000 > cut_Lxy

    keep = (above_boundary & lz_passes) | (at_or_below_boundary & lxy_passes)
    return sample[keep]

def trim_samples(all_events):
    '''Apply the default Lxy / Lz trimming to the third sample of a triple.

    Arguments:
      all_events - indexable holding three samples; only entry 2 (the signal
                   sample, per the loader output) is trimmed

    Returns:
      A 3-tuple: entries 0 and 1 passed through unchanged, followed by
      trim_sample(entry 2) using the default cuts.
    '''
    untouched_first = all_events[0]
    untouched_second = all_events[1]
    trimmed_third = trim_sample(all_events[2])
    return (untouched_first, untouched_second, trimmed_third)

def load_trimmed_sample(jobNo):
    '''Load the default samples for a job and trim them.

    Prints the job number, loads the samples with load_default_samples,
    passes them through trim_samples, and prints the number of rows left
    in each sample.

    Arguments:
      jobNo - job identifier forwarded to load_default_samples

    Returns:
      The trimmed samples as returned by trim_samples.
    '''
    print ('Job {0}:'.format(jobNo))

    # Load first, then trim; the untrimmed samples are not kept around.
    trimmed_events = trim_samples(load_default_samples(jobNo))

    row_counts = []
    for events in trimmed_events:
        row_counts.append(len(events.index))
    print (" ", row_counts)

    return trimmed_events

## Load data

In [3]:
# Job 106 provides the samples used for the final training.
input_events = load_trimmed_sample(jobNo=106)

Job 106:
  BIB: 800000 events
  Multijet: 800000 events
  Signal: 800000 events
  [800000, 800000, 504190]


## Training
Default training variables. These were likely arrived at through the analysis in the Training Variables workbook.

In [4]:
# Input variables handed to the BDT, one name per line.
training_variables = [
    'EnergyDensity',
    'BIBDeltaTimingM',
    'JetPt',
    'HadronicLayer1Fraction',
    'ShowerCenter',
    'JetLat',
    'FirstClusterRadius',
    'JetLong',
    'MaxTrackPt',
    'PredictedLxy',
    'BIBDeltaTimingP',
    'PredictedLz',
    'SumPtOfAllTracks',
]

Run the training to get a BDT back.

In [5]:
%%time
# Split into testing and training samples
train, test = test_train_samples(input_events)

# Prep samples for training
all_events, all_events_class, training_weight, evaluation_weight = prep_samples(train[0], train[1], train[2], training_variable_list=training_variables)

# Run training
bdt = default_training(all_events, training_weight, all_events_class)

Wall time: 8min 59s


Calculate the performance for this training

In [6]:
%%time
calc_performance(bdt, test, training_variables=training_variables)

Wall time: 4.04 s


{'BIBBack': 246129395.84977883,
 'BIBEff': 0.6123308439553035,
 'BIBSsqrtB': 10.42641929933816,
 'BIBTotalCount': 267135,
 'BIBTotalWeight': 267135.0,
 'BIBinBIB': 163575.0,
 'BIBinHSS': 9913.0,
 'BIBinMJ': 93647.0,
 'HSSBack': 13954251.963751648,
 'HSSEff': 0.9719989752816486,
 'HSSSsqrtB': 43.67535662015565,
 'HSSTotalCount': 167851,
 'HSSTotalWeight': 167851.0,
 'HSSinBIB': 3158.0,
 'HSSinHSS': 163151.0,
 'HSSinMJ': 1542.0,
 'MJBack': 95189.0,
 'MJEff': 0.6869007784258705,
 'MJSsqrtB': 1849310.6015782051,
 'MJTotalCount': 266785,
 'MJTotalWeight': 830633099.3287433,
 'MJinBIB': 246126237.84977883,
 'MJinHSS': 13944338.963751648,
 'MJinMJ': 570562522.5152072}

## Conversion to TMVA format

In [18]:
import mlglue

In [28]:
# Wrap the trained scikit-learn BDT in mlglue's generic tree representation.
bdtGeneral = mlglue.tree.BDTsklearn(
    bdt,
    list(all_events.columns),      # input variable names, in training-column order
    ['BIB', 'MJ', 'Signal'],       # class labels handed to mlglue
)
# The classifier separates three classes, so mark the model as multiclass.
bdtGeneral.kind = 'multiclass'

In [29]:
# Export the converted BDT as a TMVA XML file. The path is relative, so the
# file presumably lands in the kernel's working directory; the "106" in the
# name matches the job number passed to load_trimmed_sample.
bdtGeneral.to_tmva("training_106.xml")

In [9]:
import importlib

In [21]:
# NOTE(review): development aid -- re-imports mlglue.tree so edits to the
# installed module are picked up without restarting the kernel. The execution
# counts show this cell (In [21]) ran before the conversion cells above
# (In [28], In [29]); it is presumably not needed for a fresh top-to-bottom
# run -- confirm and consider removing.
importlib.reload(mlglue.tree)

<module 'mlglue.tree' from 'K:\\Anaconda3\\lib\\site-packages\\mlglue-0.24-py3.6.egg\\mlglue\\tree.py'>