# Building Bonsai BDT for Online Triggering

* Step 1 : Pull out features for classification.  These include energy deposit on wire, energy deposit of left neighbour, energy deposit of right neighbour, and layer id (radial distance).
* Step 2 : Bin these features into 11 bins, 10 with deposits, one with no deposits
* Step 3 : Train the GBDT to recognize hit wires based on these binned energy features and layer_id
* Step 4 : Build a look up table of dimention [11,11,11,18], so it has an entry for each possible combination of features
* Step 5 : Fill the table with the output of the wire GBDT
* Step 6 : Define layer features based on this output
* Step 7 : Pass layer features through event GBDT
* Step 8 : Determine signal or background based on this output

In [1]:
%pylab inline
%run visualizations.ipynb
import sys
sys.path.insert(0, '../modules')

Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


In [2]:
from tracking import HoughSpace
from scipy import sparse
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.ticker as mtick
from hits import CyDetHits, CTHHits, CDCHits

In [3]:
def bin_energies(deposits, edges=[5,15,20,25,30,35], zero_bin=True):
    deposits_f = deposits.flatten()
    edges = numpy.percentile(deposits_f[deposits_f > 0],edges)
    new_deps = numpy.searchsorted(edges,deposits)
    if zero_bin:
        new_deps = new_deps + 1
        new_deps[deposits == 0] = 0
    return new_deps

In [4]:
def binary_binning(deposits, threshold=0.000005):
    new_deps = deposits < threshold
    new_deps[deposits == 0] = 0
    return new_deps.astype(int)

In [5]:
def plot_rocs(labels, predictions, zoom=False):
    predicts = predictions.keys()
    fig = plt.figure(1)
    axs = [fig.add_subplot(111)]*len(predicts)

    for pred, ax in zip(predicts,axs):
        fpr, tpr, values =  roc_curve(labels, predictions[pred][:,1])
        fpr *= 100.
        tpr *= 100.
        ax.xaxis.tick_top()
        ax.set_xlabel('Signal Retention Efficiency', fontsize=15)
        ax.xaxis.set_label_position('top') 
        ax.set_ylabel('Background Rejection Efficiency', fontsize=15)    
        ax.plot(tpr, 100-fpr, label=pred)
        ax.grid(b=True, which='minor', color='grey', linestyle=':')
        ax.grid(b=True, which='major', color='grey', linestyle='--')
        
        
        fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
        xticks = mtick.FormatStrFormatter(fmt)
        ax.xaxis.set_major_formatter(xticks)
        ax.yaxis.set_major_formatter(xticks)
        
        ax.set_axisbelow(True)
        if zoom:
            ax.set_xlim([70,100])
            ax.set_ylim([70,100])
            #ax.set_xticklabels(["98%","98.5%","99%","99.5%","100%"])
        ax.minorticks_on()
        leg = ax.legend(loc=0, frameon=1, fontsize=15)
    frame = leg.get_frame()
    frame.set_facecolor('white')

# Signal Data

In [15]:
sig_cydet = CyDetHits('../data/151209_signal_tracks.root', signal_coding=[1], hit_type_name="tid")
sig_cth = CTHHits('../data/151209_signal_tracks.root', signal_coding=[1], hit_type_name="tid")
sig = CDCHits(sig_cydet, sig_cth)

from cylinder import CyDet
geom = CyDet()
n_events = sig_cth.n_events
print "Start Events: {}".format(sig.cth.n_events)

Start Events: 79587


In [7]:
# Define cuts to use
n_hits_cut = False
layer_cut = False
timing_cut = False
use_cth_cut = False

In [8]:
if timing_cut:
    # Apply the timing filter
    print "CTH Hits: {}".format(sig.cth.n_hits)
    print "CyDet Hits: {}".format(sig.cydet.n_hits)
    sig.cth.trim_hits(variable=sig.cth.time_name, less_than=1100, greater_than=700)
    sig.cydet.trim_hits(variable=sig.cydet.time_name, less_than=1620, greater_than=700)
    print "CTH Hits: {}".format(sig.cth.n_hits)
    print "CyDet Hits: {}".format(sig.cydet.n_hits)

In [9]:
# Filter for CTH in time window.  Get the time
trigger_events = []
for evt in range(sig.n_events):
    sig_hits = sig.cth.get_signal_hits(evt)
    if len(sig_hits) != 0:
        trigger_events.append(evt)
trigger_events = np.array(trigger_events)
print "Events that pass CTH Trigger: {}".format(len(trigger_events))

Events that pass CTH Trigger: 22091


In [10]:
# Filter for max layer is five
max_layer = []
for evt in range(sig.n_events):
    these_layers = sig.cydet.geom.point_layers[sig.cydet.get_sig_wires(evt)]
    if len(sig.cydet.get_sig_wires(evt)) != 0:
        max_layer.append(np.max(these_layers))
    else:
        max_layer.append(-1)
max_layer = np.array(max_layer)
good_max_layer = np.where(max_layer >= 4)[0]
print "Number that pass layer cut: {}".format(len(good_max_layer))

Number that pass layer cut: 42646


In [11]:
# Filter for number of signal hits
n_signal_hits  = np.array([len(sig.cydet.get_signal_hits(evt)) 
                         for evt in range(sig.n_events)])
n_signal_hits = np.array(n_signal_hits)
good_n_hits = np.where(n_signal_hits > 30)[0]
print "Number that pass n hits cut: {}".format(len(good_n_hits))

Number that pass n hits cut: 45145


In [12]:
# Apply quality and CTH cuts
print "Start Events: {}".format(sig.n_events)
if use_cth_cut:
    sig.trim_events(trigger_events)
    print "After CTH Trigger {}".format(sig.n_events)
if layer_cut:
    sig.trim_events(good_max_layer)
    print "After Layer Cut Trigger {}".format(sig.n_events)
if n_hits_cut:
    sig.trim_events(good_events)
    print "Afterb N Hits Cut Trigger {}".format(sig.n_events)

Start Events: 79587


# Background Data

In [26]:
hits_cydet = CyDetHits('../data/151208_SimChen_noise.root', signal_coding=[1])
hits_cth = CTHHits('../data/151208_SimChen_noise.root', signal_coding=[1])
hits = CDCHits(hits_cydet, hits_cth)

n_events = min(sig.n_events, hits.n_events)

In [27]:
# Set the trigger times
trig_time_evt = []
for evt in range(n_events):
    sig_hits = sig.cth.get_signal_hits(evt)
    if len(sig_hits) != 0:
        this_trig_time = sig_hits[sig.cth.time_name][0]
        trig_time_evt.append(this_trig_time)
        sig.cydet.data[sig.cydet.trig_name][sig.cydet.event_to_hits[evt]] = this_trig_time
        hits.cydet.data[hits.cydet.trig_name][hits.cydet.event_to_hits[evt]] = this_trig_time

In [28]:
figsize(8,8)
font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 15}

matplotlib.rc('font', **font)

## Define Training Sample For Track Finding

In [37]:
from sklearn.cross_validation import train_test_split

In [None]:
# Trim to the 

In [38]:
# Quality tracks
quality_tracks_set = np.intersect1d(good_max_layer, good_n_hits)
quality_tracks_set = np.intersect1d(best_tracks_set, trigger_events)

# Trigger events without quality
bad_tracks_set = np.setdiff1d(trigger_events, quality_tracks_set)

In [42]:
# Splt the quality tracks sample for hit filter
remain_quality_tracks, wire_train =\
    train_test_split(quality_tracks_set, train_size=0.30, test_size=0.60)
    
# Get a filter testing sample out of the remaining, define the rest as the signal for the trigger testing
wire_test, trig_sig_trk_test =\
    train_test_split(remain_quality_tracks, train_size=0.50, test_size=0.50)

# Define the trigger signal training sample as the wire testing sample
trig_sig_trk_train = wire_test

# Split the bad tracks indexes into two samples, 
# one which will include the bad tracks in the background, one which will not
trig_bad_trk, trig_no_trk = train_test_split(bad_tracks_set, train_size=0.45, test_size=0.45)

# Split these samples into testing 
trig_bad_trk_test, trig_bad_trk_train =\
    train_test_split(trig_bad_trk, train_size=0.50, test_size=0.50)
    
# Split these samples into testing 
trig_no_track_test, trig_no_track_train =\
    train_test_split(trig_no_trk, train_size=0.50, test_size=0.50)
    
# Define which events are in which stage for the wire events
wire_events = np.unique(np.append(wire_test,wire_train))

# Define the total training and testing samples for trigger
trig_test = np.append(np.append(trig_no_track_test, trig_bad_trk_test), trig_sig_trk_test)
trig_train = np.append(np.append(trig_no_track_train, trig_bad_trk_train), trig_sig_trk_train)

## Define Wire Features 

In [51]:
# Figure out coincidence
sig_time =  numpy.vstack(sig.cydet.get_hit_time(i)  for i in wire_events) # + 700.
hit_time =  numpy.vstack(hits.cydet.get_hit_time(i) for i in wire_events)
# Flatten at first
sig_time = sig_time.flatten()
hit_time = hit_time.flatten()
# Find signal hits
sig_mask = np.where(sig_time != 0)[0]
# Find background hits
hit_mask = np.where(hit_time != 0)[0]
# Find coincidence
both = np.where( (sig_time != 0) & \
                 (hit_time != 0) )[0]
# Find when coincidence meant signal was sooner
sig_sooner = np.where(sig_time[both] < hit_time[both])[0]
# Take these away from background mask
hit_mask = np.setdiff1d(hit_mask, sig_sooner)

IndexError: index 5755 is out of bounds for axis 0 with size 5750

In [29]:
# Truth Values
sig_labels = numpy.vstack(sig.cydet.get_hit_types(i) for i in range(wire_sample))
hit_labels = numpy.vstack(hits.cydet.get_hit_types(i) for i in range(wire_sample)) 
# Flatten at first
sig_labels = sig_labels.flatten()
hit_labels = hit_labels.flatten()
# Define the overall relative time
labels =  hit_labels
# Factor in signal (and coincidence)
labels[sig_mask] = sig_labels[sig_mask]
# Reshape bacl
labels = labels.reshape((n_events,-1))

NameError: name 'wire_sample' is not defined

In [30]:
# Energy deposits
deposits = numpy.vstack(hits.get_energy_deposits(i) + sig.get_energy_deposits(i)\
                        for i in range(wire_sample))
# Flatten before binning 
deposits_f = deposits.flatten()

## Bin the deposits data
#new_deps = bin_energies(deposits, e_bin_edges)

## Binary Binning
new_deps = binary_binning(deposits)

NameError: name 'wire_sample' is not defined

### Define Left/Right Neighbours

In [31]:
right_neighs = numpy.array([hits.cydet.shift_wire(wire, 1) for wire in range(hits.cydet.n_points)])
left_neighs = numpy.array([hits.cydet.shift_wire(wire, -1) for wire in range(hits.cydet.n_points)])

AttributeError: 'CyDetHits' object has no attribute 'n_points'

### Define Features

In [32]:
from collections import OrderedDict
features = OrderedDict()
# Features
features['Layer_ID'] = layer_id

## Unbinned deposits
# Wire deposits scaled by 1000 to avoid small numbers for GBDT
features['Deposits'] = deposits * 1000
# LR-Neighbour Features
features['R_Deposits'] = (deposits * 1000)[:, right_neighs]
features['L_Deposits'] = (deposits * 1000)[:, left_neighs]


# Wire features
features['Binned_Deposits'] = new_deps
# LR-Neighbour Features
features['Binned_R_Deposits'] = new_deps[:, right_neighs]
features['Binned_L_Deposits'] = new_deps[:, left_neighs]

# Truth values
features['Labels'] = labels

NameError: name 'layer_id' is not defined

In [33]:
print numpy.unique(new_deps)

NameError: name 'new_deps' is not defined

In [34]:
# Name train features as all featurs except label
train_features = features.keys()[:-1]
# Name data as all features stacked in 3D array
data = numpy.dstack(features.values())
# Count the number of features
n_feats = data.shape[-1]
print data.shape

ValueError: need at least one array to concatenate

## Train/test splitting 

In [35]:
from sklearn.cross_validation import train_test_split
train_events, test_events = train_test_split(range(len(labels)), train_size=0.15, test_size=0.25)

NameError: name 'labels' is not defined

## Features for Wire GBDT

In [36]:
import pandas
# Define a small portion of the data to be used as a 0th order guess of if the wire is a signal or not
wire_trn_data = data[wire_].reshape(-1, n_feats)
wire_trn_data = pandas.DataFrame(data=wire_trn_data, columns=features.keys())
wire_tst_data = data[test_events].reshape(-1, n_feats)
wire_tst_data = pandas.DataFrame(data=wire_tst_data, columns=features.keys())



NameError: name 'data' is not defined

In [None]:
# Purge cells with no hits out
wire_trn_data = wire_trn_data[wire_trn_data['Labels'] != 0]
wire_tst_data = wire_tst_data[wire_tst_data['Labels'] != 0]

# Define signal hit as 1, background hit as 0
wire_trn_data['Labels'] = 2 - wire_trn_data['Labels']
wire_tst_data['Labels'] = 2 - wire_tst_data['Labels']

## Prepare the GBDT

In [None]:
from rep.metaml.factory import ClassifiersFactory
from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc

In [None]:
factory = ClassifiersFactory()

train_features_org = ['Deposits','L_Deposits','R_Deposits', 'Layer_ID']
train_features_bin = ['Binned_Deposits','Binned_L_Deposits','Binned_R_Deposits', 'Layer_ID']


# Define GBDT over unbinned wire features
gb_wire_features_org = SklearnClassifier(GradientBoostingClassifier(n_estimators=400), 
                                    features=train_features_org)
# Define GBDT over binned wire features
gb_wire_features_bin = SklearnClassifier(GradientBoostingClassifier(n_estimators=400), 
                                    features=train_features_bin)
# Define GBDT over energy deposit
gb_energy = SklearnClassifier(GradientBoostingClassifier(n_estimators=100), 
                                    features=['Deposits'])

factory.add_classifier('Energy Deposit Only', gb_energy)
factory.add_classifier('Original Wire Features', gb_wire_features_org)
factory.add_classifier('Binned Wire Features', gb_wire_features_bin)

#only_binned = ClassifiersFactory()
#only_binned.add_classifier('Binned Wire Features', gb_wire_features_bin)

In [None]:
# Train the classifier
factory.fit(wire_trn_data, wire_trn_data['Labels'])
#only_binned.fit(wire_trn_data, wire_trn_data['Labels'])
pass

In [None]:
# Test the classifier
wire_predictions = factory.test_on(wire_tst_data, wire_tst_data['Labels'])
#binned_predictions = only_binned.test_on(wire_tst_data, wire_tst_data['Labels'])

In [None]:
figsize(6,6) 
savedir = "/home/elg112/COMET/Presentations_Papers/COMET_TDR_2015/images/"
save_all = True

plot_rocs(wire_tst_data['Labels'], wire_predictions.prediction, zoom=False)
if save_all:
    savefig(savedir+"roc_bbdt.png", bbox_inches='tight')
show()

plot_rocs(wire_tst_data['Labels'], wire_predictions.prediction, zoom=True)
if save_all:
    savefig(savedir+"roc_bbdt_zoom.png", bbox_inches='tight')

In [None]:
#plt.plot([.99, .99], [0.75, 1.], 'k-', lw=2,  c='g' )
#plt.plot([.997, .997], [0.75, 1.], 'k-', lw=2,c='orange')

In [None]:
wire_predictions.feature_importance().plot(figsize=[14, 14])

In [None]:
dep = wire_predictions.features_pdf(features=['Binned_Deposits'], bins=10)
dep.plot()

In [None]:
# Take only the all features GBDT from here on out
binned_gbdt = factory['Binned Wire Features']

# Apply to the final training data
binned_gbdt_out = binned_gbdt.predict_proba(wire_tst_data)

# Take only the signal probability
binned_gbdt_out = binned_gbdt_out[:,1]

print len(numpy.unique(binned_gbdt_out))

# Remind outselves of the labels
binned_labels = wire_tst_data['Labels']

In [None]:
print shape(wire_tst_data)
print shape(binned_gbdt_out)

In [None]:
%run visualizations.ipynb
figsize(11,6)
plot_evt_feature( binned_gbdt_out, binned_labels, 
             xlabel="BBDT Classification Output", ylabel="Normalised Event Count",
            title="BBDT Output", nbins=20)
if save_all:
    savefig(savedir+"bbdt_out.png", bbox_inches='tight')

### Build Bonsai BDT

In [None]:
# Check for the binning used in the bonsai BDT
n_bins = len(numpy.unique(new_deps))
n_layers = len(numpy.unique(layer_id))
# Get the GBDT we want
binned_gbdt = factory['Binned Wire Features']

In [None]:
# Build a numpy array to hold all possible combinations
combinations = numpy.zeros((n_bins*n_bins*n_bins*n_layers,4))
n = 0
print n_layers

# Fill the combinations
for wire in range(n_bins):
    for left in range(n_bins):
        for right in range(n_bins):
            for layer in range(n_layers): 
                combinations[n,:] = wire, left, right, layer
                n += 1
# Make it a data frame with the correct labels
combinations = pandas.DataFrame(data=combinations,columns=['Binned_Deposits','Binned_L_Deposits',\
                                                           'Binned_R_Deposits','Layer_ID'])

In [None]:
# Check the shapes and indecies
print combinations.shape
print combinations.keys()
# Get the desired outputs for these combinations
binned_gbdt_all = binned_gbdt.predict_proba(combinations)[:,1]

In [None]:
# Reshape into a Bonsai BDT
print binned_gbdt_all.shape
bonsai_gbdt = binned_gbdt_all.reshape(n_bins,n_bins,n_bins,-1)
# Set no hits equal to zero chance of signal
## CONTROVERCIAL MOVE
bonsai_gbdt[0,:,:,:] = 0

In [None]:
# Test the output of the bonsai BDT
event = 10000
dummy_event = wire_trn_data[event:event+1]
print dummy_event
this_event = dummy_event.values[0]
print binned_gbdt.predict_proba(dummy_event)[:,1][0]
print bonsai_gbdt[round(this_event[4]), round(this_event[6]), round(this_event[5]), round(this_event[0])]

## Recover Layer Level Features

In [None]:
signal = ResampledHits(occupancy=0.30)
background = OnlyBackgroundHits(occupancy=0.30, n_events=2000)

In [None]:
#signal = AllHits('../data/online_sig_and_noise.root')
#background = AllHits('../data/online_noise.root')
used_signal = 2000
combined_events = used_signal + background.n_events
event_to_hit_look = np.arange(combined_events*signal.cydet.n_points).reshape(combined_events, -1)
print combined_events

In [None]:
event_features = OrderedDict()

# Energy deposits
sig_deposits = numpy.vstack(signal.get_energy_deposits(i) for i in range(used_signal))
sig_deposits = numpy.array()
bkg_deposits = numpy.vstack(background.get_energy_deposits(i) for i in range(background.n_events))
evt_deposits = numpy.vstack([sig_deposits,bkg_deposits])
print shape(evt_deposits)

In [None]:
# Layer ID of hit wires
evt_layer_id = numpy.vstack(signal.cydet.point_layers for i in range(combined_events))

# Truth labels
evt_labels = numpy.append(numpy.ones(used_signal), numpy.zeros(background.n_events)).transpose()

# Bin the energies
#evt_new_deps = bin_energies(evt_deposits)

## Binary Binning
evt_new_deps = binary_binning(evt_deposits)

# Get the left and right neighbours 
evt_new_deps_r =  evt_new_deps[:, right_neighs]
evt_new_deps_l =  evt_new_deps[:, left_neighs]

print evt_deposits.shape

In [None]:
# Get Bonsai Output
## Simple sum
evt_bonsai_out = numpy.vstack( bonsai_gbdt[evt_new_deps[i,:],\
                                           evt_new_deps_l[i,:],\
                                           evt_new_deps_r[i,:],\
                                           evt_layer_id[i,:]] for i in range(combined_events))
## Sum of squares
#evt_bonsai_out = numpy.square(evt_bonsai_out)
## Cut the values
#evt_bonsai_out = numpy.vstack( evt_bonsai_out[i,:] > 0.2 for i in range(combined_events))

In [None]:
def classify_layers(evt):
    sum_by_layer = [sum(evt_bonsai_out[evt,f_wre:f_wre+n_wres]) for f_wre, n_wres\
                            in zip(signal.cydet.first_point,signal.cydet.n_by_layer)]
    return numpy.array(sum_by_layer)

In [None]:
evt_layer_out = numpy.vstack( classify_layers(i) for i in range(combined_events))
evt_layer_out = numpy.vstack( np.divide(classify_layers(i), hits.cydet.n_points)\
                              for i in range(combined_events))

print evt_layer_out.shape

## Prepare GBDT Over Layer Features

In [None]:
layer_features = OrderedDict()
layer_labels = OrderedDict()

# First order is to just add each layer value as a feature.  This is upper limit of performance
for lay in range(n_layers):
    layer_features["Layer_"+str(lay)] = evt_layer_out[:,lay]
    layer_labels["Layer_"+str(lay)] = "Layer "+str(lay)

max_layer = numpy.hstack( [ np.where(evt_layer_out[i,:] == np.sort(evt_layer_out[i,:])[-1])[0][0]\
                                                      for i in range(combined_events)])
max_value = numpy.hstack( np.sort(evt_layer_out[i,:])[-1] for i in range(combined_events) )

n_max_layer = numpy.hstack( [ np.where(evt_layer_out[i,:] == np.sort(evt_layer_out[i,:])[-2])[0][0]\
                                                      for i in range(combined_events)])
n_max_value = numpy.hstack( [np.sort(evt_layer_out[i,:])[-2] for i in range(combined_events)])

nn_max_layer = numpy.hstack( [ np.where(evt_layer_out[i,:] == np.sort(evt_layer_out[i,:])[-3])[0][0]\
                                                      for i in range(combined_events)])
nn_max_value = numpy.hstack([np.sort(evt_layer_out[i,:])[-3] for i in range(combined_events)])

#layer_features["Max_Layer"] = max_layer
layer_features["Max_Value"] = max_value/(max_layer+np.ones(len(max_layer))).astype(float)
#np.array([signal.cydet.n_by_layer[max_l] for max_l in max_layer])
#layer_features["N_Max_Layer"] = n_max_layer
layer_features["N_Max_Value"] =n_max_value/(n_max_layer+np.ones(len(n_max_layer))).astype(float)
#layer_features["NN_Max_Layer"] = nn_max_layer
layer_features["NN_Max_Value"] = nn_max_value/(nn_max_layer+np.ones(len(nn_max_layer))).astype(float)
# Truth labels for each event
layer_features["Labels"] = evt_labels

In [None]:
# Name train features as all featurs except label
evt_train_features = layer_features.keys()[:-1]
# Name data as all features stacked in 3D array
evt_data = numpy.vstack(layer_features.values()).transpose()

# Count the number of features
n_evt_feats = evt_data.shape[-1]

In [None]:
evt_train_events, evt_test_events = train_test_split(range(len(evt_labels)), train_size=0.50, test_size=0.40)
evt_trn_data = pandas.DataFrame(data=evt_data[evt_train_events], columns=layer_features.keys())
evt_tst_data = pandas.DataFrame(data=evt_data[evt_test_events], columns=layer_features.keys())

In [None]:
print evt_train_features[0:9],

In [None]:
evt_factory = ClassifiersFactory()

evt_trn_feats_full = evt_train_features[:n_layers]
evt_all_layers = evt_train_features[:9]
evt_special_layers = evt_train_features[:n_layers][::3]+evt_train_features[1:n_layers][::3]

# Define GBDT over all layer sums
gb_full_layer_features = SklearnClassifier(GradientBoostingClassifier(n_estimators=200), 
                                    features=evt_trn_feats_full)
gb_sum_layer_features = SklearnClassifier(GradientBoostingClassifier(n_estimators=200), 
                                    features=evt_all_layers)
gb_special_layer_features = SklearnClassifier(GradientBoostingClassifier(n_estimators=200), 
                                    features=evt_special_layers)
 

evt_factory.add_classifier('Sums from All Layers', gb_full_layer_features)
evt_factory.add_classifier('Sums from Inner Layers', gb_sum_layer_features)
#evt_factory.add_classifier('Special Features', gb_special_layer_features)

#only_binned = ClassifiersFactory()
#only_binned.add_classifier('Binned Wire Features', gb_wire_features_bin)

In [None]:
# Train the classifier
evt_factory.fit(evt_trn_data, evt_trn_data['Labels'])
#only_binned.fit(wire_trn_data, wire_trn_data['Labels'])
pass

In [None]:
# Test the classifier
evt_predictions = evt_factory.test_on(evt_tst_data, evt_tst_data['Labels'])
#binned_predictions = only_binned.test_on(wire_tst_data, wire_tst_data['Labels'])

In [None]:
# Look at learning curves
figsize(10,10)
evt_predictions.learning_curve(RocAuc(), steps=1)

In [None]:
print sum(signal.cydet.n_by_layer[:9])
print sum(signal.cydet.n_by_layer[9:])

In [None]:
figsize(6,6) 
plot_rocs(evt_tst_data['Labels'], evt_predictions.prediction, zoom=False)
if save_all:
    savefig(savedir+"roc_evt.png", bbox_inches='tight')
show()
plot_rocs(evt_tst_data['Labels'], evt_predictions.prediction, zoom=True)
if save_all:
    savefig(savedir+"roc_evt_zoom.png", bbox_inches='tight')

In [None]:
figsize(11,6)
#figsize(5.5,3)
all_layers = evt_factory["Sums from All Layers"]
plot_feature_importance(all_layers, layer_labels)
if save_all:
    savefig(savedir+"feature_importance.png",bbox_inches="tight")
#show()
#plot_feature_correlations(factory, feat_label_dict)
#show()

#evt_predictions.feature_importance().plot(figsize=[11, 6])

In [None]:
corr = evt_predictions.features_correlation_matrix(evt_trn_feats_full+evt_special_layers)
                            #tick_labels=[feat_label_dict[key] for key in hough_gbdt_f])
corr.plot()

In [None]:
figsize(11,6)
plot_evt_feature( layer_features['Layer_0'], layer_features['Labels'], 
             xlabel="Average BBDT Output", ylabel="Normalised Event Count",
            title="Average BBDT Output for wires on Layer 0", nbins=20)
if save_all:
    plt.savefig(savedir+"layer_0_dist.png", bbox_inches='tight')

In [None]:
print sum(signal.cydet.n_by_layer[0:9])

In [None]:
figsize(11,6)
plot_evt_feature( layer_features['Layer_12'], layer_features['Labels'], 
             xlabel="Average BBDT Output", ylabel="Normalised Event Count",
            title="Average BBDT Output for wires on Layer 12", nbins=20)
if save_all:
    plt.savefig(savedir+"layer_12_dist.png", bbox_inches='tight')

In [None]:
# Event information\
event = 3
signal_event = True
add_trans = True

if signal_event:
    result = np.zeros(signal.cydet.n_points, dtype=int)
    result[signal.get_bkg_wires(event)] = 2
    result[signal.get_sig_wires(event)] = 1
else:
    result = np.zeros(background.cydet.n_points, dtype=int)
    result[background.get_bkg_wires(event)] = 2
    result[background.get_sig_wires(event)] = 1

In [None]:
# Basic output
plot_output(result, hits.cydet)
if save_all:
    plt.savefig(savedir+"orig_evt.png", bbox_inches='tight')
show()

# First GBDT output
plot_output(result, hits.cydet, size=plot_norm_size(evt_bonsai_out[event,:]))
plot_add_circle(2,38,36,lw=10, color="blue", l_alpha=0.35)
plot_add_outlines(result, hits.cydet)
plot_add_circle(0,0,5,color="grey", lw=40)
if save_all:
    plt.savefig(savedir+"bbdt_visual.png", bbox_inches='tight')
show()


## Check Mislabeled Events


In [None]:
# Get "is signal" predictions, i.e. 1 is signal, 0 is backgroun
is_signal = evt_predictions.prediction['Sums from Inner Layers'][:,1]

In [None]:
# Get False positive rate, true positive rate 
f_fpr, f_tpr, f_thersh = roc_curve(evt_tst_data['Labels'], is_signal)

In [None]:
# Get the needed threshold for background rejection at factor 20 suppression
ideal_thresh = f_thersh[np.where(f_fpr < 0.05)[0][-1]]
print ideal_thresh

In [None]:
# Get the signal events that pass through this threshold
mislabelled_signal = np.where( (is_signal < ideal_thresh) & 
                               (evt_tst_data['Labels'] == 1)
                              )[0]
print len(mislabelled_signal)

In [None]:
n_hits_mis_evts = [len(signal.get_sig_wires(evt)) for evt in mislabelled_signal]
print np.average(n_hits_mis_evts)
print np.std(n_hits_mis_evts)/np.sqrt(len(mislabelled_signal))

In [None]:
n_hits_evts = [len(signal.get_sig_wires(evt)) for evt in range(signal.n_events)]
print np.average(n_hits_evts)
print np.std(n_hits_evts)/np.sqrt(signal.n_events)