In [None]:
import os,sys,string, time
import matplotlib.pyplot as plt
import numpy as np
import uproot
import pandas as pd
from platform import python_version
import scipy
from scipy import stats
import uproot3
import math
from matplotlib.patches import Rectangle
import xgboost
from xgboost import XGBClassifier
import joblib
import seaborn as sns
from importlib import reload
import pickle

import Utilities.Plotter as PT
import Utilities.Constants as Constants
import Utilities.Variables_list as Variables
import Utilities.Functions as Functions
from Utilities.ipython_exit import exit

print ('Success')


## Reading in files

In [None]:
# Notebook-wide configuration. Within this notebook only "Run", "Load_single_file",
# "single_file", "EXT_in_training" and "Load_pi0_signal" are read; the remaining keys are
# presumably used by other notebooks that share this dict layout -- TODO confirm.
Params = {"Run":"run1", #The run number, so far either "run1" or "run3"
          "Load_single_file":False, #This will override everything else, put the desired file in the "single_file" line
          "single_file":10,
          "Load_standard":True,
          "Load_DetVars":False,
          "Only_keep_common_DetVar_evs":True,
          "Load_data":False,
          "FLATTEN":True, #Have one row per reconstructed object in the analysis dataframe
          "only_presel":False, #Create small files containing only variables necessary for pre-selection, for making pre-selection plots
          "EXT_in_training":True,
          "Load_pi0_signal":False} #Otherwise loads e+e- samples by default

# Directory (relative to the working directory) holding the pre-selected pickles of the chosen run.
loc_pkls = "pkl_files/"+Params["Run"]+"/current_files/my_vars/"

In [None]:
# Maps each signal sample key to the dataframe read from its pre-selected pickle.
signal_samples_dict = {}
# end_string = "_FINAL"
# Filename suffix selecting which set of pre-selected pickles is read.
end_string = "_full_Finished"

# Overlay sample; it is labelled 0 (background) in the training cells further down.
Presel_overlay = pd.read_pickle(loc_pkls+"Preselected_overlay_"+Params["Run"]+f"_flattened{end_string}.pkl")

# e+e- signal: either the single sample named by Params["single_file"], or every sample in
# Constants.HNL_ee_samples_names.
# NOTE(review): in single-file mode the dict key is Params["single_file"] itself (10 here), which
# may not match the key format of the sample-name lists -- confirm before relying on it downstream.
if Params["Load_pi0_signal"] == False:
    if Params["Load_single_file"] == True:
        HNL_mass = Params["single_file"]
        Presel_signal = pd.read_pickle(loc_pkls+f"Preselected_{HNL_mass}_"+Params["Run"]+f"_flattened{end_string}.pkl")
        signal_samples_dict[HNL_mass] = Presel_signal
    else:
        for HNL_mass in Constants.HNL_ee_samples_names:
            Presel_signal = pd.read_pickle(loc_pkls+f"Preselected_{HNL_mass}_"+Params["Run"]+f"_flattened{end_string}.pkl")
            signal_samples_dict[HNL_mass] = Presel_signal
    
# pi0 signal: read from the pi0_selection/ sub-directory ("Load_single_file" is not consulted here).
if Params["Load_pi0_signal"] == True:
    for HNL_mass in Constants.HNL_mass_pi0_samples_names:
        Presel_signal = pd.read_pickle(loc_pkls+f"pi0_selection/Preselected_{HNL_mass}_"+Params["Run"]+f"_flattened{end_string}.pkl")
        signal_samples_dict[HNL_mass] = Presel_signal
    
# Beam-off (EXT) sample.
# NOTE(review): Presel_EXT is only defined when EXT_in_training is True, but the later
# Split_samples(...) call passes it unconditionally, so EXT_in_training=False ends in a NameError there.
if Params["EXT_in_training"] == True:
    Presel_EXT = pd.read_pickle(loc_pkls+"Preselected_beamoff_"+Params["Run"]+f"_flattened{end_string}.pkl")

# Show the column names of the overlay dataframe.
print(Presel_overlay.keys())

## Splitting into test and training samples

In [None]:
# ultimate_feature_list = ['n_pfps', 'n_tracks', 'shr_theta_v', 'shr_phi_v', 'shr_pz_v', 'shrclusdir2', 'shr_energy_tot',
#                          'trk_theta_v', 'trk_phi_v','trk_dir_z_v', 'trk_energy', 'trk_energy_tot', 'trk_calo_energy_u_v', 'trk_score_v',
#                          'pfnplanehits_U', 'pfnplanehits_V', 'pfnplanehits_Y', 'NeutrinoEnergy2']#, 'nu_flashmatch_score']

# Columns fed to the BDT. Compared with the commented-out list above, 'trk_calo_energy_u_v' is left out.
ultimate_feature_list = ['n_pfps', 'n_tracks', 'shr_theta_v', 'shr_phi_v', 'shr_pz_v', 'shrclusdir2', 'shr_energy_tot',
                         'trk_theta_v', 'trk_phi_v','trk_dir_z_v', 'trk_energy', 'trk_energy_tot', 'trk_score_v',
                         'pfnplanehits_U', 'pfnplanehits_V', 'pfnplanehits_Y', 'NeutrinoEnergy2']#, 'nu_flashmatch_score']

ultimate_feature_list += ['shr_tkfit_dedx_max', 'topological_score']
# The flash-match score is only added as a feature for run3.
if Params["Run"]=="run3":
    ultimate_feature_list += ['nu_flashmatch_score']
print(f"Number of features is {len(ultimate_feature_list)}")

# bdt_vars refers to the same list object as ultimate_feature_list (no copy is made).
bdt_vars = ultimate_feature_list #This is using just the most important variables list

new_value = -9999.0 #This tells XGB what number refers to missing data for all variables

# Containers filled by the splitting / labelling cells below.
signal_train_dict = {}
signal_test_dict = {}
labels_dict = {} 

train_vs_test_fraction = 0.7 #This is the fraction used for training

print(f"Total length of overlay file is {len(Presel_overlay)}")

# Positional split: the first 70% of rows are used for training, the rest for testing (no shuffling here).
# NOTE(review): overlay_train / overlay_test are assigned again by the Split_samples(...) call further
# down, using the same default fraction.
overlay_train = Presel_overlay[:int(len(Presel_overlay)*train_vs_test_fraction)]
overlay_test = Presel_overlay[int(len(Presel_overlay)*train_vs_test_fraction):]

print(f"Total length of overlay TRAIN file is {len(overlay_train)}")
print(f"Total length of overlay TEST file is {len(overlay_test)}")

# overlay_train = Presel_overlay[int(len(Presel_overlay)*train_vs_test_fraction):] #OLD WRONG WAY, i.e 70% test
# overlay_test = Presel_overlay[:int(len(Presel_overlay)*train_vs_test_fraction)]

In [None]:
# if Params["Load_pi0_signal"] == False: BDT_name = f"ee{end_string}"
# if Params["Load_pi0_signal"] == True: BDT_name = f"pi0{end_string}"
# Suffix used in the saved model / variable-list file names.
# NOTE(review): both branches assign the same value, so e+e- and pi0 models differ only by the
# directory they are written to, not by file name.
if Params["Load_pi0_signal"] == False: BDT_name = f"{end_string}"
if Params["Load_pi0_signal"] == True: BDT_name = f"{end_string}"

In [None]:
# Display which signal samples were loaded.
signal_samples_dict.keys()

In [None]:
def Split_samples(Presel_overlay, signal_samples_dict, Presel_EXT, overlay_train_frac=0.7, signal_train_frac=0.7, EXT_train_frac=0.3):
    """
    Cut every sample into a leading training part and a trailing test part.

    The cut is purely positional: the first int(len(sample)*fraction) entries become the
    training part and everything after that the test part. Nothing is shuffled here.

    Parameters
    ----------
    Presel_overlay : sliceable sample (supports len() and [a:b] slicing)
        Overlay sample.
    signal_samples_dict : dict
        Signal samples keyed by sample name; every value is split with signal_train_frac.
    Presel_EXT : sliceable sample
        Beam-off (EXT) sample.
    overlay_train_frac, signal_train_frac, EXT_train_frac : float
        Fraction of each sample assigned to training.

    Returns
    -------
    tuple
        (overlay_train, overlay_test, EXT_train, EXT_test, signal_train_dict, signal_test_dict)
    """
    def _head_tail(sample, train_frac):
        # Position at which the training part stops and the test part starts.
        boundary = int(len(sample)*train_frac)
        return sample[:boundary], sample[boundary:]

    overlay_train, overlay_test = _head_tail(Presel_overlay, overlay_train_frac)
    EXT_train, EXT_test = _head_tail(Presel_EXT, EXT_train_frac)

    print(f"Length overlay train {len(overlay_train)}")
    print(f"Length overlay test {len(overlay_test)}")
    print(f"Length EXT train {len(EXT_train)}")
    print(f"Length EXT test {len(EXT_test)}")

    signal_train_dict, signal_test_dict = {}, {}
    for HNL_mass, full_sample in signal_samples_dict.items():
        train_part, test_part = _head_tail(full_sample, signal_train_frac)
        signal_train_dict[HNL_mass] = train_part
        signal_test_dict[HNL_mass] = test_part

        print(f"Length {HNL_mass} train {len(train_part)}")
        print(f"Length {HNL_mass} test {len(test_part)}")

    return overlay_train, overlay_test, EXT_train, EXT_test, signal_train_dict, signal_test_dict
    

In [None]:
# Split all samples with the default fractions of Split_samples (0.7 overlay, 0.7 signal, 0.3 EXT for training).
overlay_train, overlay_test, EXT_train, EXT_test, signal_train_dict, signal_test_dict = Split_samples(Presel_overlay, signal_samples_dict, Presel_EXT)

In [None]:
def Save_test_pkls(Params, loc_pkls, save_str, overlay_test, signal_test_dict, EXT_test = []):
    """
    Write the test samples to .pkl files below loc_pkls.

    Files go to "BDT_Test_dfs/" for e+e- running and to "BDT_Test_dfs/pi0_selection/" when
    Params["Load_pi0_signal"] is True. Every file is named
    Test_<sample>_<Run>_flattened<save_str>.pkl.

    Parameters
    ----------
    Params : dict
        Reads "Load_pi0_signal", "Run" and "EXT_in_training".
    loc_pkls : str
        Base directory (with trailing separator).
    save_str : str
        Suffix placed after "_flattened" in each file name.
    overlay_test : object with a to_pickle(path) method
        Overlay test sample.
    signal_test_dict : dict
        Signal test samples keyed by sample name.
    EXT_test : object with a to_pickle(path) method, optional
        Beam-off test sample; only written when Params["EXT_in_training"] is True.
    """
    pi0_flag = Params["Load_pi0_signal"]
    out_dir = loc_pkls
    if pi0_flag == False:
        out_dir = out_dir + "BDT_Test_dfs/"
    if pi0_flag == True:
        out_dir = out_dir + "BDT_Test_dfs/pi0_selection/"

    print(f"Pickling overlay test sample")
    overlay_test.to_pickle(out_dir+"Test_overlay_"+Params["Run"]+f"_flattened{save_str}.pkl")

    for HNL_mass, signal_test in signal_test_dict.items():
        print(f"Pickling {HNL_mass} test sample")
        signal_test.to_pickle(out_dir+f"Test_{HNL_mass}_"+Params["Run"]+f"_flattened{save_str}.pkl")

    ext_flag = Params["EXT_in_training"]
    if ext_flag == True:
        print(f"Pickling beamoff test sample")
        EXT_test.to_pickle(out_dir+"Test_beamoff_"+Params["Run"]+f"_flattened{save_str}.pkl")
    elif ext_flag == False:
        print("Not saving beamoff test, as not using in training.")
        

def Make_train_labels_and_dicts(Params, bdt_vars, overlay_train, EXT_train, signal_train_dict):
    """
    Build, for every signal sample, the combined training frame and its label list.

    Rows are stacked in the order signal, overlay and (only when Params["EXT_in_training"]
    is True) EXT. The label list follows the same order: 1 for each signal row, 0 for each
    background row. Only the bdt_vars columns are kept in the combined frames.

    Parameters
    ----------
    Params : dict
        Reads "EXT_in_training".
    bdt_vars : list of str
        Columns kept in the combined frames.
    overlay_train, EXT_train : pandas.DataFrame
        Background training samples.
    signal_train_dict : dict
        Signal training frames keyed by sample name.

    Returns
    -------
    combined_dict : dict
        Combined (signal + background) frame per sample name.
    labels_dict : dict
        List of 1/0 labels per sample name, aligned row-by-row with combined_dict.
    bkg_train : pandas.DataFrame
        All background training rows with every column kept (overlay, plus EXT if used).
    """
    use_EXT = Params["EXT_in_training"] == True
    combined_dict, labels_dict = {}, {}

    for HNL_mass, signal_train in signal_train_dict.items():
        stacked = pd.concat([signal_train[bdt_vars], overlay_train[bdt_vars]])
        flags = [1]*len(signal_train) + [0]*len(overlay_train)
        if use_EXT:
            # EXT rows are appended after the overlay rows and are labelled as background too.
            stacked = pd.concat([stacked[bdt_vars], EXT_train[bdt_vars]])
            flags = flags + [0]*len(EXT_train)
        combined_dict[HNL_mass] = stacked
        labels_dict[HNL_mass] = flags

    if use_EXT:
        bkg_train = pd.concat([overlay_train, EXT_train])
    else:
        bkg_train = overlay_train.copy()

    return combined_dict, labels_dict, bkg_train

In [None]:
# Pickle the test samples so they can be reloaded outside this notebook.
# NOTE(review): the suffix is hard-coded here; it currently equals end_string but will not follow it if that changes.
Save_test_pkls(Params, loc_pkls, "_full_Finished", overlay_test, signal_test_dict, EXT_test)


In [None]:
# Combined (signal + background) training frames and their 1/0 labels, per signal sample.
combined_dict, labels_dict, bkg_train = Make_train_labels_and_dicts(Params, bdt_vars, overlay_train, EXT_train, signal_train_dict)

# Background test sample: overlay, plus the EXT test rows when EXT is part of the training.
if Params["EXT_in_training"] == True:
    bkg_test = pd.concat([overlay_test,EXT_test])
else: bkg_test = overlay_test.copy()

In [None]:
def Prepare_for_xgb(Params, bdt_vars, combined_train_dict, signal_train_dict, bkg_train,  signal_test_dict, bkg_test,
                    labels_train_dict, missing=-9999.0):
    """
    Convert the training and test dataframes into xgboost.DMatrix objects.

    Parameters
    ----------
    Params : dict
        Not read by this function; kept so the signature matches the other helpers.
    bdt_vars : list of str
        Columns used as BDT features (also passed as the DMatrix feature names).
    combined_train_dict : dict
        Combined signal + background training frame per sample name.
    signal_train_dict, signal_test_dict : dict
        Signal-only training / test frames per sample name.
    bkg_train, bkg_test : pandas.DataFrame
        Background-only training / test frames.
    labels_train_dict : dict
        Per sample name, the list of 1/0 labels aligned row-by-row with combined_train_dict.
    missing : float
        Value that xgboost should treat as "missing" in every feature.

    Returns
    -------
    tuple
        (xgb_train_dict, xgb_sig_train_dict, xgb_sig_test_dict, xgb_train_bkg, xgb_test_bkg)
        where the dicts are keyed by sample name.
    """
    def _constant_label_matrix(frame, label_value):
        # DMatrix of the feature columns in which every row carries the same label.
        features = frame[bdt_vars]
        return xgboost.DMatrix(features, label=[label_value]*len(features), missing=missing, feature_names=bdt_vars)

    xgb_train_bkg = _constant_label_matrix(bkg_train, 0)
    xgb_test_bkg = _constant_label_matrix(bkg_test, 0)

    xgb_train_dict, xgb_sig_test_dict, xgb_sig_train_dict = {}, {}, {}

    for HNL_mass in signal_train_dict:
        # The labels come from the labels_train_dict argument. Previously a notebook-level
        # `labels_dict` was read here, so the argument was silently ignored and the function
        # failed (or used stale labels) whenever that global was missing or out of date.
        xgb_train_dict[HNL_mass] = xgboost.DMatrix(combined_train_dict[HNL_mass][bdt_vars], label=labels_train_dict[HNL_mass],
                                                   missing=missing, feature_names=bdt_vars)

        xgb_sig_test_dict[HNL_mass] = _constant_label_matrix(signal_test_dict[HNL_mass], 1)
        xgb_sig_train_dict[HNL_mass] = _constant_label_matrix(signal_train_dict[HNL_mass], 1)

    return xgb_train_dict, xgb_sig_train_dict, xgb_sig_test_dict, xgb_train_bkg, xgb_test_bkg
        
def Train_BDTs(Params, bdt_vars, BDT_name, xgb_train_dict, xgb_sig_test_dict, xgb_test_bkg, xgb_param, progress, num_round = 150, missing=-9999.0):
    """
    Train one BDT per signal sample and save each model as a .json file.

    Models are written to "bdts/" for e+e- running and to "bdts/pi0_selection/" when
    Params["Load_pi0_signal"] is True, named <Run>_<sample><BDT_name>.json.

    Parameters
    ----------
    Params : dict
        Reads "Load_pi0_signal" and "Run".
    bdt_vars, missing
        Accepted but not used inside this function.
    BDT_name : str
        Suffix appended to the sample name in the model file name.
    xgb_train_dict : dict
        Training DMatrix per sample name.
    xgb_sig_test_dict : dict
        Signal test DMatrix per sample name.
    xgb_test_bkg
        Background test DMatrix, shared by all samples.
    xgb_param : dict
        Parameters forwarded to xgboost.train.
    progress : dict
        Passed as evals_result to every xgboost.train call; the same dict is reused for all
        samples (presumably it ends up holding the history of the last one trained -- confirm).
    num_round : int
        Number of boosting rounds.

    Returns
    -------
    dict
        Per sample name, the evaluation list [(train, 'train'), (signal test, 'test_sig'),
        (background test, 'test_bkg')] that was handed to xgboost.train.
    """
    watchlist = {}
    for HNL_mass, train_matrix in xgb_train_dict.items():
        evals = [(train_matrix, 'train'), (xgb_sig_test_dict[HNL_mass], 'test_sig'), (xgb_test_bkg,'test_bkg')]
        watchlist[HNL_mass] = evals
        print(f"Training {HNL_mass} BDT" + "\n")
        bdt = xgboost.train(xgb_param, train_matrix, num_round, evals, evals_result=progress, verbose_eval=False)

        pi0_flag = Params["Load_pi0_signal"]
        if pi0_flag == False:
            bdt.save_model("bdts/"+Params["Run"]+f"_{HNL_mass}{BDT_name}.json")
        if pi0_flag == True:
            bdt.save_model("bdts/pi0_selection/"+Params["Run"]+f"_{HNL_mass}{BDT_name}.json")

    return watchlist
        
def Test_vs_train_plot():
    """
    Unfinished placeholder (it ends by printing "write this").

    The body scores test and training DMatrix objects with a booster and stores the
    predictions in four result dictionaries under the key HNL_mass. The function takes no
    arguments: `bdt`, `HNL_mass`, the xgb_* containers and the *_results_*_dict dictionaries
    are all looked up as notebook globals when it is called.

    NOTE(review): nothing in the visible notebook calls this function, and despite its name
    it does not draw anything yet.
    """
    # Predictions on the test samples.
    results_sig = bdt.predict(xgb_sig_test_dict[HNL_mass])
    results_bkg = bdt.predict(xgb_test_bkg)
    
    # Predictions on the training samples.
    train_results_sig = bdt.predict(xgb_sig_train_dict[HNL_mass])
    train_results_bkg = bdt.predict(xgb_bkg_train_dict[HNL_mass])
    
    test_results_sig_dict.update({HNL_mass:results_sig})
    test_results_bkg_dict.update({HNL_mass:results_bkg})
    
    train_results_sig_dict.update({HNL_mass:train_results_sig})
    train_results_bkg_dict.update({HNL_mass:train_results_bkg})
    
    print("write this")
        

In [None]:
# Turn the training / test frames into xgboost DMatrix objects; -9999.0 is passed as the missing-value marker.
xgb_train_dict, xgb_sig_train_dict, xgb_sig_test_dict, xgb_train_bkg, xgb_test_bkg = Prepare_for_xgb(Params, bdt_vars, combined_dict, 
                                                                                                     signal_train_dict, bkg_train,  signal_test_dict, 
                                                                                                     bkg_test, labels_dict, missing=-9999.0)

In [None]:
# Parameters forwarded unchanged to xgboost.train for every signal sample.
xgb_param = {'booster': 'dart',
        'max_depth':6,
        'eta':0.3,
        'objective':'binary:logistic',
#        'eval_metric':'auc', 
#        'subsample':0.5,
        'tree_method':'hist',
#        'scale_pos_weight': float(len(data_bkg_train))/float(len(data_sig_train)),
        'rate_drop': 0.1,
        'skip_drop': 0.5 }
# Passed as evals_result to xgboost.train inside Train_BDTs.
progress = dict()

# Train and save one model per signal sample; the return value maps each sample to its evaluation list.
watchlist = Train_BDTs(Params, bdt_vars, BDT_name, xgb_train_dict, xgb_sig_test_dict, xgb_test_bkg, xgb_param, progress, num_round = 150)

In [None]:
# Display which samples have a training DMatrix.
xgb_train_dict.keys()

In [None]:
# NOTE(review): this cell looks like an earlier, inline version of Split_samples / Save_test_pkls /
# Make_train_labels_and_dicts. Run after those calls, it redoes the signal split (same 0.7 fraction),
# re-pickles the test samples, rebuilds labels_dict, and replaces EXT_train / EXT_test using
# frac_EXT = 0.1 instead of the 0.3 default of Split_samples. The beam-off test pickle written here
# therefore overwrites the one written by Save_test_pkls with a different split -- confirm which
# path is the intended one.
pickle_files = True

save_str = "_full_Finished"

# overlay_test.to_pickle(loc_pkls+"BDT_Test_dfs/Test_overlay_"+Params["Run"]+f"_flattened{end_string}.pkl")
if Params["Load_pi0_signal"] == False:
    # The overlay test sample is written regardless of pickle_files.
    overlay_test.to_pickle(loc_pkls+"BDT_Test_dfs/Test_overlay_"+Params["Run"]+f"_flattened{save_str}.pkl")
    # overlay_test.to_pickle(loc_pkls+"BDT_Test_dfs/Test_overlay_"+Params["Run"]+"_my_vars_flattened_ultimate.pkl")
    for HNL_mass in signal_samples_dict:
        signal_train_dict[HNL_mass] = signal_samples_dict[HNL_mass][:int(len(signal_samples_dict[HNL_mass])*train_vs_test_fraction)]
        signal_test_dict[HNL_mass] = signal_samples_dict[HNL_mass][int(len(signal_samples_dict[HNL_mass])*train_vs_test_fraction):]
        if pickle_files == True:
            print(f"Pickling {HNL_mass} HNL test sample")
            # signal_test_dict[HNL_mass].to_pickle(loc_pkls+f"BDT_Test_dfs/Test_signal_{HNL_mass}_"+Params["Run"]+f"{end_string}.pkl")
            signal_test_dict[HNL_mass].to_pickle(loc_pkls+f"BDT_Test_dfs/Test_{HNL_mass}_"+Params["Run"]+f"_flattened{save_str}.pkl")
    
        # Labels for signal rows followed by overlay rows (EXT labels are appended further down).
        labels_dict[HNL_mass] = [1]*len(signal_train_dict[HNL_mass][bdt_vars]) + [0]*len(overlay_train[bdt_vars])
        
if Params["Load_pi0_signal"] == True:
    # Uses end_string for the overlay file name, while the signal files below use save_str.
    overlay_test.to_pickle(loc_pkls+"BDT_Test_dfs/pi0_selection/Test_overlay_"+Params["Run"]+f"_flattened{end_string}.pkl")

    for HNL_mass in signal_samples_dict:
        signal_train_dict[HNL_mass] = signal_samples_dict[HNL_mass][:int(len(signal_samples_dict[HNL_mass])*train_vs_test_fraction)]
        signal_test_dict[HNL_mass] = signal_samples_dict[HNL_mass][int(len(signal_samples_dict[HNL_mass])*train_vs_test_fraction):]
        if pickle_files == True:
            print(f"Pickling {HNL_mass} HNL pi0 test sample")
            # signal_test_dict[HNL_mass].to_pickle(loc_pkls+f"BDT_Test_dfs/pi0_selection/Test_{HNL_mass}_"+
            #                                      Params["Run"]+f"_flattened{end_string}.pkl")
            # NOTE(review): written to BDT_Test_dfs/ rather than BDT_Test_dfs/pi0_selection/, unlike
            # the overlay file above and unlike Save_test_pkls -- confirm this is intended.
            signal_test_dict[HNL_mass].to_pickle(loc_pkls+f"BDT_Test_dfs/Test_{HNL_mass}_"+
                                                 Params["Run"]+f"_flattened{save_str}.pkl")
    
        labels_dict[HNL_mass] = [1]*len(signal_train_dict[HNL_mass][bdt_vars]) + [0]*len(overlay_train[bdt_vars])
    
if Params["EXT_in_training"] == True:
    # Fraction of the beam-off sample used for training in this cell.
    frac_EXT = 0.1
    EXT_train = Presel_EXT[:int(len(Presel_EXT)*frac_EXT)]
    EXT_test = Presel_EXT[int(len(Presel_EXT)*frac_EXT):]
    print("Number of EXT to train: " + str(len(EXT_train)))
    # Background training rows: overlay followed by EXT. Only defined when EXT_in_training is True.
    overlay_plus_EXT = pd.concat([overlay_train[bdt_vars],EXT_train[bdt_vars]])
    for HNL_mass in signal_samples_dict:
        labels_dict[HNL_mass] = labels_dict[HNL_mass] + [0]*len(EXT_train[bdt_vars])
    
    if pickle_files == True:
        # Always written to BDT_Test_dfs/, also when the pi0 samples are loaded.
        EXT_test.to_pickle(loc_pkls+f"BDT_Test_dfs/Test_beamoff_"+Params["Run"]+f"_flattened{save_str}.pkl")

    print(f"Total length of beamoff TRAIN file is {len(EXT_train)}")
    print(f"Total length of beamoff TEST file is {len(EXT_test)}")
    

## BDT Training

In [None]:
# NOTE(review): this cell rebinds xgb_train_dict, xgb_sig_train_dict, xgb_test_bkg, xgb_param,
# progress and watchlist, replacing the objects produced by Prepare_for_xgb / Train_BDTs above.
xgb_train_dict = {}
xgb_test_dict = {}

xgb_sig_train_dict = {}
xgb_bkg_train_dict = {}

# Background test matrix built from the overlay test sample only (no EXT rows).
xgb_test_bkg = xgboost.DMatrix(overlay_test[bdt_vars], label=[0]*len(overlay_test[bdt_vars]), missing=new_value, feature_names=bdt_vars)

xgb_param = {'booster': 'dart',
        'max_depth':6,
        'eta':0.3,
        'objective':'binary:logistic',
#        'eval_metric':'auc', 
#        'subsample':0.5,
        'tree_method':'hist',
#        'scale_pos_weight': float(len(data_bkg_train))/float(len(data_sig_train)),
        'rate_drop': 0.1,
        'skip_drop': 0.5 }
num_round = 150
progress = dict()

for HNL_mass in signal_train_dict:
    # Row order (signal, overlay[, EXT]) must match the order used to build labels_dict.
    if Params["EXT_in_training"] == False:
        xgb_train_dict[HNL_mass] = xgboost.DMatrix(pd.concat([signal_train_dict[HNL_mass][bdt_vars], #This is both signal and bkg combined into one
                                                               overlay_train[bdt_vars]]), 
                                               label=labels_dict[HNL_mass], 
                                                    missing=new_value, feature_names=bdt_vars)
    if Params["EXT_in_training"] == True:
        xgb_train_dict[HNL_mass] = xgboost.DMatrix(pd.concat([signal_train_dict[HNL_mass][bdt_vars], #This is both signal and bkg combined into one
                                                               overlay_plus_EXT[bdt_vars]]), 
                                               label=labels_dict[HNL_mass], 
                                                    missing=new_value, feature_names=bdt_vars)
    xgb_test_dict[HNL_mass] = xgboost.DMatrix(signal_test_dict[HNL_mass][bdt_vars], label=[1]*len(signal_test_dict[HNL_mass][bdt_vars]), #Just signal test
                                              missing=new_value, feature_names=bdt_vars)
    
    xgb_sig_train_dict[HNL_mass] = xgboost.DMatrix(signal_train_dict[HNL_mass][bdt_vars],label=[1]*len(signal_train_dict[HNL_mass][bdt_vars]), #Signal training
                                                  missing=new_value, feature_names=bdt_vars)
    # Overlay rows only, also when EXT rows are part of the training sample.
    xgb_bkg_train_dict[HNL_mass] = xgboost.DMatrix(overlay_train[bdt_vars],label=[0]*len(overlay_train[bdt_vars]), #Just background training
                                                  missing=new_value, feature_names=bdt_vars)

    #watchlist so that you can monitor the performance of the training by iterations
    # NOTE(review): `watchlist` is rebound on every pass, so after the loop it refers to the
    # last mass point only.
    watchlist = [(xgb_train_dict[HNL_mass], 'train'), (xgb_test_dict[HNL_mass], 'test_sig'), (xgb_test_bkg,'test_bkg')]
    

In [None]:
# if Params["Load_pi0_signal"] == False: BDT_name = "ee_FINAL"
# if Params["Load_pi0_signal"] == True: BDT_name = "pi0_FINAL"

# Train one BDT per signal sample and save it as <Run>_<sample><BDT_name>.json.
for HNL_mass in signal_train_dict:
    # Build the evaluation list for *this* mass point. Previously the `watchlist` left over from
    # the preceding cell was reused for every model; it only described the last mass point, so
    # the 'train' and 'test_sig' metrics of all other models were computed on the wrong sample.
    watchlist = [(xgb_train_dict[HNL_mass], 'train'), (xgb_test_dict[HNL_mass], 'test_sig'), (xgb_test_bkg,'test_bkg')]

    print(f"Training {HNL_mass} BDT" + "\n")
    bdt = xgboost.train(xgb_param, xgb_train_dict[HNL_mass], num_round, watchlist, evals_result=progress, verbose_eval=False)
    # doesnt like watchlist/eval_result if using AOC
    # save model so you can load it later
    if Params["Load_pi0_signal"] == False:
        bdt.save_model("bdts/"+Params["Run"]+f"_{HNL_mass}{BDT_name}.json")
    if Params["Load_pi0_signal"] == True:
        bdt.save_model("bdts/pi0_selection/"+Params["Run"]+f"_{HNL_mass}{BDT_name}.json")

print("Finished")

In [None]:
# Show the suffix used in the saved model file names.
print(BDT_name)

In [None]:
# Store the list of BDT input variables next to the models, so the same columns can be selected
# when the models are applied elsewhere.
var_list = bdt_vars

# BDT_name = "New_20_variables_FIXED"
# The file is a pickle of the Python list, named <BDT_name>_<Run> with no file extension.
if Params["Load_pi0_signal"] == False:
    with open(f"bdts/input_vars/{BDT_name}_"+Params["Run"], "wb") as fp:   #Pickling
        pickle.dump(var_list, fp)
elif Params["Load_pi0_signal"] == True:
    with open(f"bdts/pi0_selection/input_vars/{BDT_name}_"+Params["Run"], "wb") as fp:   #Pickling
        pickle.dump(var_list, fp)

# with open("bdts/input_vars/"+BDT_name, "rb") as fp:   # Unpickling
#     b = pickle.load(fp)
    
# print(b)

# Finished Training

## Checking variable correlations

In [None]:
# bdt_vars = feature_names
# Sample whose input variables are inspected in the two correlation cells below.
# NOTE(review): a "*_pi0" key is presumably only present in signal_samples_dict when
# Params["Load_pi0_signal"] is True -- confirm before running with the e+e- configuration.
HNL_mass = "150_pi0"

In [None]:
#Taken from Luis' code
# for HNL_mass in HNL_masses:
# Kendall correlation between the BDT input variables of the selected signal sample, drawn as a heatmap.
method = 'kendall'
correlations = signal_samples_dict[HNL_mass][bdt_vars].astype(np.float64).corr(method=method)
plt.figure(figsize=(15,12))
sns.heatmap(correlations,vmin=-1,annot=False,square=True,cbar_kws={'label':method+' correlation'},cmap = 'RdBu_r')
plt.title('Input Variable Correlations')
plt.show()

In [None]:
#Just looking at most correlated 
# Uses the default method of DataFrame.corr() (not the Kendall method of the heatmap cell).
corr=signal_samples_dict[HNL_mass][bdt_vars].corr()
high_corr_var=np.where(corr>0.999)
# Keep each pair once (x<y) and drop the diagonal; the result is a list of (column, column) tuples.
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
#high_corr_var
print("Done")

## Looking at feature importances

In [None]:
# Scratch check: the second "_"-separated token of a sample name is the decay-channel tag
# that the importance loop below uses to pick the model directory.
a = "150_pi0"
print(a.split("_")[1])

# Colours of the current matplotlib property cycle (used for the bar edges further down).
print(plt.rcParams['axes.prop_cycle'].by_key()['color'])


In [None]:
#Make an importances dict   
def Top_N_vars(bdt_model, N_vars):
    """
    Return the names of the N_vars features with the largest "gain" importance.

    Parameters
    ----------
    bdt_model : object with get_score(importance_type=...)
        Trained booster whose feature importances are read.
    N_vars : int
        Maximum number of feature names to return.

    Returns
    -------
    list of str
        Feature names in ascending order of gain (the most important one is last). Fewer than
        N_vars names are returned if the model reports fewer features; an empty list is
        returned for N_vars <= 0.
    """
    # A plain [-N_vars:] slice would return *every* feature for N_vars == 0.
    if N_vars <= 0:
        return []

    # Read the importances from the model that was passed in. Previously the notebook-level
    # `bdt` variable was used here and the bdt_model argument was ignored.
    importance = bdt_model.get_score(importance_type="gain")
    ranked_names = sorted(importance, key=importance.get)
    return ranked_names[-N_vars:]

def Sorted_importance(bdt_model):
    """
    Return the "gain" importance of every feature reported by bdt_model.

    The dictionary is returned exactly as produced by get_score; despite the function name,
    no sorting is applied.
    """
    # Use the model that was passed in. Previously the notebook-level `bdt` variable was read
    # and the bdt_model argument was ignored.
    return bdt_model.get_score(importance_type="gain")

top_N_dict = {}        # sample name -> most important feature names (ascending gain)
importance_dict = {}   # sample name -> {feature name: gain}
list_of_lists = []     # one list of feature names per model, used for the intersection below
# BDT_name = "ee_Finished"
# BDT_name = "pi0"

# if Params["Load_pi0_signal"] == False: 
#     sample_names = Constants.HNL_ee_samples_names
#     loc = "bdts/"
# if Params["Load_pi0_signal"] == True: 
#     sample_names = Constants.HNL_mass_pi0_samples_names #I should load BOTH
#     loc = "bdts/pi0_selection/"
sample_names = Constants.HNL_ee_samples_names + Constants.HNL_mass_pi0_samples_names

# Directory of the saved models, chosen by the decay-channel tag in the sample name.
model_dirs = {"pi0": "bdts/pi0_selection/", "ee": "bdts/"}

# for HNL_mass in Constants.HNL_mass_samples:
for HNL_mass in sample_names:
    decay_tag = HNL_mass.split("_")[1]
    if decay_tag not in model_dirs:
        # Previously `loc` silently kept the value of the preceding sample in this case, so the
        # model would have been looked up in the wrong directory.
        raise ValueError(f"Cannot tell which model directory to use for sample '{HNL_mass}'")
    loc = model_dirs[decay_tag]

    bdt = xgboost.Booster()
    # bdt.load_model("bdts/"+Params["Run"]+f"_{HNL_mass}MeV_ultimate.json")
    # bdt.load_model("bdts/"+Params["Run"]+f"_{HNL_mass}MeV_{BDT_name}.json")
    bdt.load_model(loc+Params["Run"]+f"_{HNL_mass}{BDT_name}.json")

    # Rank the features once per model (this used to be recomputed three times).
    top_N = Top_N_vars(bdt, 50)
    top_N_dict[HNL_mass] = top_N
    list_of_lists.append(list(top_N))
    importance_dict[HNL_mass] = Sorted_importance(bdt)

# Features that appear in the top list of every model; empty when no model was loaded.
elements_in_all = list(set.intersection(*map(set, list_of_lists))) if list_of_lists else []
print(len(elements_in_all))
print(elements_in_all)

In [None]:
# Hand-written list of variable names that is compared against the feature list below.
highets_imps = ['n_showers', 'NeutrinoEnergy2', 'secondshower_Y_charge', 'trk_chipr_best', 'trk_energy_hits_tot', 'shr_energy_tot', 
                'contained_sps_ratio', 'pfnplanehits_Y', 'shrclusdir2', 'trk_dir_y_v', 'SliceCaloEnergy2', 'trk_theta_v', 'trk_start_x_v', 
                'pi0_radlen2', 'pfnplanehits_V', 'trk_energy', 'trk_score_v', 'shrclusdir0', 'trk_bragg_mip_v', 'shr_py_v', 
                'nu_flashmatch_score', 'n_pfps', 'CosmicIPAll3D', 'pi0_dir2_z']

# NOTE(review): this rebinds ultimate_feature_list to a different list than the one used for training
# (it contains 'trk_calo_energy_u_v' and 'nu_flashmatch_score' and lacks 'shr_tkfit_dedx_max' and
# 'topological_score'). bdt_vars still refers to the original list object, so it is unaffected.
ultimate_feature_list = ['n_pfps', 'n_tracks', 'shr_theta_v', 'shr_phi_v', 'shr_pz_v', 'shrclusdir2', 'shr_energy_tot',
                         'trk_theta_v', 'trk_phi_v','trk_dir_z_v', 'trk_energy', 'trk_energy_tot', 'trk_calo_energy_u_v', 'trk_score_v',
                         'pfnplanehits_U', 'pfnplanehits_V', 'pfnplanehits_Y', 'NeutrinoEnergy2', 'nu_flashmatch_score']

# Variables present in both lists; this overwrites the elements_in_all computed from the models.
elements_in_all = list(set.intersection(*map(set, [highets_imps,ultimate_feature_list])))
print(len(elements_in_all))
print(elements_in_all)

In [None]:
# NOTE(review): importance_dict is filled with sample-name keys taken from the Constants lists
# (strings such as "245_pi0", judging by the plotting cell below), so the integer key 245 is
# presumably stale and raises KeyError -- confirm.
print(importance_dict[245].keys())
print(len(importance_dict[245].keys()))

In [None]:
# for HNL_mass in Constants.HNL_mass_samples:
# Overlay the gain importances of several models as unfilled bars and optionally save the figure.
savefig = True
plt.figure(figsize=[20,8])
# NOTE(review): `colours` is only referenced by the commented-out plt.bar call below, and the
# smaller_samples chosen in these two branches is overwritten unconditionally right after them.
if Params["Load_pi0_signal"] == True: 
    colours = {"150_pi0":"coral", "200_pi0":"cornflowerblue", "245_pi0":"olivedrab"}
    smaller_samples = ["150_pi0", "200_pi0", "245_pi0"]
if Params["Load_pi0_signal"] == False: 
    colours = {"10_ee":"coral", "100_ee":"cornflowerblue", "150_ee":"olivedrab"}
    smaller_samples = ["10_ee", "100_ee", "150_ee"]
    
# Every name listed here must be a key of importance_dict.
smaller_samples = ["10_ee", "100_ee", "150_ee","150_pi0", "200_pi0", "245_pi0"]
color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

for i, HNL_mass in enumerate(smaller_samples):
    plt.bar(importance_dict[HNL_mass].keys(),importance_dict[HNL_mass].values(), label=f"{HNL_mass} model", 
            fill=False,linewidth=3, edgecolor=color_cycle[i])
    # plt.bar(importance_dict[HNL_mass].keys(),importance_dict[HNL_mass].values(), label=f"{HNL_mass}MeV model", 
    #         fill=False,linewidth=3, edgecolor=colours[HNL_mass], color=colours[HNL_mass])
# plt.bar(importance_dict[245].keys(),importance_dict[245].values())
# HNL_mass still holds the last entry of smaller_samples here, so the tick labels are taken from that model.
plt.xticks(np.array(range(0, len(importance_dict[HNL_mass].keys()))),importance_dict[HNL_mass].keys(),rotation=80)
plt.ylabel("Importance")
plt.legend(fontsize=20)
plt.tight_layout()

if savefig==True:
    plt.savefig("plots/BDT_output/variable_importance/"+ Params["Run"]+f"_importances{BDT_name}.pdf")
    plt.savefig("plots/BDT_output/variable_importance/"+ Params["Run"]+f"_importances{BDT_name}.png")

In [None]:
# Bar chart of the importances of a single model.
# NOTE(review): uses the integer key 245, whereas importance_dict is filled with sample-name keys
# in the loading cell above -- presumably a leftover from an older key convention; confirm.
plt.figure(figsize=[20,8])
colours = {150:"coral", 200:"cornflowerblue", 245:"olivedrab"}
for HNL_mass in [245]:
    plt.bar(importance_dict[HNL_mass].keys(),importance_dict[HNL_mass].values(), label=f"{HNL_mass}MeV model", 
            fill=False,linewidth=3, edgecolor=colours[HNL_mass], color=colours[HNL_mass])
# plt.bar(importance_dict[245].keys(),importance_dict[245].values())
plt.xticks(np.array(range(0, len(importance_dict[HNL_mass].keys()))),importance_dict[HNL_mass].keys(),rotation=80)
plt.ylabel("Importance")
plt.legend(fontsize=20)
plt.tight_layout()

# plt.savefig("plots/BDT_output/variable_importance/"+ Params["Run"]+f"_importances_{BDT_name}.pdf")
# plt.savefig("plots/BDT_output/variable_importance/"+ Params["Run"]+f"_importances_{BDT_name}.png")

In [None]:
# 3D bar chart: one row of bars per model, one column per variable, bar height = importance.
# NOTE(review): uses the integer keys 2, 100 and 245, whereas importance_dict is filled with
# sample-name keys in the loading cell above -- presumably stale; confirm. Stacking the three value
# lists into one array also requires all three models to report the same number of features.
fig = plt.figure(figsize=[20,20])
ax1 = fig.add_subplot(projection='3d')

mass_list = ["2MeV", "100MeV", "245MeV"]
x = importance_dict[2].keys()
y = mass_list
data = np.array([list(importance_dict[2].values()),
                 list(importance_dict[100].values()),
                 list(importance_dict[245].values())])

numOfCols = len(x)
numOfRows = len(y)

# Grid of bar positions, flattened row by row so it lines up with data.flatten().
xpos = np.arange(0, numOfCols, 1)
ypos = np.arange(0, numOfRows, 1)
xpos, ypos = np.meshgrid(xpos + 0.5, ypos + 0.5)

xpos = xpos.flatten()
ypos = ypos.flatten()
zpos = np.zeros(numOfCols * numOfRows)

# Every bar has a 0.5 x 0.5 footprint.
dx = np.ones(numOfRows * numOfCols) * 0.5
dy = np.ones(numOfCols * numOfRows) * 0.5
dz = data.flatten()

# Blank entries are presumably padding so the names land on the intended default ticks -- TODO confirm.
mass_list_labels = [" "," ", "2MeV"," ", "100MeV"," ", "245MeV"]

ax1.bar3d(xpos, ypos, zpos, dx, dy, dz)
ax1.set_xticklabels(list(importance_dict[2].keys()),rotation=45)
# ax1.set_xticklabels(list(importance_dict[2].keys()))
ax1.set_yticklabels(mass_list_labels)

ax1.set_xlabel('Variable')
ax1.set_ylabel('Model')
ax1.set_zlabel('Importance')
# plt.show()

# Finished code

In [None]:
# Display which samples have a signal test DMatrix.
xgb_test_dict.keys()

In [None]:
# dirt_matrix = xgboost.DMatrix(dirt_BDT[bdt_vars])
# EXT_matrix = xgboost.DMatrix(EXT_BDT[bdt_vars])

# NOTE(review): this cell loads models named "<Run>_<mass>_MeV_New_20_variables_FIXED.json" and
# loops over Constants.HNL_mass_samples, which differs from the file names and sample lists used
# by the training cells of this notebook. The xgb_* dictionaries it indexes are keyed by the
# sample names used above, so the keys presumably do not match -- confirm before running.

# BDT scores per mass point, for the test and training samples.
test_results_sig_dict = {}
test_results_bkg_dict = {}

train_results_sig_dict = {}
train_results_bkg_dict = {}

for HNL_mass in Constants.HNL_mass_samples:

    bdt = xgboost.Booster()
    bdt.load_model("bdts/"+Params["Run"]+f"_{HNL_mass}_MeV_New_20_variables_FIXED.json")
    #bdt.load_model(f'bdts/{Run}_{HNL_mass}_MeV_REDUCED_variables_flattened_highest_E_2.json')
    
    importance = bdt.get_score(importance_type="gain")
    
    # Round to one decimal place for display.
    for key in importance.keys():
        importance[key] = round(importance[key],1)
        
    #importance_dict[HNL_mass] = importance

    results_sig = bdt.predict(xgb_test_dict[HNL_mass])
    results_bkg = bdt.predict(xgb_test_bkg)
    
    train_results_sig = bdt.predict(xgb_sig_train_dict[HNL_mass])
    train_results_bkg = bdt.predict(xgb_bkg_train_dict[HNL_mass])
    
    test_results_sig_dict.update({HNL_mass:results_sig})
    test_results_bkg_dict.update({HNL_mass:results_bkg})
    
    train_results_sig_dict.update({HNL_mass:train_results_sig})
    train_results_bkg_dict.update({HNL_mass:train_results_bkg})

    # results_dirt = bdt.predict(dirt_matrix)
    # results_EXT = bdt.predict(EXT_matrix)
    
    # dirt_BDT[f'BDT_output_{HNL_mass}MeV'] = results_dirt
    # EXT_BDT[f'BDT_output_{HNL_mass}MeV'] = results_EXT

    # overlay_test_BDT[f'BDT_output_{HNL_mass}MeV'] = results_bkg
    # #Can add in a second loop over HNL_masses so that I predict each signal mass point with every other mass point bdt
    # signal_test_BDT_dict[HNL_mass][f'BDT_output'] = results_sig
    
    #Plotting importances of variables
    # NOTE(review): no `ax` is passed to plot_importance, so it presumably draws on a figure of
    # its own and the one created on the next line stays empty -- confirm.
    plt.figure(figsize=(12,12),facecolor='white')
    print(f"Plotting {HNL_mass}MeV importances:")
    a = xgboost.plot_importance(importance,max_num_features=10,importance_type='gain')

## Test vs. Train

In [None]:
# Compare the BDT score distributions of the training samples (filled histograms) with those of
# the test samples (markers at the bin centres), for signal and background, one figure per mass.
# All distributions are normalised (density=True) and share the same binning.
hist_range=[0,1.0]
n_bins=20

for HNL_mass in Constants.HNL_mass_samples:
    plt.figure(figsize=(10,7))
    plt.hist(train_results_sig_dict[HNL_mass],bins=n_bins, range=hist_range, density=True,alpha=0.4,color='red',label=f'Train {HNL_mass}MeV HNL' )
    counts,bin_edges = np.histogram(test_results_sig_dict[HNL_mass],bins=n_bins,range=hist_range,density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2.
    plt.plot(bin_centers,counts,marker='o',linestyle="None",color='red',label=f'Test {HNL_mass}MeV HNL')

    plt.hist(train_results_bkg_dict[HNL_mass], bins = n_bins, range = hist_range, density = True, alpha = 0.4, color = 'orange', label = r'Train overlay')
    counts,bin_edges = np.histogram(test_results_bkg_dict[HNL_mass],bins = n_bins, range= hist_range,density = True)
    bin_centers = (bin_edges[:-1] +  bin_edges[1:])/2.
    plt.plot(bin_centers,counts,marker='o',linestyle ="None",color='orange',label = r'Test overlay')
    plt.legend()

## Checking variable correlations

In [None]:
#Taken from Luis' code
# for HNL_mass in HNL_masses:
# NOTE(review): cleaned_signal_dict is not defined anywhere in the visible notebook; this cell
# duplicates the earlier correlation heatmap, which reads signal_samples_dict instead.
method = 'kendall'
correlations = cleaned_signal_dict[100][bdt_vars].astype(np.float64).corr(method=method)
plt.figure(figsize=(15,12))
sns.heatmap(correlations,vmin=-1,annot=False,square=True,cbar_kws={'label':method+' correlation'},cmap = 'RdBu_r')
plt.title('Input Variable Correlations')
plt.show()

In [None]:
#Just looking at most correlated 
# NOTE(review): cleaned_signal_dict is not defined anywhere in the visible notebook. The threshold
# here is 0.95, whereas the earlier version of this cell uses 0.999.
corr=cleaned_signal_dict[100][bdt_vars].corr()
high_corr_var=np.where(corr>0.95)
# Keep each pair once (x<y) and drop the diagonal.
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
high_corr_var

# End of code