In [1]:
# import modules
import uproot, sys, time, math, pickle, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import awkward as ak
from tqdm import tqdm
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
from scipy.special import betainc
from scipy.stats import norm

# import config functions
sys.path.append("/home/jlai/jet-faking/config")
from jet_faking_plot_config import getWeight, zbi, sample_dict, getVarDict
from plot_var import variables, variables_data, ntuple_names, ntuple_names_BDT
from n_1_iteration_functions import get_best_cut, calculate_significance, apply_cut_to_fb, apply_all_cuts, compute_total_significance, n_minus_1_optimizer
# from cut_config import cut_config

# Set up plot defaults
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 14.0,10.0  # Roughly 11 cm wde by 8 cm high  
mpl.rcParams['font.size'] = 20.0 # Use 14 point font
sns.set(style="whitegrid")

font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14
}

plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"]  # Legend
})

tot = []
data = pd.DataFrame()
ntuple_names = ['ggHyyd','Zjets','Zgamma','Wgamma','Wjets','gammajet_direct', 'data23']

def test(fb):
    # checking if there are any none values
    mask = ak.is_none(fb['met_tst_et'])
    n_none = ak.sum(mask)
    print("Number of none values: ", n_none)
    # if n_none > 0:
    #     fb = fb[~mask]
    # print("Events after removing none values: ", len(fb), ak.sum(ak.is_none(fb['met_tst_et'])))

def print_cut(ntuple_name, fb, label):
    print(f"Unweighted Events {label}: ", len(fb))
    if ntuple_name == 'data23':
        print(f"Weighted Events {label}: ", sum(getWeight(fb, ntuple_name, jet_faking=True)))
    else: 
        print(f"Weighted Events {label}: ", sum(getWeight(fb, ntuple_name)))

for i in range(len(ntuple_names)):
    start_time = time.time()
    ntuple_name = ntuple_names[i]
    if ntuple_name == 'data23': # data
        path = f"/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/data23_y_BDT_score.root" 
        print('processing file: ', path)
        f = uproot.open(path)['nominal']
        fb = f.arrays(variables_data, library="ak")
        fb['VertexBDTScore'] = fb['BDTScore'] # renaming BDTScore to ensure this is recognized as Vertex BDT Score
        
        fb = fb[ak.num(fb['ph_eta']) > 0]     # for abs(ak.firsts(fb['ph_eta'])) to have value to the reweighting
                
        mask1 = (ak.firsts(fb['ph_topoetcone40'])-2450.)/ak.firsts(fb['ph_pt']) > 0.1   # jet_faking_photon cut
        fb = fb[mask1]
        fb = fb[fb['n_ph_baseline'] == 1]

    else: # MC
        path = f"/data/tmathew/ntups/mc23d/{ntuple_name}_y.root" 
        path_BDT = f"/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_{ntuple_name}_y_BDT_score.root" 
        print('processing file: ', path)
        f = uproot.open(path)['nominal']
        fb = f.arrays(variables, library="ak")

        # add BDT score to fb
        f_BDT = uproot.open(path_BDT)['nominal']
        fb_BDT = f_BDT.arrays(["event", "BDTScore"], library="ak")
        tmp = fb["event"] == fb_BDT["event"]
        if np.all(tmp) == True:
            fb["VertexBDTScore"] = fb_BDT["BDTScore"]
        else: 
            print("Something is wrong, need arranging")

        fb = fb[ak.num(fb['ph_eta']) > 0]     # for abs(ak.firsts(fb['ph_eta'])) to have value to the reweighting
        fb = fb[fb['n_ph'] == 1]
        
        # Zjets and Wjets (rule out everything except for e->gamma)
        if ntuple_name == 'Zjets' or ntuple_name == 'Wjets':
            mask = ak.firsts(fb['ph_truth_type']) == 2
            fb = fb[mask]
        
        # goodPV on signal only
        if ntuple_name == 'ggHyyd':
            fb = fb[ak.num(fb['pv_z']) > 0]
            good_pv_tmp = (np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5)
            fb = fb[good_pv_tmp]

    print_cut(ntuple_name, fb, 'before cut')

    fb = fb[fb['n_mu_baseline'] == 0]
    fb = fb[fb['n_el_baseline'] == 0]
    fb = fb[fb['n_tau_baseline'] == 0]
    fb = fb[fb['trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M']==1]
    fb = fb[ak.num(fb['ph_pt']) > 0] # prevent none values in Tbranch
    fb = fb[ak.firsts(fb['ph_pt']) >= 50000] # ph_pt cut (basic cut)
    fb = fb[fb['met_tst_et'] >= 100000] # MET cut (basic cut)
    fb = fb[fb['n_jet_central'] <= 3] # n_jet_central cut (basic cut)

    mt_tmp = np.sqrt(2 * fb['met_tst_et'] * ak.firsts(fb['ph_pt']) * 
                            (1 - np.cos(fb['met_tst_phi'] - ak.firsts(fb['ph_phi'])))) / 1000
    mask1 = mt_tmp > 100
    mask2 = mt_tmp < 140 
    fb = fb[mask1 * mask2]

    fb = fb[fb['VertexBDTScore'] > 0.1]

    print_cut(ntuple_name, fb, 'after basic cut')

    test(fb) # check for none value

    print(f"Reading Time for {ntuple_name}: {(time.time()-start_time)} seconds\n")


    tot.append(fb)

    fb = 0
    fb_BDT = 0
    tmp = 0


processing file:  /data/tmathew/ntups/mc23d/ggHyyd_y.root
Unweighted Events before cut:  86910
Weighted Events before cut:  8732.987955115426
Unweighted Events after basic cut:  2646
Weighted Events after basic cut:  268.2449529933159
Number of none values:  0
Reading Time for ggHyyd: 3.7210183143615723 seconds

processing file:  /data/tmathew/ntups/mc23d/Zjets_y.root
Unweighted Events before cut:  3242488
Weighted Events before cut:  676616.903247458
Unweighted Events after basic cut:  383
Weighted Events after basic cut:  16.82812304884456
Number of none values:  0
Reading Time for Zjets: 101.74367547035217 seconds

processing file:  /data/tmathew/ntups/mc23d/Zgamma_y.root
Unweighted Events before cut:  3423357
Weighted Events before cut:  249851.55031619867
Unweighted Events after basic cut:  19491
Weighted Events after basic cut:  1002.7013259970018
Number of none values:  0
Reading Time for Zgamma: 39.45332479476929 seconds

processing file:  /data/tmathew/ntups/mc23d/Wgamma_y.roo

In [6]:
signal_name = 'ggHyyd'  # Define signal dataset
cut_name = 'basic'

def getCutDict(): # same cut as the internal note
    cut_dict = {}
    cut_dict['dmet'] = {
        'lowercut': np.arange(-30000, 10000 + 1000, 1000), # dmet > cut
    }
    cut_dict['metsig'] = {
        'lowercut': np.arange(0, 10 + 1, 1), # metsig > cut
    }
    cut_dict['dphi_met_phterm'] = {
        'lowercut': np.arange(1, 2 + 0.01, 0.01), # dphi_met_phterm > cut
    }
    cut_dict['dphi_met_jetterm'] = {
        'uppercut': np.arange(0.5, 1, 0.01), # dphi_met_jetterm < cut
    }
    cut_dict['ph_eta'] = {
        'uppercut': np.arange(1, 2.5 + 0.01, 0.01), # ph_eta < cut
    }
    cut_dict['dphi_jj'] = {
        'uppercut': np.arange(1, 3.14 + 0.01, 0.01) # dphi_jj < cut
    }
    return cut_dict
cut_config = getCutDict()

"    \ndef getCutDict():\n    cut_dict = {}\n    # Reduced Features\n    # cut_dict['balance'] = {\n    #     'lowercut': np.arange(0, 1.5 + 0.01, 0.01), # balance > cut\n    #     'uppercut': np.arange(1.5, 9, 0.05) # balance < cut\n    # }\n    cut_dict['dmet'] = {\n        'lowercut': np.arange(-30000, 10000 + 100, 100), # dmet > cut\n        'uppercut': np.arange(10000, 100000 + 100, 100), # -10000 < dmet < cut\n    }\n    cut_dict['metsig'] = {\n        'lowercut': np.arange(0, 10 + 1, 1), # metsig > cut\n        'uppercut': np.arange(10, 30 + 1, 1), # metsig < cut \n    }\n    # cut_dict['jetterm'] = {\n    #     'lowercut': np.arange(0, 150000+500, 500) # jetterm > cut\n    # }\n    cut_dict['dphi_met_phterm'] = {\n        'lowercut': np.arange(1, 2 + 0.01, 0.01), # dphi_met_phterm > cut\n    }\n    #determine which one is better dphi_met_central_jet or dphi_met_jetterm\n    cut_dict['dphi_met_central_jet'] = {\n        'lowercut': np.arange(1.5, 2.8, 0.01)\n    }\n    cut_dict[

In [8]:
%%time
signal_name='ggHyyd'
initial_cut = []
tot2 = tot

# < -- Initial Cut on all variables (maximize the significance * acceptance) -- > 
for cut_var, cut_types in cut_config.items():
    for cut_type, cut_values in cut_types.items():
        sig_simple_list, sigacc_simple_list, acceptance_values = calculate_significance(
            cut_var, cut_type, cut_values, tot2, ntuple_names, signal_name, getVarDict, getWeight
        )

        best_cut, best_sig, idx = get_best_cut(cut_values, sigacc_simple_list) 
        
        if idx == 0 or idx == len(sigacc_simple_list) - 1: # I chose to use index to indicate not to make unnecessary cut (for initial cut)
            print(cut_var, idx, len(sigacc_simple_list))
            continue
            
        result = {
            "cut_var": cut_var,
            "cut_type": cut_type,
            "best_cut": best_cut,
            "best_sig_x_acc": best_sig,
            "significance": sig_simple_list[idx],
            "acceptance": acceptance_values[idx]
        }

        print(result)
        initial_cut.append(dict(list(result.items())[:3]))

{'cut_var': 'dmet', 'cut_type': 'lowercut', 'best_cut': -20000, 'best_sig_x_acc': 1.8159952804441448, 'significance': 1.8311985743004842, 'acceptance': 99.1697626860513}
{'cut_var': 'metsig', 'cut_type': 'lowercut', 'best_cut': 6, 'best_sig_x_acc': 2.3725553020181507, 'significance': 2.5828703809512277, 'acceptance': 91.85731190832652}
{'cut_var': 'dphi_met_phterm', 'cut_type': 'lowercut', 'best_cut': 1.03, 'best_sig_x_acc': 1.7935629035994418, 'significance': 1.8177016513714308, 'acceptance': 98.67201816349913}
{'cut_var': 'dphi_met_jetterm', 'cut_type': 'uppercut', 'best_cut': 0.8800000000000003, 'best_sig_x_acc': 1.7517008628820963, 'significance': 1.751699790657196, 'acceptance': 100.00006121053995}
{'cut_var': 'ph_eta', 'cut_type': 'uppercut', 'best_cut': 2.3500000000000014, 'best_sig_x_acc': 1.7530576561309936, 'significance': 1.7558692160199285, 'acceptance': 99.83987646327623}
{'cut_var': 'dphi_jj', 'cut_type': 'uppercut', 'best_cut': 3.120000000000002, 'best_sig_x_acc': 1.7521

In [9]:
tot2_initial_cut = apply_all_cuts(tot2, ntuple_names, initial_cut, getVarDict)
final_significance = compute_total_significance(tot2_initial_cut, ntuple_names, signal_name, getVarDict, getWeight)
print('after initial cutting, signficance: ', final_significance)

after initial cutting, signficance:  2.696286217643497


In [10]:
%%time
# < -- n-1 iterations until no further improvement (max significance) -- >
optimized_cuts, final_significance = n_minus_1_optimizer(
    initial_cut, cut_config, tot2, ntuple_names, signal_name, getVarDict, getWeight, final_significance
)
print('after optimized cutting, signficance: ', final_significance)



--- Iteration 1 ---
Updating dmet (lowercut): -20000 → -7000  (sig 2.70 → 2.70)
Updating metsig (lowercut): 6 → 7  (sig 2.70 → 2.82)
Updating dphi_met_phterm (lowercut): 1.03 → 1.2000000000000002  (sig 2.82 → 2.88)
Updating dphi_met_jetterm (uppercut): 0.8800000000000003 → 0.7300000000000002  (sig 2.88 → 2.88)
Updating ph_eta (uppercut): 2.3500000000000014 → 1.7400000000000007  (sig 2.88 → 3.06)
Updating dphi_jj (uppercut): 3.120000000000002 → 2.370000000000001  (sig 3.06 → 3.09)

--- Iteration 2 ---
Updating dmet (lowercut): -7000 → -21000  (sig 3.09 → 3.11)
Updating dphi_met_phterm (lowercut): 1.2000000000000002 → 1.2300000000000002  (sig 3.11 → 3.11)

--- Iteration 3 ---
 optimized cuts, end of iteration 
after optimized cutting, signficance:  3.106431940154422
CPU times: user 5min 36s, sys: 1.01 s, total: 5min 37s
Wall time: 5min 41s


In [11]:
print( ' < -- Final Optimized Cuts -- > ')
# print(optimized_cuts)

for cut in optimized_cuts:
    var = cut['cut_var']
    val = cut['best_cut']
    if cut['cut_type'] == 'uppercut':
        print(f"{var} < {val}")
    elif cut['cut_type'] == 'lowercut':
        print(f"{var} > {val}")
        
print('after optimized cutting, signficance: ', final_significance)


 < -- Final Optimized Cuts -- > 
dmet > -21000
metsig > 7
dphi_met_phterm > 1.2300000000000002
dphi_met_jetterm < 0.7300000000000002
ph_eta < 1.7400000000000007
dphi_jj < 2.370000000000001
after optimized cutting, signficance:  3.106431940154422
