In [1]:
# import modules
import uproot, sys, time, math, pickle, os, ROOT
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import awkward as ak
from tqdm import tqdm
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
from scipy.special import betainc
from scipy.stats import norm
from pathlib import Path

# import config functions
sys.path.append('/home/jlai/dark_photon/code/config')
from plot_config import getWeight, zbi, sample_dict, getVarDict
from plot_var import variables, variables_mc, ntuple_names
from n_1_iteration_functions import get_best_cut, calculate_significance, apply_cut_to_fb, apply_all_cuts, compute_total_significance, n_minus_1_optimizer
from perf_sig_plot import plot_performance, plot_significance, plot_n_1, calculate_significance2

# Set up plot defaults
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 14.0,10.0  # Roughly 11 cm wde by 8 cm high  
mpl.rcParams['font.size'] = 20.0 # Use 14 point font
sns.set(style="whitegrid")

font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14,
    "title": 20
}

plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"],  # Legend
    "axes.titlesize": font_size["title"] # Title
})


tot = []
signal_name = 'ggHyyd'
data = pd.DataFrame()

def test(fb):
    # checking if there are any none values
    mask = ak.is_none(fb['met_tst_et'])
    n_none = ak.sum(mask)
    print("Number of none values: ", n_none)
    # if n_none > 0:
    #     fb = fb[~mask]
    # print("Events after removing none values: ", len(fb), ak.sum(ak.is_none(fb['met_tst_et'])))

def print_cut(ntuple_name, fb, label):
    print(f"{ntuple_name} Unweighted Events {label}: ", len(fb))
    print(f"{ntuple_name} Weighted Events {label}: ", sum(getWeight(fb, ntuple_name)))
        
for i in range(len(ntuple_names)):
    start_time = time.time()
    ntuple_name = ntuple_names[i]
    path = f"/data/fpiazza/ggHyyd/NtuplesWithBDTSkim/{ntuple_name}_nominal_bdt.root"
    f = uproot.open(path)['nominal']
    if ntuple_name.startswith("mc"):
        fb = f.arrays(variables_mc, library='ak')
        print_cut(ntuple_name, fb, 'before cut')
        
        fb = fb[ak.num(fb['ph_eta']) > 0]     # for abs(ak.firsts(fb['ph_eta'])) to have value to the reweighting
        fb = fb[fb['n_ph'] == 1]
        fb = fb[fb['n_el_baseline'] == 0]

        # goodPV on signal only
        if ntuple_name == 'ggHyyd':
            fb = fb[ak.num(fb['pv_z']) > 0]
            good_pv_tmp = (np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5)
            fb = fb[good_pv_tmp]
            
        
    if (ntuple_name == "data23_y") or (ntuple_name == "data24_y"):  # jet-faking 
        fb = f.arrays(variables, library='ak')
        print_cut(ntuple_name, fb, 'before cut')

        fb = fb[ak.num(fb['ph_eta']) > 0]
        mask1 = (ak.firsts(fb['ph_topoetcone40'])-2450.)/ak.firsts(fb['ph_pt']) > 0.1   # jet_faking_photon cut
        fb = fb[mask1]
        fb = fb[fb['n_ph_baseline'] == 1]
        fb = fb[fb['n_el_baseline'] == 0]


    if (ntuple_name == "data23_eprobe") or (ntuple_name == "data24_eprobe"): # electron-faking
        fb = f.arrays(variables, library='ak')
        print_cut(ntuple_name, fb, 'before cut')
        
        fb = fb[fb['n_el'] == 1]
        fb = fb[fb['n_ph_baseline'] == 0]

        # using electron info for photon info
        fb['ph_pt'] = fb['el_pt']
        fb['ph_eta'] = fb['el_eta']
        fb['ph_phi'] = fb['el_phi']
        fb['dphi_met_phterm'] = fb['dphi_met_eleterm']  

    fb = fb[ak.num(fb['ph_pt']) > 0] # prevent none values in Tbranch
    fb = fb[ak.firsts(fb['ph_pt']) >= 50000] # ph_pt cut (basic cut)
    fb = fb[fb['n_mu_baseline'] == 0]
    fb = fb[fb['n_tau_baseline'] == 0]
    fb = fb[fb['trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M']==1]
    fb = fb[fb['met_tst_et'] >= 100000] # MET cut (basic cut)
    fb = fb[fb['n_jet_central'] <= 3] # n_jet_central cut (basic cut)
    
    fb['VertexBDTScore'] = fb['BDTScore'] # renaming BDTScore to ensure this is recognized as Vertex BDT Score
    # fb = fb[fb['VertexBDTScore'] > 0.1]
    
    mt_tmp = np.sqrt(2 * fb['met_tst_et'] * ak.firsts(fb['ph_pt']) * 
                    (1 - np.cos(fb['met_tst_phi'] - ak.firsts(fb['ph_phi'])))) / 1000
    mask1 = mt_tmp > 100
    mask2 = mt_tmp < 140
    fb = fb[mask1 * mask2]

    # ------ Adjustment --------
    fb['weights'] = getWeight(fb, ntuple_name)
    
    dphi_met_jetterm_tmp = fb['dphi_met_jetterm']
    cond = ak.fill_none(dphi_met_jetterm_tmp == -10, False)
    fb['dphi_met_jetterm'] = ak.where(cond, -999, dphi_met_jetterm_tmp)

    fb['dphi_met_phterm'] = np.arccos(np.cos(fb['dphi_met_phterm']))

    print_cut(ntuple_name, fb, 'after basic')

    test(fb) # check for none value

    print(f"Reading Time for {ntuple_name}: {(time.time()-start_time)} seconds\n")

    tot.append(fb)

    del fb 

# combining 23d + 23e {Zgamma (1, 6), Wgamma (2, 7), gammajet_direct (3, 8)}
# combining 2023 + 2024 {data_y (4, 9), data_eprobe (5, 10)}
tot_tmp = tot
tot = []
for i in tqdm(range(6)):
    tot.append(ak.concatenate([tot_tmp[i], tot_tmp[i+6]]))
ntuple_names = ["ggHyyd", "Zgamma", "Wgamma", "gammajet_direct", "data_y", "data_eprobe"]
del tot_tmp


mc23d_ggHyyd_y Unweighted Events before cut:  17999
mc23d_ggHyyd_y Weighted Events before cut:  344.07425718325356
mc23d_ggHyyd_y Unweighted Events after basic:  2998
mc23d_ggHyyd_y Weighted Events after basic:  58.68194395390023
Number of none values:  0
Reading Time for mc23d_ggHyyd_y: 0.8808169364929199 seconds

mc23d_Zgamma_y Unweighted Events before cut:  2520609
mc23d_Zgamma_y Weighted Events before cut:  15697.116266766878
mc23d_Zgamma_y Unweighted Events after basic:  21427
mc23d_Zgamma_y Weighted Events after basic:  222.79132641446043
Number of none values:  0
Reading Time for mc23d_Zgamma_y: 25.681639671325684 seconds

mc23d_Wgamma_y Unweighted Events before cut:  685525
mc23d_Wgamma_y Weighted Events before cut:  16946.649253377054
mc23d_Wgamma_y Unweighted Events after basic:  15128
mc23d_Wgamma_y Weighted Events after basic:  451.9858106707009
Number of none values:  0
Reading Time for mc23d_Wgamma_y: 5.942091226577759 seconds

mc23d_gammajet_direct_y Unweighted Events be

: 