In [9]:
# import modules
import uproot, sys, time, math, pickle, os, csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import awkward as ak
from tqdm import tqdm
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
from scipy.special import betainc
from scipy.stats import norm
from datetime import datetime

# import config functions
from jet_faking_plot_config import getWeight, zbi, sample_dict, getVarDict
from plot_var import variables, variables_data, ntuple_names, ntuple_names_BDT

# Set up plot defaults
import matplotlib as mpl

# Apply the seaborn theme FIRST: sns.set() rewrites font-related rcParams
# (font.size, label/tick/legend sizes), so any override made before it is lost.
sns.set(style="whitegrid")

mpl.rcParams['figure.figsize'] = 14.0,10.0  # default figure size: 14 in wide by 10 in high
mpl.rcParams['font.size'] = 20.0 # base font size: 20 pt

# Per-element font sizes (points) applied on top of the seaborn theme.
font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14
}

plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"]  # Legend
})

In [10]:
# -------- CONFIG --------
# Timestamp tag: every execution of this cell logs into its own directory.
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
LOG_DIR = f"./cutlogs_{RUN_TAG}"
os.makedirs(LOG_DIR, exist_ok=True)

# Text and CSV cutflow logs live side by side inside LOG_DIR.
TXT_LOG, CSV_LOG = (
    os.path.join(LOG_DIR, leaf) for leaf in ("cutflow.log", "cutflow.csv")
)

# Samples looped over by the cutflow below (rebinds the name imported from plot_var).
ntuple_names = [
    'ggHyyd',
    'Zjets',
    'Zgamma',
    'Wgamma',
    'Wjets',
    'gammajet_direct',
    'data23',
]

def weight_sum(fb, ntuple_name):
    """Return the summed event weight of `fb` as a plain Python float.

    Weights come from `getWeight`; the 'data23' sample is the only one for
    which `jet_faking=True` is passed.
    """
    extra_kwargs = {"jet_faking": True} if ntuple_name == 'data23' else {}
    weights = getWeight(fb, ntuple_name, **extra_kwargs)
    return float(np.sum(weights))

# ---- logging helpers ----
class CutLogger:
    """Appends cutflow records to a plain-text log and to a CSV file."""

    def __init__(self, txt_path, csv_path):
        """Store both paths and prepare the two log files.

        The CSV file is created with a header row only when it does not exist
        yet. The text log is opened in append mode and receives a banner line
        carrying the module-level RUN_TAG.
        """
        self.txt_path, self.csv_path = txt_path, csv_path
        csv_missing = not os.path.exists(csv_path)
        if csv_missing:
            with open(csv_path, "w", newline="") as csv_file:
                csv.writer(csv_file).writerow(
                    ["sample","step_idx","step","events","weighted","elapsed_s"]
                )
        # run banner (appended, so earlier content of the text log is kept)
        with open(txt_path, "a") as txt_file:
            txt_file.write(f"\n==== Cutflow run {RUN_TAG} ====\n")

    def write(self, sample, step_idx, step, events, weighted, elapsed):
        """Append one cutflow step to the text log, then to the CSV file.

        `events` must be an integer (formatted with `d`); `weighted` and
        `elapsed` are formatted as floats.
        """
        # human-readable, fixed-width line
        with open(self.txt_path, "a") as txt_file:
            txt_file.write(
                f"[{sample:12s}] {step_idx:02d}  {step:30s}  "
                f"events={events:8d}  weighted={weighted:.6g}  dt={elapsed:.3f}s\n"
            )
        # machine-readable row (higher precision than the text line)
        with open(self.csv_path, "a", newline="") as csv_file:
            csv.writer(csv_file).writerow(
                [sample, step_idx, step, int(events), f"{weighted:.12g}", f"{elapsed:.6f}"]
            )

logger = CutLogger(TXT_LOG, CSV_LOG)

def log_step(sample, step_idx, step_label, fb, t0):
    """Log one cutflow step for `sample` through the module-level `logger`.

    Records the number of entries in `fb`, their summed weight (via
    `weight_sum`) and the wall-clock seconds elapsed since `t0`.
    """
    n_events, weighted_total = len(fb), weight_sum(fb, sample)
    elapsed = time.time() - t0
    logger.write(sample, step_idx, step_label, n_events, weighted_total, elapsed)

def require(mask, name):
    """Return `mask` unchanged if it is a NumPy or Awkward array, else raise.

    Only the container type is checked (plus that `ak.num(mask, axis=0)` yields
    a value); the mask length is NOT compared against the array it will index.

    Raises:
        RuntimeError: if `mask` is neither `np.ndarray` nor `ak.Array`; `name`
            is included in the message to identify the offending cut.
    """
    is_array = isinstance(mask, (np.ndarray, ak.Array))
    if not is_array or ak.num(mask, axis=0) is None:
        raise RuntimeError(f"Mask '{name}' has wrong shape/type: {type(mask)}")
    return mask

# ---- your loop with logging ----
# Directory holding the ntuples that already carry the BDTScore branch.
NTUPLE_DIR = "/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT"
TRIGGER_BRANCH = 'trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M'


def _mt_scaled(arr):
    """mT of the leading photon and MET, divided by 1000 (the cut below labels the result in GeV)."""
    lead_pt = ak.firsts(arr['ph_pt'])
    dphi = arr['met_tst_phi'] - ak.firsts(arr['ph_phi'])
    return np.sqrt(2 * arr['met_tst_et'] * lead_pt * (1 - np.cos(dphi))) / 1000.0


# Cuts applied to every sample, in order. Each entry is (log label, mask builder);
# the builder is evaluated on the array as filtered by all previous cuts.
# NOTE: If 'ggHyyd' is signal without a prompt μ, consider not requiring n_mu==1 for that sample.
_SHARED_CUTS = [
    ("n_mu==1",           lambda arr: arr['n_mu'] == 1),
    ("n_el_baseline==0",  lambda arr: arr['n_el_baseline'] == 0),
    ("n_tau_baseline==0", lambda arr: arr['n_tau_baseline'] == 0),
    ("trigger==1",        lambda arr: arr[TRIGGER_BRANCH] == 1),
    # must precede the pT cut so ak.firsts() never yields None
    ("ph_pt exists",      lambda arr: ak.num(arr['ph_pt']) > 0),
    ("ph_pt>=50GeV",      lambda arr: ak.firsts(arr['ph_pt']) >= 50_000),
    ("MET>=100GeV",       lambda arr: arr['met_tst_et'] >= 100_000),
    ("n_jet_central<=3",  lambda arr: arr['n_jet_central'] <= 3),
    ("mT>=100GeV",        lambda arr: _mt_scaled(arr) >= 100),
]

for ntuple_name in ntuple_names:
    start_time = time.time()
    step = 0

    if ntuple_name == 'data23':
        path = f"{NTUPLE_DIR}/data23_uy_BDT_score.root"
        # Arrays are read eagerly, so the file can be closed right after loading.
        with uproot.open(path) as root_file:
            fb = root_file['nominal'].arrays(variables_data, library="ak")
        fb['VertexBDTScore'] = fb['BDTScore']

        log_step(ntuple_name, step, "loaded", fb, start_time); step += 1

        # ensure photon arrays exist for reweighting usage downstream
        fb = fb[ak.num(fb['ph_eta']) > 0]
        log_step(ntuple_name, step, "has>=1 photon", fb, start_time); step += 1

        # jet-faking-photon cut (data control)
        mask = (ak.firsts(fb['ph_topoetcone40']) - 2450.)/ak.firsts(fb['ph_pt']) > 0.1
        fb = fb[require(mask, "jetfake")]
        log_step(ntuple_name, step, "jet_faking_photon", fb, start_time); step += 1

        fb = fb[fb['n_ph_baseline'] == 1]
        log_step(ntuple_name, step, "n_ph_baseline==1", fb, start_time); step += 1

    else:
        path = f"{NTUPLE_DIR}/mc23d_{ntuple_name}_uy_BDT_score.root"
        # The BDT score lives in the same file and tree, so open it only once
        # and close it as soon as both reads are done.
        with uproot.open(path) as root_file:
            tree = root_file['nominal']
            fb = tree.arrays(variables, library="ak")
            fb_BDT = tree.arrays(["event", "BDTScore"], library="ak")

        # attach the BDT score only if both reads are aligned event by event
        if np.all(fb["event"] == fb_BDT["event"]):
            fb["VertexBDTScore"] = fb_BDT["BDTScore"]
        else:
            print(f"[WARN] Event mismatch in {ntuple_name}; BDT not attached")

        log_step(ntuple_name, step, "loaded", fb, start_time); step += 1

        fb = fb[ak.num(fb['ph_eta']) > 0]
        log_step(ntuple_name, step, "has>=1 photon", fb, start_time); step += 1

        fb = fb[fb['n_ph'] == 1]
        log_step(ntuple_name, step, "n_ph==1", fb, start_time); step += 1

        if ntuple_name in ("Zjets","Wjets"):
            mask = ak.firsts(fb['ph_truth_type']) == 2   # keep e->gamma only
            fb = fb[require(mask, "ph_truth_type==2")]
            log_step(ntuple_name, step, "truth e->gamma", fb, start_time); step += 1

        if ntuple_name == "ggHyyd":
            fb = fb[ak.num(fb['pv_z']) > 0]
            log_step(ntuple_name, step, "pv_z exists", fb, start_time); step += 1
            good_pv = (np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5)
            fb = fb[require(good_pv, "goodPV")]
            log_step(ntuple_name, step, "goodPV", fb, start_time); step += 1

    # --------- BASIC CUTS (shared) ----------
    for cut_label, build_mask in _SHARED_CUTS:
        fb = fb[build_mask(fb)]
        log_step(ntuple_name, step, cut_label, fb, start_time); step += 1

    # ---- sanity check for None ----
    n_none = int(ak.sum(ak.is_none(fb['met_tst_et'])))
    with open(TXT_LOG, "a") as ftxt:
        ftxt.write(f"[{ntuple_name:12s}] None-check met_tst_et: {n_none}\n")

    # optional: free memory
    del fb

print(f"\nLogs written to:\n - {TXT_LOG}\n - {CSV_LOG}\n")


Logs written to:
 - ./cutlogs_20250915_153910/cutflow.log
 - ./cutlogs_20250915_153910/cutflow.csv



In [3]:
# ---- Old code ---- (saving tot & fb)
%%time
tot = []
data = pd.DataFrame()
unweighted_bcut, weighted_bcut, unweighted_acut, weighted_acut = [], [], [], []
ntuple_names = ['ggHyyd','Zjets','Zgamma','Wgamma','Wjets','gammajet_direct', 'data23']

def test(fb):
    """Print how many entries of fb['met_tst_et'] are None (nothing is removed)."""
    none_count = ak.sum(ak.is_none(fb['met_tst_et']))
    print("Number of none values: ", none_count)

def print_cut(ntuple_name, fb, label):
    """Print the unweighted and weighted event counts of `fb`, tagged with `label`.

    Weights come from `getWeight`; `jet_faking=True` is passed only for 'data23'.
    """
    print(f"Unweighted Events {label}: ", len(fb))
    weights = (
        getWeight(fb, ntuple_name, jet_faking=True)
        if ntuple_name == 'data23'
        else getWeight(fb, ntuple_name)
    )
    print(f"Weighted Events {label}: ", sum(weights))

for ntuple_name in ntuple_names:
    ucut, wcut = [], []
    start_time = time.time()

    # data23 must be weighted with jet_faking=True, exactly as print_cut does;
    # without it the yields stored in wcut disagree with the printed ones.
    weight_kwargs = {'jet_faking': True} if ntuple_name == 'data23' else {}

    if ntuple_name == 'data23': # data
        path = f"/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/data23_uy_BDT_score.root"
        print('processing file: ', path)
        # arrays are read eagerly, so the file is closed right after loading
        with uproot.open(path) as root_file:
            fb = root_file['nominal'].arrays(variables_data, library="ak")
        fb['VertexBDTScore'] = fb['BDTScore'] # renaming BDTScore to ensure this is recognized as Vertex BDT Score

        fb = fb[ak.num(fb['ph_eta']) > 0]     # for abs(ak.firsts(fb['ph_eta'])) to have value to the reweighting

        jet_fake_mask = (ak.firsts(fb['ph_topoetcone40'])-2450.)/ak.firsts(fb['ph_pt']) > 0.1   # jet_faking_photon cut
        fb = fb[jet_fake_mask]
        fb = fb[fb['n_ph_baseline'] == 1]

    else: # MC
        # The BDT score is stored in the same file and tree as the other
        # branches, so a single open serves both reads.
        path = f"/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_{ntuple_name}_uy_BDT_score.root"
        print('processing file: ', path)
        with uproot.open(path) as root_file:
            tree = root_file['nominal']
            fb = tree.arrays(variables, library="ak")
            fb_BDT = tree.arrays(["event", "BDTScore"], library="ak")

        # add BDT score to fb (only when both reads line up event by event)
        if np.all(fb["event"] == fb_BDT["event"]):
            fb["VertexBDTScore"] = fb_BDT["BDTScore"]
        else:
            print("Something is wrong, need arranging")

        fb = fb[ak.num(fb['ph_eta']) > 0]     # for abs(ak.firsts(fb['ph_eta'])) to have value to the reweighting
        fb = fb[fb['n_ph'] == 1]

        # Zjets and Wjets (rule out everything except for e->gamma)
        if ntuple_name == 'Zjets' or ntuple_name == 'Wjets':
            fb = fb[ak.firsts(fb['ph_truth_type']) == 2]

        # goodPV on signal only
        if ntuple_name == 'ggHyyd':
            fb = fb[ak.num(fb['pv_z']) > 0]
            good_pv_tmp = (np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5)
            fb = fb[good_pv_tmp]

    print_cut(ntuple_name, fb, 'before cut')
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))

    # wcut receives one weighted yield after each cut below (8 entries in total,
    # counting the 'before cut' entry above).
    fb = fb[fb['n_mu'] == 1]
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[fb['n_el_baseline'] == 0]
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[fb['n_tau_baseline'] == 0]
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[fb['trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M']==1]
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[ak.num(fb['ph_pt']) > 0] # prevent none values in Tbranch
    fb = fb[ak.firsts(fb['ph_pt']) >= 50000] # ph_pt cut (basic cut)
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[fb['met_tst_et'] >= 100000] # MET cut (basic cut)
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))
    fb = fb[fb['n_jet_central'] <= 3] # n_jet_central cut (basic cut)
    wcut.append(sum(getWeight(fb, ntuple_name, **weight_kwargs)))

    # mT cut (leading photon + MET); its yield is deliberately not appended to
    # wcut, matching the original bookkeeping.
    mt_tmp = np.sqrt(2 * fb['met_tst_et'] * ak.firsts(fb['ph_pt']) *
                            (1 - np.cos(fb['met_tst_phi'] - ak.firsts(fb['ph_phi'])))) / 1000
    fb = fb[mt_tmp >= 100]

    print_cut(ntuple_name, fb, 'after basic cut')

    ucut.append(len(fb))

    unweighted_acut.append(ucut)
    weighted_acut.append(wcut)
    test(fb) # check for none value

    print(f"Reading Time for {ntuple_name}: {(time.time()-start_time)} seconds\n")

    tot.append(fb)

    # drop the loop-local references (the filtered array stays alive inside tot)
    fb = 0
    fb_BDT = 0


processing file:  /data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_ggHyyd_uy_BDT_score.root
Unweighted Events before cut:  15
Weighted Events before cut:  1.4966255836188793
Unweighted Events after basic cut:  0
Weighted Events after basic cut:  0
Number of none values:  0
Reading Time for ggHyyd: 0.5149648189544678 seconds

processing file:  /data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_Zjets_uy_BDT_score.root
Unweighted Events before cut:  13026
Weighted Events before cut:  689.1255711805002
Unweighted Events after basic cut:  206
Weighted Events after basic cut:  20.777934792884153
Number of none values:  0
Reading Time for Zjets: 2.9116899967193604 seconds

processing file:  /data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_Zgamma_uy_BDT_score.root


KeyboardInterrupt: 

In [16]:
# Load the ggHyyd signal sample, including the BDTScore and event branches.
ntuple_name = "ggHyyd"
path = "/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_ggHyyd_uy_BDT_score.root"

print("Opening:", path)
# Arrays are read eagerly, so the file is closed as soon as the block exits.
with uproot.open(path) as root_file:
    fb = root_file["nominal"].arrays(variables + ['BDTScore', 'event'], library="ak")
fb["VertexBDTScore"] = fb["BDTScore"]  # same values under the name used elsewhere in the notebook
print("Initial events:", len(fb))

fb

Opening: /data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_ggHyyd_uy_BDT_score.root
Initial events: 37


In [12]:
# Step-by-step unweighted cutflow for the ggHyyd signal sample.
ntuple_name = "ggHyyd"
path = "/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_ggHyyd_uy_BDT_score.root"

print("Opening:", path)
# Arrays are read eagerly, so the file is closed as soon as the block exits.
with uproot.open(path) as root_file:
    fb = root_file["nominal"].arrays(variables + ['BDTScore', 'event'], library="ak")
fb["VertexBDTScore"] = fb["BDTScore"]
print("Initial events:", len(fb))

def show(label):
    """Print `label` followed by the current length of the global `fb`."""
    print(f"{label}: {len(fb)}")

fb = fb[ak.num(fb['ph_eta']) > 0]
show("has >=1 photon")

fb = fb[fb['n_ph'] == 1]
show("n_ph==1")

# Zjets/Wjets electron->gamma filter (skip for ggHyyd)

# goodPV cut: require a pv_z entry, then |pv_truth_z - pv_z| <= 0.5
fb = fb[ak.num(fb['pv_z']) > 0]
good_pv = np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5
fb = fb[good_pv]
show("good PV")

# n_mu==1
fb = fb[fb['n_mu'] == 1]
show("n_mu==1")

# n_el_baseline==0
fb = fb[fb['n_el_baseline'] == 0]
show("n_el_baseline==0")

# n_tau_baseline==0
fb = fb[fb['n_tau_baseline'] == 0]
show("n_tau_baseline==0")

# trigger
fb = fb[fb['trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M']==1]
show("trigger")

# photon pt >= 50 GeV (drop events without a ph_pt entry first so ak.firsts is never None)
fb = fb[ak.num(fb['ph_pt'])>0]
fb = fb[ak.firsts(fb['ph_pt']) >= 50000]
show("ph_pt>50")

# MET >= 100 GeV
fb = fb[fb['met_tst_et'] >= 100000]
show("met_tst_et>100GeV")

# n_jet_central <= 3
fb = fb[fb['n_jet_central'] <= 3]
show("n_jet_central<=3")

# mT cut built from the leading photon and MET (this is not a trigger requirement)
mt = np.sqrt(
    2*fb['met_tst_et']*ak.firsts(fb['ph_pt']) *
    (1 - np.cos(fb['met_tst_phi'] - ak.firsts(fb['ph_phi'])))
) / 1000
fb = fb[mt >= 100]
show("mT>=100")

print("Final surviving events:", len(fb))


Opening: /data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_ggHyyd_uy_BDT_score.root
Initial events: 37
has >=1 photon: 37
n_ph==1: 34
good PV: 15
n_mu==1: 15
n_el_baseline==0: 15
n_tau_baseline==0: 15
trigger: 4
ph_pt>50: 4
met_tst_et>100GeV: 1
n_jet_central<=3: 0
mT>=100: 0
Final surviving events: 0


In [7]:
# One list per processed sample (filled by the old-code loop): the summed weight
# before the basic cuts, then after each cut whose yield that loop appends to wcut.
weighted_acut

[[1.4966255836188793,
  1.4966255836188793,
  1.4966255836188793,
  1.4966255836188793,
  0.33959832414984703,
  0.33959832414984703,
  0.036512408405542374,
  0],
 [689.1255711805002,
  689.1255711805002,
  689.1255711805002,
  671.3638537789964,
  68.03742441914017,
  67.67209671909893,
  29.396469421110655,
  26.75008220052554]]