In [1]:
# import modules
import uproot, sys, time, math, pickle, os, csv, shutil
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import awkward as ak
from tqdm import tqdm
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
from scipy.special import betainc
from scipy.stats import norm
from datetime import datetime
from pathlib import Path

# import config functions
sys.path.append("/home/jlai/jet-faking/config")
from jet_faking_plot_config import getWeight, zbi, sample_dict, getVarDict
from plot_var import variables, variables_data, ntuple_names, ntuple_names_BDT

# Set up plot defaults
import matplotlib as mpl
# Apply the seaborn theme FIRST: sns.set() installs seaborn's style/context
# rcParams (including font sizes), so running it after the explicit settings
# below would silently overwrite them.
sns.set(style="whitegrid")

mpl.rcParams['figure.figsize'] = 14.0, 10.0  # default figure size in inches (width, height)
mpl.rcParams['font.size'] = 20.0  # base font size in points

# Point sizes for the individual text elements of a plot.
font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14
}

# Note: "axes.labelsize" is a single rcParam shared by both axes, so only the
# "xlabel" entry is actually applied to the axis labels.
plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"]  # Legend
})

In [2]:
# -------- CONFIG --------
# Samples processed by the cutflow loop, in processing order.
ntuple_names = ['ggHyyd','Zjets','Zgamma','Wgamma','Wjets','gammajet_direct', 'data23']

# Timestamp identifying this run; it is written into the text log header.
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fixed output directory. Alternative (one directory per run):
# LOG_DIR = f"./cutlogs_{RUN_TAG}"
LOG_DIR = "./cutlogs_135_internal"
TXT_LOG, CSV_LOG = (
    os.path.join(LOG_DIR, log_name) for log_name in ("cutflow.log", "cutflow.csv")
)

# Always start from an empty log directory: drop any previous one, then
# recreate it (exist_ok=False makes a failed cleanup visible immediately).
try:
    shutil.rmtree(LOG_DIR)
except FileNotFoundError:
    pass  # nothing to clean up
os.makedirs(LOG_DIR, exist_ok=False)

def weight_sum(fb, ntuple_name):
    """Return the summed event weight of `fb` as a plain Python float.

    The per-event weights come from `getWeight(fb, ntuple_name)`; for the
    'data23' sample the call additionally passes `jet_faking=True`.
    """
    extra_kwargs = {"jet_faking": True} if ntuple_name == 'data23' else {}
    weights = getWeight(fb, ntuple_name, **extra_kwargs)
    return float(np.sum(weights))

# ---- logging helpers ----
class CutLogger:
    """Append cutflow records to a human-readable text log and a CSV log."""

    def __init__(self, txt_path, csv_path):
        """Remember both paths, create the CSV (with header) if it is missing,
        and append a run header containing RUN_TAG to the text log."""
        self.txt_path = txt_path
        self.csv_path = csv_path

        # Only a brand-new CSV gets the column header; an existing one is left alone.
        if not os.path.exists(csv_path):
            with open(csv_path, "w", newline="") as csv_file:
                csv.writer(csv_file).writerow(
                    ["sample","step_idx","step","events","weighted","elapsed_s"]
                )

        # The text log is opened in append mode, so earlier content is preserved.
        with open(txt_path, "a") as txt_file:
            txt_file.write(f"\n==== Cutflow run {RUN_TAG} ====\n")

    def write(self, sample, step_idx, step, events, weighted, elapsed):
        """Append one cutflow step to both logs.

        Parameters: sample name, integer step index, step label, event count,
        weighted yield, and elapsed time in seconds.
        """
        text_line = (
            f"[{sample:12s}] {step_idx:02d}  {step:30s}  "
            f"events={events:8d}  weighted={weighted:.6g}  dt={elapsed:.3f}s\n"
        )
        csv_row = [sample, step_idx, step, int(events), f"{weighted:.12g}", f"{elapsed:.6f}"]

        with open(self.txt_path, "a") as txt_file:
            txt_file.write(text_line)
        with open(self.csv_path, "a", newline="") as csv_file:
            csv.writer(csv_file).writerow(csv_row)

# Shared logger used by log_step(); constructing it appends the run header to TXT_LOG
# and creates CSV_LOG with its column header if the file does not exist yet.
logger = CutLogger(TXT_LOG, CSV_LOG)

def log_step(sample, step_idx, step_label, fb, t0):
    """Record one cutflow step for `sample` through the module-level `logger`.

    Logs the number of entries in `fb`, their summed weight (via `weight_sum`)
    and the wall-clock seconds elapsed since `t0`.
    """
    logger.write(
        sample,
        step_idx,
        step_label,
        len(fb),
        weight_sum(fb, sample),
        time.time() - t0,
    )

def require(mask, name):
    """Return `mask` unchanged if it is a NumPy or awkward array, else raise.

    A mask is accepted when it is an `np.ndarray` or `ak.Array` and
    `ak.num(mask, axis=0)` does not return None. Anything else raises a
    RuntimeError that names the offending mask (`name`) and its type.
    """
    is_array = isinstance(mask, (np.ndarray, ak.Array))
    if not is_array or ak.num(mask, axis=0) is None:
        raise RuntimeError(f"Mask '{name}' has wrong shape/type: {type(mask)}")
    return mask

# ---- your loop with logging ----
for ntuple_name in tqdm(ntuple_names):
    start_time = time.time()  # every logged elapsed time is measured from here
    step = 0                  # running step index written to the logs

    if ntuple_name == 'data23':  # data-driven
        path = "/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/data23_y_BDT_score.root"
        # Arrays are read eagerly, so the ROOT file can be closed right away.
        with uproot.open(path) as root_file:
            fb = root_file['nominal'].arrays(variables_data, library="ak")
        fb['VertexBDTScore'] = fb['BDTScore']

        log_step(ntuple_name, step, "loaded", fb, start_time); step += 1

        # ensure photon arrays exist for reweighting usage downstream
        fb = fb[ak.num(fb['ph_eta']) > 0]
        # jet-faking-photon cut (data control)
        mask = (ak.firsts(fb['ph_topoetcone40']) - 2450.)/ak.firsts(fb['ph_pt']) > 0.1
        fb = fb[require(mask, "jetfake")]
        log_step(ntuple_name, step, "jet_faking_photon", fb, start_time); step += 1

        fb = fb[fb['n_ph_baseline'] == 1]
        log_step(ntuple_name, step, "n_ph_baseline==1", fb, start_time); step += 1

    else:  # MC
        path = f"/data/tmathew/ntups/mc23d/{ntuple_name}_y.root"
        path_BDT = f"/data/fpiazza/ggHyyd/Ntuples/MC23d/withVertexBDT/mc23d_{ntuple_name}_y_BDT_score.root"
        with uproot.open(path) as root_file:
            fb = root_file['nominal'].arrays(variables, library="ak")

        # add BDT score from the companion file (tree 'nominal'), matched by event order
        with uproot.open(path_BDT) as root_file_BDT:
            fb_BDT = root_file_BDT['nominal'].arrays(["event", "BDTScore"], library="ak")

        # The VertexBDTScore cut below needs this field for every sample, so a
        # mismatch is fatal: fail here with a clear message instead of crashing
        # later on a missing field. The length check comes first because an
        # element-wise comparison of different-length arrays is itself an error.
        if len(fb) != len(fb_BDT) or not np.all(fb["event"] == fb_BDT["event"]):
            raise RuntimeError(
                f"Event mismatch in {ntuple_name}: cannot attach BDTScore from {path_BDT}"
            )
        fb["VertexBDTScore"] = fb_BDT["BDTScore"]

        log_step(ntuple_name, step, "loaded", fb, start_time); step += 1

        fb = fb[ak.num(fb['ph_eta']) > 0]
        fb = fb[fb['n_ph'] == 1]
        log_step(ntuple_name, step, "n_ph==1", fb, start_time); step += 1

        if ntuple_name in ("Zjets","Wjets"):
            mask = ak.firsts(fb['ph_truth_type']) == 2   # keep e->gamma only
            fb = fb[require(mask, "ph_truth_type==2")]
            log_step(ntuple_name, step, "truth e->gamma", fb, start_time); step += 1

        if ntuple_name == "ggHyyd":
            fb = fb[ak.num(fb['pv_z']) > 0]
            log_step(ntuple_name, step, "pv_z exists", fb, start_time); step += 1
            good_pv = (np.abs(ak.firsts(fb['pv_truth_z']) - ak.firsts(fb['pv_z'])) <= 0.5)
            fb = fb[require(good_pv, "goodPV")]
            log_step(ntuple_name, step, "goodPV", fb, start_time); step += 1

    # --------- BASIC CUTS (shared) ----------
    # Lepton vetoes: every sample must have no baseline muon, electron or tau.
    fb = fb[fb['n_mu_baseline'] == 0]
    log_step(ntuple_name, step, "n_mu_baseline==0", fb, start_time); step += 1

    fb = fb[fb['n_el_baseline'] == 0]
    log_step(ntuple_name, step, "n_el_baseline==0", fb, start_time); step += 1

    fb = fb[fb['n_tau_baseline'] == 0]
    log_step(ntuple_name, step, "n_tau_baseline==0", fb, start_time); step += 1

    fb = fb[fb['trigger_HLT_g50_tight_xe40_cell_xe70_pfopufit_80mTAC_L1eEM26M'] == 1]
    log_step(ntuple_name, step, "trigger==1", fb, start_time); step += 1

    fb = fb[ak.num(fb['ph_pt']) > 0]
    fb = fb[ak.firsts(fb['ph_pt']) > 50_000]
    log_step(ntuple_name, step, "ph_pt>50GeV", fb, start_time); step += 1

    fb = fb[fb['met_tst_et'] > 100_000]
    log_step(ntuple_name, step, "MET>100GeV", fb, start_time); step += 1

    fb = fb[fb['n_jet_central'] <= 3]
    log_step(ntuple_name, step, "n_jet_central<=3", fb, start_time); step += 1

    # Transverse mass of the leading photon + MET system; the /1000 converts
    # to the unit used by the threshold below.
    mt_tmp = np.sqrt(2 * fb['met_tst_et'] * ak.firsts(fb['ph_pt']) *
                     (1 - np.cos(fb['met_tst_phi'] - ak.firsts(fb['ph_phi'])))) / 1000.0
    mask1 = mt_tmp > 80
    fb = fb[mask1]
    log_step(ntuple_name, step, "mT>80GeV", fb, start_time); step += 1

    fb = fb[fb['VertexBDTScore'] > 0.1]
    log_step(ntuple_name, step, "VertexBDTScore>0.1", fb, start_time); step += 1

    # ---------- INTERNAL SELECTION CUT ------------

    metsig_tmp = fb['met_tst_sig']
    mask1 = metsig_tmp > 6
    fb = fb[mask1]
    log_step(ntuple_name, step, "met_tst_sig>6", fb, start_time); step += 1

    # The cut is on |eta| of the leading photon (the log label omits the abs).
    ph_eta_tmp = np.abs(ak.firsts(fb['ph_eta']))
    fb = fb[ph_eta_tmp < 1.75]
    log_step(ntuple_name, step, "ph_eta<1.75", fb, start_time); step += 1

    # arccos(cos(x)) folds the angular difference into [0, pi]
    dphi_met_phterm_tmp = np.arccos(np.cos(fb['met_tst_phi'] - fb['met_phterm_phi'])) # added cut 3
    fb = fb[dphi_met_phterm_tmp > 1.25]
    log_step(ntuple_name, step, "dphi_met_phterm>1.25", fb, start_time); step += 1

    dmet_tmp = fb['met_tst_noJVT_et'] - fb['met_tst_et']
    mask1 = dmet_tmp > -10000
    fb = fb[mask1]
    log_step(ntuple_name, step, "dmet>-10GeV", fb, start_time); step += 1

    # -10 is treated as a placeholder value: such events are mapped to -999
    # and therefore always pass the "< 2.5" requirement.
    dphi_jj_tmp = fb['dphi_central_jj']
    dphi_jj_tmp = ak.where(dphi_jj_tmp == -10, np.nan, dphi_jj_tmp)
    dphi_jj_tmp = np.arccos(np.cos(dphi_jj_tmp))
    dphi_jj_tmp = ak.where(np.isnan(dphi_jj_tmp), -999, dphi_jj_tmp)
    fb = fb[dphi_jj_tmp < 2.5]
    log_step(ntuple_name, step, "dphi_jj_central<2.5", fb, start_time); step += 1

    # Events with met_jetterm_et == 0 get -999 and therefore always pass.
    dphi_met_jetterm_tmp = np.where(fb['met_jetterm_et'] != 0,   # added cut 5
                        np.arccos(np.cos(fb['met_tst_phi'] - fb['met_jetterm_phi'])),
                        -999)
    fb = fb[dphi_met_jetterm_tmp < 0.75]
    log_step(ntuple_name, step, "dphi_met_jetterm<0.75", fb, start_time); step += 1

    # tot.append(fb) # save the fb for further study

    # ---- sanity check for None ----
    # Counts None entries left in met_tst_et after all selections.
    n_none = int(ak.sum(ak.is_none(fb['met_tst_et'])))
    with open(TXT_LOG, "a") as ftxt:
        ftxt.write(f"[{ntuple_name:12s}] None-check met_tst_et: {n_none}\n")

    # optional: free memory
    del fb

print(f"\nLogs written to:\n - {TXT_LOG}\n - {CSV_LOG}\n")

100%|██████████| 7/7 [04:11<00:00, 35.86s/it]


Logs written to:
 - ./cutlogs_135_internal/cutflow.log
 - ./cutlogs_135_internal/cutflow.csv






In [3]:
# Column order of the unified cutflow table.
process_order = [
    'ggHyyd', 'Zjets', 'Zgamma', 'Wgamma', 'Wjets', 'gammajet_direct', 'data23',
]

# Label of the last sample-specific step logged for each process; its yield
# becomes the 'CUT 1 (preprocessing)' row of the table.
preselection_end = {
    'ggHyyd':          'goodPV',               # pv_z exists -> goodPV (take the last one)
    'Zjets':           'truth e->gamma',
    'Wjets':           'truth e->gamma',
    'Zgamma':          'n_ph==1',
    'Wgamma':          'n_ph==1',
    'gammajet_direct': 'n_ph==1',
    'data23':          'n_ph_baseline==1',     # jet_faking_photon -> n_ph_baseline==1 (take the last one)
}

# Internal Notes
# Step labels shared by all processes, in the order the cuts are applied.
shared_cuts = [
    'n_mu_baseline==0', 'n_el_baseline==0', 'n_tau_baseline==0',
    'trigger==1',
    'ph_pt>50GeV', 'MET>100GeV', 'n_jet_central<=3', 'mT>80GeV',
    'VertexBDTScore>0.1',
    'met_tst_sig>6', 'ph_eta<1.75',
    'dphi_met_phterm>1.25', 'dmet>-10GeV',
    'dphi_jj_central<2.5', 'dphi_met_jetterm<0.75',
]

# Load the per-step log, coerce the numeric columns (unparseable values become
# NaN), and keep only the last entry per (sample, step).
df = (
    pd.read_csv(CSV_LOG)
    .assign(
        weighted=lambda log: pd.to_numeric(log['weighted'], errors='coerce'),
        step_idx=lambda log: pd.to_numeric(log['step_idx'], errors='coerce'),
    )
    .sort_values(['sample', 'step_idx'])
    .drop_duplicates(subset=['sample', 'step'], keep='last')
)

# Helper to fetch the weighted yield for a given (sample, step label)
def get_yield(sample, step_label):
    """Return the weighted yield logged for `sample` at the step `step_label`.

    Looks the row up in the module-level cutflow DataFrame `df`
    (columns 'sample', 'step', 'weighted').

    Returns
    -------
    float
        The logged weighted yield, or NaN when no such (sample, step) row
        exists. A missing step is reported with a warning rather than being
        replaced by another step's yield, which would silently put a wrong
        number into the cutflow table.
    """
    import warnings  # local import: keeps this helper self-contained

    row = df[(df['sample'] == sample) & (df['step'] == step_label)]
    if row.empty:
        warnings.warn(
            f"No cutflow entry for sample '{sample}' at step '{step_label}'; using NaN"
        )
        return float('nan')
    return float(row['weighted'].iloc[0])

# Assemble the unified table: one row per cut stage, one column per process.
#   'LOAD'                  <- the 'loaded' step
#   'CUT 1 (preprocessing)' <- the process-specific preselection_end step
#   shared cuts             <- same step label for all processes
rows = ['LOAD', 'CUT 1 (preprocessing)'] + shared_cuts
table = pd.DataFrame(index=rows, columns=process_order, dtype=float)

for p in process_order:
    step_labels = ['loaded', preselection_end[p]] + shared_cuts
    table[p] = [get_yield(p, label) for label in step_labels]

# Significance estimates per row. Everything except the signal column counts
# as background, which includes the 'data23' column.
signal_col = 'ggHyyd'
data_cols = ['data23']
bkg_cols = [c for c in process_order if c != signal_col]

S = table[signal_col].fillna(0.0)
B = table[bkg_cols].sum(axis=1).astype(float)

# Division by zero / invalid sqrt are silenced; infinite results become NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    ssb     = (S / np.sqrt(B)).replace([np.inf, -np.inf], np.nan)            # S/sqrt(B)
    ss_sb   = (S / np.sqrt(S + B)).replace([np.inf, -np.inf], np.nan)        # S/sqrt(S+B)
    ss_s13b = (S / np.sqrt(S + 1.3*B)).replace([np.inf, -np.inf], np.nan)    # S/sqrt(S+1.3B)

SIGMA_B_FRAC = 0.30  # 30% relative background uncertainty; change as needed
# zbi() is evaluated row by row on the (S, B) pairs.
ZBi = pd.Series(
    [
        zbi(float(sig), float(bkg), sigma_b_frac=SIGMA_B_FRAC)
        for sig, bkg in zip(S.to_numpy(), B.to_numpy())
    ],
    index=table.index,
    dtype=float,
)

# Pretty output
disp = table.copy()
def fmt(x):
    """Format one table value for display.

    Missing values become 'n/a'. Non-zero values with magnitude below 1e-2 or
    at least 1e4 use 3 significant digits (general format); everything else,
    including zero, uses 3 decimal places.
    """
    if pd.isna(x):
        return 'n/a'
    magnitude = abs(x)
    in_fixed_range = 1e-2 <= magnitude < 1e4
    if x != 0 and not in_fixed_range:
        return f"{x:.3g}"
    return f"{x:.3f}"

def _fmt_metric(v):
    """Format a significance value with 3 decimals, or 'n/a' when missing."""
    return 'n/a' if pd.isna(v) else f"{v:.3f}"

# Header of the ZBi column, shared by the table and the printout below.
zbi_col = f'ZBi (σ_b={SIGMA_B_FRAC:.0%})'

# Format the yields element-wise, column by column. Series.map is used instead
# of DataFrame.applymap, which is deprecated in recent pandas releases.
disp_out = disp.apply(lambda col: col.map(fmt))
disp_out['S/sqrt(B)']      = ssb.map(_fmt_metric)
disp_out['S/sqrt(S+B)']    = ss_sb.map(_fmt_metric)
disp_out['S/sqrt(S+1.3B)'] = ss_s13b.map(_fmt_metric)
disp_out[zbi_col]          = ZBi.map(_fmt_metric)
disp_out.insert(0, 'cut', disp_out.index)

# Save + print the table including the four significance columns
out_csv = Path(LOG_DIR) / "cutflow_unified.csv"
out_md  = Path(LOG_DIR) / "cutflow_unified.md"
disp_out.to_csv(out_csv, index=False)
# Explicit UTF-8: the ZBi header contains a non-ASCII character.
with open(out_md, "w", encoding="utf-8") as f:
    f.write(disp_out.to_markdown(index=False))

print("Unified cutflow written to:")
print(f"  - {out_csv}")
print(f"  - {out_md}")
print(disp_out[['cut'] + process_order + ['S/sqrt(B)','S/sqrt(S+B)','S/sqrt(S+1.3B)', zbi_col]].to_string(index=False))

Unified cutflow written to:
  - cutlogs_135_internal/cutflow_unified.csv
  - cutlogs_135_internal/cutflow_unified.md
                  cut   ggHyyd    Zjets   Zgamma   Wgamma    Wjets gammajet_direct   data23 S/sqrt(B) S/sqrt(S+B) S/sqrt(S+1.3B) ZBi (σ_b=30%)
                 LOAD 2.29e+04 1.74e+06  3.1e+05 5.68e+05 3.97e+06        1.73e+08 2.13e+08     1.157       1.157          1.015        -0.199
CUT 1 (preprocessing) 8732.756 6.74e+05  2.5e+05 4.28e+05 2.81e+06        1.46e+08 3.62e+07     0.640       0.640          0.561        -0.199
     n_mu_baseline==0 8732.756 6.74e+05  2.5e+05 4.28e+05 2.81e+06        1.46e+08 3.62e+07     0.640       0.640          0.561        -0.199
     n_el_baseline==0 8732.756 1.99e+05  2.5e+05 4.28e+05 2.81e+06        1.46e+08 3.62e+07     0.641       0.641          0.562        -0.199
    n_tau_baseline==0 8630.402 1.96e+05  2.4e+05 3.95e+05 2.78e+06        1.43e+08 3.55e+07     0.639       0.639          0.560        -0.199
           trigger==1 258