In [48]:
# import modules
import uproot, sys, time, math, pickle, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import awkward as ak
from tqdm import tqdm
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
from scipy.special import betainc
from scipy.stats import norm

# import config functions
from jet_faking_plot_config import getWeight, zbi, sample_dict, getVarDict
from plot_var import variables, variables_data, ntuple_names, ntuple_names_BDT
from n_1_iteration_functions import get_best_cut, calculate_significance, apply_cut_to_fb, apply_all_cuts, compute_total_significance, n_minus_1_optimizer
# from cut_config import cut_config

# Set up plot defaults
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 14.0,10.0  # Roughly 11 cm wde by 8 cm high  
mpl.rcParams['font.size'] = 20.0 # Use 14 point font
sns.set(style="whitegrid")

font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14
}

plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"]  # Legend
})

tot = []
ntuple_names = ['ggHyyd','Zjets','Zgamma','Wgamma','Wjets','gammajet_direct', 'data23']

def csv_to_tot(csv_path, ntuple_names=ntuple_names, process_col='process'):
    # Load CSV
    df = pd.read_csv(csv_path)
    df = df[df.MLBDTScore >= 0.03]
    df['weights'] = np.abs(df['weights'])

    all_cols = [c for c in df.columns if c != process_col]

    # Build tot: one ak.Array per process, in the given order
    tot = []
    for proc in ntuple_names:
        sub = df[df[process_col] == proc]
        # Awkward record array with consistent fields (even if empty)
        fb = ak.Array({col: sub[col].to_numpy() for col in all_cols})
        tot.append(fb)

    # Optional: quick summary printout
    print("[csv_to_tot] Rows per process:")
    for proc, fb in zip(ntuple_names, tot):
        n = len(fb)  # same as ak.num(fb['weights']) if present
        wsum = float(np.sum(fb['weights'])) if n > 0 else 0.0
        print(f"  {proc:15s}  n={n:6d}  sum_w={wsum:.6g}")

    return tot

# ---- usage ----
csv_path = "/data/jlai/ntups/csv/bdt_output_reduced.csv"
tot = csv_to_tot(csv_path)


[csv_to_tot] Rows per process:
  ggHyyd           n=  1938  sum_w=198.869
  Zjets            n=    22  sum_w=1.79321
  Zgamma           n= 10698  sum_w=620.066
  Wgamma           n=  7113  sum_w=1229.33
  Wjets            n=  2356  sum_w=1043.09
  gammajet_direct  n=   506  sum_w=24.4602
  data23           n=    10  sum_w=104.259


In [49]:
def getCutDict():
    cut_dict = {}

    cut_dict['dphi_jj'] = {
        'uppercut': np.arange(1, 3.14 + 0.01, 0.01) # dphi_jj < cut
    }
    cut_dict['dphi_phterm_jetterm'] = {
        'lowercut': np.arange(1, 2.5 + 0.05, 0.05), # dphi_phterm_jetterm > cut
        'uppercut': np.arange(2, 4 + 0.1, 0.1) # dphi_phterm_jetterm < cut
    }
    cut_dict['jet_central_eta'] = {
        'lowercut': np.arange(-2.5, 0+0.01, 0.01), # jet_central_eta > cut
        'uppercut': np.arange(0, 2.5+0.01, 0.01) # jet_central_eta < cut
    }
    cut_dict['jet_central_pt2'] = {
        'lowercut': np.arange(20000, 100000+1000, 1000) # jet_central_pt2 > cut
    }
    cut_dict['metsigres'] = {
        'lowercut': np.arange(8600, 15000, 100),
        'uppercut': np.arange(12000, 60000, 100)
    }
    cut_dict['met_noJVT'] = {
        'lowercut': np.arange(50000, 120000, 100),
        'uppercut': np.arange(100000, 250000, 100)
    }
    cut_dict['softerm'] = {
        'uppercut': np.arange(10000, 40000, 100)
    }
    cut_dict['n_jet_central'] = {
        'uppercut': np.arange(0, 8+1, 1) # njet < cut
    }

    return cut_dict
cut_config = getCutDict()

In [50]:
def calculate_significance(cut_var, cut_type, cut_values, tot2, ntuple_names, signal_name, getVarDict, getWeight):
    sig_simple_list = []
    sigacc_simple_list = []
    acceptance_values = []

    for cut in cut_values:
        sig_after_cut = 0
        bkg_after_cut = []
        sig_events = 0

        for i in range(len(ntuple_names)):
            process = ntuple_names[i]
            x = tot2[i][cut_var]
            mask = x != -999
            x = x[mask]

            if process == signal_name:
                sig_events = tot2[i]['weights']
                sig_events = sig_events[mask]
                mask = x >= cut if cut_type == 'lowercut' else x <= cut
                sig_after_cut = ak.sum(sig_events[mask])
            else:
                bkg_events = tot2[i]['weights']
                bkg_events = bkg_events[mask]
                mask = x >= cut if cut_type == 'lowercut' else x <= cut
                bkg_after_cut.append(ak.sum(bkg_events[mask]))


        total_bkg = sum(bkg_after_cut)
        total_signal = sig_after_cut

        sig_simple = total_signal / np.sqrt(total_bkg) if total_bkg > 0 else 0
        acceptance = total_signal / sum(sig_events) if sum(sig_events) > 0 else 0

        sig_simple_list.append(sig_simple)
        sigacc_simple_list.append(sig_simple * acceptance)
        acceptance_values.append(acceptance * 100)

    return sig_simple_list, sigacc_simple_list, acceptance_values

In [51]:
%%time
signal_name='ggHyyd'
initial_cut = []
tot2 = tot

# < -- Initial Cut on all variables (maximize the significance * acceptance) -- > 
for cut_var, cut_types in cut_config.items():
    for cut_type, cut_values in cut_types.items():
        sig_simple_list, sigacc_simple_list, acceptance_values = calculate_significance(
            cut_var, cut_type, cut_values, tot2, ntuple_names, signal_name, getVarDict, getWeight
        )

        best_cut, best_sig, idx = get_best_cut(cut_values, sigacc_simple_list) 
        
        if idx == 0 or idx == len(sigacc_simple_list) - 1: # I chose to use index to indicate not to make unnecessary cut (for initial cut)
            print(cut_var, idx, len(sigacc_simple_list))
            continue
            
        result = {
            "cut_var": cut_var,
            "cut_type": cut_type,
            "best_cut": best_cut,
            "best_sig_x_acc": best_sig,
            "significance": sig_simple_list[idx],
            "acceptance": acceptance_values[idx]
        }

        print(result)
        initial_cut.append(dict(list(result.items())[:3]))

dphi_jj 214 215
{'cut_var': 'dphi_phterm_jetterm', 'cut_type': 'lowercut', 'best_cut': 1.4500000000000004, 'best_sig_x_acc': 3.619275247678468, 'significance': 3.619275247678468, 'acceptance': 100.0}
{'cut_var': 'dphi_phterm_jetterm', 'cut_type': 'uppercut', 'best_cut': 3.100000000000001, 'best_sig_x_acc': 3.6170973308859207, 'significance': 3.6170973308859207, 'acceptance': 100.0}
{'cut_var': 'jet_central_eta', 'cut_type': 'lowercut', 'best_cut': -2.450000000000001, 'best_sig_x_acc': 3.6203402689685253, 'significance': 3.6203412907869934, 'acceptance': 99.99997177563147}
jet_central_eta 250 251
jet_central_pt2 0 81
{'cut_var': 'metsigres', 'cut_type': 'lowercut', 'best_cut': 9900, 'best_sig_x_acc': 3.617082689663944, 'significance': 3.617082689663944, 'acceptance': 100.0}
{'cut_var': 'metsigres', 'cut_type': 'uppercut', 'best_cut': 35100, 'best_sig_x_acc': 3.6235521247548155, 'significance': 3.624906210248336, 'acceptance': 99.96264495093163}
met_noJVT 0 700
{'cut_var': 'met_noJVT', '

In [52]:
def apply_cut_to_fb(fb, process, var, cut_val, cut_type, getVarDict):
    x = fb[var]
    mask = x != -999

    if cut_type == 'lowercut':
        mask = mask & (x >= cut_val)
    elif cut_type == 'uppercut':
        mask = mask & (x <= cut_val)
    return fb[mask]


def apply_all_cuts(tot2, ntuple_names, cut_list, getVarDict):
    new_tot2 = []
    for i, fb in enumerate(tot2):
        process = ntuple_names[i]
        for cut in cut_list:
            fb = apply_cut_to_fb(fb, process, cut["cut_var"], cut["best_cut"], cut["cut_type"], getVarDict)
        new_tot2.append(fb)
    return new_tot2

def compute_total_significance(tot2, ntuple_names, signal_name, getVarDict, getWeight):
    signal_sum = 0
    bkg_sum = 0
    for i in range(len(ntuple_names)):
        fb = tot2[i]
        print(fb)
        process = ntuple_names[i]
        weights = fb['weights']
        if process == signal_name:
            signal_sum += ak.sum(weights)
        else:
            bkg_sum += ak.sum(weights)
    return signal_sum / np.sqrt(bkg_sum) if bkg_sum > 0 else 0

In [54]:
tot2_initial_cut = apply_all_cuts(tot2, ntuple_names, initial_cut, getVarDict)
final_significance = compute_total_significance(tot2_initial_cut, ntuple_names, signal_name, getVarDict, getWeight)
print('after initial cutting, signficance: ', final_significance)

[{balance: 1.78, VertexBDTScore: 0.151, dmet: 0, dphi_jj: -999, ...}, ...]
[{balance: 1.26, VertexBDTScore: 0.177, dmet: 0, dphi_jj: 1.51, ...}, ...]
[{balance: 7.23, VertexBDTScore: 0.013, dmet: 0, dphi_jj: -999, ...}, ...]
[{balance: 1.35, VertexBDTScore: 0.301, dmet: 0, dphi_jj: 0.441, ...}, ...]
[{balance: 1.28, VertexBDTScore: 0.288, dmet: 0, dphi_jj: -999, ...}, ...]
[{balance: 1.15, VertexBDTScore: 0.301, dmet: 1.4e+03, dphi_jj: 1.12, ...}, ...]
[{balance: 1.18, VertexBDTScore: 0.145, dmet: 0, dphi_jj: 0.468, ...}, ...]
after initial cutting, signficance:  3.5483105323660187


In [56]:
%%time
def n_minus_1_optimizer(initial_cut, cut_config, tot2, ntuple_names, signal_name, getVarDict, getWeight, final_significance, max_iter=10, tolerance=1e-4):
    best_cuts = initial_cut.copy()
    iteration = 0
    converged = False

    while not converged and iteration < max_iter:
        converged = True
        print(f"\n--- Iteration {iteration + 1} ---")
        for i, cut in enumerate(best_cuts):
            # Apply all other cuts
            n_minus_1_cuts = best_cuts[:i] + best_cuts[i+1:]
            tot2_cut = apply_all_cuts(tot2, ntuple_names, n_minus_1_cuts, getVarDict)

            # Re-scan this variable
            cut_var = cut["cut_var"]
            cut_type = cut["cut_type"]
            cut_values = cut_config[cut_var][cut_type]

            sig_simple_list, sigacc_simple_list, _ = calculate_significance(
                cut_var, cut_type, cut_values, tot2_cut, ntuple_names
                , signal_name, getVarDict, getWeight
            )
            best_cut, best_sig, idx = get_best_cut(cut_values, sig_simple_list)

            if abs(best_cut - cut["best_cut"]) > tolerance:
            # if best_sig - final_significance > tolerance:
                print(f"Updating {cut_var} ({cut_type}): {cut['best_cut']} → {best_cut}  (sig {final_significance:.2f} → {best_sig:.2f})")
                best_cuts[i]["best_cut"] = best_cut
                final_significance = best_sig
                converged = False  # Found at least one improvement

        iteration += 1

    print( ' optimized cuts, end of iteration ' )
    return best_cuts, final_significance
    
# < -- n-1 iterations until no further improvement (max significance) -- >
optimized_cuts, final_significance = n_minus_1_optimizer(
    initial_cut, cut_config, tot2, ntuple_names, signal_name, getVarDict, getWeight, final_significance
)
print('after optimized cutting, signficance: ', final_significance)



--- Iteration 1 ---
Updating metsigres (uppercut): 35100 → 33600  (sig 3.55 → 3.55)
Updating met_noJVT (uppercut): 248600 → 242400  (sig 3.55 → 3.55)

--- Iteration 2 ---
 optimized cuts, end of iteration 
after optimized cutting, signficance:  3.548829904381028
CPU times: user 2min 51s, sys: 1.94 s, total: 2min 53s
Wall time: 2min 54s


In [57]:
print( ' < -- Final Optimized Cuts -- > ')
# print(optimized_cuts)

for cut in optimized_cuts:
    var = cut['cut_var']
    val = cut['best_cut']
    if cut['cut_type'] == 'uppercut':
        print(f"{var} <= {val}")
    elif cut['cut_type'] == 'lowercut':
        print(f"{var} >= {val}")
        
print('after optimized cutting, signficance: ', final_significance)


 < -- Final Optimized Cuts -- > 
dphi_phterm_jetterm >= 1.4500000000000004
dphi_phterm_jetterm <= 3.100000000000001
jet_central_eta >= -2.450000000000001
metsigres >= 9900
metsigres <= 33600
met_noJVT <= 242400
n_jet_central <= 4
after optimized cutting, signficance:  3.548829904381028


In [13]:
tot2_optimized_cuts = apply_all_cuts(tot2, ntuple_names, optimized_cuts, getVarDict)

In [14]:
# < -- Save data after cuts to a csv file for BDT input -- >
Vars = [
    'balance', 
    'VertexBDTScore',
    'dmet',
    'dphi_jj',
    'dphi_met_central_jet',
    'dphi_met_phterm',
    'dphi_met_ph',
    'dphi_met_jetterm',
    'dphi_phterm_jetterm',
    'dphi_ph_centraljet1',
    'ph_pt',
    'ph_eta',
    'ph_phi',
    'jet_central_eta',
    'jet_central_pt1',
    'jet_central_pt2',
    'jetterm',
    'jetterm_sumet',
    'metsig',
    'metsigres',
    'met',
    'met_noJVT',
    'metplusph',
    'failJVT_jet_pt1',
    'softerm',
    'n_jet_central'
]

data_list = []

for j in range(len(ntuple_names)):
    process = ntuple_names[j]
    fb = tot2_optimized_cuts[j] 
    
    data_dict = {}
    
    for var in Vars:
        var_config = getVarDict(fb, process, var_name=var)
        data_dict[var] = var_config[var]['var']
    
    weights = getWeight(fb, process)
    data_dict['weights'] = weights
    
    n_events = len(weights)
    data_dict['process'] = [process] * n_events
    label = 1 if process == 'ggHyyd' else 0
    data_dict['label'] = [label] * n_events
    
    df_temp = pd.DataFrame(data_dict)
    data_list.append(df_temp)

df_all = pd.concat(data_list, ignore_index=True)
df_all.head()

df_all.to_csv("/data/jlai/ntups/csv/jet_faking_BDT_input_basic_reduced2.csv", index=False)