In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import uproot, wandb, os, logging, json, random
import awkward as ak
# import torch
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, log_loss
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform

# ML model
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Set up plot defaults
import matplotlib as mpl
# Apply the seaborn theme first: sns.set() rewrites a group of rcParams
# (font.size among them), so calling it after the overrides below would
# silently discard them.
sns.set(style="whitegrid")
mpl.rcParams['figure.figsize'] = 12.0, 8.0  # default figure size as (width, height), in inches
mpl.rcParams['font.size'] = 14.0  # Use 14 point font


In [None]:
# Read the CSV at `path` into `df`; the cell's output is the per-column summary.
path = '/data/jlai/ntups/csv/BDT_input_basic.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Optional pre-selection on dphi_phterm_jetterm, currently disabled:
# df = df[df.dphi_phterm_jetterm >= 1.80]
df.describe()

In [None]:
# Full list of candidate variables (not referenced again in this cell).
Vars = [
    'metsig', 'metsigres', 'met', 'met_noJVT', 'dmet',
    'ph_pt', 'ph_eta', 'ph_phi',
    'jet_central_eta', 'jet_central_pt1', 'jet_central_pt2',
    'dphi_met_phterm', 'dphi_met_ph', 'dphi_met_jetterm', 'dphi_phterm_jetterm',
    'dphi_ph_centraljet1', 'metplusph', 'failJVT_jet_pt1',
    'softerm', 'jetterm', 'jetterm_sumet', 'dphi_met_central_jet',
    'balance', 'dphi_jj', 'BDTScore', 'n_jet_central',
]

# Columns taken from `df` unchanged, including the bookkeeping columns
# (weights / label / process).
Vars2 = [
    'metsig', 'met', 'met_noJVT', 'dmet',
    'dphi_met_phterm', 'dphi_ph_centraljet1', 'dphi_phterm_jetterm',
    'jetterm', 'dphi_met_central_jet', 'BDTScore',
    'weights', 'label', 'process',
]

# Columns that enter the model input as reciprocals (1 / value).
Vars3 = [
    'metsigres', 'ph_pt', 'ph_eta', 'dphi_met_jetterm',
    'failJVT_jet_pt1', 'n_jet_central', 'dphi_jj',
]

# Columns removed from the feature matrix X.
Vars_drop = ['weights', 'label', 'process', 'met', 'dphi_phterm_jetterm']

df_Vars2 = df[Vars2].copy()

# -999 is replaced by NaN before inverting; exact zeros are also turned into
# NaN for the inversion so that 1/0 never occurs.
df_Vars3 = df[Vars3].replace(-999, np.nan)
df_Vars3_inverted = df_Vars3.replace({0: np.nan}).rdiv(1)

# Side-by-side concatenation; -999 entries in the Vars2 part become NaN here.
df_ml_input = pd.concat([df_Vars2, df_Vars3_inverted], axis="columns").replace(-999, np.nan)

print("Number of event with negative weights :", (df_ml_input["weights"] < 0).sum())
# Some weights are negative; only their magnitude is kept.
df_ml_input["weights"] = df_ml_input["weights"].abs()

display(df_ml_input.describe())

print("Number of nan in each variable: ", df_ml_input.isna().sum(), sep="\n")

# Feature matrix, labels and per-event weights
X = df_ml_input.drop(columns=Vars_drop)
y = df_ml_input['label']
weights = df_ml_input['weights']

# # Reweight signal so that total signal weight = total background weight
# sig_mask = df_ml_input['label'] == 1
# bkg_mask = df_ml_input['label'] == 0

# sum_sig = df_ml_input.loc[sig_mask, 'weights'].sum()
# sum_bkg = df_ml_input.loc[bkg_mask, 'weights'].sum()

# scale_factor = sum_bkg / sum_sig if sum_sig > 0 else 1.0
# df_ml_input.loc[sig_mask, 'weights'] *= scale_factor

# A new seed in [1, 100] is drawn on every execution and printed, so a given
# run can be repeated by hard-coding the printed value.
random_num = random.randint(1, 100)
print("random number: ", random_num)

# 70/30 train/test split, stratified on the label; the per-event weights are
# split together with the features and labels.
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X,
    y,
    weights,
    stratify=y,
    test_size=0.3,
    random_state=random_num,
)

# Classifiers to compare, keyed by display name. Insertion order is the
# training order in the loop that follows.
models = {}
models["BDT"] = XGBClassifier(
    random_state=random_num,
    missing=np.nan,  # NaN entries are passed to XGBoost as missing values
    eval_metric='auc',
    device='cuda',
    tree_method='hist',
)
models["LightGBM"] = LGBMClassifier(random_state=random_num, boosting_type='gbdt')

# Test-set ROC curve per model: name -> (fpr, tpr, weighted test AUC).
roc_curves = {}

# Train every model in `models`, log its metrics to its own wandb run, store
# its score for all events in `df_ml_input`, and keep its test ROC curve.
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    # Initialize wandb run for each model
    wandb.init(project="Dark_ph_ML_Comparison", name=model_name, reinit=True)

    # Fit with per-event weights when the estimator accepts them. Only a
    # TypeError (what an unsupported `sample_weight` keyword raises) triggers
    # the unweighted fallback; any other failure (bad data, GPU error,
    # KeyboardInterrupt, ...) propagates instead of being silently retried.
    try:
        model.fit(X_train, y_train, sample_weight=sw_train)
    except TypeError as err:
        print(f"{model_name}: fit() did not accept sample_weight ({err}); training without weights")
        model.fit(X_train, y_train)

    # Predict probabilities (column 1 = probability of label 1)
    y_train_pred_proba = model.predict_proba(X_train)[:, 1]
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]

    # Predict classes
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate
    # NOTE(review): log-loss and ROC AUC use the per-event weights, accuracy
    # does not. Confirm whether accuracy should also receive sample_weight.
    metrics = {
        "Train LogLoss": log_loss(y_train, y_train_pred_proba, sample_weight=sw_train),
        "Test LogLoss": log_loss(y_test, y_test_pred_proba, sample_weight=sw_test),
        "Train Accuracy": accuracy_score(y_train, y_train_pred),
        "Test Accuracy": accuracy_score(y_test, y_test_pred),
        "Train ROC AUC": roc_auc_score(y_train, y_train_pred_proba, sample_weight=sw_train),
        "Test ROC AUC": roc_auc_score(y_test, y_test_pred_proba, sample_weight=sw_test)
    }

    print(metrics)

    # Log metrics to wandb
    wandb.log(metrics)

    # Save model scores into df (all events, train and test alike)
    df_ml_input[f"score_{model_name.lower()}"] = model.predict_proba(X)[:, 1]

    # Calculate ROC for final plot
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba, sample_weight=sw_test)
    roc_curves[model_name] = (fpr, tpr, metrics["Test ROC AUC"])

    wandb.finish()

# --- Final ROC plot for all models ---
# One curve per entry of `roc_curves`, plus the diagonal of a random classifier.
plt.figure(figsize=(8, 8))
# The AUC is unpacked as `auc_value`, not `auc`: this loop runs at notebook
# scope, so a loop variable named `auc` would overwrite the `auc` function
# imported from sklearn.metrics for every cell executed afterwards.
for model_name, (fpr, tpr, auc_value) in roc_curves.items():
    plt.plot(fpr, tpr, label=f"{model_name} (AUC = {auc_value:.3f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Comparison of Different ML Models")
plt.legend()
plt.grid(True)
plt.tight_layout()
# plt.savefig("all_models_roc.png")
plt.show()
