In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import uproot, os, logging, json, random, wandb, shap, scipy
import awkward as ak
# import torch
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, log_loss, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GroupShuffleSplit, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import randint, uniform
from matplotlib.ticker import FormatStrFormatter
from collections import Counter

# ML model
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Config

# Set up plot defaults
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 12.0,8.0  # Roughly 11 cm wde by 8 cm high
mpl.rcParams['font.size'] = 14.0 # Use 14 point font
sns.set(style="whitegrid")

font_size = {
    "xlabel": 17,
    "ylabel": 17,
    "xticks": 15,
    "yticks": 15,
    "legend": 14
}

plt.rcParams.update({
    "axes.labelsize": font_size["xlabel"],  # X and Y axis labels
    "xtick.labelsize": font_size["xticks"],  # X ticks
    "ytick.labelsize": font_size["yticks"],  # Y ticks
    "legend.fontsize": font_size["legend"]  # Legend
})

# Check for gpu
# torch.cuda.is_available()

In [2]:
df = pd.read_csv("/data/jlai/vertex/vertex_data.csv")

In [10]:
# Data Preparation
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=df['event_id']))

df_train = df.iloc[train_idx].reset_index(drop=True)
df_test = df.iloc[test_idx].reset_index(drop=True)

X_train = df_train.drop(['label', 'event_id'], axis=1)
y_train = df_train['label'].values
X_test = df_test.drop(['label', 'event_id'], axis=1)
y_test = df_test['label'].values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to tensors 
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(max_iter=300, early_stopping=True, random_state=42))
])

param_dist = {
    "mlp__hidden_layer_sizes": [(32,), (64,), (64, 32), (128, 64)],
    "mlp__activation": ["relu", "tanh"],
    "mlp__solver": ["adam", "sgd"],
    "mlp__alpha": scipy.stats.loguniform(1e-5, 1e-2),
    "mlp__learning_rate_init": scipy.stats.loguniform(1e-4, 1e-2),
}

search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist,
    n_iter=20, cv=3, verbose=2, n_jobs=-1, scoring="roc_auc"
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[CV] END mlp__activation=tanh, mlp__alpha=0.0004131893529617372, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.0005225431671557881, mlp__solver=sgd; total time=   0.4s
[CV] END mlp__activation=relu, mlp__alpha=3.585398145211779e-05, mlp__hidden_layer_sizes=(128, 64), mlp__learning_rate_init=0.0005872128442934968, mlp__solver=adam; total time=   0.5s
[CV] END mlp__activation=relu, mlp__alpha=5.0954436665254554e-05, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.00018629351809766105, mlp__solver=sgd; total time=   0.4s
[CV] END mlp__activation=tanh, mlp__alpha=7.373036992242607e-05, mlp__hidden_layer_sizes=(128, 64), mlp__learning_rate_init=0.00998021448096702, mlp__solver=sgd; total time=   0.4s
[CV] END mlp__activation=tanh, mlp__alpha=1.6968395030043562e-05, mlp__hidden_layer_sizes=(32,), mlp__learning_rate_init=0.003684748210527849, mlp__solver=sgd; total time=   0.1s
[CV] END mlp__activation=tanh, mlp__alpha=0.00010211572130744182, mlp__hidden_layer_sizes=

In [12]:
y_pred_proba = search.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test ROC AUC: {auc:.4f}")
print("Best Parameters:", search.best_params_)

print(classification_report(y_test, search.predict(X_test)))

Test ROC AUC: 0.5904
Best Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.002470967732215456, 'mlp__hidden_layer_sizes': (32,), 'mlp__learning_rate_init': 0.004347947490518464, 'mlp__solver': 'adam'}
              precision    recall  f1-score   support

           0       0.69      0.98      0.81      1760
           1       0.54      0.05      0.10       808

    accuracy                           0.69      2568
   macro avg       0.62      0.52      0.46      2568
weighted avg       0.65      0.69      0.59      2568

