In [1]:
import os
import pickle
import uproot
import pandas as pd
import numpy as np
import uproot
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import awkward as ak
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, roc_curve, auc
from utilities import *

In [None]:
def prepare_data_4(test_size=0.2, accept_data_filename="l1calo_hist_EGZ_extended.root", reject_data_filename="l1calo_hist_ZMUMU_extended.root", data_subdir="ZMUMU_EGZ_extended_np_pd",format_mode="SuperCell_ET"):
    """Load (or build and cache) the training inputs, then split them into train/test sets.

    The prepared data lives in ``../data/<data_subdir>`` as two files:
    ``np_data.npz`` (arrays ``input_np`` and ``labels_np``, plus the
    ``format_mode`` they were built with) and ``input_df.parquet`` (a frame with
    the columns ``offline_ele_pt`` and ``Label``). When both files exist and
    were built with the requested ``format_mode`` they are reused; otherwise the
    two ROOT files are read through ``import_data_files`` and the cache is
    (re)written.

    Args:
        test_size: forwarded to ``train_test_split``.
        accept_data_filename: file under ``../data`` whose rows get label 1.
        reject_data_filename: file under ``../data`` whose rows get label 0.
        data_subdir: cache directory under ``../data``.
        format_mode: forwarded to ``format_numpy_training_input``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pd_passthrough_train,
        pd_passthrough_test)``; the split uses ``random_state=42``.
    """
    data_dir = os.path.join(os.path.pardir, "data")
    save_path = os.path.join(data_dir, data_subdir)
    np_cache_path = os.path.join(save_path, "np_data.npz")
    df_cache_path = os.path.join(save_path, "input_df.parquet")

    input_np = labels_np = None
    if os.path.exists(np_cache_path) and os.path.exists(df_cache_path):
        # An .npz archive keeps its file handle open until closed, so scope it.
        with np.load(np_cache_path) as np_data:
            # Caches written before the mode was recorded have no "format_mode"
            # entry; keep trusting them so existing caches stay usable.
            if "format_mode" in np_data.files:
                cached_mode = np_data["format_mode"].item()
            else:
                cached_mode = format_mode
            if cached_mode == format_mode:
                input_np, labels_np = np_data["input_np"], np_data["labels_np"]
            else:
                # Reusing this cache would silently hand back inputs in the wrong format.
                print(f"preprepared data in {save_path} was built with format_mode={cached_mode!r}, "
                      f"not {format_mode!r}; preparing and saving again")
    else:
        print(f"preprepared data in {save_path} is missing, preparing and saving here")

    if input_np is not None:
        print(f"found preprepared data in {save_path}")
        input_df = pd.read_parquet(df_cache_path)
    else:
        accept_data_path = os.path.join(data_dir, accept_data_filename)
        reject_data_path = os.path.join(data_dir, reject_data_filename)
        DFs = import_data_files([accept_data_path, reject_data_path])

        # Accepted rows come first everywhere below so inputs, labels and the
        # passthrough frame stay row-aligned.
        input_np = format_numpy_training_input(DFs, format_mode)
        labels_np = np.concatenate((np.ones(DFs[0].shape[0]), np.zeros(DFs[1].shape[0])), axis=0)
        accepted_df = pd.DataFrame({'offline_ele_pt': DFs[0]['offline_ele_pt'], 'Label': 1})
        rejected_df = pd.DataFrame({'offline_ele_pt': DFs[1]['offline_ele_pt'], 'Label': 0})
        input_df = pd.concat([accepted_df, rejected_df]).reset_index(drop=True)

        # makedirs also creates a missing parent and tolerates an existing directory.
        os.makedirs(save_path, exist_ok=True)
        np.savez(np_cache_path, input_np=input_np, labels_np=labels_np, format_mode=np.array(str(format_mode)))
        input_df.to_parquet(df_cache_path, index=False)

    print(input_np[0].shape, input_np[0])
    X_train, X_test, pd_passthrough_train, pd_passthrough_test, y_train, y_test = train_test_split(input_np, input_df, labels_np, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test, pd_passthrough_train, pd_passthrough_test

In [None]:
def _drop_cells_by_seed(cells, seed_window, delete_indices):
    """Remove from each row the columns selected by that row's seed column.

    The seed column of a row is the position of its largest value among the
    columns ``seed_window[0]..seed_window[1]`` (inclusive; ties resolve to the
    lowest column). ``delete_indices`` maps every possible seed column to the
    list of columns to drop; all lists must have the same length so the result
    stays rectangular.
    """
    first_seed, last_seed = seed_window
    seed_indices = np.argmax(cells[:, first_seed:last_seed + 1], axis=1) + first_seed
    n_dropped = len(delete_indices[first_seed])
    reduced = np.empty((cells.shape[0], cells.shape[1] - n_dropped), dtype=cells.dtype)
    # Handle all rows sharing a seed column at once instead of row by row.
    for seed, dropped_columns in delete_indices.items():
        rows = seed_indices == seed
        reduced[rows] = np.delete(cells[rows], dropped_columns, axis=1)
    return reduced


def format_numpy_training_input(DFs,format_mode):
    """Build the model input array from the accepted (``DFs[0]``) and rejected (``DFs[1]``) samples.

    Args:
        DFs: two-element sequence; each element is indexed by column name.
        format_mode: one of
            ``"SuperCell_ET"`` - the ``SuperCell_ET`` column converted with ``ak.to_numpy``;
            ``"iso_vars"`` - the six ``eFEX_*`` columns as ``float32``;
            ``"reduced_SuperCell_ET"`` - ``SuperCell_ET`` with six columns removed
            per row, chosen by which of columns 49-52 holds the row's largest value.

    Returns:
        The accepted rows followed by the rejected rows, concatenated along axis 0.

    Raises:
        ValueError: if ``format_mode`` is not one of the modes listed above.
    """
    if format_mode == "SuperCell_ET":
        accepted_numpy = ak.to_numpy(DFs[0]['SuperCell_ET'])
        rejected_numpy = ak.to_numpy(DFs[1]['SuperCell_ET'])

    elif format_mode == "iso_vars":
        columns = ["eFEX_ReC", "eFEX_ReE", "eFEX_RhE", "eFEX_RhH", "eFEX_WsN", "eFEX_WsD"]
        accepted_numpy = DFs[0][columns].to_numpy(dtype=np.float32)
        rejected_numpy = DFs[1][columns].to_numpy(dtype=np.float32)

    elif format_mode == "reduced_SuperCell_ET":
        # NOTE(review): the exploratory notebook cell further down uses an 18-column
        # table for the same seeds (these columns plus the same pattern shifted by
        # -33 and +33) - confirm which table is intended here.
        delete_indices = {
            49: [57, 58, 59, 61, 62, 63],  # Indices to delete for 49
            50: [34, 38, 58, 59, 62, 63],  # Indices to delete for 50
            51: [34, 35, 38, 39, 59, 63],  # Indices to delete for 51
            52: [34, 35, 36, 38, 39, 40],  # Indices to delete for 52
        }
        accepted_numpy = _drop_cells_by_seed(ak.to_numpy(DFs[0]['SuperCell_ET']), (49, 52), delete_indices)
        rejected_numpy = _drop_cells_by_seed(ak.to_numpy(DFs[1]['SuperCell_ET']), (49, 52), delete_indices)

    else:
        # Without this branch an unknown mode surfaced as an UnboundLocalError at the return.
        raise ValueError(
            f"unknown format_mode {format_mode!r}; expected 'SuperCell_ET', 'iso_vars' or 'reduced_SuperCell_ET'"
        )

    return np.concatenate((accepted_numpy, rejected_numpy), axis=0)
    

In [2]:
# Both ROOT files sit in ../data; the accepted sample is loaded first, the rejected second.
accept_data_path, reject_data_path = (
    os.path.join(os.path.pardir, "data", root_filename)
    for root_filename in ("l1calo_hist_EGZ_extended.root", "l1calo_hist_ZMUMU_extended.root")
)
DFs = import_data_files([accept_data_path, reject_data_path])

In [3]:
# Convert the SuperCell_ET column of the accepted (DFs[0]) and rejected (DFs[1]) samples to numpy.
full_accepted_numpy, full_rejected_numpy = (
    ak.to_numpy(sample['SuperCell_ET']) for sample in (DFs[0], DFs[1])
)

In [None]:
# For every row, find which of columns 49-52 holds the largest value and keep its column index.
accepted_seed_indices = 49 + full_accepted_numpy[:, 49:53].argmax(axis=1)
rejected_seed_indices = 49 + full_rejected_numpy[:, 49:53].argmax(axis=1)

# Columns to remove from a row, keyed by that row's seed column.
# Each entry is one six-column pattern listed at three offsets (-33, 0, +33).
delete_indices = {
    49: [24, 25, 26, 28, 29, 30,
         57, 58, 59, 61, 62, 63,
         90, 91, 92, 94, 95, 96],
    50: [1, 5, 25, 26, 29, 30,
         34, 38, 58, 59, 62, 63,
         67, 71, 91, 92, 95, 96],
    51: [1, 2, 5, 6, 26, 30,
         34, 35, 38, 39, 59, 63,
         67, 68, 71, 72, 92, 96],
    52: [1, 2, 3, 5, 6, 7,
         34, 35, 36, 38, 39, 40,
         67, 68, 69, 71, 72, 73],
}

In [None]:

# Drop, row by row, the columns that delete_indices lists for that row's seed column.
accepted_numpy, rejected_numpy = (
    np.array([np.delete(cells, delete_indices[seed]) for cells, seed in zip(sample_cells, sample_seeds)])
    for sample_cells, sample_seeds in (
        (full_accepted_numpy, accepted_seed_indices),
        (full_rejected_numpy, rejected_seed_indices),
    )
)


In [18]:
# Accepted rows first, rejected rows after them, joined along the first axis.
input_np = np.concatenate([accepted_numpy, rejected_numpy])
# Show the length of one row and, as a sample, the contents of row 10.
print(input_np[0].shape, input_np[10])

(81,) [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         5.6500001  0.         0.         1.70000005
 4.07499981 0.         0.72500002 5.5        9.25       0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.32499999 0.69999999 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
