In [1]:
from pathlib import Path

import pandas

from helpers.datasets.make_and_save.aggregated_signal import get_delta_C9_value_of_raw_signal_file, get_trial_num_of_raw_signal_file
from helpers.datasets.make_and_save.preprocessing import apply_q_squared_veto
from helpers.experiment.constants import Paths_to_Directories


# Directory that every parquet file produced by this notebook is written to:
# the "new" sub-directory of the configured raw-background directory.
bkg_dir = Paths_to_Directories().path_to_raw_bkg_dir.joinpath("new")

# Columns kept in every dataset saved below (selected via ``df[columns]``).
# The list is assembled group by group; the resulting order is unchanged.
columns = (
    ["isSignal", "tfRedChiSqB0", "deltaE", "invM_K_pi_shifted"]
    + ["K_p_kaonID", "K_p_dr", "K_p_dz"]
    + ["pi_m_dr", "pi_m_dz"]
    + ["mu_p_dr", "mu_p_dz", "mu_p_muonID"]
    + ["mu_m_dr", "mu_m_dz", "mu_m_muonID"]
    + ["q_squared", "costheta_mu", "costheta_K", "chi"]
)


def load_standard_model_detector_level_signal_dataframe():
    """Load the detector-level signal events with a delta C9 value of 0.

    Raw signal pickles (``*.pkl``) are collected from the raw-signal directory,
    restricted to trial numbers 1-40 and to files whose delta C9 value equals
    0, and concatenated.  The rows labelled ``"det"`` are then selected with
    ``.loc``, only rows with ``isSignal == 1`` are kept, and the
    ``"resonances"`` q^2 veto is applied.

    Returns:
        The filtered detector-level dataframe.

    Raises:
        ValueError: If no raw signal file passes the trial / delta C9 selection.
    """
    trial_range = range(1, 41)
    signal_dir = Path(Paths_to_Directories().path_to_raw_signal_dir)

    # Path.glob yields entries in filesystem order, which is not guaranteed to
    # be stable.  Sorting makes the concatenated row order -- and therefore any
    # positional (iloc-based) split of the returned frame -- reproducible
    # across runs and machines.
    file_paths = sorted(signal_dir.glob("*.pkl"))
    file_paths = [path for path in file_paths if get_trial_num_of_raw_signal_file(path, verbose=False) in trial_range]
    file_paths = [path for path in file_paths if get_delta_C9_value_of_raw_signal_file(path, verbose=False) == 0]

    if not file_paths:
        # pandas.concat would raise an uninformative "No objects to concatenate".
        raise ValueError(
            f"No raw signal files in {signal_dir} match trials "
            f"{trial_range.start}-{trial_range.stop - 1} with delta C9 == 0."
        )

    dataframe = pandas.concat([pandas.read_pickle(path) for path in file_paths])
    detector_level_dataframe = dataframe.loc["det"]
    detector_level_dataframe = cut_to_signal_events(detector_level_dataframe)
    detector_level_dataframe = apply_q_squared_veto(detector_level_dataframe, "resonances")
    return detector_level_dataframe


def get_original_ratio_mix_to_charge():
    """Return the number of generic mix events divided by the number of generic charge events.

    Both event counts are hard-coded in this function.
    """
    generic_event_counts = {"mix": 1813405232, "charge": 2424628566}
    return generic_event_counts["mix"] / generic_event_counts["charge"]


def load_charge_dataframe():
    """Load the charge sample from ``charge818.pkl``.

    The rows labelled ``"det"`` are selected, the frame is truncated to its
    leading ``len * (mix / charge ratio)`` rows, and the ``"resonances"`` q^2
    veto is applied to what remains.
    """
    detector_level = pandas.read_pickle("charge818.pkl").loc["det"]
    # Truncate before the veto, so the kept fraction refers to the unvetoed sample.
    num_rows_to_keep = int(len(detector_level) * get_original_ratio_mix_to_charge())
    balanced = detector_level.iloc[:num_rows_to_keep]
    return apply_q_squared_veto(balanced, "resonances")


def load_mix_dataframe():
    """Load the mix sample from ``mix818.pkl``.

    The rows labelled ``"det"`` are selected and the ``"resonances"`` q^2 veto
    is applied.
    """
    detector_level = pandas.read_pickle("mix818.pkl").loc["det"]
    return apply_q_squared_veto(detector_level, "resonances")


def load_charge_and_mix_dataframes():
    """Return ``(charge_dataframe, mix_dataframe)``, loading charge first and mix second."""
    return load_charge_dataframe(), load_mix_dataframe()


def cut_to_signal_region(dataframe):
    """Keep the rows with ``-0.05 < deltaE < 0.05`` and ``Mbc > 5.27`` (all bounds exclusive)."""
    delta_e = dataframe["deltaE"]
    inside_delta_e_window = (delta_e < 0.05) & (delta_e > -0.05)
    above_mbc_threshold = dataframe["Mbc"] > 5.27
    return dataframe[inside_delta_e_window & above_mbc_threshold]


def cut_to_sideband(dataframe):
    """Keep the rows with ``-0.05 < deltaE < 0.05`` and ``5.0 < Mbc < 5.26`` (all bounds exclusive)."""
    delta_e = dataframe["deltaE"]
    mbc = dataframe["Mbc"]
    # The deltaE window is, for now, the same one used for the signal region.
    inside_delta_e_window = (delta_e < 0.05) & (delta_e > -0.05)
    inside_mbc_window = (mbc > 5.0) & (mbc < 5.26)
    return dataframe[inside_delta_e_window & inside_mbc_window]


def cut_to_signal_events(dataframe):
    """Keep only the rows whose ``isSignal`` value equals 1."""
    is_signal = dataframe["isSignal"] == 1
    return dataframe.loc[is_signal]


def cut_to_bkg_events(dataframe):
    """Keep only the rows whose ``isSignal`` value is not 1 (rows with a missing value are kept)."""
    is_not_signal = dataframe["isSignal"] != 1
    return dataframe.loc[is_not_signal]


def cut_to_multiple_candidate_events(dataframe):
    """Keep only the rows whose ``__ncandidates__`` value is not 1."""
    has_other_than_one_candidate = dataframe["__ncandidates__"] != 1
    return dataframe.loc[has_other_than_one_candidate]


def save_signal_region_datasets():
    """Write the signal-region validation and test datasets to ``bkg_dir``.

    Three selections are made inside the signal region: charge background,
    mix background and mix signal.  Each is split positionally, with the
    leading ``val_split_frac`` of the rows going to validation and the rest to
    test, and both parts are saved (restricted to ``columns``) as parquet
    files.  The validation parts and the test parts are then concatenated and
    saved as ``sr_val.parquet`` and ``sr_test.parquet``.  Finally the row count
    of every saved dataset is printed.
    """
    val_split_frac = 0.5

    df_charge, df_mix = load_charge_and_mix_dataframes()

    # (file-name stem, label for the printed summary, source frame, truth selection)
    categories = (
        ("charge_sr_bkg", "Charge, bkg.", df_charge, cut_to_bkg_events),
        ("mix_sr_bkg", "Mix, bkg.", df_mix, cut_to_bkg_events),
        ("mix_sr_signal", "Mix, signal", df_mix, cut_to_signal_events),
    )

    val_parts = []
    test_parts = []
    summary = []
    for stem, label, source, truth_cut in categories:
        selected = truth_cut(cut_to_signal_region(source))
        split_at = int(val_split_frac * len(selected))

        val_part = selected.iloc[:split_at][columns]
        val_part.to_parquet(bkg_dir.joinpath(f"{stem}_val.parquet"))
        test_part = selected.iloc[split_at:][columns]
        test_part.to_parquet(bkg_dir.joinpath(f"{stem}_test.parquet"))

        val_parts.append(val_part)
        test_parts.append(test_part)
        summary.append((label, len(val_part), len(test_part)))

    # Combined datasets: charge background, then mix background, then mix signal.
    df_sr_val = pandas.concat(val_parts)
    df_sr_test = pandas.concat(test_parts)

    df_sr_val.to_parquet(bkg_dir.joinpath("sr_val.parquet"))
    df_sr_test.to_parquet(bkg_dir.joinpath("sr_test.parquet"))

    for label, num_val_rows, num_test_rows in summary:
        print(f"{label}, signal region, val.: ", num_val_rows)
        print(f"{label}, signal region, test: ", num_test_rows)
    print("Signal region, val.: ", len(df_sr_val))
    print("Signal region, test: ", len(df_sr_test))


def save_sideband_datasets():
    """Write the sideband train / validation / test datasets to ``bkg_dir``.

    The charge and mix background events in the sideband are each split
    positionally: the leading ``train_split_frac`` of the rows form the
    training part, the next ``val_split_frac`` the validation part, and the
    remainder the test part.  Each part is saved (restricted to ``columns``)
    as a parquet file.  Consecutive slices of the MC signal frame, each as long
    as the total background in the corresponding split, are then taken;
    assertions check that the signal frame had enough rows.  The background
    and signal parts of each split are concatenated and saved as
    ``sb_train.parquet``, ``sb_val.parquet`` and ``sb_test.parquet``, and the
    row count of every dataset is printed.
    """
    train_split_frac = 0.5
    val_split_frac = 0.20

    df_charge, df_mix = load_charge_and_mix_dataframes()
    df_signal = load_standard_model_detector_level_signal_dataframe()

    split_names = ("train", "val", "test")
    # Split names as they appear in the printed summary.
    split_labels = {"train": "train", "val": "val.", "test": "test"}

    bkg_parts = {}
    for stem, source in (("charge", df_charge), ("mix", df_mix)):
        sideband_bkg = cut_to_bkg_events(cut_to_sideband(source))
        train_end = int(train_split_frac * len(sideband_bkg))
        val_end = train_end + int(val_split_frac * len(sideband_bkg))
        parts = {
            "train": sideband_bkg.iloc[:train_end],
            "val": sideband_bkg.iloc[train_end:val_end],
            "test": sideband_bkg.iloc[val_end:],
        }
        for split in split_names:
            parts[split][columns].to_parquet(bkg_dir.joinpath(f"{stem}_sb_bkg_{split}.parquet"))
        bkg_parts[stem] = parts

    # Take as many signal rows per split as there are background rows in it,
    # walking through the signal frame in consecutive, non-overlapping slices.
    bkg_counts = {
        split: len(bkg_parts["charge"][split]) + len(bkg_parts["mix"][split])
        for split in split_names
    }
    signal_parts = {}
    start = 0
    for split in split_names:
        stop = start + bkg_counts[split]
        signal_parts[split] = df_signal.iloc[start:stop]
        start = stop
    for split in split_names:
        assert len(signal_parts[split]) == bkg_counts[split]

    # Combined datasets: charge background, then mix background, then signal.
    combined = {
        split: pandas.concat([
            bkg_parts["charge"][split][columns],
            bkg_parts["mix"][split][columns],
            signal_parts[split][columns],
        ])
        for split in split_names
    }
    for split in split_names:
        combined[split].to_parquet(bkg_dir.joinpath(f"sb_{split}.parquet"))

    summary_groups = (
        ("Charge, bkg., sideband", bkg_parts["charge"]),
        ("Mix, bkg., sideband", bkg_parts["mix"]),
        ("Signal, MC signal", signal_parts),
        ("Sideband", combined),
    )
    for group_label, parts in summary_groups:
        for split in split_names:
            print(f"{group_label}, {split_labels[split]}: ", len(parts[split]))


In [2]:
# Regenerate every parquet dataset: the signal-region files first, then the
# sideband files.  Each call reloads the charge and mix pickles itself.
save_signal_region_datasets()
save_sideband_datasets()

Applying q^2 veto.
Applied q^2 veto.
Applying q^2 veto.
Applied q^2 veto.
Charge, bkg., signal region, val.:  2692
Charge, bkg., signal region, test:  2692
Mix, bkg., signal region, val.:  2162
Mix, bkg., signal region, test:  2162
Mix, signal, signal region, val.:  889
Mix, signal, signal region, test:  890
Signal region, val.:  5743
Signal region, test:  5744
Applying q^2 veto.
Applied q^2 veto.
Applying q^2 veto.
Applied q^2 veto.
Applying q^2 veto.
Applied q^2 veto.
Charge, bkg., sideband, train:  66189
Charge, bkg., sideband, val.:  26475
Charge, bkg., sideband, test:  39715
Mix, bkg., sideband, train:  49902
Mix, bkg., sideband, val.:  19960
Mix, bkg., sideband, test:  29942
Signal, MC signal, train:  116091
Signal, MC signal, val.:  46435
Signal, MC signal, test:  69657
Sideband, train:  232182
Sideband, val.:  92870
Sideband, test:  139314


In [None]:
# Read back the combined signal-region validation set.  save_signal_region_datasets()
# writes it into bkg_dir, so it is read from bkg_dir rather than from the current
# working directory.
validation_signal_region_dataset = pandas.read_parquet(bkg_dir.joinpath("sr_val.parquet"))
# Replace missing isSignal values with 0.
validation_signal_region_dataset["isSignal"] = validation_signal_region_dataset["isSignal"].fillna(value=0)


In [None]:
import matplotlib.pyplot as plt


In [None]:
# Histogram of q_squared for the validation signal-region dataset, using 500
# bins over the range 11 to 16; the y-axis is capped at 300 entries per bin.
fig, ax = plt.subplots()
ax.hist(validation_signal_region_dataset["q_squared"], range=(11, 16), bins=500)
ax.set_ylim(0, 300)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("q_squared")
ax.set_ylabel("Entries per bin")
ax.set_title("Validation signal-region dataset: q_squared")
plt.show()