In [1]:
from pathlib import Path

import pandas

from helpers.datasets.make_and_save.aggregated_signal import get_delta_C9_value_of_raw_signal_file, get_trial_num_of_raw_signal_file
from helpers.datasets.make_and_save.preprocessing import apply_q_squared_veto
from helpers.experiment.constants import Paths_to_Directories


def get_original_ratio_mix_to_charge(
    num_generic_charge_events=2424628566,
    num_generic_mix_events=1813405232,
):
    """
    Return the ratio of generic mix events to generic charge events.

    Parameters
    ----------
    num_generic_charge_events : int, optional
        Event count of the generic charge sample. The default is the value
        that was hard-coded in this notebook (its provenance is not recorded
        here).
    num_generic_mix_events : int, optional
        Event count of the generic mix sample. The default is the value that
        was hard-coded in this notebook.

    Returns
    -------
    float
        ``num_generic_mix_events / num_generic_charge_events``.

    Raises
    ------
    ZeroDivisionError
        If ``num_generic_charge_events`` is 0.
    """
    return num_generic_mix_events / num_generic_charge_events


def load_standard_model_detector_level_signal_dataframe():
    """
    Load the truth-matched "det" rows of the delta C9 == 0 raw signal files.

    Every ``*.pkl`` file in the raw signal directory is considered. A file is
    kept when its trial number lies in 1..40 and its delta C9 value equals 0
    (the function name labels this selection "standard model"). The selected
    files are read and concatenated, the ``"det"`` slice of the index is
    taken, and only rows with ``isSignal == 1`` are returned.

    Returns
    -------
    pandas.DataFrame
        Concatenated ``"det"`` rows with ``isSignal == 1``.

    Raises
    ------
    ValueError
        If no file in the raw signal directory passes the selection.
    """
    trial_range = range(1, 41)
    raw_signal_dir = Path(Paths_to_Directories().path_to_raw_signal_dir)

    # Path.glob yields files in arbitrary (filesystem) order. Sorting makes the
    # row order of the result reproducible, which matters to callers that take
    # positional slices (.iloc) of the returned frame.
    file_paths = [
        path
        for path in sorted(raw_signal_dir.glob("*.pkl"))
        if get_trial_num_of_raw_signal_file(path, verbose=False) in trial_range
        and get_delta_C9_value_of_raw_signal_file(path, verbose=False) == 0
    ]
    if not file_paths:
        raise ValueError(
            f"No raw signal files with trial number in 1..40 and delta C9 == 0 found in {raw_signal_dir}"
        )

    # NOTE: read_pickle unpickles arbitrary objects; only point this at trusted files.
    dataframe = pandas.concat([pandas.read_pickle(path) for path in file_paths])
    detector_level_dataframe = dataframe.loc["det"]
    return cut_to_signal_events(detector_level_dataframe)


def load_charge_dataframe():
    """
    Read ``charge818.pkl`` and return a leading portion of its "det" slice.

    The number of rows kept is the length of the "det" slice multiplied by
    the mix-to-charge ratio, truncated to an integer; the rows are taken from
    the start of the frame.
    """
    detector_level = pandas.read_pickle("charge818.pkl").loc["det"]
    keep_fraction = get_original_ratio_mix_to_charge()
    num_rows_to_keep = int(len(detector_level) * keep_fraction)
    return detector_level.iloc[:num_rows_to_keep]


def load_mix_dataframe():
    """Read ``mix818.pkl`` and return its ``"det"`` slice."""
    full_dataframe = pandas.read_pickle("mix818.pkl")
    return full_dataframe.loc["det"]


def load_charge_and_mix_dataframes():
    """Return the pair ``(charge dataframe, mix dataframe)``, loading charge first."""
    return load_charge_dataframe(), load_mix_dataframe()


def cut_to_signal_region(dataframe):
    """
    Keep the rows with ``-0.05 < deltaE < 0.05`` and ``Mbc > 5.27``.

    All bounds are exclusive.
    """
    in_delta_e_window = (dataframe["deltaE"] > -0.05) & (dataframe["deltaE"] < 0.05)
    above_mbc_threshold = dataframe["Mbc"] > 5.27
    return dataframe[in_delta_e_window & above_mbc_threshold]


def cut_to_sideband(dataframe):
    """
    Keep the rows with ``-0.05 < deltaE < 0.05`` and ``5.0 < Mbc < 5.26``.

    All bounds are exclusive. The deltaE window is the same one used for the
    signal region (for now).
    """
    in_delta_e_window = (dataframe["deltaE"] > -0.05) & (dataframe["deltaE"] < 0.05)
    in_mbc_sideband = (dataframe["Mbc"] > 5.0) & (dataframe["Mbc"] < 5.26)
    return dataframe[in_delta_e_window & in_mbc_sideband]


def cut_to_signal_events(dataframe):
    """Keep only the rows whose ``isSignal`` value equals 1."""
    is_signal = dataframe["isSignal"].eq(1)
    return dataframe.loc[is_signal]


def cut_to_bkg_events(dataframe):
    """Keep only the rows whose ``isSignal`` value is not equal to 1 (NaN rows are kept)."""
    is_not_signal = dataframe["isSignal"].ne(1)
    return dataframe.loc[is_not_signal]


def cut_to_multiple_candidate_events(dataframe):
    """Keep only the rows whose ``__ncandidates__`` value differs from 1."""
    not_single_candidate = dataframe["__ncandidates__"].ne(1)
    return dataframe.loc[not_single_candidate]



In [2]:
# Build the signal-region validation/test samples and the sideband training
# sample, then write them to parquet.

# Fixed seeds make the resampled MC signal identical on every run, so the
# written parquet files are reproducible. The two draws use different seeds so
# they are not the same random sequence.
SEED_MC_SIGNAL_VAL = 0
SEED_MC_SIGNAL_TRAIN = 1


def _split_in_half(dataframe):
    """Return (first half, second half) by row position; an odd extra row goes to the second half."""
    num_in_first_half = len(dataframe) // 2
    return dataframe.iloc[:num_in_first_half], dataframe.iloc[num_in_first_half:]


df_charge, df_mix = load_charge_and_mix_dataframes()

df_charge = apply_q_squared_veto(df_charge, "resonances")
df_mix = apply_q_squared_veto(df_mix, "resonances")

# Signal-region background: first 50% -> validation, remainder -> test.
df_charge_sr_bkg = cut_to_bkg_events(cut_to_signal_region(df_charge))
df_charge_sr_bkg_val, df_charge_sr_bkg_test = _split_in_half(df_charge_sr_bkg)

df_mix_sr_bkg = cut_to_bkg_events(cut_to_signal_region(df_mix))
df_mix_sr_bkg_val, df_mix_sr_bkg_test = _split_in_half(df_mix_sr_bkg)

# Signal-region signal taken from the mix sample: same 50/50 split.
df_mix_sr_signal = cut_to_signal_events(cut_to_signal_region(df_mix))
df_mix_sr_signal_val, df_mix_sr_signal_test = _split_in_half(df_mix_sr_signal)

# Sideband background, used for training in full.
df_charge_sb_bkg_train = cut_to_bkg_events(cut_to_sideband(df_charge))
df_mix_sb_bkg_train = cut_to_bkg_events(cut_to_sideband(df_mix))

# MC signal is loaded once and resampled (with replacement) to match the
# number of background rows it is paired with.
# NOTE(review): despite the "sr_signal" names, neither the signal-region cut
# nor the q^2 veto is applied to the MC signal in this cell - confirm intended.
df_mc_signal = load_standard_model_detector_level_signal_dataframe()

num_val_bkg = len(df_charge_sr_bkg_val) + len(df_mix_sr_bkg_val)
# NOTE(review): this frame is only counted below; df_sr_val is built from the
# mix signal instead - confirm intended.
df_mc_signal_sr_signal_val = df_mc_signal.sample(
    n=num_val_bkg, replace=True, random_state=SEED_MC_SIGNAL_VAL
)

num_train_bkg = len(df_charge_sb_bkg_train) + len(df_mix_sb_bkg_train)
df_mc_signal_sr_signal_train = df_mc_signal.sample(
    n=num_train_bkg, replace=True, random_state=SEED_MC_SIGNAL_TRAIN
)

print("Charge, bkg., signal region, val.: ", len(df_charge_sr_bkg_val))
print("Charge, bkg., signal region, test: ", len(df_charge_sr_bkg_test))
print("Mix, bkg., signal region, val.: ", len(df_mix_sr_bkg_val))
print("Mix, bkg., signal region, test: ", len(df_mix_sr_bkg_test))
print("MC signal, signal, signal region, val: ", len(df_mc_signal_sr_signal_val))
print("Mix, signal, signal region, val.: ", len(df_mix_sr_signal_val))
print("Mix, signal, signal region, test: ", len(df_mix_sr_signal_test))
print("Charge, bkg., sideband, train: ", len(df_charge_sb_bkg_train))
print("Mix, bkg., sideband, train: ", len(df_mix_sb_bkg_train))
print("MC signal, signal, signal region, train: ", len(df_mc_signal_sr_signal_train))


# Columns written to the parquet files.
features = [
    "isSignal", 
    "tfRedChiSqB0", 
    "deltaE", 
    "invM_K_pi_shifted",
    "K_p_kaonID", 
    "K_p_dr", 
    "K_p_dz", 
    "pi_m_dr", 
    "pi_m_dz", 
    "mu_p_dr", 
    "mu_p_dz", 
    "mu_p_muonID", 
    "mu_m_dr", 
    "mu_m_dz", 
    "mu_m_muonID"
]

df_sr_val = pandas.concat([df_charge_sr_bkg_val[features], df_mix_sr_bkg_val[features], df_mix_sr_signal_val[features]])
df_sr_test = pandas.concat([df_charge_sr_bkg_test[features], df_mix_sr_bkg_test[features], df_mix_sr_signal_test[features]])
df_sb_train = pandas.concat([df_charge_sb_bkg_train[features], df_mix_sb_bkg_train[features], df_mc_signal_sr_signal_train[features]])

# The pieces must add up, otherwise a split above dropped or duplicated rows.
assert len(df_sr_val) + len(df_sr_test) == len(df_charge_sr_bkg) + len(df_mix_sr_bkg) + len(df_mix_sr_signal)
assert len(df_sb_train) == 2 * num_train_bkg

df_sr_val.to_parquet("sr_val.parquet")
df_sr_test.to_parquet("sr_test.parquet")
df_sb_train.to_parquet("sb_train.parquet")

print("Signal region, val.: ", len(df_sr_val))
print("Signal region, test: ", len(df_sr_test))
print("Sideband, train: ", len(df_sb_train))

Applying q^2 veto.
Applied q^2 veto.
Applying q^2 veto.
Applied q^2 veto.
Charge, bkg., signal region, val.:  3105
Charge, bkg., signal region, test:  3106
Mix, bkg., signal region, val.:  4690
Mix, bkg., signal region, test:  4691
MC signal, signal, signal region, val:  7795
Mix, signal, signal region, val.:  988
Mix, signal, signal region, test:  989
Charge, bkg., sideband, train:  146653
Mix, bkg., sideband, train:  111894
MC signal, signal, signal region, train:  258547
Signal region, val.:  8783
Signal region, test:  8786
Sideband, train:  517094


In [None]:

# Count events per sample (charge / mix), truth class (bkg. / signal) and
# region (signal region / sideband).
#
# df_charge and df_mix are used as left in the namespace by an earlier cell.
# load_charge_dataframe() already selects the "det" slice and trims the charge
# sample by the mix-to-charge ratio, and load_mix_dataframe() already selects
# the "det" slice, so the frames are used directly: the former
# reduce_num_charge_events_to_num_mix_events(...) helper is not defined in this
# notebook, and a second .loc["det"] would fail on the already-sliced frames.

num_charge_bkg_signal_region = len(cut_to_bkg_events(cut_to_signal_region(df_charge)))

num_mix_bkg_signal_region = len(cut_to_bkg_events(cut_to_signal_region(df_mix)))

num_mix_signal_signal_region = len(cut_to_signal_events(cut_to_signal_region(df_mix)))

num_charge_bkg_sideband = len(cut_to_bkg_events(cut_to_sideband(df_charge)))

num_mix_bkg_sideband = len(cut_to_bkg_events(cut_to_sideband(df_mix)))

print("Charge, bkg., signal region: ", num_charge_bkg_signal_region)
print("Mix, bkg., signal region: ", num_mix_bkg_signal_region)
print("Mix, signal, signal region: ", num_mix_signal_signal_region)
print("Charge, bkg., sideband: ", num_charge_bkg_sideband)
print("Mix, bkg., sideband: ", num_mix_bkg_sideband)

In [None]:
# Build the signal-region training sample and the sideband validation sample,
# then write them to parquet.
#
# load_charge_dataframe() already selects the "det" slice and trims the charge
# sample by the mix-to-charge ratio, and load_mix_dataframe() already selects
# the "det" slice, so the loaded frames are used directly: the former
# reduce_num_charge_events_to_num_mix_events(...) helper is not defined in this
# notebook, and a second .loc["det"] would fail on the already-sliced frames.
# NOTE(review): no q^2 veto is applied in this cell - confirm intended.
df_charge, df_mix = load_charge_and_mix_dataframes()

# Signal-region background: first 50% -> train, remainder -> test.
df_charge_sr_bkg = cut_to_bkg_events(cut_to_signal_region(df_charge))
num_in_train_charge_sr_bkg = int(0.5*len(df_charge_sr_bkg))
df_charge_sr_bkg_train = df_charge_sr_bkg.iloc[:num_in_train_charge_sr_bkg]
df_charge_sr_bkg_test = df_charge_sr_bkg.iloc[num_in_train_charge_sr_bkg:]

df_mix_sr_bkg = cut_to_bkg_events(cut_to_signal_region(df_mix))
num_in_train_mix_sr_bkg = int(0.5*len(df_mix_sr_bkg)) #  50%
df_mix_sr_bkg_train = df_mix_sr_bkg.iloc[:num_in_train_mix_sr_bkg]
df_mix_sr_bkg_test = df_mix_sr_bkg.iloc[num_in_train_mix_sr_bkg:]

# Sideband background: first 50% -> validation, remainder -> test.
df_charge_sb_bkg = cut_to_bkg_events(cut_to_sideband(df_charge))
num_in_val_charge_sb_bkg = int(0.5*len(df_charge_sb_bkg))
df_charge_sb_bkg_val = df_charge_sb_bkg.iloc[:num_in_val_charge_sb_bkg]
df_charge_sb_bkg_test = df_charge_sb_bkg.iloc[num_in_val_charge_sb_bkg:]

df_mix_sb_bkg = cut_to_bkg_events(cut_to_sideband(df_mix))
num_in_val_mix_sb_bkg = int(0.5*len(df_mix_sb_bkg))
df_mix_sb_bkg_val = df_mix_sb_bkg.iloc[:num_in_val_mix_sb_bkg]
df_mix_sb_bkg_test = df_mix_sb_bkg.iloc[num_in_val_mix_sb_bkg:]

# MC signal is loaded once; train and validation take consecutive,
# non-overlapping row ranges sized to match their background counts.
# NOTE: .iloc slicing silently returns fewer rows if the MC signal frame is
# shorter than num_train_bkg + num_val_bkg; compare the printed counts below.
df_mc_signal = load_standard_model_detector_level_signal_dataframe()

num_train_bkg = len(df_charge_sr_bkg_train) + len(df_mix_sr_bkg_train)
df_mc_signal_sr_signal_train = df_mc_signal.iloc[:num_train_bkg]

num_val_bkg = len(df_charge_sb_bkg_val) + len(df_mix_sb_bkg_val)
df_mc_signal_sr_signal_val = df_mc_signal.iloc[num_train_bkg:num_train_bkg + num_val_bkg]


print("Charge, bkg., signal region, train.: ", len(df_charge_sr_bkg_train))
print("Charge, bkg., signal region, test: ", len(df_charge_sr_bkg_test))
print("Mix, bkg., signal region, train.: ", len(df_mix_sr_bkg_train))
print("Mix, bkg., signal region, test: ", len(df_mix_sr_bkg_test))
print("MC signal, signal, signal region, train: ", len(df_mc_signal_sr_signal_train))
print("Charge, bkg., sideband, val: ", len(df_charge_sb_bkg_val))
print("Mix, bkg., sideband, val: ", len(df_mix_sb_bkg_val))
print("MC signal, signal, signal region, val.: ", len(df_mc_signal_sr_signal_val))


# Columns written to the parquet files.
features = [
    "isSignal", 
    "tfRedChiSqB0", 
    "deltaE", 
    "invM_K_pi_shifted",
    "K_p_kaonID", 
    "K_p_dr", 
    "K_p_dz", 
    "pi_m_dr", 
    "pi_m_dz", 
    "mu_p_dr", 
    "mu_p_dz", 
    "mu_p_muonID", 
    "mu_m_dr", 
    "mu_m_dz", 
    "mu_m_muonID"
]

df_sr_train = pandas.concat([df_charge_sr_bkg_train[features], df_mix_sr_bkg_train[features], df_mc_signal_sr_signal_train[features]])
df_sb_val = pandas.concat([df_charge_sb_bkg_val[features], df_mix_sb_bkg_val[features], df_mc_signal_sr_signal_val[features]])

df_sr_train.to_parquet("sr_train.parquet")
df_sb_val.to_parquet("sb_val.parquet")

print("Signal region, train: ", len(df_sr_train))
print("Sideband, val.: ", len(df_sb_val))

In [None]:
# Scratch cell: list the column names of the mix dataframe.
df_mix.columns.to_list()

In [None]:
# Scratch cell: load a single raw signal pickle for inspection.
# NOTE(review): the path is relative to the notebook's working directory, and
# `df` is a generic name that a later cell reads - confirm before reusing it.
df = pandas.read_pickle("../../state/new_physics/data/raw/signal/dc9_-0.01_1_re.pkl")

In [None]:
from pathlib import Path
from helpers.experiment.constants import Paths_to_Directories



In [None]:
# Scratch cell: display the "det" slice of the raw signal frame held in `df`.
df.loc["det"]

In [3]:
# Ratio of background to signal rows in the signal-region validation sample.
# The numbers are copied from the printed counts of the sample-building cell:
# 3105 (charge bkg. val.) + 4690 (mix bkg. val.) over 988 (mix signal val.).
# They must be updated by hand if those counts change.
(3105+4690)/988

7.8896761133603235

In [None]:
# Scratch cell: display the frame returned by load_charge_dataframe().
load_charge_dataframe()