# Kernel Density Estimation (KDE) for the dijet mass distribution

## Imports

In [None]:
from os.path import join

import energyflow as ef
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from cycler import cycler
from sklearn.neighbors import KernelDensity

In [None]:
# Global matplotlib style for every figure in this notebook: a fixed
# four-colour cycle, a base font size of 15 and a patch line width of 1.25.
mpl.rcParams.update(
    {
        "axes.prop_cycle": cycler(color=["#B6BFC3", "#3B515B", "#0271BB", "#E2001A"]),
        "font.size": 15,
        "patch.linewidth": 1.25,
    }
)

## Load data

In [None]:
# Read the whole "jet_data" dataset into memory; the HDF5 file is closed again
# as soon as the with-block exits.
path = "/beegfs/desy/user/ewencedr/data/lhco/final_data/processed_data_background_rel.h5"
with h5py.File(path, mode="r") as h5_file:
    jets = h5_file["jet_data"][:]

## Calculate mjj from jet data

In [None]:
# Convert the jet kinematics to four-vectors with energyflow.
# NOTE(review): assumes the last axis of `jets` is ordered (pt, y, phi[, m])
# as `p4s_from_ptyphims` expects, and that axis 1 indexes the jets within an
# event — confirm against the preprocessing that wrote the input file.
p4_jets = ef.p4s_from_ptyphims(jets)

In [None]:
# Add the four-vectors stored at positions 0 and 1 along axis 1, then take the
# invariant mass of that sum: one dijet mass (mjj) value per row.
sum_p4 = np.add(p4_jets[:, 0], p4_jets[:, 1])
mjj = ef.ms_from_p4s(sum_p4)

In [None]:
# Mass window to cut out of the spectrum (same units as mjj). Both edges are
# inclusive, i.e. events with window_left <= mjj <= window_right are dropped.
window_left, window_right = 3.3e3, 3.7e3
# Despite its name, args_to_remove is a boolean mask (True = inside window),
# not an index array.
args_to_remove = np.logical_and(mjj >= window_left, mjj <= window_right)
mjj_cut = mjj[np.logical_not(args_to_remove)]

In [None]:
# Show the positions of the entries that fall inside the cut window.
removed_indices = np.argwhere(args_to_remove)
print(removed_indices)

In [None]:
# Sanity check of the mjj range: smallest value on the first line, largest on
# the second.
print(np.min(mjj), np.max(mjj), sep="\n")

In [None]:
# Overlay the full mjj spectrum and the spectrum with the window removed.
# Bin edges run from 1000 in steps of 100; np.arange excludes the stop value
# 9500.
mass_bins = np.arange(1e3, 9.5e3, 0.1e3)
hist = plt.hist(mjj, bins=mass_bins, histtype="stepfilled", label="mjj", alpha=0.5)
# hist[1] is the array of bin edges returned by plt.hist; reuse it so both
# histograms share identical binning.
plt.hist(mjj_cut, bins=hist[1], histtype="step", label="mjj with cut")
plt.legend()
plt.yscale("log")
plt.show()

## Estimate full mjj

In [None]:
# Fit a Gaussian KDE to the full mjj spectrum and draw as many samples as
# there are input values. KernelDensity expects a 2-D (n_samples, n_features)
# array, hence the reshape to a single column.
# NOTE(review): the bandwidth (1e-4) is many orders of magnitude below the
# scale of the data (window edges above are ~3e3), so sampling is presumably
# close to resampling the input with replacement — confirm this is intended.
mjj_column = np.reshape(mjj, (-1, 1))
kde_model_full = KernelDensity(kernel="gaussian", bandwidth=0.0001).fit(mjj_column)

samples_full = kde_model_full.sample(n_samples=len(mjj))

In [None]:
# Compare the true mjj spectrum with the samples drawn from the KDE fitted on
# it, using the same bin edges for both histograms.
mass_bins = np.arange(1e3, 9.5e3, 0.1e3)
hist = plt.hist(mjj, bins=mass_bins, histtype="stepfilled", label="Truth", alpha=0.5)
# hist[1] holds the bin edges returned by the first plt.hist call.
plt.hist(samples_full, bins=hist[1], histtype="step", label="KDE samples")
plt.xlabel("mjj [GeV]")
plt.legend(frameon=False)
plt.yscale("log")
plt.show()

## Estimate mjj with cut

In [None]:
# Same procedure as for the full spectrum, but fitted on the mjj values that
# survive the window cut; one sample is drawn per surviving value.
# NOTE(review): the bandwidth (1e-4) is tiny compared with the scale of the
# data, so the samples presumably reproduce the input almost exactly —
# confirm this is intended.
mjj_cut_column = np.reshape(mjj_cut, (-1, 1))
kde_model_cut = KernelDensity(kernel="gaussian", bandwidth=0.0001).fit(mjj_cut_column)

samples_cut = kde_model_cut.sample(n_samples=len(mjj_cut))

In [None]:
# Compare the cut mjj spectrum with the samples drawn from the KDE fitted on
# it, using the same bin edges for both histograms.
mass_bins = np.arange(1e3, 9.5e3, 0.1e3)
hist = plt.hist(mjj_cut, bins=mass_bins, histtype="stepfilled", label="Truth", alpha=0.5)
# hist[1] holds the bin edges returned by the first plt.hist call.
plt.hist(samples_cut, bins=hist[1], histtype="step", label="KDE samples")
plt.xlabel("mjj [GeV]")
plt.legend(frameon=False)
plt.yscale("log")
plt.show()

## Double Check

In [None]:
# Load the reference arrays used for the cross-check below. The files are read
# in the listed order and unpacked into one variable per file.
data_path = "/beegfs/desy/user/sommerhm/clean_notebook_cathode/input_data/"
(
    outerdata_train,
    outerdata_val,
    innerdata_train,
    innerdata_val,
    innerdata_test,
) = (
    np.load(join(data_path, filename))
    for filename in (
        "outerdata_train.npy",
        "outerdata_val.npy",
        "innerdata_train.npy",
        "innerdata_val.npy",
        "innerdata_test.npy",
    )
)

In [None]:
# Keep only the first column, as a 2-D (n, 1) slice rather than a flat vector.
# NOTE(review): presumably column 0 holds mjj, since it is plotted against
# mjj below — confirm against the producer of outerdata_train.npy.
m_train = outerdata_train[:, :1]

In [None]:
# Cross-check: overlay the cut mjj data, the KDE samples drawn from it and the
# first column of the external training set. The first two are truncated to at
# most len(m_train) entries, presumably so the raw counts are comparable.
hist = plt.hist(
    mjj_cut[: len(m_train)],
    bins=np.arange(1e3, 9.5e3, 0.1e3),
    histtype="stepfilled",
    label="Truth",
    alpha=0.5,
)
# hist[1] holds the bin edges; reuse them so all three histograms share bins.
plt.hist(samples_cut[: len(m_train)], bins=hist[1], histtype="step", label="KDE samples")
plt.hist(m_train, bins=hist[1], histtype="step", label="Double check")
# Fixed axis label: the original had a stray closing bracket ("mjj [GeV]]"),
# unlike the other mjj plots in this notebook.
plt.xlabel("mjj [GeV]")
plt.legend(frameon=False)
plt.yscale("log")
plt.show()

## Save data

In [None]:
# Write the true and generated mjj arrays plus the window mask to one HDF5
# file. Mode "w" creates the file, truncating it if it already exists.
# Note: samples_full / samples_cut come straight from KernelDensity.sample and
# are therefore 2-D (n_samples, 1); they are stored with that shape.
path = "/beegfs/desy/user/ewencedr/data/lhco/final_data/gen_mjj.h5"
datasets_to_save = {
    "mjj": mjj,
    "mjj_cut": mjj_cut,
    "gen_mjj": samples_full,
    "gen_mjj_cut": samples_cut,
    "args_to_remove": args_to_remove,
}
with h5py.File(path, mode="w") as out_file:
    for dataset_name, values in datasets_to_save.items():
        out_file.create_dataset(dataset_name, data=values)