# LHCO Cathode Generation Pipeline
After the particle-level models and the jet-feature models have been trained, the final step is to run the whole generation pipeline. This is the purpose of this notebook.

## Imports

In [None]:
import os
import sys

sys.path.append("../")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from os.path import join

import energyflow as ef
import h5py
import hydra
import numpy as np
import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf
from sklearn.neighbors import KernelDensity

In [None]:
# plots and metrics
import matplotlib.pyplot as plt

from src.data.components import (
    calculate_all_wasserstein_metrics,
    inverse_normalize_tensor,
    normalize_tensor,
)
from src.utils.data_generation import generate_data
from src.utils.plotting import apply_mpl_styles, plot_data, prepare_data_for_plotting

apply_mpl_styles()

In [None]:
# set env variable DATA_DIR again because of hydra
from dotenv import load_dotenv

load_dotenv()
# Re-export DATA_DIR so that it is present in the process environment.
# Fail early with a clear message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
data_dir_from_env = os.environ.get("DATA_DIR")
if data_dir_from_env is None:
    raise RuntimeError("DATA_DIR is not set; define it in the environment or in a .env file.")
os.environ["DATA_DIR"] = data_dir_from_env

In [None]:
# Root folder used to build every input/output path below (None if DATA_DIR is unset).
data_folder = os.environ.get("DATA_DIR")

# Generate mjj samples
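We fit a KDE to the mjj distribution of the signal and background samples and then sample from the KDE to generate new mjj samples in the signal region. Note: the cells below do not fit a KDE; they compute mjj directly from the stored background jets and build the sideband (SB) and signal-region (SR) selections.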

In [None]:
# Sample count; copied into n_samples_x / n_samples_y in the "Generate Data" section.
n_samples = 200_000

In [None]:
# Read the "jet_data" dataset of the background file fully into memory.
path = f"{data_folder}/lhco/final_data/processed_data_background_rel.h5"
with h5py.File(path, "r") as f:
    jets = f["jet_data"][:]

In [None]:
# Convert the jet features to four-momenta.
# assumes the last axis of jets is (pt, y, phi[, m]) as energyflow expects — TODO confirm
p4_jets = ef.p4s_from_ptyphims(jets)

In [None]:
# Dijet invariant mass: add the four-momenta of jets 0 and 1 of every event,
# then take the mass of the sum.
sum_p4 = p4_jets[:, 0] + p4_jets[:, 1]
mjj = ef.ms_from_p4s(sum_p4)

In [None]:
# Sideband selection: 2300 < mjj < 3300 or 3700 < mjj < 5000 (strict bounds).
args_to_keep = ((mjj > 2300) & (mjj < 3300)) | ((mjj < 5000) & (mjj > 3700))
# Signal-region selection: 3300 < mjj < 3700 (strict bounds).
args_to_keep_sr = (mjj < 3700) & (mjj > 3300)

In [None]:
# mjj values in the sidebands, in the signal region, and in the union of both.
mjj_sb = mjj[args_to_keep]
mjj_sr = mjj[args_to_keep_sr]
args_to_keep_sb_sr = args_to_keep | args_to_keep_sr
mjj_sb_sr = mjj[args_to_keep_sb_sr]

# Particle Feature Model

### Load Models

In [None]:
# Experiment configs for the two jets (x and y); passed to hydra as
# "experiment=" overrides in the following cells.
experiment_x = "/lhco/lhco_x_jet.yaml"
experiment_y = "/lhco/lhco_y_jet.yaml"

In [None]:
# load everything from experiment config
# Composes train.yaml with the x-jet experiment override.
with hydra.initialize(version_base=None, config_path="../configs/"):
    cfg_x = hydra.compose(config_name="train.yaml", overrides=[f"experiment={experiment_x}"])
    # print(OmegaConf.to_yaml(cfg_x))

In [None]:
# load everything from experiment config
# Composes train.yaml with the y-jet experiment override.
with hydra.initialize(version_base=None, config_path="../configs/"):
    cfg_y = hydra.compose(config_name="train.yaml", overrides=[f"experiment={experiment_y}"])
    # print(OmegaConf.to_yaml(cfg_y))

In [None]:
# Instantiate the datamodule and a fresh model for jet x from the composed config;
# the trained weights are loaded from a checkpoint further down.
datamodule_x = hydra.utils.instantiate(cfg_x.data)
model_x = hydra.utils.instantiate(cfg_x.model)

In [None]:
# Instantiate the datamodule and a fresh model for jet y from the composed config;
# the trained weights are loaded from a checkpoint further down.
datamodule_y = hydra.utils.instantiate(cfg_y.data)
model_y = hydra.utils.instantiate(cfg_y.model)

In [None]:
# Prepare the x-jet datasets.
# presumably populates the *_sr tensors, masks and normalisation statistics read below — verify
datamodule_x.setup()

In [None]:
# Prepare the y-jet datasets.
# presumably populates the *_sr tensors, masks and normalisation statistics read below — verify
datamodule_y.setup()

### Load checkpoint

In [None]:
# Checkpoint of the trained x-jet model (the path contains spaces on purpose).
ckpt_x = (
    "/beegfs/desy/user/ewencedr/deep-learning/logs/lhco x jet mass new cut"
    " interpolate/runs/2023-08-16_02-00-37/checkpoints/last-EMA.ckpt"
)
# load_from_checkpoint is a classmethod that returns a new model. Call it on the
# class: recent Lightning releases raise a TypeError when it is invoked on an
# instance, while older releases behave identically either way.
model_x = type(model_x).load_from_checkpoint(ckpt_x)

In [None]:
# Checkpoint of the trained y-jet model (the path contains spaces on purpose).
ckpt_y = (
    "/beegfs/desy/user/ewencedr/deep-learning/logs/lhco y jet mass new cut"
    " interpolate/runs/2023-08-16_03-49-00/checkpoints/epoch_2717_loss_16.28091-EMA.ckpt"
)
# load_from_checkpoint is a classmethod that returns a new model. Call it on the
# class: recent Lightning releases raise a TypeError when it is invoked on an
# instance, while older releases behave identically either way.
model_y = type(model_y).load_from_checkpoint(ckpt_y)

### Generate Data

In [None]:
# Per-jet sample counts, both equal to n_samples.
# NOTE(review): the generate_data calls shown in this section use len(mask_x) /
# len(mask_y) instead of these values.
n_samples_x = n_samples
n_samples_y = n_samples

In [None]:
# Sanity check: sizes of the signal-region train and test masks of jet x.
print(len(datamodule_x.mask_train_sr))
print(len(datamodule_x.mask_test_sr))

In [None]:
# Stack the signal-region conditioning and masks of jet x over all three splits,
# in the order train, val, test (concatenation along the first axis).
cond_x = torch.cat(
    [
        datamodule_x.tensor_conditioning_train_sr,
        datamodule_x.tensor_conditioning_val_sr,
        datamodule_x.tensor_conditioning_test_sr,
    ],
    dim=0,
)
mask_x = torch.cat(
    [datamodule_x.mask_train_sr, datamodule_x.mask_val_sr, datamodule_x.mask_test_sr],
    dim=0,
)

In [None]:
# Shape check of the stacked x-jet conditioning and mask.
print(cond_x.shape)
print(mask_x.shape)

In [None]:
# Seed torch's RNG, then generate x-jet constituents with the trained model:
# as many jets as there are mask rows, conditioned on cond_x.
# NOTE(review): data_x is re-read from path_x a few cells below, which replaces
# the array produced here.
torch.manual_seed(9999)
data_x, generation_time_x = generate_data(
    model_x,
    num_jet_samples=len(mask_x),
    batch_size=2048,
    cond=cond_x,
    variable_set_sizes=datamodule_x.hparams.variable_jet_sizes,
    mask=mask_x,
    normalized_data=datamodule_x.hparams.normalize,
    means=datamodule_x.means,
    stds=datamodule_x.stds,
    ode_solver="midpoint",
    ode_steps=100,
)

In [None]:
# File holding the generated x-jet sample (read back below).
path_x = f"{data_folder}/lhco/generated/x_data_datacond.h5"

In [None]:
# Number of signal-region events selected from the background file.
print(mjj_sr.shape)

In [None]:
# with h5py.File(path_x, "w") as f:
#    f.create_dataset("data", data=data_x)
#    f.create_dataset("generation_time", data=generation_time_x)
#    f.create_dataset("mask", data=mask_x)
#    f.create_dataset("cond", data=cond_x)
#    f.create_dataset("mjj", data=mjj_sr)

In [None]:
# Load the stored x-jet sample; this replaces any data_x produced in memory above.
with h5py.File(path_x, "r") as f:
    data_x = f["data"][:]

In [None]:
# Shape check of the loaded x-jet sample.
print(data_x.shape)

In [None]:
# Stack the signal-region conditioning and masks of jet y over all three splits,
# in the order train, val, test (concatenation along the first axis).
cond_y = torch.cat(
    [
        datamodule_y.tensor_conditioning_train_sr,
        datamodule_y.tensor_conditioning_val_sr,
        datamodule_y.tensor_conditioning_test_sr,
    ],
    dim=0,
)
mask_y = torch.cat(
    [datamodule_y.mask_train_sr, datamodule_y.mask_val_sr, datamodule_y.mask_test_sr],
    dim=0,
)

In [None]:
# Shape check of the stacked y-jet conditioning and mask.
print(cond_y.shape)
print(mask_y.shape)

In [None]:
# Seed torch's RNG, then generate y-jet constituents with the trained model:
# as many jets as there are mask rows, conditioned on cond_y.
# NOTE(review): data_y is re-read from path_y a few cells below, which replaces
# the array produced here.
torch.manual_seed(9999)
data_y, generation_time_y = generate_data(
    model_y,
    num_jet_samples=len(mask_y),
    batch_size=2048,
    cond=cond_y,
    variable_set_sizes=datamodule_y.hparams.variable_jet_sizes,
    mask=mask_y,
    normalized_data=datamodule_y.hparams.normalize,
    means=datamodule_y.means,
    stds=datamodule_y.stds,
    ode_solver="midpoint",
    ode_steps=100,
)

In [None]:
# File holding the generated y-jet sample (read back below).
path_y = f"{data_folder}/lhco/generated/y_data_datacond.h5"

In [None]:
# with h5py.File(path_y, "w") as f:
#    f.create_dataset("data", data=data_y)
#    f.create_dataset("generation_time", data=generation_time_y)
#    f.create_dataset("mask", data=mask_y)
#    f.create_dataset("cond", data=cond_y)
#    f.create_dataset("mjj", data=mjj_sr)

In [None]:
# Load the stored y-jet sample; this replaces any data_y produced in memory above.
with h5py.File(path_y, "r") as f:
    data_y = f["data"][:]

In [None]:
# Shape check of the loaded y-jet sample.
print(data_y.shape)

In [None]:
# Number of x-jet constituents whose feature 2 exceeds 1 (clipped in the cells below).
print(np.sum(data_x[:, :, 2] > 1))

In [None]:
# Smallest strictly positive value of feature 2 in the generated y-jet sample.
np.min(data_y[:, :, 2][data_y[:, :, 2] > 0.0])

Remove unphysical values: entries of feature 2 (the relative pT) that are below 0 or above 1 are clipped.

In [None]:
# Feature 2 is multiplied by the jet pT further down, i.e. it is the relative
# constituent pT. Values below 0 are replaced by the smallest positive training
# value, values above 1 by the largest training value below 1.
# The boolean mask is combined with the feature index so that only feature 2 is
# overwritten; indexing with the mask alone would also replace features 0 and 1
# of every affected constituent.
pt_train_x = datamodule_x.tensor_train.numpy()[:, :, 2]
data_x[data_x[:, :, 2] < 0, 2] = np.min(pt_train_x[pt_train_x > 0.0])
data_x[data_x[:, :, 2] > 1, 2] = np.max(pt_train_x[pt_train_x < 1.0])

In [None]:
# Feature 2 is multiplied by the jet pT further down, i.e. it is the relative
# constituent pT. Values below 0 are replaced by the smallest positive training
# value, values above 1 by the largest training value below 1.
# The boolean mask is combined with the feature index so that only feature 2 is
# overwritten; indexing with the mask alone would also replace features 0 and 1
# of every affected constituent.
pt_train_y = datamodule_y.tensor_train.numpy()[:, :, 2]
data_y[data_y[:, :, 2] < 0, 2] = np.min(pt_train_y[pt_train_y > 0.0])
data_y[data_y[:, :, 2] > 1, 2] = np.max(pt_train_y[pt_train_y < 1.0])

In [None]:
# After clipping, count the x-jet entries of feature 2 that are still below 0 or above 1.
print(data_x.shape)
print(np.sum(data_x[:, :, 2] < 0))
print(np.sum(data_x[:, :, 2] > 1))

### Save generated data

In [None]:
# File with the relative-coordinate samples of both jets (read back below).
path_final_rel = f"{data_folder}/lhco/generated/final_rel_data_datacond.h5"

In [None]:
# Shape check of both generated samples before they are stored.
print(data_x.shape)
print(data_y.shape)
# print(data_jet_feature[:, :5].shape)
# print(data_jet_feature[:, 5:].shape)
# print(mjj_samples_sr.shape)

In [None]:
# with h5py.File(path_final_rel, "w") as f:
#    f.create_dataset("particle_data_rel_x", data=data_x)
#    f.create_dataset("particle_data_rel_y", data=data_y)
#    f.create_dataset("jet_features_x", data=data_jet_feature[:, :5])
#    f.create_dataset("jet_features_y", data=data_jet_feature[:, 5:])
#    f.create_dataset("mjj", data=mjj_samples_sr)

In [None]:
# Load the stored relative-coordinate constituents of both jets.
# NOTE(review): this rebinds data_x / data_y, so any in-memory changes made above
# (e.g. the clipping) are only kept if the file was rewritten beforehand.
with h5py.File(path_final_rel, "r") as f:
    data_x = f["particle_data_rel_x"][:]
    data_y = f["particle_data_rel_y"][:]

### Plots

In [None]:
# Reference sample for jet x: signal-region test-split tensor, truncated to at
# most len(data_x) rows and converted to a NumPy array.
background_data_x = np.array(datamodule_x.tensor_test_sr[: len(data_x)])

In [None]:
# Reference sample for jet y: signal-region test-split tensor, truncated to at
# most len(data_y) rows and converted to a NumPy array.
background_data_y = np.array(datamodule_y.tensor_test_sr[: len(data_y)])

In [None]:
# Compare the sizes of the generated and the reference x-jet samples.
print(data_x.shape)
print(background_data_x.shape)

In [None]:
# Options for plot_data when comparing x jets in relative coordinates.
plot_config_x = dict(
    num_samples=min(len(data_x), len(background_data_x)),
    plot_jet_features=False,
    plot_w_dists=False,
    plot_efps=True,
    plot_selected_multiplicities=False,
    selected_multiplicities=[10, 20, 30, 40, 50, 100],
    selected_particles=[1, 5, 20],
    plottype="",
    save_fig=False,
    variable_jet_sizes_plotting=True,
    bins=100,
    close_fig=False,
)
# prepare_data_for_plotting receives only three of these options, with
# "plot_efps" forwarded under the name "calculate_efps".
plot_prep_config_x = {
    "calculate_efps": plot_config_x["plot_efps"],
    "selected_multiplicities": plot_config_x["selected_multiplicities"],
    "selected_particles": plot_config_x["selected_particles"],
}

In [None]:
# Options for plot_data when comparing y jets in relative coordinates.
plot_config_y = dict(
    num_samples=min(len(data_y), len(background_data_y)),
    plot_jet_features=False,
    plot_w_dists=False,
    plot_efps=True,
    plot_selected_multiplicities=False,
    selected_multiplicities=[10, 20, 30, 40, 50, 100],
    selected_particles=[1, 5, 20],
    plottype="",
    save_fig=False,
    variable_jet_sizes_plotting=True,
    bins=100,
    close_fig=False,
)
# prepare_data_for_plotting receives only three of these options, with
# "plot_efps" forwarded under the name "calculate_efps".
plot_prep_config_y = {
    "calculate_efps": plot_config_y["plot_efps"],
    "selected_multiplicities": plot_config_y["selected_multiplicities"],
    "selected_particles": plot_config_y["selected_particles"],
}

In [None]:
# Derive the plotting inputs for the generated x jets. The sample is truncated to
# the reference sample size and wrapped in an extra leading axis.
(
    jet_data_x,
    efps_values_x,
    pt_selected_particles_x,
    pt_selected_multiplicities_x,
) = prepare_data_for_plotting(
    np.array([data_x[: len(background_data_x)]]),
    **plot_prep_config_x,
)

In [None]:
# Derive the plotting inputs for the generated y jets. The sample is truncated to
# the reference sample size and wrapped in an extra leading axis.
(
    jet_data_y,
    efps_values_y,
    pt_selected_particles_y,
    pt_selected_multiplicities_y,
) = prepare_data_for_plotting(
    np.array([data_y[: len(background_data_y)]]),
    **plot_prep_config_y,
)

In [None]:
# Same preparation for the x-jet reference sample, passed as a one-element list.
(
    jet_data_sim_x,
    efps_sim_x,
    pt_selected_particles_sim_x,
    pt_selected_multiplicities_sim_x,
) = prepare_data_for_plotting(
    [background_data_x],
    **plot_prep_config_x,
)
# Keep only element 0 of the first three outputs (the single dataset passed in);
# pt_selected_multiplicities_sim_x is left as returned.
jet_data_sim_x, efps_sim_x, pt_selected_particles_sim_x = (
    jet_data_sim_x[0],
    efps_sim_x[0],
    pt_selected_particles_sim_x[0],
)

In [None]:
# Same preparation for the y-jet reference sample, passed as a one-element list.
(
    jet_data_sim_y,
    efps_sim_y,
    pt_selected_particles_sim_y,
    pt_selected_multiplicities_sim_y,
) = prepare_data_for_plotting(
    [background_data_y],
    **plot_prep_config_y,
)
# Keep only element 0 of the first three outputs (the single dataset passed in);
# pt_selected_multiplicities_sim_y is left as returned.
jet_data_sim_y, efps_sim_y, pt_selected_particles_sim_y = (
    jet_data_sim_y[0],
    efps_sim_y[0],
    pt_selected_particles_sim_y[0],
)

In [None]:
# Shape of the reference EFP array, as returned and after joining along its first axis.
print(efps_sim_x.shape)
print(np.concatenate(efps_sim_x).shape)

In [None]:
# Draw the generated-vs-reference comparison for jet x in relative coordinates.
fig_x = plot_data(
    particle_data=np.array([data_x[: len(background_data_x)]]),
    sim_data=background_data_x,
    jet_data_sim=jet_data_sim_x,
    jet_data=jet_data_x,
    efps_sim=efps_sim_x,
    efps_values=efps_values_x,
    pt_selected_particles=pt_selected_particles_x,
    pt_selected_multiplicities=pt_selected_multiplicities_x,
    pt_selected_particles_sim=pt_selected_particles_sim_x,
    pt_selected_multiplicities_sim=pt_selected_multiplicities_sim_x,
    **plot_config_x,
)

In [None]:
# Draw the generated-vs-reference comparison for jet y in relative coordinates.
fig_y = plot_data(
    particle_data=np.array([data_y[: len(background_data_y)]]),
    sim_data=background_data_y,
    jet_data_sim=jet_data_sim_y,
    jet_data=jet_data_y,
    efps_sim=efps_sim_y,
    efps_values=efps_values_y,
    pt_selected_particles=pt_selected_particles_y,
    pt_selected_multiplicities=pt_selected_multiplicities_y,
    pt_selected_particles_sim=pt_selected_particles_sim_y,
    pt_selected_multiplicities_sim=pt_selected_multiplicities_sim_y,
    **plot_config_y,
)

### Back to non-rel. Coordinates

The conditioning (jet features) is used to convert the generated particles from relative back to non-relative coordinates.

In [None]:
# Inspect the conditioning means that are passed to inverse_normalize_tensor below.
print(datamodule_x.cond_means)

In [None]:
# Undo the normalisation of the conditioning of both jets and convert the result
# to NumPy. The helper receives a clone of each tensor; cond_x / cond_y are
# rebound to the de-normalised arrays.
cond_x = inverse_normalize_tensor(
    cond_x.clone(),
    datamodule_x.cond_means,
    datamodule_x.cond_stds,
    datamodule_x.hparams.normalize_sigma,
).numpy()
cond_y = inverse_normalize_tensor(
    cond_y.clone(),
    datamodule_y.cond_means,
    datamodule_y.cond_stds,
    datamodule_y.hparams.normalize_sigma,
).numpy()

In [None]:
# Copy into plain NumPy arrays (the inputs already are NumPy arrays at this point).
cond_x = np.array(cond_x)
cond_y = np.array(cond_y)

In [None]:
# Shape check of the de-normalised x-jet conditioning.
print(cond_x.shape)

In [None]:
# Conditioning columns 0-3 of jet x as (N, 1) column vectors so that they
# broadcast against the per-constituent arrays below.
pt_x = cond_x[:, 0].reshape(-1, 1)
eta_x = cond_x[:, 1].reshape(-1, 1)
phi_x = cond_x[:, 2].reshape(-1, 1)
m_x = cond_x[:, 3].reshape(-1, 1)

In [None]:
# Conditioning columns 0-3 of jet y as (N, 1) column vectors so that they
# broadcast against the per-constituent arrays below.
pt_y = cond_y[:, 0].reshape(-1, 1)
eta_y = cond_y[:, 1].reshape(-1, 1)
phi_y = cond_y[:, 2].reshape(-1, 1)
m_y = cond_y[:, 3].reshape(-1, 1)

In [None]:
# Shape check of the jet-eta column vector.
print(eta_x.shape)

In [None]:
# Rebuild (eta, phi, pt) of the x-jet constituents from the relative features
# (index 0, 1, 2) and the jet-level eta / phi / pt column vectors.
# Constituents whose feature 2 is not positive are treated as padding.
mask_x_nonrel = (data_x[..., 2] > 0).astype(int)[..., np.newaxis]
non_rel_eta_x = (data_x[:, :, 0] + eta_x)[..., np.newaxis]
non_rel_phi_x = (data_x[:, :, 1] + phi_x)[..., np.newaxis]
# wrap phi between -pi and pi: one shift by 2*pi downwards, then one upwards
non_rel_phi_x = np.where(non_rel_phi_x > np.pi, non_rel_phi_x - 2 * np.pi, non_rel_phi_x)
non_rel_phi_x = np.where(non_rel_phi_x < -np.pi, non_rel_phi_x + 2 * np.pi, non_rel_phi_x)
non_rel_pt_x = (data_x[:, :, 2] * pt_x)[..., np.newaxis]
# fix the masking: eta and phi of padded constituents are set back to zero
# (written out-of-place on purpose, so the result dtype follows the int mask product)
non_rel_eta_x = mask_x_nonrel * non_rel_eta_x
non_rel_phi_x = mask_x_nonrel * non_rel_phi_x
data_x_nonrel = np.concatenate((non_rel_eta_x, non_rel_phi_x, non_rel_pt_x), axis=-1)

In [None]:
# Rebuild (eta, phi, pt) of the y-jet constituents from the relative features
# (index 0, 1, 2) and the jet-level eta / phi / pt column vectors.
# Constituents whose feature 2 is not positive are treated as padding.
mask_y_nonrel = (data_y[..., 2] > 0).astype(int)[..., np.newaxis]
non_rel_eta_y = (data_y[:, :, 0] + eta_y)[..., np.newaxis]
non_rel_phi_y = (data_y[:, :, 1] + phi_y)[..., np.newaxis]
# wrap phi between -pi and pi: one shift by 2*pi downwards, then one upwards
non_rel_phi_y = np.where(non_rel_phi_y > np.pi, non_rel_phi_y - 2 * np.pi, non_rel_phi_y)
non_rel_phi_y = np.where(non_rel_phi_y < -np.pi, non_rel_phi_y + 2 * np.pi, non_rel_phi_y)
non_rel_pt_y = (data_y[:, :, 2] * pt_y)[..., np.newaxis]
# fix the masking: eta and phi of padded constituents are set back to zero
# (written out-of-place on purpose, so the result dtype follows the int mask product)
non_rel_eta_y = mask_y_nonrel * non_rel_eta_y
non_rel_phi_y = mask_y_nonrel * non_rel_phi_y
data_y_nonrel = np.concatenate((non_rel_eta_y, non_rel_phi_y, non_rel_pt_y), axis=-1)

Load the non-relative data and select the same events that were generated.

In [None]:
# Read the "constituents" dataset of the non-relative background file into memory.
path_nonrel = f"{data_folder}/lhco/final_data/processed_data_background.h5"
with h5py.File(path_nonrel, "r") as f:
    particle_data_nonrel = f["constituents"][:]

In [None]:
# Keep the signal-region events only.
# assumes this file has the same event order as the file mjj was computed from — TODO confirm
particle_data_nonrel_sr = particle_data_nonrel[args_to_keep_sr]

In [None]:
# Sizes of the validation and test pieces, from the x datamodule's val/test
# fractions (truncated to an integer).
n_samples_val_sr_x = int(datamodule_x.hparams.val_fraction * len(particle_data_nonrel_sr))
n_samples_test_sr_x = int(datamodule_x.hparams.test_fraction * len(particle_data_nonrel_sr))

In [None]:
# Split the signal-region events by position into train / val / test pieces;
# the val and test pieces are taken from the end of the array.
# NOTE(review): the cells shown below use the full particle_data_nonrel_sr and
# not these pieces.
particle_data_nonrel_sr_train, particle_data_nonrel_sr_val, particle_data_nonrel_sr_test = (
    np.split(
        particle_data_nonrel_sr,
        [
            len(particle_data_nonrel_sr) - (n_samples_val_sr_x + n_samples_test_sr_x),
            len(particle_data_nonrel_sr) - n_samples_test_sr_x,
        ],
    )
)

In [None]:
# Reference constituents of jet 0 for all signal-region events, with the feature
# axis reordered by [1, 2, 0].
# presumably this turns (pt, eta, phi) into the (eta, phi, pt) layout of the generated arrays — verify
# Alternative kept from the original cell (test split only):
# particle_data_nonrel_sr_test[: len(data_x_nonrel), 0]
background_data_x_nonrel = particle_data_nonrel_sr[:, 0][:, :, [1, 2, 0]]

In [None]:
# Reference constituents of jet 1 for all signal-region events, with the feature
# axis reordered by [1, 2, 0].
# presumably this turns (pt, eta, phi) into the (eta, phi, pt) layout of the generated arrays — verify
# Alternative kept from the original cell (test split only):
# particle_data_nonrel_sr_test[: len(data_y_nonrel), 1]
background_data_y_nonrel = particle_data_nonrel_sr[:, 1][:, :, [1, 2, 0]]

In [None]:
# Options for plot_data when comparing x jets in non-relative coordinates.
plot_config_x_nonrel = dict(
    num_samples=-1,
    plot_jet_features=True,
    plot_w_dists=False,
    plot_efps=True,
    plot_selected_multiplicities=False,
    selected_multiplicities=[10, 20, 30, 40, 50, 100],
    selected_particles=[1, 3, 10],
    plottype="",
    save_fig=False,
    variable_jet_sizes_plotting=True,
    bins=100,
    close_fig=False,
    mass_linear=True,
    plot_xlabels=[
        r"Particle $p_\mathrm{T}$",
        r"Particle $\eta$",
        r"Particle $\phi$",
        r"Jet $p_\mathrm{T}$",
    ],
)
# prepare_data_for_plotting receives only three of these options, with
# "plot_efps" forwarded under the name "calculate_efps".
plot_prep_config_x_nonrel = {
    "calculate_efps": plot_config_x_nonrel["plot_efps"],
    "selected_multiplicities": plot_config_x_nonrel["selected_multiplicities"],
    "selected_particles": plot_config_x_nonrel["selected_particles"],
}

In [None]:
# Options for plot_data when comparing y jets in non-relative coordinates.
plot_config_y_nonrel = dict(
    num_samples=-1,
    plot_jet_features=True,
    plot_w_dists=False,
    plot_efps=True,
    plot_selected_multiplicities=False,
    selected_multiplicities=[10, 20, 30, 40, 50, 100],
    selected_particles=[1, 3, 10],
    plottype="",
    save_fig=False,
    variable_jet_sizes_plotting=True,
    bins=100,
    close_fig=False,
    mass_linear=True,
    plot_xlabels=[
        r"Particle $p_\mathrm{T}$",
        r"Particle $\eta$",
        r"Particle $\phi$",
        r"Jet $p_\mathrm{T}$",
    ],
)
# prepare_data_for_plotting receives only three of these options, with
# "plot_efps" forwarded under the name "calculate_efps".
plot_prep_config_y_nonrel = {
    "calculate_efps": plot_config_y_nonrel["plot_efps"],
    "selected_multiplicities": plot_config_y_nonrel["selected_multiplicities"],
    "selected_particles": plot_config_y_nonrel["selected_particles"],
}

In [None]:
# Derive the plotting inputs for the generated x jets in non-relative coordinates.
# The sample is truncated to the reference size and wrapped in an extra leading axis.
(
    jet_data_x_nonrel,
    efps_values_x_nonrel,
    pt_selected_particles_x_nonrel,
    pt_selected_multiplicities_x_nonrel,
) = prepare_data_for_plotting(
    np.array([data_x_nonrel[: len(background_data_x_nonrel)]]),
    **plot_prep_config_x_nonrel,
)

In [None]:
# Derive the plotting inputs for the generated y jets in non-relative coordinates.
# The sample is truncated to the reference size and wrapped in an extra leading axis.
(
    jet_data_y_nonrel,
    efps_values_y_nonrel,
    pt_selected_particles_y_nonrel,
    pt_selected_multiplicities_y_nonrel,
) = prepare_data_for_plotting(
    np.array([data_y_nonrel[: len(background_data_y_nonrel)]]),
    **plot_prep_config_y_nonrel,
)

In [None]:
# Same preparation for the non-relative x-jet reference sample, passed as a
# one-element list.
(
    jet_data_sim_x_nonrel,
    efps_sim_x_nonrel,
    pt_selected_particles_sim_x_nonrel,
    pt_selected_multiplicities_sim_x_nonrel,
) = prepare_data_for_plotting(
    [background_data_x_nonrel],
    **plot_prep_config_x_nonrel,
)
# Keep only element 0 of the first three outputs (the single dataset passed in).
jet_data_sim_x_nonrel, efps_sim_x_nonrel, pt_selected_particles_sim_x_nonrel = (
    jet_data_sim_x_nonrel[0],
    efps_sim_x_nonrel[0],
    pt_selected_particles_sim_x_nonrel[0],
)

In [None]:
# Same preparation for the non-relative y-jet reference sample, passed as a
# one-element list.
(
    jet_data_sim_y_nonrel,
    efps_sim_y_nonrel,
    pt_selected_particles_sim_y_nonrel,
    pt_selected_multiplicities_sim_y_nonrel,
) = prepare_data_for_plotting(
    [background_data_y_nonrel],
    **plot_prep_config_y_nonrel,
)
# Keep only element 0 of the first three outputs (the single dataset passed in).
jet_data_sim_y_nonrel, efps_sim_y_nonrel, pt_selected_particles_sim_y_nonrel = (
    jet_data_sim_y_nonrel[0],
    efps_sim_y_nonrel[0],
    pt_selected_particles_sim_y_nonrel[0],
)

In [None]:
# Draw the generated-vs-reference comparison for jet x in non-relative coordinates.
fig_x_nonrel = plot_data(
    particle_data=np.array([data_x_nonrel[: len(background_data_x_nonrel)]]),
    sim_data=background_data_x_nonrel,
    jet_data_sim=jet_data_sim_x_nonrel,
    jet_data=jet_data_x_nonrel,
    efps_sim=efps_sim_x_nonrel,
    efps_values=efps_values_x_nonrel,
    pt_selected_particles=pt_selected_particles_x_nonrel,
    pt_selected_multiplicities=pt_selected_multiplicities_x_nonrel,
    pt_selected_particles_sim=pt_selected_particles_sim_x_nonrel,
    pt_selected_multiplicities_sim=pt_selected_multiplicities_sim_x_nonrel,
    **plot_config_x_nonrel,
)

In [None]:
# Draw the generated-vs-reference comparison for jet y in non-relative coordinates.
fig_y_nonrel = plot_data(
    particle_data=np.array([data_y_nonrel[: len(background_data_y_nonrel)]]),
    sim_data=background_data_y_nonrel,
    jet_data_sim=jet_data_sim_y_nonrel,
    jet_data=jet_data_y_nonrel,
    efps_sim=efps_sim_y_nonrel,
    efps_values=efps_values_y_nonrel,
    pt_selected_particles=pt_selected_particles_y_nonrel,
    pt_selected_multiplicities=pt_selected_multiplicities_y_nonrel,
    pt_selected_particles_sim=pt_selected_particles_sim_y_nonrel,
    pt_selected_multiplicities_sim=pt_selected_multiplicities_sim_y_nonrel,
    **plot_config_y_nonrel,
)

# Save generated data to file

In [None]:
# Output file that collects all generated arrays (written and re-read below).
path_final_all = f"{data_folder}/lhco/generated/final_data_all_datacond.h5"

In [None]:
# Shape check of every array that is written to path_final_all.
print(data_x.shape)
print(data_y.shape)
print(data_x_nonrel.shape)
print(data_y_nonrel.shape)
# print(data_jet_feature[:, :5].shape)
# print(data_jet_feature[:, 5:].shape)
print(cond_x.shape)
print(cond_y.shape)
print(mjj_sr.shape)

In [None]:
# Write all results into one file (an existing file is overwritten).
# The constituent arrays are reordered with [2, 0, 1], i.e. from this notebook's
# (eta, phi, pt) layout to (pt, eta, phi).
# NOTE(review): "mjj" comes from the background file selection, while the jet
# features come from the datamodule splits — confirm that both have the same
# event order.
with h5py.File(path_final_all, "w") as f:
    f.create_dataset("particle_data_rel_x", data=data_x[:, :, [2, 0, 1]])
    f.create_dataset("particle_data_rel_y", data=data_y[:, :, [2, 0, 1]])
    f.create_dataset("particle_data_nonrel_x", data=data_x_nonrel[:, :, [2, 0, 1]])
    f.create_dataset("particle_data_nonrel_y", data=data_y_nonrel[:, :, [2, 0, 1]])
    f.create_dataset("jet_features_x", data=cond_x)
    f.create_dataset("jet_features_y", data=cond_y)
    f.create_dataset("mjj", data=mjj_sr)

## Combine both jets

In [None]:
# Reorder the feature axis from (eta, phi, pt) to (pt, eta, phi) for both jets.
# The y array must be built from data_y_nonrel; using data_x_nonrel here would
# duplicate jet x in the combined events.
data_x_nonrel_ptetaphi = data_x_nonrel[:, :, [2, 0, 1]]
data_y_nonrel_ptetaphi = data_y_nonrel[:, :, [2, 0, 1]]

In [None]:
# Join the constituents of both jets along axis 1 (the constituent axis).
data_combined = np.concatenate((data_x_nonrel_ptetaphi, data_y_nonrel_ptetaphi), axis=1)
print(data_combined.shape)

In [None]:
# Zero-pad axis 1 at its end so that every event has 700 constituent slots;
# the other two axes are left unchanged.
padded_data = np.pad(
    data_combined,
    pad_width=[(0, 0), (0, 700 - data_combined.shape[1]), (0, 0)],
    mode="constant",
    constant_values=0,
)
print(padded_data.shape)

In [None]:
# Flatten each event into a single row: (events, constituents * features).
final_data = padded_data.reshape(padded_data.shape[0], -1)
print(final_data.shape)

In [None]:
import pandas as pd

In [None]:
# Store the flattened events as a DataFrame under key "df" (an existing file is overwritten).
final_path = f"{data_folder}/lhco/generated/final_data.h5"
df_final = pd.DataFrame(data=final_data)
df_final.to_hdf(final_path, key="df", mode="w")

In [None]:
# Events file that is loaded in the next cell.
filepath = f"{data_folder}/lhco/events_anomalydetection_v2.h5"

In [None]:
# Load everything into memory
df = pd.read_hdf(filepath)
print(df.shape)
# Total size of all columns (plus index) in bytes, converted to GiB.
print("Memory in GB:", df.memory_usage(deep=True).sum() / (1024**3))

# Evaluation

In [None]:
# Read back every dataset written to path_final_all for the evaluation below.
with h5py.File(path_final_all, "r") as f:
    eval_x_rel = f["particle_data_rel_x"][:]
    eval_y_rel = f["particle_data_rel_y"][:]
    eval_x_nonrel = f["particle_data_nonrel_x"][:]
    eval_y_nonrel = f["particle_data_nonrel_y"][:]
    eval_jet_features_x = f["jet_features_x"][:]
    eval_jet_features_y = f["jet_features_y"][:]
    eval_mjj = f["mjj"][:]

In [None]:
def get_jet_data(consts: np.ndarray) -> np.ndarray:
    """Compute jet-level kinematics by summing the constituent four-momenta.

    Args:
        consts (np.ndarray): constituent data. Only the first three entries of
            the last axis are used, read as (pt, y, phi).

    Returns:
        np.ndarray: jet data (pt, y, phi, m) of the summed four-momentum,
            obtained with phi_ref=0.
    """
    # Only (pt, y, phi) is passed on, i.e. no per-constituent mass entry.
    constituent_p4s = ef.p4s_from_ptyphims(consts[..., :3])
    # The second-to-last axis enumerates the constituents of one jet.
    jet_p4 = np.sum(constituent_p4s, axis=-2)
    return ef.ptyphims_from_p4s(jet_p4, phi_ref=0)

In [None]:
# The constituent datasets in path_final_all are written as data[:, :, [2, 0, 1]],
# i.e. they are already stored in (pt, eta, phi) order, which is the layout
# get_jet_data expects. Applying [2, 0, 1] a second time would hand it
# (phi, pt, eta), so the arrays are used exactly as loaded.
print(eval_x_rel.shape)

In [None]:
# Jet-level (pt, y, phi, m) recomputed from the constituents of each stored sample.
eval_jet_data_x = get_jet_data(eval_x_rel)
eval_jet_data_y = get_jet_data(eval_y_rel)
eval_jet_data_x_nonrel = get_jet_data(eval_x_nonrel)
eval_jet_data_y_nonrel = get_jet_data(eval_y_nonrel)

In [None]:
# Compare the lengths of the recomputed jet data and the stored jet features (last column of each).
print(eval_jet_data_x[:, -1].shape)
print(eval_jet_features_x[:, -1].shape)

In [None]:
# Number of conditioning means held by the x datamodule.
datamodule_x.cond_means.shape

In [None]:
# normalize conditioning variables
# Only the first four jet-feature columns are normalised, using the x datamodule's
# conditioning statistics; the result is converted back to NumPy.
eval_normalized_cond_x = normalize_tensor(
    torch.tensor(eval_jet_features_x[:, :4], dtype=torch.float32).clone(),
    datamodule_x.cond_means,
    datamodule_x.cond_stds,
    datamodule_x.hparams.normalize_sigma,
).numpy()

In [None]:
# Shape check of the normalised conditioning.
eval_normalized_cond_x.shape

In [None]:
# 2D histograms of the normalised conditioning (x axis) against the jet data
# recomputed from the relative constituents (y axis), one panel per feature.
labels = ["pt", "eta", "phi", "mass"]
fig, axs = plt.subplots(1, 4, figsize=(20, 5))
for i, ax in enumerate(axs):
    # Both axes use the value range of the conditioning column.
    max_v = np.max(eval_normalized_cond_x[:, i])
    min_v = np.min(eval_normalized_cond_x[:, i])

    ax.hist2d(
        eval_normalized_cond_x[:, i],
        eval_jet_data_x[:, i],
        bins=50,
        range=[[min_v, max_v], [min_v, max_v]],
        cmap="jet",
    )
    ax.set(
        xlabel=f"{labels[i]} conditioning",
        ylabel=f"{labels[i]} generated",
        title="higher mass jet rel",
    )
plt.tight_layout()
plt.plot()

In [None]:
# Reference plot: jet-feature column -2 (labelled as the jet mass) of jet x
# histogrammed against itself, so every entry falls on the diagonal.
max_v = np.max(eval_jet_features_x[:, -2])
min_v = np.min(eval_jet_features_x[:, -2])

plt.hist2d(
    eval_jet_features_x[:, -2],
    eval_jet_features_x[:, -2],
    bins=100,
    range=[[min_v, max_v], [min_v, max_v]],
    cmap="jet",
)
plt.xlabel("Jet mass conditioning")
plt.ylabel("Jet mass generated")
plt.title("ref.")
plt.plot()

In [None]:
# Jet-feature column -2 of jet x (x axis) against the mass recomputed from the
# relative constituents (y axis). Both axes use the range of the x-axis data.
max_v = np.max(eval_jet_features_x[:, -2])
min_v = np.min(eval_jet_features_x[:, -2])

plt.hist2d(
    eval_jet_features_x[:, -2],
    eval_jet_data_x[:, -1],
    bins=100,
    range=[[min_v, max_v], [min_v, max_v]],
    cmap="jet",
)
plt.xlabel("Jet mass conditioning")
plt.ylabel("Jet mass generated")
plt.title("rel.")
plt.plot()

In [None]:
# 2D histograms of the stored jet features of jet x (x axis) against the jet data
# recomputed from the non-relative constituents (y axis), one panel per feature.
labels = ["pt", "eta", "phi", "mass"]
fig, axs = plt.subplots(1, 4, figsize=(20, 5))
for i, ax in enumerate(axs):
    # Both axes use the value range of the jet-feature column.
    max_v = np.max(eval_jet_features_x[:, i])
    min_v = np.min(eval_jet_features_x[:, i])

    ax.hist2d(
        eval_jet_features_x[:, i],
        eval_jet_data_x_nonrel[:, i],
        bins=50,
        range=[[min_v, max_v], [min_v, max_v]],
        cmap="jet",
    )
    ax.set(
        xlabel=f"{labels[i]} conditioning",
        ylabel=f"{labels[i]} generated",
        title="higher mass jet",
    )
plt.tight_layout()
plt.plot()

In [None]:
# Reference plot: jet-feature column -2 (labelled as the jet mass) of jet y
# histogrammed against itself, so every entry falls on the diagonal.
max_v = np.max(eval_jet_features_y[:, -2])
min_v = np.min(eval_jet_features_y[:, -2])

plt.hist2d(
    eval_jet_features_y[:, -2],
    eval_jet_features_y[:, -2],
    bins=100,
    range=[[min_v, max_v], [min_v, max_v]],
    cmap="jet",
)
plt.xlabel("Jet mass conditioning")
plt.ylabel("Jet mass generated")
plt.title("ref. y")
plt.plot()

In [None]:
# Jet-feature column -2 of jet y (x axis) against the mass recomputed from the
# relative constituents (y axis). Both axes use the range of the x-axis data.
max_v = np.max(eval_jet_features_y[:, -2])
min_v = np.min(eval_jet_features_y[:, -2])

plt.hist2d(
    eval_jet_features_y[:, -2],
    eval_jet_data_y[:, -1],
    bins=100,
    range=[[min_v, max_v], [min_v, max_v]],
    cmap="jet",
)
plt.xlabel("Jet mass conditioning")
plt.ylabel("Jet mass generated")
plt.title("rel.")
plt.plot()

In [None]:
# 2D histograms of the stored jet features of jet y (x axis) against the jet data
# recomputed from the non-relative constituents (y axis), one panel per feature.
labels = ["pt", "eta", "phi", "mass"]
fig, axs = plt.subplots(1, 4, figsize=(20, 5))
for i, ax in enumerate(axs):
    # Both axes use the value range of the jet-feature column.
    max_v = np.max(eval_jet_features_y[:, i])
    min_v = np.min(eval_jet_features_y[:, i])

    ax.hist2d(
        eval_jet_features_y[:, i],
        eval_jet_data_y_nonrel[:, i],
        bins=50,
        range=[[min_v, max_v], [min_v, max_v]],
        cmap="jet",
    )
    ax.set(
        xlabel=f"{labels[i]} conditioning",
        ylabel=f"{labels[i]} generated",
        title="lighter mass jet",
    )
plt.tight_layout()
plt.plot()