In [None]:
import os
import sys

# Add the parent directory to the import path, presumably so the `src` package
# imported in later cells resolves when the notebook runs from its own folder.
sys.path.append("../")

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# autoreload mode 2: re-import edited modules before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
import h5py
import hydra
import numpy as np
import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf

In [None]:
# Load variables from a local .env file and re-export DATA_DIR into the process
# environment (kept from the original: "set again because of hydra").
from dotenv import load_dotenv

load_dotenv()
# os.environ only accepts str values; assigning the None returned by
# os.environ.get() for a missing key fails with an opaque TypeError, so the
# missing-variable case is reported explicitly instead.
if "DATA_DIR" not in os.environ:
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a .env file "
        "before running this notebook."
    )
os.environ["DATA_DIR"] = os.environ["DATA_DIR"]

In [None]:
# plots and metrics: matplotlib plus the project's metric, generation and plotting helpers
import matplotlib.pyplot as plt

from src.data.components import calculate_all_wasserstein_metrics
from src.utils.data_generation import generate_data
from src.utils.plotting import apply_mpl_styles, create_and_plot_data, plot_single_jets

# Apply the project's matplotlib style settings before any figure is created.
apply_mpl_styles()

In [None]:
# Hydra experiment config names. experiment1 and experiment2 are composed into
# cfg1/cfg2 below; experiment3 is not composed in the cells visible here.
experiment1 = "fm_tops150.yaml"
experiment2 = "fm_tops150_cond.yaml"
experiment3 = "diffusion_tops150_cond.yaml"

In [None]:
# Compose the training config with the first experiment applied as an override.
with hydra.initialize(version_base=None, config_path="../configs/"):
    cfg1 = hydra.compose(
        config_name="train.yaml",
        overrides=["experiment=" + experiment1],
    )
    # print(OmegaConf.to_yaml(cfg1))

In [None]:
# Compose the training config with the second experiment applied as an override.
with hydra.initialize(version_base=None, config_path="../configs/"):
    cfg2 = hydra.compose(
        config_name="train.yaml",
        overrides=["experiment=" + experiment2],
    )
    # print(OmegaConf.to_yaml(cfg2))

In [None]:
# Build the datamodules first and the models second (same order as before),
# each from the `data` / `model` node of its composed config.
datamodule1, datamodule2 = (hydra.utils.instantiate(cfg.data) for cfg in (cfg1, cfg2))
model1, model2 = (hydra.utils.instantiate(cfg.model) for cfg in (cfg1, cfg2))

In [None]:
# NOTE(review): not referenced in the cells visible here, and the name says
# "tops30" while the experiments above use the tops150 configs -- confirm.
model_name_for_saving = "nb_fm_tops30"

In [None]:
# Run each datamodule's setup step; the tensor_*/mask_*/means/stds attributes
# are only read in the cells after this call.
datamodule1.setup()
datamodule2.setup()

In [None]:
# Copy every split of datamodule1 (features, mask, conditioning) into numpy
# arrays, followed by the datamodule's means/stds.
test_data1, test_mask1, test_cond1 = (
    np.array(tensor)
    for tensor in (
        datamodule1.tensor_test,
        datamodule1.mask_test,
        datamodule1.tensor_conditioning_test,
    )
)
val_data1, val_mask1, val_cond1 = (
    np.array(tensor)
    for tensor in (
        datamodule1.tensor_val,
        datamodule1.mask_val,
        datamodule1.tensor_conditioning_val,
    )
)
train_data1, train_mask1, train_cond1 = (
    np.array(tensor)
    for tensor in (
        datamodule1.tensor_train,
        datamodule1.mask_train,
        datamodule1.tensor_conditioning_train,
    )
)
means1, stds1 = np.array(datamodule1.means), np.array(datamodule1.stds)

In [None]:
# Sanity check: one shape per line for every split, then the means and stds.
print(
    test_data1.shape,
    test_mask1.shape,
    test_cond1.shape,
    val_data1.shape,
    val_mask1.shape,
    val_cond1.shape,
    train_data1.shape,
    train_mask1.shape,
    train_cond1.shape,
    sep="\n",
)
print(means1, stds1, sep="\n")

In [None]:
# Copy every split of datamodule2 (features, mask, conditioning) into numpy
# arrays, followed by the datamodule's means/stds.
test_data2, test_mask2, test_cond2 = (
    np.array(tensor)
    for tensor in (
        datamodule2.tensor_test,
        datamodule2.mask_test,
        datamodule2.tensor_conditioning_test,
    )
)
val_data2, val_mask2, val_cond2 = (
    np.array(tensor)
    for tensor in (
        datamodule2.tensor_val,
        datamodule2.mask_val,
        datamodule2.tensor_conditioning_val,
    )
)
train_data2, train_mask2, train_cond2 = (
    np.array(tensor)
    for tensor in (
        datamodule2.tensor_train,
        datamodule2.mask_train,
        datamodule2.tensor_conditioning_train,
    )
)
means2, stds2 = np.array(datamodule2.means), np.array(datamodule2.stds)

In [None]:
# Restore trained weights from the EMA checkpoints of two training runs.
# NOTE(review): the checkpoint paths point at the fm_tops150_cond and
# diffusion_tops150_cond runs, whereas model1/model2 were instantiated from
# experiment1/experiment2 (fm_tops150.yaml / fm_tops150_cond.yaml) -- confirm
# that this pairing is intended.
ckpt1 = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150_cond/runs/2023-07-11_03-07-10/checkpoints/last-EMA.ckpt"
ckpt2 = "/beegfs/desy/user/ewencedr/deep-learning/logs/diffusion_tops150_cond/runs/2023-07-11_03-11-13/checkpoints/last-EMA.ckpt"
# load_from_checkpoint is a classmethod that builds and returns a new module.
# Call it on the class rather than on the instance: recent Lightning releases
# reject the instance call, and the class call is equivalent on older ones.
model1 = type(model1).load_from_checkpoint(ckpt1)
model2 = type(model2).load_from_checkpoint(ckpt2)

In [None]:
# Jet type label; matched against datamodule.jet_types in the selection cells below.
jet_type = "t"

In [None]:
# Use the test split of datamodule1 for the jet-type selection below.
mask1, data1, cond1 = test_mask1, test_data1, test_cond1

In [None]:
# Keep only the rows whose conditioning column for `jet_type` equals 1.
# The training split is filtered as well because it is compared to the test
# split later.
index_jettype1 = np.argwhere(np.array(datamodule1.jet_types) == jet_type).squeeze()

indice_jettype1 = np.argwhere(cond1[:, index_jettype1] == 1).squeeze()
indice_jettype_train1 = np.argwhere(train_cond1[:, index_jettype1] == 1).squeeze()

mask_jettype1, data_jettype1, cond_jettype1 = (
    array[indice_jettype1] for array in (mask1, data1, cond1)
)
train_mask_jettype1, train_data_jettype1, train_cond_jettype1 = (
    array[indice_jettype_train1] for array in (train_mask1, train_data1, train_cond1)
)

# One shape per line: test selections first, then training selections.
print(
    mask_jettype1.shape,
    data_jettype1.shape,
    cond_jettype1.shape,
    train_mask_jettype1.shape,
    train_data_jettype1.shape,
    train_cond_jettype1.shape,
    sep="\n",
)

In [None]:
# Use the test split of datamodule2 for the jet-type selection below.
mask2, data2, cond2 = test_mask2, test_data2, test_cond2

In [None]:
# select only data, mask and cond for the specified jet type (datamodule2)
# also for training data because it is compared to test data later
index_jettype2 = np.squeeze(np.argwhere(np.array(datamodule2.jet_types) == jet_type))

indice_jettype2 = np.squeeze(np.argwhere(cond2[:, index_jettype2] == 1))
indice_jettype_train2 = np.squeeze(np.argwhere(train_cond2[:, index_jettype2] == 1))

mask_jettype2 = mask2[indice_jettype2]
data_jettype2 = data2[indice_jettype2]
cond_jettype2 = cond2[indice_jettype2]
# Fixed copy-paste slip: the training selections previously re-derived the
# *1 variables from datamodule1 and left indice_jettype_train2 unused.
train_mask_jettype2 = train_mask2[indice_jettype_train2]
train_data_jettype2 = train_data2[indice_jettype_train2]
train_cond_jettype2 = train_cond2[indice_jettype_train2]

# Report the shapes of the datamodule2 selections (previously printed the *1 arrays).
print(mask_jettype2.shape)
print(data_jettype2.shape)
print(cond_jettype2.shape)
print(train_mask_jettype2.shape)
print(train_data_jettype2.shape)
print(train_cond_jettype2.shape)

In [None]:
# fig, data, generation_times = create_and_plot_data(
#    np.array(data_jettype),
#    [model1, model2],
#    cond=[torch.tensor(cond_jettype), torch.tensor(cond_jettype)],
#    save_name="fm_tops_nb",
#    labels=["FM", "2"],
#    mask=mask_jettype,
#    num_jet_samples=len(data_jettype),
#    batch_size=1000,
#    variable_set_sizes=True,
#    normalized_data=[True, True],
#    means=means,
#    stds=stds,
#    save_folder="./logs/nb_plots/",
#    plottype="sim_data",
#    plot_jet_features=True,
#    plot_w_dists=False,
#    plot_selected_multiplicities=False,
#    selected_multiplicities=[1, 3, 5, 10, 20, 30],
#    ode_solver="midpoint",
#    ode_steps=100,
#    bins=100,
#    mass_linear=False,
# )

In [None]:
# Load the stored generated samples of the three runs.
# Note: this rebinds data1/data2, which previously aliased the test arrays.
data1, data2, data3 = (
    np.load(path)
    for path in (
        "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150/runs/2023-07-11_03-06-15/final_generated_data_mp200nfe.npy",
        "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150_cond/runs/2023-07-11_03-07-10/final_generated_data_mp200nfe.npy",
        "/beegfs/desy/user/ewencedr/deep-learning/logs/diffusion_tops150_cond/runs/2023-07-11_03-11-13/final_generated_data_mp200nfe.npy",
    )
)

# Alternative inputs (tops30 runs):
# data1 = np.load("/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30/runs/2023-07-12_00-57-10/final_generated_data_mp200nfe.npy")
# data2 = np.load("/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30_cond/runs/2023-07-11_03-03-48/final_generated_data_mp200nfe.npy")
# data3 = np.load("/beegfs/desy/user/ewencedr/deep-learning/logs/diffusion_tops30_cond/runs/2023-07-11_03-12-22/final_generated_data_mp200nfe.npy")

In [None]:
# Cap every generated sample at the number of test jets of datamodule1.
data1, data2, data3 = (sample[: len(test_data1)] for sample in (data1, data2, data3))

In [None]:
# Shapes of the three truncated generated samples, one per line.
print(data1.shape, data2.shape, data3.shape, sep="\n")

In [None]:
from src.utils.plotting import plot_data, prepare_data_for_plotting

In [None]:
# Keyword arguments forwarded to plot_data.
plot_config = dict(
    num_samples=-1,
    plot_jet_features=False,
    plot_w_dists=False,
    plot_efps=True,
    plot_selected_multiplicities=False,
    selected_multiplicities=[10, 20, 30, 40, 50, 100],
    selected_particles=[1, 3, 10],
    plottype="sim_data",
    save_fig=False,
    variable_jet_sizes_plotting=True,
    bins=100,
    close_fig=False,
    labels=["FM", "FM cond", "Jedi cond"],
)
# Subset forwarded to prepare_data_for_plotting; `plot_efps` is passed there
# under the name `calculate_efps`. Key order follows plot_config.
plot_prep_config = {
    "calculate_efps": plot_config["plot_efps"],
    "selected_multiplicities": plot_config["selected_multiplicities"],
    "selected_particles": plot_config["selected_particles"],
}

In [None]:
# Run the project's plotting pre-processing on the three generated samples,
# stacked along a new leading axis. The four returned values are forwarded to
# plot_data in a later cell.
(
    jet_data,
    efps_values,
    pt_selected_particles,
    pt_selected_multiplicities,
) = prepare_data_for_plotting(
    np.array([data1, data2, data3]),
    **plot_prep_config,
)

In [None]:
# Sanity check on the EFP array returned for the generated samples.
print(efps_values.shape)

In [None]:
# Shape of the test array used as the reference sample below.
print(test_data1.shape)

In [None]:
# Same pre-processing for the test set, passed as a one-element list.
# Element 0 of the first three outputs is extracted in a later cell.
(
    jet_data_sim,
    efps_sim,
    pt_selected_particles_sim,
    pt_selected_multiplicities_sim,
) = prepare_data_for_plotting(
    [test_data1],
    **plot_prep_config,
)

In [None]:
# print(jet_data.shape)
# print(data_jettype1.shape)
# print(mask_jettype1.shape)
# sim_data = np.concatenate([test_data1, test_mask1], axis=-1)
# print(sim_data.shape)

In [None]:
# The test array of datamodule1 is what plot_data receives as `sim_data`.
sim_data = test_data1

In [None]:
import matplotlib as mpl
from cycler import cycler

In [None]:
# mpl.rcParams["axes.prop_cycle"] = cycler(
#    color=[
#        "#B6BFC3",
#        "#0271BB",
#        "#E2001A",
#    ]
# )

In [None]:
# Drop the leading axis added by passing the test set as a one-element list.
# Note: this cell rebinds its own inputs, so running it twice indexes twice.
jet_data_sim = jet_data_sim[0]
efps_sim = efps_sim[0]
pt_selected_particles_sim = pt_selected_particles_sim[0]

In [None]:
# Draw the comparison figure: the three generated samples against the test
# set, using the quantities pre-computed above and the options in plot_config.
fig = plot_data(
    particle_data=np.array([data1, data2, data3]),
    sim_data=sim_data,
    jet_data_sim=jet_data_sim,
    jet_data=jet_data,
    efps_sim=efps_sim,
    efps_values=efps_values,
    pt_selected_particles=pt_selected_particles,
    pt_selected_multiplicities=pt_selected_multiplicities,
    pt_selected_particles_sim=pt_selected_particles_sim,
    pt_selected_multiplicities_sim=pt_selected_multiplicities_sim,
    **plot_config,
)

# Substructure

In [None]:
# Path (without the ".h5" suffix) of the reference substructure file; its
# values are plotted with the label "Sim data" below.
b_sub = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150/runs/2023-07-11_03-06-15/substructure_jetnet_mp200nfe"
# b_sub = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30/runs/2023-07-12_00-57-10/substructure_jetnet_mp200nfe"

In [None]:
# Paths (without the ".h5" suffix) of the substructure files of the three
# runs, plotted below as "FM", "FM cond" and "Jedi cond" respectively.
substr1 = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150/runs/2023-07-11_03-06-15/substructure_mp200nfe"
substr2 = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150_cond/runs/2023-07-11_03-07-10/substructure_mp200nfe"
substr3 = "/beegfs/desy/user/ewencedr/deep-learning/logs/diffusion_tops150_cond/runs/2023-07-11_03-11-13/substructure_mp200nfe"
# Alternative inputs (tops30 runs):
# substr1 = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30/runs/2023-07-12_00-57-10/substructure_mp200nfe"
# substr2 = "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30_cond/runs/2023-07-11_03-03-48/substructure_mp200nfe"
# substr3 = "/beegfs/desy/user/ewencedr/deep-learning/logs/diffusion_tops30_cond/runs/2023-07-11_03-12-22/substructure_mp200nfe"

In [None]:
# Load the substructure observables of the reference file (plotted as
# "Sim data" below): the three named datasets, plus every dataset in the file
# stacked into one array.
with h5py.File(b_sub + ".h5", "r") as f:
    tau21_b, tau32_b, d2_b = (np.array(f[name]) for name in ("tau21", "tau32", "d2"))
    data_substructure_b = np.array([np.array(f[key]) for key in f.keys()])

In [None]:
# Load the substructure observables of the first run. The three named
# datasets are cut to the length of the reference tau21 array; the stacked
# array of all datasets is left uncut.
with h5py.File(substr1 + ".h5", "r") as f:
    tau21_1, tau32_1, d2_1 = (
        np.array(f[name])[: len(tau21_b)] for name in ("tau21", "tau32", "d2")
    )
    data_substructure_1 = np.array([np.array(f[key]) for key in f.keys()])

In [None]:
# Load the substructure observables of the second run. The three named
# datasets are cut to the length of the reference tau21 array; the stacked
# array of all datasets is left uncut.
with h5py.File(substr2 + ".h5", "r") as f:
    tau21_2, tau32_2, d2_2 = (
        np.array(f[name])[: len(tau21_b)] for name in ("tau21", "tau32", "d2")
    )
    data_substructure_2 = np.array([np.array(f[key]) for key in f.keys()])

In [None]:
# Load the substructure observables of the third run. The three named
# datasets are cut to the length of the reference tau21 array; the stacked
# array of all datasets is left uncut.
with h5py.File(substr3 + ".h5", "r") as f:
    tau21_3, tau32_3, d2_3 = (
        np.array(f[name])[: len(tau21_b)] for name in ("tau21", "tau32", "d2")
    )
    data_substructure_3 = np.array([np.array(f[key]) for key in f.keys()])

In [None]:
# Three side-by-side histograms (tau21, tau32, d2). In each panel the reference
# sample is drawn first as a filled histogram; the bin edges it returns
# (element [1] of the hist result) are reused for the three generated samples
# so that all curves in a panel share the same binning.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
bins = 100
histb = ax1.hist(tau21_b, bins=bins, label="Sim data", histtype="stepfilled", alpha=0.5)
hist1 = ax1.hist(tau21_1, bins=histb[1], label="FM", histtype="step")
hist2 = ax1.hist(tau21_2, bins=histb[1], label="FM cond", histtype="step")
hist3 = ax1.hist(tau21_3, bins=histb[1], label="Jedi cond", histtype="step")
ax1.set_title(r"$\tau_{21}$")
# ax1.legend(loc="best")

histb_t32 = ax2.hist(tau32_b, bins=bins, label="Sim data", histtype="stepfilled", alpha=0.5)
hist1_t32 = ax2.hist(tau32_1, bins=histb_t32[1], label="FM", histtype="step")
hist2_t32 = ax2.hist(tau32_2, bins=histb_t32[1], label="FM cond", histtype="step")
hist3_t32 = ax2.hist(tau32_3, bins=histb_t32[1], label="Jedi cond", histtype="step")
ax2.set_title(r"$\tau_{32}$")
# ax2.legend(loc="best")
# Only the middle panel carries a legend; the labels are the same in all panels.
ax2.legend(loc="best", prop={"size": 14}, frameon=True)

histb_d = ax3.hist(d2_b, bins=bins, label="Sim data", histtype="stepfilled", alpha=0.5)
hist1_d = ax3.hist(d2_1, bins=histb_d[1], label="FM", histtype="step")
hist2_d = ax3.hist(d2_2, bins=histb_d[1], label="FM cond", histtype="step")
hist3_d = ax3.hist(d2_3, bins=histb_d[1], label="Jedi cond", histtype="step")
ax3.set_title(r"$d_2$")
# ax3.legend(loc="best")

# Save file as h5

### Load conditioning

In [None]:
# HDF5 file holding the conditioning values read in the next cell.
filepath = "../data/conditioning.h5"

In [None]:
# Read the conditioning datasets fully into memory (`[:]`) while the file is
# open. Only `pt` is used in the cells visible below.
with h5py.File(filepath, "r") as f:
    print(f.keys())
    types = f["type"][:]
    mass = f["mass"][:]
    pt = f["pt"][:]
    num_particles = f["num_particles"][:]
    gen_ctxt = f["gen_ctxt"][:]

In [None]:
# Check the shape of pt; it must broadcast against data[:, :, 2] in the scaling cell below.
print(pt.shape)

### Load generated data

In [None]:
# Generated sample of the fm_tops150_cond run. Unlike data1/data2/data3 above,
# it is not truncated to the test-set length in the cells visible here.
data = np.load(
    "/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops150_cond/runs/2023-07-11_03-07-10/final_generated_data_mp200nfe.npy"
)
# data = np.load("/beegfs/desy/user/ewencedr/deep-learning/logs/fm_tops30_cond/runs/2023-07-11_03-03-48/final_generated_data_mp200nfe.npy")

In [None]:
# Shape of the loaded generated sample.
print(data.shape)

In [None]:
# Independent copy whose column 2 is rescaled in the next cell.
# Note: this rebinds `data2`, which held a different generated sample earlier in the notebook.
data2 = data.copy()

In [None]:
# Scale feature column 2 by the conditioning pt. The right-hand side reads
# from `data`, so re-running the cell gives the same result.
# NOTE(review): relies on `pt` broadcasting against the first two axes of
# `data` (e.g. one value per jet with a trailing length-1 axis) -- confirm.
data2[:, :, 2] = data[:, :, 2] * pt

In [None]:
# Column 2 before scaling: first 10 jets, first 5 particles.
print(data[:10, :5, 2])

In [None]:
# Conditioning pt of the first 10 entries.
print(pt[:10])

In [None]:
# Column 2 after scaling by pt: first 10 jets, first 5 particles.
print(data2[:10, :5, 2])

In [None]:
# Log-scale histogram of column 2 of the unscaled sample, over all jets and particles.
plt.hist(data[:, :, 2].reshape(-1), bins=100, label="FM")
plt.yscale("log")
plt.show()

In [None]:
# Log-scale histogram of column 2 of the pt-scaled sample, over all jets and particles.
plt.hist(data2[:, :, 2].reshape(-1), bins=100, label="FM")
plt.yscale("log")
plt.show()

In [None]:
# Output HDF5 file; opened in "w" mode below, so an existing file is overwritten.
filepath_write = "/beegfs/desy/user/ewencedr/deep-learning/final_data/fm_tops150_cond.h5"

In [None]:
# Write both versions of the generated sample to a new HDF5 file.
# NOTE(review): "etaphipt" stores the unscaled `data`, while "etaphiptfrac"
# stores `data2`, whose column 2 was multiplied by pt. If column 2 of `data`
# is a pt fraction, the two dataset names look swapped -- confirm before
# anything downstream relies on them.
with h5py.File(filepath_write, "w") as f:
    f.create_dataset("etaphipt", data=data)
    f.create_dataset("etaphiptfrac", data=data2)