In [None]:
import os
import sys

sys.path.append("../")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

from src.data.components import normalize_tensor
from src.utils.preprocessing import LogitScaler

In [None]:
# set env variable DATA_DIR again because of hydra
from dotenv import load_dotenv

# Read variables from a local .env file (if present) into the process environment.
load_dotenv()
# os.environ only stores strings. Looking the variable up with .get() would hand
# back None when it is missing, and assigning None fails with an opaque
# "TypeError: str expected, not NoneType". Fail early with an actionable message.
if "DATA_DIR" not in os.environ:
    raise RuntimeError(
        "DATA_DIR is not set; export it or define it in a .env file before running this notebook"
    )
os.environ["DATA_DIR"] = os.environ["DATA_DIR"]

In [None]:
# Location of the .npy file that the exploratory cells below load and inspect.
filepath = "/beegfs/desy/user/sommerhm/generative_challenge_2023/outerdata_kfold_1.npy"

In [None]:
# Read the array stored at `filepath`; the cells below index it as a 2-D (rows, columns) array.
data = np.load(filepath)

In [None]:
# Quick sanity check of the array dimensions.
print(data.shape)

In [None]:
# Distribution of column 0 in 100 bins.
# NOTE(review): presumably the dijet mass m_jj (column 0 is labelled that way further down) — confirm.
plt.hist(data[:, 0], bins=100)
plt.show()

In [None]:
# One separate figure per remaining column (indices 1 to 4), 100 bins each.
for i in range(1, 5):
    plt.hist(data[:, i], bins=100)
    plt.show()

In [None]:
# Standardise every column with the project helper, using per-column statistics
# computed here. A copy of `data` is passed so the raw array is left untouched.
means, stds = data.mean(axis=0), data.std(axis=0)
normalized_data_1 = normalize_tensor(np.array(data, copy=True), means, stds, sigma=1)
# Print the statistics so they can be compared with the sklearn scaler's.
for column_stat in (means, stds):
    print(column_stat)

In [None]:
# Reference implementation: the same standardisation done by scikit-learn.
scaler = preprocessing.StandardScaler()
scaler.fit(data)
# Fitted per-column mean and scale, for comparison with the manual values.
for fitted_stat in (scaler.mean_, scaler.scale_):
    print(fitted_stat)
normalized_data_2 = scaler.transform(data)

In [None]:
# Overlay the two standardisations column by column; the sklearn histogram reuses
# the bin edges of the manual one so both are directly comparable.
fig, ax = plt.subplots(1, 5, figsize=(20, 4))
for i, panel in enumerate(ax):
    hist = panel.hist(normalized_data_1[:, i], bins=100, label="self")
    bin_edges = hist[1]
    panel.hist(normalized_data_2[:, i], bins=bin_edges, label="sklearn", histtype="step")
# pyplot-level call: the legend is drawn on the current axes only
plt.legend()

plt.show()

In [None]:
from sklearn.pipeline import make_pipeline

# Two-step preprocessing: project LogitScaler first, then standardisation.
# NOTE: this rebinds `scaler`, which previously held the plain StandardScaler.
scaler = make_pipeline(LogitScaler(), preprocessing.StandardScaler())
scaler.fit(data)
processed_data = scaler.transform(data)

In [None]:
# One panel per column of the logit + standardised data.
fig, ax = plt.subplots(1, 5, figsize=(20, 4))
for i, panel in enumerate(ax):
    hist = panel.hist(processed_data[:, i], bins=100, label="logit + scale")
# pyplot-level call: the legend is drawn on the current axes only
plt.legend()

plt.show()

In [None]:
# Round trip: map the processed data back through the fitted pipeline so it can be
# compared with the original `data` in the next cell.
data_back = scaler.inverse_transform(processed_data)

In [None]:
# Round-trip check: original data vs. data mapped through the pipeline and back,
# column by column on shared bin edges.
fig, ax = plt.subplots(1, 5, figsize=(20, 4))
for i in range(5):
    hist = ax[i].hist(data[:, i], bins=100, label="data")
    ax[i].hist(data_back[:, i], bins=hist[1], label="back", histtype="step")
    # Set the log scale per axes: the pyplot-level plt.yscale() only acts on the
    # current axes, which would leave four of the five panels on a linear scale.
    ax[i].set_yscale("log")
# pyplot-level call: the legend is drawn on the current axes only
plt.legend()
plt.show()

# Evaluation

In [None]:
from os.path import join

import hydra
import torch
from omegaconf import OmegaConf

from src.utils.data_generation import generate_data_v2
from src.utils.plotting import apply_mpl_styles

In [None]:
# Project helper from src.utils.plotting; presumably configures the matplotlib
# style for the figures below — its effect is not visible from this notebook.
apply_mpl_styles()

In [None]:
# Run directory of the generative model; config and checkpoint are read from it below.
# NOTE(review): the two adjacent string literals are concatenated as-is, so the path
# contains the component "gen_challenge logit_true" (with a space) — confirm this
# matches the directory name on disk.
folder = (
    "/beegfs/desy/user/ewencedr/deep-learning/logs/gen_challenge"
    " logit_true/runs/2023-11-27_01-53-50/"
)
# Alternative run directory, kept for quick switching:
# folder = "/beegfs/desy/user/ewencedr/deep-learning/logs/gen_challenge/runs/2023-11-22_16-03-46/"

Load the config file that was saved in the run directory.

In [None]:
cfg_backup_file = join(folder, "config.yaml")
# load everything from experiment config
with hydra.initialize(version_base=None, config_path="../configs/"):
    # Guard clause: stop right away when the run directory has no saved config.
    if not os.path.exists(cfg_backup_file):
        raise FileNotFoundError("config file not found")
    print("config file already exists --> loading from run directory")
# The config itself is read directly from the YAML file in the run directory.
cfg = OmegaConf.load(cfg_backup_file)

Instantiate the model and the data module from the loaded config.

In [None]:
# Build the data module and the (still untrained) model from the `data` and
# `model` sections of the loaded config.
datamodule = hydra.utils.instantiate(cfg.data)
model = hydra.utils.instantiate(cfg.model)

# Later cells read `datamodule.preprocessing_pipeline` and
# `datamodule.preprocessing_pipeline_cond`.
# NOTE(review): presumably these are fitted inside setup() — confirm.
datamodule.setup()

Load the trained weights from the run's `last-EMA.ckpt` checkpoint.

In [None]:
ckpt = join(folder, "checkpoints", "last-EMA.ckpt")
# `load_from_checkpoint` is a classmethod that builds and returns a new module.
# Call it on the class rather than on the instance: Lightning 2.x raises when it is
# invoked on an instance, and the class form also works on older versions.
model = type(model).load_from_checkpoint(ckpt)

### Generate Conditioning

In [None]:
# Load the five "innerdata" k-fold files and build the training set from every
# fold that is neither a validation nor a test fold of the data module.
innerdata_dir = "/beegfs/desy/user/sommerhm/generative_challenge_2023"
data_load_list_sr = [
    np.load(f"{innerdata_dir}/innerdata_kfold_{fold_index}.npy") for fold_index in range(5)
]
# Keep the individual per-fold names available for interactive use.
(
    data_load0_sr,
    data_load1_sr,
    data_load2_sr,
    data_load3_sr,
    data_load4_sr,
) = data_load_list_sr

# Folds excluded from training; computed once instead of once per fold.
held_out_folds = datamodule.hparams.val_sets + datamodule.hparams.test_sets
innerdata_train = np.concatenate(
    [
        fold_data
        for fold_index, fold_data in enumerate(data_load_list_sr)
        if fold_index not in held_out_folds
    ],
    axis=0,
)

In [None]:
# The first entry of the data module's `test_sets` selects which inner k-fold file
# serves as the held-out comparison set.
test_data_number = datamodule.hparams.test_sets[0]
innerdata_test = np.load(
    f"/beegfs/desy/user/sommerhm/generative_challenge_2023/innerdata_kfold_{test_data_number}.npy"
)

In [None]:
# fitting a KDE for the mass distribution based on the inner training set

# we also perform a logit first to stretch out the hard boundaries
from sklearn.neighbors import KernelDensity

# Column 0 is kept 2-D (n, 1) via the 0:1 slice, as the scaler and KDE are fed 2-D input.
m_scaler = LogitScaler()
m_train = m_scaler.fit_transform(innerdata_train[:, 0:1])

kde_model = KernelDensity(bandwidth=0.01, kernel="gaussian")
kde_model.fit(m_train)

# now let's sample 4x the number of training data
# The draw is seeded (same value as the torch seed used for generation) so that the
# conditioning masses, and with them the generated samples, are reproducible.
m_samples = kde_model.sample(4 * len(m_train), random_state=1111).astype(np.float32)
# map the samples from logit space back to the original mass scale
m_samples = m_scaler.inverse_transform(m_samples)

In [None]:
# Sanity check: number of sampled mass values.
print(m_samples.shape)

Pre-process the sampled $m_{jj}$ values so that they can be used as conditioning input for the generative model.

In [None]:
# Apply the data module's conditioning preprocessing pipeline to the sampled masses.
m_conditioning = datamodule.preprocessing_pipeline_cond.transform(m_samples)

### Generate Samples

In [None]:
# Fix the torch RNG so the generation below is repeatable.
torch.manual_seed(1111)
# Generate one sample per conditioning value with the project helper, which returns
# the generated array together with a timing value.
# NOTE: `data` is rebound here and no longer refers to the array loaded from `filepath`.
# NOTE(review): presumably `preprocessing_pipeline` is used to map the model output
# back to physical feature space — confirm in generate_data_v2.
data, generation_time = generate_data_v2(
    model,
    num_jet_samples=len(m_conditioning),
    batch_size=2048,
    cond=torch.Tensor(m_conditioning),
    preprocessing_pipeline=datamodule.preprocessing_pipeline,
    ode_solver="midpoint",
    ode_steps=200,
)

In [None]:
# Prepend the sampled mass column to the generated features, so column 0 is the
# conditioning mass and the remaining columns are the generated values.
samples = np.concatenate([m_samples, data], axis=1)  # [:len(innerdata_train)]

In [None]:
# Sanity check: rows = number of generated events, columns = mass + generated features.
print(samples.shape)

In [None]:
# Load Manuel's generated data
path_comparison = (
    "/beegfs/desy/user/sommerhm/generative_challenge_2023/no-interpolation_samples.npy"
)
data_comparison = np.load(path_comparison)
print(data_comparison.shape)
# Truncate the reference set to at most as many rows as our own samples so both
# sets are compared on the same footing.
samples_comparison = data_comparison[: len(samples)]
print(samples_comparison.shape)

In [None]:
# comparing samples to inner background (idealized sanity check)
# Top row: unit-area histograms of the held-out data and of both generated sample
# sets. Bottom row: per-bin ratio data / sample of those normalised histograms.
label_map = {
    "0": r"$m_{jj}$",
    "1": r"$m_{J_1}$",
    "2": r"$\Delta m_J$",
    "3": r"$\tau_{41}^{J_1}$",
    "4": r"$\tau_{41}^{J_2}$",
}
# Explicit "CN" colours (N-th entry of the active colour cycle). They reproduce the
# colours previously obtained by manually advancing the private
# `_get_lines.prop_cycler` iterator, which no longer exists in matplotlib >= 3.8.
data_color, sample_color, comparison_color = "C0", "C2", "C3"

fig, ax = plt.subplots(2, 5, figsize=(35, 8), gridspec_kw={"height_ratios": [3, 1]})
for i in range(5):
    hist_data = ax[0, i].hist(
        innerdata_test[:, i],
        bins=40,
        label="data background",
        density=True,
        histtype="stepfilled",
        color=data_color,
    )
    # all other histograms of this feature reuse the bin edges of the data histogram
    binning = hist_data[1]
    bin_centers = 0.5 * (binning[:-1] + binning[1:])
    bin_widths = np.diff(binning)
    hist_samples = ax[0, i].hist(
        samples[:, i],
        bins=binning,
        label="sampled background",
        density=True,
        histtype="step",
        color=sample_color,
    )
    hist_samples_comparison = ax[0, i].hist(
        samples_comparison[:, i],
        bins=binning,
        label="sampled background (Manuel)",
        density=True,
        histtype="step",
        color=comparison_color,
    )
    # Raw (unnormalised) counts: needed for sqrt(N) errors on the data.
    data_hist = np.histogram(innerdata_test[:, i], bins=binning, density=False)[0]
    sample_hist = np.histogram(samples[:, i], bins=binning, density=False)[0]
    sample_hist_comparison = np.histogram(samples_comparison[:, i], bins=binning, density=False)[0]

    # total count times bin width: dividing counts by this gives the density
    data_scale_factor = np.sum(data_hist) * bin_widths
    sample_scale_factor = np.sum(sample_hist) * bin_widths
    sample_scale_factor_comparison = np.sum(sample_hist_comparison) * bin_widths
    if i == 2:
        # a single legend on the middle panel is enough for the whole row
        ax[0, i].legend(loc="best", frameon=False)
    ax[0, i].set_xlabel(f"{label_map[str(i)]}")
    # empty sample bins give inf/nan ratios; silence the warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        ax[1, i].axhline(1.0, color="black", linestyle="-", alpha=0.8)
        ax[1, i].errorbar(
            bin_centers,
            data_hist / sample_hist * sample_scale_factor / data_scale_factor,
            linestyle="none",
            marker=".",
            yerr=np.sqrt(data_hist) / sample_hist * sample_scale_factor / data_scale_factor,
            color=sample_color,
        )
        ax[1, i].errorbar(
            bin_centers,
            data_hist
            / sample_hist_comparison
            * sample_scale_factor_comparison
            / data_scale_factor,
            linestyle="none",
            marker=".",
            yerr=np.sqrt(data_hist)
            / sample_hist_comparison
            * sample_scale_factor_comparison
            / data_scale_factor,
            color=comparison_color,
        )

        ax[1, i].set_ylim(0.85, 1.15)

    if i == 0:
        ax[0, i].set_ylabel("Events (norm.)")
        ax[1, i].set_ylabel("Data/Sample (norm.)")
plt.tight_layout()
plt.show()

In [None]:
# Files used to persist / reload the generated samples and the held-out data.
save_path = "/beegfs/desy/user/ewencedr/data/generative_challenge/gen_data.npy"
save_path_true = "/beegfs/desy/user/ewencedr/data/generative_challenge/gen_data_true.npy"

In [None]:
# Saving is disabled; uncomment both lines to write the freshly generated samples
# and the matching held-out data to `save_path` / `save_path_true`.
# np.save(save_path, samples)
# np.save(save_path_true, innerdata_test)

In [None]:
# Reload previously saved arrays from disk.
# NOTE(review): this overwrites the in-memory `samples` and `innerdata_test` produced
# above, and it requires both files to exist already (saving is commented out in
# this notebook) — confirm that is intended.
samples = np.load(save_path)
innerdata_test = np.load(save_path_true)

In [None]:
# One figure per feature: normalised histograms on top, data / sample ratio below.
for i in range(5):
    fig, (ax1, ax2) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True)
    hist_data = ax1.hist(
        innerdata_test[:, i], bins=100, label="data background", density=True, histtype="step"
    )
    # the sample histogram reuses the data bin edges
    binning = hist_data[1]
    hist_samples = ax1.hist(
        samples[:, i], bins=binning, label="sampled background", density=True, histtype="step"
    )
    # the labels above are only visible once a legend is drawn
    ax1.legend(loc="best", frameon=False)
    # Use raw counts for the ratio panel: the bin heights returned by
    # hist(density=True) are densities, so their square root is not the statistical
    # (sqrt(N)) uncertainty of the bin.
    data_hist = np.histogram(innerdata_test[:, i], bins=binning, density=False)[0]
    sample_hist = np.histogram(samples[:, i], bins=binning, density=False)[0]
    # total count times bin width: dividing counts by this gives the density
    sample_scale_factor = np.sum(sample_hist) * np.diff(binning)
    data_scale_factor = np.sum(data_hist) * np.diff(binning)
    # empty sample bins give inf/nan ratios; silence the warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        ax2.axhline(1.0, color="black", linestyle="-", alpha=0.8)
        ax2.errorbar(
            0.5 * (binning[:-1] + binning[1:]),
            data_hist / sample_hist * sample_scale_factor / data_scale_factor,
            linestyle="none",
            marker=".",
            yerr=np.sqrt(data_hist) / sample_hist * sample_scale_factor / data_scale_factor,
        )
        ax2.set_ylim(0.85, 1.15)

## Classifier

In [None]:
# Run directory of the classifier; its config and checkpoint are read from it below.
folder_classifier = (
    "/beegfs/desy/user/ewencedr/deep-learning/logs/hl_classifier/runs/2023-12-04_14-45-58/"
)

In [None]:
cfg_backup_file_classifier = join(folder_classifier, "config.yaml")
# load everything from experiment config
with hydra.initialize(version_base=None, config_path="../configs/"):
    # Guard clause: stop right away when the run directory has no saved config.
    if not os.path.exists(cfg_backup_file_classifier):
        raise FileNotFoundError("config file not found")
    print("config file already exists --> loading from run directory")
# The config itself is read directly from the YAML file in the run directory.
cfg_classifier = OmegaConf.load(cfg_backup_file_classifier)

In [None]:
# Build the classifier's data module and (still untrained) model from the `data`
# and `model` sections of its config.
datamodule_classifier = hydra.utils.instantiate(cfg_classifier.data)
model_classifier = hydra.utils.instantiate(cfg_classifier.model)

# Later cells read `datamodule_classifier.means` and `.stds`.
# NOTE(review): presumably these are computed inside setup() — confirm.
datamodule_classifier.setup()

In [None]:
ckpt_classifier = join(folder_classifier, "checkpoints", "last.ckpt")
# `load_from_checkpoint` is a classmethod that builds and returns a new module.
# Call it on the class rather than on the instance: Lightning 2.x raises when it is
# invoked on an instance, and the class form also works on older versions.
model_classifier = type(model_classifier).load_from_checkpoint(ckpt_classifier)

In [None]:
# Standardise the feature columns (everything except column 0) of both sample sets
# with the classifier data module's statistics. Each input is converted to a tensor
# and cloned before it is handed to the helper.
samples_preprocessed, samples_preprocessed_comparison = (
    normalize_tensor(
        torch.Tensor(sample_set[:, 1:]).clone(),
        datamodule_classifier.means,
        datamodule_classifier.stds,
    )
    for sample_set in (samples, samples_comparison)
)

In [None]:
# Sanity check: one row per sample, one column per classifier input feature.
print(samples_preprocessed.shape)

In [None]:
# Score both sample sets with the trained classifier.
# Inference only: switch to eval mode (modules are in training mode by default after
# construction, which keeps e.g. dropout active) and disable autograd so that no
# graph is built for the whole sample set.
model_classifier.eval()
with torch.no_grad():
    # .cpu() makes the NumPy conversion work regardless of the device the output is on
    classifier_preds = (
        model_classifier.classify(torch.Tensor(samples_preprocessed)).detach().cpu().numpy()
    )
    classifier_preds_comparison = (
        model_classifier.classify(torch.Tensor(samples_preprocessed_comparison))
        .detach()
        .cpu()
        .numpy()
    )

In [None]:
# Classifier score distributions of both sample sets.
# The second histogram reuses the bin edges of the first (as the other comparison
# plots in this notebook do); with independent binning the bin contents of the two
# curves are not directly comparable. Scores outside the first set's range are not drawn.
_, score_bins, _ = plt.hist(classifier_preds, bins=100, label="samples")
plt.hist(classifier_preds_comparison, bins=score_bins, label="samples (Manuel)", histtype="step")
plt.legend()
plt.show()

In [None]:
# Sanity check of the prediction array's shape; it is used as a boolean-mask source
# for `samples` in the next cell.
print(classifier_preds.shape)

In [None]:
# Keep only the samples whose classifier score lies above the given percentile of
# their own score distribution (thresholds are computed separately per sample set).
threshhold_value = 99

anomaly_threshold = np.percentile(classifier_preds, threshhold_value)
print(anomaly_threshold)
passes_cut = classifier_preds > anomaly_threshold
selected_samples = samples[passes_cut]

anomaly_threshold_comparison = np.percentile(classifier_preds_comparison, threshhold_value)
passes_cut_comparison = classifier_preds_comparison > anomaly_threshold_comparison
selected_samples_comparison = samples_comparison[passes_cut_comparison]

In [None]:
# comparing samples to inner background (idealized sanity check)
# Same layout as the unselected comparison, but for the samples that passed the
# classifier cut. Top row: unit-area histograms (log scale). Bottom row: per-bin
# ratio data / selected sample of the normalised histograms.
label_map = {
    "0": r"$m_{jj}$",
    "1": r"$m_{J_1}$",
    "2": r"$\Delta m_J$",
    "3": r"$\tau_{41}^{J_1}$",
    "4": r"$\tau_{41}^{J_2}$",
}
# Explicit "CN" colours (N-th entry of the active colour cycle). They reproduce the
# colours previously obtained by manually advancing the private
# `_get_lines.prop_cycler` iterator, which no longer exists in matplotlib >= 3.8.
data_color, sample_color, comparison_color = "C0", "C2", "C3"

fig, ax = plt.subplots(2, 5, figsize=(35, 8), gridspec_kw={"height_ratios": [3, 1]})
for i in range(5):
    hist_data = ax[0, i].hist(
        innerdata_test[:, i],
        bins=40,
        label="data background",
        density=True,
        histtype="stepfilled",
        color=data_color,
    )
    # all other histograms of this feature reuse the bin edges of the data histogram
    binning = hist_data[1]
    bin_centers = 0.5 * (binning[:-1] + binning[1:])
    bin_widths = np.diff(binning)
    hist_samples = ax[0, i].hist(
        selected_samples[:, i],
        bins=binning,
        label="sampled background",
        density=True,
        histtype="step",
        color=sample_color,
    )
    hist_samples_comparison = ax[0, i].hist(
        selected_samples_comparison[:, i],
        bins=binning,
        label="sampled background (Manuel)",
        density=True,
        histtype="step",
        color=comparison_color,
    )
    # Raw (unnormalised) counts: needed for sqrt(N) errors on the data.
    # The ratio panel must use the SELECTED samples, i.e. the same arrays that are
    # drawn in the top panel; using the full sample sets here would merely repeat
    # the ratio of the unselected comparison.
    data_hist = np.histogram(innerdata_test[:, i], bins=binning, density=False)[0]
    sample_hist = np.histogram(selected_samples[:, i], bins=binning, density=False)[0]
    sample_hist_comparison = np.histogram(
        selected_samples_comparison[:, i], bins=binning, density=False
    )[0]

    # total count times bin width: dividing counts by this gives the density
    data_scale_factor = np.sum(data_hist) * bin_widths
    sample_scale_factor = np.sum(sample_hist) * bin_widths
    sample_scale_factor_comparison = np.sum(sample_hist_comparison) * bin_widths
    if i == 2:
        # a single legend on the middle panel is enough for the whole row
        ax[0, i].legend(loc="best", frameon=False)
    ax[0, i].set_xlabel(f"{label_map[str(i)]}")
    ax[0, i].set_yscale("log")
    # empty sample bins give inf/nan ratios; silence the warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        ax[1, i].axhline(1.0, color="black", linestyle="-", alpha=0.8)
        ax[1, i].errorbar(
            bin_centers,
            data_hist / sample_hist * sample_scale_factor / data_scale_factor,
            linestyle="none",
            marker=".",
            yerr=np.sqrt(data_hist) / sample_hist * sample_scale_factor / data_scale_factor,
            color=sample_color,
        )
        ax[1, i].errorbar(
            bin_centers,
            data_hist
            / sample_hist_comparison
            * sample_scale_factor_comparison
            / data_scale_factor,
            linestyle="none",
            marker=".",
            yerr=np.sqrt(data_hist)
            / sample_hist_comparison
            * sample_scale_factor_comparison
            / data_scale_factor,
            color=comparison_color,
        )

        ax[1, i].set_ylim(0.85, 1.15)

    if i == 0:
        ax[0, i].set_ylabel("Events (norm.)")
        ax[1, i].set_ylabel("Data/Sample (norm.)")
plt.tight_layout()
plt.show()