In [1]:
import numpy as np
import torch
import os
from tqdm import tqdm
import json

# -----------------------
# Config
# -----------------------
# Seed NumPy's global RNG first so every random draw made further down the
# notebook is reproducible on a fresh Restart & Run All.
np.random.seed(1234)

# Directory that receives every artefact written by this notebook
# (created on demand).
output_dir = "/work/gbadarac/MonoJet_NPLM/MonoJet_NPLM_analysis/Generate_Gaussian_Toy/saved_generated_gaussian_toys"
os.makedirs(output_dir, exist_ok=True)

# Evaluation points: all densities below are evaluated at these rows.
x_eval_path = "/work/gbadarac/MonoJet_NPLM/MonoJet_NPLM_analysis/Train_Ensembles/Generate_Data/saved_generated_target_data/100k_target_training_set.npy"
x_eval = np.load(x_eval_path)  # expected shape: (100000, 2)
n_points = len(x_eval)

# Known 2D Gaussian target: mean vector and diagonal covariance
# (per-feature standard deviations 0.25 and 0.4).
mean = np.array([-0.5, 0.6])
cov = np.diag([0.25**2, 0.4**2])
inv_cov = np.linalg.inv(cov)
det_cov = np.linalg.det(cov)

# -----------------------
# Evaluate target PDF at x_eval
# -----------------------
# Bivariate normal density:
#   p(x) = exp(-0.5 * (x - mean)^T cov^-1 (x - mean)) / (2 * pi * sqrt(det cov))
delta = x_eval - mean
# Row-wise quadratic form (x - mean)^T cov^-1 (x - mean), scaled by -1/2.
exponent = -0.5 * ((delta @ inv_cov) * delta).sum(axis=1)
norm_const = 1. / (2 * np.pi * np.sqrt(det_cov))
target_probs = norm_const * np.exp(exponent)  # one density value per row of x_eval



In [None]:
# -----------------------
# Generate toy models with structured bumps
# -----------------------
# Every toy model starts from the target PDF values at x_eval and receives one
# 1D Gaussian-shaped bump per feature. The result is rescaled so that its
# values sum to n_points.

n_models = 10
model_probs = []    # one float32 tensor per model
bump_metadata = []  # one flat, JSON-friendly dict per model
bump_params = []    # one dict per (model, feature) pair, kept for plotting

for i in tqdm(range(n_models), desc="Generating models"):
    perturbed = target_probs.copy()
    metadata_i = {}

    for feat_idx, mean_i in enumerate(mean):
        std_i = np.sqrt(cov[feat_idx, feat_idx])

        # Random bump parameters. The draw order (center, width, amplitude)
        # determines the random stream and must stay as is.
        center = np.random.uniform(mean_i - 2 * std_i, mean_i + 2 * std_i)
        width = np.random.uniform(0.03, 0.05)
        amplitude = np.random.uniform(0.01, 0.03)

        # Gaussian profile along this feature only, rescaled so that its
        # values over the evaluation points sum to amplitude * n_points.
        offset = x_eval[:, feat_idx] - center
        profile = np.exp(-0.5 * (offset / width)**2)
        bump = profile / profile.sum() * (amplitude * n_points)

        # Add to target
        perturbed = perturbed + bump

        # Per-(model, feature) record used for plotting
        bump_params.append(dict(
            model_idx=i,
            feature_idx=feat_idx,
            center=center,
            width=width,
            amplitude=amplitude,
        ))

        # Per-model metadata, stored as plain Python floats
        for key, value in (("center", center), ("width", width), ("amplitude", amplitude)):
            metadata_i[f"feature_{feat_idx}_{key}"] = float(value)

    # Rescale so the model values sum to n_points.
    perturbed = perturbed / perturbed.sum() * n_points

    model_probs.append(torch.tensor(perturbed, dtype=torch.float32))
    bump_metadata.append(metadata_i)



Generating models: 100%|██████████| 10/10 [00:00<00:00, 11.64it/s]


In [3]:
# -----------------------
# Plot marginals
# -----------------------
# For every toy model: draw, per feature, the weighted 1D histogram of x_eval
# for the target and for the perturbed model, shade and annotate the injected
# bump, and save the figure to output_dir as model_<idx>_marginals.png.
import matplotlib.pyplot as plt
model_probs_np = torch.stack(model_probs).numpy()  # one row per model
feature_names = ["Feature 1", "Feature 2"]
bins = 100

# Normalize target_probs once (outside loop)
target_probs_normed = target_probs / target_probs.sum()

# The binning and the target density depend only on the feature, not on the
# model, so they are computed once here instead of once per model.
marginal_grids = []
for i in range(len(feature_names)):
    feature = x_eval[:, i]
    bin_edges = np.linspace(feature.min(), feature.max(), bins + 1)
    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    bin_widths = np.diff(bin_edges)
    target_hist = np.histogram(feature, bins=bin_edges, weights=target_probs_normed)[0]
    target_density = target_hist / bin_widths
    marginal_grids.append((feature, bin_edges, bin_centers, bin_widths, target_density))

# Loop over models
for model_idx, probs in enumerate(model_probs_np):
    fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=False)

    # Normalize perturbed model
    perturbed_normed = probs / probs.sum()
    bump_info = bump_metadata[model_idx]

    for i, (feature, bin_edges, bin_centers, bin_widths, target_density) in enumerate(marginal_grids):
        # Use normalized weights for histogram, then convert to a density
        pert_hist = np.histogram(feature, bins=bin_edges, weights=perturbed_normed)[0]
        pert_density = pert_hist / bin_widths

        ax = axes[i]
        ax.plot(bin_centers, target_density, label="Target", color="red", linestyle="--")
        ax.plot(bin_centers, pert_density, label="Perturbed Model", color="blue")

        # Shade bump region (bin centers within 1.5 widths of the bump center)
        mu = bump_info[f"feature_{i}_center"]
        sigma = bump_info[f"feature_{i}_width"]
        amp = bump_info[f"feature_{i}_amplitude"]

        bump_region = (bin_centers >= mu - 1.5 * sigma) & (bin_centers <= mu + 1.5 * sigma)
        ax.fill_between(bin_centers, pert_density, target_density, where=bump_region,
                        color="blue", alpha=0.3, label="Bump region")

        # Annotate bump. With coarse binning no bin center may fall inside the
        # bump region; .max() on an empty selection would raise, so fall back
        # to the density in the bin closest to the bump center.
        if bump_region.any():
            y_bump = pert_density[bump_region].max()
        else:
            y_bump = pert_density[np.abs(bin_centers - mu).argmin()]
        y_top = ax.get_ylim()[1]
        y_text = min(y_bump * 1.05, y_top * 0.95)  # clamp to stay inside

        ax.annotate(
            f"μ={mu:.2f}, σ={sigma:.2f}, a={amp:.2f}",
            xy=(mu, y_bump),
            xytext=(mu, y_text),
            ha="center", fontsize=9,
            arrowprops=dict(arrowstyle="->", color="gray"),
            clip_on=True
        )

        ax.set_ylabel("Density")
        ax.set_title(f"Model {model_idx} — Marginal: {feature_names[i]}")
        ax.legend()

    axes[1].set_xlabel("Feature value")
    plt.tight_layout()
    fig.savefig(os.path.join(output_dir, f"model_{model_idx}_marginals.png"))
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)



In [4]:
# -----------------------
# Save models and metadata
# -----------------------
# Stack the per-model tensors as columns: one row per evaluation point,
# one column per model.
model_probs_np = torch.stack(model_probs, dim=1).numpy()  # (N, M)
np.save(os.path.join(output_dir, "f_i.npy"), model_probs_np)

# Both bump descriptions go to output_dir as indented JSON.
for json_name, payload in (
    ("bump_metadata.json", bump_metadata),
    ("bump_params.json", bump_params),
):
    with open(os.path.join(output_dir, json_name), "w") as f:
        json.dump(payload, f, indent=2)
