In [None]:
from tqdm.notebook import tqdm
import numpy as np

In [1]:
# Smallest standard deviation the generator is allowed to use:
# generate_synthetic_dataset clamps sigma with max(EPSILON, ...) so that the
# scale handed to the normal sampler stays strictly positive.
EPSILON = 1e-10

### Synthetic data generation routine

In [None]:
def generate_synthetic_dataset(rng, n_runs=5000, n_anomalous_runs=500, n_bins=100,
        mu_base=0, sigma_base=1.0, A_drift=0.5, min_stats=2000, max_stats=20000,
        mu_slow_change_period=500, mu_rapid_changes_shift_lims=(0.5, 1.5),
        sigma_rapid_changes_shift_lims=(0.1, 0.4), mu_anomaly_shift_lims=(0.25, 0.75),
        sigma_anomaly_shift_lims=(0.05, 0.2), rapid_change_p=0.005, binom_p=0.4,
        anomaly_p=0.9, fraction_dead_bins=0.1):
    """Generate a sequence of labelled histograms of Gaussian-distributed events.

    For every run ``i`` in ``1..n_runs`` a Gaussian sample is drawn and
    histogrammed into ``n_bins`` bins. The Gaussian parameters are built from:

    * a slow sinusoidal drift of the mean,
      ``A_drift * sin(pi * i / mu_slow_change_period)``;
    * "rapid changes": with probability ``rapid_change_p`` per run, new mean
      and sigma shifts are drawn; they replace the previous shifts and stay in
      effect for all following runs until the next rapid change;
    * for runs labelled anomalous, with probability ``anomaly_p``, an extra
      shift of mean and sigma with a random sign, plus a set of "dead" bins
      that are forced to zero.

    After histogramming, every bin in the upper half of the bin index range
    (``n_bins // 2`` and above) receives binomial noise with a random sign,
    clipped at zero.

    Relies on the module-level constant ``EPSILON`` as the lower bound for
    sigma.

    Parameters
    ----------
    rng : numpy.random.Generator
        Source of all randomness (e.g. ``np.random.default_rng(seed)``).
    n_runs : int
        Number of runs, i.e. number of histograms returned.
    n_anomalous_runs : int
        Number of runs labelled ``1``; chosen without replacement.
    n_bins : int
        Number of bins per histogram.
    mu_base, sigma_base : float
        Baseline mean and standard deviation of the Gaussian.
    A_drift : float
        Amplitude of the sinusoidal drift of the mean.
    min_stats, max_stats : int
        The number of events per run is drawn from ``[min_stats, max_stats)``.
    mu_slow_change_period : float
        Run-count scale of the sinusoidal drift (see formula above).
    mu_rapid_changes_shift_lims, sigma_rapid_changes_shift_lims : (float, float)
        ``(low, high)`` bounds of the uniform draw for the rapid-change shifts.
    mu_anomaly_shift_lims, sigma_anomaly_shift_lims : (float, float)
        ``(low, high)`` bounds of the uniform draw for the anomaly shift
        magnitudes; the sign is drawn separately from ``{-1, +1}``.
    rapid_change_p : float
        Per-run probability of a rapid change.
    binom_p : float
        Success probability of the binomial noise on the upper-half bins.
    anomaly_p : float
        Probability that a run labelled anomalous is actually distorted. The
        label stays ``1`` even when no distortion is applied.
    fraction_dead_bins : float
        Fraction of bins zeroed in a distorted run
        (``int(fraction_dead_bins * n_bins)`` bins).

    Returns
    -------
    (list of numpy.ndarray, list of int)
        ``n_runs`` count arrays of length ``n_bins`` and the matching 0/1
        labels.

    Raises
    ------
    ValueError
        If ``n_anomalous_runs`` exceeds ``n_runs``.
    """
    if n_anomalous_runs > n_runs:
        raise ValueError(
            f"n_anomalous_runs ({n_anomalous_runs}) cannot exceed n_runs ({n_runs})"
        )

    # Shifts introduced by the most recent rapid change; they persist across runs.
    persistent_mu_shift = 0.0
    persistent_sigma_shift = 0.0
    list_of_histograms = []
    list_of_labels = []

    # NOTE: keep the order of the rng calls below unchanged, otherwise a given
    # seed no longer reproduces the same dataset.
    anomalous_indices = set(rng.choice(np.arange(1, n_runs + 1), size=n_anomalous_runs, replace=False))
    n_dead_bins = int(fraction_dead_bins * n_bins)

    for i in range(1, n_runs + 1):
        yi = 1 if i in anomalous_indices else 0

        mu_run = mu_base + A_drift * np.sin(i / mu_slow_change_period * np.pi)

        if rng.random() < rapid_change_p:
            # A new rapid change replaces (does not add to) the previous shifts.
            persistent_mu_shift = rng.uniform(
                low=mu_rapid_changes_shift_lims[0], high=mu_rapid_changes_shift_lims[1])
            persistent_sigma_shift = rng.uniform(
                low=sigma_rapid_changes_shift_lims[0], high=sigma_rapid_changes_shift_lims[1])

        mu_for_gaussian = mu_run + persistent_mu_shift
        sigma_for_gaussian = max(EPSILON, sigma_base + persistent_sigma_shift)

        dead_bin_indices = None
        # The probability draw only happens for runs labelled anomalous
        # (short-circuit), so normal runs do not consume a random number here.
        if yi == 1 and rng.random() < anomaly_p:
            delta_mu_anomaly = rng.uniform(low=mu_anomaly_shift_lims[0], high=mu_anomaly_shift_lims[1])
            delta_sigma_anomaly = rng.uniform(low=sigma_anomaly_shift_lims[0], high=sigma_anomaly_shift_lims[1])
            # Scalar draws keep mu/sigma plain scalars instead of 1-element arrays.
            sign_mu = rng.choice([-1, 1])
            sign_sigma = rng.choice([-1, 1])
            mu_for_gaussian = mu_for_gaussian + sign_mu * delta_mu_anomaly
            sigma_for_gaussian = max(EPSILON, sigma_for_gaussian + sign_sigma * delta_sigma_anomaly)
            dead_bin_indices = rng.choice(np.arange(0, n_bins), size=n_dead_bins, replace=False)

        n_events = int(rng.integers(low=min_stats, high=max_stats))
        events = rng.normal(loc=mu_for_gaussian, scale=sigma_for_gaussian, size=n_events)

        # No explicit range is given, so the bin edges span this run's own
        # event range and are discarded.
        # NOTE(review): bins of different runs are therefore not aligned on a
        # common axis - confirm this is intended.
        xi, _ = np.histogram(events, bins=n_bins)

        # Signed binomial noise on the upper half of the bins, clipped at zero.
        for j in range(n_bins // 2, n_bins):
            binomial_sample = rng.binomial(xi[j], binom_p)
            chosen_sign = rng.choice([-1, 1])
            xi[j] = max(0, xi[j] + chosen_sign * binomial_sample)

        if dead_bin_indices is not None:
            xi[dead_bin_indices] = 0

        list_of_histograms.append(xi)
        list_of_labels.append(yi)

    return list_of_histograms, list_of_labels

In [None]:
def generate_datasets(num_datasets):
    """Generate ``num_datasets`` synthetic datasets, one per seed.

    Dataset ``k`` is produced by ``generate_synthetic_dataset`` (with its
    default settings) from a generator seeded with ``k``, for
    ``k = 0 .. num_datasets - 1``.

    Parameters
    ----------
    num_datasets : int
        Number of datasets (and seeds) to generate.

    Returns
    -------
    list of tuple
        One ``(seed, histograms, labels)`` tuple per dataset, in seed order.
    """
    datasets = []
    for seed in tqdm(range(num_datasets)):
        rng = np.random.default_rng(seed)
        histograms, labels = generate_synthetic_dataset(rng=rng)
        datasets.append((seed, histograms, labels))  # (seed#, histograms, labels)
    return datasets

In [None]:
# Build all datasets, then split the (seed, histograms, labels) tuples into
# three arrays whose first axis indexes the dataset.
datasets = generate_datasets(1000)
seeds = np.asarray([seed for seed, _, _ in datasets], dtype=np.int32)
histograms = np.asarray([run_histograms for _, run_histograms, _ in datasets], dtype=np.float32)
labels = np.asarray([run_labels for _, _, run_labels in datasets], dtype=np.int64)

In [None]:
# Persist the three arrays in a single compressed .npz archive under the keys
# "seeds", "histograms" and "labels".
np.savez_compressed("datasets.npz", seeds=seeds, histograms=histograms, labels=labels)