In [1]:
# Add import statements here
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Small positive constant shared by the functions in this notebook. It is used
# (a) as an additive guard in denominators to prevent division by zero,
# (b) as a floor when clamping variances before taking a square root, and
# (c) as the threshold below which a histogram integral is treated as empty.
EPSILON = 1e-10

In [3]:
def reference_init(nb_bins: int) -> tuple:
    """
    Initializes the reference histogram (mu) and its per-bin Poisson uncertainties (sigma_mu_p).

    The reference starts as a uniform distribution over ``nb_bins`` bins, normalized to unity.

    Args:
        nb_bins: The number of bins in the histograms. Must be at least 1.

    Returns:
        A tuple of three items:
            - mu0 (np.ndarray): The initial normalized uniform reference histogram.
            - sigma_mu0_p (np.ndarray): The initial bin-wise Poisson uncertainty
              (standard deviation) for mu0.
            - I_mu0 (float): The integral (sum over all bins) of mu0, which is used
              as the normalization in the variance formula.

    Raises:
        ValueError: If ``nb_bins`` is smaller than 1.
    """
    # Reject non-positive bin counts up front; otherwise 1.0 / nb_bins fails with an
    # uninformative ZeroDivisionError (nb_bins == 0) or np.full rejects the shape.
    if nb_bins < 1:
        raise ValueError(f"nb_bins must be a positive integer, got {nb_bins!r}")

    # Initialize mu as a uniform distribution, normalized to unity
    mu0 = np.full(nb_bins, 1.0 / nb_bins, dtype=float)
    I_mu0 = np.sum(mu0)

    # Bin-wise variance of the initial reference: mu / I - mu^2 / I
    variance_mu0_p = mu0 / I_mu0 - (mu0**2) / I_mu0
    # Floor the variance at EPSILON so the square root is always taken of a strictly
    # positive number (with a single bin the formula evaluates to exactly zero).
    variance_mu0_p = np.maximum(variance_mu0_p, EPSILON)
    sigma_mu0_p = np.sqrt(variance_mu0_p)

    return mu0, sigma_mu0_p, I_mu0

In [4]:
def poisson_uncertainty(x_tilde_i: np.ndarray, i_x_i: float) -> np.ndarray:
    """
    Returns the per-bin Poisson standard deviation of a normalized histogram.

    Args:
        x_tilde_i: The normalized histogram (per-bin fractions) of the run.
        i_x_i: The integral of the original, unnormalized histogram of the run.

    Returns:
        np.ndarray: Per-bin standard deviation with the same shape as x_tilde_i.
        Every entry is infinite when i_x_i does not exceed EPSILON.
    """
    # A histogram without events carries no information: report infinite uncertainty.
    if i_x_i <= EPSILON:
        return np.full_like(x_tilde_i, np.inf)

    # Variance per bin is x / I - x^2 / I, evaluated as two separate terms.
    linear_term = x_tilde_i / i_x_i
    quadratic_term = (x_tilde_i ** 2) / i_x_i

    # Floor at EPSILON so empty bins and rounding noise never yield sqrt(<= 0).
    floored_variance = np.maximum(linear_term - quadratic_term, EPSILON)
    return np.sqrt(floored_variance)

In [5]:
def ewma_init(alpha: float, mu0: np.ndarray, sigma_mu0_p: np.ndarray) -> tuple:
    """
    Builds the starting values of the three EWMA accumulators from the initial reference.

    Args:
        alpha: The EWMA smoothing factor; the initial terms are scaled by (1 - alpha).
        mu0: The initial normalized reference histogram.
        sigma_mu0_p: The initial bin-wise Poisson uncertainty (standard deviation) of mu0.

    Returns:
        A tuple (W0, S_mu0, S_sigma_mu0):
            - W0 (np.ndarray): Initial accumulated weight per bin.
            - S_mu0 (np.ndarray): Initial weighted accumulator of the histogram values.
            - S_sigma_mu0 (np.ndarray): Initial weighted accumulator of the variances.
    """
    variance0 = sigma_mu0_p ** 2

    # Inverse-variance weight per bin; EPSILON keeps the denominator away from zero.
    omega0 = 1.0 / (variance0 + EPSILON)

    # All three accumulators share the common factor (1 - alpha) * omega0.
    W0 = (1.0 - alpha) * omega0
    S_mu0 = W0 * mu0
    S_sigma_mu0 = W0 * variance0

    return W0, S_mu0, S_sigma_mu0

In [6]:
def compute_chi2_pull(x_tilde_i: np.ndarray, mu_i: np.ndarray, 
                       sigma_mu_i: np.ndarray, sigma_x_tilde_i_p: np.ndarray) -> tuple:
    """
    Scores one run against the reference: reduced chi-squared plus per-bin pulls.

    Args:
        x_tilde_i: Normalized histogram of the run being scored.
        mu_i: Reference histogram the run is compared to.
        sigma_mu_i: Bin-wise standard deviation of the reference.
        sigma_x_tilde_i_p: Bin-wise Poisson standard deviation of x_tilde_i.

    Returns:
        A tuple (chi2_nu, pull):
            - chi2_nu (float): Sum over bins of squared residual / combined variance,
              divided by the number of bins.
            - pull (np.ndarray): Per-bin residual divided by the combined standard deviation.
    """
    bin_count = len(x_tilde_i)

    # Combined variance of run and reference, floored at EPSILON so that no bin
    # can produce a division by zero.
    combined_variance = np.maximum(sigma_x_tilde_i_p ** 2 + sigma_mu_i ** 2, EPSILON)

    # The same residual feeds both the chi-squared sum and the pulls.
    residual = x_tilde_i - mu_i

    chi2_terms = residual ** 2 / combined_variance
    chi2_nu = (1.0 / bin_count) * np.sum(chi2_terms)

    pull = residual / np.sqrt(combined_variance)

    return chi2_nu, pull

In [7]:
def update_reference(alpha: float, x_tilde_i: np.ndarray, I_x_i: float, 
                       mu_i: np.ndarray, sigma_mu_i: np.ndarray, 
                       W_i: np.ndarray, S_mu_i: np.ndarray, S_sigma_mu_i: np.ndarray) -> tuple:
    """
    Folds one run into the EWMA accumulators and derives the new reference from them.

    Args:
        alpha: The EWMA smoothing factor; previous accumulators are scaled by alpha
            and the new run's contribution by (1 - alpha).
        x_tilde_i: The normalized histogram of the run being folded in.
        I_x_i: The integral of that run's original, unnormalized histogram.
        mu_i: The reference histogram before the update.
        sigma_mu_i: The reference uncertainty before the update (not read by this function).
        W_i, S_mu_i, S_sigma_mu_i: The EWMA accumulators before the update.

    Returns:
        A tuple (mu_i_plus_1, sigma_mu_i_plus_1, W_i_plus_1, S_mu_i_plus_1, S_sigma_mu_i_plus_1):
            - mu_i_plus_1 (np.ndarray): The new reference histogram.
            - sigma_mu_i_plus_1 (np.ndarray): The new reference uncertainty.
            - W_i_plus_1, S_mu_i_plus_1, S_sigma_mu_i_plus_1 (np.ndarray): The updated accumulators.
    """
    # Inverse-variance weight of the run, built from its Poisson uncertainty.
    run_sigma = poisson_uncertainty(x_tilde_i, I_x_i)
    omega_i = 1.0 / (run_sigma ** 2 + EPSILON)

    # Contribution factor shared by all three accumulator updates.
    fresh_weight = (1.0 - alpha) * omega_i

    W_next = alpha * W_i + fresh_weight
    S_mu_next = alpha * S_mu_i + fresh_weight * x_tilde_i
    # The squared deviation is measured against the reference *before* this update.
    S_sigma_next = alpha * S_sigma_mu_i + fresh_weight * (x_tilde_i - mu_i) ** 2

    # Normalize the accumulators by the total weight to obtain the new reference.
    mu_next = S_mu_next / W_next
    sigma_mu_next = np.sqrt(S_sigma_next / W_next)

    return mu_next, sigma_mu_next, W_next, S_mu_next, S_sigma_next

In [8]:
def dinamo_s_algorithm(histograms: list[np.ndarray], labels: list[int], 
                       alpha: float, nb_bins: int) -> dict:
    """
    The main DINAMO-S algorithm for Dynamic and INterpretable Anomaly MOnitoring.
    It processes a sequence of histograms, calculates anomaly scores, and adaptively updates
    a reference histogram based on 'good' runs.

    Each run is scored against the reference as it stood *before* that run; the
    reference is updated afterwards, and only when the run's label is 0.

    Args:
        histograms (list[np.ndarray]): A list of 1D numpy arrays, where each array is an unnormalized
                                       histogram (counts) for a specific run.
        labels (list[int]): A list of labels for each run (0 for good, anything else is
                            treated as bad and leaves the reference untouched).
        alpha (float): The smoothing factor for the EWMA, passed through to ewma_init
                       and update_reference.
        nb_bins (int): The number of bins in each histogram.

    Returns:
        dict: A dictionary containing lists of results:
            - 'chi2_scores': The reduced chi-squared anomaly score for each run.
            - 'pull_values': Bin-wise pull values for each run.
            - 'mu_references': The reference histogram (mu); the first entry is the initial
              reference, followed by one entry per processed run.
            - 'sigma_mu_references': The uncertainty (sigma_mu) of the reference, laid out
              like 'mu_references'.

    Note:
        Runs are paired with labels via zip, so if the two lists differ in length the
        surplus entries of the longer one are ignored.
    """
    # Initialize the reference histogram (mu), its uncertainty (sigma_mu), and EWMA accumulators.
    # reference_init may return extra items after (mu0, sigma_mu0_p) -- currently the
    # integral of mu0 -- which are not needed here, so they are discarded.
    mu_current, sigma_mu_current, *_ = reference_init(nb_bins)
    W_current, S_mu_current, S_sigma_mu_current = ewma_init(alpha, mu_current, sigma_mu_current)

    # Lists to store the results for each run
    chi2_scores = []
    pull_values = []
    mu_references = [mu_current.copy()] # Store the initial reference state
    sigma_mu_references = [sigma_mu_current.copy()] # Store the initial uncertainty state

    # Iterate through each run (histogram and its label)
    for hist_i, label_i in zip(histograms, labels):
        # 1. Normalize the histogram to unity and get its total integral (sum of original counts)
        I_x_i = np.sum(hist_i)
        if I_x_i <= EPSILON: # Handle cases where a histogram is empty (no events)
            x_tilde_i = np.full(nb_bins, 0.0)
            sigma_x_tilde_i_p = np.full(nb_bins, np.inf) # Assign infinite uncertainty
        else:
            x_tilde_i = hist_i / I_x_i
            # Compute Poisson uncertainty for the current normalized histogram
            sigma_x_tilde_i_p = poisson_uncertainty(x_tilde_i, I_x_i)

        # 2. Compute the anomaly score (reduced chi-squared) and bin-wise pull values
        chi2_nu, pull_i = compute_chi2_pull(x_tilde_i, mu_current, sigma_mu_current, sigma_x_tilde_i_p)

        chi2_scores.append(chi2_nu)
        pull_values.append(pull_i)

        # 3. Update the reference only if the run is confirmed "good" (label_i == 0)
        if label_i == 0:
            mu_current, sigma_mu_current, W_current, S_mu_current, S_sigma_mu_current = \
                update_reference(alpha, x_tilde_i, I_x_i, 
                                  mu_current, sigma_mu_current, 
                                  W_current, S_mu_current, S_sigma_mu_current)
        # Otherwise the reference remains unchanged

        # Store the reference state after processing the current run and potential update
        mu_references.append(mu_current.copy())
        sigma_mu_references.append(sigma_mu_current.copy())

    return {
        'chi2_scores': chi2_scores,
        'pull_values': pull_values,
        'mu_references': mu_references, 
        'sigma_mu_references': sigma_mu_references
    }