Training a BDT to predict $\delta C_9$ on an event-by-event basis

Import Libraries

In [None]:

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt

from library.utilities.plotting import setup_mpl_params
from library.data.datasets.aggregated_signal_binned import Aggregated_Signal_Binned_Dataset

Set Up / Deactivate Fancy Plotting

In [None]:
# Apply the project's matplotlib settings (the "fancy plotting" of this section).
setup_mpl_params()
# To deactivate them, uncomment the next line to restore matplotlib's defaults.
# mpl.rcParams.update(mpl.rcParamsDefault)

Define Helper Functions

In [None]:
def predict_set_likelihood(x, clf):
    """
    Sum the log class probabilities a classifier assigns to a set of events.

    x : ndarray of events (passed unchanged to ``clf.predict_proba``)
    clf : sklearn classifier exposing ``predict_proba``

    Returns an ndarray with one entry per column of ``clf.predict_proba(x)``:
    the sum over all events of the log of that column's probability.
    """
    pred = np.asarray(clf.predict_proba(x), dtype=np.float64)
    # A probability of exactly 0 would give log(0) = -inf (and a
    # RuntimeWarning), which wipes out the whole sum for that class.
    # Flooring at the smallest positive normal float keeps the sum finite
    # and leaves every non-zero probability untouched.
    pred = np.maximum(pred, np.finfo(pred.dtype).tiny)
    return np.sum(np.log(pred), axis=0)


def predict_likelihood_over_bins(x, y, clf):
    """
    Compute the summed log-probabilities separately for the events of each bin.

    x : ndarray of events
    y : ndarray of bins (used to select the events belonging to each bin)
    clf : sklearn classifier

    Returns a 2D ndarray: row i is the result of predict_set_likelihood for
    the events whose entry in y equals the i-th unique value of y.
    """
    bin_labels = np.unique(y)
    rows = [
        np.expand_dims(predict_set_likelihood(x[y == label], clf), axis=0)
        for label in bin_labels
    ]
    stacked = np.concatenate(rows, axis=0)
    # Sanity check: one predicted column per bin present in y.
    assert stacked.shape == (len(bin_labels), len(bin_labels))
    return stacked


def bootstrap_over_bins(x, y, n, rng=None):
    """
    Draw n events, with replacement, from every bin.

    x : ndarray of events
    y : ndarray of bins
    n : number of events to sample from each bin
    rng : random source. May be None (a fresh generator is created for this
        call), an integer seed or anything else accepted by
        ``np.random.default_rng``, or an existing generator object with a
        ``choice`` method, which is used as-is.

    Returns (bootstrap_x, bootstrap_y): the sampled events and their bin
    labels, concatenated in order of increasing bin label.
    """
    # Build the generator per call rather than in the signature: a default
    # created in the signature is evaluated once and shared by all calls.
    if not hasattr(rng, "choice"):
        rng = np.random.default_rng(rng)

    bootstrap_x = []
    bootstrap_y = []
    for bin_label in np.unique(y):
        in_bin = y == bin_label
        pool_x = x[in_bin]
        pool_y = y[in_bin]

        # Indices into this bin's pool; choice samples with replacement
        # by default.
        selection_indices = rng.choice(len(pool_x), n)

        bootstrap_x.append(pool_x[selection_indices])
        bootstrap_y.append(pool_y[selection_indices])

    return np.concatenate(bootstrap_x), np.concatenate(bootstrap_y)


def plot_likelihood_over_bins(predictions_over_bins, bin_values, cmap=plt.cm.viridis):
    """
    Plot one summed-log-probability curve per input bin and mark its maximum.

    predictions_over_bins : ndarray of summed log event probabilities
        (rows are input bins, columns are bin predictions)
    bin_values : ndarray of the value each bin represents
    cmap : matplotlib colormap used to colour each curve by its input bin value
    """

    figure, axes = plt.subplots(layout="constrained")

    # Colour boundaries are the bin values plus one extra edge, placed one
    # (last) bin spacing beyond the final value.
    last_step = bin_values[-1] - bin_values[-2]
    edges = np.append(bin_values, bin_values[-1] + last_step)
    color_norm = mpl.colors.BoundaryNorm(edges, cmap.N)

    for value, curve in zip(bin_values, predictions_over_bins):
        curve_color = cmap(color_norm(value))
        peak_index = np.argmax(curve)
        axes.plot(bin_values, curve, color=curve_color)
        # Highlight the bin with the highest summed log probability.
        axes.scatter(
            bin_values[peak_index],
            np.max(curve),
            color=curve_color,
            edgecolors="black",
            zorder=100,
        )

    mappable = mpl.cm.ScalarMappable(norm=color_norm, cmap=cmap)
    figure.colorbar(mappable, ax=axes, label=r"$\delta C_9$")
    axes.set_xlabel(r"$\delta C_9$")
    axes.set_ylabel(r"$\sum_i \log p(\delta C_9 \;|\; x_i)$")

    plt.show()


def predict_likelihood_over_bootstrapped_trials(x, y, n_trials, n_events, clf):
    """
    Repeat the bootstrap-and-predict procedure and record the winning bin each time.

    x : ndarray of events
    y : ndarray of bins
    n_trials : number of bootstrapped sample sets
    n_events : number of events to bootstrap per bin
    clf : sklearn classifier

    Returns a 2D ndarray with one row per trial; each row holds, for every
    input bin, the index of the column with the largest summed log probability.
    """
    winners_per_trial = []
    for _trial in range(n_trials):
        sample_x, sample_y = bootstrap_over_bins(x, y, n_events)
        log_likelihoods = predict_likelihood_over_bins(sample_x, sample_y, clf)
        winning_bins = np.argmax(log_likelihoods, axis=1)
        # Keep each trial as a (1, n_bins) row so the trials stack along axis 0.
        winners_per_trial.append(np.expand_dims(winning_bins, axis=0))
    return np.concatenate(winners_per_trial)


def plot_prediction_linearity(input_values, avg_pred, stdev_pred, ref_line_buffer, xlim=None, ylim=None, xlabel=None, ylabel=None):
    """
    Scatter the average prediction against the input value, with error bars
    and a slope-1 reference line.

    input_values : value corresponding to each bin index
    avg_pred : ndarray of average prediction per input bin
    stdev_pred : ndarray of standard deviation of prediction per input bin
    ref_line_buffer : extra amount to extend reference line
    xlim : x limits (left unchanged when None)
    ylim : y limits (left unchanged when None)
    xlabel : x-axis label (not set when None)
    ylabel : y-axis label (not set when None)
    """
    _, axes = plt.subplots()

    axes.scatter(
        input_values,
        avg_pred,
        label="Validation Results",
        color="firebrick",
        s=16,
        zorder=5,
    )
    axes.errorbar(
        input_values,
        avg_pred,
        yerr=stdev_pred,
        fmt="none",
        elinewidth=0.5,
        capsize=0.5,
        color="black",
        label="Std. Dev.",
        zorder=10,
    )

    # Two end points are enough for the straight y = x reference line; it is
    # extended past the data range by ref_line_buffer on both sides.
    line_start = np.min(input_values) - ref_line_buffer
    line_stop = np.max(input_values) + ref_line_buffer
    diagonal = np.linspace(line_start, line_stop, 2)
    axes.plot(diagonal, diagonal, label="Ref. Line (Slope = 1)", color="grey", linewidth=0.5, zorder=0)

    if xlim is not None:
        axes.set_xlim(xlim)
    if ylim is not None:
        axes.set_ylim(ylim)

    axes.legend()
    if xlabel is not None:
        axes.set_xlabel(xlabel)
    if ylabel is not None:
        axes.set_ylabel(ylabel)

    plt.show()

Load Data

In [None]:
# Data level forwarded to the dataset loader
# (presumably generator-level events — confirm against the dataset class).
level = "gen"

# Load the "train" and "eval" splits from the same processed-data directory.
train_dataset = Aggregated_Signal_Binned_Dataset()
train_dataset.load(level, "train", "../../state/new_physics/data/processed")
eval_dataset = Aggregated_Signal_Binned_Dataset()
eval_dataset.load(level, "eval", "../../state/new_physics/data/processed")

# Both splits must carry identical bins, since a single bin_values array
# is used for both of them below.
np.testing.assert_equal(train_dataset.bins, eval_dataset.bins)
bin_values = train_dataset.bins

# Convert features and labels to numpy arrays for scikit-learn
# (NOTE(review): .numpy() suggests these are torch tensors — confirm).
train_x = train_dataset.feat.numpy()
train_y = train_dataset.labels.numpy()

eval_x = eval_dataset.feat.numpy()
eval_y = eval_dataset.labels.numpy()

Observe Class Balance

In [None]:
# Training data: number of events per unique label
# (the x-axis is the position of the label in sorted order, not its value).
bins, counts = np.unique(train_y, return_counts=True)
plt.plot(counts)
plt.show()

# Evaluation data: same per-label event count.
bins, counts = np.unique(eval_y, return_counts=True)
plt.plot(counts)
plt.show()

Balance Classes

In [None]:

# Maximum number of events kept per bin in each split.
n_events_train = 200_000
n_events_eval = 75_000

# The two splits must contain exactly the same set of bin labels.
bins_train = np.unique(train_y)
bins_eval = np.unique(eval_y)
np.testing.assert_equal(bins_train, bins_eval)

# Keep only the first n events of every bin. A bin holding fewer than n events
# is kept whole, so it ends up smaller than the others.
train_x_trimmed = np.concatenate([train_x[train_y==b][:n_events_train] for b in bins_train])
train_y_trimmed = np.concatenate([train_y[train_y==b][:n_events_train] for b in bins_train])

# bins_train is reused here; it was asserted equal to bins_eval above.
eval_x_trimmed = np.concatenate([eval_x[eval_y==b][:n_events_eval] for b in bins_train])
eval_y_trimmed = np.concatenate([eval_y[eval_y==b][:n_events_eval] for b in bins_train])

# Observe - training data (per-bin event counts after trimming)
bins, counts = np.unique(train_y_trimmed, return_counts=True)
plt.plot(counts)
plt.show()

# Observe - evaluation data (per-bin event counts after trimming)
bins, counts = np.unique(eval_y_trimmed, return_counts=True)
plt.plot(counts)
plt.show()

Fit the BDT

In [None]:
# Fit the gradient-boosted classifier on the trimmed training set, with the
# bin labels as classes; max_iter caps the number of boosting iterations.
clf = HistGradientBoostingClassifier(max_iter=100, verbose=5).fit(train_x_trimmed, train_y_trimmed)

Evaluate Model Performance

In [None]:
# On all data: summed log-probabilities for every bin of the trimmed
# evaluation set, then one curve per input bin.
preds = predict_likelihood_over_bins(eval_x_trimmed, eval_y_trimmed, clf)
plot_likelihood_over_bins(preds, bin_values)

In [None]:
# On bootstrapped data: resample 24,000 events per bin from the trimmed
# evaluation set and recompute the summed log-probabilities.
boot_x, boot_y = bootstrap_over_bins(eval_x_trimmed, eval_y_trimmed, 24_000)
preds = predict_likelihood_over_bins(boot_x, boot_y, clf)

In [None]:
# Plot whatever `preds` currently holds (the bootstrapped sample when the
# cells are run in order).
plot_likelihood_over_bins(preds, bin_values)

In [None]:
# Over multiple bootstrapped trials

# 100 trials of 24,000 bootstrapped events per bin; each row of the result
# holds the predicted bin index for every input bin in one trial.
pred_bins_over_trials = predict_likelihood_over_bootstrapped_trials(eval_x_trimmed, eval_y_trimmed, 100, 24_000, clf)

# Translate predicted bin indices into the values those bins represent, then
# take the mean and standard deviation across trials (axis 0) per input bin.
pred_values_over_trials = bin_values[pred_bins_over_trials]
avg_pred_values_over_trials = np.mean(pred_values_over_trials, axis=0)
stdev_pred_values_over_trials = np.std(pred_values_over_trials, axis=0)

In [None]:
# Average predicted value against the actual bin value, with the spread across
# trials as error bars; both axes use the same fixed limits.
plot_prediction_linearity(
    bin_values, 
    avg_pred_values_over_trials, 
    stdev_pred_values_over_trials, 
    ref_line_buffer=0.05, 
    xlim=(-2.25, 1.35), 
    ylim=(-2.25, 1.35), 
    xlabel=r"Actual $\delta C_9$", 
    ylabel=r"Predicted $\delta C_9$"
)