In [1]:
import json
import optuna
import numpy as np
import pandas as pd

from tabrepo import load_repository
from probmetrics.calibrators import get_calibrator

import warnings
warnings.simplefilter("ignore")

We solve the following convex optimization problem:
$$
\frac{1}{n} \sum_{i=1}^n \ell(S((aI + \text{diag}(\mathbf{v}) + \mathbf{M} \odot (\mathbf{1}\mathbf{1}^\top-I)) \mathbf{z}_i + \mathbf{b}), y_i) + \lambda_\mathbf{b} \frac{k^\rho}{n^\tau}\|\mathbf{b}\|_\delta + \lambda_\mathbf{v} \frac{k^\rho}{n^\tau}\|\mathbf{v}\|_\delta + \lambda_M \frac{((k-1)k)^\rho}{n^\tau}\|\mathbf{M}\|_\delta \quad .
$$

We experiment with the following parameters:
- $\delta$, the type of norm: $1$ (lasso), $2$ (group lasso), or squared $2$-norm (ridge).
- $\rho$, the power of the number of parameters, in $\{-1, -0.5, 0, 0.5, 1 \}$.
- $\tau$, the power of the number of samples, in $\{0.5, 1, 1.5, 2 \}$.
- $\lambda_\mathbf{b}$, $\lambda_\mathbf{v}$, $\lambda_M$ the intercept, diagonal and off-diagonal regularization coefficients in the range $[0.01, 100]$.

In [2]:
# Benchmark context holding the cached validation/test predictions and labels.
repo = load_repository("D244_F3_C1530_200")

# One row per evaluated (dataset, fold); the first CSV column is used as the index.
# The 'dataset', 'fold' and 'tuned_config' columns are read further down.
configs = pd.read_csv("results/multiclass/configs.csv", index_col=0)

start: Loading ZS Context
Loading BenchmarkContext:
	name: D244_F3_C1530_200
	description: Large-scale Benchmark on 244 datasets and 3 folds (120 GB, 200 smallest datasets)
	date: 2023_11_14
	folds: [0, 1, 2]
All required files are present...
Loading input files...
	configs :           /Users/eberta/.cache/tabrepo/data/results/2023_11_14/configs.parquet
	metadata:           /Users/eberta/.cache/tabrepo/data/results/2023_11_14/task_metadata.csv
Loading baselines: /Users/eberta/.cache/tabrepo/data/results/2023_11_14/baselines.parquet
Loading config hyperparameter definitions... Note: Hyperparameter definitions are only accurate for the latest version.
Loading ZS inputs:
	pred_proba:  /Users/eberta/.cache/tabrepo/data/results/2023_11_14/model_predictions/

Aligning GroundTruth with TabularPredictions... (Dataset count 200 -> 200)
Time for Loading ZS Context: 8.9770 secs


In [3]:
def brier_score_multiclass(probs, labels):
    """Return the mean multiclass Brier score.

    Parameters
    ----------
    probs : 2-D array, one row of class probabilities per sample.
    labels : integer class index of each sample (used to index the columns
        of ``probs``).

    Returns
    -------
    The average over samples of the squared Euclidean distance between the
    predicted probability vector and the one-hot encoding of the label.
    """
    n_rows, _n_classes = probs.shape

    # One-hot targets with the same shape and dtype as the predictions.
    targets = np.zeros_like(probs)
    targets[np.arange(n_rows), labels] = 1

    per_sample_error = ((probs - targets) ** 2).sum(axis=-1)
    return per_sample_error.mean()

In [4]:
def benchmark_multiclass_calibrator(cal, configs, max_cal_samples=10000, seed=123):
    """Measure how much a calibrator changes the test Brier score of each tuned model.

    For every row of ``configs`` the validation predictions/labels of the row's
    model are read from the notebook-level ``repo``, ``cal`` is fitted on them,
    and the Brier score of the calibrated test predictions is compared with the
    Brier score of the raw test predictions.

    Parameters
    ----------
    cal : object with ``fit(probs, labels)`` and ``predict_proba(probs)``.
        It is refitted (in place) once per row.
    configs : pandas.DataFrame
        Must provide the columns ``'dataset'``, ``'fold'`` and ``'tuned_config'``.
    max_cal_samples : int, default 10000
        Calibration sets with more rows than this are subsampled (without
        replacement) down to exactly this many rows.
    seed : int, default 123
        Seed of the subsampling generator. A fresh generator is created for
        every subsampled dataset, so the drawn rows depend only on ``seed``
        and the calibration-set size.

    Returns
    -------
    numpy.ndarray
        One value per row of ``configs``: ``cal_score - base_score``.
        Negative values mean the calibrator lowered (improved) the Brier score.
    """
    improvements = []

    for _, row in configs.iterrows():
        dataset, fold, config = row['dataset'], row['fold'], row['tuned_config']

        p_cal = repo.predict_val(dataset=dataset, fold=fold, config=config)
        y_cal = repo.labels_val(dataset=dataset, fold=fold)
        p_test = repo.predict_test(dataset=dataset, fold=fold, config=config)
        y_test = repo.labels_test(dataset=dataset, fold=fold)

        n_cal, _ = p_cal.shape

        # Subsample large calibration sets. A local legacy RandomState yields
        # the same draw as `np.random.seed(seed); np.random.choice(...)` but
        # leaves the process-wide NumPy RNG untouched.
        if n_cal > max_cal_samples:
            rng = np.random.RandomState(seed)
            idx = np.arange(0, n_cal)
            rand_idx = rng.choice(idx, max_cal_samples, replace=False)
            p_cal = p_cal[rand_idx]
            y_cal = y_cal[rand_idx]

        base_score = brier_score_multiclass(p_test, y_test)

        cal.fit(p_cal, y_cal)
        cal_score = brier_score_multiclass(cal.predict_proba(p_test), y_test)

        improvements.append(cal_score - base_score)

    return np.array(improvements)

In [5]:
# Splitting datasets in two for gridsearch and benchmark using ts effect:
# datasets are ranked by the mean Brier-score change obtained with 'ts-mix',
# then even ranks go to the search split and odd ranks to the bench split, so
# both halves span the same range of calibration difficulty.
ts = get_calibrator('ts-mix')
ts_improvement = benchmark_multiclass_calibrator(ts, configs)
configs['ts_improvement'] = ts_improvement

# Average over folds: one row (and one mean improvement) per dataset.
df = configs.groupby(['dataset']).agg({'ts_improvement':'mean'}).reset_index()

datasets = df.dataset.tolist()
ts_effect = df.ts_improvement.tolist()

arg = np.argsort(ts_effect)

search_datasets = [datasets[i] for i in arg[::2]]
# Odd ranks are exactly the complement of the even ranks (dataset names are
# unique after the groupby). Indexing them directly, instead of going through
# a set difference, keeps the list order deterministic across kernel sessions.
bench_datasets = [datasets[i] for i in arg[1::2]]

search_configs = configs[configs.dataset.isin(search_datasets)]
bench_configs = configs[configs.dataset.isin(bench_datasets)]

In [6]:
# Persist both dataset lists so the split can be reloaded elsewhere.
split_files = (
    ("results/multiclass/search_datasets.json", search_datasets),
    ("results/multiclass/bench_datasets.json", bench_datasets),
)
for split_path, split_names in split_files:
    with open(split_path, 'w') as f:
        json.dump(split_names, f, indent=2)

Evaluating ts on search datasets for reference:

In [7]:
# Reference score: 'ts-mix' calibrator evaluated on the search split only.
ts = get_calibrator('ts-mix')
ts_improvement = benchmark_multiclass_calibrator(ts, search_configs)
ts_mean_improvement = ts_improvement.mean()
print(f'ts mean absolute improvement {ts_mean_improvement:.5f}')

ts mean absolute improvement -0.00274


Evaluating ts on bench datasets for reference:

In [None]:
# Reference score: 'ts-mix' calibrator evaluated on the bench split only.
ts = get_calibrator('ts-mix')
ts_improvement = benchmark_multiclass_calibrator(ts, bench_configs)
ts_mean_improvement = ts_improvement.mean()
print(f'ts mean absolute improvement {ts_mean_improvement:.5f}')

ts mean absolute improvement -0.00201


# SVS parameter search

In [9]:
def objective(trial):
    """Optuna objective for the SVS calibrator.

    Samples one regularization setting, builds the corresponding 'svs'
    calibrator and returns its mean Brier-score change over ``search_configs``
    (lower is better; negative means calibration helped).
    """
    rho = trial.suggest_categorical('rho', [-1.0, -0.5, 0.0, 0.5, 1.0])
    tau = trial.suggest_categorical('tau', [0.5, 1.0, 1.5, 2.0])
    lambda_intercept = trial.suggest_float('lambda_intercept', 1e-2, 1e2, log=True)
    lambda_diagonal = trial.suggest_float('lambda_diagonal', 1e-2, 1e2, log=True)

    # NOTE(review): `svs_lambda_intercet` looks like a misspelling of
    # "intercept". Confirm the keyword against `get_calibrator`; if unknown
    # keywords are silently ignored, `lambda_intercept` is never applied.
    svs = get_calibrator(
        'svs',
        svs_rho = rho,
        svs_tau = tau,
        svs_lambda_intercet = lambda_intercept,
        svs_lambda_diagonal = lambda_diagonal
    )
    improvement = benchmark_multiclass_calibrator(svs, search_configs)

    return improvement.mean()

# The objective is a score difference to be minimized. The sampler is seeded
# so that a fresh run proposes the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=100)

[I 2026-01-08 16:50:15,574] A new study created in memory with name: no-name-b711e93b-6c47-4df9-b1a3-f6138adc742b
[I 2026-01-08 16:50:33,889] Trial 0 finished with value: -0.0044060442596673965 and parameters: {'rho': 0.5, 'tau': 2.0, 'lambda_intercept': 0.21714111308363937, 'lambda_diagonal': 0.2921769907045169}. Best is trial 0 with value: -0.0044060442596673965.
[I 2026-01-08 16:50:52,079] Trial 1 finished with value: -0.004342087544500828 and parameters: {'rho': 0.0, 'tau': 2.0, 'lambda_intercept': 30.286764096294817, 'lambda_diagonal': 0.02008738406263097}. Best is trial 0 with value: -0.0044060442596673965.
[I 2026-01-08 16:51:10,667] Trial 2 finished with value: -0.004306043032556772 and parameters: {'rho': -0.5, 'tau': 2.0, 'lambda_intercept': 0.2656215854829775, 'lambda_diagonal': 2.7365211644888356}. Best is trial 0 with value: -0.0044060442596673965.
[I 2026-01-08 16:51:27,555] Trial 3 finished with value: -0.004192142281681299 and parameters: {'rho': -1.0, 'tau': 1.0, 'lamb

In [10]:
# Display the hyperparameters of the best SVS trial found by the search above.
study.best_params

{'rho': 0.5,
 'tau': 1.5,
 'lambda_intercept': 18.257300443238925,
 'lambda_diagonal': 0.498580415150605}

In [11]:
# SVS with the library defaults, used as the comparison point.
svs_default = get_calibrator('svs')

# SVS with the best hyperparameters from the search.
# NOTE: `study` is rebound by the SMS search further down; run this cell
# before that one, otherwise the SMS result is read here.
svs_best_params = study.best_params
svs_tuned = get_calibrator(
    'svs',
    svs_rho=svs_best_params['rho'],
    svs_tau=svs_best_params['tau'],
    svs_lambda_intercet=svs_best_params['lambda_intercept'],
    svs_lambda_diagonal=svs_best_params['lambda_diagonal'],
)

In [12]:
# Compare default vs tuned SVS on each split. The trailing text reproduces the
# blank line that separates the two report sections.
svs_report_plan = (
    ('On search datasets:', search_configs, '\n'),
    ('On bench datasets:', bench_configs, ''),
)
for heading, split_configs, tail in svs_report_plan:
    print(heading)
    improvement = benchmark_multiclass_calibrator(svs_default, split_configs)
    print(f'svs default mean improvement: {improvement.mean():.5f}')
    improvement = benchmark_multiclass_calibrator(svs_tuned, split_configs)
    print(f'svs tuned mean improvement:   {improvement.mean():.5f}' + tail)

On search datasets:
svs default mean improvement: -0.00389
svs tuned mean improvement:   -0.00458

On bench datasets:
svs default mean improvement: -0.00281
svs tuned mean improvement:   -0.00274


# SMS parameter search

In [13]:
def objective(trial):
    """Optuna objective for the SMS calibrator.

    Samples one regularization setting (including the off-diagonal
    coefficient), builds the corresponding 'sms' calibrator and returns its
    mean Brier-score change over ``search_configs`` (lower is better;
    negative means calibration helped).
    """
    rho = trial.suggest_categorical('rho', [-1.0, -0.5, 0.0, 0.5, 1.0])
    tau = trial.suggest_categorical('tau', [0.5, 1.0, 1.5, 2.0])
    lambda_intercept = trial.suggest_float('lambda_intercept', 1e-2, 1e2, log=True)
    lambda_diagonal = trial.suggest_float('lambda_diagonal', 1e-2, 1e2, log=True)
    lambda_off_diagonal = trial.suggest_float('lambda_off_diagonal', 1e-2, 1e2, log=True)

    # NOTE(review): `sms_lambda_intercet` looks like a misspelling of
    # "intercept". Confirm the keyword against `get_calibrator`; if unknown
    # keywords are silently ignored, `lambda_intercept` is never applied.
    sms = get_calibrator(
        'sms',
        sms_rho = rho,
        sms_tau = tau,
        sms_lambda_intercet = lambda_intercept,
        sms_lambda_diagonal = lambda_diagonal,
        sms_lambda_off_diagonal = lambda_off_diagonal
    )
    improvement = benchmark_multiclass_calibrator(sms, search_configs)

    return improvement.mean()

# The objective is a score difference to be minimized. The sampler is seeded
# so that a fresh run proposes the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=50)

[I 2026-01-08 17:24:18,017] A new study created in memory with name: no-name-d36928fd-ae0e-4a64-9431-cc1dda8ae8cd
[I 2026-01-08 17:27:53,434] Trial 0 finished with value: 0.003714804208044077 and parameters: {'rho': 0.0, 'tau': 2.0, 'lambda_intercept': 94.59398613565904, 'lambda_diagonal': 0.02036092790367531, 'lambda_off_diagonal': 0.011336096665675045}. Best is trial 0 with value: 0.003714804208044077.
[I 2026-01-08 17:31:02,997] Trial 1 finished with value: -0.0007599393538375657 and parameters: {'rho': -0.5, 'tau': 1.5, 'lambda_intercept': 0.516602266345914, 'lambda_diagonal': 1.3181706956630839, 'lambda_off_diagonal': 37.13741863683784}. Best is trial 1 with value: -0.0007599393538375657.
[I 2026-01-08 17:34:25,744] Trial 2 finished with value: 0.0020947678097399144 and parameters: {'rho': -0.5, 'tau': 1.5, 'lambda_intercept': 0.19482809298484272, 'lambda_diagonal': 0.01113256441691698, 'lambda_off_diagonal': 1.1679975694303477}. Best is trial 1 with value: -0.0007599393538375657.

In [14]:
# SMS with the library defaults, used as the comparison point.
sms_default = get_calibrator('sms')

# SMS with the best hyperparameters from the most recent study.
sms_best_params = study.best_params
sms_tuned = get_calibrator(
    'sms',
    sms_rho=sms_best_params['rho'],
    sms_tau=sms_best_params['tau'],
    sms_lambda_intercet=sms_best_params['lambda_intercept'],
    sms_lambda_diagonal=sms_best_params['lambda_diagonal'],
    sms_lambda_off_diagonal=sms_best_params['lambda_off_diagonal'],
)

In [15]:
# Compare default vs tuned SMS on each split. The trailing text reproduces the
# blank line that separates the two report sections.
sms_report_plan = (
    ('On search datasets:', search_configs, '\n'),
    ('On bench datasets:', bench_configs, ''),
)
for heading, split_configs, tail in sms_report_plan:
    print(heading)
    improvement = benchmark_multiclass_calibrator(sms_default, split_configs)
    print(f'sms default mean improvement: {improvement.mean():.5f}')
    improvement = benchmark_multiclass_calibrator(sms_tuned, split_configs)
    print(f'sms tuned mean improvement:   {improvement.mean():.5f}' + tail)

On search datasets:
sms default mean improvement: -0.00566
sms tuned mean improvement:   -0.00621

On bench datasets:
sms default mean improvement: -0.00360
sms tuned mean improvement:   -0.00355
