In [1]:
from doubt import Boot
from doubt.datasets import (Airfoil, Blog, Concrete, CPU, 
                            FacebookComments, FishBioconcentration,
                            FishToxicity, ForestFire, NewTaipeiHousing,
                            PowerPlant, Protein, Servo,
                            SpaceShuttle)
from mapie.regression import MapieRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import (LinearRegression, PoissonRegressor, 
                                  GammaRegressor, HuberRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from collections import defaultdict

## Setting up

In [2]:
def evaluate_nasa(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05, random_seed=4242):
    """Evaluate bootstrap prediction intervals built from training residuals.

    The model is first fitted on the full training set to obtain point
    predictions and training residuals. It is then refitted on
    ``int(sqrt(n_train))`` bootstrap resamples of the training set, and each
    refit predicts the test set. For every test point the centred bootstrap
    predictions are added to every training residual; the empirical
    ``uncertainty / 2`` and ``1 - uncertainty / 2`` quantiles of those sums,
    shifted by the point prediction, form the interval.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place, so on return it is fitted on the last
            bootstrap resample rather than on the full training set.
        X_tr: Training features, indexed as ``X_tr[idxs, :]`` with an
            integer index array.
        X_te: Test features.
        y_tr: Training targets, indexed with an integer index array.
        y_te: Test targets.
        uncertainty: Miscoverage level of the interval (0.05 requests the
            2.5% and 97.5% quantiles).
        random_seed: Seed of the random generator drawing the bootstrap
            resamples. Defaults to 4242.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the mean interval width.
    """
    n_train = X_tr.shape[0]
    n_test = X_te.shape[0]
    n_boots = int(np.sqrt(n_train))

    # Point predictions and residuals come from the fit on the full training set
    model.fit(X_tr, y_tr)
    tr_preds = model.predict(X_tr)
    te_preds = model.predict(X_te)
    tr_residuals = np.ravel(y_tr - tr_preds)

    # Initialise random number generator
    rng = np.random.default_rng(random_seed)

    # Refit on bootstrap resamples to estimate the model variance on the test set
    bootstrap_preds = np.empty((n_boots, n_test))
    for boot_idx in range(n_boots):
        # An integer population is equivalent to sampling from arange(n_train)
        train_idxs = rng.choice(n_train, size=n_train, replace=True)
        model.fit(X_tr[train_idxs, :], y_tr[train_idxs])
        bootstrap_preds[boot_idx] = model.predict(X_te)

    # Centre the bootstrapped predictions across the bootstrap dimension
    # (mean minus prediction, i.e. the sign convention of the original code)
    centred_preds = np.mean(bootstrap_preds, axis=0) - bootstrap_preds

    # Pair every centred bootstrap prediction with every training residual.
    # Broadcasting (n_boots, 1, n_test) + (1, n_train, 1) and flattening the
    # first two axes gives one row per (bootstrap, residual) pair, in the same
    # order as a nested Python loop but without the interpreter overhead.
    noise = (
        centred_preds[:, np.newaxis, :] + tr_residuals[np.newaxis, :, np.newaxis]
    ).reshape(-1, n_test)

    # Per-test-point lower/upper offsets, shape (2, n_test)
    offsets = np.quantile(noise, q=[uncertainty / 2, 1 - uncertainty / 2], axis=0)

    # Shift the offsets by the point predictions -> intervals of shape (n_test, 2)
    intervals = np.expand_dims(te_preds, -1) + np.transpose(offsets)

    lower, upper = intervals[:, 0], intervals[:, 1]
    coverage = np.mean((y_te > lower) & (y_te < upper))
    mean_width = np.mean(upper - lower)
    return coverage, mean_width

In [3]:
def evaluate_doubt(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate the prediction intervals produced by ``doubt.Boot``.

    ``model`` is wrapped in ``Boot`` (seeded with 4242), fitted on the
    training split and asked for intervals on the test split. Both the fit
    and the prediction use ``int(sqrt(len(X_tr)))`` bootstrap resamples.

    Args:
        model: Base estimator handed to ``Boot``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets.
        y_te: Test targets.
        uncertainty: Miscoverage level forwarded to ``Boot.predict``.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the mean interval width.
    """
    n_resamples = int(np.sqrt(len(X_tr)))

    boot_wrapper = Boot(model, random_seed=4242)
    boot_wrapper.fit(X_tr, y_tr, n_boots=n_resamples)

    # The point predictions are returned alongside the intervals but are not used
    _point_preds, intervals = boot_wrapper.predict(
        X_te, uncertainty=uncertainty, n_boots=n_resamples
    )

    lower = intervals[:, 0]
    upper = intervals[:, 1]
    inside = (y_te > lower) & (y_te < upper)
    return np.mean(inside), np.mean(upper - lower)

In [4]:
def evaluate_mapie(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate the prediction intervals produced by ``MapieRegressor``.

    ``model`` is wrapped in ``MapieRegressor`` with its default settings,
    fitted on the training split and asked for intervals on the test split
    at ``alpha=uncertainty``.

    Args:
        model: Base estimator handed to ``MapieRegressor``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets.
        y_te: Test targets.
        uncertainty: Miscoverage level, forwarded as ``alpha``.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the mean interval width.
    """
    mapie_model = MapieRegressor(model)
    mapie_model.fit(X_tr, y_tr)

    # The point predictions are returned alongside the intervals but are not used
    _point_preds, intervals = mapie_model.predict(X_te, alpha=uncertainty)

    # `intervals` is 3-D: axis 1 holds (lower, upper); per MAPIE's documentation
    # the last axis indexes the requested alpha values. Slice the single alpha
    # once, so coverage and width are both computed from the same 1-D bounds
    # (the width previously kept the trailing axis while the coverage did not).
    lower = intervals[:, 0, 0]
    upper = intervals[:, 1, 0]

    coverage = np.mean((y_te > lower) & (y_te < upper))
    mean_width = np.mean(upper - lower)
    return coverage, mean_width

In [5]:
# Each entry of `datasets` is a tuple (name, X_tr, X_te, y_tr, y_te)
datasets = []
scaler = StandardScaler()

# Doubt dataset classes included in the benchmark
dataset_classes = [Airfoil, Concrete, FishToxicity, ForestFire,
                   NewTaipeiHousing, PowerPlant, Protein, Servo]

for dataset_class in dataset_classes:
    dataset = dataset_class()

    # Cap the dataset at 10,000 rows with a seeded sample; this overwrites the
    # dataset object's private `_data` attribute
    n_rows = min(len(dataset), 10000)
    dataset._data = dataset._data.sample(n=n_rows, random_state=4242)

    # Hold out 10% as test set; the scaler is refitted on each dataset's
    # training features and then applied to the matching test features
    X_tr, X_te, y_tr, y_te = dataset.split(test_size=0.1, random_seed=4242)
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)

    entry = (dataset_class.__name__, X_tr, X_te, y_tr, y_te)
    datasets.append(entry)

len(datasets)

8

In [6]:
# One summary line per dataset: total rows (train + test) and feature count
for name, X_tr, X_te, y_tr, y_te in datasets:
    n_samples = len(X_tr) + len(X_te)
    n_features = X_tr.shape[-1]
    print(f'{name}: {n_samples:,} samples, {n_features:,} features')

Airfoil: 1,503 samples, 5 features
Concrete: 1,030 samples, 8 features
FishToxicity: 908 samples, 6 features
ForestFire: 517 samples, 12 features
NewTaipeiHousing: 414 samples, 6 features
PowerPlant: 9,568 samples, 4 features
Protein: 10,000 samples, 9 features
Servo: 167 samples, 4 features


## Linear Regression

In [7]:
# Base estimator for this section; the benchmark cell below passes this same
# instance to evaluate_nasa, evaluate_doubt and evaluate_mapie
model = LinearRegression()

In [8]:
# Accumulator lists; nothing in this cell appends to them
nasa_coverages = []
nasa_mean_widths = []
doubt_coverages = []
doubt_mean_widths = []

# Column name -> values, one entry per (dataset, uncertainty) combination
data_dict = defaultdict(list)
for dataset in tqdm(datasets):
    dataset_name, splits = dataset[0], dataset[1:]
    for uncertainty in tqdm([0.01, 0.05, 0.1], leave=False):
        # The shared `model` is handed to the three evaluators in this order
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *splits, uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *splits, uncertainty=uncertainty)
        mapie_coverage, mapie_mean_width = evaluate_mapie(model, *splits, uncertainty=uncertainty)

        # Coverage error in percentage points: target coverage minus observed
        # coverage, so a positive value means the intervals under-cover
        target_coverage = 1 - uncertainty
        row = {
            'dataset': dataset_name,
            'uncertainty': uncertainty,
            'nasa_coverage_error': 100 * (target_coverage - nasa_coverage),
            'doubt_coverage_error': 100 * (target_coverage - doubt_coverage),
            'mapie_coverage_error': 100 * (target_coverage - mapie_coverage),
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
            'mapie_mean_width': mapie_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

linreg_df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
linreg_df

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage_error,doubt_coverage_error,mapie_coverage_error,nasa_mean_width,doubt_mean_width,mapie_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airfoil,0.01,-0.280576,-0.280576,0.438849,28.146645,30.217236,27.87211
Airfoil,0.05,0.755396,0.035971,0.035971,19.354946,19.468813,19.605997
Airfoil,0.1,0.071942,0.791367,0.791367,15.483973,15.439381,15.571301
Concrete,0.01,-1.0,-1.0,-1.0,53.82579,57.937128,58.171139
Concrete,0.05,-1.261682,-1.261682,-0.327103,42.502705,43.601188,42.180231
Concrete,0.1,2.149533,3.084112,0.280374,35.123406,35.760537,35.027402
FishToxicity,0.01,0.075269,0.075269,-1.0,5.890626,7.647095,6.483127
FishToxicity,0.05,-1.774194,-2.849462,-2.849462,4.040719,3.963933,3.977871
FishToxicity,0.1,-1.397849,-3.548387,-3.548387,3.057105,3.098415,3.140108
ForestFire,0.01,-1.0,0.923077,0.923077,308.484358,254.968123,334.826691


In [9]:
# Mean and standard deviation of the absolute coverage error per method,
# with the method having the smallest mean error listed first
error_cols = [column for column in linreg_df.columns if 'error' in column]
abs_errors = linreg_df[error_cols].abs()
abs_errors.agg(['mean', 'std']).T.sort_values(by='mean')

Unnamed: 0,mean,std
doubt_coverage_error,2.020065,2.837663
nasa_coverage_error,2.057549,4.095529
mapie_coverage_error,2.609275,4.396513


## Decision Tree

In [10]:
# Base estimator for this section; rebinds `model`, which the benchmark cell
# below passes to evaluate_nasa, evaluate_doubt and evaluate_mapie
model = DecisionTreeRegressor()

In [11]:
# Accumulator lists; nothing in this cell appends to them
nasa_coverages = []
nasa_mean_widths = []
doubt_coverages = []
doubt_mean_widths = []

# Column name -> values, one entry per (dataset, uncertainty) combination
data_dict = defaultdict(list)
for dataset in tqdm(datasets):
    dataset_name, splits = dataset[0], dataset[1:]
    for uncertainty in tqdm([0.01, 0.05, 0.1], leave=False):
        # The shared `model` is handed to the three evaluators in this order
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *splits, uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *splits, uncertainty=uncertainty)
        mapie_coverage, mapie_mean_width = evaluate_mapie(model, *splits, uncertainty=uncertainty)

        # Coverage error in percentage points: target coverage minus observed
        # coverage, so a positive value means the intervals under-cover
        target_coverage = 1 - uncertainty
        row = {
            'dataset': dataset_name,
            'uncertainty': uncertainty,
            'nasa_coverage_error': 100 * (target_coverage - nasa_coverage),
            'doubt_coverage_error': 100 * (target_coverage - doubt_coverage),
            'mapie_coverage_error': 100 * (target_coverage - mapie_coverage),
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
            'mapie_mean_width': mapie_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

tree_df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
tree_df

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage_error,doubt_coverage_error,mapie_coverage_error,nasa_mean_width,doubt_mean_width,mapie_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airfoil,0.01,10.510791,-1.0,-1.0,8.82723,18.712431,19.223662
Airfoil,0.05,5.071942,-1.402878,-4.280576,8.830468,11.785669,12.208446
Airfoil,0.1,5.107914,-4.964029,-8.561151,6.88095,9.451752,9.783906
Concrete,0.01,11.149533,-1.0,-1.0,22.579963,61.434951,51.133994
Concrete,0.05,9.018692,-3.130841,-5.0,22.106012,29.809538,32.018414
Concrete,0.1,13.364486,-6.261682,-7.196262,15.572121,23.351312,25.176965
FishToxicity,0.01,18.354839,2.225806,-1.0,3.244373,8.216705,7.525468
FishToxicity,0.05,15.430108,3.602151,-1.774194,2.903684,4.601906,5.245728
FishToxicity,0.1,24.408602,6.129032,-4.623656,2.273784,3.607649,4.24507
ForestFire,0.01,31.692308,-1.0,-1.0,130.076072,1108.045715,591.727212


In [12]:
# Mean and standard deviation of the absolute coverage error per method,
# sorted by mean. The error columns are taken from `tree_df` itself, so this
# cell depends only on the frame it summarises.
error_cols = [col for col in tree_df.columns if 'error' in col]
(tree_df[error_cols].abs()
                    .describe()
                    .loc[['mean', 'std']]
                    .T
                    .sort_values(by='mean'))

Unnamed: 0,mean,std
doubt_coverage_error,3.216952,3.227193
mapie_coverage_error,3.796465,3.158507
nasa_coverage_error,18.829758,10.324322
