In [1]:
from doubt import Boot
from doubt.datasets import (Airfoil, Blog, Concrete, CPU, 
                            FacebookComments, FishBioconcentration,
                            FishToxicity, ForestFire, NewTaipeiHousing,
                            PowerPlant, Protein, Servo,
                            SpaceShuttle)
from mapie.regression import MapieRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import (LinearRegression, PoissonRegressor, 
                                  GammaRegressor, HuberRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from collections import defaultdict

## Setting up

In [2]:
def evaluate_nasa(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate bootstrap prediction intervals built from training residuals.

    The model is first fitted on the full training set to obtain point
    predictions and training residuals. It is then refitted on
    ``int(sqrt(n_train))`` bootstrap resamples of the training set; the
    resulting test predictions are centred around their bootstrap mean and
    combined with every training residual to form an empirical noise
    distribution per test sample. Quantiles of that distribution, shifted by
    the point prediction, give the interval.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place, so on return it is fitted on the last
            bootstrap resample rather than on the full training set.
        X_tr: Training features; must have ``.shape`` and support
            ``X_tr[idxs, :]`` indexing (e.g. a 2-D NumPy array).
        X_te: Test features; must have ``.shape``.
        y_tr: Training targets, one value per training row.
        y_te: Test targets, one value per test row.
        uncertainty: Total tail mass left outside the interval; the interval
            uses the ``uncertainty/2`` and ``1 - uncertainty/2`` quantiles.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the average interval width.
    """
    # Coerce targets to arrays so that `y_tr[train_idxs]` below is positional
    # (on a pandas Series with an integer index it would be label-based).
    y_tr = np.asarray(y_tr)
    y_te = np.asarray(y_te)

    n_train = X_tr.shape[0]
    n_test = X_te.shape[0]
    n_boots = int(np.sqrt(n_train))

    # Point predictions and training residuals from the full-data fit
    model.fit(X_tr, y_tr)
    tr_preds = model.predict(X_tr)
    te_preds = model.predict(X_te)
    tr_residuals = y_tr - tr_preds

    # Fixed seed so the bootstrap resamples are identical on every call
    rng = np.random.default_rng(4242)

    # Test predictions of the model refitted on each bootstrap resample
    bootstrap_preds = np.empty((n_boots, n_test))
    for boot_idx in range(n_boots):
        train_idxs = rng.choice(n_train, size=n_train, replace=True)
        model.fit(X_tr[train_idxs, :], y_tr[train_idxs])
        bootstrap_preds[boot_idx] = model.predict(X_te)

    # Centre the bootstrapped predictions across the bootstrap dimension
    bootstrap_preds = np.mean(bootstrap_preds, axis=0) - bootstrap_preds

    # Pair every centred bootstrap prediction with every training residual.
    # Only training residuals are used here (no validation residuals).
    # Broadcasting yields shape (n_boots, n_train, n_test), flattened to
    # (n_boots * n_train, n_test) in the same row order as the nested loop
    # `[m + o for m in bootstrap_preds for o in tr_residuals]`.
    C = (
        bootstrap_preds[:, np.newaxis, :]
        + tr_residuals[np.newaxis, :, np.newaxis]
    ).reshape(-1, n_test)

    # Per-test-sample lower/upper offsets, shifted by the point prediction
    offsets = np.quantile(C, q=[uncertainty/2, 1-uncertainty/2], axis=0)
    intervals = np.expand_dims(te_preds, -1) + np.transpose(offsets)

    # A target exactly on an interval bound does not count as covered
    coverage = np.mean((y_te > intervals[:, 0]) & (y_te < intervals[:, 1]))
    mean_width = np.mean(intervals[:, 1] - intervals[:, 0])
    return coverage, mean_width

In [3]:
def evaluate_doubt(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate the prediction intervals produced by Doubt's ``Boot`` wrapper.

    Args:
        model: Estimator handed to ``Boot``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets.
        y_te: Test targets.
        uncertainty: Forwarded to ``Boot.predict`` as ``uncertainty``.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the average interval width.
    """
    # Same bootstrap budget for fitting and predicting: floor(sqrt(n_train))
    num_resamples = int(np.sqrt(len(X_tr)))

    booted = Boot(model, random_seed=4242)
    booted.fit(X_tr, y_tr, n_boots=num_resamples)
    _, bounds = booted.predict(X_te, uncertainty=uncertainty, n_boots=num_resamples)

    # Column 0 holds the lower bounds, column 1 the upper bounds
    lower, upper = bounds[:, 0], bounds[:, 1]
    inside = (y_te > lower) & (y_te < upper)
    return np.mean(inside), np.mean(upper - lower)

In [4]:
def evaluate_mapie(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate the prediction intervals produced by ``MapieRegressor``.

    Args:
        model: Estimator handed to ``MapieRegressor``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets.
        y_te: Test targets.
        uncertainty: Forwarded to ``MapieRegressor.predict`` as ``alpha``.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the average interval width.
    """
    bmodel = MapieRegressor(model)
    bmodel.fit(X_tr, y_tr)
    _, intervals = bmodel.predict(X_te, alpha=uncertainty)

    # The returned intervals are indexed as [sample, lower/upper, alpha level].
    # Use the first alpha level for BOTH metrics; previously the width was
    # averaged over the whole trailing axis while coverage used index 0 only.
    lower = intervals[:, 0, 0]
    upper = intervals[:, 1, 0]

    coverage = np.mean((y_te > lower) & (y_te < upper))
    mean_width = np.mean(upper - lower)
    return coverage, mean_width

In [26]:
# Doubt dataset classes included in the benchmark
dataset_classes = [
    Airfoil, Concrete, FishToxicity, ForestFire,
    NewTaipeiHousing, PowerPlant, Protein, Servo,
]

# A single scaler instance is reused; `fit_transform` refits it on every
# training split, so no statistics carry over between datasets.
scaler = StandardScaler()
datasets = []

for dataset_class in dataset_classes:
    dataset = dataset_class()

    # Cap each dataset at 1,000 rows, drawn with a fixed seed
    dataset._data = dataset._data.sample(
        n=min(len(dataset), 1000),
        random_state=4242,
    )

    # Hold out 10% for testing, then scale both splits using statistics
    # fitted on the training split only
    X_tr, X_te, y_tr, y_te = dataset.split(test_size=0.1, random_seed=4242)
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)

    datasets += [(dataset_class.__name__, X_tr, X_te, y_tr, y_te)]

len(datasets)

8

In [27]:
# Report, per dataset, the total row count (train + test) and the number of
# feature columns in the training matrix.
for name, X_tr, X_te, y_tr, y_te in datasets:
    print(f'{name}: {len(X_tr) + len(X_te):,} samples, {X_tr.shape[-1]:,} features')

Airfoil: 1,000 samples, 5 features
Concrete: 1,000 samples, 8 features
FishToxicity: 908 samples, 6 features
ForestFire: 517 samples, 12 features
NewTaipeiHousing: 414 samples, 6 features
PowerPlant: 1,000 samples, 4 features
Protein: 1,000 samples, 9 features
Servo: 167 samples, 4 features


## Linear Regression

In [28]:
# Base estimator for this section; it is passed to each evaluate_* function.
model = LinearRegression()

In [31]:
# NOTE: these four accumulators are initialised but never appended to in this cell
nasa_coverages, nasa_mean_widths = [], []
doubt_coverages, doubt_mean_widths = [], []

data_dict = defaultdict(list)
for dataset in tqdm(datasets):
    # Each entry is (name, X_tr, X_te, y_tr, y_te); the four splits are
    # forwarded positionally to every evaluator.
    for uncertainty in tqdm([0.01, 0.05, 0.1], leave=False):
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *dataset[1:], uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *dataset[1:], uncertainty=uncertainty)
        mapie_coverage, mapie_mean_width = evaluate_mapie(model, *dataset[1:], uncertainty=uncertainty)

        # Coverage error is nominal minus observed coverage, in percentage
        # points: positive means the intervals under-cover.
        row = {
            'dataset': dataset[0],
            'uncertainty': uncertainty,
            'nasa_coverage_error': 100 * (1 - uncertainty - nasa_coverage),
            'doubt_coverage_error': 100 * (1 - uncertainty - doubt_coverage),
            'mapie_coverage_error': 100 * (1 - uncertainty - mapie_coverage),
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
            'mapie_mean_width': mapie_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

linreg_df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
linreg_df

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage_error,doubt_coverage_error,mapie_coverage_error,nasa_mean_width,doubt_mean_width,mapie_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airfoil,0.01,-0.038462,-1.0,-0.038462,27.458273,30.484579,27.840175
Airfoil,0.05,1.730769,-0.192308,-0.192308,18.739913,19.105214,19.464219
Airfoil,0.1,-1.346154,-1.346154,-1.346154,15.592097,15.674501,15.616511
Concrete,0.01,-1.0,-1.0,-1.0,53.752837,54.86424,57.515282
Concrete,0.05,-1.153846,-2.115385,-0.192308,42.637112,43.050993,42.508544
Concrete,0.1,2.5,2.5,0.576923,35.260715,35.621045,35.606486
FishToxicity,0.01,0.075269,0.075269,-1.0,5.890626,7.647095,6.483127
FishToxicity,0.05,-1.774194,-2.849462,-2.849462,4.040719,3.963933,3.977871
FishToxicity,0.1,-1.397849,-3.548387,-3.548387,3.057105,3.098415,3.140108
ForestFire,0.01,-1.0,0.923077,0.923077,308.484358,254.968123,334.826691


In [57]:
# Mean and standard deviation of the absolute coverage error per method,
# best (smallest mean) first.
error_cols = [column for column in linreg_df.columns if 'error' in column]
(linreg_df.loc[:, error_cols]
          .abs()
          .describe()
          .T[['mean', 'std']]
          .sort_values(by='mean'))

Unnamed: 0,mean,std
doubt_coverage_error,2.450686,2.756905
nasa_coverage_error,2.45316,4.011081
mapie_coverage_error,3.002037,4.31521


## Decision Tree

In [34]:
# Base estimator for this section. The seed is fixed because scikit-learn
# permutes features at every split, so an unseeded tree can differ between
# runs even on identical data.
model = DecisionTreeRegressor(random_state=4242)

In [35]:
# NOTE: these four accumulators are initialised but never appended to in this cell
nasa_coverages, nasa_mean_widths = [], []
doubt_coverages, doubt_mean_widths = [], []

data_dict = defaultdict(list)
for dataset in tqdm(datasets):
    # Each entry is (name, X_tr, X_te, y_tr, y_te); the four splits are
    # forwarded positionally to every evaluator.
    for uncertainty in tqdm([0.01, 0.05, 0.1], leave=False):
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *dataset[1:], uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *dataset[1:], uncertainty=uncertainty)
        mapie_coverage, mapie_mean_width = evaluate_mapie(model, *dataset[1:], uncertainty=uncertainty)

        # Coverage error is nominal minus observed coverage, in percentage
        # points: positive means the intervals under-cover.
        row = {
            'dataset': dataset[0],
            'uncertainty': uncertainty,
            'nasa_coverage_error': 100 * (1 - uncertainty - nasa_coverage),
            'doubt_coverage_error': 100 * (1 - uncertainty - doubt_coverage),
            'mapie_coverage_error': 100 * (1 - uncertainty - mapie_coverage),
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
            'mapie_mean_width': mapie_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

tree_df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
tree_df

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage_error,doubt_coverage_error,mapie_coverage_error,nasa_mean_width,doubt_mean_width,mapie_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Airfoil,0.01,10.538462,-0.038462,-1.0,9.723192,21.735653,21.281971
Airfoil,0.05,11.346154,-0.192308,-5.0,9.683952,13.300291,13.907837
Airfoil,0.1,15.0,-3.269231,-8.076923,7.089163,10.82563,11.281702
Concrete,0.01,10.538462,-1.0,-1.0,21.970042,73.19873,55.055606
Concrete,0.05,8.461538,-3.076923,-5.0,21.435412,30.203862,34.240105
Concrete,0.1,10.192308,-4.230769,-7.115385,15.584835,23.57805,25.352591
FishToxicity,0.01,16.204301,2.225806,-1.0,3.296202,8.301246,7.65709
FishToxicity,0.05,17.580645,3.602151,-2.849462,2.869638,4.557473,5.287328
FishToxicity,0.1,21.182796,-0.322581,-3.548387,2.232149,3.608312,4.159194
ForestFire,0.01,37.461538,-1.0,-1.0,95.71095,778.117523,585.553462


In [58]:
# Mean and standard deviation of the absolute coverage error per method,
# best (smallest mean) first. The error columns are taken from `tree_df`
# itself so this cell does not depend on the linear-regression frame.
error_cols = [col for col in tree_df.columns if 'error' in col]
(tree_df[error_cols].abs()
                    .describe()
                    .loc[['mean', 'std']]
                    .T
                    .sort_values(by='mean'))

Unnamed: 0,mean,std
doubt_coverage_error,3.226723,3.31191
mapie_coverage_error,3.580289,2.489488
nasa_coverage_error,22.567957,9.561601
