In [123]:
from doubt import Boot
from doubt.datasets import (Airfoil, Blog, Concrete, CPU, 
                            FacebookComments, FishBioconcentration,
                            FishToxicity, ForestFire, NewTaipeiHousing,
                            PowerPlant, Protein, Servo,
                            SpaceShuttle)
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import (LinearRegression, PoissonRegressor, 
                                  GammaRegressor, HuberRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from collections import defaultdict

## Setting up

In [229]:
def evaluate_nasa(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate bootstrap prediction intervals built from training residuals.

    The model is first fitted on the full training set to obtain point
    predictions and the training residuals. It is then refitted on
    ``int(sqrt(n_train))`` bootstrap resamples of the training set. For every
    test sample, the centred bootstrap predictions are combined with every
    training residual, and the empirical quantiles of those sums (added to the
    point prediction) form the prediction interval.

    Note that ``model`` is fitted in place: on return it is left fitted on the
    last bootstrap resample, not on the full training set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_tr: Training features, array-like of shape (n_train, n_features).
        X_te: Test features, array-like of shape (n_test, n_features).
        y_tr: Training targets, 1-D array-like of length n_train.
        y_te: Test targets, 1-D array-like of length n_test.
        uncertainty: Total tail mass left outside the interval; the interval
            spans the ``uncertainty/2`` and ``1 - uncertainty/2`` quantiles.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the average interval width.
    """
    # Coerce to arrays so positional indexing below behaves the same for
    # lists and pandas objects as it does for NumPy arrays.
    X_tr = np.asarray(X_tr)
    X_te = np.asarray(X_te)
    y_tr = np.asarray(y_tr)
    y_te = np.asarray(y_te)

    n_train = X_tr.shape[0]
    n_test = X_te.shape[0]
    n_boots = int(np.sqrt(n_train))

    # Point predictions and training residuals from the full-data fit
    model.fit(X_tr, y_tr)
    tr_preds = model.predict(X_tr)
    te_preds = model.predict(X_te)
    tr_residuals = np.asarray(y_tr - tr_preds)

    # Initialise random number generator
    rng = np.random.default_rng(4242)

    # Refit on bootstrap resamples to estimate the model variance
    bootstrap_preds = np.empty((n_boots, n_test))
    for boot_idx in range(n_boots):
        train_idxs = rng.choice(n_train, size=n_train, replace=True)
        model.fit(X_tr[train_idxs, :], y_tr[train_idxs])
        bootstrap_preds[boot_idx] = model.predict(X_te)

    # Centre the bootstrapped predictions across the bootstrap dimension
    centred = np.mean(bootstrap_preds, axis=0) - bootstrap_preds

    # Pair every centred bootstrap prediction with every *training* residual
    # (no validation residuals are involved). Broadcasting yields an array of
    # shape (n_boots, n_train, n_test), flattened to one row per
    # (bootstrap, residual) pair in the same order as a nested loop.
    C = (
        centred[:, np.newaxis, :] + tr_residuals[np.newaxis, :, np.newaxis]
    ).reshape(-1, n_test)

    # Calculate the intervals: column 0 is the lower bound, column 1 the upper
    tail = uncertainty / 2
    offsets = np.quantile(C, q=[tail, 1 - tail], axis=0)
    intervals = te_preds[:, np.newaxis] + offsets.T

    coverage = np.mean((y_te > intervals[:, 0]) & (y_te < intervals[:, 1]))
    mean_width = np.mean(intervals[:, 1] - intervals[:, 0])
    return coverage, mean_width

In [230]:
def evaluate_doubt(model, X_tr, X_te, y_tr, y_te, uncertainty=0.05):
    """Evaluate the prediction intervals produced by doubt's ``Boot`` wrapper.

    ``model`` is wrapped in ``Boot`` (seeded with 4242), fitted on the
    training data and asked for intervals on the test data, using
    ``int(sqrt(len(X_tr)))`` bootstrap resamples for both steps.

    Args:
        model: Estimator handed to ``Boot``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets.
        y_te: Test targets.
        uncertainty: Uncertainty level forwarded to ``Boot.predict``.

    Returns:
        Tuple ``(coverage, mean_width)``: the fraction of test targets lying
        strictly inside their interval, and the average interval width.
    """
    num_resamples = int(np.sqrt(len(X_tr)))

    booted = Boot(model, random_seed=4242)
    booted.fit(X_tr, y_tr, n_boots=num_resamples)

    # The point predictions are not needed here, only the interval bounds
    _, bounds = booted.predict(X_te, n_boots=num_resamples, uncertainty=uncertainty)

    lower = bounds[:, 0]
    upper = bounds[:, 1]
    inside = (y_te > lower) & (y_te < upper)
    return np.mean(inside), np.mean(upper - lower)

In [231]:
# A single scaler instance is reused; fit_transform refits it on each
# dataset's training split before the matching test split is transformed.
scaler = StandardScaler()

# Doubt datasets to benchmark on
dataset_classes = [Airfoil, Concrete, FishToxicity, NewTaipeiHousing, ForestFire]

# Each entry is (name, X_tr, X_te, y_tr, y_te) with standardised features
datasets = []
for dataset_class in dataset_classes:
    X_tr, X_te, y_tr, y_te = dataset_class().split(test_size=0.1, random_seed=4242)
    X_tr, X_te = scaler.fit_transform(X_tr), scaler.transform(X_te)
    datasets.append((dataset_class.__name__, X_tr, X_te, y_tr, y_te))

len(datasets)

5

## Linear Regression

In [232]:
# Base estimator passed to both evaluate_nasa and evaluate_doubt in the next cell
model = LinearRegression()

In [233]:
# These four accumulators are initialised but never appended to in this cell
nasa_coverages, nasa_mean_widths = [], []
doubt_coverages, doubt_mean_widths = [], []

# Collect one result row per (dataset, uncertainty) pair, column by column
data_dict = defaultdict(list)
for dataset in datasets:
    dataset_name, *splits = dataset
    for uncertainty in (0.01, 0.05, 0.1):
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *splits, uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *splits, uncertainty=uncertainty)

        # Coverages are reported as percentages
        row = {
            'dataset': dataset_name,
            'uncertainty': uncertainty,
            'nasa_coverage': 100 * nasa_coverage,
            'doubt_coverage': 100 * doubt_coverage,
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage,doubt_coverage,nasa_mean_width,doubt_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Airfoil,0.01,99.280576,99.280576,28.293689,30.396459
Airfoil,0.05,92.086331,92.805755,19.489191,19.479813
Airfoil,0.1,87.769784,87.05036,15.365684,15.404273
Concrete,0.01,97.196262,100.0,52.676151,56.064162
Concrete,0.05,90.654206,91.588785,41.318408,42.282408
Concrete,0.1,86.915888,86.915888,34.658419,35.246152
FishToxicity,0.01,97.849462,98.924731,5.814432,7.529623
FishToxicity,0.05,91.397849,92.473118,3.928487,3.834488
FishToxicity,0.1,90.322581,91.397849,3.031887,3.03356
NewTaipeiHousing,0.01,97.560976,97.560976,50.89205,63.776944


## Decision Tree

In [234]:
# Seed the tree so repeated fits (and therefore the coverage/width table in the
# next cell) are reproducible; without random_state, ties between equally good
# splits can be broken differently from run to run.
model = DecisionTreeRegressor(random_state=4242)

In [235]:
# These four accumulators are initialised but never appended to in this cell
nasa_coverages, nasa_mean_widths = [], []
doubt_coverages, doubt_mean_widths = [], []

# Collect one result row per (dataset, uncertainty) pair, column by column
data_dict = defaultdict(list)
for dataset in datasets:
    dataset_name, *splits = dataset
    for uncertainty in (0.01, 0.05, 0.1):
        nasa_coverage, nasa_mean_width = evaluate_nasa(model, *splits, uncertainty=uncertainty)
        doubt_coverage, doubt_mean_width = evaluate_doubt(model, *splits, uncertainty=uncertainty)

        # Coverages are reported as percentages
        row = {
            'dataset': dataset_name,
            'uncertainty': uncertainty,
            'nasa_coverage': 100 * nasa_coverage,
            'doubt_coverage': 100 * doubt_coverage,
            'nasa_mean_width': nasa_mean_width,
            'doubt_mean_width': doubt_mean_width,
        }
        for column, value in row.items():
            data_dict[column].append(value)

df = pd.DataFrame(data_dict).set_index(['dataset', 'uncertainty'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,nasa_coverage,doubt_coverage,nasa_mean_width,doubt_mean_width
dataset,uncertainty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Airfoil,0.01,92.086331,99.280576,9.267137,22.037481
Airfoil,0.05,92.086331,98.561151,9.062791,12.057218
Airfoil,0.1,87.05036,97.841727,7.09877,9.69603
Concrete,0.01,88.785047,100.0,20.911108,64.683975
Concrete,0.05,81.308411,97.196262,20.788676,28.645234
Concrete,0.1,72.897196,91.588785,14.946832,22.51441
FishToxicity,0.01,91.397849,100.0,3.632771,8.414255
FishToxicity,0.05,79.569892,93.548387,3.117494,4.715847
FishToxicity,0.1,70.967742,90.322581,2.470283,3.800851
NewTaipeiHousing,0.01,80.487805,100.0,20.671988,71.722853
