# Performance Prediction Under Dataset Shift

Comparison of different types of Performance Predictors under dataset shift.

This notebook presents the results of the paper "Performance Prediction Under Dataset Shift".

In [None]:
%pylab inline

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import os
import pickle
from drift_dac_experiments.viz_utils import name2type
import matplotlib.patches as mpatches
from scipy import stats

In [None]:
# Dataset identifiers. Later cells append '_data', '_pp' and '_viz' to each
# name to locate the corresponding result folders.
datasets = [
    'adult',
    'video_games',
    'heart',
    'bank',
    'dont_get_kicked',
    'Churn_Modelling',
    'bng_zoo',
    'jsbach_chorals_modified',
    'SDSS',
    'bng_ionosphere',
    'network_intrusion_detection',
    'artificial_characters',
    'default_of_credit_card_clients',
]

Load results

In [None]:
# Rows are accumulated as plain dicts and every DataFrame is built once after
# the loop: DataFrame.append was removed in pandas 2.0 and copies the whole
# frame on every call.
summary_rows = []
rows_by_set = {
    'test': [],
    'test_no_shift': [],
    'test_unseen': [],
    'test_unseen_subpop': [],
    'test_natural': [],
}

# 'train' has no table of its own, but its files are still read below so that
# a dataset with missing result files is skipped exactly as before.
sets = ['train', 'test', 'test_no_shift', 'test_unseen', 'test_unseen_subpop', 'test_natural']

# Normal-approximation confidence interval on the reference accuracy.
# Both values are the same for every dataset and seed, so compute them once.
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha / 2)

for ds in datasets:

    data_fld = ds + '_data'
    pp_fld = ds + '_pp'

    print(data_fld.upper())

    try:

        all_ref_accuracies = []
        all_ci_drops = []
        for seed in range(10):
            # NOTE(review): pickle.load can execute arbitrary code; only run
            # this on result files produced by your own experiments.
            with open(os.path.join(data_fld, './data_%d.pkl' % seed), 'rb') as f:
                train, test, test_unseen, test_natural, ref_task, _ = pickle.load(f)

            with open(os.path.join(data_fld, './shifts_%d.pkl' % seed), 'rb') as f:
                list_of_drift_types, filtered_list_of_train_shifts, filtered_list_of_test_shifts, filtered_list_of_unseen_shifts = pickle.load(f)

            # Not used further in this cell; kept because notebook variables
            # are global and may be inspected elsewhere.
            train_shifts_names = [name2type(s) for s in filtered_list_of_train_shifts]
            test_shifts_names = [name2type(s) for s in filtered_list_of_test_shifts]

            ref_accuracy = ref_task.ref_accuracy

            # Half-width of the (1 - alpha) interval for a binomial proportion.
            n_samples = ref_task.y_src.shape[0]
            sigma = np.sqrt(ref_accuracy * (1 - ref_accuracy) / n_samples)
            ci_drop = z_score * sigma

            all_ref_accuracies.append(ref_accuracy)
            all_ci_drops.append(ci_drop)

        # Raw strings are required here: in a normal string literal '\t' is a
        # TAB character, which silently turned '\tiny' into '<TAB>iny'.
        summary_rows.append({
            "dataset": data_fld,
            "ref. accuracy": r'%.3f \tiny{$\pm$%.3f}' % (np.mean(all_ref_accuracies), np.std(all_ref_accuracies)),
            "CI": r'%.3f \tiny{$\pm$%.3f}' % (np.mean(all_ci_drops), np.std(all_ci_drops)),
        })

        with open(os.path.join(pp_fld, './model_names.npy'), 'rb') as f:
            model_names = np.load(f)

        for s in sets:

            # r2_score and likelihood are loaded but not used in the tables below.
            with open(os.path.join(pp_fld, './r2_score_%s.npy' % s), 'rb') as f:
                r2_score = np.load(f)

            with open(os.path.join(pp_fld, './within_ci_mae_%s.npy' % s), 'rb') as f:
                within_ci_mae = np.load(f)

            with open(os.path.join(pp_fld, './likelihood_%s.npy' % s), 'rb') as f:
                likelihood = np.load(f)

            target_rows = rows_by_set.get(s)  # None for 'train'

            # model_idx selects the column of within_ci_mae belonging to this
            # model; a dedicated name avoids clobbering any outer loop index.
            for model_idx, name in enumerate(model_names):
                new_row = {"dataset": ds, 'model': name}

                new_row['mae_within_ci'] = r'%.3f \tiny{$\pm$ %.3f}' % (
                    np.mean(within_ci_mae[:, model_idx]),
                    np.std(within_ci_mae[:, model_idx]),
                )

                if target_rows is not None:
                    target_rows.append(new_row)

    except Exception as e:
        # Best-effort loading: report the problem and move on to the next dataset.
        print(e)
        print("SKIP")
        continue

df = pd.DataFrame(summary_rows, columns=['dataset', 'ref. accuracy', 'CI'])
df_test_no_shift = pd.DataFrame(rows_by_set['test_no_shift'])
df_test = pd.DataFrame(rows_by_set['test'])
df_test_unseen = pd.DataFrame(rows_by_set['test_unseen'])
df_test_unseen_subpop = pd.DataFrame(rows_by_set['test_unseen_subpop'])
df_test_natural = pd.DataFrame(rows_by_set['test_natural'])
        

DataFrame with per-dataset reference accuracies and confidence intervals

In [None]:
# Display the per-dataset summary table (columns: dataset, ref. accuracy, CI).
df

In [None]:
# Print the summary table as LaTeX. The replace chain strips the escaping
# applied to the LaTeX markup stored in the cells. Raw strings are used for
# the patterns because '\$', '\{' and '\}' are invalid escape sequences in
# ordinary string literals (the runtime values are unchanged).
print(df.to_latex(index=False).replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

DataFrame with MAE_CI_0.05 results for test_no_shift for all datasets

In [None]:
# Models reported in the tables below, in the column order used there.
selected_models = [
    'ATC',
    'ExpertRF (amazon)',
    'ExpertRF (naver)',
    'ErrorPredictorRF',
]

In [None]:
# Pivot the 'test_no_shift' rows: one row per dataset, one column per
# (model, metric) pair, restricted to the selected models and ordered
# case-insensitively by dataset name.
d = (
    df_test_no_shift
    .loc[lambda frame: frame['model'].isin(selected_models)]
    .set_index(['dataset', 'model'])
    .stack()
    .unstack([1, 2])
    [selected_models]
    .pipe(lambda table: table.iloc[table.index.str.lower().argsort()])
)
d

In [None]:
# Print the table as LaTeX, stripping the escaping applied to the LaTeX markup
# stored in the cells. Raw strings avoid the invalid escape sequences
# '\$', '\{' and '\}' (the runtime values are unchanged).
print(d.to_latex().replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

DataFrame with MAE_CI_0.05 results for test_unseen_severity for all datasets

In [None]:
# Pivot the 'test' rows: one row per dataset, one column per (model, metric)
# pair, restricted to the selected models and ordered case-insensitively by
# dataset name.
d = (
    df_test
    .loc[lambda frame: frame['model'].isin(selected_models)]
    .set_index(['dataset', 'model'])
    .stack()
    .unstack([1, 2])
    [selected_models]
    .pipe(lambda table: table.iloc[table.index.str.lower().argsort()])
)
d

In [None]:
# Print the table as LaTeX, stripping the escaping applied to the LaTeX markup
# stored in the cells. Raw strings avoid the invalid escape sequences
# '\$', '\{' and '\}' (the runtime values are unchanged).
print(d.to_latex().replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

DataFrame with MAE_CI_0.05 results for test_unseen_perturbation_shift for all datasets

In [None]:
# Pivot the 'test_unseen' rows: one row per dataset, one column per
# (model, metric) pair, restricted to the selected models and ordered
# case-insensitively by dataset name.
d = (
    df_test_unseen
    .loc[lambda frame: frame['model'].isin(selected_models)]
    .set_index(['dataset', 'model'])
    .stack()
    .unstack([1, 2])
    [selected_models]
    .pipe(lambda table: table.iloc[table.index.str.lower().argsort()])
)
d

In [None]:
# Print the table as LaTeX, stripping the escaping applied to the LaTeX markup
# stored in the cells. Raw strings avoid the invalid escape sequences
# '\$', '\{' and '\}' (the runtime values are unchanged).
print(d.to_latex().replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

DataFrame with MAE_CI_0.05 results for test_unseen_subpop_shift for all datasets

In [None]:
# Pivot the 'test_unseen_subpop' rows: one row per dataset, one column per
# (model, metric) pair, restricted to the selected models and ordered
# case-insensitively by dataset name.
d = (
    df_test_unseen_subpop
    .loc[lambda frame: frame['model'].isin(selected_models)]
    .set_index(['dataset', 'model'])
    .stack()
    .unstack([1, 2])
    [selected_models]
    .pipe(lambda table: table.iloc[table.index.str.lower().argsort()])
)
d


In [None]:
# Print the table as LaTeX, stripping the escaping applied to the LaTeX markup
# stored in the cells. Raw strings avoid the invalid escape sequences
# '\$', '\{' and '\}' (the runtime values are unchanged).
print(d.to_latex().replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

DataFrame with MAE_CI_0.05 results for test_natural for all datasets

In [None]:
# Pivot the 'test_natural' rows: one row per dataset, one column per
# (model, metric) pair, restricted to the selected models and ordered
# case-insensitively by dataset name.
d = (
    df_test_natural
    .loc[lambda frame: frame['model'].isin(selected_models)]
    .set_index(['dataset', 'model'])
    .stack()
    .unstack([1, 2])
    [selected_models]
    .pipe(lambda table: table.iloc[table.index.str.lower().argsort()])
)
d

In [None]:
# Print the table as LaTeX, stripping the escaping applied to the LaTeX markup
# stored in the cells. Raw strings avoid the invalid escape sequences
# '\$', '\{' and '\}' (the runtime values are unchanged).
print(d.to_latex().replace('textbackslash ', '').replace(r'\$', '$').replace(r'\{', '{').replace(r'\}', '}'))

Show all result plots

In [None]:
# File-name suffixes of the saved plots; each one is inserted between a metric
# prefix (e.g. 'abs_error_') and '.png' when the images are loaded below.
suffixes = [
    'paper2',
    'errpred',
]

In [None]:
# Figure size passed to plt.figure for every image displayed below.
figsize = (12, 12)

In [None]:
# Use a larger default font for all matplotlib text from here on.
plt.rcParams['font.size'] = 18

In [None]:
# For every dataset, display the pre-rendered PNGs: first the true-drops
# overview from the data folder, then, for each suffix, the abs-error,
# within-CI MAE and likelihood plots from the viz folder. If any image cannot
# be read, the error is printed and the loop moves on to the next dataset.
metric_prefixes = ('abs_error_', 'within_ci_mae_', 'likelihood_')

for ds in datasets:

    data_fld = ds + '_data'
    viz_fld = ds + '_viz'

    print(ds.upper())

    try:

        # The figure is created before the image is read, as in the original order.
        plt.figure(figsize=figsize)
        overview = plt.imread(os.path.join(data_fld, 'true_drops_by_type_0.png'))
        plt.imshow(overview)
        plt.axis('off')
        plt.show()

        for suffix in suffixes:
            print('---- ' + ds + ' - ' + suffix)

            for prefix in metric_prefixes:
                plt.figure(figsize=figsize)
                image = plt.imread(os.path.join(viz_fld, prefix + suffix + '.png'))
                plt.imshow(image)
                plt.axis('off')
                plt.show()

    except Exception as e:
        print(e)
        continue
    