In [None]:
import json
import os

import numpy as np
import pandas as pd
from tqdm.auto import trange
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

In [None]:
# Output column -> key that the value is read from. The loading loop looks the
# key up in the experiment settings first and in the per-policy results
# otherwise. 'Policy' and the ``None`` entries are filled in by dedicated
# branches of that loop instead of a lookup. Insertion order defines the column
# order of the per-run table.
column_to_metric = dict([
    # Experiment configuration
    ('Environment', 'env'),
    ('Algorithm', 'alg'),
    ('Policy', 'policy'),
    ('Sampling', 'custom_sampling'),
    ('Detection', 'detection_on'),
    ('Re-Evaluation', 'reevaluate_on'),

    # Mean errors
    ("Mean (MSE Objective)", "Mean ± Standard Deviation (MSE Objective)"),
    ("Mean (MSE BC 1)", "Mean ± Standard Deviation (MSE BC 1)"),
    ("Mean (MSE BC 2)", "Mean ± Standard Deviation (MSE BC 2)"),
    ("Mean (MSE QD)", "Mean ± Standard Deviation (MSE QD)"),

    # Area-under-curve errors, survival and cost
    ("AUC (MSE Objective)", "AUC (MSE Objective)"),
    ("AUC (MSE BC 1)", "AUC (MSE BC 1)"),
    ("AUC (MSE BC 2)", "AUC (MSE BC 2)"),
    ("AUC (MSE QD)", "AUC (MSE QD)"),
    ("Mean (Survival %)", "Survival (%)"),
    ("Evaluations", "Evaluations"),

    # Rates of change are derived from the offline-error series
    ("Rate of Change (MSE Objective)", "Offline Error (Objective)"),
    ("Rate of Change (MSE QD)", "Offline Error (QD)"),

    # Computed columns (no source key)
    ("Mean Evaluations For Survival (50%)", None),
    ("Mean Evaluations For Survival (75%)", None),
])

In [None]:
# File that `run_ttests` appends its rows to. A later cell reassigns this name
# before `run_ttests` is first called, so this initial value is not the one used there.
results_fname = './stats_results.csv'

In [None]:
# Per-column post-processing operations. Every operation takes the raw value
# and the experiment settings so that they can all be called the same way.

def get_mean(x, settings):
    """Return the first element of ``x``; ``settings`` is ignored."""
    return x[0]


def get_std(x, settings):
    """Return the second element of ``x``; ``settings`` is ignored."""
    return x[1]


def compute_sum(x, settings):
    """Return the final value of the running sum of ``x``; ``settings`` is ignored."""
    return np.cumsum(x)[-1]


def compute_mean(x, settings):
    """Return ``np.mean(x)``; ``settings`` is ignored."""
    return np.mean(x)


def compute_std(x, settings):
    """Return ``np.std(x)``; ``settings`` is ignored."""
    return np.std(x)


def get_sample_method(x, settings):
    """Return the sampling label of an experiment.

    An explicit ``settings['sampling_strategy']`` wins; otherwise the label is
    ``'custom'`` when ``x`` is truthy and ``'default'`` when it is not.
    """
    if 'sampling_strategy' in settings.keys():
        return settings['sampling_strategy']
    if x:
        return 'custom'
    return 'default'

def get_rate_of_change(values, settings):
    """Return the mean change of ``values`` across consecutive windows.

    The series is cut into windows of ``settings['time_shift_val']`` samples.
    For each window the first sample is subtracted from the last one, and the
    differences are averaged.

    Args:
        values: Sequence of numbers (anything ``np.asarray`` accepts).
        settings: Mapping that provides the window length under
            ``'time_shift_val'``.

    Returns:
        The mean of ``last - first`` over all complete windows.
    """
    t = settings['time_shift_val']
    v = np.asarray(values)
    window_starts = v[0::t]
    window_ends = v[t - 1::t]
    # When the series length is not a multiple of ``t`` the trailing partial
    # window has a start but no end. Drop it so both arrays line up instead of
    # failing with a broadcasting error.
    window_starts = window_starts[:len(window_ends)]
    return np.mean(window_ends - window_starts)

def get_survival_score_over_threshold(results, settings, threshold=50.0, use_actual=False):
    """Score how often survival exceeds ``threshold`` between consecutive shifts.

    Only the spans between one flagged entry of ``results['Actual Shift']`` and
    the next are inspected; the span after the last flagged entry is not. A
    span counts as a success when any ``results['Survival (%)']`` value inside
    it is strictly above ``threshold``, and its cost is the sum of
    ``results['Evaluations']`` over that span.

    ``settings`` and ``use_actual`` are accepted but not used.

    Returns:
        A pair ``(score, mean_cost)``: the number of successful spans divided
        by the number of flagged shifts, and the NaN-ignoring mean cost of the
        successful spans.
    """
    evaluations = np.asarray(results['Evaluations'])
    survival = np.asarray(results['Survival (%)'])
    shift_flags = np.asarray(results['Actual Shift'])
    n_shifts = np.sum(shift_flags)
    shift_positions = np.where(shift_flags)[0]

    # Both lists start with a placeholder entry (no success, no cost).
    successes = [0]
    costs = [np.nan]
    for start, stop in zip(shift_positions[:-1], shift_positions[1:]):
        survived = np.any(survival[start:stop] > threshold)
        successes.append(1 if survived else 0)
        costs.append(np.sum(evaluations[start:stop]) if survived else np.nan)

    return np.sum(successes) / n_shifts, np.nanmean(costs)

In [None]:
# Output column -> operation applied to the raw value after lookup. The loading
# loop only applies an entry when its key is also a key of `column_to_metric`.
column_to_op = {
    'Sampling': get_sample_method,

    # Stored "mean ± std" pairs: element 0 is the mean, element 1 the deviation.
    "Mean (MSE Objective)": get_mean,
    "Standard Deviation (MSE Objective)": get_std,
    "Mean (MSE BC 1)": get_mean,
    "Standard Deviation (MSE BC 1)": get_std,
    "Mean (MSE BC 2)": get_mean,
    "Standard Deviation (MSE BC 2)": get_std,
    "Mean (MSE QD)": get_mean,
    "Standard Deviation (MSE QD)": get_std,

    # Series that are reduced to one number per run.
    "Mean (Survival %)": compute_mean,
    "Standard Deviation (Survival %)": compute_std,
    "Rate of Change (MSE Objective)": get_rate_of_change,
    "Rate of Change (MSE QD)": get_rate_of_change,

    "Evaluations": compute_sum,
}

In [None]:
# Per-run table, filled by the loading loop.
df = pd.DataFrame()

# Every sub-directory of `ref_dir` is treated as one experiment.
ref_dir = './raw_results'

# os.listdir returns entries in arbitrary order; sort them so that the row
# order of the per-run table (and everything derived from it) is reproducible.
items = sorted(os.listdir(ref_dir))
folders = [item for item in items if os.path.isdir(os.path.join(ref_dir, item))]

In [None]:
# Build one record per (experiment, policy) and create the DataFrame once at
# the end. Growing `df` with pd.concat inside the loop is quadratic and appended
# duplicate rows whenever this cell was run a second time.
records = []

with trange(0, len(folders), desc='Processing experiments...') as ii:
    for i in ii:
        experiment_fname = folders[i]
        try:
            ii.set_postfix_str(f'{experiment_fname}', refresh=True)
            experiment_path = os.path.join(ref_dir, experiment_fname)
            with open(os.path.join(experiment_path, 'settings.metadata'), 'r') as f:
                settings = json.load(f)
            with open(os.path.join(experiment_path, 'results.json'), 'r') as f:
                results = json.load(f)
            env, alg, policies = settings['env'], settings['alg'], settings['policies']
            for policy in policies:
                # The run identifier is the second '_'-separated token of the folder name.
                new_row = {'run': experiment_fname.split('_')[1]}
                for c, source_key in column_to_metric.items():
                    if c == 'Policy':
                        m = policy
                    elif c == "Mean Evaluations For Survival (50%)":
                        m = get_survival_score_over_threshold(results[policy], settings, 50.0, policy == 'no_updates')
                    elif c == "Mean Evaluations For Survival (75%)":
                        m = get_survival_score_over_threshold(results[policy], settings, 75.0, policy == 'no_updates')
                    elif source_key in settings.keys():
                        # Experiment-level values come from the settings file...
                        m = settings[source_key]
                    else:
                        # ...everything else from the results of this policy.
                        m = results[policy][source_key]
                    if c in column_to_op.keys():
                        m = column_to_op[c](m, settings)
                    new_row[c] = m
                records.append(new_row)
        except Exception as err:
            # Best effort: a broken experiment must not stop the whole import,
            # but report why it was skipped so the problem can be traced.
            print(f'Skipped {experiment_fname}... ({type(err).__name__}: {err})')

df = pd.DataFrame(records)

In [None]:
# Persist the per-run table (one row per experiment and policy) without the index.
df.to_csv('grouped_data.csv', index=False)

In [None]:
# Reload the per-run table from disk, replacing the in-memory frame.
# NOTE(review): the "Mean Evaluations For Survival (...)" cells are tuples when
# the table is built; presumably they come back from the CSV as their string
# form — confirm that the cells below cope with that.
df = pd.read_csv('grouped_data.csv')

In [None]:
# Columns that identify a run; every other column is treated as a metric.
shared_columns = ['run', 'Environment', 'Algorithm', 'Policy', 'Sampling', 'Detection', 'Re-Evaluation']
metrics = df.columns.drop(shared_columns)

# Levels of each experimental factor. Environments and algorithms keep their
# order of first appearance; the remaining factors are sorted.
envs = df['Environment'].unique()
algs = df['Algorithm'].unique()
policies, samplings, detections, reevaluations = (
    sorted(df[factor].unique())
    for factor in ('Policy', 'Sampling', 'Detection', 'Re-Evaluation')
)

In [None]:
def _parse_score_pair(cell):
    """Return ``(score, evaluations)`` as floats from a tuple or its text form."""
    if isinstance(cell, str):
        # After a CSV round trip the tuple is text such as "(0.5, 120.0)";
        # depending on the NumPy version each number may be wrapped as
        # "np.float64(...)".
        text = cell.replace('np.float64', '').replace('(', '').replace(')', '')
        cell = [part.strip() for part in text.split(',')]
    score, evaluations = cell
    return float(score), float(evaluations)


# Aggregate the per-run table into one row (mean and std of every metric) per
# configuration. Rows are collected in a list and turned into a DataFrame once.
processed_rows = []

for env in envs:
    for alg in algs:
        for policy in policies:
            for sampling in samplings:
                for detection in detections:
                    for reevaluation in reevaluations:
                        values = df.loc[(df['Environment'] == env) & (df['Algorithm'] == alg)
                                        & (df['Policy'] == policy) & (df['Sampling'] == sampling)
                                        & (df['Detection'] == detection) & (df['Re-Evaluation'] == reevaluation)]
                        if len(values) == 0: continue
                        # The two baseline policies are only reported for the default sampling.
                        if policy in ('no_updates', 'update_all') and sampling != 'default': continue
                        mean_values, std_values = {}, {}
                        for metric in metrics:
                            if metric.startswith("Mean Evaluations For Survival"):
                                # Cells hold (score, evaluations) pairs, as tuples or as text.
                                pairs = [_parse_score_pair(x) for x in values[metric]]
                                scores = [pair[0] for pair in pairs]
                                evaluations = [pair[1] for pair in pairs]
                                # Aggregate across runs. The previous code took the mean/std
                                # of the *sum* of the evaluations, i.e. of a single number,
                                # which gave the sum and a std of 0.
                                # Stored order: (evaluations, score).
                                mean_values[f'Mean {metric}'] = (np.nanmean(evaluations), np.nanmean(scores))
                                std_values[f'Std {metric}'] = (np.nanstd(evaluations), np.nanstd(scores))
                            else:
                                mean_values[f'Mean {metric}'] = values[metric].mean()
                                std_values[f'Std {metric}'] = values[metric].std()
                        processed_rows.append({'Environment': env, 'Algorithm': alg, 'Policy': policy,
                                               'Sampling': sampling, 'Detection': detection, 'Re-Evaluation': reevaluation,
                                               **mean_values, **std_values})

processed_df = pd.DataFrame(processed_rows)

In [None]:
# Persist the aggregated table (one row per configuration) without the index.
processed_df.to_csv('statistical_data.csv', index=False)

In [None]:
# Display the aggregated table.
processed_df

In [None]:
# Reload the aggregated table from disk, replacing the in-memory frame.
processed_df = pd.read_csv('statistical_data.csv')
# File that `run_ttests` appends its rows to from here on.
results_fname = 'arxiv_results.csv'

In [None]:
def run_ttests(stats_testing_values, metrics_of_interest):
    """Run pairwise t-tests between configurations and append them to a CSV file.

    Every configuration that is not a baseline (its name contains neither
    ``'no_updates'`` nor ``'update_all'``) is compared with every other
    configuration that shares its first two ``'__'``-separated name fields.

    One line is appended to the file named by the module-level
    ``results_fname`` per comparison:
    ``metric,config,other_config,p_value,raw_p_value,significant``.

    * Comparisons against non-baseline configurations are Bonferroni-corrected
      as one family; ``p_value`` is the corrected value and ``raw_p_value`` the
      uncorrected one.
    * Comparisons against a baseline are left uncorrected; ``p_value`` is the
      raw value and the ``raw_p_value`` field is empty.

    The previous implementation chose between "correct everything" and
    "correct nothing" from whichever comparator happened to be last in the
    list, and failed or reused a stale name when a configuration had no
    comparator at all.

    Args:
        stats_testing_values: Mapping ``config name -> {metric: samples}``.
        metrics_of_interest: Metrics to test, in output order.
    """
    def _is_baseline(name):
        return 'no_updates' in name or 'update_all' in name

    def _group(name):
        # The first two name fields identify the group a comparison stays within.
        fields = name.split('__')
        return fields[0], fields[1]

    all_configs = list(stats_testing_values.keys())
    config_combinations = {
        config: [other for other in all_configs
                 if other != config and _group(other) == _group(config)]
        for config in all_configs
    }

    with open(results_fname, 'a+') as f:
        for metric in metrics_of_interest:
            for config, others in config_combinations.items():
                if _is_baseline(config) or not others:
                    continue

                raw_p_values = []
                for other_config in others:
                    _, pv = ttest_ind(stats_testing_values[config][metric], stats_testing_values[other_config][metric])
                    raw_p_values.append(pv)

                # Correct only the family of comparisons against non-baselines.
                p_values = list(raw_p_values)
                family = [i for i, other_config in enumerate(others) if not _is_baseline(other_config)]
                if family:
                    _, corrected_pvs, _, _ = multipletests([raw_p_values[i] for i in family], method='bonferroni')
                    for i, corrected in zip(family, corrected_pvs):
                        p_values[i] = corrected

                for other_config, raw_p, p_value in zip(others, raw_p_values, p_values):
                    raw_field = '' if _is_baseline(other_config) else raw_p
                    f.write(f'{metric},{config},{other_config},{p_value},{raw_field},{p_value < 0.05}')
                    f.write('\n')

RQ1: Two tables (one per environment) with Algorithm, Policy, Sampling/Detection/Reeval, Survival percentage

In [None]:
metrics_of_interest = ['Mean (Survival %)']
stats_testing_values = {}

# Export the per-run values behind this research question.
rq1_columns = shared_columns.copy()
rq1_columns.extend(metrics_of_interest)
not_rq1_columns = list(set(list(df.columns)).difference(rq1_columns))
rq1_raw_data = df.columns.drop(not_rq1_columns)
df[rq1_raw_data].to_csv('rq1.csv')


def _rows_for(frame, env, alg, policy, sampling, detection, reevaluation):
    """Return the rows of ``frame`` that belong to one configuration."""
    return frame.loc[(frame['Environment'] == env) & (frame['Algorithm'] == alg)
                     & (frame['Policy'] == policy) & (frame['Sampling'] == sampling)
                     & (frame['Detection'] == detection) & (frame['Re-Evaluation'] == reevaluation)]


for env in envs:
    table_rows = []
    # Number of runs assumed for the confidence interval, hard-coded per environment.
    n_runs = 10 if env == 'sphere' else 5

    for alg in algs:
        for sampling in samplings:
            for policy in policies:
                for detection in detections:
                    for reevaluation in reevaluations:
                        config = (env, alg, policy, sampling, detection, reevaluation)
                        values = _rows_for(processed_df, *config)
                        if len(values) == 0: continue
                        # Per-run samples for the t-tests come from the raw table.
                        raw_runs = _rows_for(df, *config)
                        key = '__'.join(map(str, [env, alg, sampling, policy, detection, reevaluation]))
                        stats_testing_values[key] = {metric: raw_runs[metric].values for metric in metrics_of_interest}
                        # Table cell: mean ± half-width of the normal-approximation 95% interval.
                        values_of_interest = {metric: f'${values[f"Mean {metric}"].values[0]:.3f} \\pm {(1.96 * values[f"Std {metric}"].values[0]) / np.sqrt(n_runs):.3f}$' for metric in metrics_of_interest}
                        table_rows.append({'Environment': env, 'Algorithm': alg, 'Policy': policy,
                                           'Sampling': sampling, 'Detection': detection, 'Re-Evaluation': reevaluation,
                                           **values_of_interest})

    df_view = pd.DataFrame(table_rows)
    print(df_view.to_latex(index=False))

# Run the tests once, after all environments were collected. Calling this inside
# the loop re-appended the rows of every earlier environment to the results file.
run_ttests(stats_testing_values, metrics_of_interest)
    

RQ2: Two tables (one per environment) with Algorithm, Policy, Sampling/Detection/Reeval, mean ± std MSEs

In [None]:
metrics_of_interest = ['Mean (MSE Objective)', 'Mean (MSE BC 1)', 'Mean (MSE BC 2)', 'Mean (MSE QD)']
stats_testing_values = {}

# Export the per-run values behind this research question.
rq2_columns = shared_columns.copy()
rq2_columns.extend(metrics_of_interest)
not_rq2_columns = list(set(list(df.columns)).difference(rq2_columns))
rq2_raw_data = df.columns.drop(not_rq2_columns)
df[rq2_raw_data].to_csv('rq2.csv')


def _rows_for(frame, env, alg, policy, sampling, detection, reevaluation):
    """Return the rows of ``frame`` that belong to one configuration."""
    return frame.loc[(frame['Environment'] == env) & (frame['Algorithm'] == alg)
                     & (frame['Policy'] == policy) & (frame['Sampling'] == sampling)
                     & (frame['Detection'] == detection) & (frame['Re-Evaluation'] == reevaluation)]


for env in envs:
    table_rows = []
    # Number of runs assumed for the confidence interval, hard-coded per environment.
    n_runs = 10 if env == 'sphere' else 5

    for alg in algs:
        for sampling in samplings:
            for policy in policies:
                for detection in detections:
                    for reevaluation in reevaluations:
                        config = (env, alg, policy, sampling, detection, reevaluation)
                        values = _rows_for(processed_df, *config)
                        if len(values) == 0: continue
                        # Per-run samples for the t-tests come from the raw table.
                        raw_runs = _rows_for(df, *config)
                        key = '__'.join(map(str, [env, alg, sampling, policy, detection, reevaluation]))
                        stats_testing_values[key] = {metric: raw_runs[metric].values for metric in metrics_of_interest}
                        # Table cell: mean ± half-width of the normal-approximation 95% interval.
                        values_of_interest = {metric: f'${values[f"Mean {metric}"].values[0]:.3f} \\pm {1.96 * (values[f"Std {metric}"].values[0] / np.sqrt(n_runs)):.3f}$' for metric in metrics_of_interest}
                        table_rows.append({'Environment': env, 'Algorithm': alg, 'Policy': policy,
                                           'Sampling': sampling, 'Detection': detection, 'Re-Evaluation': reevaluation,
                                           **values_of_interest})

    df_view = pd.DataFrame(table_rows)
    print(df_view.to_latex(index=False))

# Run the tests once, after all environments were collected. Calling this inside
# the loop re-appended the rows of every earlier environment to the results file.
run_ttests(stats_testing_values, metrics_of_interest)

RQ3: Two tables (one per environment) with Algorithm, Policy, Sampling/Detection/Reeval, rates of change (RoC)

In [None]:
metrics_of_interest = ['Rate of Change (MSE Objective)', 'Rate of Change (MSE QD)']
stats_testing_values = {}

# Export the per-run values behind this research question.
rq3_columns = shared_columns.copy()
rq3_columns.extend(metrics_of_interest)
not_rq3_columns = list(set(list(df.columns)).difference(rq3_columns))
rq3_raw_data = df.columns.drop(not_rq3_columns)
df[rq3_raw_data].to_csv('rq3.csv')


def _rows_for(frame, env, alg, policy, sampling, detection, reevaluation):
    """Return the rows of ``frame`` that belong to one configuration."""
    return frame.loc[(frame['Environment'] == env) & (frame['Algorithm'] == alg)
                     & (frame['Policy'] == policy) & (frame['Sampling'] == sampling)
                     & (frame['Detection'] == detection) & (frame['Re-Evaluation'] == reevaluation)]


# This question is currently restricted to one environment and to the default
# sampling; swap in `envs` / `samplings` to cover everything.
for env in ['lunar-lander']:
    table_rows = []
    # Number of runs assumed for the confidence interval, hard-coded per environment.
    n_runs = 10 if env == 'sphere' else 5

    for alg in algs:
        for sampling in ['default']:
            for policy in policies:
                for detection in detections:
                    for reevaluation in reevaluations:
                        config = (env, alg, policy, sampling, detection, reevaluation)
                        values = _rows_for(processed_df, *config)
                        if len(values) == 0: continue
                        # Per-run samples for the t-tests come from the raw table.
                        raw_runs = _rows_for(df, *config)
                        key = '__'.join(map(str, [env, alg, sampling, policy, detection, reevaluation]))
                        stats_testing_values[key] = {metric: raw_runs[metric].values for metric in metrics_of_interest}
                        # Table cell: mean ± half-width of the normal-approximation 95% interval.
                        values_of_interest = {metric: f'${values[f"Mean {metric}"].values[0]:.3f} \\pm {1.96 * (values[f"Std {metric}"].values[0] / np.sqrt(n_runs)):.3f}$' for metric in metrics_of_interest}
                        table_rows.append({'Environment': env, 'Algorithm': alg, 'Policy': policy,
                                           'Sampling': sampling, 'Detection': detection, 'Re-Evaluation': reevaluation,
                                           **values_of_interest})

    df_view = pd.DataFrame(table_rows)
    print(df_view.to_latex(index=False))

# Run the tests once, after the loop: inside the loop the cumulative dictionary
# would be written again for every additional environment.
run_ttests(stats_testing_values, metrics_of_interest)

RQ4: Two tables (one per environment) with Algorithm, Policy, Sampling/Detection/Reeval, mean+-std cost metrics (to compute)

In [None]:
metrics_of_interest = ["Mean Evaluations For Survival (50%)", "Mean Evaluations For Survival (75%)"]
stats_testing_values = {}

# Export the per-run values behind this research question.
rq4_columns = shared_columns.copy()
rq4_columns.extend(metrics_of_interest)
not_rq4_columns = list(set(list(df.columns)).difference(rq4_columns))
rq4_raw_data = df.columns.drop(not_rq4_columns)
df[rq4_raw_data].to_csv('rq4.csv')


def _rows_for(frame, env, alg, policy, sampling, detection, reevaluation):
    """Return the rows of ``frame`` that belong to one configuration."""
    return frame.loc[(frame['Environment'] == env) & (frame['Algorithm'] == alg)
                     & (frame['Policy'] == policy) & (frame['Sampling'] == sampling)
                     & (frame['Detection'] == detection) & (frame['Re-Evaluation'] == reevaluation)]


def _parse_pair(cell):
    """Return a two-element cell as a pair of floats, from a tuple or its text form."""
    if isinstance(cell, str):
        # After a CSV round trip the tuple is text such as "(0.5, 120.0)";
        # depending on the NumPy version each number may be wrapped as
        # "np.float64(...)".
        text = cell.replace('np.float64', '').replace('(', '').replace(')', '')
        cell = [part.strip() for part in text.split(',')]
    first, second = cell
    return float(first), float(second)


for env in envs:
    table_rows = []

    for alg in algs:
        for sampling in samplings:
            for policy in policies:
                for detection in detections:
                    for reevaluation in reevaluations:
                        config = (env, alg, policy, sampling, detection, reevaluation)
                        values = _rows_for(processed_df, *config)
                        if len(values) == 0: continue
                        # Per-run samples come from the raw table: the aggregated table only
                        # has "Mean ..." / "Std ..." columns, so indexing it with the metric
                        # name itself raised a KeyError. Element 1 of each raw pair is the
                        # mean number of evaluations.
                        raw_runs = _rows_for(df, *config)
                        key = '__'.join(map(str, [env, alg, sampling, policy, detection, reevaluation]))
                        stats_testing_values[key] = {metric: [_parse_pair(a)[1] for a in raw_runs[metric].values]
                                                     for metric in metrics_of_interest}
                        # The aggregated cell is a pair; it is text once the table was reloaded
                        # from CSV, so parse it before formatting.
                        values_of_interest = {}
                        for metric in metrics_of_interest:
                            first, second = _parse_pair(values[f"Mean {metric}"].values[0])
                            values_of_interest[metric] = f'${first:.3f} ({second:.2%})$'
                        table_rows.append({'Environment': env, 'Algorithm': alg, 'Policy': policy,
                                           'Sampling': sampling, 'Detection': detection, 'Re-Evaluation': reevaluation,
                                           **values_of_interest})

    df_view = pd.DataFrame(table_rows)
    print(df_view.to_latex(index=False))

# Run the tests once, after all environments were collected. Calling this inside
# the loop re-appended the rows of every earlier environment to the results file.
run_ttests(stats_testing_values, metrics_of_interest)

In [None]:
import json
import os

import numpy as np
import pandas as pd
from tqdm.auto import trange
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

# Stand-alone setup for the RQ4 significance tests.
results_fname = 'rq4_stats.csv'
metrics_of_interest = ["Mean Evaluations For Survival (50%)", "Mean Evaluations For Survival (75%)"]
stats_testing_values = {}

# From here on `processed_df` holds the rows stored in rq4.csv.
processed_df = pd.read_csv('rq4.csv')

# Columns that identify a run; every other column is treated as a metric.
shared_columns = ['run', 'Environment', 'Algorithm', 'Policy', 'Sampling', 'Detection', 'Re-Evaluation']
metrics = processed_df.columns.drop(shared_columns)

# Levels of each experimental factor. Environments and algorithms keep their
# order of first appearance; the remaining factors are sorted.
envs = processed_df['Environment'].unique()
algs = processed_df['Algorithm'].unique()
policies, samplings, detections, reevaluations = (
    sorted(processed_df[factor].unique())
    for factor in ('Policy', 'Sampling', 'Detection', 'Re-Evaluation')
)

def run_ttests(stats_testing_values, metrics_of_interest):
    """Run pairwise t-tests between configurations and append them to a CSV file.

    Every configuration that is not a baseline (its name contains neither
    ``'no_updates'`` nor ``'update_all'``) is compared with every other
    configuration that shares its first two ``'__'``-separated name fields.

    One line is appended to the file named by the module-level
    ``results_fname`` per comparison:
    ``metric,config,other_config,p_value,raw_p_value,significant``.

    * Comparisons against non-baseline configurations are Bonferroni-corrected
      as one family; ``p_value`` is the corrected value and ``raw_p_value`` the
      uncorrected one.
    * Comparisons against a baseline are left uncorrected; ``p_value`` is the
      raw value and the ``raw_p_value`` field is empty.

    The previous implementation chose between "correct everything" and
    "correct nothing" from whichever comparator happened to be last in the
    list, and failed or reused a stale name when a configuration had no
    comparator at all.

    Args:
        stats_testing_values: Mapping ``config name -> {metric: samples}``.
        metrics_of_interest: Metrics to test, in output order.
    """
    def _is_baseline(name):
        return 'no_updates' in name or 'update_all' in name

    def _group(name):
        # The first two name fields identify the group a comparison stays within.
        fields = name.split('__')
        return fields[0], fields[1]

    all_configs = list(stats_testing_values.keys())
    config_combinations = {
        config: [other for other in all_configs
                 if other != config and _group(other) == _group(config)]
        for config in all_configs
    }

    with open(results_fname, 'a+') as f:
        for metric in metrics_of_interest:
            for config, others in config_combinations.items():
                if _is_baseline(config) or not others:
                    continue

                raw_p_values = []
                for other_config in others:
                    _, pv = ttest_ind(stats_testing_values[config][metric], stats_testing_values[other_config][metric])
                    raw_p_values.append(pv)

                # Correct only the family of comparisons against non-baselines.
                p_values = list(raw_p_values)
                family = [i for i, other_config in enumerate(others) if not _is_baseline(other_config)]
                if family:
                    _, corrected_pvs, _, _ = multipletests([raw_p_values[i] for i in family], method='bonferroni')
                    for i, corrected in zip(family, corrected_pvs):
                        p_values[i] = corrected

                for other_config, raw_p, p_value in zip(others, raw_p_values, p_values):
                    raw_field = '' if _is_baseline(other_config) else raw_p
                    f.write(f'{metric},{config},{other_config},{p_value},{raw_field},{p_value < 0.05}')
                    f.write('\n')


# For every configuration present in the table, collect the first number of
# each stored "(a, b)" cell as the per-run sample of each metric.
for env in envs:
    for alg in algs:
        for sampling in samplings:
            for policy in policies:
                for detection in detections:
                    for reevaluation in reevaluations:
                        same_config = ((processed_df['Environment'] == env) & (processed_df['Algorithm'] == alg)
                                       & (processed_df['Policy'] == policy) & (processed_df['Sampling'] == sampling)
                                       & (processed_df['Detection'] == detection) & (processed_df['Re-Evaluation'] == reevaluation))
                        values = processed_df.loc[same_config]
                        if len(values) == 0:
                            continue
                        config_key = '__'.join([env, alg, sampling, policy, detection, reevaluation])
                        stats_testing_values[config_key] = {
                            metric: [float(a.replace('(', '').replace(')', '').split(', ')[0])
                                     for a in values[metric].values[:]]
                            for metric in metrics_of_interest
                        }

In [None]:
# Print, per configuration and metric, 1.96 * std / sqrt(n) of the collected samples.
for config_name, samples_by_metric in stats_testing_values.items():
    for metric_name, samples in samples_by_metric.items():
        half_width = 1.96 * np.std(samples) / np.sqrt(len(samples))
        print(config_name, metric_name, half_width)