# Synthetic data results
- from statsmodels.stats.anova import AnovaRM

In [1]:
import pandas as pd
from typing import List
import itertools
import os
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# get the final value of a metric for a method
def final_value(path:str, method:str, metric:str) -> pd.Series:
    df = pd.read_csv(f"{path}/{method}_{metric}.csv")
    return df.iloc[:, -1]

def summary_table(path:str, methods:List[str], metrics:List[str]) -> pd.DataFrame:  
    results = []
    for method in methods:
        for metric in metrics:
            mean, std = final_value(path, method, metric).mean(), final_value(path, method, metric).std()
            results.append([method, metric, mean, std])
    return pd.DataFrame(results, columns=['method', 'metric', 'mean', 'std'])


def table_res_column(path: str, metric: str) -> pd.DataFrame:
    """Create a results table for a specific metric and path"""
    # Define methods
    methods = ['mine', 'baseline', 'sliding', 'oob_single', 'oob', 'areba']
    
    # Get final values for each method
    results = []
    for method in methods:
        mean, std = final_value(path, method, metric).mean(), final_value(path, method, metric).std()
        results.append([mean, std])
    
    # Create DataFrame
    columns = [f'{m} mean' for m in methods] + [f'{m} std' for m in methods]
    df = pd.DataFrame(results, columns=['mean', 'std'])
    
    # Format into wide format
    final = pd.DataFrame({
        'proposing mean': df.loc[0, 'mean'],
        'proposing std': df.loc[0, 'std'],
        'baseline mean': df.loc[1, 'mean'],
        'baseline std': df.loc[1, 'std'],
        'sliding mean': df.loc[2, 'mean'],
        'sliding std': df.loc[2, 'std'],
        'oob_single mean': df.loc[3, 'mean'],
        'oob_single std': df.loc[3, 'std'],
        'oob mean': df.loc[4, 'mean'],
        'oob std': df.loc[4, 'std'],
        'areba mean': df.loc[5, 'mean'],
        'areba std': df.loc[5, 'std']
    }, index=[0])
    
    # Label the row with experiment details
    imtype = path.split('/')[-2]
    ir = path.split('/')[-1]
    final.index = [f"{imtype} {ir}"]
    
    return final

def table_res_boundary(boundary: str, metric: str) -> pd.DataFrame:
    """Create a summary table for all scenarios under a boundary"""
    result_path = "<your path>/res"
    base_path = f"{result_path}/{boundary}"
    
    # Get all combinations of imtype and ir
    scenarios = list(itertools.product(['safe', 'borderline', 'noise'], ['10', '1', '0']))
    
    # Initialize empty DataFrame
    summary_table = pd.DataFrame()
    
    # Process each scenario
    for imtype, ir in scenarios:
        path = f"{base_path}/{imtype}/{ir}"
        if os.path.exists(path):
            # Get results for this scenario
            scenario_results = table_res_column(path, metric)
            # Concatenate with main table
            summary_table = pd.concat([summary_table, scenario_results])
    
    return summary_table

## one table

In [2]:
result_path = "<your path>/res"
boundary = 'sea' # 'sine', 'sea', 'circle'
imtype = 'borderline' # 'safe', 'borderline', 'noise'
ir = '10' # '10', '1', '0'
# path
path = f"{result_path}/{boundary}/{imtype}/{ir}"
# methods and metrics
methods = ['baseline', 'sliding', 'oob_single', 'areba', 'mine'] #'oob', 
metrics = ['rec0', 'rec1', 'gmu']
summary_table(path, methods, metrics)

Unnamed: 0,method,metric,mean,std
0,baseline,rec0,0.991,0.009
1,baseline,rec1,0.096,0.037
2,baseline,gmu,0.302,0.059
3,sliding,rec0,0.969,0.012
4,sliding,rec1,0.344,0.053
5,sliding,gmu,0.575,0.044
6,oob_single,rec0,0.787,0.053
7,oob_single,rec1,0.799,0.031
8,oob_single,gmu,0.792,0.028
9,areba,rec0,0.86,0.102


## Full results table
- summary per boundary

In [3]:
table_res_boundary('sine', 'gmu')

Unnamed: 0,proposing mean,proposing std,baseline mean,baseline std,sliding mean,sliding std,oob_single mean,oob_single std,oob mean,oob std,areba mean,areba std
safe 10,0.886,0.025,0.568,0.146,0.821,0.068,0.809,0.027,0.823,0.021,0.802,0.101
safe 1,0.843,0.057,0.002,0.021,0.386,0.177,0.443,0.116,0.54,0.104,0.58,0.136
safe 0,0.343,0.351,0.0,0.0,0.005,0.045,0.008,0.057,0.027,0.113,0.188,0.285
borderline 10,0.752,0.034,0.207,0.136,0.592,0.086,0.671,0.037,0.694,0.03,0.618,0.171
borderline 1,0.712,0.079,0.0,0.0,0.114,0.144,0.297,0.123,0.395,0.135,0.329,0.188
borderline 0,0.261,0.316,0.0,0.0,0.0,0.0,0.004,0.045,0.023,0.107,0.074,0.196
noise 10,0.627,0.042,0.27,0.147,0.515,0.057,0.601,0.025,0.609,0.023,0.482,0.185


In [4]:
table_res_boundary('sea', 'gmu')

Unnamed: 0,proposing mean,proposing std,baseline mean,baseline std,sliding mean,sliding std,oob_single mean,oob_single std,oob mean,oob std,areba mean,areba std
safe 10,0.985,0.005,0.981,0.007,0.989,0.006,0.982,0.007,0.99,0.005,0.982,0.014
safe 1,0.974,0.023,0.558,0.115,0.857,0.053,0.884,0.042,0.937,0.033,0.924,0.041
safe 0,0.749,0.352,0.0,0.0,0.162,0.273,0.103,0.225,0.141,0.265,0.675,0.392
borderline 10,0.799,0.03,0.302,0.059,0.575,0.044,0.792,0.028,0.813,0.026,0.68,0.197
borderline 1,0.758,0.064,0.0,0.0,0.088,0.116,0.436,0.124,0.568,0.097,0.333,0.201
borderline 0,0.194,0.29,0.0,0.0,0.0,0.0,0.005,0.05,0.038,0.134,0.056,0.172
noise 10,0.682,0.032,0.65,0.024,0.645,0.025,0.671,0.021,0.673,0.02,0.627,0.068


In [5]:
table_res_boundary('circle', 'gmu')

Unnamed: 0,proposing mean,proposing std,baseline mean,baseline std,sliding mean,sliding std,oob_single mean,oob_single std,oob mean,oob std,areba mean,areba std
safe 10,0.907,0.019,0.536,0.13,0.908,0.012,0.853,0.025,0.876,0.018,0.867,0.084
safe 1,0.891,0.04,0.0,0.0,0.383,0.151,0.394,0.136,0.471,0.12,0.627,0.112
safe 0,0.487,0.409,0.0,0.0,0.0,0.0,0.015,0.076,0.028,0.116,0.234,0.319
borderline 10,0.749,0.033,0.028,0.058,0.703,0.027,0.673,0.049,0.701,0.033,0.666,0.155
borderline 1,0.734,0.058,0.0,0.0,0.103,0.146,0.313,0.115,0.405,0.109,0.386,0.155
borderline 0,0.285,0.335,0.0,0.0,0.0,0.0,0.009,0.067,0.029,0.12,0.06,0.176
noise 10,0.612,0.039,0.058,0.094,0.548,0.037,0.572,0.058,0.594,0.035,0.474,0.238


## Statistical test
- ANOVA
- Tukey's HSD

In [6]:
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

def statistical_analysis(boundary: str, metric: str):
    """Perform ANOVA and Tukey's HSD test for a specific boundary and metric"""
    result_path = "<your path>/res"
    methods = ['mine', 'baseline', 'sliding', 'oob_single','oob', 'areba']
    
    # Get all combinations of imtype and ir
    scenarios = list(itertools.product(['safe', 'borderline', 'noise'], ['10', '1', '0']))
    
    print(f"\nStatistical Analysis for {boundary} boundary - {metric} metric")
    print("-" * 60)
    
    for imtype, ir in scenarios:
        path = f"{result_path}/{boundary}/{imtype}/{ir}"
        if os.path.exists(path):
            # Collect data
            data_values = []
            method_labels = []
            
            for method in methods:
                values = final_value(path, method, metric)
                data_values.extend(values)
                method_labels.extend([method] * len(values))
            
            # Perform one-way ANOVA
            method_groups = [final_value(path, method, metric) for method in methods]
            f_stat, p_value = stats.f_oneway(*method_groups)
            
            # Perform Tukey's HSD test
            tukey = pairwise_tukeyhsd(data_values, method_labels)
            
            # Get p-values
            from statsmodels.stats.multicomp import MultiComparison
            mc = MultiComparison(data_values, method_labels)
            precise_pvalues = mc.tukeyhsd().pvalues
            
            # Print results
            print(f"\nScenario: {imtype} {ir}")
            print(f"ANOVA results:")
            print(f"F-statistic = {f_stat:.3f}")
            print(f"p-value = {p_value:.3f}")
            
            print("\nTukey's HSD test results:")
            print("=" * 80)
            print(f"{'group1':>10} {'group2':>10} {'meandiff':>10} {'p-value':>10} {'lower':>10} {'upper':>10} {'reject':>6}")
            print("-" * 80)
            
            for idx, row in enumerate(tukey._results_table.data[1:]):
                print(f"{row[0]:>10} {row[1]:>10} {float(row[2]):>10.3f} {precise_pvalues[idx]:>10.3f} "
                      f"{float(row[3]):>10.3f} {float(row[4]):>10.3f} {row[5]:>6}")


In [7]:
statistical_analysis('sine', 'gmu')


Statistical Analysis for sine boundary - gmu metric
------------------------------------------------------------

Scenario: safe 10
ANOVA results:
F-statistic = 193.669
p-value = 0.000

Tukey's HSD test results:
    group1     group2   meandiff    p-value      lower      upper reject
--------------------------------------------------------------------------------
     areba   baseline     -0.234      0.000      0.000     -0.266 -0.2019
     areba       mine      0.084      0.000      0.000      0.052 0.1162
     areba        oob      0.022      0.386      0.386     -0.011 0.0537
     areba oob_single      0.008      0.984      0.984     -0.024 0.0398
     areba    sliding      0.019      0.532      0.532     -0.013 0.0512
  baseline       mine      0.318      0.000      0.000      0.286 0.3502
  baseline        oob      0.256      0.000      0.000      0.224 0.2877
  baseline oob_single      0.242      0.000      0.000      0.209 0.2737
  baseline    sliding      0.253      0.000     

In [8]:
statistical_analysis('sea', 'gmu')


Statistical Analysis for sea boundary - gmu metric
------------------------------------------------------------

Scenario: safe 10
ANOVA results:
F-statistic = 22.916
p-value = 0.000

Tukey's HSD test results:
    group1     group2   meandiff    p-value      lower      upper reject
--------------------------------------------------------------------------------
     areba   baseline     -0.001      0.834      0.834     -0.004 0.0018
     areba       mine      0.002      0.246      0.246     -0.001 0.0056
     areba        oob      0.007      0.000      0.000      0.004 0.0103
     areba oob_single     -0.001      0.997      0.997     -0.004 0.0026
     areba    sliding      0.007      0.000      0.000      0.004   0.01
  baseline       mine      0.004      0.010      0.010      0.001  0.007
  baseline        oob      0.009      0.000      0.000      0.005 0.0117
  baseline oob_single      0.001      0.979      0.979     -0.002  0.004
  baseline    sliding      0.008      0.000      0.

In [9]:
statistical_analysis('circle', 'gmu')


Statistical Analysis for circle boundary - gmu metric
------------------------------------------------------------

Scenario: safe 10
ANOVA results:
F-statistic = 484.800
p-value = 0.000

Tukey's HSD test results:
    group1     group2   meandiff    p-value      lower      upper reject
--------------------------------------------------------------------------------
     areba   baseline     -0.331      0.000      0.000     -0.358 -0.3051
     areba       mine      0.040      0.000      0.000      0.013 0.0661
     areba        oob      0.009      0.932      0.932     -0.018  0.035
     areba oob_single     -0.014      0.622      0.622     -0.041 0.0119
     areba    sliding      0.040      0.000      0.000      0.014 0.0666
  baseline       mine      0.371      0.000      0.000      0.345 0.3975
  baseline        oob      0.340      0.000      0.000      0.314 0.3665
  baseline oob_single      0.317      0.000      0.000      0.291 0.3433
  baseline    sliding      0.372      0.000   