## Statistical Significance Test:

In [18]:
# Load packages:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
import numpy as np
import statsmodels.stats.anova as anova
import os
import scipy.stats as stats
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd

#source code: https://s-nako.work/2020/01/paired-one-way-anova-and-multiple-comparisons-in-python-2/

# Location of the results workbook (one sheet per performance metric):
home_path = '/home/obsegment/code/ResearchDataset/CLEAR/Results'
stats_file = 'Model_Comparison_Stats_AllMetrics.xlsx'
stats_path = os.path.join(home_path, stats_file)

# Load the three metric sheets with a single read_excel call;
# passing a list of sheet names returns a dict keyed by sheet name:
metric_sheets = pd.read_excel(stats_path, sheet_name=['Dice', 'HD', 'Jaccard'])
dice_df = metric_sheets['Dice']
HD_df = metric_sheets['HD']
jaccard_df = metric_sheets['Jaccard']

# Summary statistics for the numeric columns of dice_df:
dice_df.describe()

Unnamed: 0,ResUNet,aUNet,SegResNet,UNETR
count,26.0,26.0,26.0,26.0
mean,0.803407,0.797915,0.804491,0.61689
std,0.08362,0.081937,0.070122,0.062837
min,0.551044,0.598903,0.649766,0.504893
25%,0.7655,0.760667,0.748503,0.56455
50%,0.816232,0.817924,0.805071,0.61306
75%,0.859328,0.865737,0.865638,0.664074
max,0.917361,0.896627,0.898376,0.730635


In [19]:
# Function to run a one-way repeated-measures (paired) ANOVA on one performance metric, across all 4 model types:
# The filenames are the subjects and the model columns are the conditions; the same subjects (filenames) appear under every model
def one_way_paired_anova(df, conditions=('ResUNet', 'aUNet', 'SegResNet', 'UNETR'), subject_col='Filename'):
    """Run a one-way repeated-measures ANOVA across model columns and print it.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one row per subject. Must contain ``subject_col`` and
        every column named in ``conditions``.
    conditions : sequence of str, optional
        Columns to compare; each column is one within-subject condition.
        Defaults to the four model columns used in this notebook.
    subject_col : str, optional
        Column identifying the subject of each row (default ``'Filename'``).

    Returns
    -------
    The fitted result object returned by ``AnovaRM(...).fit()``.

    Raises
    ------
    KeyError
        If ``subject_col`` or one of ``conditions`` is not a column of ``df``.

    Notes
    -----
    AnovaRM expects a balanced design (one observation per subject and
    condition), so duplicated subject identifiers will make it fail.
    """
    conditions = list(conditions)

    # Reshape wide -> long: one row per (subject, condition) pair.
    # Rows are stacked condition by condition, in the order of ``conditions``.
    anova_df = (
        df[[subject_col] + conditions]
        .melt(
            id_vars=subject_col,
            value_vars=conditions,
            var_name='Conditions',
            value_name='Point',
        )
        .rename(columns={subject_col: 'Subjects'})
    )

    # 'Point' is the dependent variable, 'Subjects' the repeated-measures
    # identifier and 'Conditions' the single within-subject factor.
    aov = anova.AnovaRM(anova_df, 'Point', 'Subjects', ['Conditions'])
    result = aov.fit()
    print(result)

    return result

In [20]:
# Run the repeated-measures ANOVA once per metric sheet, printing a heading
# before each result table.
anova_results = []
for heading, metric_df in (
    ("Dice ANOVA:", dice_df),
    ("HD ANOVA:", HD_df),
    ("Jaccard ANOVA:", jaccard_df),
):
    print(heading)
    anova_results.append(one_way_paired_anova(metric_df))

# Keep the individual results available under their per-metric names.
dice_aov_result, HD_aov_result, jaccard_aov_result = anova_results


Dice ANOVA:
                 Anova
           F Value Num DF  Den DF Pr > F
----------------------------------------
Conditions 84.1820 3.0000 75.0000 0.0000

HD ANOVA:
                 Anova
           F Value Num DF  Den DF Pr > F
----------------------------------------
Conditions 79.1470 3.0000 75.0000 0.0000

Jaccard ANOVA:
                  Anova
           F Value  Num DF  Den DF Pr > F
-----------------------------------------
Conditions 100.3863 3.0000 75.0000 0.0000



# Paired T-test
Given the small ANOVA p-values, it is worthwhile to do pairwise comparisons between the models, with a post-hoc Bonferroni adjustment for multiple comparisons:


In [21]:
def t_test(metric_df, models=('ResUNet', 'aUNet', 'SegResNet', 'UNETR'), alpha=0.05):
    """Pairwise paired t-tests between model columns, Bonferroni-adjusted.

    Parameters
    ----------
    metric_df : pandas.DataFrame
        Table with one column of scores per model; rows are the paired
        observations.
    models : sequence of str, optional
        Columns to compare. Every unordered pair is tested. Defaults to the
        four model columns used in this notebook.
    alpha : float, optional
        Family-wise error rate passed to ``multipletests`` (default 0.05).

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``'group 1'``, ``'group 2'``,
        ``'p-orig'``, ``'p-adj'`` and ``'reject'``. The table is also
        displayed.

    Raises
    ------
    ValueError
        If fewer than two models are given (there is nothing to compare).
    """
    models = list(models)
    if len(models) < 2:
        raise ValueError("t_test needs at least two model columns to compare")

    # Every unordered pair, in column order: (m0, m1), (m0, m2), ..., (m2, m3).
    pairs = [
        (first, second)
        for idx, first in enumerate(models)
        for second in models[idx + 1:]
    ]
    group1_list_names = [first for first, _ in pairs]
    group2_list_names = [second for _, second in pairs]

    # Calculate the individual paired t-test p-value for each pair of models.
    pvals = [
        stats.ttest_rel(metric_df[first], metric_df[second])[1]
        for first, second in pairs
    ]

    # multipletests returns (reject flags, corrected p-values, ...).
    reject, p_corrected = multipletests(pvals, alpha=alpha, method='bonferroni')[:2]

    Ttest_df = pd.DataFrame({
        'group 1': group1_list_names,
        'group 2': group2_list_names,
        'p-orig': pvals,
        'p-adj': p_corrected,
        'reject': reject,
    })
    display(Ttest_df)

    return Ttest_df
    
# Paired t-tests with Bonferroni correction: one table per metric sheet,
# each preceded by its heading.
ttest_tables = []
for heading, metric_df in (
    ("DICE Result =", dice_df),
    ("HD Result =", HD_df),
    ("jaccard Result =", jaccard_df),
):
    print(heading)
    ttest_tables.append(t_test(metric_df))

# Keep the individual tables available under their per-metric names.
Dice_Ttest, HD_Ttest, jaccard_Ttest = ttest_tables


DICE Result =


Unnamed: 0,group 1,group 2,p-orig,p-adj,reject
0,ResUNet,aUNet,0.5830362,1.0,False
1,ResUNet,SegResNet,0.9236903,1.0,False
2,ResUNet,UNETR,5.941859e-11,3.565115e-10,True
3,aUNet,SegResNet,0.5314781,1.0,False
4,aUNet,UNETR,3.238173e-10,1.942904e-09,True
5,SegResNet,UNETR,1.909224e-11,1.145534e-10,True


HD Result =


Unnamed: 0,group 1,group 2,p-orig,p-adj,reject
0,ResUNet,aUNet,0.02450429,0.1470258,False
1,ResUNet,SegResNet,0.03473909,0.2084345,False
2,ResUNet,UNETR,5.687948e-12,3.412769e-11,True
3,aUNet,SegResNet,0.4420089,1.0,False
4,aUNet,UNETR,5.072735e-10,3.043641e-09,True
5,SegResNet,UNETR,1.422383e-09,8.534298e-09,True


jaccard Result =


Unnamed: 0,group 1,group 2,p-orig,p-adj,reject
0,ResUNet,aUNet,0.5004394,1.0,False
1,ResUNet,SegResNet,0.9326394,1.0,False
2,ResUNet,UNETR,8.268026e-12,4.960815e-11,True
3,aUNet,SegResNet,0.5713051,1.0,False
4,aUNet,UNETR,4.413751e-11,2.648251e-10,True
5,SegResNet,UNETR,2.501026e-12,1.500616e-11,True


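Alternatively, a pairwise Tukey HSD test could be performed (note that Tukey HSD treats the groups as independent samples, so unlike the paired t-tests above it does not use the per-file pairing):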

In [25]:
def tukey_hsd(group_names, *args):
    """Print a pairwise Tukey HSD comparison of the given samples.

    group_names : labels, one per sample, in the same order as ``args``.
    *args       : one sequence of observations per group.

    Nothing is returned; the summary table is only printed.
    """
    # Flatten all samples into a single vector of observations.
    values = np.hstack(args)
    # Label every observation with the name of the sample it came from,
    # keeping the labels aligned with ``values``.
    labels = np.array([
        group_names[idx]
        for idx, sample in enumerate(args)
        for _ in range(len(sample))
    ])
    summary = pairwise_tukeyhsd(values, labels)
    print(summary)

# Tukey HSD on the same four model columns of every metric sheet.
model_names = ['ResUNet', 'aUNet', 'SegResNet', 'UNETR']
for heading, metric_df in (
    ("Dice results are:", dice_df),
    ("HD results are:", HD_df),
    ("Jaccard results are:", jaccard_df),
):
    print(heading)
    tukey_hsd(model_names, *(metric_df[name] for name in model_names))

Dice results are:
   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
  group1    group2  meandiff p-adj  lower   upper  reject
---------------------------------------------------------
  ResUNet SegResNet   0.0011   0.9 -0.0534  0.0555  False
  ResUNet     UNETR  -0.1865 0.001  -0.241 -0.1321   True
  ResUNet     aUNet  -0.0055   0.9 -0.0599  0.0489  False
SegResNet     UNETR  -0.1876 0.001  -0.242 -0.1332   True
SegResNet     aUNet  -0.0066   0.9  -0.061  0.0479  False
    UNETR     aUNet    0.181 0.001  0.1266  0.2355   True
---------------------------------------------------------
HD results are:
    Multiple Comparison of Means - Tukey HSD, FWER=0.05     
  group1    group2  meandiff p-adj   lower    upper   reject
------------------------------------------------------------
  ResUNet SegResNet   6.1939 0.5369  -5.9076  18.2954  False
  ResUNet     UNETR   51.744  0.001  39.6425  63.8455   True
  ResUNet     aUNet   7.8842  0.328  -4.2174  19.9857  False
SegResNet     UNETR 