# Statistical significance: ANOVA followed by Tukey HSD test

In [None]:
import pandas as pd

In [None]:
# For each endpoint, the result directories to read, in this fixed order:
#   [0] model trained on all sources, [1] divergent source only,
#   [2] the remaining (homogenous) sources only.
endpoints_cases_dict = {
    'half_life': [
        'Fan_Lombardo_DDPD_eDrug3D_Obach',
        'Fan',
        'Lombardo_DDPD_eDrug3D_Obach',
    ],
    'clearance': [
        'Lombardo_Astrazeneca_Iwata_Obach_GombarHall_Varma2009_Varma2010',
        'Astrazeneca',
        'Lombardo_Iwata_Obach_GombarHall_Varma2009_Varma2010',
    ],
}

# For each endpoint: [list of homogenous source names, divergent source name].
endpoints_sources_dict = {
    'half_life': [
        ['Obach', 'Lombardo', 'eDrug3D', 'DDPD'],
        'Fan',
    ],
    'clearance': [
        ['Lombardo', 'Iwata', 'Obach', 'GombarHall', 'Varma2009', 'Varma2010'],
        'Astrazeneca',
    ],
}

# One (initially empty) list of per-source dataframes per endpoint.
df_dict = {endpoint_name: [] for endpoint_name in endpoints_cases_dict}

In [None]:
def _method_frame(results, source, method):
    """Return the rows of `results` for one source, labelled with `method`.

    Keeps the 'seed', 'fold' and 'r2' columns of the rows whose 'ref' equals
    `source`, and adds:
      - 'cv_cycle': '<seed>_<fold>' identifier built from those two columns
      - 'method':   the given label
    """
    # Select rows and columns in a single .loc call and copy explicitly, so
    # the column assignments below act on an independent frame. The previous
    # chained form (df[cols].loc[mask] followed by assignment) triggers
    # SettingWithCopyWarning on pandas versions without Copy-on-Write.
    frame = results.loc[results['ref'] == source, ['seed', 'fold', 'r2']].copy()
    frame['cv_cycle'] = frame['seed'].astype(str) + '_' + frame['fold'].astype(str)
    frame['method'] = method
    return frame


def _comparison_frame(subset_results, all_results, source, method):
    """Stack the `method` rows and the 'all' rows for one source.

    `subset_results` supplies the rows labelled `method`; `all_results`
    supplies the rows labelled 'all'. The stacked frame additionally gets a
    constant 'split' column ('random') and a 'source' column.
    """
    combined = pd.concat(
        [
            _method_frame(subset_results, source, method),
            _method_frame(all_results, source, 'all'),
        ],
        axis=0,
    )
    combined['split'] = 'random'
    combined['source'] = source
    return combined


for endpoint, cases_list in endpoints_cases_dict.items():
    # Start from an empty list so that re-running this cell does not append
    # the same frames a second time.
    df_dict[endpoint] = []

    # Load data
    all_results = pd.read_csv(f'../results/{endpoint}/{cases_list[0]}/XGBoost_rdkit_ecfp4_metrics_folds.tsv', sep='\t')
    divergent_results = pd.read_csv(f'../results/{endpoint}/{cases_list[1]}/XGBoost_rdkit_ecfp4_metrics_folds.tsv', sep='\t')
    homogenous_results = pd.read_csv(f'../results/{endpoint}/{cases_list[2]}/XGBoost_rdkit_ecfp4_metrics_folds.tsv', sep='\t')

    homogenous_sources, divergent_source = endpoints_sources_dict[endpoint]

    # Build dataframes: one per homogenous source ('homogenous' vs 'all') ...
    for source in homogenous_sources:
        df_homogenous = _comparison_frame(homogenous_results, all_results, source, 'homogenous')
        df_dict[endpoint].append(df_homogenous)

    # ... followed by one for the divergent source ('divergent' vs 'all').
    df_divergent = _comparison_frame(divergent_results, all_results, divergent_source, 'divergent')
    df_dict[endpoint].append(df_divergent)

##  Examine the parametric testing assumptions

### The independence assumption

Using an appropriate sampling scheme (such as 5x5 repeated cross-validation) is important to ensure that the samples are sufficiently independent.

### The homogeneity of variances assumption: Levene test

In [None]:
from scipy.stats import levene

In [None]:
# Run Levene's test on the R2 values of each method group, per endpoint and source.
for endpoint in df_dict:
    print(f'\n{endpoint}\n')

    for df in df_dict[endpoint]:
        source_label = df["source"].values[0]
        print(f'Source: {source_label}')

        # One list of R2 values per method (groupby yields them in sorted key order).
        r2_by_method = [r2_values.tolist() for _, r2_values in df.groupby('method')['r2']]
        levene_result = levene(*r2_by_method)
        pvalue = levene_result[1]
        print(f'Levene test for R2: p-value = {pvalue}')

### The normality assumption

In [None]:
from model_comparison import make_normality_diagnostic

In [None]:
# Produce the normality diagnostic for R2, per endpoint and source.
for endpoint in df_dict:
    print(f'\n{endpoint}\n')

    for df in df_dict[endpoint]:
        source_label = df["source"].values[0]
        print(f'Source: {source_label}')

        # Hand over a copy so the stored frame is not touched by the helper.
        frame_copy = df.copy()
        make_normality_diagnostic(frame_copy, ['r2'])

## Perform repeated measures ANOVA

In [None]:
from model_comparison import make_boxplots_parametric, rm_tukey_hsd

In [None]:
# Draw the parametric boxplots for R2, per endpoint and source.
for endpoint in df_dict:
    print(f'\n{endpoint}\n')

    for df in df_dict[endpoint]:
        source_label = df["source"].values[0]
        print(f'Source: {source_label}')

        # Hand over a copy so the stored frame is not touched by the helper.
        frame_copy = df.copy()
        make_boxplots_parametric(frame_copy, ['r2'])

### Tukey HSD Test

In [None]:
# Run the Tukey HSD comparison of R2 between methods, per endpoint and source.
for endpoint in df_dict:
    print(f'\n{endpoint}\n')

    for df in df_dict[endpoint]:
        source_label = df["source"].values[0]
        print(f'Source: {source_label}')

        # Only the first element of the helper's return value is kept and printed.
        tukey_output = rm_tukey_hsd(df, "r2", "method")
        tukey_results = tukey_output[0]
        print(tukey_results)

In [None]:
# Show the result kept from the final iteration of the Tukey loop above;
# results for the earlier endpoint/source pairs appear only in its printed output.
tukey_results