# Statistical significance: ANOVA followed by Tukey HSD test

In [None]:
import pandas as pd

In [None]:
# Endpoint name -> name of the data source used as the "divergent" source
# for that endpoint (it appears in the result paths and in the `ref` filter).
endpoint_divergent_dict = dict(half_life='Fan', clearance='Astrazeneca')

# Endpoint name -> [homogenous dataframe, divergent dataframe]; filled by the
# loading loop.
df_dict = dict()

In [None]:
def _select_r2_folds(results, ref, method):
    """Return the per-fold R2 rows of ``results`` whose ``ref`` equals ``ref``.

    Args:
        results: Metrics table with at least the columns ``seed``, ``fold``,
            ``r2`` and ``ref``.
        ref: Value of the ``ref`` column to keep.
        method: Label written to the new ``method`` column.

    Returns:
        A new dataframe with columns ``seed``, ``fold``, ``r2``, ``cv_cycle``
        (``"<seed>_<fold>"``) and ``method``. The original row index is kept.
    """
    # Explicit copy: the column assignments below must only ever touch this
    # subset, never the table it was selected from.
    subset = results.loc[results['ref'] == ref, ['seed', 'fold', 'r2']].copy()
    subset['cv_cycle'] = subset['seed'].astype(str) + '_' + subset['fold'].astype(str)
    subset['method'] = method
    return subset


for endpoint, divergent_source in endpoint_divergent_dict.items():
    # Load the per-fold metrics of the three training set-ups: homogenous
    # sources only, divergent source only, and both combined.
    results_dir = f'../results_scaling/{endpoint}'
    metrics_file = 'XGBoost_rdkit_ecfp4_metrics_folds.tsv'
    homogenous_results = pd.read_csv(f'{results_dir}/Homogenous/{metrics_file}', sep='\t')
    divergent_results = pd.read_csv(f'{results_dir}/{divergent_source}/{metrics_file}', sep='\t')
    homogenous_divergent_results = pd.read_csv(
        f'{results_dir}/Homogenous_{divergent_source}/{metrics_file}', sep='\t'
    )

    # Rows with ref == 'Homogenous': homogenous-only run vs. combined run.
    df_homogenous = pd.concat(
        [
            _select_r2_folds(homogenous_results, 'Homogenous', 'homogenous'),
            _select_r2_folds(
                homogenous_divergent_results, 'Homogenous', 'homogenous_divergent_scaling'
            ),
        ],
        axis=0,
    )
    df_homogenous['split'] = 'random'

    # Rows with ref == divergent source: divergent-only run vs. combined run.
    df_divergent = pd.concat(
        [
            _select_r2_folds(divergent_results, divergent_source, 'divergent'),
            _select_r2_folds(
                homogenous_divergent_results, divergent_source, 'homogenous_divergent_scaling'
            ),
        ],
        axis=0,
    )
    df_divergent['split'] = 'random'

    # Store both comparison tables: index 0 = homogenous, index 1 = divergent.
    df_dict[endpoint] = [df_homogenous, df_divergent]

## Examine the parametric testing assumptions

### The independence assumption

Using an appropriate sampling mechanism (such as 5x5 repeated CV) is important to ensure that the samples are sufficiently independent.

### The homogeneity of variances assumption: Levene test

In [None]:
from scipy.stats import levene

In [None]:
# Levene test on the R2 values grouped by `method`, run separately on the
# two dataframes stored for each endpoint.
for endpoint, df_list in df_dict.items():
    print(f'\n{endpoint}\n')

    labelled_frames = (
        ('Homogenous sources', df_list[0]),
        ('Divergent source', df_list[1]),
    )
    for heading, frame in labelled_frames:
        # One list of R2 values per method, in sorted method order.
        r2_samples = [
            values.tolist() for _method, values in frame.groupby('method')['r2']
        ]
        print(heading)
        outcome = levene(*r2_samples)
        pvalue = outcome.pvalue
        print(f'Levene test for R2: p-value = {pvalue}')

### The normality assumption

In [None]:
from model_comparison import make_normality_diagnostic

In [None]:
# Run the normality diagnostic on the R2 column of both dataframes of every
# endpoint.
for endpoint, df_list in df_dict.items():
    print(f'\n{endpoint}\n')

    for position, heading in enumerate(('Homogenous sources', 'Divergent source')):
        print(heading)
        # Pass a copy so the stored dataframe is not handed to the helper.
        frame_copy = df_list[position].copy()
        make_normality_diagnostic(frame_copy, ['r2'])

## Perform repeated measures ANOVA

In [None]:
from model_comparison import make_boxplots_parametric, rm_tukey_hsd

In [None]:
# Draw the parametric box plots for the R2 column of both dataframes of every
# endpoint.
for endpoint, df_list in df_dict.items():
    print(f'\n{endpoint}\n')

    headings_by_position = ((0, 'Homogenous sources'), (1, 'Divergent source'))
    for position, heading in headings_by_position:
        print(heading)
        # The helper receives a copy rather than the stored dataframe.
        make_boxplots_parametric(df_list[position].copy(), ['r2'])

### Tukey HSD Test

In [None]:
# Print the first element returned by rm_tukey_hsd (R2 compared across
# `method`) for both dataframes of every endpoint.
for endpoint, df_list in df_dict.items():
    print(f'\n{endpoint}\n')

    sections = {'Homogenous sources': 0, 'Divergent source': 1}
    for heading, position in sections.items():
        print(heading)
        tukey_output = rm_tukey_hsd(df_list[position], "r2", "method")
        print(tukey_output[0])

In [None]:
# half_life, homogenous dataframe (index 0): show the first element returned
# by rm_tukey_hsd.
half_life_frames = df_dict['half_life']
rm_tukey_hsd(half_life_frames[0], "r2", "method")[0]

In [None]:
# half_life, divergent dataframe (index 1): show the first element returned
# by rm_tukey_hsd.
half_life_frames = df_dict['half_life']
rm_tukey_hsd(half_life_frames[1], "r2", "method")[0]

In [None]:
# clearance, homogenous dataframe (index 0): show the first element returned
# by rm_tukey_hsd.
clearance_frames = df_dict['clearance']
rm_tukey_hsd(clearance_frames[0], "r2", "method")[0]

In [None]:
# clearance, divergent dataframe (index 1): show the first element returned
# by rm_tukey_hsd.
clearance_frames = df_dict['clearance']
rm_tukey_hsd(clearance_frames[1], "r2", "method")[0]