This notebook is used to conduct pairwise statistical tests on prediction performance metrics.

In [1]:
import pandas as pd

from scipy import stats

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Pairwise statistical tests on 30-fold prediction performance metrics

## Benchmark datasets

In [3]:
# --- Benchmark run configuration -------------------------------------------
# `folder` and `task_setting` are substituted into the input/output CSV names.
folder = 'benchmark'
task_setting = 'benchmark'

# Number of folds per (dataset, split type) combination.
num_folds = 30

# Split strategies whose results are compared.
split_types = [
    'scaffold',
    'random',
]

# Molecular property datasets to analyse.
mol_props = [
    'BACE',
    'BBBP',
    'HIV',
    'ESOL',
    'FreeSolv',
    'Lipop',
]

# Models whose per-fold scores are compared against each other.
model_names = [
    'RF',
    'molbert',
    'grover_base',
    'grover_base_rdkit',
]

-  comparison between model pairs

In [4]:
# Start from an empty results table: one row per
# (model pair, metric, dataset, split type, statistical test).
pairwise_columns = [
    'model_1',
    'model_2',
    'metric_name',
    'mol_prop',
    'split_type',
    'task',
    'p_value',
    'stats_method',
]
stats_df = pd.DataFrame(columns=pairwise_columns)

In [5]:
# Read the saved per-fold performance table (grand_perf_df).
grand_perf_df = pd.read_csv('../results/processed_performance/{folder}_grand_perf_df_{task}.csv'.format(folder=folder, task=task_setting))

# Result rows are accumulated in a plain list and turned into a DataFrame once
# at the end. Growing the frame with DataFrame.append() inside the loop is
# quadratic, and the method no longer exists in pandas >= 2.0.
records = []

for split_type in split_types:
    for mol_prop in mol_props:
        if mol_prop in ['BACE', 'BBBP', 'HIV']:
            # classification datasets
            metric_names = ["AUROC", "AUPRC", "Precision_PPV", "Precision_NPV"]
        elif mol_prop in ['ESOL', 'FreeSolv', 'Lipop']:
            # regression datasets
            metric_names = ['RMSE', 'R2', 'Pearson_R', 'MAE']
        else:
            # Fail loudly instead of silently reusing the metrics of the
            # previous dataset (or hitting a NameError on the first one).
            raise ValueError('no metric list defined for mol_prop {!r}'.format(mol_prop))

        # loop through all metrics
        for metric_name in metric_names:
            # select the rows for this split type / dataset / metric
            perf_df = grand_perf_df.loc[(grand_perf_df['split_type'] == split_type) &
                                        (grand_perf_df['mol_prop'] == mol_prop) &
                                        (grand_perf_df['metric_name'] == metric_name)]

            # Extract every model's scores once instead of once per pair.
            # NOTE(review): the paired tests below pair the scores by row order;
            # presumably rows are stored in the same fold order for every
            # model -- confirm against the file that is loaded above.
            scores_by_model = {
                name: perf_df.loc[perf_df['model_name'] == name, 'metric_score'].tolist()
                for name in model_names
            }

            # Every ordered pair is tested, including a model against itself
            # and both (A, B) and (B, A), exactly as before.
            for model_1 in model_names:
                for model_2 in model_names:
                    x1 = scores_by_model[model_1]
                    x2 = scores_by_model[model_2]

                    # paired t test
                    p_ttest_rel = stats.ttest_rel(x1, x2, alternative='two-sided').pvalue

                    # Wilcoxon signed-rank test; a ValueError from SciPy is
                    # recorded as "no difference" (p = 1).
                    # NOTE(review): presumably this guards the case where all
                    # paired differences are zero (e.g. a model vs itself) --
                    # it also masks any other ValueError raised by wilcoxon.
                    try:
                        p_wilcoxon = stats.wilcoxon(x1, x2, alternative='two-sided').pvalue
                    except ValueError:
                        p_wilcoxon = 1

                    # independent t test
                    p_ttest_ind = stats.ttest_ind(x1, x2, alternative='two-sided').pvalue

                    # Wilcoxon rank-sum test
                    p_ranksums = stats.ranksums(x1, x2, alternative='two-sided').pvalue

                    # one result row per statistical test, in the original order
                    for stats_method, p_value in [('ttest_rel', p_ttest_rel),
                                                  ('wilcoxon', p_wilcoxon),
                                                  ('ttest_ind', p_ttest_ind),
                                                  ('ranksums', p_ranksums)]:
                        records.append({'model_1': model_1, 'model_2': model_2,
                                        'metric_name': metric_name, 'mol_prop': mol_prop,
                                        'split_type': split_type, 'task': task_setting,
                                        'p_value': p_value, 'stats_method': stats_method})

# Build the results table in one go, keeping the column layout of the empty
# template created in the previous cell. Re-running this cell now replaces the
# table instead of appending duplicate rows to it.
stats_df = pd.DataFrame(records, columns=stats_df.columns)

In [6]:
# Preview the first five result rows.
stats_df.head(n=5)

Unnamed: 0,model_1,model_2,metric_name,mol_prop,split_type,task,p_value,stats_method
0,RF,RF,AUROC,BACE,scaffold,benchmark,,ttest_rel
1,RF,RF,AUROC,BACE,scaffold,benchmark,1.0,wilcoxon
2,RF,RF,AUROC,BACE,scaffold,benchmark,1.0,ttest_ind
3,RF,RF,AUROC,BACE,scaffold,benchmark,1.0,ranksums
4,RF,molbert,AUROC,BACE,scaffold,benchmark,0.0,ttest_rel


In [7]:
# Persist the pairwise p-values under results/stats (row index omitted).
pairwise_stats_csv = '../results/stats/{folder}_stats_df_{task}.csv'.format(
    folder=folder,
    task=task_setting,
)
stats_df.to_csv(pairwise_stats_csv, index=False)

## Opioids datasets

In [20]:
# --- Opioids run configuration ---------------------------------------------
# `folder` and `task_setting` are substituted into the input/output CSV names.
folder = 'opioids'

# Task variant to analyse; the cells below handle 'cutoff6' and 'reg'.
task_setting = 'reg'

# Number of folds per (dataset, split type) combination.
num_folds = 30

# Split strategies whose results are compared.
split_types = [
    'scaffold',
    'random',
]

# Molecular property datasets to analyse.
mol_props = [
    'MDR1',
    'CYP3A4',
    'CYP2D6',
    'MOR',
    'DOR',
    'KOR',
]

# Models whose per-fold scores are compared.
model_names = [
    'RF',
    'molbert',
    'grover_base',
    'grover_base_rdkit',
]

-  comparison between model pairs

In [21]:
# Start from an empty results table: one row per
# (model pair, metric, dataset, split type, statistical test).
pairwise_columns = [
    'model_1',
    'model_2',
    'metric_name',
    'mol_prop',
    'split_type',
    'task',
    'p_value',
    'stats_method',
]
stats_df = pd.DataFrame(columns=pairwise_columns)

In [22]:
# Read the saved per-fold performance table (grand_perf_df).
grand_perf_df = pd.read_csv('../results/processed_performance/{folder}_grand_perf_df_{task}.csv'.format(folder=folder, task=task_setting))

# The metric list depends only on the task setting, so pick it once up front.
if task_setting == 'cutoff6':
    metric_names = ['AUROC', 'AUPRC', 'Precision_PPV', 'Precision_NPV']
elif task_setting == 'reg':
    metric_names = ['RMSE', 'MAE', 'R2', 'Pearson_R']
else:
    # Fail loudly instead of running with an undefined or stale metric list.
    raise ValueError('unsupported task_setting {!r}'.format(task_setting))

# Result rows are accumulated in a plain list and turned into a DataFrame once
# at the end. Growing the frame with DataFrame.append() inside the loop is
# quadratic, and the method no longer exists in pandas >= 2.0.
records = []

for split_type in split_types:
    for mol_prop in mol_props:
        # loop through all metrics
        for metric_name in metric_names:
            # select the rows for this split type / dataset / metric
            perf_df = grand_perf_df.loc[(grand_perf_df['split_type'] == split_type) &
                                        (grand_perf_df['mol_prop'] == mol_prop) &
                                        (grand_perf_df['metric_name'] == metric_name)]

            # Extract every model's scores once instead of once per pair.
            # NOTE(review): the paired tests below pair the scores by row order;
            # presumably rows are stored in the same fold order for every
            # model -- confirm against the file that is loaded above.
            scores_by_model = {
                name: perf_df.loc[perf_df['model_name'] == name, 'metric_score'].tolist()
                for name in model_names
            }

            # Every ordered pair is tested, including a model against itself
            # and both (A, B) and (B, A), exactly as before.
            for model_1 in model_names:
                for model_2 in model_names:
                    x1 = scores_by_model[model_1]
                    x2 = scores_by_model[model_2]

                    # Paired t test. If SciPy rejects the inputs with a
                    # ValueError, record a missing p-value for THIS test. The
                    # previous code assigned p_wilcoxon here, so the row
                    # silently reused the p-value of the preceding pair.
                    try:
                        p_ttest_rel = stats.ttest_rel(x1, x2, alternative='two-sided').pvalue
                    except ValueError:
                        p_ttest_rel = None

                    # Wilcoxon signed-rank test; a ValueError from SciPy is
                    # recorded as "no difference" (p = 1).
                    # NOTE(review): presumably this guards the case where all
                    # paired differences are zero (e.g. a model vs itself) --
                    # it also masks any other ValueError raised by wilcoxon.
                    try:
                        p_wilcoxon = stats.wilcoxon(x1, x2, alternative='two-sided').pvalue
                    except ValueError:
                        p_wilcoxon = 1

                    # independent t test
                    p_ttest_ind = stats.ttest_ind(x1, x2, alternative='two-sided').pvalue

                    # Wilcoxon rank-sum test
                    p_ranksums = stats.ranksums(x1, x2, alternative='two-sided').pvalue

                    # one result row per statistical test, in the original order
                    for stats_method, p_value in [('ttest_rel', p_ttest_rel),
                                                  ('wilcoxon', p_wilcoxon),
                                                  ('ttest_ind', p_ttest_ind),
                                                  ('ranksums', p_ranksums)]:
                        records.append({'model_1': model_1, 'model_2': model_2,
                                        'metric_name': metric_name, 'mol_prop': mol_prop,
                                        'split_type': split_type, 'task': task_setting,
                                        'p_value': p_value, 'stats_method': stats_method})

# Build the results table in one go, keeping the column layout of the empty
# template created in the previous cell. Re-running this cell now replaces the
# table instead of appending duplicate rows to it.
stats_df = pd.DataFrame(records, columns=stats_df.columns)

In [23]:
# Persist the pairwise p-values under results/stats (row index omitted).
pairwise_stats_csv = '../results/stats/{folder}_stats_df_{task}.csv'.format(
    folder=folder,
    task=task_setting,
)
stats_df.to_csv(pairwise_stats_csv, index=False)

-  comparison between scaffold and random splits to examine inter-scaffold generalization

In [24]:
# Start from an empty results table: one row per
# (model, metric, dataset, statistical test).
interscaffold_columns = [
    'model',
    'metric_name',
    'mol_prop',
    'task',
    'p_value',
    'stats_method',
]
stats_df = pd.DataFrame(columns=interscaffold_columns)

In [25]:
# Read the saved per-fold performance table (grand_perf_df).
grand_perf_df = pd.read_csv('../results/processed_performance/{folder}_grand_perf_df_{task}.csv'.format(folder=folder, task=task_setting))

# The metric list depends only on the task setting, so pick it once up front.
if task_setting == 'cutoff6':
    metric_names = ['AUROC', 'AUPRC', 'Precision_PPV', 'Precision_NPV']
elif task_setting == 'reg':
    metric_names = ['RMSE', 'MAE', 'R2', 'Pearson_R']
else:
    # Fail loudly instead of running with an undefined or stale metric list.
    raise ValueError('unsupported task_setting {!r}'.format(task_setting))

# Result rows are accumulated in a plain list and turned into a DataFrame once
# at the end. Growing the frame with DataFrame.append() inside the loop is
# quadratic, and the method no longer exists in pandas >= 2.0.
records = []

for model in model_names:
    for mol_prop in mol_props:
        for metric_name in metric_names:
            # select the rows for this model / dataset / metric
            perf_df = grand_perf_df.loc[(grand_perf_df['model_name'] == model) &
                                        (grand_perf_df['mol_prop'] == mol_prop) &
                                        (grand_perf_df['metric_name'] == metric_name)]

            # scores under each split type
            # NOTE(review): the paired tests below pair scaffold and random
            # scores by row order -- confirm that such a pairing is meaningful
            # for folds coming from two different splitting schemes.
            x1 = perf_df.loc[perf_df['split_type'] == 'scaffold', 'metric_score'].tolist()
            x2 = perf_df.loc[perf_df['split_type'] == 'random', 'metric_score'].tolist()

            # paired t test
            p_ttest_rel = stats.ttest_rel(x1, x2, alternative='two-sided').pvalue

            # Wilcoxon signed-rank test; a ValueError from SciPy is recorded
            # as "no difference" (p = 1).
            # NOTE(review): this also masks any other ValueError raised by
            # wilcoxon, not only the degenerate all-zero-difference case.
            try:
                p_wilcoxon = stats.wilcoxon(x1, x2, alternative='two-sided').pvalue
            except ValueError:
                p_wilcoxon = 1

            # independent t test
            p_ttest_ind = stats.ttest_ind(x1, x2, alternative='two-sided').pvalue

            # Wilcoxon rank-sum test
            p_ranksums = stats.ranksums(x1, x2, alternative='two-sided').pvalue

            # one result row per statistical test, in the original order
            for stats_method, p_value in [('ttest_rel', p_ttest_rel),
                                          ('wilcoxon', p_wilcoxon),
                                          ('ttest_ind', p_ttest_ind),
                                          ('ranksums', p_ranksums)]:
                records.append({'model': model, 'metric_name': metric_name,
                                'mol_prop': mol_prop, 'task': task_setting,
                                'p_value': p_value, 'stats_method': stats_method})

# Build the results table in one go, keeping the column layout of the empty
# template created in the previous cell. Re-running this cell now replaces the
# table instead of appending duplicate rows to it.
stats_df = pd.DataFrame(records, columns=stats_df.columns)

In [26]:
# Preview the first five result rows.
stats_df.head(n=5)

Unnamed: 0,model,metric_name,mol_prop,task,p_value,stats_method
0,RF,RMSE,MDR1,reg,3.257433e-07,ttest_rel
1,RF,RMSE,MDR1,reg,5.751653e-06,wilcoxon
2,RF,RMSE,MDR1,reg,1.705933e-06,ttest_ind
3,RF,RMSE,MDR1,reg,6.974497e-06,ranksums
4,RF,MAE,MDR1,reg,1.372884e-10,ttest_rel


In [27]:
# Persist the scaffold-vs-random p-values under results/stats (row index omitted).
interscaffold_csv = '../results/stats/{folder}_stats_df_{task}_interscaffold.csv'.format(
    folder=folder,
    task=task_setting,
)
stats_df.to_csv(interscaffold_csv, index=False)

-  comparison between AC and non-AC molecules to examine intra-scaffold generalization

In [28]:
# Start from an empty results table: one row per
# (split type, metric, dataset, model, statistical test).
intrascaffold_columns = [
    'split_type',
    'metric_name',
    'mol_prop',
    'model_name',
    'task',
    'p_value',
    'stats_method',
]
stats_df = pd.DataFrame(columns=intrascaffold_columns)

In [29]:
# Read the saved per-fold performance table that carries the AC labels.
grand_perf_df = pd.read_csv('../results/processed_performance/AC_{folder}_grand_perf_df_{task}.csv'.format(folder=folder, task=task_setting))

# The metric list depends only on the task setting, so pick it once up front.
if task_setting == 'cutoff6':
    metric_names = ['AUROC', 'AUPRC', 'Precision_PPV', 'Precision_NPV']
elif task_setting == 'reg':
    metric_names = ['RMSE', 'MAE', 'R2', 'Pearson_R']
else:
    # Fail loudly instead of running with an undefined or stale metric list.
    raise ValueError('unsupported task_setting {!r}'.format(task_setting))

# Result rows are accumulated in a plain list and turned into a DataFrame once
# at the end. Growing the frame with DataFrame.append() inside the loop is
# quadratic, and the method no longer exists in pandas >= 2.0.
records = []

for split_type in split_types:
    for mol_prop in mol_props:
        for metric_name in metric_names:
            for model_name in model_names:
                # select the rows for this split type / dataset / metric / model
                perf_df = grand_perf_df.loc[(grand_perf_df['split_type'] == split_type) &
                                            (grand_perf_df['mol_prop'] == mol_prop) &
                                            (grand_perf_df['metric_name'] == metric_name) &
                                            (grand_perf_df['model_name'] == model_name)]

                # scores on the 'AC' rows versus the 'non-AC' rows
                # NOTE(review): the paired tests below pair the two groups by
                # row order; presumably both are stored in the same fold
                # order -- confirm against the file that is loaded above.
                x1 = perf_df.loc[perf_df['AC_label'] == 'AC', 'metric_score'].tolist()
                x2 = perf_df.loc[perf_df['AC_label'] == 'non-AC', 'metric_score'].tolist()

                # Paired t test. If SciPy rejects the inputs with a ValueError,
                # record a missing p-value for THIS test. The previous code
                # assigned p_wilcoxon here, so the row silently reused the
                # p-value of the preceding combination.
                try:
                    p_ttest_rel = stats.ttest_rel(x1, x2, alternative='two-sided').pvalue
                except ValueError:
                    p_ttest_rel = None

                # Wilcoxon signed-rank test; a ValueError from SciPy is
                # recorded as "no difference" (p = 1).
                # NOTE(review): this also masks any other ValueError raised by
                # wilcoxon, not only the degenerate all-zero-difference case.
                try:
                    p_wilcoxon = stats.wilcoxon(x1, x2, alternative='two-sided').pvalue
                except ValueError:
                    p_wilcoxon = 1

                # independent t test
                p_ttest_ind = stats.ttest_ind(x1, x2, alternative='two-sided').pvalue

                # Wilcoxon rank-sum test
                p_ranksums = stats.ranksums(x1, x2, alternative='two-sided').pvalue

                # one result row per statistical test, in the original order
                for stats_method, p_value in [('ttest_rel', p_ttest_rel),
                                              ('wilcoxon', p_wilcoxon),
                                              ('ttest_ind', p_ttest_ind),
                                              ('ranksums', p_ranksums)]:
                    records.append({'split_type': split_type, 'metric_name': metric_name,
                                    'mol_prop': mol_prop, 'model_name': model_name,
                                    'task': task_setting, 'p_value': p_value,
                                    'stats_method': stats_method})

# Build the results table in one go, keeping the column layout of the empty
# template created in the previous cell. Re-running this cell now replaces the
# table instead of appending duplicate rows to it.
stats_df = pd.DataFrame(records, columns=stats_df.columns)

In [30]:
# Preview the first rows for the RF model on the MOR dataset.
stats_df[stats_df['model_name'].eq('RF') & stats_df['mol_prop'].eq('MOR')].head()

Unnamed: 0,split_type,metric_name,mol_prop,model_name,task,p_value,stats_method
192,scaffold,RMSE,MOR,RF,reg,0.007164,ttest_rel
193,scaffold,RMSE,MOR,RF,reg,0.011748,wilcoxon
194,scaffold,RMSE,MOR,RF,reg,0.013259,ttest_ind
195,scaffold,RMSE,MOR,RF,reg,0.049261,ranksums
208,scaffold,MAE,MOR,RF,reg,0.013186,ttest_rel


In [31]:
# Persist the AC-vs-non-AC p-values under results/stats (row index omitted).
intrascaffold_csv = '../results/stats/AC_{folder}_stats_df_{task}_intrascaffold.csv'.format(
    folder=folder,
    task=task_setting,
)
stats_df.to_csv(intrascaffold_csv, index=False)