In [None]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, auc
from scipy.stats import rankdata

In [None]:
# Project root: two directory levels above the current working directory.
_cwd = Path.cwd().absolute()
PATH_ROOT = _cwd.parent.parent
print(PATH_ROOT)

In [None]:
# Classifier names (as they appear in the result paths) -> short labels.
_CLF_TO_ALIAS = {
    'KNeighborsClassifier': 'KNN',
    'NNClassifier': 'NN',
    'RandomForestClassifier': 'RF',
    'SVC': 'SVM',
}
CLASSIFIERS = list(_CLF_TO_ALIAS)
CLF_alias = list(_CLF_TO_ALIAS.values())

# AD method names (as they appear in the result paths) -> short labels.
# Keeping each pair on one line guarantees the two lists stay aligned.
_AD_TO_ALIAS = {
    'DAIndexDelta': 'DM-δ',
    'DAIndexGamma': 'DM-γ',
    'DAIndexKappa': 'DM-κ',
    'PCABoundingBox': 'BB',
    'ProbabilityClassifier': 'Prob.',
    'Magnet': 'Magnet',
    'SklearnFeatureSqueezing': 'FS',
    'SklearnRegionBasedClassifier': 'RC',
}
AD = list(_AD_TO_ALIAS)
AD_alias = list(_AD_TO_ALIAS.values())

# Subset of method labels.
AL = ['Magnet', 'FS', 'RC']

DATASETS = [
    'Ames',
    'BBBP',
    'Cancer',
    'CYP1A2',
    'hERG',
    'HIV',
    'Liver',
]

In [None]:
def get_data_path(model, ad, dataset, suffix='roc'):
    """Build the path of a result CSV file.

    The returned string has the form
    ``<PATH_ROOT>/results/<model>_<ad>/<dataset>_<suffix>.csv``. With the
    default suffix ``'roc'`` it points at the ROC results. The path is only
    assembled here; nothing checks that the file exists.
    """
    folder_name = f'{model}_{ad}'
    file_name = f'{dataset}_{suffix}.csv'
    return os.path.join(PATH_ROOT, 'results', folder_name, file_name)

# Smoke test: load the ROC table of the first classifier / AD method / dataset.
df_test = pd.read_csv(
    get_data_path(model=CLASSIFIERS[0], ad=AD[0], dataset=DATASETS[0])
)
df_test.head()

In [None]:
def get_mean_tpr(clf, ad, dataset):
    """Average the per-fold ROC curves of one classifier / AD method / dataset.

    The ROC table is read from the CSV located by `get_data_path`. For each of
    the folds 1..5 the columns ``cv<i>_fpr`` / ``cv<i>_tpr`` are interpolated
    onto a common grid of 100 FPR values in [0, 1]; the interpolated curves are
    then averaged point-wise.

    Returns
    -------
    pandas.Series
        The mean TPR at each of the 100 grid points. The first point of every
        fold curve is forced to 0 and the last point of the mean to 1.
    """
    roc_table = pd.read_csv(get_data_path(clf, ad, dataset))
    fpr_grid = np.linspace(0, 1, 100)

    fold_curves = []
    # Fold columns are numbered from 1 to 5 in the saved results.
    for fpr_col, tpr_col in ((f'cv{k}_fpr', f'cv{k}_tpr') for k in range(1, 6)):
        # Missing values in the fold columns are dropped before interpolation.
        xs = roc_table[fpr_col].dropna().to_numpy()
        ys = roc_table[tpr_col].dropna().to_numpy()
        curve = np.interp(fpr_grid, xs, ys)
        curve[0] = 0.
        fold_curves.append(curve)

    averaged = np.mean(fold_curves, axis=0)
    averaged[-1] = 1.0
    return pd.Series(averaged)

# Smoke test: mean TPR curve of the first classifier / AD method / dataset.
res_test = get_mean_tpr(clf=CLASSIFIERS[0], ad=AD[0], dataset=DATASETS[0])
print(res_test)

In [None]:
def get_df_roc(clf, dataset):
    """Collect the mean ROC curves of all AD methods for one classifier and dataset.

    Returns
    -------
    pandas.DataFrame
        A frame whose first column ``'fpr'`` holds 100 evenly spaced values in
        [0, 1], followed by one column per entry of ``AD_alias`` containing
        the mean TPR returned by `get_mean_tpr` for the matching ``AD`` method.
    """
    fpr_frame = pd.DataFrame({'fpr': np.linspace(0, 1, 100)})
    tpr_frames = [
        pd.DataFrame({label: get_mean_tpr(clf, method, dataset)})
        for method, label in zip(AD, AD_alias)
    ]
    # Place the FPR grid and the per-method TPR columns side by side.
    return pd.concat([fpr_frame, *tpr_frames], axis=1)

# Smoke test: ROC table of all AD methods for the first classifier and dataset.
df_test = get_df_roc(clf=CLASSIFIERS[0], dataset=DATASETS[0])
df_test.head()

In [None]:
def get_auc(clf, clf_alias, dataset):
    """Compute the AUC of each AD method's mean ROC curve and rank the methods.

    Parameters
    ----------
    clf : str
        Classifier name, forwarded to `get_df_roc`.
    clf_alias : str
        Label written to the 'Classifier' column of the result.
    dataset : str
        Dataset name, forwarded to `get_df_roc` and written to 'Dataset'.

    Returns
    -------
    pandas.DataFrame
        One row per method column of the ROC table, with the columns
        'Dataset', 'Classifier', 'Method', 'AUC' and 'Rank'.
        Rank 1 is the method with the largest AUC.
    """
    df_roc = get_df_roc(clf, dataset)

    # Collect all rows first and build the frame once. Growing an empty frame
    # row by row is slow and leaves the column dtypes to enlargement inference.
    records = [
        {
            'Dataset': dataset,
            'Classifier': clf_alias,
            'Method': method,
            'AUC': auc(df_roc['fpr'], df_roc[method]),
        }
        # The first column is 'fpr'; every remaining column is one method.
        for method in df_roc.columns[1:]
    ]
    result = pd.DataFrame(records, columns=['Dataset', 'Classifier', 'Method', 'AUC'])
    # Keep AUC numeric (also when there are no rows) for ranking and plotting.
    result['AUC'] = result['AUC'].astype(float)

    # Largest one should be #1.
    # Use 'min' for the tie, e.g, [1, 2, 2, 4]
    result['Rank'] = rankdata(-result['AUC'], method='min')
    return result


# Smoke test: AUC table for the first classifier and dataset.
df_test = get_auc(clf=CLASSIFIERS[0], clf_alias=CLF_alias[0], dataset=DATASETS[0])
df_test


In [None]:
# Build the AUC table for every dataset / classifier combination.
# The per-combination frames are collected and concatenated once, instead of
# re-copying a growing frame on every iteration.
auc_frames = []
for dataset in DATASETS:
    for clf, alias in zip(CLASSIFIERS, CLF_alias):
        auc_frames.append(get_auc(clf, alias, dataset))
# `pd.concat` rejects an empty list, so fall back to an empty frame.
df_auc = pd.concat(auc_frames, ignore_index=True) if auc_frames else pd.DataFrame()

path_output = os.path.join(PATH_ROOT, 'plot_results', 'roc', 'auc.csv')
# Create the target folder if needed so that `to_csv` does not fail.
os.makedirs(os.path.dirname(path_output), exist_ok=True)
print('Save to:', path_output)
df_auc.to_csv(path_output, index=False)

df_auc

In [None]:
def create_auc_heatmap(df, classifier, output=None, fontsize=12, figsize=(5, 5), show_title=False):
    """Plot a Dataset x Method heatmap of AUC values for one classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Classifier', 'Dataset', 'Method'
        and 'AUC'.
    classifier : str
        Value of the 'Classifier' column selecting the rows to plot.
    output : str or path-like, optional
        If given, the figure is saved to this path (300 dpi); otherwise it is
        shown with ``plt.show()``.
    fontsize : int
        Base font size; axis labels use ``fontsize + 1`` and the title
        ``fontsize + 4``.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    show_title : bool
        Whether to add a ``'<classifier> AUC'`` title.
    """
    df_selected = df[df['Classifier'] == classifier]
    # Keyword arguments: positional index/columns/values are rejected by pandas 2.x.
    heatmap_data = df_selected.pivot(index='Dataset', columns='Method', values='AUC')

    # `pivot` returns the method columns in sorted order. Restore the AD_alias
    # order and let seaborn take the tick labels from the column names, so
    # every column is drawn under its own method label.
    known_methods = [m for m in AD_alias if m in heatmap_data.columns]
    other_methods = [m for m in heatmap_data.columns if m not in AD_alias]
    # Cast to float so seaborn receives numeric data even if AUC was stored as object.
    heatmap_data = heatmap_data[known_methods + other_methods].astype(float)

    # Note: this changes matplotlib's global font size, not only this figure's.
    plt.rc('font', size=fontsize)
    plt.figure(figsize=figsize)
    sns.heatmap(heatmap_data,
                cmap='YlGn',
                annot=True,
                fmt='.2f',
                cbar=False)
    plt.xlabel('Method', fontsize=fontsize+1)
    plt.ylabel('Dataset', fontsize=fontsize+1)
    if show_title:
        plt.title(f'{classifier} AUC', fontsize=fontsize+4)
    plt.tight_layout()
    if output:
        plt.savefig(output, dpi=300)
    else:
        plt.show()

# Smoke test: show the heatmap of the 'NN' classifier with a title.
create_auc_heatmap(df_auc, classifier='NN', show_title=True)

In [None]:
# Save one AUC heatmap (PDF) per classifier label.
dir_heatmaps = os.path.join(PATH_ROOT, 'plot_results', 'roc')
for clf in CLF_alias:
    path_output = os.path.join(dir_heatmaps, f'{clf}_auc_heatmap.pdf')
    print('Save to:', path_output)
    create_auc_heatmap(df_auc, clf, output=path_output)