_____________________________________________________________________________________________________________
AUC curves for preselected predictors
_____________________________________________________________________________________________________________

In [None]:
import pandas as pd

In [65]:
# Number of repeated runs whose AUC tables are stored on disk (files are numbered 0 .. N_runs - 1).
N_runs = 3

# For every outcome (cvd / ncvd) and data split (training / validation / test), read the
# per-run Excel files and stack them into one frame with a fresh 0..n-1 index.
# Each frame is built with a single pd.concat call: growing a frame inside the loop re-copies
# all previously loaded rows on every iteration and relies on concatenating an empty DataFrame,
# which recent pandas versions warn about.
# NOTE: pd.concat raises ValueError on an empty list, so N_runs must be at least 1.
auc_cvd_training_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/cvd_auc_training_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_cvd_validation_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/cvd_auc_validation_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_cvd_test_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/cvd_auc_test_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

# -----------------------------------------------------------------------------------------------------------------------
auc_ncvd_training_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/ncvd_auc_training_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_ncvd_validation_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/ncvd_auc_validation_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_ncvd_test_selected_predictors = pd.concat(
    [pd.read_excel("auc_7_predictors/ncvd_auc_test_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

In [66]:
# -----------------------------------------
# Estimate a confidence interval
# -----------------------------------------
import scipy
import scipy.stats
import numpy as np

def CI_estimation(sample, confidence_level = 0.95):
    """Two-sided Student-t confidence interval for the mean of a sample.

    Parameters
    ----------
    sample : array-like
        Observations (list, NumPy array, pandas Series, ...). The values are
        converted to a flat float array before any statistic is computed.
    confidence_level : float, default 0.95
        Coverage of the interval, passed to ``scipy.stats.t.interval``.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds of the interval.
        * Fewer than two observations: ``(nan, nan)``, because the standard
          error is undefined.
        * Standard error exactly zero (all observations equal): the interval
          collapses to ``(mean, mean)``.
    """
    values = np.asarray(sample, dtype=float).ravel()

    # With 0 or 1 observations there are no degrees of freedom left; return NaN
    # explicitly instead of letting numpy/scipy emit warnings on the way to NaN.
    if values.size < 2:
        return (np.nan, np.nan)

    degrees_freedom = values.size - 1
    sample_mean = np.mean(values)
    sample_standard_error = scipy.stats.sem(values)

    # scipy rejects scale == 0 and would answer (nan, nan); a sample without any
    # spread has a degenerate interval located at its mean.
    if sample_standard_error == 0:
        return (sample_mean, sample_mean)

    return scipy.stats.t.interval(confidence_level, degrees_freedom,
                                  loc=sample_mean, scale=sample_standard_error)

In [67]:
# -----------------------------------------
# Estimate AUC mean and confidence interval for different models and times
# -----------------------------------------

def AUC_statistics(data):
    """Summarise AUC values for every (model, time) combination.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``model``, ``times`` and ``AUC``.

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``time``, ``auc_mean``, ``95%CI lower`` and
        ``95%CI upper``. There is one row for each pairing of a unique model
        with a unique time, ordered by model first (order of first appearance
        in ``data``) and by time second. A pairing that has no rows in
        ``data`` is still reported, with whatever ``np.mean`` and
        ``CI_estimation`` return for an empty selection.
    """
    columns = ["model", "time", "auc_mean", "95%CI lower", "95%CI upper"]

    # Collect plain records and build the frame once at the end. Concatenating
    # onto an initially empty, object-typed frame inside the loop copies all
    # earlier rows on every iteration and leaves the column dtypes at the mercy
    # of pandas' (deprecated) special handling of empty frames in concat.
    records = []
    for m in data.model.unique():
        model_mask = data.loc[:, 'model'] == m
        for t in data.times.unique():
            auc_values = data.loc[model_mask & (data.loc[:, 'times'] == t), 'AUC']

            ci_lower, ci_upper = CI_estimation(auc_values)

            records.append({"model": m,
                            "time": t,
                            "auc_mean": np.mean(auc_values),
                            "95%CI lower": ci_lower,
                            "95%CI upper": ci_upper})

    # Passing `columns` keeps the expected header even when `data` has no rows.
    return pd.DataFrame(records, columns=columns)

In [68]:
# Per-(model, time) AUC summaries for the CVD outcome: training, validation, test.
(auc_stats_cvd_training,
 auc_stats_cvd_validation,
 auc_stats_cvd_test) = [
    AUC_statistics(auc_runs)
    for auc_runs in (auc_cvd_training_selected_predictors,
                     auc_cvd_validation_selected_predictors,
                     auc_cvd_test_selected_predictors)
]

# Same three summaries for the non-CVD outcome.
(auc_stats_ncvd_training,
 auc_stats_ncvd_validation,
 auc_stats_ncvd_test) = [
    AUC_statistics(auc_runs)
    for auc_runs in (auc_ncvd_training_selected_predictors,
                     auc_ncvd_validation_selected_predictors,
                     auc_ncvd_test_selected_predictors)
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# One entry per panel, in subplot order (left to right, top to bottom):
# (statistics frame, outcome named in the title, data split named in the title, y-axis limits).
# A y-limit of None keeps matplotlib's autoscaling (used for the two training panels).
panels = [
    (auc_stats_cvd_training,    "CVD",     "Training",   None),
    (auc_stats_ncvd_training,   "non-CVD", "Training",   None),
    (auc_stats_cvd_validation,  "CVD",     "Validation", (0.60, 0.75)),
    (auc_stats_ncvd_validation, "non-CVD", "Validation", (0.60, 0.75)),
    (auc_stats_cvd_test,        "CVD",     "Test",       (0.60, 0.75)),
    (auc_stats_ncvd_test,       "non-CVD", "Test",       (0.60, 0.75)),
]

# The figure carries a single legend (on the last panel), so a model must get the same
# colour and dash pattern in every panel. Use one alphabetical model order for all panels
# instead of re-sorting only the last panel's frame in place: that gave the legend a
# mapping the other five panels did not necessarily share, mutated the statistics frame,
# and depended on a hard-coded row count.
model_order = sorted(set().union(*(stats["model"].unique() for stats, _, _, _ in panels)))

fig, axes = plt.subplots(3, 2, figsize=(14, 20))
last_panel = len(panels) - 1

for position, (ax, (stats, outcome, split, ylim)) in enumerate(zip(axes.flat, panels)):
    # Only the last panel keeps seaborn's default legend; all others suppress it.
    legend_kwargs = {} if position == last_panel else {"legend": False}

    sns.lineplot(x='time', y='auc_mean', data=stats, hue='model', style='model',
                 hue_order=model_order, style_order=model_order,
                 dashes=[(1, 0), (1, 1)] * 9,  # alternate solid / dotted, enough for 18 models
                 ax=ax, **legend_kwargs)

    ax.set_title("Time-dependent AUC(t) when predicting the {} risk\n{} data".format(outcome, split),
                 fontsize=12, y=0.9)
    ax.set_xlabel("Time, years")
    ax.set_ylabel("AUC")
    ax.set_xlim((5, 30))
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=1.5, color='white')

# Shared legend, placed below the bottom-right panel.
axes.flat[-1].legend(bbox_to_anchor=(1, -0.15), loc=1, borderaxespad=0., ncol=3)

fig.savefig("AUC_preselected_predictors.png", bbox_inches='tight')

_____________________________________________________________________________________________________________
AUC curves for many predictors
_____________________________________________________________________________________________________________

In [26]:
# Number of repeated runs whose AUC tables are stored on disk (files are numbered 0 .. N_runs - 1).
N_runs = 50

# NOTE: the variable names are shared with the preselected-predictor section above;
# from here on they hold the results read from "auc_many_predictors".
# For every outcome (cvd / ncvd) and data split (training / validation / test), read the
# per-run Excel files and stack them into one frame with a fresh 0..n-1 index.
# Each frame is built with a single pd.concat call: growing a frame inside the loop re-copies
# all previously loaded rows on every iteration and relies on concatenating an empty DataFrame,
# which recent pandas versions warn about.
# NOTE: pd.concat raises ValueError on an empty list, so N_runs must be at least 1.
auc_cvd_training_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/cvd_auc_training_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_cvd_validation_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/cvd_auc_validation_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_cvd_test_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/cvd_auc_test_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

# -----------------------------------------------------------------------------------------------------------------------
auc_ncvd_training_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/ncvd_auc_training_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_ncvd_validation_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/ncvd_auc_validation_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

auc_ncvd_test_selected_predictors = pd.concat(
    [pd.read_excel("auc_many_predictors/ncvd_auc_test_{}.xlsx".format(i)) for i in range(N_runs)],
    ignore_index=True)

In [70]:
# Per-(model, time) AUC summaries for the CVD outcome: training, validation, test.
(auc_stats_cvd_training,
 auc_stats_cvd_validation,
 auc_stats_cvd_test) = [
    AUC_statistics(auc_runs)
    for auc_runs in (auc_cvd_training_selected_predictors,
                     auc_cvd_validation_selected_predictors,
                     auc_cvd_test_selected_predictors)
]

# Same three summaries for the non-CVD outcome.
(auc_stats_ncvd_training,
 auc_stats_ncvd_validation,
 auc_stats_ncvd_test) = [
    AUC_statistics(auc_runs)
    for auc_runs in (auc_ncvd_training_selected_predictors,
                     auc_ncvd_validation_selected_predictors,
                     auc_ncvd_test_selected_predictors)
]

In [None]:
plt.subplots(3, 2, figsize=(14, 20))

# One entry per panel in subplot order: (statistics frame, panel title, y-axis limits).
panel_specs = (
    # Training data
    (auc_stats_cvd_training,
     "Time-dependent AUC(t) when predicting the CVD risk\nTraining data", (0.60, 1.05)),
    (auc_stats_ncvd_training,
     "Time-dependent AUC(t) when predicting the non-CVD risk\nTraining data", (0.60, 1.05)),
    # Validation data
    (auc_stats_cvd_validation,
     "Time-dependent AUC(t) when predicting the CVD risk\nValidation data", (0.60, 0.85)),
    (auc_stats_ncvd_validation,
     "Time-dependent AUC(t) when predicting the non-CVD risk\nValidation data", (0.60, 0.85)),
    # Test data
    (auc_stats_cvd_test,
     "Time-dependent AUC(t) when predicting the CVD risk\nTest data", (0.60, 0.85)),
    (auc_stats_ncvd_test,
     "Time-dependent AUC(t) when predicting the non-CVD risk\nTest data", (0.60, 0.85)),
)
legend_panel = len(panel_specs)  # only the last panel draws a legend

for panel_number, (stats, panel_title, y_range) in enumerate(panel_specs, start=1):
    # Activate the panel created by plt.subplots above and draw on it.
    ax = plt.subplot(3, 2, panel_number)

    if panel_number == legend_panel:
        sns.lineplot(x='time', y='auc_mean', data=stats, hue='model', style='model', ax=ax)
    else:
        sns.lineplot(x='time', y='auc_mean', data=stats, hue='model', style='model',
                     legend=False, ax=ax)

    ax.set_title(panel_title, fontsize=12, y=0.9)
    ax.set_xlabel("Time, years")
    ax.set_ylabel("AUC")
    ax.set_xlim((5, 30))
    ax.set_ylim(y_range)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth='1.5', color='white')

# `ax` is still the last (bottom-right) panel: place its legend below the figure.
ax.legend(bbox_to_anchor=(0.65, -0.15), loc=1, borderaxespad=0., ncol=3)

plt.savefig("AUC_many_predictors.png", bbox_inches='tight')