In [829]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from fancyimpute import KNN

In [830]:
# Read all samples from the file
# The file name is relative, so the workbook is looked up in the current working directory.
data_kihd = pd.read_excel('kihd_may_2019.xlsx')

In [831]:
# -----------------------------------------
# Keep the original data in 'data_kihd' and work on its copy 'data_kihd_preprocessed' instead
# -----------------------------------------
data_kihd_preprocessed = data_kihd.copy()

# Drop two variables (dates of visits)
data_kihd_preprocessed = data_kihd_preprocessed.drop(['tpvm2', 'tpnr2'], axis = 1)

# Correct the zero-level for the following variables:
wrong_zero = ['v0563', 'v0565', 'v0567', 'v0569', 'v0571', 'v0573', 'v0575', 'v0577', 'v0579', 'v0607', 'v0609', 'v0613',
              'v0621', 'v0623', 'v0625', 'v0627', 'v0629', 'v0631', 'v0633', 'v0635', 'v0637', 'v0639', 'v0643', 'v0645',
              'v0647', 'v0649', 'v0651', 'v0653']
# Zeros are at the wrong end of the scale: move them to (max+1).
# The column maximum is computed once per column (it used to be recomputed for every element);
# missing values are left untouched because NaN == 0.0 is False.
for col in wrong_zero:
    col_values = data_kihd_preprocessed[col]
    data_kihd_preprocessed[col] = col_values.mask(col_values == 0.0, col_values.max() + 1)

print(data_kihd_preprocessed.shape)
# -----------------------------------------
# Turn the categorical variables into dummies
# -----------------------------------------
categorical_variables=['au0136','au0153','ek0115','ek0119','ka0118','mi0205','mi0207','mi0208','mi0209',
                       'mi0210','mi0211','mi0212','mi0213','mi0214','v0145','v0146','v0157','v0158','v0161',
                       'v0172','v0247','v0248','v0665','v0721','v0724','u1307']

for col in categorical_variables:
    # Variables that are absent from the data set are silently skipped
    if col not in data_kihd_preprocessed.columns:
        continue
    # Missing values get no indicator column of their own (dummy_na=False)
    new_dummies = pd.get_dummies(data_kihd_preprocessed[col], dummy_na=False)
    # Name each indicator '<variable>_<level>'
    new_dummies.columns = [col + "_" + str(level) for level in new_dummies.columns.values]
    # Replace the original variable by its indicator columns (appended at the end)
    data_kihd_preprocessed = data_kihd_preprocessed.drop(col, axis=1).join(new_dummies)

# Subject identifier and outcome variables; defined once and used both to select
# the outcomes and to exclude them from the predictors
outcome_columns = ['tutknro', 'chdb16', 'chdb16d', 'amif16', 'amif16d', 'amig16', 'amig16d',
                   'amim16', 'amim16d', 'all16', 'all16d', 'cvd16', 'cvd16d', 'syd14', 'alzm14', 'vp14', 'kol14', 'diab14',
                   'ncvd16', 'ncvd16d', 'cv15', 'cv15d', 'all15', 'stro15', 'cvd15', 'chd15', 'amib15', 'isth15', 'hsth15',
                   'db15', 'can15', 'canc15', 'ast15', 'copd15', 'dema15', 'finchd18', 'finchd18d']

# Outcomes
kihd_outcomes = data_kihd_preprocessed.loc[:, outcome_columns]

# Predictors
kihd_predictors = data_kihd_preprocessed.drop(outcome_columns, axis = 1)

# Separate genes and phenotypes: the gene variables are taken to be the contiguous
# run of columns from 'feder2hh' to 'cataett' (both inclusive)
predictor_names = list(kihd_predictors.columns.values)
genes_start = predictor_names.index('FEDER2HH'.lower())
genes_end = predictor_names.index('CATAETT'.lower())

# Genes only
kihd_genes = kihd_predictors.iloc[:, genes_start:(genes_end+1)].copy()
print("KIHD with genes only: {}".format(kihd_genes.shape))

# Phenotypes
kihd_phenotypes = kihd_predictors.drop(kihd_predictors.columns.values[genes_start:genes_end+1], axis = 1)
print("KIHD with phenotypes only: {}".format(kihd_phenotypes.shape))

(2682, 992)
KIHD with genes only: (2682, 96)
KIHD with phenotypes only: (2682, 977)


In [832]:
# -----------------------------------------
# Drop sparse predictors first, then sparse subjects, judged on 'kihd_phenotypes'
# -----------------------------------------
SEPARATOR = "-------------------------------------"

# A column survives if it has at least this many non-missing values
# (i.e. no more than 5% of its entries are missing)
n_subjects = kihd_phenotypes.shape[0]
threshold_columns = n_subjects - round(0.05 * n_subjects)
kihd_phenotypes = kihd_phenotypes.dropna(thresh=threshold_columns, axis=1)

print("Filter out predictors with more than 5% of missing values ...")
print("Dataset size: {}x{}".format(*kihd_phenotypes.shape))
print(SEPARATOR)

# A row survives if it has at least this many non-missing values;
# the count is taken over the columns that remain after the previous step
n_predictors = kihd_phenotypes.shape[1]
threshold_rows = n_predictors - round(0.05 * n_predictors)
kihd_phenotypes = kihd_phenotypes.dropna(thresh=threshold_rows, axis=0)

print("Filter out rows with more than 5% of missing values ...")
print("Dataset size: {}x{}".format(*kihd_phenotypes.shape))
print(SEPARATOR)

# Keep the same subjects in genes and outcomes, then renumber all three frames from 0
kept_subjects = kihd_phenotypes.index
kihd_genes = kihd_genes.loc[kept_subjects, :].reset_index(drop=True)
kihd_outcomes = kihd_outcomes.loc[kept_subjects, :].reset_index(drop=True)
kihd_phenotypes = kihd_phenotypes.reset_index(drop=True)

Filter out predictors with more than 5% of missing values ...
Dataset size: 2682x746
-------------------------------------
Filter out rows with more than 5% of missing values ...
Dataset size: 2623x746
-------------------------------------


In [834]:
# -----------------------------------------
# Impute the missing values of 'kihd_phenotypes' with a 1-nearest-neighbour search
# -----------------------------------------

# Distances are only comparable across predictors once every column lives on [0, 1]
phenotype_names = kihd_phenotypes.columns.values
scaler = MinMaxScaler()
kihd_phenotypes_scaled = scaler.fit_transform(kihd_phenotypes)

# Fill the gaps on the scaled data
kihd_phenotypes_scaled_filled_knn = KNN(k=1).fit_transform(kihd_phenotypes_scaled)

# Map the completed matrix back to the original ranges and restore the column names
kihd_phenotypes = pd.DataFrame(
    scaler.inverse_transform(kihd_phenotypes_scaled_filled_knn),
    columns=phenotype_names,
)

Imputing row 1/2623 with 11 missing, elapsed time: 47.254
Imputing row 101/2623 with 29 missing, elapsed time: 47.274
Imputing row 201/2623 with 0 missing, elapsed time: 47.283
Imputing row 301/2623 with 1 missing, elapsed time: 47.298
Imputing row 401/2623 with 6 missing, elapsed time: 47.306
Imputing row 501/2623 with 0 missing, elapsed time: 47.316
Imputing row 601/2623 with 1 missing, elapsed time: 47.324
Imputing row 701/2623 with 0 missing, elapsed time: 47.333
Imputing row 801/2623 with 0 missing, elapsed time: 47.341
Imputing row 901/2623 with 0 missing, elapsed time: 47.349
Imputing row 1001/2623 with 1 missing, elapsed time: 47.365
Imputing row 1101/2623 with 0 missing, elapsed time: 47.373
Imputing row 1201/2623 with 1 missing, elapsed time: 47.391
Imputing row 1301/2623 with 6 missing, elapsed time: 47.406
Imputing row 1401/2623 with 1 missing, elapsed time: 47.422
Imputing row 1501/2623 with 4 missing, elapsed time: 47.448
Imputing row 1601/2623 with 0 missing, elapsed tim

In [835]:
# -----------------------------------------
# Competing risks
# -----------------------------------------
# Subjects who died of a non-cardiovascular cause within the prediction horizon are excluded

prediction_horizon = 35 * 365

# ncvd16 == 1 flags the event, ncvd16d is compared against the horizon
non_cvd_event = kihd_outcomes.loc[:, 'ncvd16'] == 1
within_horizon = kihd_outcomes.loc[:, 'ncvd16d'] <= prediction_horizon
excluded_subjects = kihd_outcomes[non_cvd_event & within_horizon].index
kihd_outcomes = kihd_outcomes.drop(excluded_subjects, axis=0)

# Keep the same subjects in genes and phenotypes, then renumber all three frames from 0
remaining_subjects = kihd_outcomes.index
kihd_genes = kihd_genes.loc[remaining_subjects, :].reset_index(drop=True)
kihd_phenotypes = kihd_phenotypes.loc[remaining_subjects, :].reset_index(drop=True)
kihd_outcomes = kihd_outcomes.reset_index(drop=True)

In [840]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import scipy
import plotly
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import plotly.express as px

In [841]:
# -----------------------------------------
# Perform cross-validation for the given data and model 
# -----------------------------------------
def cv_estimator(cv_splits, data_x, data_y, model, random_state=None):
    """Run one shuffled, stratified k-fold cross-validation and collect AUCs.

    For every fold the predictors are min-max scaled with a scaler fitted on the
    training part only, ``model`` is (re)fitted on the training part, and the
    ROC AUC is computed on both the training and the held-out part.

    Parameters
    ----------
    cv_splits : int
        Number of folds.
    data_x : pandas.DataFrame
        Predictors; rows are selected by position.
    data_y : numpy.ndarray
        Binary labels aligned with the rows of ``data_x``; also used for stratification.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place, so
        after the call it holds the fit of the last fold.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``StratifiedKFold``. The default keeps the previous
        behaviour (a different shuffle on every call); pass an int to make the
        folds reproducible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Training AUCs and test AUCs, one value per fold.
    """
    auc_test = []
    auc_train = []

    splitting = StratifiedKFold(n_splits=cv_splits, random_state=random_state, shuffle=True)

    # Only the labels matter for a stratified split, hence the dummy X
    for train_index, test_index in splitting.split(np.zeros(data_y.shape[0]), data_y):
        data_x_train, data_x_test = data_x.iloc[train_index].copy(), data_x.iloc[test_index].copy()
        y_train, y_test = data_y[train_index], data_y[test_index]

        # Normalize: the scaler never sees the held-out fold
        scaler = MinMaxScaler().fit(data_x_train)
        data_x_train = scaler.transform(data_x_train)
        data_x_test = scaler.transform(data_x_test)

        # Train the model
        model.fit(data_x_train, y_train)

        # Column 1 of predict_proba is the probability of the second class
        auc_test.append(roc_auc_score(y_test, model.predict_proba(data_x_test)[:, 1]))
        auc_train.append(roc_auc_score(y_train, model.predict_proba(data_x_train)[:, 1]))

    return np.array(auc_train), np.array(auc_test)

# -----------------------------------------
# Estimate a confidence interval
# -----------------------------------------
def CI_estimation(sample, confidence_level = 0.95):
    """Two-sided Student-t confidence interval for the mean of ``sample``.

    Parameters
    ----------
    sample : array-like of float
        Observations (numpy array, pandas Series or plain sequence).
    confidence_level : float, default 0.95
        Coverage of the interval.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds. When every observation is identical the
        standard error is zero and the interval collapses to ``(mean, mean)``
        instead of ``(nan, nan)``.
    """
    # A bare `import scipy` does not guarantee that the `stats` submodule is loaded
    from scipy import stats

    sample = np.asarray(sample, dtype=float)
    degrees_freedom = sample.size - 1
    sample_mean = np.mean(sample)
    sample_standard_error = stats.sem(sample)

    # t.interval needs a strictly positive scale; a constant sample has a degenerate interval
    if sample_standard_error == 0:
        return (sample_mean, sample_mean)

    confidence_interval = stats.t.interval(confidence_level, degrees_freedom, sample_mean, sample_standard_error)

    return confidence_interval

# -----------------------------------------
# Estimate AUC for the Logistic Regression model without regularization (FinRisk predictors), i.e. a baseline level
# -----------------------------------------
def auc_estimation_baseline(data_x, data_y, runs, cv_splits):
    """Repeat the cross-validation of the baseline logistic regression ``runs`` times.

    The model is an L1-penalised logistic regression with C=100000, i.e. with a
    very weak penalty. Returns two arrays (training AUCs, test AUCs) holding the
    fold AUCs of all runs, in run order.
    """
    train_scores, test_scores = [], []

    for _ in range(runs):
        baseline_model = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", C=100000)
        fold_train, fold_test = cv_estimator(cv_splits, data_x, data_y, baseline_model)

        train_scores += list(fold_train)
        test_scores += list(fold_test)

    return np.array(train_scores), np.array(test_scores)

In [842]:
# Select an outcome variable
outcome = 'cvd16'
# The companion column '<outcome>d' is the one compared against the prediction horizon
# (assumes the selected outcome has such a column, as 'cvd16'/'cvd16d' do)
outcome_time = outcome + 'd'
# Positive class: the event occurred and it occurred within the prediction horizon.
# Both columns are derived from `outcome`, so changing it above really switches the target.
data_y = ((kihd_outcomes.loc[:, outcome_time] <= prediction_horizon) & (kihd_outcomes.loc[:, outcome] == 1)).astype("int").values.ravel()

# Define parameters of the experiment
runs=50
cv_splits=5

# Define the 'baseline' inputs
data_x = kihd_phenotypes.loc[:, ['v0137', 'tup', 'mvp0224', 'bi0160', 'bi0171', 'diab', 'cvdfam']].copy()
# Train the baseline model
auc_train_baseline, auc_test_baseline = auc_estimation_baseline(data_x, data_y, runs, cv_splits)
# Estimate CIs on the training and test data
ci_test_baseline = CI_estimation(auc_test_baseline)
ci_train_baseline = CI_estimation(auc_train_baseline)

# Get count, mean, std, min, quartiles and max of the baseline predictors per class
data_x.loc[data_y == 0].describe().to_excel('predictors_healthy.xlsx')
data_x.loc[data_y == 1].describe().to_excel('predictors_sick.xlsx')

In [31]:
# Use all available inputs to train the model
# NOTE: this rebinds 'data_x', which previously held only the baseline predictors
data_x = kihd_phenotypes.copy()

# Hyperparameter grids; each value is evaluated separately in the cells below
# n_neighbors of KNeighborsClassifier
knn_neighbors = [3, 5, 10, 15, 20, 35, 50, 75, 100, 150]
# Passed as C of the L1-penalised LogisticRegression (in scikit-learn C is the inverse of the regularization strength)
lambda_lasso = [0.025, 0.05, 0.075, 0.1, 0.15, 0.25, 0.5, 0.75, 1]
# max_depth of both DecisionTreeClassifier and RandomForestClassifier
tree_depth = [2, 3, 4, 5, 7, 9, 11, 13, 15]
# Size of the single hidden layer of MLPClassifier
n_neurons = [3, 5, 10, 15, 30, 50, 100, 300, 400]

In [163]:
# -----------------------------------------
# Estimate AUC on training, validation, and test data:
# a model and its parameters are passed
# a dataframe with results is returned
# -----------------------------------------
def auc_estimation(data_x, data_y, runs, cv_splits, model, model_name, parameter):
    """Repeated nested cross-validation of ``model``; one result row per outer fold.

    For each of ``runs`` repetitions the data are split into ``cv_splits``
    shuffled stratified folds. On every outer training part an inner
    cross-validation (``cv_estimator``) yields the validation AUCs; the model is
    then refitted on the whole (min-max scaled) outer training part and scored
    on the outer training and test parts. The inner validation AUCs are only
    recorded; no hyperparameter is selected here.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Predictors; rows are selected by position.
    data_y : numpy.ndarray
        Binary labels aligned with the rows of ``data_x``.
    runs : int
        Number of repetitions of the outer cross-validation.
    cv_splits : int
        Number of folds of both the outer and the inner cross-validation.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``; fitted in place.
    model_name, parameter
        Labels copied verbatim into the 'model' and 'parameter' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'run', 'cv', 'auc_training', 'auc_validation', 'auc_test',
        'model', 'parameter'. 'auc_validation' holds the inner-fold AUCs as the
        string form of a list of plain floats, e.g. '[0.71, 0.69]'.
    """
    summary_columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter']
    # Rows are gathered in a list and turned into a frame once, instead of
    # re-copying a growing DataFrame on every fold
    records = []

    for r in range(runs):
        splitting = StratifiedKFold(n_splits=cv_splits, random_state=None, shuffle=True)

        # Only the labels matter for a stratified split, hence the dummy X
        for cv, (train_index, test_index) in enumerate(splitting.split(np.zeros(data_y.shape[0]), data_y)):
            # The split yields positions, so select by position (.iloc): label-based
            # .loc is only correct while the index happens to be 0..n-1
            data_x_train, data_x_test = data_x.iloc[train_index].copy(), data_x.iloc[test_index].copy()
            y_train, y_test = data_y[train_index], data_y[test_index]

            # -----------------------------------------
            # Inner cross-validation on the training data only
            # -----------------------------------------
            _, auc_valid_inside = cv_estimator(cv_splits, data_x_train, y_train, model)

            # Normalize: the scaler never sees the outer test fold
            scaler = MinMaxScaler().fit(data_x_train)
            data_x_train = scaler.transform(data_x_train)
            data_x_test = scaler.transform(data_x_test)

            # Train the model
            model.fit(data_x_train, y_train)

            # Column 1 of predict_proba is the probability of the second class
            auc_test = roc_auc_score(y_test, model.predict_proba(data_x_test)[:, 1])
            auc_train = roc_auc_score(y_train, model.predict_proba(data_x_train)[:, 1])

            records.append({
                'run': r,
                'cv': cv,
                'auc_training': auc_train,
                # Plain floats keep the text free of 'np.float64(...)' wrappers (NumPy >= 2 repr)
                'auc_validation': str([float(value) for value in auc_valid_inside]),
                'auc_test': auc_test,
                'model': model_name,
                'parameter': parameter,
            })

    return pd.DataFrame(records, columns=summary_columns)

In [None]:
# -----------------------------------------
# k-Nearest Neighbors: evaluate every neighbourhood size and export the AUCs
# -----------------------------------------
knn_frames = [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter'])]
for knn in knn_neighbors:
    model_knn = KNeighborsClassifier(weights='uniform', n_neighbors=knn)
    knn_frames.append(
        auc_estimation(data_x, data_y, runs, cv_splits, model_knn, 'k-Nearest Neighbors', knn)
    )
auc_summary = pd.concat(knn_frames, ignore_index=True)

auc_summary.to_excel('auc_KNeighborsClassifier.xlsx', index = False)

In [None]:
# -----------------------------------------
# L1-penalised logistic regression: evaluate every value of C and export the AUCs
# -----------------------------------------
lasso_frames = [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter'])]
for lambd in lambda_lasso:
    model_lasso = LogisticRegression(C=lambd, penalty="l1", solver="liblinear", max_iter=500)
    lasso_frames.append(
        auc_estimation(data_x, data_y, runs, cv_splits, model_lasso, 'Logistic Lasso Regression', lambd)
    )
auc_summary = pd.concat(lasso_frames, ignore_index=True)

auc_summary.to_excel('auc_LogisticRegression.xlsx', index = False)

In [None]:
# -----------------------------------------
# Decision tree: evaluate every maximal depth and export the AUCs
# -----------------------------------------
dt_frames = [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter'])]
for depth in tree_depth:
    # Classes are left unweighted (class_weight=None)
    model_dt = DecisionTreeClassifier(class_weight=None, max_depth=depth)
    dt_frames.append(
        auc_estimation(data_x, data_y, runs, cv_splits, model_dt, 'Decision Tree', depth)
    )
auc_summary = pd.concat(dt_frames, ignore_index=True)

auc_summary.to_excel('auc_DecisionTreeClassifier.xlsx', index = False)

In [None]:
# -----------------------------------------
# Random forest (100 trees): evaluate every maximal depth and export the AUCs
# -----------------------------------------
rf_frames = [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter'])]
for depth in tree_depth:
    # NOTE(review): oob_score=True is requested but the out-of-bag score is not read in this cell
    model_rf = RandomForestClassifier(max_depth=depth, n_estimators=100, oob_score=True, bootstrap=True)
    rf_frames.append(
        auc_estimation(data_x, data_y, runs, cv_splits, model_rf, 'Random Forest', depth)
    )
auc_summary = pd.concat(rf_frames, ignore_index=True)

auc_summary.to_excel('auc_RandomForestClassifier.xlsx', index = False)

In [None]:
# -----------------------------------------
# Multilayer perceptron (one hidden layer): evaluate every layer size and export the AUCs
# -----------------------------------------
mlp_frames = [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter'])]
for neurons in n_neurons:
    model_mlp = MLPClassifier(
        # architecture
        hidden_layer_sizes=(neurons, ), activation='relu',
        # optimiser
        solver='adam', alpha=0.001, batch_size='auto',
        learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
        momentum=0.9, nesterovs_momentum=True,
        beta_1=0.9, beta_2=0.999, epsilon=1e-08,
        # stopping: at most 500 iterations, early stopping on a 10% validation fraction
        max_iter=500, tol=0.0001, n_iter_no_change=10,
        early_stopping=True, validation_fraction=0.1,
        # miscellaneous
        shuffle=True, random_state=None, verbose=False, warm_start=False)

    mlp_frames.append(
        auc_estimation(data_x, data_y, runs, cv_splits, model_mlp, 'Multilayer Perceptron', neurons)
    )
auc_summary = pd.concat(mlp_frames, ignore_index=True)

auc_summary.to_excel('auc_MLPClassifier.xlsx', index = False)

In [810]:
# -----------------------------------------
# Load the exported AUC tables of all models and stack them
# -----------------------------------------
auc_knn = pd.read_excel('auc_KNeighborsClassifier.xlsx')
auc_lasso = pd.read_excel('auc_LogisticRegression.xlsx')
auc_dt = pd.read_excel('auc_DecisionTreeClassifier.xlsx')
auc_rf = pd.read_excel('auc_RandomForestClassifier.xlsx')
auc_mlp = pd.read_excel('auc_MLPClassifier.xlsx')

# One long table, in the order kNN, lasso, decision tree, random forest, MLP
auc_summary = pd.concat(
    [pd.DataFrame(columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter']),
     auc_knn, auc_lasso, auc_dt, auc_rf, auc_mlp],
    ignore_index=True)

In [811]:
# -----------------------------------------
# Modify the results' format for drawing figures and tables
# -----------------------------------------
def _parse_validation_aucs(text):
    """Turn the stored text of a list of AUCs, e.g. '[0.71, 0.69]', into floats."""
    inner = text.strip()[1:-1].strip()
    if not inner:
        return []
    return [float(item) for item in inner.split(',')]


def prepare_for_boxplots(auc):
    """Reshape one model's AUC table into long form and summarise it.

    Parameters
    ----------
    auc : pandas.DataFrame
        Seven columns, in this order: run, cv, training AUC, validation AUCs
        (text of a list), test AUC, model, parameter. The columns are matched
        by position. The frame itself is not modified.

    Returns
    -------
    df1 : pandas.DataFrame
        Long table with columns 'run', 'cv', 'model', 'parameter', 'sample', 'AUC':
        all training rows, then all test rows, then one row per inner validation AUC.
    df2 : pandas.DataFrame
        One row per (parameter, sample) with columns 'mean', 'ci', 'sample',
        'parameter', 'model'; 'ci' is the half-width (upper bound minus mean)
        returned by ``CI_estimation``. 'model' is taken from the first input row.
    """
    # Work on a renumbered copy: the caller's frame keeps its column names and
    # rows can be addressed regardless of the incoming index
    auc = auc.reset_index(drop=True)
    auc.columns = ['run', 'cv', 'training', 'validation', 'test', 'model', 'parameter']

    # Boxplots
    df1 = pd.melt(auc, id_vars=['run', 'cv', 'model', 'parameter'], value_vars=['training', 'test'])
    df1.columns = ['run', 'cv', 'model', 'parameter', 'sample', 'AUC']

    # DataFrame.append no longer exists in pandas >= 2.0; collect the rows and concatenate once
    validation_rows = []
    for row in auc.itertuples(index=False):
        for item in _parse_validation_aucs(row.validation):
            validation_rows.append({'run': row.run, 'cv': row.cv, 'model': row.model,
                                    'parameter': row.parameter, 'sample': 'validation', 'AUC': item})
    if validation_rows:
        df1 = pd.concat([df1, pd.DataFrame(validation_rows, columns=df1.columns)], ignore_index=True)

    # Confidence intervals
    model_name = auc['model'].iloc[0] if len(auc) else None
    stats_rows = []
    for prmtr in df1['parameter'].unique():
        for smpl in ['training', 'validation', 'test']:
            selected = (df1.loc[:, 'parameter'] == prmtr) & (df1.loc[:, 'sample'] == smpl)
            selected_auc = df1.loc[selected, 'AUC']
            selected_mean = selected_auc.mean()
            stats_rows.append({'mean': selected_mean,
                               'ci': CI_estimation(selected_auc)[1] - selected_mean,
                               'sample': smpl, 'parameter': prmtr, 'model': model_name})
    df2 = pd.DataFrame(stats_rows, columns = ['mean', 'ci', 'sample', 'parameter', 'model'])

    return df1, df2

In [812]:
# -----------------------------------------
# Reshape every model's results, then stack the long tables and the summaries
# -----------------------------------------
auc_knn_melted, auc_knn_stats = prepare_for_boxplots(auc_knn)
auc_lasso_melted, auc_lasso_stats = prepare_for_boxplots(auc_lasso)
auc_dt_melted, auc_dt_stats = prepare_for_boxplots(auc_dt)
auc_rf_melted, auc_rf_stats = prepare_for_boxplots(auc_rf)
auc_mlp_melted, auc_mlp_stats = prepare_for_boxplots(auc_mlp)

# Order: kNN, lasso, decision tree, random forest, MLP
data_for_boxplots = pd.concat(
    [pd.DataFrame(), auc_knn_melted, auc_lasso_melted, auc_dt_melted, auc_rf_melted, auc_mlp_melted],
    ignore_index = True)
auc_stats = pd.concat(
    [pd.DataFrame(), auc_knn_stats, auc_lasso_stats, auc_dt_stats, auc_rf_stats, auc_mlp_stats],
    ignore_index = True)

In [847]:
# -----------------------------------------
# Produce boxplots for AUCs
# -----------------------------------------
def draw_boxplots(data_for_boxplots):
    """Draw AUC boxplots (one facet row per model), show them and save 'AUCs.html'.

    Parameters
    ----------
    data_for_boxplots : pandas.DataFrame
        Six columns, in this order: run, cv, model, parameter, sample, AUC.
        The columns are matched by position; the frame itself is not modified.

    Notes
    -----
    Reads the global ``auc_test_baseline`` for the dashed baseline line.
    The facet row numbers, x ranges and annotation positions are hard-coded
    for the five models and parameter grids evaluated in this notebook.
    """
    # Rename on a copy so the caller's frame keeps its own column names
    plot_data = data_for_boxplots.copy()
    plot_data.columns = ['run', 'cv', 'model', 'parameter', 'Sample: ', 'AUC']

    fig = px.box(plot_data, x='parameter', y='AUC', facet_row='model', color='Sample: ',
                 color_discrete_sequence=px.colors.qualitative.Set2,
                 category_orders={'Sample: ': ['training', 'validation', 'test']})

    # Every facet gets its own axes ranges; parameters are shown as categories
    fig.update_yaxes(matches=None)
    fig.update_xaxes(matches=None, type='category', showticklabels=True, title = '')

    # Facet labels read 'model=<name>'; keep only the name
    fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
    fig.update_traces(legendgroup = 'main')

    fig.update_layout(
        font_size = 25,
        autosize=True,
        width=1700,
        height=2500)

    # Dashed line at the mean test AUC of the baseline model, one per facet row:
    # (facet row, first and last parameter value of that row, show in legend?)
    baseline_auc = auc_test_baseline.mean()
    baseline_lines = [
        (5, [3, 150], False),
        (4, [0.025, 1], False),
        (3, [2, 15], False),
        (2, [2, 15], False),
        (1, [3, 400], True),   # single legend entry for all five lines
    ]
    for facet_row, x_range, in_legend in baseline_lines:
        legend_options = {} if in_legend else {'showlegend': False}
        fig.add_scatter(x=x_range, y=[baseline_auc]*2, marker=dict(opacity=0), opacity = 0.8,
                        line = dict(dash='dash', color='gray'), name = 'baseline test AUC',
                        legendgroup = 'baseline', row=facet_row, col=1, **legend_options)

    fig.update_layout(legend=dict( tracegroupgap = 50,
        orientation='h', traceorder = 'grouped',
        bordercolor = 'gray',
        y=0.055,
        x=0.635))

    # Name of the tuned parameter inside each facet: (facet row, x, y, text)
    parameter_labels = [
        (5, 35, 0.55, 'The number of nearest neighbors'),
        (4, 0.15, 0.6, 'Regularization'),
        (3, 7, 0.4, 'Decision tree depth'),
        (2, 7, 0.65, 'Decision tree depth'),
        (1, 30, 0.35, 'The number of neurons'),
    ]
    for facet_row, x_pos, y_pos, label in parameter_labels:
        fig.add_annotation(row=facet_row, col=1, x=x_pos, y=y_pos, showarrow=False, text=label)

    fig.show()

    plotly.offline.plot(fig, filename = 'AUCs.html', auto_open=False)


draw_boxplots(data_for_boxplots)

In [851]:
import plotly.graph_objects as go

# -----------------------------------------
# Produce tables with 95%CIs for AUCs
# -----------------------------------------
def get_info_for_CI_table(auc_stats, model):
    """Collect the table columns for one model.

    Returns four sequences of equal length: the parameter values of ``model``
    (in order of first appearance) and, for the training, validation and test
    samples, the text 'mean (mean-ci, mean+ci)' with every number rounded to
    four decimals.
    """
    of_model = auc_stats.loc[:, 'model'] == model
    col1 = auc_stats.loc[of_model, 'parameter'].unique()

    # One list of formatted cells per sample, filled parameter by parameter
    cells = {'training': [], 'validation': [], 'test': []}

    for prmtr in col1:
        of_parameter = of_model & (auc_stats.loc[:, 'parameter'] == prmtr)
        for sample_name in ('training', 'validation', 'test'):
            of_sample = of_parameter & (auc_stats.loc[:, 'sample'] == sample_name)
            # First matching row; raises IndexError if the combination is absent
            auc_mean = auc_stats.loc[of_sample, 'mean'].values[0]
            auc_ci = auc_stats.loc[of_sample, 'ci'].values[0]
            lower = round(auc_mean-auc_ci, 4)
            upper = round(auc_mean+auc_ci, 4)
            cells[sample_name].append('{} ({}, {})'.format(round(auc_mean, 4), lower, upper))

    return col1, cells['training'], cells['validation'], cells['test']


def create_CI_tables(auc_stats):
    """Render one table of mean AUCs with 95% CIs per model, show them and save 'CI AUCs.html'.

    Parameters
    ----------
    auc_stats : pandas.DataFrame
        Summary with columns 'mean', 'ci', 'sample', 'parameter', 'model',
        as consumed by ``get_info_for_CI_table``.
    """
    # make_subplots is not imported anywhere else in the notebook
    from plotly.subplots import make_subplots

    # (model name as stored in auc_stats, header of the parameter column), top to bottom
    tables = [
        ('k-Nearest Neighbors', 'The number of nearest neighbors'),
        ('Logistic Lasso Regression', 'Regularization'),
        ('Decision Tree', 'Decision tree depth'),
        ('Random Forest', 'Decision tree depth'),
        ('Multilayer Perceptron', 'The number of neurons'),
    ]

    fig = make_subplots(
        rows=len(tables), cols=1, vertical_spacing=0.05,
        specs=[[{"type": "table"}] for _ in tables],
        subplot_titles=[model_name for model_name, _ in tables])

    for table_row, (model_name, parameter_header) in enumerate(tables, start=1):
        col1, col2, col3, col4 = get_info_for_CI_table(auc_stats, model_name)
        fig.add_trace(go.Table(header=dict(values=[parameter_header, 'Training data: AUC mean (95%CI)',
                                                   'Validation data: AUC mean (95%CI)', 'Test data: AUC mean (95%CI)']),
                               cells=dict(values=[col1, col2, col3, col4])), row = table_row, col = 1)

    fig.update_layout(
        autosize=True,
        height=1600)
    fig.show()

    plotly.offline.plot(fig, filename = 'CI AUCs.html', auto_open=False)


create_CI_tables(auc_stats)