In [44]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import scipy
import plotly
import os
import miceforest as mf
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px

In [3]:
# Load every sample from the source Excel workbook into a DataFrame
data_kihd = pd.read_excel(io='kihd_may_2019.xlsx')

In [14]:
# Scenario selection (controls how competing risks are handled):
#   scenario == 0 -> competing risks are removed
#   scenario == 1 -> competing risks are treated as non-cases
scenario = 0  # alternative: scenario = 1

# Tag used in the names of the folders and files produced for this scenario
folder_prefix = 'no_ncvd' if scenario == 0 else 'ncvd'

Preprocessing
-----------------

In [5]:
# -----------------------------------------
# Keep the original data in 'data_kihd' and work on its copy 'data_kihd_preprocessed'
# -----------------------------------------
data_kihd_preprocessed = data_kihd.copy()

# The visit-date variables 'tpvm2' and 'tpnr2' are not dropped here;
# they are split off together with the outcome columns below.

# Correct the zero-level for the following variables:
wrong_zero = ['v0563', 'v0565', 'v0567', 'v0569', 'v0571', 'v0573', 'v0575', 'v0577', 'v0579', 'v0607', 'v0609', 'v0613',
              'v0621', 'v0623', 'v0625', 'v0627', 'v0629', 'v0631', 'v0633', 'v0635', 'v0637', 'v0639', 'v0643', 'v0645',
              'v0647', 'v0649', 'v0651', 'v0653']
# Zeros are at the wrong end of the scale: move them to (max + 1).
# The column maximum is computed once per column (Series.max skips NaN)
# instead of being re-evaluated for every zero entry; missing values stay missing.
for col in wrong_zero:
    column = data_kihd_preprocessed[col]
    data_kihd_preprocessed[col] = column.mask(column == 0.0, column.max() + 1)

print(data_kihd_preprocessed.shape)
# -----------------------------------------
# Turn the categorical variables into dummies
# -----------------------------------------
categorical_variables=['au0136','au0153','ek0115','ek0119','ka0118','mi0205','mi0207','mi0208','mi0209',
                       'mi0210','mi0211','mi0212','mi0213','mi0214','v0145','v0146','v0157','v0158','v0161',
                       'v0172','v0247','v0248','v0665','v0721','v0724','u1307']

for col in categorical_variables:
    # Variables that are absent from the table are silently skipped
    if col not in data_kihd_preprocessed.columns:
        continue
    new_dummies = pd.get_dummies(data_kihd_preprocessed[col], dummy_na=False)
    # Name every indicator column '<variable>_<level>'
    new_dummies.columns = [col + "_" + str(level) for level in new_dummies.columns.values]
    # Replace the original variable by its indicator columns (appended at the end)
    data_kihd_preprocessed = data_kihd_preprocessed.drop(col, axis=1).join(new_dummies)

# Columns that describe outcomes / follow-up rather than predictors.
# Defined once so that the outcome table and the predictor table cannot drift apart.
outcome_columns = ['tutknro', 'tpvm2', 'tpnr2', 'chdb16', 'chdb16d', 'amif16', 'amif16d', 'amig16', 'amig16d',
                   'amim16', 'amim16d', 'all16', 'all16d', 'cvd16', 'cvd16d', 'syd14', 'alzm14', 'vp14', 'kol14', 'diab14',
                   'ncvd16', 'ncvd16d', 'cv15', 'cv15d', 'all15', 'stro15', 'cvd15', 'chd15', 'amib15', 'isth15', 'hsth15',
                   'db15', 'can15', 'canc15', 'ast15', 'copd15', 'dema15', 'finchd18', 'finchd18d']

# Outcomes
kihd_outcomes = data_kihd_preprocessed.loc[:, outcome_columns]

# Predictors (everything that is not an outcome column)
kihd_predictors = data_kihd_preprocessed.drop(outcome_columns, axis = 1)

# Separate genes and phenotypes: the gene variables occupy the contiguous
# column range from 'feder2hh' to 'cataett' (both included)
genes_start = list(kihd_predictors.columns.values).index('FEDER2HH'.lower())
genes_end = list(kihd_predictors.columns.values).index('CATAETT'.lower())

# Genes only
kihd_genes = kihd_predictors.iloc[:, genes_start:(genes_end+1)].copy()
print("KIHD with genes only: {}".format(kihd_genes.shape))

# Phenotypes (all predictors outside the gene range)
kihd_phenotypes = kihd_predictors.drop(kihd_predictors.columns.values[genes_start:genes_end+1], axis = 1)
print("KIHD with phenotypes only: {}".format(kihd_phenotypes.shape))

(2682, 994)
KIHD with genes only: (2682, 96)
KIHD with phenotypes only: (2682, 977)


In [6]:
# -----------------------------------------
# Drop sparsely observed predictors from 'kihd_phenotypes'
# -----------------------------------------
# A column is kept only if it holds at least `threshold_columns` non-missing
# values, i.e. roughly no more than 5% of its entries are missing
n_subjects = kihd_phenotypes.shape[0]
threshold_columns = n_subjects - round(0.05 * n_subjects)
kihd_phenotypes = kihd_phenotypes.dropna(axis='columns', thresh=threshold_columns)

print("Filter out predictors with more than 5% of missing values ...")
n_rows, n_cols = kihd_phenotypes.shape
print("Dataset size: {rows}x{cols}".format(rows = n_rows, cols = n_cols))
print("-------------------------------------")

# Re-align genes and outcomes with the row labels of 'kihd_phenotypes'
retained_subjects = kihd_phenotypes.index
kihd_genes = kihd_genes.loc[retained_subjects, :]
kihd_outcomes = kihd_outcomes.loc[retained_subjects, :]

Filter out predictors with more than 5% of missing values ...
Dataset size: 2682x746
-------------------------------------


In [7]:
# -----------------------------------------
# Handle competing risks
# -----------------------------------------
# Prediction horizon; presumably expressed in days (35 years x 365.25) - confirm the unit
prediction_horizon = 35*365.25

if scenario == 0:
    # Subjects flagged in 'ncvd16' (non-cardiovascular death) whose 'ncvd16d'
    # falls within the prediction horizon are removed from all three tables
    non_cvd_event = kihd_outcomes['ncvd16'] == 1
    inside_horizon = kihd_outcomes['ncvd16d'] <= prediction_horizon
    competing_risk_subjects = kihd_outcomes.index[non_cvd_event & inside_horizon]
    kihd_outcomes = kihd_outcomes.drop(index=competing_risk_subjects)

    # Keep the same subjects in genes and phenotypes, then renumber the rows
    # 0..n-1 in every table so that all three stay aligned
    kihd_genes = kihd_genes.loc[kihd_outcomes.index, :].reset_index(drop=True)
    kihd_phenotypes = kihd_phenotypes.loc[kihd_outcomes.index, :].reset_index(drop=True)
    kihd_outcomes = kihd_outcomes.reset_index(drop=True)

    n_rows, n_cols = kihd_phenotypes.shape
    print("Dataset size: {rows}x{cols}".format(rows = n_rows, cols = n_cols))


Dataset size: 1861x746


Baseline dataset
---------------

In [22]:
# Select an outcome variable.
# The event flag is read from column `outcome` and the matching follow-up column
# from `outcome + 'd'` (e.g. 'cvd16' / 'cvd16d'), so changing `outcome` really
# changes the labels.
# NOTE(review): assumes every outcome of interest has a '<name>d' companion column - confirm
outcome = 'cvd16'

# A subject is a case (1) when the event is flagged AND it lies within the prediction horizon
event_flagged = kihd_outcomes.loc[:, outcome] == 1
event_in_horizon = kihd_outcomes.loc[:, outcome + 'd'] <= prediction_horizon
data_y = (event_in_horizon & event_flagged).astype("int").values.ravel()


# Define the 'baseline' inputs
baseline_predictors = ['v0137', 'tup', 'mvp0224', 'bi0160', 'bi0171', 'diab', 'cvdfam']
data_x = kihd_phenotypes.loc[:, baseline_predictors].copy()

Splitting into training and test parts
-------------------------------------

In [18]:
import os

# Directories that will hold the split files:
#   folder1 - baseline (selected predictors) sample
#   folder2 - high-dim sample
folder1 = "splitting_7_{}".format(folder_prefix)
folder2 = "splitting_746_{}".format(folder_prefix)

for split_folder in (folder1, folder2):
    try:
        os.mkdir(split_folder)
    except OSError:
        # Any OSError lands here, not only "directory already exists"
        print ("Creation of the directory %s failed (already exists?)" % split_folder)
    else:
        print ("Successfully created the directory %s" % split_folder)


Creation of the directory splitting_7_no_ncvd failed (already exists?)
Creation of the directory splitting_746_no_ncvd failed (already exists?)


In [23]:
# Define parameters of the experiment
runs=50
cv_splits=5


def cv_splitting(cv_splits, data_x, data_y, random_state=None):
    '''
    Generate shuffled, stratified train/test splits as index labels of data_x

    Parameters
    ----------
    cv_splits: int
         number of folds
    data_x: pd.DataFrame
         predictors; only its length and its index are used here
    data_y: array-like
         class labels used for stratification, in the same row order as data_x
    random_state: int, RandomState instance or None
         forwarded to StratifiedKFold; None (default) keeps the previous
         behaviour of a different shuffle on every call. Pass a different
         value per run (e.g. seed + run number) to get reproducible but
         distinct repetitions.

    Yields
    ------
    train_index_original: pd.Index
        index labels of data_x selected for training
    test_index_original: pd.Index
        index labels of data_x selected for testing
    '''
    splitting = StratifiedKFold(n_splits=cv_splits, random_state=random_state, shuffle=True)

    # StratifiedKFold yields positions; translate them into index labels of data_x
    for train_positions, test_positions in splitting.split(data_x, data_y):
        yield data_x.index[train_positions], data_x.index[test_positions]

In [112]:
#
# Splitting data multiple times for cross-validation (upper loop)
#
# cv_splitting() yields index *labels* of data_x. The class labels are therefore
# wrapped in a Series that carries the same index, so that the targets are
# selected by label as well (indexing the raw array with labels is only correct
# while the labels happen to equal the row positions).
class_labels = pd.Series(data_y, index=data_x.index)

# Every split is stored twice: with the baseline predictors and with all phenotypes
split_targets = (
    ("splitting_7_{}", data_x),             # Baseline
    ("splitting_746_{}", kihd_phenotypes),  # High-dimensional
)

for r in range(runs):
    for cv, (train_index, test_index) in enumerate(cv_splitting(cv_splits, data_x, data_y)):
        for folder_template, predictors in split_targets:
            target_folder = folder_template.format(folder_prefix)

            # The target is appended as the last column, named 'class'
            train_data = predictors.loc[train_index].copy()
            train_data['class'] = class_labels.loc[train_index].to_numpy()

            test_data = predictors.loc[test_index].copy()
            test_data['class'] = class_labels.loc[test_index].to_numpy()

            # Save the current split in the folder
            train_data.to_excel("{}/train_data_{}_{}.xlsx".format(target_folder, r, cv))
            test_data.to_excel("{}/test_data_{}_{}.xlsx".format(target_folder, r, cv))
    

Missing data imputation
-----------------------

1. baseline dataset (small)
____________________________

In [24]:
# -------------------------------------------------------------------------------
# Function:
# Fit a MICE imputer on the training part and use it to fill gaps in both parts
# -------------------------------------------------------------------------------

def filling_gaps_small_data(train_data, test_data):
    '''
    Fill gaps in training and test data using MICE

    Parameters
    ----------
    train_data: pd.DataFrame
         predictors with gaps, used to train the imputer
    test_data: pd.DataFrame
         predictors with gaps, filled by the trained imputer

    Returns
    -------
    train_filled: pd.DataFrame
        training data with filled gaps
    test_filled: pd.DataFrame
        test data with filled gaps
    '''

    # Re-seed NumPy's global generator (no fixed seed is given)
    np.random.seed()

    # MICE kernel from the miceforest package, built on the training predictors
    imputer = mf.KernelDataSet(
        train_data,
        mean_match_candidates=5,
        save_all_iterations=True,
        save_models=1,
        random_state=None,
    )

    # 10 iterations of the MICE algorithm
    imputer.mice(10, n_jobs=10, max_depth=3, n_estimators=50, oob_score=True, bootstrap=True)

    # Completed training data, then the test data completed by the same kernel
    train_filled = imputer.complete_data()
    test_filled = imputer.impute_new_data(new_data=test_data).complete_data()

    return train_filled, test_filled

In [26]:
# Directories that will hold the imputed files:
#   folder1 - baseline (selected predictors) sample
#   folder2 - high-dim sample
folder1 = "imputing_7_{}".format(folder_prefix)
folder2 = "imputing_746_{}".format(folder_prefix)

for imputing_folder in (folder1, folder2):
    try:
        os.mkdir(imputing_folder)
    except OSError:
        # Any OSError lands here, not only "directory already exists"
        print ("Creation of the directory %s failed (already exists?)" % imputing_folder)
    else:
        print ("Successfully created the directory %s" % imputing_folder)

Creation of the directory imputing_7_no_ncvd failed (already exists?)
Creation of the directory imputing_746_no_ncvd failed (already exists?)


In [114]:
# Impute every saved baseline split and store the completed train/test tables
for r in range(runs):
    for cv in range(cv_splits):
        split_tag = (folder_prefix, r, cv)
        train_data = pd.read_excel("splitting_7_{}/train_data_{}_{}.xlsx".format(*split_tag), index_col = 0)
        test_data = pd.read_excel("splitting_7_{}/test_data_{}_{}.xlsx".format(*split_tag), index_col = 0)

        # Predictors only: every column except the last one (the 'class' target)
        train_predictors = train_data.loc[:, train_data.columns[:-1]]
        test_predictors = test_data.loc[:, test_data.columns[:-1]]
        train_filled, test_filled = filling_gaps_small_data(train_predictors, test_predictors)

        # Put the (never imputed) target back
        train_filled['class'] = train_data['class'].copy()
        test_filled['class'] = test_data['class'].copy()

        train_filled.to_excel("imputing_7_{}/train_filled_{}_{}.xlsx".format(*split_tag))
        test_filled.to_excel("imputing_7_{}/test_filled_{}_{}.xlsx".format(*split_tag))

2. high-dimensional dataset
______________________________

In [28]:
# In the test part of high-dimensional data, there are often gaps in variables that do not have missing values in the training part. 
# Therefore, an imputer is trained on the joint dataset (train+test) again to fill gaps in the test part. 
# Gaps in the training part are filled without using the test part.

In [29]:
# -------------------------------------------------------------------------------
# Function:
# Train an imputer with the MICE algorithm to fill gaps in training and test data
# -------------------------------------------------------------------------------

def filling_gaps_big_data(train_data, test_data):
    '''
    Fill gaps in training and test data using MICE

    Parameters
    ----------
    train_data: pd.DataFrame
         predictors with gaps to train an imputer
    test_data: pd.DataFrame
         predictors with gaps to apply an imputer

    Returns
    -------
    train_filled: pd.DataFrame
        training data with filled gaps
    test_filled: pd.DataFrame
        test data with filled gaps (an independent copy, safe to modify)
    '''

    # In this function, an imputer is first trained to fill gaps in the training data.
    # Then, training samples without gaps are combined with test samples with gaps
    # and an imputer is trained again to fill gaps in the test data.
    # This is done in this way because test examples have missing values in variables
    # that have no gaps in the training data, therefore, the MICE imputer cannot fill
    # them after training on the training data.
    # This procedure imitates the real world scenario, when the model is first trained
    # on some data, then other unseen data come for tests.

    # Create kernel
    np.random.seed()

    # Define parameters of the MICE algorithm from the miceforest package
    mice_kernel = mf.KernelDataSet(
        train_data,
        mean_match_candidates=5,
        save_all_iterations=True,
        save_models = 1,
        random_state=None
    )

    # Run 10 iterations of the MICE algorithm
    mice_kernel.mice(10, n_jobs=10, max_depth = 5, n_estimators=100, oob_score=True, bootstrap=True, verbose=False)

    # Return a training dataset filled
    train_filled = mice_kernel.complete_data()

    # Train and test joint sample: filled training rows first, test rows after them
    test_starts = train_filled.shape[0]
    all_samples = pd.concat([train_filled, test_data])

    print("Trained has been filled")

    # Create kernel for all samples
    np.random.seed()

    mice_kernel_test = mf.KernelDataSet(
        all_samples,
        mean_match_candidates=5,
        save_all_iterations=False,
        save_models = 1,
        random_state=None
    )

    # Run the MICE algorithm for 10 iterations (to fill gaps in test samples)
    mice_kernel_test.mice(10, n_jobs=10, max_depth = 5, n_estimators=100, oob_score=True, bootstrap=True, verbose=False)

    # Fill gaps in samples (gaps are in test samples)
    all_samples_filled = mice_kernel_test.complete_data()

    # Return filled test samples.
    # .copy() detaches the rows from 'all_samples_filled': callers add a 'class'
    # column to the result, which must not operate on a slice of another frame.
    test_filled = all_samples_filled.iloc[test_starts:].copy()

    print("Test has been filled")

    return train_filled, test_filled

In [None]:
# Impute every saved high-dimensional split and store the completed train/test tables
for r in range(runs):
    print("Run: ", r)
    for cv in range(cv_splits):
        print("CV: ", cv)
        split_tag = (folder_prefix, r, cv)
        train_data = pd.read_excel("splitting_746_{}/train_data_{}_{}.xlsx".format(*split_tag), index_col = 0)
        test_data = pd.read_excel("splitting_746_{}/test_data_{}_{}.xlsx".format(*split_tag), index_col = 0)

        # Predictors only: every column except the last one (the 'class' target)
        train_predictors = train_data.loc[:, train_data.columns[:-1]]
        test_predictors = test_data.loc[:, test_data.columns[:-1]]
        train_filled, test_filled = filling_gaps_big_data(train_predictors, test_predictors)

        # Put the (never imputed) target back
        train_filled['class'] = train_data['class'].copy()
        test_filled['class'] = test_data['class'].copy()

        train_filled.to_excel("imputing_746_{}/train_filled_{}_{}.xlsx".format(*split_tag))
        test_filled.to_excel("imputing_746_{}/test_filled_{}_{}.xlsx".format(*split_tag))

Estimate AUC in multiple runs
------------------------------

Baseline level
__________________

In [380]:
# -----------------------------------------
# Baseline level: AUC of Logistic Regression on the FinRisk predictors.
# The L1 penalty is configured with C=100000
# -----------------------------------------
def auc_estimation_baseline(runs, cv_splits, folder):
    '''
    Train and score the baseline model on every imputed split

    Parameters
    ----------
    runs: int
         number of repetitions whose files are read
    cv_splits: int
         number of folds per repetition
    folder: str
         folder holding 'train_filled_<run>_<cv>.xlsx' / 'test_filled_<run>_<cv>.xlsx'

    Returns
    -------
    np.ndarray
        training AUC of every (run, fold), in loop order
    np.ndarray
        test AUC of every (run, fold), in loop order
    '''
    train_scores, test_scores = [], []

    for r in range(runs):
        for cv in range(cv_splits):
            train_table = pd.read_excel(folder + "/train_filled_{}_{}.xlsx".format(r, cv), index_col = 0)
            test_table = pd.read_excel(folder + "/test_filled_{}_{}.xlsx".format(r, cv), index_col = 0)

            # Predictors are all columns but the last; the target is the 'class' column
            x_train = train_table.loc[:, train_table.columns[:-1]].copy()
            x_test = test_table.loc[:, test_table.columns[:-1]].copy()
            y_train = train_table.loc[:, 'class'].values
            y_test = test_table.loc[:, 'class'].values

            # Min-max scaling, fitted on the training part only
            scaler = MinMaxScaler().fit(x_train)
            x_train_scaled = scaler.transform(x_train)
            x_test_scaled = scaler.transform(x_test)

            # A fresh model for every split
            model = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", C=100000)
            model.fit(x_train_scaled, y_train)

            # The second column of predict_proba is used as the score
            test_scores.append(roc_auc_score(y_test, model.predict_proba(x_test_scaled)[:, 1]))
            train_scores.append(roc_auc_score(y_train, model.predict_proba(x_train_scaled)[:, 1]))

    return np.array(train_scores), np.array(test_scores)

In [381]:
# Score the baseline model on the imputed 7-predictor splits
folder = "imputing_7_{}".format(folder_prefix)

baseline_aucs = auc_estimation_baseline(runs=runs, cv_splits=cv_splits, folder=folder)
auc_train_baseline, auc_test_baseline = baseline_aucs

In [32]:
# -----------------------------------------
# Estimate a confidence interval
# -----------------------------------------
def CI_estimation(sample, confidence_level = 0.95):
    '''
    Student-t confidence interval for the mean of a sample

    The mean and the interval bounds are also printed.

    Parameters
    ----------
    sample: array-like of numbers
         observations (np.ndarray, pd.Series, list, ...); NaN values propagate
    confidence_level: float
         coverage of the interval, 0.95 by default

    Returns
    -------
    confidence_interval: tuple
        (lower bound, upper bound) of the interval around the sample mean
    '''
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule has been loaded.
    from scipy import stats

    # Accept any array-like, not only objects that expose `.size`
    sample = np.asarray(sample, dtype=float)

    degrees_freedom = sample.size - 1
    sample_mean = np.mean(sample)
    sample_standard_error = stats.sem(sample)

    confidence_interval = stats.t.interval(confidence_level, degrees_freedom, sample_mean, sample_standard_error)
    print(sample_mean, '({0:.4f}, {1:.4f})'.format(confidence_interval[0], confidence_interval[1]))

    return confidence_interval

In [None]:
# 95% confidence intervals of the baseline AUC: test first, then training
CI_estimation(auc_test_baseline, 0.95)
CI_estimation(auc_train_baseline, 0.95)

In [8]:
#--------------------------------
# Return predicted probabilities
#--------------------------------
def predict_baseline(runs, cv_splits, folder):
    '''
    Collect the baseline model's test-set scores over all imputed splits

    Parameters
    ----------
    runs: int
         number of repetitions whose files are read
    cv_splits: int
         number of folds per repetition
    folder: str
         folder holding 'train_filled_<run>_<cv>.xlsx' / 'test_filled_<run>_<cv>.xlsx'

    Returns
    -------
    np.ndarray
        predicted scores of all test samples, concatenated in loop order
    np.ndarray
        the matching true classes
    '''
    pooled_scores = []
    pooled_truth = []

    for r in range(runs):
        for cv in range(cv_splits):
            train_table = pd.read_excel(folder + "/train_filled_{}_{}.xlsx".format(r, cv), index_col = 0)
            test_table = pd.read_excel(folder + "/test_filled_{}_{}.xlsx".format(r, cv), index_col = 0)

            # Predictors are all columns but the last; the target is the 'class' column
            x_train = train_table.loc[:, train_table.columns[:-1]].copy()
            x_test = test_table.loc[:, test_table.columns[:-1]].copy()
            y_train = train_table.loc[:, 'class'].values
            y_test = test_table.loc[:, 'class'].values

            # Min-max scaling, fitted on the training part only
            scaler = MinMaxScaler().fit(x_train)
            x_train_scaled = scaler.transform(x_train)
            x_test_scaled = scaler.transform(x_test)

            # A fresh model for every split
            model = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", C=100000)
            model.fit(x_train_scaled, y_train)

            # The second column of predict_proba is used as the score
            pooled_scores.extend(model.predict_proba(x_test_scaled)[:, 1])
            pooled_truth.extend(y_test)

    return np.array(pooled_scores), np.array(pooled_truth)

In [9]:
# Pool the baseline test predictions over the imputed 7-predictor splits
folder = "imputing_7_{}".format(folder_prefix)

baseline_predictions = predict_baseline(runs=runs, cv_splits=cv_splits, folder=folder)
y_prob_test_baseline, y_true_test_baseline = baseline_predictions

2. AUC for high-dimensional data
___________________________________

In [359]:
# -----------------------------------------
# Cross-validate a given model on given data
# -----------------------------------------
def cv_estimator(cv_splits, data_x, data_y, model):
    '''
    Shuffled stratified k-fold cross-validation of `model`

    Parameters
    ----------
    cv_splits: int
         number of folds
    data_x: pd.DataFrame
         predictors (rows are selected by position)
    data_y: np.ndarray
         class labels, in the same row order as data_x
    model: estimator with fit / predict_proba
         re-fitted on every fold (the passed object itself is fitted)

    Returns
    -------
    np.ndarray
        AUC on the training part of every fold
    np.ndarray
        AUC on the held-out part of every fold
    '''
    fold_train_auc, fold_test_auc = [], []

    folds = StratifiedKFold(n_splits=cv_splits, random_state=None, shuffle=True)

    # Only the labels matter for stratification, hence the dummy predictor array
    placeholder_x = np.zeros(data_y.shape[0])
    for train_positions, test_positions in folds.split(placeholder_x, data_y):
        x_train = data_x.iloc[train_positions].copy()
        x_test = data_x.iloc[test_positions].copy()
        y_train = data_y[train_positions]
        y_test = data_y[test_positions]

        # Min-max scaling, fitted on the training part of the fold only
        scaler = MinMaxScaler().fit(x_train)
        x_train_scaled = scaler.transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        model.fit(x_train_scaled, y_train)

        # The second column of predict_proba is used as the score
        fold_test_auc.append(roc_auc_score(y_test, model.predict_proba(x_test_scaled)[:, 1]))
        fold_train_auc.append(roc_auc_score(y_train, model.predict_proba(x_train_scaled)[:, 1]))

    return np.array(fold_train_auc), np.array(fold_test_auc)

In [35]:
# -----------------------------------------
# Estimate AUC on training, validation, and test data:
# a model and its parameters are passed
# a dataframe with results is returned
# -----------------------------------------
def auc_estimation(folder, runs, cv_splits, model, model_name, parameter):
    '''
    Evaluate one model configuration on every imputed split

    For each (run, fold) the model is first cross-validated on the training part
    (inner validation), then re-fitted on the whole training part and scored on
    the training and test parts.

    Parameters
    ----------
    folder: str
         folder holding 'train_filled_<run>_<cv>.xlsx' / 'test_filled_<run>_<cv>.xlsx'
    runs: int
         number of repetitions whose files are read
    cv_splits: int
         number of folds per repetition (also used for the inner validation)
    model: estimator with fit / predict_proba
         the passed object itself is (re-)fitted
    model_name: str
         label stored in the 'model' column
    parameter:
         value stored in the 'parameter' column

    Returns
    -------
    auc_summary: pd.DataFrame
        one row per (run, fold) with columns 'run', 'cv', 'auc_training',
        'auc_validation' (the inner-fold AUCs as a string such as '[0.7, 0.8]'),
        'auc_test', 'model', 'parameter'
    predictions_all: pd.DataFrame
        columns 'true' and 'predicted' for all test samples, in loop order
    '''
    summary_columns = ['run', 'cv', 'auc_training', 'auc_validation', 'auc_test', 'model', 'parameter']

    # Rows are collected in plain lists and turned into DataFrames once at the
    # end, instead of concatenating a DataFrame inside the loop
    summary_rows = []
    y_prob_test_all = []
    y_test_all = []

    for r in range(runs):

        for cv in range(cv_splits):

            data_train = pd.read_excel(folder + "/train_filled_{}_{}.xlsx".format(r, cv), index_col = 0)
            data_test = pd.read_excel(folder + "/test_filled_{}_{}.xlsx".format(r, cv), index_col = 0)

            # Predictors are all columns but the last; the target is the 'class' column
            data_x_train = data_train.loc[:, data_train.columns[:-1]].copy()
            y_train = data_train.loc[:, 'class'].values

            data_x_test = data_test.loc[:, data_test.columns[:-1]].copy()
            y_test = data_test.loc[:, 'class'].values

            # -----------------------------------------
            # Validation on the training data to choose the model parameters
            # -----------------------------------------
            _, auc_valid_inside = cv_estimator(cv_splits, data_x_train, y_train, model)

            #normalize (scaler fitted on the training part only)
            scaler = MinMaxScaler().fit(data_x_train)
            data_x_train = scaler.transform(data_x_train)
            data_x_test = scaler.transform(data_x_test)
            #train the model
            model.fit(data_x_train, y_train)
            #apply to test (the second column of predict_proba is used as the score)
            y_prob_test = model.predict_proba(data_x_test)[:, 1]
            auc_test = roc_auc_score(y_test, y_prob_test)
            #apply to train
            y_prob_train = model.predict_proba(data_x_train)[:, 1]
            auc_train = roc_auc_score(y_train, y_prob_train)

            summary_rows.append({
                'run': r, 'cv': cv,
                'auc_training': auc_train,
                # Converted to plain Python floats first: under NumPy 2 the string
                # form of a list of NumPy scalars reads '[np.float64(0.7), ...]',
                # which cannot be parsed back with float()
                'auc_validation': str([float(value) for value in auc_valid_inside]),
                'auc_test': auc_test,
                'model': model_name, 'parameter': parameter,
            })

            y_prob_test_all.extend(y_prob_test)
            y_test_all.extend(y_test)

    auc_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    predictions_all = pd.DataFrame({'true': y_test_all, 'predicted': y_prob_test_all},
                                   columns=['true', 'predicted'])

    return auc_summary, predictions_all

In [36]:
# Hyper-parameter grids, one per model family; every value is evaluated separately.
# Values used as `n_neighbors` of KNeighborsClassifier
knn_neighbors = [2, 4, 6, 8, 10, 12, 15, 20, 35]
# Values passed as `C` to the L1-penalised LogisticRegression
# NOTE(review): in scikit-learn `C` is the inverse of the regularization strength,
# so despite the name a larger value here means a weaker penalty - confirm this is intended
lambda_lasso = [0.025, 0.05, 0.075, 0.1, 0.15, 0.25, 0.5, 0.75, 1]
# Values used as `max_depth`, shared by DecisionTreeClassifier and RandomForestClassifier
tree_depth = [2, 3, 4, 5, 7, 9, 11, 13, 15]
# Number of units in the single hidden layer of MLPClassifier
n_neurons = [3, 5, 10, 15, 30, 50, 100, 300, 400]

In [None]:
# -----------------------------------------
# Run the evaluations for KNeighborsClassifier
# -----------------------------------------
folder = "imputing_746_{}".format(folder_prefix)

# Make sure the results directory exists before anything is written into it
results_folder = 'version_2_{}'.format(folder_prefix)
os.makedirs(results_folder, exist_ok=True)

# One summary table per parameter value; they are concatenated once after the loop
knn_summaries = []
for knn in knn_neighbors:
    model_knn = KNeighborsClassifier(n_neighbors=knn, weights='uniform')
    auc, predictions_test = auc_estimation(folder, runs, cv_splits, model_knn, 'k-Nearest Neighbors', knn)
    knn_summaries.append(auc)
    predictions_test.to_excel('{}/predictions_test_KNeighborsClassifier_{}.xlsx'.format(results_folder, knn), index = False)

auc_summary = pd.concat(knn_summaries, ignore_index=True)
auc_summary.to_excel('{}/auc_KNeighborsClassifier.xlsx'.format(results_folder), index = False)


In [370]:
# -----------------------------------------
# Run the evaluations for LogisticRegression
# -----------------------------------------
folder = "imputing_746_{}".format(folder_prefix)

# Make sure the results directory exists before anything is written into it
results_folder = 'version_2_{}'.format(folder_prefix)
os.makedirs(results_folder, exist_ok=True)

# One summary table per parameter value; they are concatenated once after the loop
lasso_summaries = []
for lambd in lambda_lasso:
    # Each grid value is passed as `C` of the L1-penalised model
    model_lasso = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", C=lambd)
    auc, predictions_test = auc_estimation(folder, runs, cv_splits, model_lasso, 'Logistic Lasso Regression', lambd)
    lasso_summaries.append(auc)
    predictions_test.to_excel('{}/predictions_test_LogisticRegression_{}.xlsx'.format(results_folder, lambd), index = False)

auc_summary = pd.concat(lasso_summaries, ignore_index=True)
auc_summary.to_excel('{}/auc_LogisticRegression.xlsx'.format(results_folder), index = False)

In [None]:
# -----------------------------------------
# Run the evaluations for DecisionTreeClassifier
# -----------------------------------------
folder = "imputing_746_{}".format(folder_prefix)

# Make sure the results directory exists before anything is written into it
results_folder = 'version_2_{}'.format(folder_prefix)
os.makedirs(results_folder, exist_ok=True)

# One summary table per parameter value; they are concatenated once after the loop
dt_summaries = []
for depth in tree_depth:
    model_dt = DecisionTreeClassifier(max_depth=depth, class_weight=None)
    auc, predictions_test = auc_estimation(folder, runs, cv_splits, model_dt, 'Decision Tree', depth)
    dt_summaries.append(auc)
    predictions_test.to_excel('{}/predictions_test_DecisionTreeClassifier_{}.xlsx'.format(results_folder, depth), index = False)

auc_summary = pd.concat(dt_summaries, ignore_index=True)
auc_summary.to_excel('{}/auc_DecisionTreeClassifier.xlsx'.format(results_folder), index = False)

In [None]:
# -----------------------------------------
# Run the evaluations for RandomForestClassifier
# -----------------------------------------
folder = "imputing_746_{}".format(folder_prefix)

# Make sure the results directory exists before anything is written into it
results_folder = 'version_2_{}'.format(folder_prefix)
os.makedirs(results_folder, exist_ok=True)

# One summary table per parameter value; they are concatenated once after the loop
rf_summaries = []
for depth in tree_depth:
    model_rf = RandomForestClassifier(n_estimators=100, max_depth=depth, bootstrap=True, oob_score=True)
    auc, predictions_test = auc_estimation(folder, runs, cv_splits, model_rf, 'Random Forest', depth)
    rf_summaries.append(auc)
    predictions_test.to_excel('{}/predictions_test_RandomForestClassifier_{}.xlsx'.format(results_folder, depth), index = False)

auc_summary = pd.concat(rf_summaries, ignore_index=True)
auc_summary.to_excel('{}/auc_RandomForestClassifier.xlsx'.format(results_folder), index = False)

In [None]:
# -----------------------------------------
# Run the evaluations for MLPClassifier
# -----------------------------------------
# Set explicitly, like in the other model cells, instead of relying on the value
# left behind by whichever cell happened to run before this one
folder = "imputing_746_{}".format(folder_prefix)

# Make sure the results directory exists before anything is written into it
results_folder = 'version_2_{}'.format(folder_prefix)
os.makedirs(results_folder, exist_ok=True)

# One summary table per parameter value; they are concatenated once after the loop
mlp_summaries = []
for neurons in n_neurons:
    # One hidden layer with `neurons` units
    model_mlp = MLPClassifier(
        hidden_layer_sizes=(neurons, ), activation='relu', solver='adam', alpha=0.001, batch_size='auto',
        learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=500, shuffle=True,
        random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
        early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10,
    )

    auc, predictions_test = auc_estimation(folder, runs, cv_splits, model_mlp, 'Multilayer Perceptron', neurons)
    mlp_summaries.append(auc)
    predictions_test.to_excel('{}/predictions_test_MLPClassifier_{}.xlsx'.format(results_folder, neurons), index = False)

auc_summary = pd.concat(mlp_summaries, ignore_index=True)
auc_summary.to_excel('{}/auc_MLPClassifier.xlsx'.format(results_folder), index = False)

Post-processing the results
-----------------------------

In [5]:
# -----------------------------------------
# Import the results for all the models
# -----------------------------------------

# Same grids as used for the evaluations (repeated here so that this section
# can be run without re-running the evaluation cells)
knn_neighbors = [2, 4, 6, 8, 10, 12, 15, 20, 35]
lambda_lasso = [0.025, 0.05, 0.075, 0.1, 0.15, 0.25, 0.5, 0.75, 1]
tree_depth = [2, 3, 4, 5, 7, 9, 11, 13, 15]
n_neurons = [3, 5, 10, 15, 30, 50, 100, 300, 400]

results_folder = 'version_2_{}'.format(folder_prefix)

auc_knn = pd.read_excel('{}/auc_KNeighborsClassifier.xlsx'.format(results_folder))
auc_lasso = pd.read_excel('{}/auc_LogisticRegression.xlsx'.format(results_folder))
auc_dt = pd.read_excel('{}/auc_DecisionTreeClassifier.xlsx'.format(results_folder))
# NOTE(review): this used to read 'auc_RandomForestClassifier2.xlsx', a file that no
# visible cell writes; the Random Forest evaluation saves 'auc_RandomForestClassifier.xlsx'.
# If the '2' file is a deliberately kept re-run, restore that name.
auc_rf = pd.read_excel('{}/auc_RandomForestClassifier.xlsx'.format(results_folder))
auc_mlp = pd.read_excel('{}/auc_MLPClassifier.xlsx'.format(results_folder))

# All models in one table, in the order kNN, lasso, tree, forest, MLP
auc_summary = pd.concat([auc_knn, auc_lasso, auc_dt, auc_rf, auc_mlp], ignore_index=True)

In [38]:
# -----------------------------------------
# Modify the results' format for drawing figures and tables
# -----------------------------------------
def prepare_for_boxplots(auc):
    """Reshape one model's AUC table to long format and summarise it per parameter.

    Parameters
    ----------
    auc : pandas.DataFrame
        Table with seven columns, in this order: run, cv, training AUC,
        validation AUCs, test AUC, model, parameter. Each validation cell is the
        text form of a list of numbers, e.g. ``'[0.71, 0.69]'``. The frame passed
        in is left untouched; a renamed copy is used internally.

    Returns
    -------
    df1 : pandas.DataFrame
        Long table with columns run, cv, model, parameter, sample, AUC, where
        sample is 'training', 'test' or 'validation' (one row per validation value).
    df2 : pandas.DataFrame
        One row per (parameter, sample) with columns mean, ci, sample, parameter,
        model. 'ci' is ``CI_estimation(values)[1] - mean``.
        NOTE(review): presumably the half-width of the interval; confirm against
        CI_estimation, which is defined elsewhere in the notebook.
    """
    # reset_index returns a new frame, so the rename below stays local and the
    # positional look-up of the first row (auc.loc[0, ...]) is always valid
    auc = auc.reset_index(drop=True)
    auc.columns = ['run', 'cv', 'training', 'validation', 'test', 'model', 'parameter']

    # Boxplots: training and test AUCs are scalar columns and can simply be melted
    df1 = pd.melt(auc, id_vars=['run', 'cv', 'model', 'parameter'], value_vars=['training', 'test'])
    df1.columns = ['run', 'cv', 'model', 'parameter', 'sample', 'AUC']

    # Validation AUCs are stored as a stringified list: expand to one row per value.
    # Rows are collected in a list and concatenated once, because DataFrame.append
    # is unavailable in current pandas and was quadratic when used in a loop.
    validation_records = []
    for row in auc.itertuples(index=False):
        for token in row.validation.strip()[1:-1].split(','):
            token = token.strip()
            if not token:
                # an empty list ('[]') contributes no rows
                continue
            validation_records.append({'run': row.run, 'cv': row.cv, 'model': row.model,
                                       'parameter': row.parameter, 'sample': 'validation',
                                       'AUC': float(token)})
    if validation_records:
        df1 = pd.concat([df1, pd.DataFrame(validation_records, columns=df1.columns)],
                        ignore_index=True)

    # Confidence intervals: one summary row per parameter value and sample
    model_name = auc.loc[0, 'model'] if len(auc) > 0 else None
    summary_records = []
    for prmtr in df1['parameter'].unique():
        for smpl in ['training', 'validation', 'test']:
            values = df1.loc[(df1['parameter'] == prmtr) & (df1['sample'] == smpl), 'AUC']
            mean_auc = values.mean()
            summary_records.append({'mean': mean_auc,
                                    'ci': CI_estimation(values)[1] - mean_auc,
                                    'sample': smpl, 'parameter': prmtr, 'model': model_name})
    df2 = pd.DataFrame(summary_records, columns=['mean', 'ci', 'sample', 'parameter', 'model'])
    return df1, df2

In [None]:
# -----------------------------------------
# Apply modifications to the results
# -----------------------------------------
# Long-format AUCs and per-parameter statistics for each model
auc_knn_melted, auc_knn_stats = prepare_for_boxplots(auc_knn)
auc_lasso_melted, auc_lasso_stats = prepare_for_boxplots(auc_lasso)
auc_dt_melted, auc_dt_stats = prepare_for_boxplots(auc_dt)
auc_rf_melted, auc_rf_stats = prepare_for_boxplots(auc_rf)
auc_mlp_melted, auc_mlp_stats = prepare_for_boxplots(auc_mlp)

# Stack the models in a fixed order: kNN, lasso, decision tree, random forest, MLP
data_for_boxplots = pd.concat(
    [auc_knn_melted, auc_lasso_melted, auc_dt_melted, auc_rf_melted, auc_mlp_melted],
    ignore_index = True)
auc_stats = pd.concat(
    [auc_knn_stats, auc_lasso_stats, auc_dt_stats, auc_rf_stats, auc_mlp_stats],
    ignore_index = True)

In [None]:
# -----------------------------------------
# Produce boxplots for AUCs
# -----------------------------------------
def draw_boxplots(data_for_boxplots):
    """Draw one row of AUC boxplots per model and save the figure as HTML.

    `data_for_boxplots` must have six columns in the order run, cv, model,
    parameter, sample, AUC. Its columns are renamed in place (the sample column
    becomes 'Sample: '). A dashed grey line at the mean of the notebook-level
    `auc_test_baseline` is added to every row, and the figure is written to
    'version_2_<folder_prefix>/AUCs.html' and opened.
    """
    # Positional rename; 'Sample: ' is the column used for colouring below
    data_for_boxplots.columns = ['run', 'cv', 'model', 'parameter', 'Sample: ', 'AUC']
    fig = px.box(data_for_boxplots, x='parameter', y='AUC', facet_row='model', color='Sample: ',
                 color_discrete_sequence=px.colors.qualitative.Set2,
                 category_orders={'Sample: ': ['training', 'validation', 'test']})

    # Every facet row gets its own axes
    fig.update_yaxes(matches=None)
    fig.update_xaxes(matches=None, type='category', showticklabels=True, title = '')

    # Keep only the text after '=' in the facet labels
    fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
    fig.update_traces(legendgroup = 'main')

    fig.update_layout(font_size = 25, autosize=True, width=1700, height=2500)

    # Baseline line per facet row: (row, x range of the line)
    baseline_mean = auc_test_baseline.mean()
    baseline_spans = [(5, [2, 35]), (4, [0.025, 1]), (3, [2, 15]), (2, [2, 15]), (1, [3, 400])]
    for facet_row, x_span in baseline_spans:
        scatter_options = dict(x=x_span, y=[baseline_mean]*2, marker=dict({'opacity': 0}), opacity = 0.8,
                               line = dict({'dash': 'dash', 'color': 'gray'}), name = 'baseline test AUC',
                               legendgroup = 'baseline', row=facet_row, col=1)
        if facet_row != 1:
            # only the line in row 1 keeps its legend entry
            scatter_options['showlegend'] = False
        fig.add_scatter(**scatter_options)

    fig.update_layout(legend=dict(tracegroupgap = 50, orientation='h', traceorder = 'grouped',
                                  bordercolor = 'gray', y=0.055, x=0.615))

    # Caption under each facet row: (row, y position, text)
    captions = [(5, 0.45, 'The number of nearest neighbors'),
                (4, 0.55, 'Regularization'),
                (3, 0.35, 'Decision tree depth'),
                (2, 0.55, 'Decision tree depth'),
                (1, 0.325, 'The number of neurons')]
    for facet_row, y_position, caption in captions:
        fig.add_annotation(row=facet_row, col=1, x=4, y=y_position, showarrow=False, text=caption)

    fig.show()

    plotly.offline.plot(fig, filename = 'version_2_{}/AUCs.html'.format(folder_prefix), auto_open=True)


draw_boxplots(data_for_boxplots)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------------------
# Produce tables with 95%CIs for AUCs
# -----------------------------------------
def get_info_for_CI_table(auc_stats, model):
    """Collect the table columns 'parameter', training, validation and test for one model.

    `auc_stats` needs the columns model, parameter, sample, mean and ci.
    Returns a 4-tuple: the unique parameter values of `model` (in order of
    appearance) and three lists of strings formatted as
    'mean (mean-ci, mean+ci)', rounded to four decimals, for the training,
    validation and test samples. Raises IndexError when a
    (parameter, sample) combination has no row.
    """
    of_model = auc_stats.loc[:, 'model'] == model
    parameter_values = auc_stats.loc[of_model, 'parameter'].unique()

    # One list of formatted cells per sample, filled in parameter order
    cells_by_sample = {'training': [], 'validation': [], 'test': []}

    for parameter_value in parameter_values:
        of_parameter = of_model & (auc_stats.loc[:, 'parameter'] == parameter_value)
        for sample_name, cells in cells_by_sample.items():
            row = of_parameter & (auc_stats.loc[:, 'sample'] == sample_name)
            centre = auc_stats.loc[row, 'mean'].values[0]
            half_width = auc_stats.loc[row, 'ci'].values[0]
            cells.append('{} ({}, {})'.format(round(centre, 4),
                                              round(centre - half_width, 4),
                                              round(centre + half_width, 4)))

    return (parameter_values, cells_by_sample['training'],
            cells_by_sample['validation'], cells_by_sample['test'])


def create_CI_tables(auc_stats):
    """Render one table of AUC means with 95% CIs per model and save them as HTML.

    The five tables are stacked vertically in one plotly figure, shown inline
    and written to 'CI AUCs eq hor.html' without opening a browser.
    """
    # (model label used both as subplot title and as filter, header of the first column)
    table_layout = [('k-Nearest Neighbors', 'The number of nearest neighbors'),
                    ('Logistic Lasso Regression', 'Regularization'),
                    ('Decision Tree', 'Decision tree depth'),
                    ('Random Forest', 'Decision tree depth'),
                    ('Multilayer Perceptron', 'The number of neurons')]

    fig = make_subplots(
        rows=len(table_layout), cols=1, vertical_spacing=0.05,
        specs=[[{"type": "table"}] for _ in table_layout],
        subplot_titles=[model_label for model_label, _ in table_layout])

    for table_row, (model_label, parameter_header) in enumerate(table_layout, start=1):
        col1, col2, col3, col4 = get_info_for_CI_table(auc_stats, model_label)
        header_values = [parameter_header, 'Training data: AUC mean (95%CI)',
                         'Validation data: AUC mean (95%CI)', 'Test data: AUC mean (95%CI)']
        fig.add_trace(go.Table(header=dict(values=header_values),
                               cells=dict(values=[col1, col2, col3, col4])),
                      row = table_row, col = 1)

    fig.update_layout(autosize=True, height=1600)
    fig.show()

    plotly.offline.plot(fig, filename = 'CI AUCs eq hor.html', auto_open=False)


create_CI_tables(auc_stats)

In [14]:
# --------------------------
# Save CI-tables in excel
# --------------------------
def create_CI_tables_excel(auc_stats, model_name, parameter):
    """Write the 95% CI table of one model to an Excel file.

    Parameters
    ----------
    auc_stats : pandas.DataFrame
        Summary table accepted by get_info_for_CI_table.
    model_name : str
        Model label to select in `auc_stats`; also used in the file name.
    parameter : str
        Header for the hyperparameter column.

    The file is saved as 'version_2_<folder_prefix>/<model_name>_95CI.xlsx'.
    The folder follows the notebook-level `folder_prefix`, like the other result
    files, instead of always being 'version_2_ncvd'; with scenario 0 the tables
    used to be written into the other scenario's folder.
    """
    col1, col2, col3, col4 = get_info_for_CI_table(auc_stats, model_name)

    df = pd.DataFrame({parameter: col1,
                       'Training data: AUC mean (95%CI)': col2,
                       'Validation data: AUC mean (95%CI)': col3,
                       'Test data: AUC mean (95%CI)': col4})

    df.to_excel("version_2_{}/{}_95CI.xlsx".format(folder_prefix, model_name))


# Model labels and, at the same position, the header of their parameter column
model_names = ['k-Nearest Neighbors', 'Logistic Lasso Regression', 'Decision Tree',
               'Random Forest', 'Multilayer Perceptron']

parameters = ['The number of nearest neighbors', 'Regularization', 'Decision tree depth',
              'Decision tree depth', 'The number of neurons']

# One Excel file per model
for model_name, parameter_header in zip(model_names, parameters):
    create_CI_tables_excel(auc_stats, model_name, parameter_header)

In [None]:
# -------------------------
# Draw calibration curves
# -------------------------

plt.subplots(figsize = (10,5))

# Baseline curve
# `normalize` is no longer passed to calibration_curve: False was its default, and
# the argument was deprecated and later removed from scikit-learn, so passing it
# fails on current versions.
prob_true_baseline, prob_pred_baseline = calibration_curve(y_true_test_baseline, y_prob_test_baseline, n_bins=10)

# Diagonal = perfect calibration
plt.plot([0, 1], [0, 1], color = 'black')
plt.plot(prob_pred_baseline, prob_true_baseline, marker = 'o', color = 'orange', label = 'Baseline')

# Calibration curves for the selected models
models = ['k-Nearest Neighbors', 'Logistic Lasso Regression', 'Decision Tree',
            'Random Forest', 'Multilayer Perceptron']

# Model label -> name used in the prediction file names
model_file = {'k-Nearest Neighbors': 'KNeighborsClassifier', 'Logistic Lasso Regression': 'LogisticRegression',
             'Decision Tree': 'DecisionTreeClassifier', 'Random Forest': 'RandomForestClassifier2',
             'Multilayer Perceptron': 'MLPClassifier'}

model_color = {'k-Nearest Neighbors': 'red', 'Logistic Lasso Regression': 'blue', 'Decision Tree': 'green',
            'Random Forest': 'purple', 'Multilayer Perceptron': 'brown'}

# Hyperparameter values to plot for each model, depending on the scenario
if scenario == 0:
    parameters = {'k-Nearest Neighbors': [35], 'Logistic Lasso Regression': [0.25], 'Decision Tree': [3],
                'Random Forest': [5], 'Multilayer Perceptron': [100]}
else:
    parameters = {'k-Nearest Neighbors': [35], 'Logistic Lasso Regression': [0.15, 0.25], 'Decision Tree': [3],
                'Random Forest': [3,4,5,7], 'Multilayer Perceptron': [400]}

# The i-th parameter of a model is drawn with the i-th line style (at most four per model)
line_types = ['solid', 'dotted', 'dashed', 'dashdot']

for model in models:
    for i, param in enumerate(parameters[model]):
        predictions = pd.read_excel('version_2_{}/predictions_test_{}_{}.xlsx'.format(folder_prefix, model_file[model], param))
        prob_true, prob_pred = calibration_curve(predictions.loc[:, 'true'].values, predictions.loc[:, 'predicted'].values,
                                                 n_bins=10)

        # Only the first three curves of a model get point markers
        marker_options = {'marker': 'o'} if i < 3 else {}
        plt.plot(prob_pred, prob_true, linestyle=line_types[i], color = model_color[model],
                 label = '{}({})'.format(model, param), **marker_options)

plt.legend()

plt.xlabel('Predicted risk')
plt.ylabel('Observed risk')

plt.savefig('calibration plot {}.png'.format(folder_prefix), dpi = 800)

ANOVA
---------

In [None]:
from scipy.stats import f_oneway

knn_neighbors = [2, 4, 6, 8, 10, 12, 15, 20, 35]
lambda_lasso = [0.025, 0.05, 0.075, 0.1, 0.15, 0.25, 0.5, 0.75, 1]
tree_depth = [2, 3, 4, 5, 7, 9, 11, 13, 15]
n_neurons = [3, 5, 10, 15, 30, 50, 100, 300, 400]


def select_auc(melted, sample, parameter):
    """Return the AUC values of one sample ('training'/'validation'/'test') and one parameter value."""
    rows = (melted.loc[:, 'sample'] == sample) & (melted.loc[:, 'parameter'] == parameter)
    return melted.loc[rows, 'AUC'].values


# Select samples to compare:
dataset = 'validation' # or dataset = 'test'

if scenario == 0:
    # Scenario 0 always compares on the validation AUCs: two lasso settings
    dataset = 'validation'
    compared = [(auc_lasso_melted, lambda_lasso[4]),
                (auc_lasso_melted, lambda_lasso[5])]
else:
    # Scenario 1: two lasso settings and four random forest depths
    compared = [(auc_lasso_melted, lambda_lasso[4]),
                (auc_lasso_melted, lambda_lasso[5]),
                (auc_rf_melted, tree_depth[1]),
                (auc_rf_melted, tree_depth[2]),
                (auc_rf_melted, tree_depth[3]),
                (auc_rf_melted, tree_depth[4])]

print(*[parameter for _, parameter in compared])

# The test result used to be an expression inside the if/else branches, where
# Jupyter neither displays nor keeps it. It is now stored and shown as the
# last expression of the cell.
anova_result = f_oneway(*[select_auc(melted, dataset, parameter) for melted, parameter in compared])
anova_result

Additional analysis
___________________

Missing values examination
-----------------

In [55]:
# -----------------------------------------
# Keep the original data in 'data_kihd' and use its copy 'data_kihd_preprocessed' instead
# -----------------------------------------
data_kihd_preprocessed = data_kihd.copy()

# Correct the zero-level for the following variables:
wrong_zero = ['v0563', 'v0565', 'v0567', 'v0569', 'v0571', 'v0573', 'v0575', 'v0577', 'v0579', 'v0607', 'v0609', 'v0613',
              'v0621', 'v0623', 'v0625', 'v0627', 'v0629', 'v0631', 'v0633', 'v0635', 'v0637', 'v0639', 'v0643', 'v0645',
              'v0647', 'v0649', 'v0651', 'v0653']
# Zeros are at the wrong end of the scale
# Change zeros to (max+1); the maximum is taken once per column, before any replacement
for col in wrong_zero:
    column = data_kihd_preprocessed[col]
    data_kihd_preprocessed[col] = column.mask(column == 0, column.max() + 1)

print(data_kihd_preprocessed.shape)
# -----------------------------------------
# Turn the categorical variables into dummies
# -----------------------------------------
categorical_variables=['au0136','au0153','ek0115','ek0119','ka0118','mi0205','mi0207','mi0208','mi0209',
                       'mi0210','mi0211','mi0212','mi0213','mi0214','v0145','v0146','v0157','v0158','v0161',
                       'v0172','v0247','v0248','v0665','v0721','v0724','u1307']

for col in categorical_variables:
    if col in data_kihd_preprocessed.columns:
        # No dummy column is created for missing values (dummy_na=False)
        new_dummies = pd.get_dummies(data_kihd_preprocessed[col], dummy_na=False)
        # Dummy columns are named '<variable>_<level>'
        new_dummies.columns = [col + "_" + str(level) for level in new_dummies.columns.values]
        data_kihd_preprocessed = data_kihd_preprocessed.drop(col, axis=1).join(new_dummies)

# Identifier, visit and outcome columns; listed once and used for both splits below
outcome_columns = ['tutknro', 'tpvm2', 'tpnr2', 'chdb16', 'chdb16d', 'amif16', 'amif16d', 'amig16', 'amig16d',
                   'amim16', 'amim16d', 'all16', 'all16d', 'cvd16', 'cvd16d', 'syd14', 'alzm14', 'vp14', 'kol14', 'diab14',
                   'ncvd16', 'ncvd16d', 'cv15', 'cv15d', 'all15', 'stro15', 'cvd15', 'chd15', 'amib15', 'isth15', 'hsth15',
                   'db15', 'can15', 'canc15', 'ast15', 'copd15', 'dema15', 'finchd18', 'finchd18d']

# Outcomes
kihd_outcomes = data_kihd_preprocessed.loc[:, outcome_columns]

# Predictors
kihd_predictors = data_kihd_preprocessed.drop(outcome_columns, axis = 1)

# Separate genes and phenotypes: the gene columns form one contiguous run of columns
predictor_names = list(kihd_predictors.columns.values)
genes_start = predictor_names.index('FEDER2HH'.lower())
genes_end = predictor_names.index('CATAETT'.lower())

# Genes only
kihd_genes = kihd_predictors.iloc[:, genes_start:(genes_end+1)].copy()
print("KIHD with genes only: {}".format(kihd_genes.shape))

# Phenotypes
kihd_phenotypes = kihd_predictors.drop(kihd_predictors.columns.values[genes_start:genes_end+1], axis = 1)
print("KIHD with phenotypes only: {}".format(kihd_phenotypes.shape))

(2682, 994)
KIHD with genes only: (2682, 96)
KIHD with phenotypes only: (2682, 977)


In [56]:
# Seven inputs examined for missingness: source column -> readable name
examined_inputs = {'v0137': 'age', 'tup': 'smoking', 'mvp0224': 'sbp', 'bi0160': 'tc',
                   'bi0171': 'hdl', 'diab': 'diabetes', 'cvdfam': 'fam_hist'}

kihd_7_inputs_na = kihd_phenotypes.loc[:, list(examined_inputs.keys())].copy()
kihd_7_inputs_na.columns = list(examined_inputs.values())

# The raw values are exported (read again by the R cell below) before being turned into flags
kihd_7_inputs_na.to_excel("kihd_7_inputs_na.xlsx", index = False)
kihd_7_inputs_na = kihd_7_inputs_na.isna()

# Append the two outcome indicators next to the missingness flags
for flag_name, outcome_name in (('cvd', 'cvd16'), ('ncvd', 'ncvd16')):
    kihd_7_inputs_na[flag_name] = kihd_outcomes.loc[:, outcome_name].copy()

In [57]:
# Distinct rows of the 0/1 flag table and how often each of them occurs
missing_pattern = np.unique(np.array(kihd_7_inputs_na.values.astype('int')), axis = 0, return_counts = True)
distinct_patterns, pattern_counts = missing_pattern

missing_pattern_df = pd.DataFrame(distinct_patterns,
                                  columns = ['age', 'smoking', 'sbp', 'tc', 'hdl', 'diabetes', 'fam_hist', 'cvd', 'ncvd'])

# 'counts' holds the share of each pattern in percent, rounded to two decimals
pattern_share = pattern_counts/kihd_7_inputs_na.shape[0] * 100
missing_pattern_df['counts'] = pattern_share
missing_pattern_df['counts'] = missing_pattern_df['counts'].apply(lambda share: round(share, 2))

sort_keys = ['cvd', 'ncvd', 'age', 'smoking', 'tc', 'hdl', 'diabetes', 'fam_hist', 'hdl', 'tc', 'sbp']
missing_pattern_df = missing_pattern_df.sort_values(sort_keys).reset_index(drop=True)

# Manual reordering of the sorted rows; the permutation is written for exactly 12 patterns
missing_pattern_df.index = [0,1,3,2,4,5,7,6,8,9,10,11]
missing_pattern_df = missing_pattern_df.sort_index()

In [None]:
# Visualize patterns
fig, ax = plt.subplots(nrows = 1, ncols = 3, figsize = (30,10))


def draw_pattern_panel(rows, axis, title):
    """Draw the heatmap of the selected pattern rows on `axis` and return the axes.

    The last three columns of missing_pattern_df (cvd, ncvd, counts) are not
    drawn; the 'counts' values label the rows instead.
    """
    panel = sn.heatmap(missing_pattern_df.loc[rows, missing_pattern_df.columns[:-3]],
                       linewidths=.25, cmap="Accent",
                       xticklabels=['age', 'smoking', 'sbp', 'tc', 'hdl', 'diabetes', 'fam_hist'],
                       yticklabels=missing_pattern_df.loc[rows, 'counts'],
                       square=True, cbar=False,
                       ax=axis)
    panel.set_xticklabels(panel.get_xmajorticklabels(), fontsize = 20, rotation=45)
    panel.set_yticklabels(panel.get_ymajorticklabels(), fontsize = 20, rotation=0)
    panel.set_title(title, fontsize = 20)
    return panel


is_cvd = missing_pattern_df.loc[:, 'cvd'] == 1
is_ncvd = missing_pattern_df.loc[:, 'ncvd'] == 1
is_neither = (missing_pattern_df.loc[:, 'cvd'] == 0) & (missing_pattern_df.loc[:, 'ncvd'] == 0)

# cvd
f1 = draw_pattern_panel(is_cvd, ax[0], "Cardiovascular death by the end of 2016")
f1.set_ylabel("Percentage in the cohort, %", fontsize = 20)

# neither cvd nor ncvd
f2 = draw_pattern_panel(is_neither, ax[1], "Missing values")

# ncvd
f3 = draw_pattern_panel(is_ncvd, ax[2], "Non-missing values")

In [59]:
# Load the rpy2 IPython extension into the notebook to use R with a magic command: %%R
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [51]:
%%R
# :::::::::
# R kernel
# :::::::::

# Install the R packages used below only when they are not available yet,
# so that re-running the notebook does not reinstall them every time.
for (pkg in c("finalfit", "readxl")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

In [None]:
%%R -w 600 -h 650
# :::::::::
# R kernel
# :::::::::

# Missing data patterns

library("finalfit")
library("readxl")

# Key variables exported by the Python cell above
kihd_7_inputs_na <- read_excel("kihd_7_inputs_na.xlsx")

# Categorical variables are converted to factors
categorical_inputs <- c("smoking", "diabetes", "fam_hist")
kihd_7_inputs_na[categorical_inputs] <- lapply(kihd_7_inputs_na[categorical_inputs], as.factor)

# Plot variable distributions conditional on presence and missingness of other variables
missing_pairs(kihd_7_inputs_na, position = "fill", showYAxisPlotLabels = TRUE)


Important variables
------------------

In [None]:
# The placeholder needs a value: calling .format() with no argument raises IndexError.
# NOTE(review): the scenario suffix `folder_prefix` is assumed to be the intended
# value, as in the other result paths of this notebook - confirm the folder name.
folder_name = "imputing_746_{}".format(folder_prefix)

In [None]:
# First imputed training and test sets; the first column of each file is the index
train_data, test_data = (
    pd.read_excel(folder_name + file_name, index_col = 0)
    for file_name in ("/train_filled_0_0.xlsx", "/test_filled_0_0.xlsx")
)

# Both parts are pooled
data = pd.concat([train_data, test_data], ignore_index = True)

# Every column except the last one is an input; the label is read from 'class'
input_columns = data.columns[:-1]
data_x = data.loc[:, input_columns]
data_y = data.loc[:, 'class'].values

In [None]:
# Decision Tree
import collections
from sklearn import tree
import pydotplus

# Readable names for the variables that are shown in the tree
data_x = data_x.rename(columns={"v0137": "AGE",
                      "li02met": "ACTIVITY",
                      "packyear": "PACKYEAR",
                      "cigyears": "CIGYEAR",
                      "v1151": "NITROMED",
                      "v0750": "SMOKING",
                      "crpk": "CRP",
                      "bi0222": "VITAMINC",
                      "homa1ir": "HOMAIR1",
                      "v0868": "AMIDG",
                      "mvp0134": "SYSTOLICB",
                      "v0860": "HEALTHSTATE",
                      "apob": "APOB",
                      "nicmgd": "NICOTIN",
                      "exihd": "ISCHAEMIA"})


dt = tree.DecisionTreeClassifier(max_depth=3,random_state=None)
dt.fit(data_x, data_y)

dot_data = tree.export_graphviz(dt,
                    feature_names=data_x.columns.values,
                    out_file=None,
                    class_names = ["no cv death", "cv death"],
                    filled=True,
                    rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data)

# Fill colour per predicted class. The tuple that was assigned to `colors` and
# immediately overwritten, the unused `edges` mapping and the debugging print of
# every node label have been removed.
colors = {'no cv death': 'lightblue', 'cv death': 'orange'}

for node in graph.get_nodes():
    label = node.get_label()
    if label is None:
        # nodes without a label are left untouched
        continue
    # Labels ending with the 'no cv death' class name get that colour,
    # every other labelled node gets the 'cv death' colour
    if label.endswith('no cv death"'):
        node.set_fillcolor(colors['no cv death'])
    else:
        node.set_fillcolor(colors['cv death'])

graph.write_png('{}_tree3.png'.format(folder_name))

In [None]:
# Min-max scaling of the inputs; the fitted scaler is kept in `scaler`
scaler = MinMaxScaler()
data_x_scaled = scaler.fit_transform(data_x)

In [None]:
# Logistic Lasso
runs = 50
# NOTE(review): scikit-learn documents `C` as the inverse of the regularization
# strength - confirm that `lambd` is meant in that sense.
lambd = 0.25

# Coefficients averaged over `runs` fits on the scaled data
weights = np.zeros(data_x_scaled.shape[1])

for r in range(runs):
    model_lasso = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", C=lambd)
    model_lasso.fit(data_x_scaled, data_y)
    weights = weights + model_lasso.coef_[0]/runs

# One row per input variable, built in one step: DataFrame.append is not
# available in current pandas and was called once per feature.
result_importance = pd.DataFrame({'Feature': data_x.columns.values, 'Importance': weights})


In [None]:
# Rank the variables by the magnitude of their averaged coefficient and save the table
importance_magnitude = result_importance.loc[:, 'Importance'].map(abs)
result_importance = (
    result_importance
    .assign(**{"abs imp": importance_magnitude})
    .sort_values(['abs imp'], ascending=False)
)
result_importance.to_excel("{}_lasso_vars_importance.xlsx".format(folder_name))

In [None]:
# Random Forest
runs = 50
# Kept under its own name so that the `tree_depth` grid (a list) defined in
# other cells is not overwritten by a single number.
rf_max_depth = 5

# Feature importances averaged over `runs` forests fitted on the scaled data
weights = np.zeros(data_x_scaled.shape[1])

for r in range(runs):
    model_rf = RandomForestClassifier(n_estimators=100, max_depth=rf_max_depth, bootstrap=True, oob_score=True)
    model_rf.fit(data_x_scaled, data_y)

    weights = weights + model_rf.feature_importances_/runs

# One row per input variable, built in one step: DataFrame.append is not
# available in current pandas and was called once per feature.
result_importance = pd.DataFrame({'Feature': data_x.columns.values, 'Importance': weights})


In [None]:
# Most important variables first, then save the ranking
result_importance = result_importance.sort_values(by=['Importance'], ascending=False)
forest_importance_file = "{}_forest_vars_importance.xlsx".format(folder_name)
result_importance.to_excel(forest_importance_file)