In [5]:
import numpy as np
import pandas as pd
import random as rd

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score, recall_score

from imblearn.under_sampling import RandomUnderSampler

from sklearn.impute import KNNImputer

In [6]:
#algorithms
algorithms = {
    "SVM": (SVC(probability= True), {"C": [1, 10], "kernel": ("linear", "rbf"), "gamma": ('scale', 'auto')}),
    "RF" : (RandomForestClassifier(random_state=0), {"n_estimators": [100,200], "max_depth": [4,6],"max_features":("auto", "sqrt")}),
    "GB" : (GradientBoostingClassifier(random_state=0), {"n_estimators": [100,200],  'learning_rate': [0.05, 0.1], "max_depth": [4,6,10]})
}

In [7]:
#datasets
hosp1 = pd.read_csv("hosp1.csv")
hosp2 = pd.read_csv("hosp2.csv")
hosp3 = pd.read_csv("hosp3.csv")
hosp4 = pd.read_csv("hosp4.csv")
hosp5 = pd.read_csv("hosp5.csv")

data = {
    "hosp1": ((hosp1.drop(columns=['Severity'])), hosp1.Severity), 
    "hosp2": ((hosp2.drop(columns=['Severity'])), hosp2.Severity), 
    "hosp3": ((hosp3.drop(columns=['Severity'])), hosp3.Severity), 
    "hosp4": ((hosp4.drop(columns=['Severity'])), hosp4.Severity), 
    "hosp5": ((hosp5.drop(columns=['Severity'])), hosp5.Severity), 
}

In [8]:
#3 folds to choose the best hyperparameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17) 
#choose of the best hyperparameters through balanced accuracy
perf = balanced_accuracy_score
#10-fold cross validation 
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17) 

#for each dataset
for name, (X,y) in data.items(): 

    #store the recall of each algorithm 
    score = {}
    for algorithm in algorithms.keys():
        score[algorithm] = []

    #store the auc for each algorithm
    auc_score = {}
    for algorithm in algorithms.keys():
        auc_score[algorithm] = []
    
    #for each algorithm and its respective search space
    for algorithm, (clf, parameters) in algorithms.items():

        for train, test in kf.split(X, y):
            
            #split train and test 
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]
            
            #check if the majoritary class is 1.5 times larger than the other
            y_ = pd.DataFrame.from_dict(y)
            if (((y_[y_.Severity == 1].shape[0])*1.5) < (y_[y_.Severity == 0].shape[0])):
                
                #when the conditions is true, make classes balanced through undersampling of majority class
                #repeat 10 times to avoid sample bias
                for j in range(0,10):
                    
                    #vectors to store y_pred e y_true
                    y_pred = [] 
                    y_true = [] 

                    #undersampling of the majority class
                    under = RandomUnderSampler(sampling_strategy='majority', random_state = 0)
                    X_train, y_train = under.fit_resample(X_train, y_train)

                    #impute missing values with the mean of the 3 nearest neighbors
                    imputer = KNNImputer(n_neighbors=3, weights = 'distance')
                    imputer.fit(X_train)
                    X_train = imputer.transform(X_train)
                    X_test = imputer.transform(X_test)        

                    #standardize the features
                    prep = StandardScaler()
                    prep.fit(X_train)

                    #search for the best hyperparameters
                    best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
                    best.fit(prep.transform(X_train), y_train)

                    #store the results
                    y_pred = [*y_pred, *(best.predict(prep.transform(X_test)))] 
                    y_true =  [*y_true, *y_test] 

                    #calculate the recall
                    score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                    #calculate the area under roc curve
                    aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(X_test)))[:, 1])
                    auc_score[algorithm].append(aucscore)
           
            #if classes are not umbalanced
            else:
                #vectors to store y_pred e y_true
                y_pred = [] 
                y_true = [] 

                #impute missing values with the mean of the 3 nearest neighbors
                imputer = KNNImputer(n_neighbors=3)
                imputer.fit(X_train)
                X_train = imputer.transform(X_train)
                X_test = imputer.transform(X_test)        

                #standardize the features
                prep = StandardScaler()
                prep.fit(X_train)

                #search for the best hyperparameters
                best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
                best.fit(prep.transform(X_train), y_train)

                #store the results
                y_pred = [*y_pred, *(best.predict(prep.transform(X_test)))] 
                y_true =  [*y_true, *y_test] 

                #calculate the recall
                score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                #calculate the area under roc curve
                aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(X_test)))[:, 1])
                auc_score[algorithm].append(aucscore)
    
    #write a csv with auc values
    auc_score = pd.DataFrame.from_dict(auc_score)  
    auc_score.to_csv(name + '_auc.csv')
    
    #write a csv with the recall of class '0' - specificity 
    #and another csv with the recall of class '1' - sensitivity
    recall_svm = pd.DataFrame(np.vstack(score['SVM']))
    recall_gb = pd.DataFrame(np.vstack(score['GB']))
    recall_rf = pd.DataFrame(np.vstack(score['RF']))

    esp = pd.concat([recall_svm[[0]], recall_rf[[0]], recall_gb[[0]]], axis=1)
    sen = pd.concat([recall_svm[[1]], recall_rf[[1]], recall_gb[[1]]], axis=1)

    esp.columns = ['SVM', 'RF', 'GB']
    sen.columns = ['SVM', 'RF', 'GB']

    esp.to_csv(name + '_spe.csv')
    sen.to_csv(name + '_sen.csv')  

In [9]:
#hosp1
auc = pd.read_csv('hosp1_auc.csv')

# recall
sen = pd.read_csv('hosp1_sen.csv')
spe = pd.read_csv('hosp1_spe.csv')

column_names = ["svm", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['grave'] = [sen['SVM'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['n_grave'] = [spe['SVM'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm,gb,rf
auc,0.87,0.879,0.877
grave,0.817,0.821,0.817
n_grave,0.777,0.787,0.805


In [10]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['grave'] = [sen['SVM'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['n_grave'] = [spe['SVM'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm,gb,rf
auc,0.02,0.023,0.023
grave,0.04,0.053,0.039
n_grave,0.022,0.015,0.016


In [11]:
#hosp2
auc = pd.read_csv('hosp2_auc.csv')

# recall
sen = pd.read_csv('hosp2_sen.csv')
spe = pd.read_csv('hosp2_spe.csv')

column_names = ["svm", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['grave'] = [sen['SVM'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['n_grave'] = [spe['SVM'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm,gb,rf
auc,0.841,0.835,0.854
grave,0.73,0.743,0.755
n_grave,0.804,0.772,0.799


In [12]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['grave'] = [sen['SVM'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['n_grave'] = [spe['SVM'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm,gb,rf
auc,0.039,0.031,0.03
grave,0.064,0.057,0.063
n_grave,0.035,0.03,0.032


In [13]:
#hosp3
auc = pd.read_csv('hosp3_auc.csv')

# recall
sen = pd.read_csv('hosp3_sen.csv')
spe = pd.read_csv('hosp3_spe.csv')

column_names = ["svm", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['grave'] = [sen['SVM'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['n_grave'] = [spe['SVM'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm,gb,rf
auc,0.709,0.701,0.721
grave,0.564,0.564,0.477
n_grave,0.754,0.73,0.818


In [14]:
#hosp3

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['grave'] = [sen['SVM'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['n_grave'] = [spe['SVM'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm,gb,rf
auc,0.045,0.061,0.041
grave,0.1,0.083,0.121
n_grave,0.072,0.117,0.083


In [15]:
#hosp4
auc = pd.read_csv('hosp4_auc.csv')

# recall
sen = pd.read_csv('hosp4_sen.csv')
spe = pd.read_csv('hosp4_spe.csv')

column_names = ["svm", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['grave'] = [sen['SVM'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['n_grave'] = [spe['SVM'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df



Unnamed: 0,svm,gb,rf
auc,0.777,0.77,0.788
grave,0.675,0.715,0.724
n_grave,0.743,0.718,0.722


In [16]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['grave'] = [sen['SVM'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['n_grave'] = [spe['SVM'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm,gb,rf
auc,0.038,0.079,0.054
grave,0.148,0.183,0.144
n_grave,0.075,0.089,0.065


In [17]:
#hosp5
auc = pd.read_csv('hosp5_auc.csv')

# recall
sen = pd.read_csv('hosp5_sen.csv')
spe = pd.read_csv('hosp5_spe.csv')

column_names = ["svm", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['grave'] = [sen['SVM'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['n_grave'] = [spe['SVM'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm,gb,rf
auc,0.767,0.727,0.718
grave,0.567,0.5,0.5
n_grave,0.815,0.77,0.87


In [18]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['grave'] = [sen['SVM'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['n_grave'] = [spe['SVM'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df

Unnamed: 0,svm,gb,rf
auc,0.266,0.252,0.294
grave,0.316,0.393,0.236
n_grave,0.28,0.263,0.283
