In [1]:
#load libraries

import random as rd

import numpy as np
import pandas as pd
import shap

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score, recall_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#algorithms: name -> (estimator, grid of hyper-parameters explored by the grid search)
algorithms = dict(
    GB=(
        GradientBoostingClassifier(random_state=0),
        dict(
            n_estimators=[100, 200, 500],
            learning_rate=[0.05, 0.1],
            max_depth=[4, 6, 10],
        ),
    )
)
   

In [6]:
#hosp1 -- 10-fold cross-validation of every entry in `algorithms` on hosp1.csv;
#the AUC and recall of every run are written to hosp1_auc_gb.csv / hosp1_recall_gb.csv
data = pd.read_csv("hosp1.csv")

#isolate the target variable
X = data.drop(columns=["Severity"])
y = data.Severity

#3 folds to select the best hyper-parameters by cross-validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)

#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score

#10 folds for the outer cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store the recall values of each run
score = {algorithm: [] for algorithm in algorithms}

#will store the auc values of each run
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours; the
        #imputer is fitted on the training fold only, and once per fold because
        #it does not depend on the undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #draw a fresh balanced sample from the FULL training fold: X_train_imp
            #and y_train are never overwritten, so each repetition can pick a
            #different subset of the majority class (seeded by j for reproducibility)
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): this is cumulative over repetitions, whereas the AUC
            #below is per repetition -- confirm this asymmetry is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp1_auc_gb.csv')

#write a csv with recall values (the variable name is historical; it holds GB recalls)
recall_svm = pd.DataFrame(np.vstack(score['GB']))
recall_svm.to_csv('hosp1_recall_gb.csv')

sirio
testando
testando
testando
testando
testando
testando
testando
testando
testando
testando


In [7]:
#hosp2 -- 10-fold cross-validation of every entry in `algorithms` on hosp2.csv;
#the AUC and recall of every run are written to hosp2_auc_gb.csv / hosp2_recall_gb.csv
data = pd.read_csv("hosp2.csv")

#isolate the target variable
X = data.drop(columns=["Severity"])
y = data.Severity

#3 folds to select the best hyper-parameters by cross-validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)

#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score

#10 folds for the outer cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and hoisted out of the repetition loop because it
        #does not depend on the undersampling
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp2_auc_gb.csv')

#write a csv with recall values (the variable name is historical; it holds GB recalls)
recall_svm = pd.DataFrame(np.vstack(score['GB']))
recall_svm.to_csv('hosp2_recall_gb.csv')

bp
testando
testando
testando
testando
testando
testando
testando
testando
testando
testando


In [None]:
#hosp3 -- 10-fold cross-validation of every entry in `algorithms` on hosp3.csv
#(no undersampling in this cell); the per-fold AUC and recall are written to
#hosp3_auc_gb.csv / hosp3_recall_gb.csv

data = pd.read_csv("hosp3.csv")
X = data.drop(columns=["Severity"])
y = data.Severity

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours
        #(imputer fitted on the training fold only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #standardise the variables
        prep = StandardScaler()
        prep.fit(X_train)
        X_test_std = prep.transform(X_test)

        #perform a grid search for the best hyper-parameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
        best.fit(prep.transform(X_train), y_train)

        #store the y_predicted and y_true of this fold
        y_pred = list(best.predict(X_test_std))
        y_true = list(y_test)

        #calculate the per-class recall of this fold
        score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
        auc_score[algorithm].append(aucscore)


#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp3_auc_gb.csv')

#write a csv with recall values (the variable name is historical; it holds GB recalls)
recall_svm = pd.DataFrame(np.vstack(score['GB']))
recall_svm.to_csv('hosp3_recall_gb.csv')

In [None]:
#hosp4 -- 10-fold cross-validation of every entry in `algorithms` on hosp4.csv;
#the AUC and recall of every run are written to hosp4_auc_gb.csv / hosp4_recall_gb.csv

data = pd.read_csv("hosp4.csv")
X = data.drop(columns=["Severity"])
y = data.Severity

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and once per fold, since it does not depend on the
        #undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp4_auc_gb.csv')

#write a csv with recall values (the variable name is historical; it holds GB recalls)
recall_svm = pd.DataFrame(np.vstack(score['GB']))
recall_svm.to_csv('hosp4_recall_gb.csv')

In [None]:
#hosp5 -- 10-fold cross-validation of every entry in `algorithms` on hosp5.csv with a
#single undersampling per fold; the per-fold AUC and recall are written to
#hosp5_auc_gb.csv / hosp5_recall_gb.csv

data = pd.read_csv("hosp5.csv")
X = data.drop(columns=["Severity"])
y = data.Severity

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours
        #(imputer fitted on the training fold only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #balance the classes by randomly undersampling the majority class
        #(seeded so the run is reproducible)
        #NOTE(review): the random-forest hosp5 cell does not undersample -- confirm
        #which of the two is intended
        under = RandomUnderSampler(sampling_strategy='majority', random_state=0)
        X_train, y_train = under.fit_resample(X_train, y_train)

        #standardise the variables
        prep = StandardScaler()
        prep.fit(X_train)
        X_test_std = prep.transform(X_test)

        #perform a grid search for the best hyper-parameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
        best.fit(prep.transform(X_train), y_train)

        #store the y_predicted and y_true of this fold
        y_pred = list(best.predict(X_test_std))
        y_true = list(y_test)

        #calculate the per-class recall of this fold
        score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
        auc_score[algorithm].append(aucscore)


#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp5_auc_gb.csv')

#write a csv with recall values (the variable name is historical; it holds GB recalls)
recall_svm = pd.DataFrame(np.vstack(score['GB']))
recall_svm.to_csv('hosp5_recall_gb.csv')

In [4]:
#algorithms: name -> (estimator, grid of hyper-parameters explored by the grid search)
#"auto" is left out of max_features: for classifiers it was only an alias of "sqrt"
#(so it duplicated grid points) and scikit-learn 1.3 removed it
algorithms = {"RF" : (RandomForestClassifier(random_state=0),
                      {"n_estimators": [100,200,500], "max_depth": [4,6,10],
                       "max_features": ("sqrt", "log2")})}
   

In [12]:
#hosp1 -- 10-fold cross-validation of every entry in `algorithms` on hosp1.csv;
#the AUC and recall of every run are written to hosp1_auc_rf.csv / hosp1_recall_rf.csv

data = pd.read_csv("hosp1.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp1')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and once per fold, since it does not depend on the
        #undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp1_auc_rf.csv')

#write a csv with recall values (the variable name is historical; it holds RF recalls)
recall_svm = pd.DataFrame(np.vstack(score['RF']))
recall_svm.to_csv('hosp1_recall_rf.csv')

sirio
testando
testando
testando
testando
testando
testando
testando


KeyboardInterrupt: 

In [None]:
#hosp2 -- 10-fold cross-validation of every entry in `algorithms` on hosp2.csv;
#the AUC and recall of every run are written to hosp2_auc_rf.csv / hosp2_recall_rf.csv

data = pd.read_csv("hosp2.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp2')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and once per fold, since it does not depend on the
        #undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp2_auc_rf.csv')

#write a csv with recall values (the variable name is historical; it holds RF recalls)
recall_svm = pd.DataFrame(np.vstack(score['RF']))
recall_svm.to_csv('hosp2_recall_rf.csv')

In [None]:
#hosp3 -- 10-fold cross-validation of every entry in `algorithms` on hosp3.csv
#(no undersampling in this cell); the per-fold AUC and recall are written to
#hosp3_auc_rf.csv / hosp3_recall_rf.csv

data = pd.read_csv("hosp3.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp3')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbours
        #(imputer fitted on the training fold only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #standardise the variables
        prep = StandardScaler()
        prep.fit(X_train)
        X_test_std = prep.transform(X_test)

        #perform a grid search for the best hyper-parameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
        best.fit(prep.transform(X_train), y_train)

        #store the y_predicted and y_true of this fold
        y_pred = list(best.predict(X_test_std))
        y_true = list(y_test)

        #calculate the per-class recall of this fold
        score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
        auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp3_auc_rf.csv')

#write a csv with recall values (the variable name is historical; it holds RF recalls)
recall_svm = pd.DataFrame(np.vstack(score['RF']))
recall_svm.to_csv('hosp3_recall_rf.csv')

In [None]:
#hosp4 -- 10-fold cross-validation of every entry in `algorithms` on hosp4.csv;
#the AUC and recall of every run are written to hosp4_auc_rf.csv / hosp4_recall_rf.csv

data = pd.read_csv("hosp4.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp4')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and once per fold, since it does not depend on the
        #undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp4_auc_rf.csv')

#write a csv with recall values (the variable name is historical; it holds RF recalls)
recall_svm = pd.DataFrame(np.vstack(score['RF']))
recall_svm.to_csv('hosp4_recall_rf.csv')

In [None]:
#hosp5 -- 10-fold cross-validation of every entry in `algorithms` on hosp5.csv;
#the per-fold AUC and recall are written to hosp5_auc_rf.csv / hosp5_recall_rf.csv
#NOTE(review): this cell does not undersample the majority class, whereas the
#gradient-boosting hosp5 cell does -- confirm which of the two is intended
data = pd.read_csv("hosp5.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp5')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbours
        #(imputer fitted on the training fold only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #standardise the variables
        prep = StandardScaler()
        prep.fit(X_train)
        X_test_std = prep.transform(X_test)

        #perform a grid search for the best hyper-parameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
        best.fit(prep.transform(X_train), y_train)

        #store the y_predicted and y_true of this fold
        y_pred = list(best.predict(X_test_std))
        y_true = list(y_test)

        #calculate the per-class recall of this fold
        score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
        auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp5_auc_rf.csv')

#write a csv with recall values (the variable name is historical; it holds RF recalls)
recall_svm = pd.DataFrame(np.vstack(score['RF']))
recall_svm.to_csv('hosp5_recall_rf.csv')

In [None]:
#algorithms: name -> (estimator, grid of hyper-parameters explored by the grid search)
#random_state seeds the data shuffling SVC performs for its probability estimates,
#so predict_proba (and hence the AUC) is reproducible, as for the other estimators
algorithms = {
    "SVM": (SVC(probability= True, random_state=0), {"C": [1, 10, 100],
                                     "kernel": ("linear", "rbf"), "gamma": ('scale', 'auto')})}
   

In [None]:
#hosp1 -- 10-fold cross-validation of every entry in `algorithms` on hosp1.csv;
#the AUC and recall of every run are written to hosp1_auc_svm.csv / hosp1_recall_svm.csv

data = pd.read_csv("hosp1.csv")
X = data.drop(columns=["Severity"])
y = data.Severity
print('hosp1')

#3 folds for choosing the best hyper-parameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#best hyper-parameters are chosen by balanced accuracy
perf = balanced_accuracy_score
#10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyper-parameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #split the data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbours; fitted on
        #the training fold only and once per fold, since it does not depend on the
        #undersampling repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #will store the y_predicted and y_true pooled over the repetitions of this fold
        y_pred = []
        y_true = []

        #as the classes are unbalanced, randomly undersample the majority class,
        #repeating 10 times to avoid bias
        for j in range(0, 10):

            #always resample from the full training fold (never from the previous
            #repetition's balanced subset); seeded by j for reproducibility
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardise the variables using the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyper-parameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *best.predict(X_test_std)]
            y_true = [*y_true, *y_test]

            #per-class recall over all predictions pooled so far in this fold
            #NOTE(review): cumulative over repetitions, unlike the per-repetition
            #AUC below -- confirm this is intended
            score[algorithm].append(recall_score(y_true, y_pred, labels=[0, 1], average=None))

            #area under the roc curve of this repetition alone
            aucscore = roc_auc_score(y_test, best.predict_proba(X_test_std)[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp1_auc_svm.csv')

#write a csv with recall values
recall_svm = pd.DataFrame(np.vstack(score['SVM']))
recall_svm.to_csv('hosp1_recall_svm.csv')


In [None]:
#hosp2
#Nested cross-validation on hosp2.csv with repeated random undersampling of the
#majority class; writes per-run AUC and recall values to CSV.

data = pd.read_csv("hosp2.csv")

#isolate the target variable
X = data.drop(columns = ["Severity"])
y = data.Severity
print('hosp2')

#3 folds to select the best hyper-parameters by cross validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score
#10-fold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyperparameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #splits data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #will store the y_predicted and y_true
        #NOTE: these accumulate over the 10 repetitions of the fold, so every recall row
        #appended below is computed over all repetitions run so far (kept as in the original)
        y_pred = []
        y_true = []

        #impute missing values with the mean of the 3 nearest neighbors
        #fitted once per fold on the full training split: it does not depend on the repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #as the classes are unbalanced, randomly undersample the majority class, repeating 10 times to avoid bias
        for j in range(0,10):

            #balance the classes by randomly undersampling the majority class
            #always resample from the full training split (never from a previous, already
            #balanced sample) and seed with j so every repetition is different but reproducible
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardize the variables using statistics of the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyperparameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *(best.predict(X_test_std))]
            y_true = [*y_true, *y_test]

            #calculate recall metrics (one value per class: label 0, label 1)
            score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

            #calculate area under roc curve for this repetition only
            aucscore = roc_auc_score(y_test, (best.predict_proba(X_test_std))[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp2_auc_svm.csv')

#write a csv with recall values
#assumes `algorithms` contains an 'SVM' entry at this point - TODO confirm
recall_svm = pd.DataFrame(np.vstack(score['SVM']))
recall_svm.to_csv('hosp2_recall_svm.csv')


In [None]:
#hosp3
#Nested cross-validation on hosp3.csv (no undersampling); writes per-fold AUC
#and recall values to CSV.

data = pd.read_csv("hosp3.csv")

#isolate the target variable
X = data.drop(columns = ["Severity"])
y = data.Severity
print('hosp3')

#3 folds to select the best hyper-parameters by cross validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score
#10-fold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyperparameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #splits data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbors (fitted on the training split only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #standardize the variables using statistics of the training split
        prep = StandardScaler()
        prep.fit(X_train)
        X_train = prep.transform(X_train)
        X_test = prep.transform(X_test)

        #perform a grid search for the best hyperparameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
        best.fit(X_train, y_train)

        #predictions for this fold
        y_pred = best.predict(X_test)

        #calculate recall metrics (one value per class: label 0, label 1)
        score[algorithm].append(recall_score(y_test, y_pred, labels = [0,1], average = None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, (best.predict_proba(X_test))[:, 1])
        auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp3_auc_svm.csv')

#write a csv with recall values
#assumes `algorithms` contains an 'SVM' entry at this point - TODO confirm
recall_svm = pd.DataFrame(np.vstack(score['SVM']))
recall_svm.to_csv('hosp3_recall_svm.csv')

In [None]:
#hosp4
#Nested cross-validation on hosp4.csv with repeated random undersampling of the
#majority class; writes per-run AUC and recall values to CSV.

data = pd.read_csv("hosp4.csv")

#isolate the target variable
X = data.drop(columns = ["Severity"])
y = data.Severity
print('hosp4')

#3 folds to select the best hyper-parameters by cross validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score
#10-fold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyperparameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #splits data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #will store the y_predicted and y_true
        #NOTE: these accumulate over the 10 repetitions of the fold, so every recall row
        #appended below is computed over all repetitions run so far (kept as in the original)
        y_pred = []
        y_true = []

        #impute missing values with the mean of the 3 nearest neighbors
        #fitted once per fold on the full training split: it does not depend on the repetition
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train_imp = imputer.transform(X_train)
        X_test_imp = imputer.transform(X_test)

        #as the classes are unbalanced, randomly undersample the majority class, repeating 10 times to avoid bias
        for j in range(0,10):

            #balance the classes by randomly undersampling the majority class
            #always resample from the full training split (never from a previous, already
            #balanced sample) and seed with j so every repetition is different but reproducible
            under = RandomUnderSampler(sampling_strategy='majority', random_state=j)
            X_bal, y_bal = under.fit_resample(X_train_imp, y_train)

            #standardize the variables using statistics of the balanced training sample
            prep = StandardScaler()
            prep.fit(X_bal)
            X_test_std = prep.transform(X_test_imp)

            #perform a grid search for the best hyperparameters
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
            best.fit(prep.transform(X_bal), y_bal)

            #store the results
            y_pred = [*y_pred, *(best.predict(X_test_std))]
            y_true = [*y_true, *y_test]

            #calculate recall metrics (one value per class: label 0, label 1)
            score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

            #calculate area under roc curve for this repetition only
            aucscore = roc_auc_score(y_test, (best.predict_proba(X_test_std))[:, 1])
            auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp4_auc_svm.csv')

#write a csv with recall values
#assumes `algorithms` contains an 'SVM' entry at this point - TODO confirm
recall_svm = pd.DataFrame(np.vstack(score['SVM']))
recall_svm.to_csv('hosp4_recall_svm.csv')

In [None]:
#hosp5
#Nested cross-validation on hosp5.csv (no undersampling); writes per-fold AUC
#and recall values to CSV.

data = pd.read_csv("hosp5.csv")

#isolate the target variable
X = data.drop(columns = ["Severity"])
y = data.Severity
print('hosp5')

#3 folds to select the best hyper-parameters by cross validation within the grid search
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
#balanced accuracy is the criterion used to choose the best hyper-parameters
perf = balanced_accuracy_score
#10-fold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

#will store each recall value
score = {algorithm: [] for algorithm in algorithms}

#will store each auc value
auc_score = {algorithm: [] for algorithm in algorithms}

#for each algorithm and its dictionary of hyperparameters to be tested
for algorithm, (clf, parameters) in algorithms.items():

    #for each of the 10 folds
    for train, test in kf.split(X, y):

        #splits data into training and testing
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        print("testando")

        #impute missing values with the mean of the 3 nearest neighbors (fitted on the training split only)
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)

        #standardize the variables using statistics of the training split
        prep = StandardScaler()
        prep.fit(X_train)
        X_train = prep.transform(X_train)
        X_test = prep.transform(X_test)

        #perform a grid search for the best hyperparameters
        best = GridSearchCV(clf, parameters, cv=gskf, scoring=(make_scorer(perf)))
        best.fit(X_train, y_train)

        #predictions for this fold
        y_pred = best.predict(X_test)

        #calculate recall metrics (one value per class: label 0, label 1)
        score[algorithm].append(recall_score(y_test, y_pred, labels = [0,1], average = None))

        #calculate area under roc curve
        aucscore = roc_auc_score(y_test, (best.predict_proba(X_test))[:, 1])
        auc_score[algorithm].append(aucscore)

#write a csv with auc values
auc_score = pd.DataFrame.from_dict(auc_score)
auc_score.to_csv('hosp5_auc_svm.csv')

#write a csv with recall values
#assumes `algorithms` contains an 'SVM' entry at this point - TODO confirm
recall_svm = pd.DataFrame(np.vstack(score['SVM']))
recall_svm.to_csv('hosp5_recall_svm.csv')

In [None]:
#results

In [None]:
#hosp1 - mean
#Summary table with the mean AUC and mean per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp1_auc_svm.csv')
auc_g = pd.read_csv('hosp1_auc_gb.csv')
auc_r = pd.read_csv('hosp1_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp1_recall_rf.csv')
gb = pd.read_csv('hosp1_recall_gb.csv')
svm = pd.read_csv('hosp1_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].mean() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].mean() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].mean() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp1 - std
#Summary table with the standard deviation of AUC and per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp1_auc_svm.csv')
auc_g = pd.read_csv('hosp1_auc_gb.csv')
auc_r = pd.read_csv('hosp1_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp1_recall_rf.csv')
gb = pd.read_csv('hosp1_recall_gb.csv')
svm = pd.read_csv('hosp1_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].std() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].std() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].std() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp2 - mean
#Summary table with the mean AUC and mean per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp2_auc_svm.csv')
auc_g = pd.read_csv('hosp2_auc_gb.csv')
auc_r = pd.read_csv('hosp2_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp2_recall_rf.csv')
gb = pd.read_csv('hosp2_recall_gb.csv')
svm = pd.read_csv('hosp2_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].mean() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].mean() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].mean() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp2 - std
#Summary table with the standard deviation of AUC and per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp2_auc_svm.csv')
auc_g = pd.read_csv('hosp2_auc_gb.csv')
auc_r = pd.read_csv('hosp2_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp2_recall_rf.csv')
gb = pd.read_csv('hosp2_recall_gb.csv')
svm = pd.read_csv('hosp2_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].std() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].std() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].std() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp3 - mean
#Summary table with the mean AUC and mean per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp3_auc_svm.csv')
auc_g = pd.read_csv('hosp3_auc_gb.csv')
auc_r = pd.read_csv('hosp3_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp3_recall_rf.csv')
gb = pd.read_csv('hosp3_recall_gb.csv')
svm = pd.read_csv('hosp3_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].mean() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].mean() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].mean() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp3 - std
#Summary table with the standard deviation of AUC and per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp3_auc_svm.csv')
auc_g = pd.read_csv('hosp3_auc_gb.csv')
auc_r = pd.read_csv('hosp3_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp3_recall_rf.csv')
gb = pd.read_csv('hosp3_recall_gb.csv')
svm = pd.read_csv('hosp3_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].std() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].std() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].std() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp4 - mean
#Summary table with the mean AUC and mean per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp4_auc_svm.csv')
auc_g = pd.read_csv('hosp4_auc_gb.csv')
auc_r = pd.read_csv('hosp4_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp4_recall_rf.csv')
gb = pd.read_csv('hosp4_recall_gb.csv')
svm = pd.read_csv('hosp4_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].mean() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].mean() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].mean() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp4 - std
#Summary table with the standard deviation of AUC and per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp4_auc_svm.csv')
auc_g = pd.read_csv('hosp4_auc_gb.csv')
auc_r = pd.read_csv('hosp4_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp4_recall_rf.csv')
gb = pd.read_csv('hosp4_recall_gb.csv')
svm = pd.read_csv('hosp4_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].std() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].std() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].std() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp5 - mean
#Summary table with the mean AUC and mean per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp5_auc_svm.csv')
auc_g = pd.read_csv('hosp5_auc_gb.csv')
auc_r = pd.read_csv('hosp5_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp5_recall_rf.csv')
gb = pd.read_csv('hosp5_recall_gb.csv')
svm = pd.read_csv('hosp5_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].mean() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].mean() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].mean() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df

In [None]:
#hosp5 - std
#Summary table with the standard deviation of AUC and per-class recall of each algorithm.

#saved AUC values, one file per algorithm
auc_s = pd.read_csv('hosp5_auc_svm.csv')
auc_g = pd.read_csv('hosp5_auc_gb.csv')
auc_r = pd.read_csv('hosp5_auc_rf.csv')

#saved recall values, one file per algorithm (columns '0' and '1' are the class labels)
rf = pd.read_csv('hosp5_recall_rf.csv')
gb = pd.read_csv('hosp5_recall_gb.csv')
svm = pd.read_csv('hosp5_recall_svm.csv')

column_names = ["svm", "gb", "rf"]

#one row per metric, one column per algorithm (values listed in column_names order)
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [table[col].std() for table, col in ((auc_s, 'SVM'), (auc_g, 'GB'), (auc_r, 'RF'))]
df.loc['grave'] = [table['1'].std() for table in (svm, gb, rf)]
df.loc['n_grave'] = [table['0'].std() for table in (svm, gb, rf)]

#keep 3 decimal places and display the table
df = df.round(3)
df