In [1]:
#Importa bibliotecas

import numpy as np
import pandas as pd
import random as rd
import os


from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score, recall_score
from imblearn.under_sampling import RandomUnderSampler

from sklearn.impute import KNNImputer

import hyperopt
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

rstate = np.random.default_rng(17)

In [2]:
#algoritmos
algorithms = {
    
    'SVM_linear': (SVC, {'probability': True,
                         'C': hp.loguniform('C', np.log(1e-3), np.log(1e3))}),
    
    'SVM_rbf': (SVC, {'kernel': 'rbf',
             'probability': True,
             'C': hp.loguniform('C', np.log(1e-3), np.log(1e3)),
             'gamma': hp.choice('gamma',['scale', 'auto'])}),
    
    'RF' : (RandomForestClassifier,{
             'n_estimators': hp.uniformint('n_estimators', 2, 200),
             'max_depth': hp.uniformint('max_depth', 1, 100),
             'criterion': hp.choice('criterion', ["gini", "entropy"])}),
    
    
    'GB' : (GradientBoostingClassifier, {'learning_rate': hp.lognormal('learning_rate', np.log(0.01), np.log(10.0)),
                                     'n_estimators': scope.int(hp.qloguniform('n_estimators', np.log(10.5), np.log(1000.5), 1)),
                                     'loss': hp.choice('loss', ['deviance']),
                                     }),
    
}

In [3]:
#choose of the best hyperparameters through balanced accuracy
perf = make_scorer(balanced_accuracy_score)

#10-fold cross validation 
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17) 

#define Standard Scaler to standardize the features
prep = StandardScaler()

#imputa os valores ausentes dos 3 vizinhos mais próximos
imputer = KNNImputer(n_neighbors=3, weights = 'distance')

# redefine the function usng a wider range of hyperparameters
def objective(search_space):
    model = clf(**search_space, random_state = 0)
    score = cross_val_score(model, x_train, y_train, cv=3, scoring=perf, n_jobs=None).mean()
    return {'loss': -score, 'status': STATUS_OK} 

In [4]:
#datasets
hosp1 = pd.read_csv("hosp1_severity_.csv")
hosp2 = pd.read_csv("hosp2_severity_.csv")

data = {
    "hosp1": ((hosp1.drop(columns=['severity'])), hosp1.severity), 
    "hosp2": ((hosp2.drop(columns=['severity'])), hosp2.severity)
}

In [5]:
for name, (X,y) in data.items(): 

    #store recall values for each algorithm
    score = {}
    for algorithm in algorithms.keys():
        score[algorithm] = []

    #store auc values for each algorithm
    auc_score = {}
    for algorithm in algorithms.keys():
        auc_score[algorithm] = []
    
    #for each algorithm and its respective search space
    for algorithm, (clf, search_space) in algorithms.items():

        for train, test in kf.split(X, y):
            
            #train and test alocation
            X_train, X_test = X.iloc[train], X.iloc[test]
            Y_train, y_test = y.iloc[train], y.iloc[test]
            
            y_ = pd.DataFrame.from_dict(y)
            #if the classes are unbalanced
            if (((y_[y_.severity == 1].shape[0])*1.5) < (y_[y_.severity == 0].shape[0])):
                
                for j in range(0,10):
                    
                    #vectors to store y_pred e y_true
                    y_pred = [] 
                    y_true = [] 

                    #undersampling of the majority class
                    #undersampling the majority class when classes are umbalanced
                    under = RandomUnderSampler(sampling_strategy='majority', random_state = j)
                    x_train, y_train = under.fit_resample(X_train, Y_train)
                    x_test = X_test
                                    
                    imputer.fit(x_train)
                    x_train = imputer.transform(x_train)
                    x_test = imputer.transform(x_test)   

                    #standardize the features                
                    prep.fit(x_train)

                    best = fmin(
                      fn=objective,
                      space=search_space,
                      algo=tpe.suggest,
                      max_evals=100,
                      timeout= 90,
                      rstate= rstate)        

                    best = space_eval(search_space, best)
                    best = clf(**best)

                    #search for the best hyperparameters
                    best.fit(prep.transform(x_train), y_train)

                    #store the results
                    y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
                    y_true =  [*y_true, *y_test] 

                    #calculate the recall
                    score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                    #calculate the area under roc curve
                    aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
                    auc_score[algorithm].append(aucscore)
           
            else:
                 #vectors to store y_pred e y_true
                y_pred = [] 
                y_true = [] 

                x_train = X_train
                y_train = Y_train
                x_test = X_test
                
                imputer.fit(x_train)
                x_train = imputer.transform(x_train)
                x_test = imputer.transform(x_test)   

                #standardize the features                
                prep.fit(x_train)

                best = fmin(
                  fn=objective,
                  space=search_space,
                  algo=tpe.suggest,
                  max_evals=100,
                  timeout= 90,
                  rstate=rstate)       

                best = space_eval(search_space, best)
                best = clf(**best)

                #search for the best hyperparameters
                best.fit(prep.transform(x_train), y_train)

                #store the results
                y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
                y_true =  [*y_true, *y_test] 

                #calculate the recall
                score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                #calculate the area under roc curve
                aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
                auc_score[algorithm].append(aucscore)
            
    auc_score = pd.DataFrame.from_dict(auc_score)  
    auc_score.to_csv(name + '_auc.csv')

    #write a csv with the recall of class '0' - specificity 
    #and another csv with the recall of class '1' - sensitivity
    recall_svm_linear = pd.DataFrame(np.vstack(score['SVM_linear']))
    recall_svm_rbf = pd.DataFrame(np.vstack(score['SVM_rbf']))
    recall_gb = pd.DataFrame(np.vstack(score['GB']))
    recall_rf = pd.DataFrame(np.vstack(score['RF']))

    esp = pd.concat([recall_svm_linear[[0]], recall_svm_rbf[[0]], recall_rf[[0]], recall_gb[[0]]], axis=1)
    sen = pd.concat([recall_svm_linear[[1]], recall_svm_rbf[[1]], recall_rf[[1]], recall_gb[[1]]], axis=1)

    esp.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']
    sen.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']

    esp.to_csv(name + '_spe.csv')
    sen.to_csv(name + '_sen.csv')

100%|██████| 100/100 [00:57<00:00,  1.73trial/s, best loss: -0.5995939813709099]
100%|██████| 100/100 [00:56<00:00,  1.77trial/s, best loss: -0.5892842926518589]
100%|██████| 100/100 [00:52<00:00,  1.89trial/s, best loss: -0.5862922272642836]
100%|██████| 100/100 [00:55<00:00,  1.80trial/s, best loss: -0.5819401321550832]
100%|███████| 100/100 [00:58<00:00,  1.71trial/s, best loss: -0.599706764323435]
100%|██████| 100/100 [00:59<00:00,  1.69trial/s, best loss: -0.5860732956505585]
100%|██████| 100/100 [01:04<00:00,  1.56trial/s, best loss: -0.6092004352095108]
100%|██████| 100/100 [01:12<00:00,  1.39trial/s, best loss: -0.5745628002016824]
100%|████████| 100/100 [01:11<00:00,  1.41trial/s, best loss: -0.59869171775071]
100%|██████| 100/100 [01:04<00:00,  1.56trial/s, best loss: -0.5756640925616326]
100%|██████| 100/100 [01:12<00:00,  1.37trial/s, best loss: -0.6017434917602101]
100%|██████| 100/100 [00:59<00:00,  1.69trial/s, best loss: -0.5871082450972588]
100%|███████| 100/100 [01:07

In [6]:
#hosp1
auc = pd.read_csv('hosp1_auc.csv')

# recall
sen = pd.read_csv('hosp1_sen.csv')
spe = pd.read_csv('hosp1_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.656,0.656,0.749,0.759
sen,0.608,0.608,0.691,0.693
spe,0.615,0.616,0.664,0.679


In [7]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.045,0.046,0.028,0.026
sen,0.07,0.071,0.057,0.051
spe,0.044,0.043,0.042,0.05


In [8]:
#hosp2
auc = pd.read_csv('hosp2_auc.csv')

# recall
sen = pd.read_csv('hosp2_sen.csv')
spe = pd.read_csv('hosp2_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.757,0.74,0.696,0.738
sen,0.666,0.665,0.613,0.617
spe,0.71,0.714,0.661,0.701


In [9]:

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.121,0.154,0.119,0.124
sen,0.192,0.188,0.202,0.21
spe,0.11,0.103,0.146,0.131


In [10]:
#armazena o recall de cada algoritmo
score = {}
for algorithm in algorithms.keys():
    score[algorithm] = []

#armazena auc de cada algoritmo    
auc_score = {}
for algorithm in algorithms.keys():
    auc_score[algorithm] = []

#for each algorithm and its respective search space
for algorithm, (clf, search_space) in algorithms.items():

    for j in range(0,10):
        
        X_train = hosp1.drop(columns=['severity'])
        Y_train = hosp1.severity
        X_test = hosp2.drop(columns=['severity'])
        y_test = hosp2.severity

        #vectors to store y_pred e y_true
        y_pred = [] 
        y_true = [] 

        #undersampling of the majority class
        #undersampling the majority class when classes are umbalanced
        under = RandomUnderSampler(sampling_strategy='majority', random_state = j)
        x_train, y_train = under.fit_resample(X_train, Y_train)
        x_test = X_test

        imputer.fit(x_train)
        x_train = imputer.transform(x_train)
        x_test = imputer.transform(x_test)   


        #standardize the features                
        prep.fit(x_train)

        best = fmin(
          fn=objective,
          space=search_space,
          algo=tpe.suggest,
          max_evals=100,
          timeout= 90,
          rstate=rstate)       

        best = space_eval(search_space, best)
        best = clf(**best)

        #search for the best hyperparameters
        best.fit(prep.transform(x_train), y_train)

        #store the results
        y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
        y_true =  [*y_true, *y_test] 

        #calculate the recall
        score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

        #calculate the area under roc curve
        aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
        auc_score[algorithm].append(aucscore)

auc_score = pd.DataFrame.from_dict(auc_score)  
auc_score.to_csv('crossmodels_auc.csv')

#write a csv with the recall of class '0' - specificity 
#and another csv with the recall of class '1' - sensitivity
recall_svm_linear = pd.DataFrame(np.vstack(score['SVM_linear']))
recall_svm_rbf = pd.DataFrame(np.vstack(score['SVM_rbf']))
recall_gb = pd.DataFrame(np.vstack(score['GB']))
recall_rf = pd.DataFrame(np.vstack(score['RF']))

esp = pd.concat([recall_svm_linear[[0]], recall_svm_rbf[[0]], recall_rf[[0]], recall_gb[[0]]], axis=1)
sen = pd.concat([recall_svm_linear[[1]], recall_svm_rbf[[1]], recall_rf[[1]], recall_gb[[1]]], axis=1)

esp.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']
sen.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']

esp.to_csv('crossmodels_spe.csv')
sen.to_csv('crossmodels_sen.csv')  

100%|██████| 100/100 [01:05<00:00,  1.53trial/s, best loss: -0.5877685755863722]
100%|██████| 100/100 [01:06<00:00,  1.50trial/s, best loss: -0.5840716486902927]
100%|██████| 100/100 [01:08<00:00,  1.47trial/s, best loss: -0.5773679592535524]
100%|██████| 100/100 [01:08<00:00,  1.47trial/s, best loss: -0.5850721195000856]
100%|██████| 100/100 [01:08<00:00,  1.47trial/s, best loss: -0.5935627461051189]
100%|██████| 100/100 [01:07<00:00,  1.49trial/s, best loss: -0.5888760486218113]
100%|██████| 100/100 [01:08<00:00,  1.46trial/s, best loss: -0.5934717942133196]
100%|██████| 100/100 [01:05<00:00,  1.52trial/s, best loss: -0.5576581492895052]
100%|██████| 100/100 [01:07<00:00,  1.49trial/s, best loss: -0.5888492980653998]
100%|██████| 100/100 [01:07<00:00,  1.49trial/s, best loss: -0.5699955059065228]
100%|██████| 100/100 [01:03<00:00,  1.58trial/s, best loss: -0.5877685755863722]
100%|██████| 100/100 [01:06<00:00,  1.51trial/s, best loss: -0.5840716486902927]
100%|██████| 100/100 [01:05<

In [11]:
#cruzados
auc = pd.read_csv('crossmodels_auc.csv')

# recall
sen = pd.read_csv('crossmodels_sen.csv')
spe = pd.read_csv('crossmodels_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.662,0.66,0.769,0.779
sen,0.581,0.581,0.66,0.663
spe,0.644,0.647,0.742,0.727


In [12]:
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.024,0.023,0.013,0.011
sen,0.039,0.039,0.026,0.018
spe,0.037,0.036,0.024,0.03
