In [1]:
#Importa bibliotecas

import numpy as np
import pandas as pd
import random as rd
import os


from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_scorefrom sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score, recall_score
from imblearn.under_sampling import RandomUnderSampler

from sklearn.impute import KNNImputer

import hyperopt
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

rstate = np.random.default_rng(17)

In [2]:
#algoritmos
algorithms = {
    
    'SVM_linear': (SVC, {'probability': True,
                         'C': hp.loguniform('C', np.log(1e-3), np.log(1e3))}),
    
    'SVM_rbf': (SVC, {'kernel': 'rbf',
             'probability': True,
             'C': hp.loguniform('C', np.log(1e-3), np.log(1e3)),
             'gamma': hp.choice('gamma',['scale', 'auto'])}),
    
    'RF' : (RandomForestClassifier,{
             'n_estimators': hp.uniformint('n_estimators', 2, 200),
             'max_depth': hp.uniformint('max_depth', 1, 100),
             'criterion': hp.choice('criterion', ["gini", "entropy"])}),
    
    
    'GB' : (GradientBoostingClassifier, {'learning_rate': hp.lognormal('learning_rate', np.log(0.01), np.log(10.0)),
                                     'n_estimators': scope.int(hp.qloguniform('n_estimators', np.log(10.5), np.log(1000.5), 1)),
                                     'loss': hp.choice('loss', ['deviance']),
                                     }),
    
}

In [3]:

#3 folds to choose the best hyperparameters
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17) 

#choose of the best hyperparameters through balanced accuracy
perf = balanced_accuracy_score

#10-fold cross validation 
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17) 

#define Standard Scaler to standardize the features
prep = StandardScaler()

#imputa os valores ausentes dos 3 vizinhos mais próximos
imputer = KNNImputer(n_neighbors=3, weights = 'distance')

# redefine the function usng a wider range of hyperparameters
def objective(search_space):
    model = clf(**search_space, random_state = 0)
    score = cross_val_score(model, x_train, y_train, cv=3, scoring=perf, n_jobs=None).mean()
    return {'loss': -score, 'status': STATUS_OK} 

In [4]:
#datasets
hosp1 = pd.read_csv("hosp1_severity_.csv")
hosp2 = pd.read_csv("hosp2_severity_.csv")

data = {
    "hosp1": ((hosp1.drop(columns=['severity'])), hosp1.severity), 
    "hosp2": ((hosp2.drop(columns=['severity'])), hosp2.severity)
}

In [5]:
for name, (X,y) in data.items(): 

    #armazena o recall de cada algoritmo
    score = {}
    for algorithm in algorithms.keys():
        score[algorithm] = []

    #armazena auc de cada algoritmo    
    auc_score = {}
    for algorithm in algorithms.keys():
        auc_score[algorithm] = []
    
    #for each algorithm and its respective search space
    for algorithm, (clf, search_space) in algorithms.items():

        for train, test in kf.split(X, y):
            
            #separa treino e teste deixando sempre 1 para teste
            X_train, x_test = X.iloc[train], X.iloc[test]
            Y_train, y_test = y.iloc[train], y.iloc[test]
            
            y_ = pd.DataFrame.from_dict(y)
            #se a diferença entre as classes for maior que 1.5
            if (((y_[y_.severity == 1].shape[0])*1.5) < (y_[y_.severity == 0].shape[0])):
                
                for j in range(0,10):
                    
                     #vectors to store y_pred e y_true
                    y_pred = [] 
                    y_true = [] 

                    #undersampling of the majority class
                    #undersampling the majority class when classes are umbalanced
                    under = RandomUnderSampler(sampling_strategy='majority', random_state = j)
                    x_train, y_train = under.fit_resample(X_train, Y_train)
                    
                    imputer.fit(x_train)
                    x_train = imputer.transform(x_train)
                    x_test = imputer.transform(x_test)   

                    #standardize the features                
                    prep.fit(x_train)

                    best = fmin(
                      fn=objective,
                      space=search_space,
                      algo=tpe.suggest,
                      max_evals=100,
                      timeout= 90,
                      rstate= rstate)        

                    best = space_eval(search_space, best)
                    best = clf(**best)

                    #search for the best hyperparameters
                    best.fit(prep.transform(x_train), y_train)

                    #store the results
                    y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
                    y_true =  [*y_true, *y_test] 

                    #calculate the recall
                    score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                    #calculate the area under roc curve
                    aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
                    auc_score[algorithm].append(aucscore)
           
            else:
                 #vectors to store y_pred e y_true
                y_pred = [] 
                y_true = [] 

                #undersampling of the majority class
                #undersampling the majority class when classes are umbalanced
                x_train = X_train
                y_train = Y_train
                
                imputer.fit(x_train)
                x_train = imputer.transform(x_train)
                x_test = imputer.transform(x_test)   

                #standardize the features                
                prep.fit(x_train)

                best = fmin(
                  fn=objective,
                  space=search_space,
                  algo=tpe.suggest,
                  max_evals=100,
                  timeout= 90,
                  rstate=rstate)       

                best = space_eval(search_space, best)
                best = clf(**best)

                #search for the best hyperparameters
                best.fit(prep.transform(x_train), y_train)

                #store the results
                y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
                y_true =  [*y_true, *y_test] 

                #calculate the recall
                score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                #calculate the area under roc curve
                aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
                auc_score[algorithm].append(aucscore)
            
    auc_score = pd.DataFrame.from_dict(auc_score)  
    auc_score.to_csv(name + '_auc.csv')

    #write a csv with the recall of class '0' - specificity 
    #and another csv with the recall of class '1' - sensitivity
    recall_svm_linear = pd.DataFrame(np.vstack(score['SVM_linear']))
    recall_svm_rbf = pd.DataFrame(np.vstack(score['SVM_rbf']))
    recall_gb = pd.DataFrame(np.vstack(score['GB']))
    recall_rf = pd.DataFrame(np.vstack(score['RF']))

    esp = pd.concat([recall_svm_linear[[0]], recall_svm_rbf[[0]], recall_rf[[0]], recall_gb[[0]]], axis=1)
    sen = pd.concat([recall_svm_linear[[1]], recall_svm_rbf[[1]], recall_rf[[1]], recall_gb[[1]]], axis=1)

    esp.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']
    sen.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']

    esp.to_csv(name + '_spe.csv')
    sen.to_csv(name + '_sen.csv')  

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.12trial/s, best loss: -0.5836616213974705]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.11trial/s, best loss: -0.5962056811113414]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.12trial/s, best loss: -0.5836616213974705]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.09trial/s, best loss: -0.6056396433754925]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.14trial/s, best loss: -0.6079203815052872]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.01trial/s, best loss: -0.5852166701223305]
100%|█████████████████████████████

In [6]:
#hosp1
auc = pd.read_csv('hosp1_auc.csv')

# recall
sen = pd.read_csv('hosp1_sen.csv')
spe = pd.read_csv('hosp1_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.681,0.684,0.756,0.756
sen,0.636,0.631,0.749,0.693
spe,0.634,0.633,0.695,0.683


In [7]:
df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.05,0.048,0.028,0.031
sen,0.074,0.076,0.053,0.056
spe,0.043,0.044,0.051,0.053


In [8]:
#hosp2
auc = pd.read_csv('hosp2_auc.csv')

# recall
sen = pd.read_csv('hosp2_sen.csv')
spe = pd.read_csv('hosp2_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.723,0.731,0.755,0.724
sen,0.692,0.704,0.789,0.619
spe,0.676,0.683,0.733,0.703


In [9]:

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df


Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.176,0.153,0.126,0.131
sen,0.194,0.192,0.165,0.214
spe,0.111,0.107,0.158,0.148


In [10]:
X_train = hosp1.drop(columns=['severity'])
Y_train = hosp1.severity
x_test = hosp2.drop(columns=['severity'])
y_test = hosp2.severity

#armazena o recall de cada algoritmo
score = {}
for algorithm in algorithms.keys():
    score[algorithm] = []

#armazena auc de cada algoritmo    
auc_score = {}
for algorithm in algorithms.keys():
    auc_score[algorithm] = []

#for each algorithm and its respective search space
for algorithm, (clf, search_space) in algorithms.items():

        y_ = pd.DataFrame.from_dict(Y_train)
        #se a diferença entre as classes for maior que 1.5
        if (((y_[y_.severity == 1].shape[0])*1.5) < (y_[y_.severity == 0].shape[0])):

            for j in range(0,10):

                 #vectors to store y_pred e y_true
                y_pred = [] 
                y_true = [] 

                #undersampling of the majority class
                #undersampling the majority class when classes are umbalanced
                under = RandomUnderSampler(sampling_strategy='majority', random_state = j)
                x_train, y_train = under.fit_resample(X_train, Y_train)

                imputer.fit(x_train)
                x_train = imputer.transform(x_train)
                x_test = imputer.transform(x_test)   

                #standardize the features                
                prep.fit(x_train)

                best = fmin(
                  fn=objective,
                  space=search_space,
                  algo=tpe.suggest,
                  max_evals=100,
                  timeout= 90,
                  rstate= rstate)        

                best = space_eval(search_space, best)
                best = clf(**best)

                #search for the best hyperparameters
                best.fit(prep.transform(x_train), y_train)

                #store the results
                y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
                y_true =  [*y_true, *y_test] 

                #calculate the recall
                score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

                #calculate the area under roc curve
                aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
                auc_score[algorithm].append(aucscore)

        else:
             #vectors to store y_pred e y_true
            y_pred = [] 
            y_true = [] 

            #undersampling of the majority class
            #undersampling the majority class when classes are umbalanced
            x_train = X_train
            y_train = Y_train

            imputer.fit(x_train)
            x_train = imputer.transform(x_train)
            x_test = imputer.transform(x_test)   

            #standardize the features                
            prep.fit(x_train)

            rstate = np.random.default_rng(17)
            best = fmin(
              fn=objective,
              space=search_space,
              algo=tpe.suggest,
              max_evals=100,
              timeout= 90,
              rstate= rstate)        

            best = space_eval(search_space, best)
            best = clf(**best)

            #search for the best hyperparameters
            best.fit(prep.transform(x_train), y_train)

            #store the results
            y_pred = [*y_pred, *(best.predict(prep.transform(x_test)))] 
            y_true =  [*y_true, *y_test] 

            #calculate the recall
            score[algorithm].append(recall_score(y_true, y_pred, labels = [0,1], average = None))

            #calculate the area under roc curve
            aucscore = roc_auc_score(y_test, (best.predict_proba(prep.transform(x_test)))[:, 1])
            auc_score[algorithm].append(aucscore)

auc_score = pd.DataFrame.from_dict(auc_score)  
auc_score.to_csv('crossmodels_auc.csv')

#write a csv with the recall of class '0' - specificity 
#and another csv with the recall of class '1' - sensitivity
recall_svm_linear = pd.DataFrame(np.vstack(score['SVM_linear']))
recall_svm_rbf = pd.DataFrame(np.vstack(score['SVM_rbf']))
recall_gb = pd.DataFrame(np.vstack(score['GB']))
recall_rf = pd.DataFrame(np.vstack(score['RF']))

esp = pd.concat([recall_svm_linear[[0]], recall_svm_rbf[[0]], recall_rf[[0]], recall_gb[[0]]], axis=1)
sen = pd.concat([recall_svm_linear[[1]], recall_svm_rbf[[1]], recall_rf[[1]], recall_gb[[1]]], axis=1)

esp.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']
sen.columns = ['SVM_rbf', 'SVM_linear', 'RF', 'GB']

esp.to_csv('crossmodels_spe.csv')
sen.to_csv('crossmodels_sen.csv')  

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.13trial/s, best loss: -0.6064381482418416]
100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.26trial/s, best loss: -0.593726283835062]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.21trial/s, best loss: -0.6139008348090058]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:29<00:00,  3.39trial/s, best loss: -0.6088413862888945]
100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.16trial/s, best loss: -0.6096635466734126]
100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.22trial/s, best loss: -0.596951682266633]
100%|█████████████████████████████

In [11]:
#cruzados
auc = pd.read_csv('crossmodels_auc.csv')

# recall
sen = pd.read_csv('crossmodels_sen.csv')
spe = pd.read_csv('crossmodels_spe.csv')

column_names = ["svm_linear", "svm_rbf", "gb", "rf"]

df = pd.DataFrame(columns = column_names)
df.loc['auc'] = [auc['SVM_linear'].mean(), auc['SVM_rbf'].mean(), auc['GB'].mean(), auc['RF'].mean()]
df.loc['sen'] = [sen['SVM_linear'].mean(), sen['SVM_rbf'].mean(), sen['GB'].mean(), sen['RF'].mean()]
df.loc['spe'] = [spe['SVM_linear'].mean(), spe['SVM_rbf'].mean(), spe['GB'].mean(), spe['RF'].mean()]


df = df.round(decimals=3)
df

Unnamed: 0,svm_linear,svm_rbf,gb,rf
auc,0.51,0.513,0.747,0.748
sen,0.973,0.973,0.663,0.64
spe,0.019,0.019,0.754,0.736


In [12]:
df.loc['auc'] = [auc['SVM_linear'].std(), auc['SVM_rbf'].std(), auc['GB'].std(), auc['RF'].std()]
df.loc['sen'] = [sen['SVM_linear'].std(), sen['SVM_rbf'].std(), sen['GB'].std(), sen['RF'].std()]
df.loc['spe'] = [spe['SVM_linear'].std(), spe['SVM_rbf'].std(), spe['GB'].std(), spe['RF'].std()]


df = df.round(decimals=3)
df

SyntaxError: invalid syntax (228772512.py, line 2)