# Modelos de Classificação - Hepatite

In [1]:
# Bibliotecas de manipulação e visualização de dados
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from mlxtend.plotting import plot_decision_regions
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Classes dos modelos
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#Ensemble 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from yellowbrick.classifier import ROCAUC

# Seleção de Features
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

from sklearn.decomposition import PCA



In [2]:
# dataset with the outliers already removed
df_hepatite = pd.read_csv('HCV-Egy-Data-no-outlier.csv')

A remoção de outliers mostrou-se eficiente na performance do modelo, aumentando em cerca de 1% a acurácia.

In [3]:
df_hepatite.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,...,ALT 24,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baselinehistological staging
0,1,46,1,29,1,2,2,1,2,2,...,113,57,123,44,40620,538635,637056,336804,31085,2
1,3,49,2,33,1,2,1,2,1,2,...,88,48,77,33,1041941,449939,585688,744463,582301,3
2,5,58,2,22,2,2,2,1,2,2,...,65,73,114,29,1157452,1086852,5,5,5,4
3,6,42,2,26,1,1,2,2,2,2,...,107,84,80,28,325694,1034008,275095,214566,635157,4
4,7,48,2,30,1,1,2,2,1,1,...,45,96,53,39,641129,72050,787295,370605,506296,3


Ao exportar o dataset sem outliers surgiu a coluna `Unnamed: 0` que será removida.

In [4]:
# Rebind instead of mutating in place, and tolerate a missing column, so this
# cell can be re-run without raising KeyError once the column is already gone.
df_hepatite = df_hepatite.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
df_hepatite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1381 entries, 0 to 1380
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Age                           1381 non-null   int64  
 1   Gender                        1381 non-null   int64  
 2   BMI                           1381 non-null   int64  
 3   Fever                         1381 non-null   int64  
 4   Nausea/Vomting                1381 non-null   int64  
 5   Headache                      1381 non-null   int64  
 6   Diarrhea                      1381 non-null   int64  
 7   FGba                          1381 non-null   int64  
 8   Jaundice                      1381 non-null   int64  
 9   Ep                            1381 non-null   int64  
 10  WBC                           1381 non-null   int64  
 11  RBC                           1381 non-null   float64
 12  HGB                           1381 non-null   int64  
 13  Pla

### Separação da variável target do dataset

In [6]:
# Separate the target vector from the feature matrix.
target_column = 'Baselinehistological staging'
y = df_hepatite[target_column]
X = df_hepatite.drop(columns=[target_column])

## Pipeline dos Dados

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

**Pipeline com dados numéricos**

Com a normalização dos dados, a acurácia do modelo de predição aumentou em 6%.

In [8]:
# Numeric preprocessing pipeline: a single standardisation step.
pipeline_numerico = Pipeline([
    ('std_scaler', StandardScaler())
])

In [9]:
# Drop the categorical columns and standardise what is left.
X_num = X.drop(['Gender', 'Fever', 'Nausea/Vomting', 'Headache', 'Diarrhea', 'FGba', 'Jaundice', 'Ep'], axis=1)
X_num_tr = pipeline_numerico.fit_transform(X_num)

**Pipeline com dados categóricos**

In [10]:
# NOTE(review): this repeats the fit_transform already stored in X_num_tr by the
# previous cell, only under a different name — confirm whether both are needed.
X_num_std = pipeline_numerico.fit_transform(X_num)

In [11]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

In [12]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [13]:
# lista com os atributos categóricos
atributos_cat = ['Gender', 'Fever', 'Nausea/Vomting', 'Headache', 'Diarrhea', 'FGba', 'Jaundice', 'Ep']
# lista com os atributos númericos
atributos_num = list(X.drop(['Gender', 'Fever', 'Nausea/Vomting', 'Headache', 'Diarrhea', 'FGba', 'Jaundice', 'Ep'], axis=1))

In [14]:
# Combined transformer: the numeric pipeline for the numeric columns and
# one-hot encoding for the categorical ones.
pipeline_num_cat = ColumnTransformer([
    ('num', pipeline_numerico, atributos_num),
    ('cat', OneHotEncoder(), atributos_cat)
])

In [15]:
# Fit the combined transformer on X and transform it in one step.
X_final = pipeline_num_cat.fit_transform(X)

In [16]:
# Wrap the transformed matrix in a DataFrame so it renders as a table.
# NOTE(review): the later cells shown in this notebook keep working from X, not
# X_final — confirm whether this output is meant to feed the models.
X_final = pd.DataFrame(X_final)
X_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.034767,0.099591,1.710260,0.020626,-1.512119,-0.746860,0.317246,1.507094,0.435737,-0.326068,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.307367,1.081989,-0.393171,1.074781,-1.512119,-0.306427,-1.530374,-0.768433,0.963259,-0.134190,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.333767,-1.619604,1.591799,-1.558182,1.406858,-0.698899,-0.645056,0.774297,1.415421,0.479821,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,-0.490944,-0.637207,1.529945,0.938257,-0.344528,0.487435,-0.183151,-1.038410,1.113980,1.324087,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.193322,0.345191,-0.076400,-0.047159,-0.928324,1.490328,1.395024,1.082844,-0.129466,1.669468,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376,-0.262856,0.099591,-0.185489,1.543957,1.406858,1.138394,1.510500,-0.961274,-0.204826,0.863578,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1377,0.991633,1.327588,-0.499261,0.619643,-1.512119,-1.097118,1.741453,0.697161,-0.694668,0.594948,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1378,-0.490944,-0.637207,-0.984351,-0.866636,0.823063,-0.772966,-0.837516,0.350047,1.490781,-0.863328,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1379,0.649500,0.099591,-0.105641,-0.520399,-1.512119,1.225708,-0.491088,0.504320,0.774858,-0.364444,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


## Funções para executar os modelos

In [17]:
#função que retorna um dicionário com os valores dos resultados
def model_results(model, X_train, y_train, X_test, y_test,results_dict_aux):
    """Fit ``model``, score it on the test data and append the metrics.

    Args:
        model: classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train, y_train: data used to fit the model.
        X_test, y_test: data used to compute the metrics.
        results_dict_aux: dict with list values under the keys ``'accuracy'``,
            ``'f1'``, ``'auc'`` and ``'cm'``; one entry is appended to each.

    Returns:
        dict: the same ``results_dict_aux`` object, mutated in place.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy is derived from the predictions already computed above instead
    # of model.score(), which would run a second prediction pass.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    AUC = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovo')
    CM = confusion_matrix(y_test, y_pred)

    # Append only after every metric was computed, so a failure above does not
    # leave the lists with different lengths.
    results_dict_aux['accuracy'].append(accuracy)
    results_dict_aux['f1'].append(f1)
    results_dict_aux['auc'].append(AUC)
    results_dict_aux['cm'].append(CM)

    return results_dict_aux

## RandomizedSearch

Para cada modelo é implementada uma função que utiliza o RandomizedSearch, a ser aplicada em cada um dos 10 conjuntos de treino do 10-fold.

In [18]:
def grid_search(model, param_grid, X, y):
    """Run a randomized hyper-parameter search and return the fitted search.

    Despite the name, candidates are sampled by ``RandomizedSearchCV`` (with
    its default number of iterations) instead of being enumerated
    exhaustively. Candidates are scored by accuracy under 10-fold
    cross-validation, with ``n_jobs=-1`` and a fixed ``random_state``.
    """
    search = RandomizedSearchCV(
        model,
        param_grid,
        scoring='accuracy',
        cv=10,
        random_state=199,
        n_jobs=-1,
    )
    search.fit(X, y)
    return search

**KNN**

In [19]:
def kNN_grid_search(X_train, y_train):
    """Search KNN hyper-parameters on the given training data.

    Returns:
        tuple: ``(n_neighbors, weights, metric)`` of the best candidate found
        by ``grid_search``.
    """
    param_grid = {
        # odd neighbourhood sizes: 1, 3, ..., 31
        'n_neighbors': list(range(1, 32, 2)),
        # how each neighbour's vote is weighted
        'weights': ['uniform', 'distance'],
        # distance functions; the first entry used to be spelled 'euclidian',
        # which is not a metric name scikit-learn accepts
        'metric': ['euclidean', 'manhattan', 'chebyshev'],
    }

    grid = grid_search(knn(), param_grid, X_train, y_train)

    best = grid.best_params_
    return (best['n_neighbors'], best['weights'], best['metric'])

**Decision Tree**

In [20]:
def dt_grid_search(X_train, y_train):
    """Search decision-tree hyper-parameters on the given training data.

    Candidate depths run from 1 to 31 and the split-quality criterion is
    either ``'gini'`` or ``'entropy'``.

    Returns:
        tuple: ``(max_depth, criterion)`` of the best candidate found by
        ``grid_search``.
    """
    search_space = {
        'max_depth': list(range(1, 32)),
        'criterion': ['gini', 'entropy'],
    }
    search = grid_search(DecisionTreeClassifier(), search_space, X_train, y_train)

    chosen = search.best_params_
    return (chosen['max_depth'], chosen['criterion'])

**MLP**

In [21]:
def mlp_grid_search(X_train, y_train):
    """Search MLP hyper-parameters on the given training data.

    Returns:
        tuple: ``(hidden_layer_sizes, activation, solver, learning_rate,
        learning_rate_init)`` of the best candidate found by ``grid_search``.
    """
    search_space = {
        # layer layouts to try
        'hidden_layer_sizes': [(100,), (50, 15, 5), (100, 25, 10)],
        # activation functions
        'activation': ['tanh', 'relu', 'logistic'],
        # optimisation strategies
        'solver': ['sgd', 'adam', 'lbfgs'],
        # learning-rate schedules
        'learning_rate': ['constant', 'adaptive'],
        # initial learning rates
        'learning_rate_init': [0.1, 0.01, 0.001],
    }
    search = grid_search(MLPClassifier(), search_space, X_train, y_train)

    returned_order = (
        'hidden_layer_sizes',
        'activation',
        'solver',
        'learning_rate',
        'learning_rate_init',
    )
    return tuple(search.best_params_[name] for name in returned_order)

**SVM**

In [22]:
def svm_grid_search(X_train, y_train):
    """Search SVC hyper-parameters on the given training data.

    Returns:
        tuple: ``(C, kernel, gamma)`` of the best candidate found by
        ``grid_search``.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'sigmoid', 'linear'],
        'gamma': [1, 0.1, 0.01, 0.001],
    }
    search = grid_search(SVC(), search_space, X_train, y_train)

    return tuple(search.best_params_[name] for name in ('C', 'kernel', 'gamma'))

**RF**

In [23]:
def rf_grid_search(X_train, y_train):
    """Search random-forest hyper-parameters on the given training data.

    Returns:
        tuple: ``(n_estimators, max_depth, max_features)`` of the best
        candidate found by ``grid_search``.
    """
    param_grid = {
        # number of trees: 10 evenly spaced values between 100 and 500
        'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)],
        # maximum depth: 100, 120, ..., 300, plus None
        'max_depth': [int(x) for x in np.linspace(100, 300, num = 11)] + [None],
        # number of features considered at each split
        'max_features': ['auto', 'sqrt', 'log2'],
    }

    # grid_search() already returns a fitted search object; the extra
    # grid.fit() that used to follow ran the whole search a second time.
    grid = grid_search(RandomForestClassifier(), param_grid, X_train, y_train)

    best = grid.best_params_
    return (best['n_estimators'], best['max_depth'], best['max_features'])

**GBoost**

In [24]:
def gboost_grid_search(X_train, y_train):
    """Search gradient-boosting hyper-parameters on the given training data.

    Returns:
        tuple: ``(loss, learning_rate, min_samples_split, min_samples_leaf,
        max_depth, max_features, criterion, subsample, n_estimators)`` of the
        best candidate found by ``grid_search``.
    """
    # NOTE(review): scikit-learn documents the "exponential" loss for binary
    # problems; with a multi-class target those candidates presumably fail and
    # get no usable score — confirm whether it should stay in the grid.
    param_grid = {
    "loss":["deviance", "exponential"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
     }

    # grid_search() already returns a fitted search object; the extra
    # grid.fit() that used to follow ran the whole search a second time.
    grid = grid_search(GradientBoostingClassifier(), param_grid, X_train, y_train)

    best = grid.best_params_
    return (
        best['loss'],
        best['learning_rate'],
        best['min_samples_split'],
        best['min_samples_leaf'],
        best['max_depth'],
        best['max_features'],
        best['criterion'],
        best['subsample'],
        best['n_estimators'],
    )



**Ada Boosting**

In [25]:
def aboost_grid_search(X_train, y_train):
    """Search AdaBoost hyper-parameters on the given training data.

    Returns:
        tuple: ``(learning_rate, n_estimators)`` of the best candidate found
        by ``grid_search`` — note the order, learning rate first.
    """
    param_grid = {
        # 10 evenly spaced ensemble sizes between 100 and 2000
        'n_estimators':[int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
        'learning_rate':[.001,0.01, 0.1, 1, 2]
    }

    # grid_search() already returns a fitted search object; the extra
    # grid.fit() that used to follow ran the whole search a second time.
    grid = grid_search(AdaBoostClassifier(), param_grid, X_train, y_train)

    best = grid.best_params_
    return (best['learning_rate'], best['n_estimators'])

**SMOTE**

In [26]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [27]:
# Oversample every class to 1500 rows. The fixed seed makes the synthetic
# samples reproducible across runs, matching the seed used elsewhere here.
# NOTE(review): X and y are resampled before the hold-out split that follows,
# so synthetic rows built from rows that end up in the test set can land in the
# training set — consider splitting first and oversampling only the training part.
strategy = {1:1500, 2:1500, 3:1500, 4:1500}
oversample = SMOTE(sampling_strategy=strategy, random_state=199)
X, y = oversample.fit_resample(X, y)



In [28]:
X.shape

(6000, 27)

## Hold-out

Antes de realizar o 10-Fold, os dados serão separados em treino e teste através do método **hold-out**. O dataset de treino será utilizado para validar os modelos e escolher os melhores parâmetros com o **10-fold**, para então comparar os modelos com o dataset de teste.

In [29]:
# Stratified hold-out split: 30% of the rows go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=199, stratify=y)

## K-best

In [30]:
X_train.head()

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,Ep,...,ALT 12,ALT 24,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF
1035,54,1,31,1,2,2,2,1,1,2,...,121,108,47,49,34,163195,20984,5,5,5
610,43,2,27,2,2,2,1,2,2,1,...,45,76,67,83,34,576218,322716,557033,804770,91511
245,38,2,31,2,2,1,1,1,1,2,...,41,122,47,123,40,1006627,1147164,757387,536783,79052
1849,56,1,32,2,2,1,1,1,1,1,...,64,125,60,97,27,513021,1004394,603754,363812,785133
4568,50,1,27,2,2,1,1,1,2,1,...,84,90,74,63,40,290980,782533,5,5,5


In [31]:
#kbest = SelectKBest(score_func=f_classif, k=7)
#kbest = SelectKBest(score_func=f_classif, k=13)
#kbest.fit(X_train, y_train)
#X_train = pd.DataFrame(kbest.transform(X_train))
#X_test = pd.DataFrame(kbest.transform(X_test))    


#RFECV

#classificador = RandomForestClassifier(n_estimators=100, random_state=199)
#classificador.fit(X_train, y_train)

#selecionador_rfecv = RFECV(estimator = classificador, step=3, min_features_to_select=7, scoring="accuracy")
#selecionador_rfecv = RFE(estimator = classificador, step=3)
#selecionador_rfecv.fit(X_train, y_train)
#X_train = pd.DataFrame(selecionador_rfecv.transform(X_train))
#X_test = pd.DataFrame(selecionador_rfecv.transform(X_test))


In [32]:
X_train.head()

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,Ep,...,ALT 12,ALT 24,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF
1035,54,1,31,1,2,2,2,1,1,2,...,121,108,47,49,34,163195,20984,5,5,5
610,43,2,27,2,2,2,1,2,2,1,...,45,76,67,83,34,576218,322716,557033,804770,91511
245,38,2,31,2,2,1,1,1,1,2,...,41,122,47,123,40,1006627,1147164,757387,536783,79052
1849,56,1,32,2,2,1,1,1,1,1,...,64,125,60,97,27,513021,1004394,603754,363812,785133
4568,50,1,27,2,2,1,1,1,2,1,...,84,90,74,63,40,290980,782533,5,5,5


In [33]:
X_test.head()

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,Ep,...,ALT 12,ALT 24,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF
211,46,2,24,2,1,1,1,2,1,1,...,112,62,101,98,34,270089,130153,5,5,5
3262,48,1,29,2,1,2,1,1,1,2,...,50,81,54,68,30,577296,367162,117755,429399,521089
4520,45,1,30,1,1,1,1,1,2,2,...,113,126,58,59,23,431838,1039552,5,5,5
6,45,1,30,2,1,2,2,1,1,2,...,127,81,125,43,30,1151206,230488,267320,275295,555516
2317,42,1,29,1,1,2,1,1,2,1,...,50,77,110,99,42,732356,369745,160788,340282,426970


In [34]:
#scaler = StandardScaler()
#scaler.fit(X_train, y_train)
#X_train = pd.DataFrame(scaler.transform(X_train))
#scaler.fit(X_test, y_test)
#X_test = pd.DataFrame(scaler.transform(X_test))

In [35]:
y_train

1035    1
610     1
245     4
1849    1
4568    3
       ..
4771    3
2895    2
2706    2
5744    4
5601    4
Name: Baselinehistological staging, Length: 4200, dtype: int64

## 10-Fold

Com esse método são criados 10 datasets de treino e 10 datasets de teste, com uma divisão de 90% para treino e 10% para teste em cada divisão.

O conjunto de treino será dividido mais uma vez em treino e validação (isso é feito dentro da função `RandomizedSearchCV`), para que então seja aplicada a validação cruzada e assim se obtenham os melhores parâmetros. Por fim, tendo os melhores parâmetros, utiliza-se o conjunto de teste para que se possa avaliar os resultados.

Esses resultados são obtidos de cada fold e então se tira a média deles para obter-se a avaliação final de cada modelo.

In [36]:
def print_best_results(best_k, best_weight, best_metric, best_max_depth, best_criterion, best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i, best_c, best_kernel, best_gamma,  best_n, best_md, best_mf, best_l,best_lr, best_mss, best_msl, best_mdep, best_mfeat, best_crit, best_sub, best_n_est, best_lr_ada, best_n_est_ada):
    """Print one summary line of best hyper-parameters per tuned model.

    The arguments are the best values for KNN, decision tree, MLP, SVM, random
    forest, gradient boosting and AdaBoost, in that order. Returns nothing.
    """
    # Labels fixed: weights is shown as "w" and metric as "m" (they were swapped).
    print(f"Best KNN parameters: k:{best_k} w:{best_weight} m:{best_metric}")
    print(f"Best DT parameters: md:{best_max_depth} c:{best_criterion}")
    print(f"Best MLP parameters: hls:{best_hidden_layers} a:{best_activation} s:{best_solver} lr:{best_learn_rate} lri:{best_learn_rate_i}")
    print(f"Best SVM parameters: c:{best_c} k:{best_kernel} g:{best_gamma}")
    print(f"Best RF parameters: e:{best_n} md:{best_md} mf:{best_mf}")
    print(f"Best GBoosting parameters: l:{best_l} lr:{best_lr} mss:{best_mss} msl:{best_msl} mdep:{best_mdep} mfeat: {best_mfeat} crit:{best_crit} sub:{best_sub} n_est:{best_n_est}")
    # This line used to be labelled "GBoosting" with the fields "l"/"lr",
    # although it reports the AdaBoost learning rate and number of estimators.
    print(f"Best AdaBoost parameters: lr:{best_lr_ada} n_est:{best_n_est_ada}")
    print("--------------------------------------------------------------------------")

In [37]:
def print_mean_result(model_key, accuracies, f1, auc, conf_matrix):
    """Print the mean and standard deviation of each metric for one model.

    Args:
        model_key: label printed as the section title.
        accuracies, f1, auc: NumPy arrays with one value per fold.
        conf_matrix: sequence/array of per-fold confusion matrices, stacked
            along the first axis.
    """
    print(f"\t{model_key}")
    print("Acurácia média (desvio): %.6f +- (%.6f)" %(accuracies.mean(), accuracies.std()))
    print("F1-score média (desvio): %.6f +- (%.6f)" %(f1.mean(), f1.std()))
    print("AUC média (desvio): %.6f +- (%.6f)\n" %(auc.mean(), auc.std()))
    # Average over the fold axis instead of multiplying the sum by a fixed 0.1,
    # which was only a mean when exactly 10 folds were supplied.
    mean_conf_matrix = np.mean(conf_matrix, axis=0)
    print(f"Matriz de Confusão:  \n{mean_conf_matrix}")

In [38]:
#função que roda os modelos em cada uma das divisões do 10-fold
#e imprime a média e o desvio padrão dos resultados

def evaluate_model_with_kfold(kf):
    """Tune, fit and score every model on each fold produced by ``kf``.

    For every ``(train, test)`` index pair yielded by
    ``kf.split(X_train, y_train)`` (``X_train`` / ``y_train`` are the
    notebook-level hold-out training data) the fold is reduced with a chi2
    ``SelectKBest``, standardised and projected with PCA. All three
    transformers are fitted on the training part of the fold only and then
    applied to its test part. Each tunable model gets its hyper-parameters from
    the matching ``*_grid_search`` helper and is then fitted and scored through
    ``model_results``.

    After the loop, the mean and standard deviation of every metric are printed
    per model, followed by the best hyper-parameters.

    Args:
        kf: splitter exposing ``split(X, y)`` that yields positional
            train/test index arrays.

    Returns:
        dict: keys ``'knn'``, ``'dt'``, ``'mlp'``, ``'svm'``, ``'rf'``,
        ``'gboost'`` and ``'aboost'``, each mapping to the hyper-parameter
        tuple of the fold in which that model reached its own highest
        accuracy (the starting values below when no fold scored above 0.0).
    """
    n_selected_features = 5   # k passed to SelectKBest
    pca_components = 0.95     # n_components passed to PCA
    seed = 199                # random_state shared by the stochastic models

    # One list per metric and per model, filled fold by fold.
    results_dict_models = {
        model_key: {'accuracy': [], 'f1': [], 'auc': [], 'cm': []}
        for model_key in ('KNN', 'DT', 'MLP', 'GNB', 'SVM', 'RF', 'GBoost', 'ABoost')
    }

    # Starting hyper-parameters, replaced as soon as a fold scores above 0.0.
    # The key order matches the positional order expected by print_best_results.
    parameter_keys = ('knn', 'dt', 'mlp', 'svm', 'rf', 'gboost', 'aboost')
    parameters_dict = {
        'knn': (5, 'uniform', 'minkowski'),
        'dt': (None, 'gini'),
        'mlp': ((100,), 'relu', 'adam', 'constant', 0.001),
        'svm': (1.0, 'rbf', 'scale'),
        'rf': (100, None, 'auto'),
        'gboost': ('deviance', 0.1, 2, 1, 3, None, 'friedman_mse', 1.0, 100),
        'aboost': (1.0, 50),
    }
    # Best accuracy seen so far, tracked separately for each model. A single
    # shared value would only record a model's parameters when it also beat
    # every other model.
    best_accuracy = dict.fromkeys(parameter_keys, 0.0)

    def score_fold(results_key, model, fold, params_key=None, params=None):
        """Fit/score ``model`` on one fold and keep ``params`` if it is that model's best fold."""
        fold_X_train, fold_y_train, fold_X_test, fold_y_test = fold
        model_results(model, fold_X_train, fold_y_train, fold_X_test, fold_y_test,
                      results_dict_models[results_key])
        fold_accuracy = results_dict_models[results_key]['accuracy'][-1]
        if params_key is not None and fold_accuracy > best_accuracy[params_key]:
            best_accuracy[params_key] = fold_accuracy
            parameters_dict[params_key] = params

    for train, test in kf.split(X_train, y_train):
        X_train_kf, X_test_kf = X_train.iloc[train], X_train.iloc[test]
        y_train_kf, y_test_kf = y_train.iloc[train], y_train.iloc[test]

        # Feature selection, fitted on the training part of the fold.
        kbest = SelectKBest(score_func=chi2, k=n_selected_features)
        kbest.fit(X_train_kf, y_train_kf)
        X_train_kf = kbest.transform(X_train_kf)
        X_test_kf = kbest.transform(X_test_kf)

        # Standardisation: the test part is transformed with the statistics
        # learned from the training part; the scaler is NOT refitted on it.
        scaler = StandardScaler()
        scaler.fit(X_train_kf)
        X_train_kf = scaler.transform(X_train_kf)
        X_test_kf = scaler.transform(X_test_kf)

        # PCA - Principal Component Analysis
        pca = PCA(n_components=pca_components)
        pca.fit(X_train_kf)
        X_train_kf = pca.transform(X_train_kf)
        X_test_kf = pca.transform(X_test_kf)

        fold = (X_train_kf, y_train_kf, X_test_kf, y_test_kf)

        # kNN
        knn_params = kNN_grid_search(X_train_kf, y_train_kf)
        n_neighbors, weights, metric = knn_params
        score_fold('KNN', knn(n_neighbors=n_neighbors, weights=weights, metric=metric),
                   fold, 'knn', knn_params)

        # DT
        dt_params = dt_grid_search(X_train_kf, y_train_kf)
        tree_depth, tree_criterion = dt_params
        score_fold('DT',
                   DecisionTreeClassifier(max_depth=tree_depth, criterion=tree_criterion, random_state=seed),
                   fold, 'dt', dt_params)

        # MLP
        mlp_params = mlp_grid_search(X_train_kf, y_train_kf)
        hls, activation, solver, learn_rate, learn_rate_init = mlp_params
        score_fold('MLP',
                   MLPClassifier(
                       hidden_layer_sizes=hls,
                       activation=activation,
                       solver=solver,
                       learning_rate=learn_rate,
                       learning_rate_init=learn_rate_init,
                       max_iter=2000,
                       tol=0.000001,
                       random_state=seed
                   ),
                   fold, 'mlp', mlp_params)

        # GNB (no hyper-parameters are tuned for it)
        score_fold('GNB', GaussianNB(), fold)

        # SVM
        svm_params = svm_grid_search(X_train_kf, y_train_kf)
        svm_c, svm_kernel, svm_gamma = svm_params
        score_fold('SVM',
                   SVC(C=svm_c, kernel=svm_kernel, gamma=svm_gamma, probability=True, random_state=seed),
                   fold, 'svm', svm_params)

        # RF
        rf_params = rf_grid_search(X_train_kf, y_train_kf)
        rf_estimators, rf_depth, rf_features = rf_params
        score_fold('RF',
                   RandomForestClassifier(n_estimators=rf_estimators, max_depth=rf_depth,
                                          max_features=rf_features, random_state=seed),
                   fold, 'rf', rf_params)

        # GBoost
        gboost_params = gboost_grid_search(X_train_kf, y_train_kf)
        l, lr, mss, msl, mdep, mfeat, crit, sub, n_est = gboost_params
        score_fold('GBoost',
                   GradientBoostingClassifier(loss=l,
                                              learning_rate=lr,
                                              min_samples_split=mss,
                                              min_samples_leaf=msl,
                                              n_estimators=n_est,
                                              max_depth=mdep,
                                              max_features=mfeat,
                                              subsample=sub,
                                              criterion=crit,
                                              random_state=seed),
                   fold, 'gboost', gboost_params)

        # ABoost (the helper returns the learning rate first)
        aboost_params = aboost_grid_search(X_train_kf, y_train_kf)
        lr_ada, n_est_ada = aboost_params
        score_fold('ABoost',
                   AdaBoostClassifier(n_estimators=n_est_ada,
                                      learning_rate=lr_ada,
                                      random_state=seed),
                   fold, 'aboost', aboost_params)

    # Mean and standard deviation of accuracy, f1-score and auc-score, plus
    # the averaged confusion matrix, for each model.
    for model_key, fold_metrics in results_dict_models.items():
        print_mean_result(
            model_key,
            np.array(fold_metrics['accuracy']),
            np.array(fold_metrics['f1']),
            np.array(fold_metrics['auc']),
            np.array(fold_metrics['cm']),
        )

    # Print the best hyper-parameters, flattened in the order of parameter_keys.
    print_best_results(*(value for key in parameter_keys for value in parameters_dict[key]))

    return parameters_dict

In [39]:
import warnings

In [None]:
%%time
# silence every warning for the rest of the session
warnings.filterwarnings('ignore')

params_dict = {}

# stratified, shuffled 10-fold split with a fixed seed; params_dict receives
# the best hyper-parameters per model returned by evaluate_model_with_kfold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=199)
params_dict = evaluate_model_with_kfold(skf)

## Teste

Aqui os modelos são executados no dataset de teste com seus melhores parâmetros.

In [None]:
def model_test_results(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints accuracy, weighted F1, weighted one-vs-one ROC AUC and the
    confusion matrix. ``model`` must expose ``fit``, ``predict`` and
    ``predict_proba``. Returns nothing.

    NOTE(review): the cells that call this pass the hold-out split as it comes
    out of train_test_split, while the hyper-parameters were chosen on folds
    transformed by SelectKBest, StandardScaler and PCA inside
    evaluate_model_with_kfold — confirm whether the same preprocessing should
    be applied before this final evaluation.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy is derived from the predictions already computed above instead
    # of model.score(), which would run a second prediction pass.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    AUC = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovo')
    CM = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy:.6f}")
    print(f"f1: {f1:.6f}")
    print(f"AUC: {AUC:.6f}")
    print(f"CM: \n{CM} \n")

**KNN**

In [None]:
best_k, best_weight, best_metric = params_dict['knn']

In [None]:
# Refit KNN with the hyper-parameters selected during cross-validation and
# print its metrics on the hold-out test set.
model = knn(n_neighbors=best_k, weights=best_weight, metric=best_metric)
model_test_results(model, X_train, y_train, X_test, y_test)

**DT**

In [None]:
best_max_depth, best_criterion = params_dict['dt']

In [None]:
# Refit the decision tree with the hyper-parameters selected during
# cross-validation and print its metrics on the hold-out test set.
model = DecisionTreeClassifier(max_depth=best_max_depth, criterion=best_criterion, random_state=199)
model_test_results(model, X_train, y_train, X_test, y_test)

**MLP**

In [None]:
best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i = params_dict['mlp']

In [None]:
# Refit the MLP with the hyper-parameters selected during cross-validation
# (same max_iter / tol / seed as in the CV loop) and print its hold-out metrics.
model = MLPClassifier(
            hidden_layer_sizes=best_hidden_layers, 
            activation=best_activation, 
            solver=best_solver, 
            learning_rate=best_learn_rate, 
            learning_rate_init=best_learn_rate_i, 
            max_iter=2000, 
            tol=0.000001,
            random_state=199
        )
model_test_results(model, X_train, y_train, X_test, y_test)

**GNB**

In [None]:
# Gaussian Naive Bayes has no tuned hyper-parameters; fit it with its defaults
# and print its metrics on the hold-out test set.
model = GaussianNB()
model_test_results(model, X_train, y_train, X_test, y_test)

**SVM**

In [None]:
best_c, best_kernel, best_gamma = params_dict['svm']

In [None]:
# Refit the SVM with the hyper-parameters selected during cross-validation;
# probability=True is needed because model_test_results calls predict_proba.
model = SVC(C=best_c, kernel=best_kernel, gamma=best_gamma, probability=True, random_state=199)
model_test_results(model, X_train, y_train, X_test, y_test)

**RF**

In [None]:
best_n, best_md, best_mf = params_dict['rf']

In [None]:
# Refit the random forest with the hyper-parameters selected during
# cross-validation and print its metrics on the hold-out test set.
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_md, max_features=best_mf, random_state=199)
model_test_results(model, X_train, y_train, X_test, y_test)

**GBoosting**

In [None]:
best_l,best_lr, best_mss, best_msl, best_mdep, best_mfeat, best_crit, best_sub, best_n_est = params_dict['gboost']

In [None]:
# Refit gradient boosting with the hyper-parameters selected during
# cross-validation and print its metrics on the hold-out test set.
model = GradientBoostingClassifier(loss=best_l,
                                           learning_rate=best_lr,
                                           min_samples_split = best_mss,
                                           min_samples_leaf = best_msl, 
                                           n_estimators=best_n_est, 
                                           max_depth=best_mdep, 
                                           max_features=best_mfeat,
                                           subsample=best_sub,
                                           criterion=best_crit,
                                           random_state=199)
model_test_results(model, X_train, y_train, X_test, y_test)

**AdaBoosting**

In [None]:
best_lr_ada, best_n_est_ada = params_dict['aboost']

In [None]:
# Evaluate AdaBoost with its default base learner. Both aboost_grid_search and
# the cross-validation loop build AdaBoostClassifier without a custom base
# estimator, so n_estimators / learning_rate were selected for that setup;
# plugging in a tuned decision tree only here would test a different model.
model = AdaBoostClassifier(n_estimators=best_n_est_ada,
                           learning_rate=best_lr_ada,
                           random_state=199)
model_test_results(model, X_train, y_train, X_test, y_test)