# Modelos de Classificação - Hepatite

In [39]:
# Bibliotecas de manipualção e visualização de dados
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Classes dos modelo
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from mord import LogisticAT
from xgboost import XGBClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from yellowbrick.classifier import ROCAUC


In [7]:
# Load the outlier-free version of the dataset from the CSV file.
df_hepatite = pd.read_csv('HCV-Egy-Data-no-outlier.csv')

A remoção de outliers mostrou-se eficiente para a performance do modelo, aumentando a acurácia em cerca de 1%.

In [3]:
# Preview the first rows of the loaded frame.
df_hepatite.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,...,ALT 24,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baselinehistological staging
0,1,46,1,29,1,2,2,1,2,2,...,113,57,123,44,40620,538635,637056,336804,31085,2
1,3,49,2,33,1,2,1,2,1,2,...,88,48,77,33,1041941,449939,585688,744463,582301,3
2,5,58,2,22,2,2,2,1,2,2,...,65,73,114,29,1157452,1086852,5,5,5,4
3,6,42,2,26,1,1,2,2,2,2,...,107,84,80,28,325694,1034008,275095,214566,635157,4
4,7,48,2,30,1,1,2,2,1,1,...,45,96,53,39,641129,72050,787295,370605,506296,3


Ao exportar o dataset sem outliers surgiu a coluna `Unnamed: 0` que será removida.

In [8]:
# Drop the leftover index column created by the CSV export.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df_hepatite = df_hepatite.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Show column names, non-null counts and dtypes.
df_hepatite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1381 entries, 0 to 1380
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Age                           1381 non-null   int64  
 1   Gender                        1381 non-null   int64  
 2   BMI                           1381 non-null   int64  
 3   Fever                         1381 non-null   int64  
 4   Nausea/Vomting                1381 non-null   int64  
 5   Headache                      1381 non-null   int64  
 6   Diarrhea                      1381 non-null   int64  
 7   FGba                          1381 non-null   int64  
 8   Jaundice                      1381 non-null   int64  
 9   Ep                            1381 non-null   int64  
 10  WBC                           1381 non-null   int64  
 11  RBC                           1381 non-null   float64
 12  HGB                           1381 non-null   int64  
 13  Pla

### Separação da variável target do dataset

In [9]:
# Target vector: the histological staging column.
y = df_hepatite['Baselinehistological staging']
# Feature matrix: every other column.
X = df_hepatite.drop(columns=['Baselinehistological staging'])

## Pipeline dos Dados

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

**Pipeline com dados numéricos**

Com a normalização dos dados, a acurácia do modelo de predição aumentou em 6%.

In [11]:
# Numeric pipeline made of a single standardisation step.
pipeline_numerico = Pipeline(steps=[('std_scaler', StandardScaler())])

In [12]:
# Keep only the numeric columns by dropping the categorical ones ...
X_num = X.drop(
    columns=[
        'Gender', 'Fever', 'Nausea/Vomting', 'Headache',
        'Diarrhea', 'FGba', 'Jaundice', 'Ep',
    ]
)
# ... and standardise them with the numeric pipeline.
X_num_tr = pipeline_numerico.fit_transform(X_num)

**Pipeline com dados categóricos**

In [13]:
# Fit the numeric pipeline on X_num once more and keep the transformed values.
# NOTE(review): this repeats the fit_transform call of the previous cell (X_num_tr).
X_num_std = pipeline_numerico.fit_transform(X_num)

In [14]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

In [15]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [16]:
# names of the categorical attributes
atributos_cat = [
    'Gender', 'Fever', 'Nausea/Vomting', 'Headache',
    'Diarrhea', 'FGba', 'Jaundice', 'Ep',
]
# names of the numeric attributes: every column of X that is not categorical
atributos_num = list(X.drop(columns=atributos_cat))

In [17]:
# Standardise the numeric columns and one-hot encode the categorical ones.
pipeline_num_cat = ColumnTransformer(
    transformers=[
        ('num', pipeline_numerico, atributos_num),
        ('cat', OneHotEncoder(), atributos_cat),
    ]
)

In [18]:
# Apply the combined numeric + categorical transformer to the full feature matrix.
# NOTE(review): the visible later cells split and train on X, not on X_final — confirm.
X_final = pipeline_num_cat.fit_transform(X)

## Funções para executar os modelos

In [19]:
#função que retorna um dicionário com os valores dos resultados
def model_results(model, X_train, y_train, X_test, y_test,results_dict_aux):
    """Fit ``model``, evaluate it on the test split and record the metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Data used to fit the estimator.
    X_test, y_test
        Data used to compute the metrics.
    results_dict_aux : dict
        Dictionary with list values under the keys ``'accuracy'``, ``'f1'``,
        ``'auc'`` and ``'cm'``; one entry is appended to each list (the
        dictionary is modified in place).

    Returns
    -------
    dict
        The same ``results_dict_aux`` object, after the new entries were appended.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # accuracy_score is used instead of model.score(): score() is defined by each
    # estimator and is not always accuracy (mord's LogisticAT returns the negative
    # mean absolute error), which made the stored "accuracy" incomparable.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    AUC = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovo')
    CM = confusion_matrix(y_test, y_pred)

    results_dict_aux['accuracy'].append(accuracy)
    results_dict_aux['f1'].append(f1)
    results_dict_aux['auc'].append(AUC)
    results_dict_aux['cm'].append(CM)

    return results_dict_aux

## Grid Search

Para cada modelo é implementada uma função de grid search, que é aplicada em cada um dos 10 conjuntos de treino do 10-fold.

In [20]:
def grid_search(model, param_grid, X, y):
    """Run an accuracy-scored, 3-fold grid search over ``param_grid``.

    Parameters
    ----------
    model
        Estimator whose hyper-parameters are searched.
    param_grid : dict
        Mapping from parameter name to the candidate values.
    X, y
        Data on which the search is fitted.

    Returns
    -------
    The value returned by ``GridSearchCV.fit`` (the fitted search object).
    """
    searcher = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,  # use every available core
    )
    return searcher.fit(X, y)

**KNN**

In [21]:
def kNN_grid_search(X_train, y_train):
    """Tune a k-nearest-neighbours classifier through ``grid_search``.

    The grid covers the odd neighbour counts from 1 to 31, both weighting
    schemes and three distance metrics.

    Parameters
    ----------
    X_train, y_train
        Training data forwarded to ``grid_search``.

    Returns
    -------
    tuple
        ``(n_neighbors, weights, metric)`` read from ``best_params_``.
    """
    # odd neighbour counts only: 1, 3, ..., 31
    k_range = list(range(1, 32, 2))
    # how the neighbours' votes are weighted
    weights = ['uniform', 'distance']
    # distance metrics; 'euclidean' used to be misspelled as 'euclidian', which
    # is not a valid scikit-learn metric name, so those candidates never fitted
    dist = ['euclidean', 'manhattan', 'chebyshev']
    param_grid = {
        'n_neighbors': k_range,
        'weights': weights,
        'metric': dist
    }
    grid = grid_search(knn(), param_grid, X_train, y_train)

    # best parameters found by the grid search
    k = grid.best_params_['n_neighbors']
    w = grid.best_params_['weights']
    m = grid.best_params_['metric']

    return (k,w,m)

**Decision Tree**

In [22]:
def dt_grid_search(X_train, y_train):
    """Tune a decision tree through ``grid_search``.

    Searches ``max_depth`` from 1 to 31 and both split-quality criteria.

    Returns
    -------
    tuple
        ``(max_depth, criterion)`` read from ``best_params_``.
    """
    search_space = {
        'max_depth': list(range(1, 32)),
        # ways of measuring the quality of a split
        'criterion': ['gini', 'entropy'],
    }
    fitted = grid_search(DecisionTreeClassifier(), search_space, X_train, y_train)

    chosen = fitted.best_params_
    return (chosen['max_depth'], chosen['criterion'])

**MLP**

In [23]:
def mlp_grid_search(X_train, y_train):
    """Tune a multi-layer perceptron through ``grid_search``.

    Returns
    -------
    tuple
        ``(hidden_layer_sizes, activation, solver, learning_rate,
        learning_rate_init)`` read from ``best_params_``.
    """
    search_space = {
        # layer layouts
        'hidden_layer_sizes': [(100,), (50, 15, 5), (100, 25, 10)],
        # activation functions
        'activation': ['tanh', 'relu'],
        # optimisation strategies
        'solver': ['sgd', 'adam'],
        # learning-rate schedules
        'learning_rate': ['constant', 'adaptive'],
        # initial learning rates
        'learning_rate_init': [0.05, 0.0001],
    }
    fitted = grid_search(MLPClassifier(), search_space, X_train, y_train)

    chosen = fitted.best_params_
    return (
        chosen['hidden_layer_sizes'],
        chosen['activation'],
        chosen['solver'],
        chosen['learning_rate'],
        chosen['learning_rate_init'],
    )

**SVM**

In [24]:
def svm_grid_search(X_train, y_train):
    """Tune a support vector classifier through ``grid_search``.

    Returns
    -------
    tuple
        ``(C, kernel, gamma)`` read from ``best_params_``.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'sigmoid'],
        'gamma': [1, 0.1, 0.01, 0.001],
    }
    fitted = grid_search(SVC(), search_space, X_train, y_train)

    chosen = fitted.best_params_
    return (chosen['C'], chosen['kernel'], chosen['gamma'])

**Regressão Logística Ordinal**

In [1]:
def olr_grid_search(X_train, y_train):
    """Tune the ordinal logistic regression (``LogisticAT``) regularisation.

    Uses the shared ``grid_search`` helper so that the search settings stay
    identical to the ones used for the other models.

    Parameters
    ----------
    X_train, y_train
        Training data forwarded to ``grid_search``.

    Returns
    -------
    The ``alpha`` value read from ``best_params_``.
    """
    param_grid = {
        'alpha': [0, 0.2, 0.5, 2.0, 5.0]
    }
    grid = grid_search(LogisticAT(), param_grid, X_train, y_train)

    return grid.best_params_['alpha']

**Random Forest**

In [2]:
def rf_grid_search(X_train, y_train):
    """Tune a random forest through the shared ``grid_search`` helper.

    Going through the helper keeps the search settings identical to the other
    models and lets the candidates be evaluated in parallel (``n_jobs=-1``).

    Parameters
    ----------
    X_train, y_train
        Training data forwarded to ``grid_search``.

    Returns
    -------
    tuple
        ``(n_estimators, max_depth)`` read from ``best_params_``.
    """
    # number of trees: 10 evenly spaced values between 100 and 500
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]

    # maximum depth: 11 evenly spaced values between 100 and 300, plus unlimited
    max_depth = [int(x) for x in np.linspace(100, 300, num = 11)]
    max_depth.append(None)

    param_grid = {
        'n_estimators': n_estimators,
        'max_depth': max_depth
    }
    grid = grid_search(RandomForestClassifier(), param_grid, X_train, y_train)

    e = grid.best_params_['n_estimators']
    m = grid.best_params_['max_depth']

    return (e,m)

**Extreme Gradient Boosting**

In [3]:
def xgb_grid_search(X_train, y_train):
    """Tune an XGBoost classifier through the shared ``grid_search`` helper.

    Going through the helper keeps the search settings identical to the ones
    used for the other models.

    Parameters
    ----------
    X_train, y_train
        Training data forwarded to ``grid_search``.

    Returns
    -------
    dict
        The ``best_params_`` dictionary of the fitted search.
    """
    param_grid = {
        "objective": ['multi:softmax'],
        "max_depth": range (2, 10, 1),
        "n_estimators": range(60, 220, 40),
        "learning_rate": [0.1, 0.01, 0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.5],
    }
    grid = grid_search(XGBClassifier(), param_grid, X_train, y_train)

    dicxgb = grid.best_params_

    return dicxgb

## Hold-out

Antes de realizar o 10-Fold, os dados serão separados em treino e teste através do método **hold-out**. O dataset de treino será utilizado para validar os modelos e escolher os melhores parâmetros com o **10-fold**, para então comparar os modelos com o dataset de teste.

In [35]:
# Stratified 70/30 hold-out split with a fixed seed.
# NOTE(review): this splits the raw X, not the X_final produced by pipeline_num_cat —
# confirm that fitting the models on untransformed features is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=199, stratify=y)

## 10-Fold

Com esse método são criados 10 datasets de treino e 10 datasets de teste, com uma divisão de 90% para treino e 10% para teste em cada divisão.

O conjunto de treino será dividido mais uma vez em treino e validação (isso é feito dentro da função `GridSearchCV`), para que então seja aplicado o GridSearch e, assim, se obtenham os melhores parâmetros. Por fim, tendo os melhores parâmetros, utiliza-se o conjunto de teste para que se possa avaliar os resultados.

Esses resultados são obtidos de cada fold e então se tira a média deles para obter-se a avaliação final de cada modelo.

In [27]:
def calculate_mean_restults(results_dict_models):
    """Print the aggregated metrics of every model in ``results_dict_models``.

    For each model the per-fold lists stored under ``'accuracy'``, ``'f1'``,
    ``'auc'`` and ``'cm'`` are converted to arrays and handed to
    ``print_mean_result``.
    """
    for model_key, per_fold in results_dict_models.items():
        print_mean_result(
            model_key,
            np.array(per_fold['accuracy']),
            np.array(per_fold['f1']),
            np.array(per_fold['auc']),
            np.array(per_fold['cm']),
        )

In [28]:
def print_best_results(best_k, best_weight, best_metric, best_max_depth, best_criterion, best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i, best_c, best_kernel, best_gamma, best_alpha=None, best_estimators=None, best_mdepth=None, best_dicxgb=None):
    """Print the best hyper-parameters found for every tuned model.

    The OLR, RF and XGB values are now real parameters: the body already
    printed them, but they were missing from the signature, so a call passing
    them positionally raised ``TypeError``. They default to ``None`` so the
    13-argument form remains callable.

    Parameters
    ----------
    best_k, best_weight, best_metric
        KNN: number of neighbours, weighting scheme and distance metric.
    best_max_depth, best_criterion
        Decision tree: maximum depth and split criterion.
    best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i
        MLP: layer sizes, activation, solver, learning-rate schedule and
        initial learning rate.
    best_c, best_kernel, best_gamma
        SVM: C, kernel and gamma.
    best_alpha
        Ordinal logistic regression: alpha.
    best_estimators, best_mdepth
        Random forest: number of trees and maximum depth.
    best_dicxgb
        XGBoost: dictionary of parameters.
    """
    # the 'w'/'m' labels used to be swapped relative to the values printed
    print(f"Best KNN parameters: k:{best_k} w:{best_weight} m:{best_metric}")
    print(f"Best DT parameters: md:{best_max_depth} c:{best_criterion}")
    print(f"Best MLP parameters: hls:{best_hidden_layers} a:{best_activation} s:{best_solver} lr:{best_learn_rate} lri:{best_learn_rate_i}")
    print(f"Best SVM parameters: c:{best_c} k:{best_kernel} g:{best_gamma}")
    print(f"Best OLR parameters: a:{best_alpha}")
    print(f"Best RF parameters: e:{best_estimators}  md:{best_mdepth}")
    print(f"Best XGB parameters: dic:{best_dicxgb}")
    print("--------------------------------------------------------------------------")

In [29]:
def print_mean_result(model_key, accuracies, f1, auc, conf_matrix):
    """Print mean and standard deviation of the per-fold metrics of one model.

    Parameters
    ----------
    model_key : str
        Label printed as the header of the report.
    accuracies, f1, auc : numpy.ndarray
        Per-fold metric values.
    conf_matrix : array-like
        Per-fold confusion matrices stacked along the first axis.
    """
    print(f"\t{model_key}")
    print("Acurácia média (desvio): %.6f +- (%.6f)" %(accuracies.mean(), accuracies.std()))
    print("F1-score média (desvio): %.6f +- (%.6f)" %(f1.mean(), f1.std()))
    print("AUC média (desvio): %.6f +- (%.6f)\n" %(auc.mean(), auc.std()))
    # average over the fold axis instead of multiplying the sum by a hard-coded
    # 0.1, which was only a mean when there were exactly 10 folds
    print(f"Matriz de Confusão:  \n{np.mean(conf_matrix, axis=0)}")

In [33]:
#função que roda os modelos em cada uma das divisões do 10-fold
#e imprime a média e o desvio padrão dos resultados

def evaluate_model_with_kfold(kf):
    """Run every model on each split produced by ``kf`` and report the metrics.

    For each split, the per-model grid-search helper picks hyper-parameters on
    the training part; the model is then rebuilt with them and scored on the
    held-out part through ``model_results``. For every tuned model, the
    hyper-parameters of the fold with the highest held-out accuracy are kept.
    The per-fold metrics are summarised by ``calculate_mean_restults`` and the
    kept parameters are printed by ``print_best_results``.

    Reads the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    kf
        Splitter exposing ``split(X, y)`` that yields ``(train, test)``
        positional index arrays.

    Returns
    -------
    dict
        Maps ``'knn'``, ``'dt'``, ``'mlp'``, ``'svm'``, ``'olr'``, ``'rf'`` and
        ``'xgb'`` to a tuple with the kept hyper-parameters of that model.
    """
    results_dict_models = {}
    # per-model dictionaries collecting the metrics of every iteration;
    # the 'best' key stores the highest accuracy seen so far
    results_dict_KNN = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_DT = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_MLP = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_SVM = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_GNB = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_OLR = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_RF = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    results_dict_XGB = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': [],
        'best': 0.0
    }
    
    # variables holding the best parameters; they start at the values the
    # original author marked as the defaults
    # knn
    best_k, best_weight, best_metric = 5, 'uniform', 'minkowski' # default
    # dt
    best_max_depth, best_criterion = None, 'gini' # default
    # mlp
    best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i = (100,), 'relu', 'adam', 'constant', 0.001 # default
    # svm
    best_c, best_kernel, best_gamma = 1.0, 'rbf', 'scale' # default
    # olr
    best_alpha = 1.0 # default
    # rf
    best_estimators, best_mdepth = 100, None # default
    # xgb (placeholder; replaced by a parameter dict once a fold beats accuracy 0.0)
    best_dicxgb = 0
    
    # loop running every model on each fold
    # NOTE(review): the splitter only receives the first 100 rows of X_train/y_train,
    # so the positional indices it yields select rows from that slice only — looks
    # like a debugging subsample; confirm before trusting the reported metrics.
    for train, test in kf.split(X_train.iloc[:100], y_train.iloc[:100]):
        X_train_kf, y_train_kf, X_test_kf, y_test_kf = X_train.iloc[train], y_train.iloc[train], X_train.iloc[test], y_train.iloc[test]        
        
        # kNN
        k,w,m = kNN_grid_search(X_train_kf, y_train_kf)
        model = knn(n_neighbors=k, weights=w, metric=m)
        results_dict_KNN = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_KNN)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_KNN['accuracy'][-1] > results_dict_KNN['best']): 
            results_dict_KNN['best'] = results_dict_KNN['accuracy'][-1]
            best_k, best_weight, best_metric = k, w, m 
        
        # DT
        md,c = dt_grid_search(X_train_kf, y_train_kf)
        model = DecisionTreeClassifier(max_depth=md, criterion=c, random_state=199)
        results_dict_DT = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_DT)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_DT['accuracy'][-1] > results_dict_DT['best']): 
            results_dict_DT['best'] = results_dict_DT['accuracy'][-1]
            best_max_depth, best_criterion = md, c
        
        
        # MLP
        hls, a, s, lr, lri = mlp_grid_search(X_train_kf,y_train_kf)
        model = MLPClassifier(
            hidden_layer_sizes=hls, 
            activation=a, 
            solver=s, 
            learning_rate=lr, 
            learning_rate_init=lri, 
            max_iter=2000, 
            tol=0.000001,
            random_state=199
        )
        results_dict_MLP = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_MLP)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_MLP['accuracy'][-1] > results_dict_MLP['best']): 
            results_dict_MLP['best'] = results_dict_MLP['accuracy'][-1]
            best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i = hls, a, s, lr, lri
        
        # GNB (no grid search; the result dict is also stored again after the loop)
        model = GaussianNB()
        results_dict_models['GNB'] = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_GNB)
        
        # SVM (the names c and k are reused here for C and kernel)
        c, k, g = svm_grid_search(X_train_kf, y_train_kf)
        model = SVC(C=c, kernel=k, gamma=g, probability=True, random_state=199)
        results_dict_SVM = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_SVM)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_SVM['accuracy'][-1] > results_dict_SVM['best']): 
            results_dict_SVM['best'] = results_dict_SVM['accuracy'][-1]
            best_c, best_kernel, best_gamma = c, k, g           
        
        # Ordinal logistic regression
        a = olr_grid_search(X_train_kf, y_train_kf)
        model = LogisticAT(alpha=a)
        results_dict_OLR = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_OLR)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_OLR['accuracy'][-1] > results_dict_OLR['best']): 
            results_dict_OLR['best'] = results_dict_OLR['accuracy'][-1]
            best_alpha = a 
        
        # RF
        e, m =  rf_grid_search(X_train_kf, y_train_kf)
        model = RandomForestClassifier(n_estimators=e, max_depth=m, random_state=199)
        results_dict_RF = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_RF)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_RF['accuracy'][-1] > results_dict_RF['best']): 
            results_dict_RF['best'] = results_dict_RF['accuracy'][-1]
            best_estimators, best_mdepth = e, m 
      
        # XGB
        dicxgb = xgb_grid_search(X_train_kf, y_train_kf)
        model = XGBClassifier(**dicxgb, random_state=199, silent=True)
        results_dict_XGB = model_results(model, X_train_kf, y_train_kf, X_test_kf, y_test_kf, results_dict_XGB)
        # keep this fold's parameters if its accuracy is the best so far
        if (results_dict_XGB['accuracy'][-1] > results_dict_XGB['best']): 
            results_dict_XGB['best'] = results_dict_XGB['accuracy'][-1]
            best_dicxgb = dicxgb
        
        
    results_dict_models['KNN'] = results_dict_KNN
    results_dict_models['DT'] = results_dict_DT
    results_dict_models['MLP'] = results_dict_MLP
    results_dict_models['GNB'] = results_dict_GNB
    results_dict_models['SVM'] = results_dict_SVM
    results_dict_models['OLR'] = results_dict_OLR
    results_dict_models['RF'] = results_dict_RF
    results_dict_models['XGB'] = results_dict_XGB
    
    # compute the mean of the results and print every metric
    calculate_mean_restults(results_dict_models)

    # print the best parameters
    # NOTE(review): 17 positional arguments are passed here — print_best_results
    # must accept all of them.
    print_best_results(best_k, best_weight, best_metric, best_max_depth, best_criterion, best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i, best_c, best_kernel, best_gamma, best_alpha, best_estimators, best_mdepth, best_dicxgb)
    
    # store the best parameters in the dictionary returned by the function
    parameters_dict = {
        'knn': (best_k, best_weight, best_metric),
        'dt': (best_max_depth, best_criterion),
        'mlp': (best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i),
        'svm': (best_c, best_kernel, best_gamma),
        'olr': (best_alpha,),
        'rf': (best_estimators, best_mdepth),
        'xgb': (best_dicxgb,)
    }
    
    return parameters_dict

In [31]:
import warnings

In [None]:
%%time
# ignore every warning emitted from here on
warnings.filterwarnings('ignore')

params_dict = {}

# stratified, shuffled 10-fold splitter with a fixed seed; the evaluation returns
# the best hyper-parameters of each model
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=199)
params_dict = evaluate_model_with_kfold(skf)





















































































































































## Teste

Aqui os modelos são executados no dataset de teste com seus melhores parâmetros.

In [148]:
def model_test_results(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Prints accuracy, weighted F1, weighted one-vs-one AUC and the confusion
    matrix.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Data used to fit the estimator.
    X_test, y_test
        Data used to compute the metrics.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # accuracy_score is used instead of model.score(): score() is defined by each
    # estimator and is not guaranteed to be accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    AUC = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovo')
    CM = confusion_matrix(y_test, y_pred)

    print("Accuracy: %.6f" %(accuracy))
    print("f1: %.6f" %(f1))
    print("AUC: %.6f" %(AUC))
    print(f"CM: \n{CM} \n")

**KNN**

In [149]:
# Unpack the KNN hyper-parameters kept by the k-fold evaluation.
best_k, best_weight, best_metric = params_dict['knn']

In [150]:
# Fit KNN with the kept hyper-parameters and report the hold-out metrics.
model = knn(
    n_neighbors=best_k,
    weights=best_weight,
    metric=best_metric,
)
model_test_results(model, X_train, y_train, X_test, y_test)

Accuracy: 0.250602
f1: 0.249453
AUC: 0.490444
CM: 
[[27 24 21 29]
 [25 16 24 34]
 [24 30 29 24]
 [40 15 21 32]] 



**DT**

In [151]:
# Unpack the decision-tree hyper-parameters kept by the k-fold evaluation.
best_max_depth, best_criterion = params_dict['dt']

In [152]:
# Fit the decision tree with the kept hyper-parameters and report the hold-out metrics.
model = DecisionTreeClassifier(
    max_depth=best_max_depth,
    criterion=best_criterion,
    random_state=199,
)
model_test_results(model, X_train, y_train, X_test, y_test)

Accuracy: 0.243373
f1: 0.242814
AUC: 0.494661
CM: 
[[18 31 22 30]
 [25 19 23 32]
 [22 24 32 29]
 [33 20 23 32]] 



**MLP**

In [153]:
# Unpack the MLP hyper-parameters kept by the k-fold evaluation.
best_hidden_layers, best_activation, best_solver, best_learn_rate, best_learn_rate_i = params_dict['mlp']

In [154]:
# Fit the MLP with the kept hyper-parameters and report the hold-out metrics.
model = MLPClassifier(
    hidden_layer_sizes=best_hidden_layers,
    activation=best_activation,
    solver=best_solver,
    learning_rate=best_learn_rate,
    learning_rate_init=best_learn_rate_i,
    max_iter=2000,
    tol=0.000001,
    random_state=199,
)
model_test_results(model, X_train, y_train, X_test, y_test)

Accuracy: 0.265060
f1: 0.263031
AUC: 0.494977
CM: 
[[27 23 22 29]
 [24 20 27 28]
 [30 18 26 33]
 [33 17 21 37]] 



**GNB**

In [155]:
# Gaussian naive Bayes is used with its constructor defaults (no tuned parameters).
model = GaussianNB()
model_test_results(model, X_train, y_train, X_test, y_test)

Accuracy: 0.269880
f1: 0.268598
AUC: 0.470611
CM: 
[[20 20 21 40]
 [17 27 20 35]
 [20 20 29 38]
 [23 18 31 36]] 



**SVM**

In [156]:
# Unpack the SVM hyper-parameters kept by the k-fold evaluation.
best_c, best_kernel, best_gamma = params_dict['svm']

In [157]:
# Fit the SVM with the kept hyper-parameters and report the hold-out metrics;
# probability=True is needed because model_test_results calls predict_proba.
model = SVC(
    C=best_c,
    kernel=best_kernel,
    gamma=best_gamma,
    probability=True,
    random_state=199,
)
model_test_results(model, X_train, y_train, X_test, y_test)

Accuracy: 0.269880
f1: 0.199224
AUC: 0.513747
CM: 
[[ 0  0 39 62]
 [ 0  5 33 61]
 [ 0  3 43 61]
 [ 0  0 44 64]] 

