# Rodando os modelos com Cross Validation

In [1]:
# Bibliotecas de manipualção e visualização de dados
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Classes dos modelo
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from mord import LogisticAT
from sklearn.ensemble import RandomForestClassifier
from mord import LogisticAT
from xgboost import XGBClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from yellowbrick.classifier import ROCAUC

# Seleção de Features e redução de dimencionalidade
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA


In [2]:
df_hepatite = pd.read_csv('HCV-Egy-Data.csv')

In [3]:
# dataset sem outliers
df_hepatite = pd.read_csv('HCV-Egy-Data-no-outlier.csv')

A remoção de outliers mostrou-se eficiente na performace do modelo aumentando em serca de 1% a acurácia.

In [4]:
df_hepatite.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,FGba,Jaundice,...,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,BhG,Baselinehistological staging
0,1,46,1,29,1,2,2,1,2,2,...,57,123,44,40620,538635,637056,336804,31085,4,2
1,3,49,2,33,1,2,1,2,1,2,...,48,77,33,1041941,449939,585688,744463,582301,10,3
2,5,58,2,22,2,2,2,1,2,2,...,73,114,29,1157452,1086852,5,5,5,4,4
3,6,42,2,26,1,1,2,2,2,2,...,84,80,28,325694,1034008,275095,214566,635157,12,4
4,7,48,2,30,1,1,2,2,1,1,...,96,53,39,641129,72050,787295,370605,506296,12,3


### Seperação da váriável target do dataset

In [5]:
X = df_hepatite.drop(['BhG', 'Baselinehistological staging'], axis=1)
y = df_hepatite['Baselinehistological staging']

### Normalização dos dados

Com a normalização dos dados o modelo melhorou de predição almentou em um valor de 6% na sua acurácia.

In [6]:
scaler = StandardScaler()
scaler.fit(X)

X_train = scaler.transform(X)

In [7]:
#função que retorna um dicionário com os valores dos resultados
def model_results(model, X_train, y_train, X_test, y_test,results_dict_aux):
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #medindo e armazenando acurácia, f1-score e auc-score no dicionário
    #accuracy = model.score(X_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    AUC = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovo')
    CM = confusion_matrix(y_test, y_pred)
    #xx = classification_report(y_test, y_pred)
    #print(xx)

    
    results_dict_aux['accuracy'].append(accuracy)
    results_dict_aux['f1'].append(f1)
    results_dict_aux['auc'].append(AUC)
    results_dict_aux['cm'].append(CM)
    print(f"f1: {f1}\n")
    print(f"Accuracy: {accuracy}\n")
    print(f"CM: \n{CM} \n")
    print(f"AUC: %.6f" %(AUC))
    
    '''
    print("-----------------------CURVA ROC---------------------")
    visualizer = ROCAUC(model, encoder={1:"Class 1", 2:"Class 2", 3:"Class 3", 4:"Class 4"})

    visualizer.fit(X_train, y_train)        
    visualizer.score(X_test, y_test)        
    visualizer.show()                       
    #print("-----------------------------------------------------\n")
    '''
    return results_dict_aux

## Grid Search

Para cada modelo é implementada uma função do grid search. Para ser aplicada em cada um dos 10 conjuntos de treino do 10-fold.

**KNN**

In [8]:
def kNN_grid_search(X_train, y_train):
    #lista com números impares para o número de vizinhos do knn
    k_range = [impar for impar in range(1,32) if (impar%2)!=0]
    #listas com formas de considerar a ditância do vizinho
    weights = ['uniform', 'distance']
    #lista com formas de calcular as distâncias
    dist = ['euclidian','manhattan','chebyshev']
    #dicionário com parêmetros para o gridsearch
    param_grid = {
        'n_neighbors': k_range,
        'weights': weights,
        'metric': dist
    }
    # defining parameter range
    grid = GridSearchCV(knn(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    k = grid.best_params_['n_neighbors']
    w = grid.best_params_['weights']
    m = grid.best_params_['metric']

    print("KNN")
    print(f"Melhores parâmetros - k:{k}, w:{w}, m:{m}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return (k,w,m)

**Decision Tree**

In [9]:
def dt_grid_search(X_train, y_train):
    #
    max_depth_range = [x for x in range(1,32)]
    #listas com formas de medir a qualidade do 'split'
    criterion_list = ['gini', 'entropy']
    #dicionário com parêmetros para o gridsearch
    param_grid = {
        'max_depth': max_depth_range,
        'criterion': criterion_list,
    }
    # defining parameter range
    grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    md = grid.best_params_['max_depth']
    c = grid.best_params_['criterion']

    print("DT")
    print(f"Melhores parâmetros - md:{md}, c:{c}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")


    return (md,c)

**MLP**

In [10]:
def mlp_grid_search(X_train, y_train):
    # lista com o número de camadas
    hidden_layer_sizes_list = [(100,), (50, 15, 5), (100, 25, 10)]
    # listas das taxas de aprendizado inicial
    learning_rate_init_list = [0.05, 0.0001]
    # lista de estratégias
    solver_list = ['sgd', 'adam']
    # lista das funções de ativações
    activation_list = ['tanh', 'relu']
    # lista da forma da taxa de aprendizado
    learning_rate_lsit = ['constant', 'adaptive']
    # dicionário com parêmetros para o gridsearch
    param_grid = {
        'hidden_layer_sizes': hidden_layer_sizes_list,
        'activation': activation_list,
        'solver': solver_list,
        'learning_rate': learning_rate_lsit,
        'learning_rate_init': learning_rate_init_list,
    }
    # defining parameter range
    grid = GridSearchCV(MLPClassifier(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    # utilizando melhores parâmetros calculados pelo gridsearch
    hls = grid.best_params_['hidden_layer_sizes']
    a = grid.best_params_['activation']
    s = grid.best_params_['solver']
    lr = grid.best_params_['learning_rate']
    lri = grid.best_params_['learning_rate_init']

    print("MLP")
    print(f"Melhores parâmetros - hls:{hls}, a:{a}, s:{s}, lr:{lr}, lri:{lri}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return (hls, a, s, lr, lri)

**SVM**

In [11]:
def svm_grid_search(X_train, y_train):
    #
    C_list = [0.1, 1, 10, 100]
    kernel_list = ['rbf','sigmoid'] #['linear', 'poly', 'rbf', 'sigmoid']
    gamma_list = [1, 0.1, 0.01, 0.001]
    #dicionário com parêmetros para o gridsearch
    param_grid = {
        'C': C_list,
        'kernel': kernel_list,
        'gamma': gamma_list
    }
    # defining parameter range
    grid = GridSearchCV(SVC(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    c = grid.best_params_['C']
    k = grid.best_params_['kernel']
    g = grid.best_params_['gamma']

    print("SVM")
    print(f"Melhores parâmetros - C:{c}, k:{k}\n, g:{g}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return (c,k,g)

**Regressão Logística Ordinal**

In [12]:
def olr_grid_search(X_train, y_train):

    #dicionário com parêmetros para o gridsearch
    param_grid = {
        'alpha': [0, 0.2, 0.5, 2.0, 5.0]
    }
    # defining parameter range
    grid = GridSearchCV(LogisticAT(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    a = grid.best_params_['alpha']

    print("RLO")
    print(f"Melhores parâmetros - a:{a}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return a

**Random Forest**

In [13]:
def rf_grid_search(X_train, y_train):

    # número de árvores
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]

    # profundidade máxima
    max_depth = [int(x) for x in np.linspace(100, 300, num = 11)]
    max_depth.append(None)
    # grid
    param_grid = {
     'n_estimators': n_estimators,
     'max_depth': max_depth
     }

    # defining parameter range
    grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    e = grid.best_params_['n_estimators']
    m = grid.best_params_['max_depth']

    print("RF")
    print(f"Melhores parâmetros - e:{e}, m:{m}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return (e,m)

**Extreme Gradient Boosting**

In [14]:
def xgb_grid_search(X_train, y_train):

    # dicionário com parêmetros para o gridsearch 
    '''
    param_grid = {
        "objective": ['multi:softmax'],
        "max_depth": [3, 4, 5, 7],
        "learning_rate": [0.1, 0.01, 0.05],
        "gamma": [0, 0.25, 1],
        "reg_lambda": [0, 1, 10],
        "scale_pos_weight": [1, 3, 5],
        "subsample": [0.8],
        "colsample_bytree": [0.5],
    }

    '''
    param_grid = {
        "objective": ['multi:softmax'],
        "max_depth": range (2, 10, 1),
        "n_estimators": range(60, 220, 40),
        "learning_rate": [0.1, 0.01, 0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.5],
    }

    # defining parameter range
    grid = GridSearchCV(XGBClassifier(), param_grid, cv=3, scoring='accuracy')
    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #utilizando melhores parâmetros calculados pelo gridsearch
    dicxgb = grid.best_params_

    print("XGB")
    print(f"Melhores parâmetros - {dicxgb}")
    print(f"Accuracy: %.6f" %(grid.best_score_))
    print("------------------------------------------------")

    return dicxgb

## 10-Fold

Com esse método são criados 10 datasets de treino e 10 datasets de test com uma divisão de 90% para treino e 10% para teste em cada divisão.

O conjunto de treino sera divido mais uma vez em treino e validação (isso é feito dendo da função `GridSearchCV` para que então seja aplicado o GridSearch e assim obtenha-se os melhores parâmetros. Por fim, tendo os melhores parâmetros, utiliza-se o conjunto de teste para que se possa avaliar os resultados.

Esses resultados são obtidos de cada fold e então se tira a média deles para obter-se a avaliação final de cada modelo.

In [15]:
#função que roda os modelos em cada uma das divisões do 10-fold
#e imprime a média e o desvio padrão dos resultados

def evaluate_model_with_kfold(kf):
    results_dict_models = {}
    # listas e dicionarios para salvar as métricas dos resultados de todas as interacoes
    
    results_dict_KNN = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_DT = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_MLP = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_SVM = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_GNB = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_OLR = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_RF = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    results_dict_XGB = {
        'accuracy': [],
        'f1': [],
        'auc': [],
        'cm': []
    }
    
    for train, test in kf.split(X, y):
        X_train, y_train, X_test, y_test = X.iloc[train], y.iloc[train], X.iloc[test], y.iloc[test]
        
        # Método de seleção de features não supervisionado
        # Limiar de variância - retira-se as features com variância menor que 85%
        filter_variance = VarianceThreshold(0.85)
        filter_variance.fit(X_train)
        X_train = filter_variance.transform(X_train)
        X_test = filter_variance.transform(X_test)
        print("Features selecionadas: %d" %(X_train.shape[1]))  

        #kNN
        k,w,m = kNN_grid_search(X_train, y_train)
        model = knn(n_neighbors=k, weights=w, metric=m)
        results_dict_KNN = model_results(model, X_train, y_train, X_test, y_test, results_dict_KNN)

        #DT
        md,c = dt_grid_search(X_train, y_train)
        model = DecisionTreeClassifier(max_depth=md, criterion=c, random_state=199)
        results_dict_DT = model_results(model, X_train, y_train, X_test, y_test, results_dict_DT)

        #MLP 
        hls, a, s, lr, lri = mlp_grid_search(X_train,y_train)
        model = MLPClassifier(
            hidden_layer_sizes=hls, 
            activation=a, 
            solver=s, 
            learning_rate=lr, 
            learning_rate_init=lri, 
            max_iter=2000, 
            tol=0.000001,
            random_state=199
        )
        results_dict_MLP = model_results(model, X_train, y_train, X_test, y_test, results_dict_MLP)
         
        #GNB 
        model = GaussianNB()
        results_dict_GNB = model_results(model, X_train, y_train, X_test, y_test, results_dict_GNB)

        #SVM
        c, k, g = svm_grid_search(X_train, y_train)
        model = SVC(C=c, kernel=k, gamma=g, probability=True, random_state=199)
        results_dict_SVM = model_results(model, X_train, y_train, X_test, y_test, results_dict_SVM)

        # Regressão Logística Ordinal
        a = olr_grid_search(X_train, y_train)
        model = LogisticAT(alpha=a)
        results_dict_OLR = model_results(model, X_train, y_train, X_test, y_test, results_dict_OLR)
        
        # RF
        e,m =  rf_grid_search(X_train, y_train)
        model = RandomForestClassifier(n_estimators=e,max_depth=m)
        results_dict_RF = model_results(model, X_train, y_train, X_test, y_test, results_dict_RF)
      
        # XGB
        dicxgb = xgb_grid_search(X_train, y_train)
        model = XGBClassifier(**dicxgb)
        results_dict_XGB = model_results(model, X_train, y_train, X_test, y_test, results_dict_XGB)

    results_dict_models['KNN'] = results_dict_KNN
    results_dict_models['DT'] = results_dict_DT
    results_dict_models['MLP'] = results_dict_MLP
    results_dict_models['GNB'] = results_dict_GNB
    results_dict_models['SVM'] = results_dict_SVM
    results_dict_models['OLR'] = results_dict_OLR
    results_dict_models['RF'] = results_dict_RF
    results_dict_models['XGB'] = results_dict_XGB

    
    # a cada interação calcula a média e o desvio padrão da 
    # acurácia, f1-score, auc-scor e matriz de confusão de cada modelo
    for model_key in results_dict_models.keys():
        accuracies = np.array(results_dict_models[model_key]['accuracy'])
        f1 = np.array(results_dict_models[model_key]['f1'])
        auc = np.array(results_dict_models[model_key]['auc'])
        conf_matrix = np.array(results_dict_models[model_key]['cm'])

        print(f"\t{model_key}")
        print("Acurácia média (desvio): %.6f +- (%.6f)" %(accuracies.mean(), accuracies.std()))
        print("F1-score média (desvio): %.6f +- (%.6f)" %(f1.mean(), f1.std()))
        print("AUC média (desvio): %.6f +- (%.6f)\n" %(auc.mean(), auc.std()))
        print(f"Matriz de Confusão:  \n{sum(conf_matrix)*0.1}")
        # print(f"Matriz de Confusão:  \n{sum(conf_matrix)*0.1}\n")
    print("------------------------------------------------")

In [16]:
import warnings

In [17]:
%%time
#ignorando warnings
warnings.filterwarnings("ignore")

evaluate_model_with_kfold(StratifiedKFold(n_splits=10, shuffle=True, random_state=199))

Features selecionadas: 20
KNN
Melhores parâmetros - k:19, w:uniform, m:chebyshev
Accuracy: 0.277778
------------------------------------------------
f1: 0.24776950229709707

Accuracy: 0.2446043165467626

CM: 
[[ 7  7 10 10]
 [11  9  8  6]
 [12  5 10  8]
 [12  4 12  8]] 

AUC: 0.503190
DT
Melhores parâmetros - md:1, c:gini
Accuracy: 0.261675
------------------------------------------------
f1: 0.11588342199229212

Accuracy: 0.2589928057553957

CM: 
[[ 0  0 34  0]
 [ 0  0 34  0]
 [ 0  0 35  0]
 [ 0  0 35  1]] 

AUC: 0.504696
MLP
Melhores parâmetros - hls:(50, 15, 5), a:tanh, s:sgd, lr:adaptive, lri:0.05
Accuracy: 0.274557
------------------------------------------------
f1: 0.24964595779664417

Accuracy: 0.2805755395683453

CM: 
[[ 6  8  4 16]
 [ 7  2  7 18]
 [ 6  4  9 16]
 [ 9  1  4 22]] 

AUC: 0.492359
f1: 0.1685577898277304

Accuracy: 0.17985611510791366

CM: 
[[ 2  8  8 16]
 [ 5  7 10 12]
 [ 6  9  4 16]
 [ 8  7  9 12]] 

AUC: 0.461609
SVM
Melhores parâmetros - C:0.1, k:rbf
, g:1
Accu



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 180, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.266506
------------------------------------------------


f1: 0.16803257710464145

Accuracy: 0.17266187050359713

CM: 
[[ 7  9  5 13]
 [ 7  3 10 14]
 [11  7  5 12]
 [11  6 10  9]] 

AUC: 0.477693
Features selecionadas: 20
KNN
Melhores parâmetros - k:5, w:uniform, m:chebyshev
Accuracy: 0.269499
------------------------------------------------
f1: 0.3091570921202301

Accuracy: 0.3115942028985507

CM: 
[[13  8  8  4]
 [10 12  4  7]
 [ 8 11  9  8]
 [ 5 13  9  9]] 

AUC: 0.535695
DT
Melhores parâmetros - md:14, c:gini
Accuracy: 0.266292
------------------------------------------------
f1: 0.24715928093645487

Accuracy: 0.2463768115942029

CM: 
[[11  9  5  8]
 [ 6  7  9 11]
 [ 6  8  9 13]
 [ 9  7 13  7]] 

AUC: 0.499026
MLP
Melhores parâmetros - hls:(100,), a:relu, s:adam, lr:constant, lri:0.0001
Accuracy: 0.273537
------------------------------------------------
f1: 0.23338955543202258

Accuracy: 0.2463768115942029

CM: 
[[ 5 10 10  8]
 [ 5 10  9  9]
 [ 2  8 16 10]
 [ 9  9 15  3]] 

AUC: 0.498536
f1: 0.22796238934490035

Accuracy: 0.23188405797101



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 180, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.266294
------------------------------------------------


f1: 0.23318134158678366

Accuracy: 0.2318840579710145

CM: 
[[ 8 13  6  6]
 [ 8  7 13  5]
 [ 9  9  8 10]
 [13  7  7  9]] 

AUC: 0.485818
Features selecionadas: 20
KNN
Melhores parâmetros - k:15, w:uniform, m:manhattan
Accuracy: 0.255845
------------------------------------------------
f1: 0.3474137579097775

Accuracy: 0.34782608695652173

CM: 
[[14  5  7  7]
 [ 4 10  9 10]
 [13  7 11  5]
 [ 6  5 12 13]] 

AUC: 0.557937
DT
Melhores parâmetros - md:13, c:gini
Accuracy: 0.267908
------------------------------------------------
f1: 0.23237816216687843

Accuracy: 0.2391304347826087

CM: 
[[ 5  9 13  6]
 [ 7  7  9 10]
 [ 4 10 14  8]
 [ 7  9 13  7]] 

AUC: 0.495764
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:adam, lr:adaptive, lri:0.0001
Accuracy: 0.273550
------------------------------------------------
f1: 0.2910351072386705

Accuracy: 0.2971014492753623

CM: 
[[12  7  8  6]
 [ 8  6  7 12]
 [ 9  7  8 12]
 [ 6  6  9 15]] 

AUC: 0.544000
f1: 0.20207761927173779

Accuracy: 0.21739130434782



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.267084
------------------------------------------------


f1: 0.26142235383856055

Accuracy: 0.2608695652173913

CM: 
[[ 8  8 13  4]
 [ 4  8  6 15]
 [ 6  6 12 12]
 [ 6 13  9  8]] 

AUC: 0.482582
Features selecionadas: 20
KNN
Melhores parâmetros - k:19, w:uniform, m:chebyshev
Accuracy: 0.279986
------------------------------------------------
f1: 0.2554147628244383

Accuracy: 0.2536231884057971

CM: 
[[ 6 10 10  7]
 [14  6  5  8]
 [ 6  9 12  9]
 [11  7  7 11]] 

AUC: 0.502164
DT
Melhores parâmetros - md:17, c:gini
Accuracy: 0.255020
------------------------------------------------
f1: 0.18579707533542156

Accuracy: 0.18840579710144928

CM: 
[[ 6  9 11  7]
 [ 4 10 10  9]
 [12  7  6 11]
 [ 8 12 12  4]] 

AUC: 0.464875
MLP
Melhores parâmetros - hls:(100, 25, 10), a:tanh, s:adam, lr:constant, lri:0.0001
Accuracy: 0.271098
------------------------------------------------
f1: 0.2220675799979921

Accuracy: 0.2391304347826087

CM: 
[[ 5  9  7 12]
 [ 6  9  6 12]
 [ 9 10  3 14]
 [ 5 10  5 16]] 

AUC: 0.467519
f1: 0.23318908419650727

Accuracy: 0.2463768



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 60, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.256625
------------------------------------------------


f1: 0.22999993574043323

Accuracy: 0.2391304347826087

CM: 
[[ 5  7 11 10]
 [ 9  6 11  7]
 [12  6  5 13]
 [ 3  8  8 17]] 

AUC: 0.479298
Features selecionadas: 20
KNN
Melhores parâmetros - k:19, w:uniform, m:chebyshev
Accuracy: 0.280787
------------------------------------------------
f1: 0.20470378845664886

Accuracy: 0.21014492753623187

CM: 
[[ 8  6  6 13]
 [10  2 13  8]
 [10  9  9  8]
 [11  7  8 10]] 

AUC: 0.455895
DT
Melhores parâmetros - md:29, c:entropy
Accuracy: 0.259855
------------------------------------------------
f1: 0.21445206370586925

Accuracy: 0.21739130434782608

CM: 
[[ 9  6 11  7]
 [ 7  9  8  9]
 [12 11  4  9]
 [15  7  6  8]] 

AUC: 0.479373
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:sgd, lr:adaptive, lri:0.0001
Accuracy: 0.278329
------------------------------------------------
f1: 0.2508153478373254

Accuracy: 0.2608695652173913

CM: 
[[ 4  8 16  5]
 [ 4  6 11 12]
 [ 4  7 13 12]
 [ 9  5  9 13]] 

AUC: 0.501022
f1: 0.17200961662684489

Accuracy: 0.1884057971



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 60, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.258247
------------------------------------------------


f1: 0.2505871074262787

Accuracy: 0.2536231884057971

CM: 
[[ 6  4 11 12]
 [10  6  6 11]
 [ 3 10 13 10]
 [ 8  9  9 10]] 

AUC: 0.505383
Features selecionadas: 20
KNN
Melhores parâmetros - k:3, w:distance, m:chebyshev
Accuracy: 0.279182
------------------------------------------------
f1: 0.23286976090873868

Accuracy: 0.2391304347826087

CM: 
[[ 7  6 15  5]
 [ 7  5  8 13]
 [ 6  6 14 10]
 [13  6 10  7]] 

AUC: 0.471095
DT
Melhores parâmetros - md:3, c:gini
Accuracy: 0.269511
------------------------------------------------
f1: 0.21022669937540234

Accuracy: 0.2826086956521739

CM: 
[[ 5  1  0 27]
 [ 8  4  2 19]
 [ 3  7  1 25]
 [ 2  4  1 29]] 

AUC: 0.500744
MLP
Melhores parâmetros - hls:(50, 15, 5), a:tanh, s:adam, lr:adaptive, lri:0.0001
Accuracy: 0.261461
------------------------------------------------
f1: 0.20232362582420277

Accuracy: 0.21739130434782608

CM: 
[[ 4  7 10 12]
 [ 4  4 12 13]
 [ 5  5  6 20]
 [ 1  5 14 16]] 

AUC: 0.485336
f1: 0.27500573008031187

Accuracy: 0.289855072



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 140, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.250183
------------------------------------------------


f1: 0.27569168262538074

Accuracy: 0.2753623188405797

CM: 
[[ 8  9  9  7]
 [ 5  8 14  6]
 [ 9  8 11  8]
 [ 9  7  9 11]] 

AUC: 0.479600
Features selecionadas: 20
KNN
Melhores parâmetros - k:11, w:distance, m:chebyshev
Accuracy: 0.280787
------------------------------------------------
f1: 0.18845807482311147

Accuracy: 0.18840579710144928

CM: 
[[ 5  5 12 12]
 [ 7  5 14  7]
 [15  6  6  8]
 [ 7  9 10 10]] 

AUC: 0.472653
DT
Melhores parâmetros - md:3, c:entropy
Accuracy: 0.255024
------------------------------------------------
f1: 0.1548047557200875

Accuracy: 0.2246376811594203

CM: 
[[ 0  0  9 25]
 [ 0  1  8 24]
 [ 0  1  8 26]
 [ 0  0 14 22]] 

AUC: 0.507552
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:sgd, lr:constant, lri:0.0001
Accuracy: 0.272745
------------------------------------------------
f1: 0.2520403144535406

Accuracy: 0.26811594202898553

CM: 
[[ 4  1 19 10]
 [ 3  6 15  9]
 [ 5 10 17  3]
 [ 3  5 18 10]] 

AUC: 0.512276
f1: 0.194672641648067

Accuracy: 0.2101449275362



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 60, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.251805
------------------------------------------------


f1: 0.21515254538917628

Accuracy: 0.2318840579710145

CM: 
[[ 3  8 10 13]
 [ 6  5 10 12]
 [ 9  4  7 15]
 [ 6  4  9 17]] 

AUC: 0.463303
Features selecionadas: 20
KNN
Melhores parâmetros - k:17, w:uniform, m:chebyshev
Accuracy: 0.276767
------------------------------------------------
f1: 0.2596125721262874

Accuracy: 0.2608695652173913

CM: 
[[ 6  8  8 12]
 [ 8  7  9  9]
 [14  3 11  7]
 [ 7  9  8 12]] 

AUC: 0.538628
DT
Melhores parâmetros - md:2, c:entropy
Accuracy: 0.259054
------------------------------------------------
f1: 0.1702454895001479

Accuracy: 0.2753623188405797

CM: 
[[ 0  7  0 27]
 [ 0  8  0 25]
 [ 0  6  0 29]
 [ 0  6  0 30]] 

AUC: 0.526822
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:sgd, lr:adaptive, lri:0.0001
Accuracy: 0.267074
------------------------------------------------
f1: 0.272291687638029

Accuracy: 0.2971014492753623

CM: 
[[ 2 12 13  7]
 [ 4 18  5  6]
 [ 2  9 10 14]
 [ 1  8 16 11]] 

AUC: 0.569431
f1: 0.26652611285038574

Accuracy: 0.2681159420289855



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 140, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.257437
------------------------------------------------


f1: 0.1893069889780796

Accuracy: 0.1956521739130435

CM: 
[[ 6  8 11  9]
 [12  3  9  9]
 [ 9  5  6 15]
 [12  6  6 12]] 

AUC: 0.438571
Features selecionadas: 20
KNN
Melhores parâmetros - k:17, w:uniform, m:chebyshev
Accuracy: 0.286417
------------------------------------------------
f1: 0.26320827741553177

Accuracy: 0.26811594202898553

CM: 
[[ 7  9 13  5]
 [ 7  6 10 10]
 [ 6  7 14  8]
 [ 4  8 14 10]] 

AUC: 0.508556
DT
Melhores parâmetros - md:16, c:gini
Accuracy: 0.268710
------------------------------------------------
f1: 0.26852353436860477

Accuracy: 0.26811594202898553

CM: 
[[ 6 13  3 12]
 [ 7 11  9  6]
 [ 9  7 11  8]
 [12  8  7  9]] 

AUC: 0.508931
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:adam, lr:adaptive, lri:0.0001
Accuracy: 0.270293
------------------------------------------------
f1: 0.23920154381081404

Accuracy: 0.2391304347826087

CM: 
[[ 8  9  8  9]
 [ 4  9 11  9]
 [ 6 13  5 11]
 [ 6  9 10 11]] 

AUC: 0.536427
f1: 0.23193608966983756

Accuracy: 0.260869565217



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 60, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.252610
------------------------------------------------


f1: 0.22993905834722697

Accuracy: 0.2463768115942029

CM: 
[[ 2  8 12 12]
 [ 4  7  6 16]
 [ 5  5 11 14]
 [ 4  9  9 14]] 

AUC: 0.491201
Features selecionadas: 20
KNN
Melhores parâmetros - k:19, w:uniform, m:chebyshev
Accuracy: 0.279190
------------------------------------------------
f1: 0.24260146982091255

Accuracy: 0.2463768115942029

CM: 
[[10 11  4  9]
 [ 9  5  9 10]
 [ 7  9  6 13]
 [ 7  8  8 13]] 

AUC: 0.498039
DT
Melhores parâmetros - md:3, c:entropy
Accuracy: 0.276763
------------------------------------------------
f1: 0.19024479356960178

Accuracy: 0.2971014492753623

CM: 
[[ 0 16  0 18]
 [ 0 11  0 22]
 [ 0  5  0 30]
 [ 0  5  1 30]] 

AUC: 0.473579
MLP
Melhores parâmetros - hls:(100,), a:tanh, s:sgd, lr:adaptive, lri:0.0001
Accuracy: 0.274309
------------------------------------------------
f1: 0.2590813485224857

Accuracy: 0.2753623188405797

CM: 
[[ 4  8 14  8]
 [ 7  4 13  9]
 [ 8  5 17  5]
 [ 8  4 11 13]] 

AUC: 0.520421
f1: 0.2847382457711229

Accuracy: 0.29710144927536



















XGB
Melhores parâmetros - {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'objective': 'multi:softmax', 'subsample': 0.8}
Accuracy: 0.257439
------------------------------------------------
f1: 0.24264398832488043

Accuracy: 0.2536231884057971

CM: 
[[ 6  5 13 10]
 [10  4  7 12]
 [ 9  8  8 10]
 [ 5  9  5 17]] 

AUC: 0.523829
	KNN
Acurácia média (desvio): 0.257069 +- (0.043435)
F1-score média (desvio): 0.255121 +- (0.043835)
AUC média (desvio): 0.504385 +- (0.030905)

Matriz de Confusão:  
[[ 8.3  7.5  9.3  8.4]
 [ 8.7  6.7  8.9  8.8]
 [ 9.7  7.2 10.2  8.4]
 [ 8.3  7.6  9.8 10.3]]
	DT
Acurácia média (desvio): 0.249812 +- (0.031568)
F1-score média (desvio): 0.198972 +- (0.043094)
AUC média (desvio): 0.496136 +- (0.017623)

Matriz de Confusão:  
[[ 4.2  7.   8.6 13.7]
 [ 3.9  6.8  8.9 13.5]
 [ 4.6  6.2  8.8 15.9]
 [ 5.3  5.8 10.2 14.7]]
	MLP
Acurácia média (desvio): 0.262116 +- (0.025113)
F1-score média (desvio): 0.247189 +- (0.023738)
AUC média (desvi

In [27]:
####################################################################################