In [26]:
import numpy as np
import pandas as pd

<h2>Carregando dados</h2>

In [2]:
data = np.genfromtxt('diabetes.csv', delimiter=',')

In [3]:
labels = data[:,-1]
data = data[:,:-1]

<h2>Normalização</h2>

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

In [6]:
np.min(data[:,0]), np.max(data[:,0])

(0.0, 1.0)

<h2>Treinamento</h2>

In [13]:
# Modelos
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# K-fold CrossValidation
from sklearn.model_selection import KFold

# Métricas
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.utils import resample
from collections import Counter
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle

In [8]:
def oversampling(x, y, random_state=0):
    cnt = Counter()
    for cat in y:
        cnt[cat] += 1

    classe_majo = cnt.most_common()[0][0]
    num_samples = cnt.most_common()[0][1]

    dados_bal = []
    labels_bal = []
    for classes in np.unique(y):
        if not classes == classe_majo:
            dados = x[y == classes]
            label = y[y == classes]

            sampled_dados, sampled_label = resample(dados, 
                                           label, 
                                           n_samples=num_samples,
                                           random_state=random_state)

            dados_bal.append(sampled_dados)
            labels_bal.append(sampled_label)
        else:
            dados_bal.append(x[y == classe_majo])
            labels_bal.append(y[y == classe_majo])

    return np.vstack(dados_bal), np.hstack(labels_bal)

In [29]:
n_modelos = 5
kf = KFold(n_splits=10)
num_samples = 900
random_state=42

performance = {
    'acuracia': [],
    'recall': [],
    'precisao': []
}

balanced_data, balanced_labels = oversampling(data,labels,random_state)
balanced_data, balanced_labels = shuffle(balanced_data, balanced_labels, random_state=random_state)
curr_fold = 1

for train_index, test_index in kf.split(balanced_data):
    
    X_train, X_test = balanced_data[train_index], balanced_data[test_index]
    y_train, y_test = balanced_labels[train_index], balanced_labels[test_index]
    
    print('Validação Cruzada - Fold {}'.format(curr_fold))
    print('Conjunto de treinamento - Dados {} - {}'.format(X_train.shape, y_train.shape))
    print('Conjunto de teste - Dados {} - {}'.format(X_test.shape, y_test.shape))
    print('____________________________________________________________________________')
    
    # Bagging
    predictions = []
    ensemble_predictions = []
    for n in range(n_modelos):
        sample_X_train, sample_y_train = resample(X_train, 
                                                  y_train, 
                                                  n_samples=num_samples,
                                                  random_state=random_state)
        
        print('Modelo MLP {}'.format(n+1))
        
        mlp = MLPClassifier()
        
        print('Após reamostragem....')
        print('Conjunto de treinamento - Dados {} - {}'.format(sample_X_train.shape, sample_y_train.shape))
        
        # Treino de cada modelo
        mlp.fit(sample_X_train, sample_y_train)
        
        # Avaliação
        predictions.append(mlp.predict(X_test))
        
    predictions = np.vstack(predictions)
    print('Predição de todos os modelos {}'.format(predictions.shape))
    
    # Voto Majoritário
    for col in range(predictions.shape[1]):
        cnt = Counter()
        
        votes = predictions[:,col]
        for vote in votes:
            cnt[vote] += 1
        ensemble_predictions.append(cnt.most_common()[0][0]) # Categoria mais votada
        
    
    print('____________________________________________________________________________')
    
    # Cálculo de métricas
    ens_acc = accuracy_score(y_test, ensemble_predictions)
    ens_recall = recall_score(y_test, ensemble_predictions)
    ens_precisao = precision_score(y_test, ensemble_predictions)
    
    print('Métricas - Fold {}'.format(curr_fold))
    print('Acurácia: {}'.format(ens_acc))
    print('Recall: {}'.format(ens_recall))
    print('Precision: {}'.format(ens_precisao))
    print('############################################################################')
    
    performance['acuracia'].append(ens_acc)
    performance['recall'].append(ens_recall)
    performance['precisao'].append(ens_precisao)
    
    curr_fold += 1
    
    
resultados = pd.DataFrame(performance)

Validação Cruzada - Fold 1
Conjunto de treinamento - Dados (900, 8) - (900,)
Conjunto de teste - Dados (100, 8) - (100,)
____________________________________________________________________________
Modelo MLP 1
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 2
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 3
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 4
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 5
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Predição de todos os modelos (5, 100)
____________________________________________________________________________
Métricas - Fold 1
Acurácia: 0.82
Recall: 0.865384615385
Precision: 0.803571428571
############################################################################
Validação Cruzada - Fold 2
Conjunto de treinamento - Dados (900, 8) - (900,)
Conjunto de teste - Dados 

Modelo MLP 2
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 3
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 4
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Modelo MLP 5
Após reamostragem....
Conjunto de treinamento - Dados (900, 8) - (900,)
Predição de todos os modelos (5, 100)
____________________________________________________________________________
Métricas - Fold 10
Acurácia: 0.73
Recall: 0.704545454545
Precision: 0.688888888889
############################################################################


In [30]:
resultados

Unnamed: 0,acuracia,precisao,recall
0,0.82,0.803571,0.865385
1,0.8,0.82,0.788462
2,0.76,0.761905,0.695652
3,0.77,0.702128,0.785714
4,0.8,0.75,0.875
5,0.7,0.681818,0.652174
6,0.71,0.795455,0.636364
7,0.71,0.792453,0.7
8,0.75,0.777778,0.763636
9,0.73,0.688889,0.704545


In [31]:
resultados.describe()

Unnamed: 0,acuracia,precisao,recall
count,10.0,10.0,10.0
mean,0.755,0.7574,0.746693
std,0.042492,0.050199,0.082847
min,0.7,0.681818,0.636364
25%,0.715,0.714096,0.696739
50%,0.755,0.769841,0.734091
75%,0.7925,0.794704,0.787775
max,0.82,0.82,0.875
