In [1]:
import numpy as np

<h2>Carregando dados</h2>

In [2]:
data = np.genfromtxt('diabetes.csv', delimiter=',')

In [3]:
labels = data[:,-1]
data = data[:,:-1]

<h2>Normalização</h2>

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

In [6]:
np.min(data[:,0]), np.max(data[:,0])

(0.0, 1.0)

<h2>Treinamento</h2>

In [7]:
# Modelos
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# K-fold CrossValidation
from sklearn.model_selection import KFold

# Métricas
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [48]:
performance = {}

modelos = ['bagging']

for modelo in modelos:
    performance[modelo] = {
        'acuracia': [],
        'recall': [],
        'precisao': []
    }

In [41]:
from sklearn.utils import resample
from collections import Counter
from sklearn.model_selection import train_test_split

In [43]:
def balancear_dados(x, y, estrategia='oversampling'):
    
    if estrategia == 'oversampling':
        cnt = Counter()
        for cat in y:
            cnt[cat] += 1
        
        classe_majo = cnt.most_common()[0][0]
        num_samples = cnt.most_common()[0][1]
        
        dados_bal = []
        labels_bal = []
        for classes in np.unique(y):
            if not classes == classe_majo:
                dados = x[y == classes]
                label = y[y == classes]
                
                sampled_dados, sampled_label = resample(dados, 
                                               label, 
                                               n_samples=num_samples)
                
                dados_bal.append(sampled_dados)
                labels_bal.append(sampled_label)
            else:
                dados_bal.append(x[y == classe_majo])
                labels_bal.append(y[y == classe_majo])
        
        return np.vstack(dados_bal), np.hstack(labels_bal)

In [44]:
X_train.shape

(692L, 8L)

In [45]:
balancear_dados(X_train, y_train)[0].shape

(908L, 8L)

In [49]:
n_modelos = 5
kf = KFold(n_splits=10)
num_samples = 400

for train_index, test_index in kf.split(data):
    
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    predictions = []
    ensemble_predictions = []
    for n in range(n_modelos):
        sample_X_train, sample_y_train = resample(X_train, 
                                                  y_train, 
                                                  n_samples=num_samples)
        # Balanceamento
        sample_X_train, sample_y_train = balancear_dados(sample_X_train, sample_y_train)
        
        mlp = MLPClassifier()
        # Treino de cada modelo
        mlp.fit(sample_X_train, sample_y_train)
        
        # Avaliação
        predictions.append(mlp.predict(X_test))
        
    predictions = np.vstack(predictions)
    # Voto Majoritário
    cnt = Counter()
    for col in range(predictions.shape[1]):
        votes = predictions[:,col]
        for vote in votes:
            cnt[vote] += 1
        ensemble_predictions.append(cnt.most_common()[0][0]) # Categoria mais votada
    
    # Cálculo de métricas
    ens_acc = accuracy_score(y_test, ensemble_predictions)
    ens_recall = recall_score(y_test, ensemble_predictions)
    ens_precisao = precision_score(y_test, ensemble_predictions)
    
    performance['bagging']['acuracia'].append(ens_acc)
    performance['bagging']['recall'].append(ens_recall)
    performance['bagging']['precisao'].append(ens_precisao)

In [50]:
performance

{'bagging': {'acuracia': [0.5974025974025974,
   0.7142857142857143,
   0.5454545454545454,
   0.42857142857142855,
   0.6493506493506493,
   0.6103896103896104,
   0.8181818181818182,
   0.7012987012987013,
   0.6842105263157895,
   0.6052631578947368],
  'precisao': [0.5128205128205128,
   0.0,
   0.4,
   0.39705882352941174,
   0.0,
   0.0,
   0.0,
   0.625,
   0.0,
   0.5],
  'recall': [0.625,
   0.0,
   0.058823529411764705,
   0.9,
   0.0,
   0.0,
   0.0,
   0.2,
   0.0,
   0.13333333333333333]}}

In [28]:
cnt.most_common()[0][0]

1