In [10]:
import numpy as np
import pandas as pd

In [30]:
# Read the comma-separated file; header=None makes pandas treat every row as data
# and label the columns with integers.
df = pd.read_csv('diabetes.csv', sep=',', header=None)

In [34]:
# df

In [35]:
# DataFrame.get_values() was deprecated in pandas 0.25 and removed in 1.0.
# .values yields the same NumPy array and is available on every pandas version.
data = df.values

In [36]:
# Split features and target from the frame itself rather than from `data`:
# slicing `data` out of `data` made this cell non-idempotent (each re-run
# silently dropped one more feature column).
raw_values = df.values
labels = raw_values[:, -1]   # last column
data = raw_values[:, :-1]    # first column through the second-to-last

In [15]:
# data = data.astype(float)  # np.float was removed in NumPy 1.24; the builtin float is equivalent

In [37]:
# # Category -> index mapping (unused here: the labels in this dataset are already 0/1)
# class_dict = {
#     'R': 0,
#     'M': 1
# }

# labels = np.array([class_dict[c] for c in labels])

<h2>Comparação de resultados com vs sem feature selection</h2>

In [38]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import *

In [39]:
# 10-fold split in file order (no shuffling), exactly as before.
# random_state was removed: it has no effect unless shuffle=True, and
# scikit-learn >= 0.24 raises ValueError when it is set with shuffle=False.
# To shuffle reproducibly instead, use KFold(n_splits=10, shuffle=True, random_state=0).
kf = KFold(n_splits=10)

In [40]:
# Accumulator for the summary table: one (initially empty) list per output column.
resultados = {
    coluna: []
    for coluna in (
        'feature_selection',
        'classificador',
        'avg_acuracia',
        'avg_recall',
        'avg_precision',
    )
}

# Per-fold scores, one list per (metric, classifier) pair; they are averaged
# into `resultados` by the following cell.
acc_svm, acc_mlp, acc_tree = [], [], []
recall_svm, recall_mlp, recall_tree = [], [], []
precision_svm, precision_mlp, precision_tree = [], [], []

# NOTE(review): features are fed to SVC/MLP unscaled. The recorded SVM accuracy
# (0.651) matches the majority-class share 500/768, which suggests the SVM is
# predicting a single class -- consider standardising the features; confirm.

for train_index, test_index in kf.split(data):
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fresh, unfitted models for every fold. MLPClassifier and
    # DecisionTreeClassifier are randomised, so they are seeded to make the
    # table reproducible under Restart & Run All.
    svm = SVC()
    mlp = MLPClassifier(random_state=0)
    tree = DecisionTreeClassifier(max_depth=5, random_state=0)

    # Train, predict and score each model, appending to its own score lists.
    for model, acc_scores, recall_scores, precision_scores in (
        (svm, acc_svm, recall_svm, precision_svm),
        (mlp, acc_mlp, recall_mlp, precision_mlp),
        (tree, acc_tree, recall_tree, precision_tree),
    ):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        # average='weighted': per-class score weighted by each class's support.
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))

In [41]:
classif = ['RBF SVM', 'MLP', 'Decision Tree']

# Per-fold score lists for each display name, as (accuracy, recall, precision).
scores_by_name = {
    'RBF SVM': (acc_svm, recall_svm, precision_svm),
    'MLP': (acc_mlp, recall_mlp, precision_mlp),
    'Decision Tree': (acc_tree, recall_tree, precision_tree),
}

# Append one summary row per classifier: the mean of each metric over the folds,
# flagged as obtained without feature selection.
for c in classif:
    fold_acc, fold_recall, fold_precision = scores_by_name[c]
    resultados['classificador'].append(c)
    resultados['feature_selection'].append(False)
    resultados['avg_acuracia'].append(np.mean(fold_acc))
    resultados['avg_recall'].append(np.mean(fold_recall))
    resultados['avg_precision'].append(np.mean(fold_precision))

In [42]:
# Render the accumulated summary rows as a table.
pd.DataFrame(data=resultados)

Unnamed: 0,avg_acuracia,avg_precision,avg_recall,classificador,feature_selection
0,0.651025,0.429038,0.651025,RBF SVM,False
1,0.674436,0.680897,0.674436,MLP,False
2,0.748684,0.74767,0.748684,Decision Tree,False


In [43]:
from sklearn.feature_selection import mutual_info_classif

In [44]:
# Mutual information between each feature column and the label.
# The estimator is randomised (it adds small noise to continuous features), and
# two of the recorded scores sit close to the 0.03 cut-off applied below, so an
# unseeded run can select a different feature set. Seed it for reproducibility.
# NOTE(review): this is estimated on all rows, including those later used as
# test folds, so the feature-selection scores may be optimistic -- consider
# selecting features inside each training fold.
mi = mutual_info_classif(data, labels, random_state=0)

In [46]:
# Show the mutual-information estimate of every feature column.
mi

array([0.05275893, 0.144012  , 0.        , 0.04417186, 0.02379815,
       0.08036976, 0.01349616, 0.07515459])

In [47]:
# (rows, feature columns) before feature selection.
data.shape

(768L, 8L)

In [48]:
# Keep only the feature columns whose mutual-information estimate exceeds 0.03.
keep_mask = np.greater(mi, 0.03)
new_data = data[:, keep_mask]

In [49]:
# (rows, feature columns) after feature selection.
new_data.shape

(768L, 5L)

In [50]:
# Reset the per-fold score lists, then repeat the cross-validation on the
# reduced feature matrix `new_data`.
acc_svm, acc_mlp, acc_tree = [], [], []
recall_svm, recall_mlp, recall_tree = [], [], []
precision_svm, precision_mlp, precision_tree = [], [], []

# NOTE(review): `new_data` was selected using mutual information computed on the
# full dataset, so each test fold influenced which features were kept --
# consider running the selection on X_train inside the loop instead.

for train_index, test_index in kf.split(new_data):
    X_train, X_test = new_data[train_index], new_data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fresh, unfitted models for every fold. MLPClassifier and
    # DecisionTreeClassifier are randomised, so they are seeded to make the
    # table reproducible under Restart & Run All.
    svm = SVC()
    mlp = MLPClassifier(random_state=0)
    tree = DecisionTreeClassifier(max_depth=5, random_state=0)

    # Train, predict and score each model, appending to its own score lists.
    for model, acc_scores, recall_scores, precision_scores in (
        (svm, acc_svm, recall_svm, precision_svm),
        (mlp, acc_mlp, recall_mlp, precision_mlp),
        (tree, acc_tree, recall_tree, precision_tree),
    ):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        # average='weighted': per-class score weighted by each class's support.
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))

In [51]:
classif = ['RBF SVM', 'MLP', 'Decision Tree']

# Per-fold score lists for each display name, as (accuracy, recall, precision).
scores_by_name = {
    'RBF SVM': (acc_svm, recall_svm, precision_svm),
    'MLP': (acc_mlp, recall_mlp, precision_mlp),
    'Decision Tree': (acc_tree, recall_tree, precision_tree),
}

# Append one summary row per classifier: the mean of each metric over the folds,
# flagged as obtained with feature selection.
for c in classif:
    fold_acc, fold_recall, fold_precision = scores_by_name[c]
    resultados['classificador'].append(c)
    resultados['feature_selection'].append(True)
    resultados['avg_acuracia'].append(np.mean(fold_acc))
    resultados['avg_recall'].append(np.mean(fold_recall))
    resultados['avg_precision'].append(np.mean(fold_precision))

In [52]:
# Render the summary table, now holding both the baseline and the
# feature-selection rows.
pd.DataFrame(data=resultados)

Unnamed: 0,avg_acuracia,avg_precision,avg_recall,classificador,feature_selection
0,0.651025,0.429038,0.651025,RBF SVM,False
1,0.674436,0.680897,0.674436,MLP,False
2,0.748684,0.74767,0.748684,Decision Tree,False
3,0.651025,0.429038,0.651025,RBF SVM,True
4,0.677085,0.671353,0.677085,MLP,True
5,0.738209,0.747517,0.738209,Decision Tree,True


In [53]:
# Class balance: number of samples labelled 0 and number labelled 1.
tuple(np.count_nonzero(labels == classe) for classe in (0, 1))

(500, 268)