In [1]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('sonar.all-data.csv', delimiter=',', header=None)

In [9]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [10]:
data = df.get_values()

In [12]:
labels = data[:,-1] # Selecionando a última coluna
data = data[:,:-1] # Selecionando da primeira coluna até a penúltima

In [16]:
data = data.astype(np.float)

In [19]:
# Mapeamento categoria -> índice
class_dict = {
    'R': 0,
    'M': 1
}

labels = np.array([class_dict[c] for c in labels])

<h2>Comparação de resultados com vs sem feature selection</h2>

In [25]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import *

In [24]:
kf = KFold(n_splits=10, random_state=0)

In [26]:
resultados = {
    'feature_selection': [],
    'classificador': [],
    'avg_acuracia': [],
    'avg_recall': [],
    'avg_precision': [],
}

acc_svm = []
acc_mlp = []
acc_tree = []

recall_svm = []
recall_mlp = []
recall_tree = []

precision_svm = []
precision_mlp = []
precision_tree = []

for train_index, test_index in kf.split(data):
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    svm = SVC()
    mlp = MLPClassifier()
    tree = DecisionTreeClassifier()
    
    # Treinamento
    svm.fit(X_train, y_train)
    mlp.fit(X_train, y_train)
    tree.fit(X_train, y_train)
    
    # Teste
    svm_pred = svm.predict(X_test)
    mlp_pred = mlp.predict(X_test)
    tree_pred = tree.predict(X_test)
    
    # Avaliação
    ## Acuracia
    acc_svm.append(accuracy_score(y_test, svm_pred))
    acc_mlp.append(accuracy_score(y_test, mlp_pred))
    acc_tree.append(accuracy_score(y_test, tree_pred))
    
    ## Recall
    recall_svm.append(recall_score(y_test, svm_pred, average='weighted'))
    recall_mlp.append(recall_score(y_test, mlp_pred, average='weighted'))
    recall_tree.append(recall_score(y_test, tree_pred, average='weighted'))
    
    ## Precisao
    precision_svm.append(precision_score(y_test, svm_pred, average='weighted'))
    precision_mlp.append(precision_score(y_test, mlp_pred, average='weighted'))
    precision_tree.append(precision_score(y_test, tree_pred, average='weighted'))



In [27]:
classif = ['RBF SVM', 'MLP', 'Decision Tree']

for c in classif:
    resultados['classificador'].append(c)
    resultados['feature_selection'].append(False)
    if c == 'RBF SVM':
        resultados['avg_acuracia'].append(np.mean(acc_svm))
        resultados['avg_recall'].append(np.mean(recall_svm))
        resultados['avg_precision'].append(np.mean(precision_svm))
    elif c == 'MLP':
        resultados['avg_acuracia'].append(np.mean(acc_mlp))
        resultados['avg_recall'].append(np.mean(recall_mlp))
        resultados['avg_precision'].append(np.mean(precision_mlp))
    else:
        resultados['avg_acuracia'].append(np.mean(acc_tree))
        resultados['avg_recall'].append(np.mean(recall_tree))
        resultados['avg_precision'].append(np.mean(precision_tree))

In [28]:
pd.DataFrame(resultados)

Unnamed: 0,avg_acuracia,avg_precision,avg_recall,classificador,feature_selection
0,0.21881,0.21881,0.21881,RBF SVM,False
1,0.51619,0.51619,0.51619,MLP,False
2,0.530476,0.530476,0.530476,Decision Tree,False


In [29]:
from sklearn.feature_selection import mutual_info_classif

In [30]:
mi = mutual_info_classif(data, labels)

In [33]:
data.shape

(208L, 60L)

In [39]:
new_data = data[:, mi > 0]

In [40]:
acc_svm = []
acc_mlp = []
acc_tree = []

recall_svm = []
recall_mlp = []
recall_tree = []

precision_svm = []
precision_mlp = []
precision_tree = []

for train_index, test_index in kf.split(new_data):
    X_train, X_test = new_data[train_index], new_data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    svm = SVC()
    mlp = MLPClassifier()
    tree = DecisionTreeClassifier()
    
    # Treinamento
    svm.fit(X_train, y_train)
    mlp.fit(X_train, y_train)
    tree.fit(X_train, y_train)
    
    # Teste
    svm_pred = svm.predict(X_test)
    mlp_pred = mlp.predict(X_test)
    tree_pred = tree.predict(X_test)
    
    # Avaliação
    ## Acuracia
    acc_svm.append(accuracy_score(y_test, svm_pred))
    acc_mlp.append(accuracy_score(y_test, mlp_pred))
    acc_tree.append(accuracy_score(y_test, tree_pred))
    
    ## Recall
    recall_svm.append(recall_score(y_test, svm_pred, average='weighted'))
    recall_mlp.append(recall_score(y_test, mlp_pred, average='weighted'))
    recall_tree.append(recall_score(y_test, tree_pred, average='weighted'))
    
    ## Precisao
    precision_svm.append(precision_score(y_test, svm_pred, average='weighted'))
    precision_mlp.append(precision_score(y_test, mlp_pred, average='weighted'))
    precision_tree.append(precision_score(y_test, tree_pred, average='weighted'))

In [41]:
classif = ['RBF SVM', 'MLP', 'Decision Tree']

for c in classif:
    resultados['classificador'].append(c)
    resultados['feature_selection'].append(True)
    if c == 'RBF SVM':
        resultados['avg_acuracia'].append(np.mean(acc_svm))
        resultados['avg_recall'].append(np.mean(recall_svm))
        resultados['avg_precision'].append(np.mean(precision_svm))
    elif c == 'MLP':
        resultados['avg_acuracia'].append(np.mean(acc_mlp))
        resultados['avg_recall'].append(np.mean(recall_mlp))
        resultados['avg_precision'].append(np.mean(precision_mlp))
    else:
        resultados['avg_acuracia'].append(np.mean(acc_tree))
        resultados['avg_recall'].append(np.mean(recall_tree))
        resultados['avg_precision'].append(np.mean(precision_tree))

In [42]:
pd.DataFrame(resultados)

Unnamed: 0,avg_acuracia,avg_precision,avg_recall,classificador,feature_selection
0,0.21881,0.21881,0.21881,RBF SVM,False
1,0.51619,0.51619,0.51619,MLP,False
2,0.530476,0.530476,0.530476,Decision Tree,False
3,0.329286,0.329286,0.329286,RBF SVM,True
4,0.617619,0.617619,0.617619,MLP,True
5,0.539762,0.539762,0.539762,Decision Tree,True
6,0.262143,0.262143,0.262143,RBF SVM,True
7,0.597619,0.597619,0.597619,MLP,True
8,0.521667,0.521667,0.521667,Decision Tree,True
