<h5> Importações das bibliotecas utilizadas</h5>

In [1]:
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
from sklearn import preprocessing
from scipy.io.arff import loadarff
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import ParameterGrid
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings
import glob
import sys
warnings.filterwarnings("ignore")

<h5> Caminhos dos arquivos arff que foram gerados </h5>

In [2]:
# Directories that hold the generated .arff feature files, one per image dataset.
# Every path ends with "/" because the descriptor file name is appended to it
# by plain string concatenation.
path_alien = "arff_files/Alien-vs-Predator/"
path_fruits = "arff_files/Fruits/"
path_game = "arff_files/Rock-paper-scissor/"
path_iris = "arff_files/Iris/"
path_malaria = "arff_files/Malaria-Cell/"

<h5> Classe que realiza a classificação, geração de plots, normalização, entre outros. </h5>

In [3]:
class TrainAllClassifiers(BaseEstimator, TransformerMixin):
    """Train several scikit-learn classifiers on one .arff feature file and report metrics.

    The estimator loads a descriptor file, encodes the class labels as integers,
    splits the data 80/20, optionally scales the features and fits every
    requested classifier, timing each fit.

    Parameters
    ----------
    path_dataset : str or None
        Directory prefix of the dataset. The descriptor file name is appended
        by string concatenation, so the prefix must end with a path separator.
        Must be set (e.g. through ``set_params``) before calling ``fit``.
    selectedNormalization : int or None
        1 = MinMaxScaler, 2 = StandardScaler, 3 = MaxAbsScaler,
        4 = RobustScaler. Any other value (including the default 0 and None)
        leaves the features unscaled.
    arff_selected : int
        Index (0-7) of the descriptor file inside ``path_dataset``:
        CEDD, FCTH, Gabor, GCH, JCD, LBP, MPOC, ReferenceColorSimilarity.
    just_these_classifiers : list of str or None
        Classifier keys to train, among "gnb", "logreg", "dectree", "knn",
        "lda", "svm", "rf" and "nnet". None trains all of them, in that order.
    """

    # Descriptor file names, indexed by ``arff_selected``.
    _ARFF_FILES = {
        0: "CEDD.arff",
        1: "FCTH.arff",
        2: "Gabor.arff",
        3: "GCH.arff",
        4: "JCD.arff",
        5: "LBP.arff",
        6: "MPOC.arff",
        7: "ReferenceColorSimilarity.arff",
    }

    # Order used when ``just_these_classifiers`` is None; callers index the
    # score arrays positionally, so this order is part of the contract.
    _DEFAULT_CLASSIFIERS = ("gnb", "logreg", "dectree", "knn", "lda", "svm", "rf", "nnet")

    def __init__(self, path_dataset = None, selectedNormalization = 0, arff_selected = 0, just_these_classifiers = None):
        # scikit-learn's get_params/set_params require that every constructor
        # argument is stored unchanged under the same attribute name.
        self.path_dataset = path_dataset
        self.selectedNormalization = selectedNormalization
        self.arff_selected = arff_selected
        self.just_these_classifiers = just_these_classifiers

    def _get_classifiers(self, classifiers):
        """Instantiate the classifiers named in ``classifiers``, preserving their order.

        Each new estimator is also stored on ``self`` under its key
        (``self.gnb``, ``self.rf``, ...).

        Raises
        ------
        ValueError
            If a name is not a known classifier key. Silently skipping it would
            shift the positions of the remaining classifiers in the score arrays.
        """
        factories = {
            "gnb": (GaussianNB, {}),
            "logreg": (LogisticRegression, {"random_state": 42}),
            "dectree": (DecisionTreeClassifier, {"random_state": 42}),
            "knn": (KNeighborsClassifier, {"n_neighbors": 3}),
            "lda": (LinearDiscriminantAnalysis, {}),
            "svm": (SVC, {"random_state": 42}),
            "rf": (RandomForestClassifier, {"random_state": 42}),
            "nnet": (MLPClassifier, {"alpha": 1, "random_state": 42}),
        }

        clfs = []
        for name in classifiers:
            if name not in factories:
                raise ValueError(
                    "Unknown classifier {!r}; expected one of {}".format(name, sorted(factories))
                )
            estimator_cls, kwargs = factories[name]
            clf = estimator_cls(**kwargs)
            setattr(self, name, clf)
            clfs.append(clf)

        return clfs

    def _preprocess(self):
        """Load ``self.path``, encode the labels and create the train/test split."""
        raw_data = loadarff(self.path)
        # Turn the .arff records into a pandas DataFrame.
        df = pd.DataFrame(raw_data[0])

        # Features: every column except the last one.
        # NOTE(review): assumes the 'class' attribute is the last column - confirm.
        X = df.iloc[:, 0:-1].values

        # Map every distinct label to a stable integer code (1, 2, ... in order
        # of first appearance). A repeated label must reuse the code it was
        # first given, regardless of how the rows are ordered in the file.
        label_codes = {}
        y = [label_codes.setdefault(label, len(label_codes) + 1) for label in df['class']]

        # 80% train / 20% test; the fixed random_state makes the split reproducible.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=327)

    def _choose_scenario(self):
        """Resolve the scaler and the .arff path from the current parameters.

        Also resets ``self.acc_time``, the list of per-classifier fit times.

        Raises
        ------
        ValueError
            If ``path_dataset`` is None or ``arff_selected`` is not in 0-7.
        """
        self.acc_time = []

        scalers = {
            1: preprocessing.MinMaxScaler,
            2: preprocessing.StandardScaler,
            3: preprocessing.MaxAbsScaler,
            4: preprocessing.RobustScaler,
        }
        scaler_cls = scalers.get(self.selectedNormalization)
        # Unknown codes (0, None, ...) mean "no normalization".
        self.scaler = scaler_cls() if scaler_cls is not None else None

        self.path = None
        if self.path_dataset is None:
            raise ValueError("path_dataset must be set before calling fit()")
        arff_file = self._ARFF_FILES.get(self.arff_selected)
        if arff_file is None:
            raise ValueError(
                "arff_selected must be one of {}, got {!r}".format(
                    sorted(self._ARFF_FILES), self.arff_selected
                )
            )
        self.path = self.path_dataset + arff_file

    def fit(self, X=None):
        """Load the configured dataset and fit every selected classifier.

        ``X`` is ignored; the data always comes from the configured .arff file.
        The wall-clock fit time of each classifier is stored in
        ``self.acc_time`` (a NumPy array, same order as ``self.classifiers``).

        Returns
        -------
        self
        """
        if self.just_these_classifiers is None:
            classifiers = list(self._DEFAULT_CLASSIFIERS)
        else:
            classifiers = self.just_these_classifiers

        self.classifiers = self._get_classifiers(classifiers)

        self._choose_scenario()
        self._preprocess()

        if self.scaler is not None:
            # Fit the scaler on the training split only, then apply it to both.
            self.X_train = self.scaler.fit_transform(self.X_train)
            self.X_test = self.scaler.transform(self.X_test)

        for clf in self.classifiers:
            start = time.perf_counter()
            clf.fit(self.X_train, self.y_train)
            self.acc_time.append(time.perf_counter() - start)

        self.acc_time = np.array(self.acc_time)

        return self

    def score(self, X=None):
        """Return ``(test accuracies, fit times)``, one entry per classifier.

        ``X`` is ignored; accuracy is always measured on the held-out test split.
        """
        acc_test = np.array([clf.score(self.X_test, self.y_test) for clf in self.classifiers])

        return acc_test, self.acc_time

    def score_f1(self):
        """Return the macro-averaged F1 score of every classifier on the test split."""
        f1score = np.array([
            f1_score(self.y_test, clf.predict(self.X_test), average = 'macro')
            for clf in self.classifiers
        ])

        return f1score

    def all_metrics(self):
        """Collect accuracy, F1, confusion matrix, precision, recall and fit time.

        Returns
        -------
        dict
            Keys "acuracia", "f1score" and "tempo" hold NumPy arrays;
            "matriz_confusao", "precisao" and "recall" hold lists. Every
            container has one entry per classifier, in training order.
        """
        f1score = self.score_f1()
        accuracy, dtime = self.score()

        precision = []
        recall = []
        cm = []

        for clf in self.classifiers:
            aux = clf.predict(self.X_test)
            # Confusion matrix: rows are true labels, columns are predictions.
            cm.append(confusion_matrix(self.y_test, aux))
            # Macro-averaged precision.
            precision.append(precision_score(self.y_test, aux, average = 'macro'))
            # Macro-averaged recall.
            recall.append(recall_score(self.y_test, aux, average = 'macro'))

        metrics_dict = {"acuracia": accuracy, "f1score": f1score, "matriz_confusao": cm,
                        "precisao": precision, "recall": recall, "tempo": dtime}

        return metrics_dict

<h5>Grid das combinações dos extratores e normalizadores</h5>

Para efetuar uma análise sobre outro dataset, altere o 'path_dataset' do dicionário 'grid'.

Este trecho de código treina os classificadores em todas as combinações de extratores e normalizadores e registra, para cada cenário, a acurácia média e o tempo médio de treino dos classificadores; o melhor cenário é então escolhido a partir da tabela gerada a seguir.

In [4]:
# Every (normalization, dataset, descriptor) combination to evaluate.
# None means "no normalization"; see TrainAllClassifiers for the other codes.
grid = {"selectedNormalization": [None, 1, 2, 3, 4],
        "path_dataset": [path_game],
        "arff_selected": [0, 1, 2, 3, 4, 5, 6, 7]}

all_clf = TrainAllClassifiers()

# Column name -> list of values, one entry per evaluated combination.
params = defaultdict(list)

param_grid = ParameterGrid(grid)
# Derive the progress denominator from the grid itself so the percentage stays
# correct when datasets, descriptors or normalizations are added or removed.
total = len(param_grid)

for i, g in enumerate(param_grid, start=1):
    all_clf.set_params(**g)
    all_clf.fit()
    acc_test, acc_time = all_clf.score()
    params["param_selectedNormalization"].append(g["selectedNormalization"])
    params["param_path_dataset"].append(g["path_dataset"])
    params["param_arff_selected"].append(g["arff_selected"])
    # Mean/std are taken across the classifiers trained for this combination.
    params["mean_test_score"].append(acc_test.mean())
    params["std_test_score"].append(acc_test.std())
    params["mean_fit_time"].append(acc_time.mean())
    params["std_fit_time"].append(acc_time.std())
    # "\r" rewrites the same output line instead of printing one line per step.
    sys.stdout.write("Progresso: {:.2f}%    \r".format(i / total * 100))
    sys.stdout.flush()

Progresso: 100.00%    

<h5>Geração da tabela das combinações</h5>

In [5]:
# Columns kept in the final results table, in display order.
cols = [
    "param_arff_selected",
    "param_selectedNormalization",
    "Acuracia",
    "Tempo (s)",
]

def combine_time(each):
    """Format a row's fit time as "mean ± std" with two decimal places.

    ``each`` is any mapping-like row exposing "mean_fit_time" and "std_fit_time".
    """
    mean_time = each["mean_fit_time"]
    std_time = each["std_fit_time"]
    return "{:.2f} ± {:.2f}".format(mean_time, std_time)

def combine_test(each):
    """Format a row's test accuracy as "mean ± std" with three decimal places.

    ``each`` is any mapping-like row exposing "mean_test_score" and "std_test_score".
    """
    mean_score = each["mean_test_score"]
    std_score = each["std_test_score"]
    return "{:.3f} ± {:.3f}".format(mean_score, std_score)

def combine_train(each):
    """Format a row's training accuracy as "mean ± std" with three decimal places.

    ``each`` is any mapping-like row exposing "mean_train_score" and "std_train_score".
    """
    mean_score = each["mean_train_score"]
    std_score = each["std_train_score"]
    return "{:.3f} ± {:.3f}".format(mean_score, std_score)

# Human-readable names for the descriptor and normalization codes.
descriptor_names = {0: "CEDD", 1: "FCTH", 2: "Gabor", 3: "GCH", 4: "JCD", 5: "LBP", 6: "MPOC", 7: "C. Similarity"}
normalization_names = {None: "Nenhuma", 1: "MinMax", 2: "Standard", 3: "MaxAbs", 4: "Robust"}

results = pd.DataFrame(params)
results["Tempo (s)"] = results.apply(combine_time, axis=1)
results["Acuracia"] = results.apply(combine_test, axis=1)
# No training-score column is built: the grid search records only test scores.

# Selecting and renaming in one expression yields a fresh frame, so the
# column assignments below do not write into a slice of the original.
results = results[cols].rename(columns={"param_arff_selected": "Descritor", "param_selectedNormalization": "Normalizacao"})
results["Descritor"] = results["Descritor"].map(descriptor_names)
# The "no normalization" entries (None) are stored as NaN once the column is
# built, so the None key of the mapping may not match them; fillna guarantees
# those rows are labelled "Nenhuma" instead of showing up as NaN in the table.
results["Normalizacao"] = results["Normalizacao"].map(normalization_names).fillna("Nenhuma")

results = results.set_index(["Descritor", "Normalizacao"])

# print() is intentional: the LaTeX source must be emitted as plain text.
print(results.to_latex())

\begin{tabular}{llll}
\toprule
              &        &       Acuracia &    Tempo (s) \\
Descritor & Normalizacao &                &              \\
\midrule
CEDD & NaN &  0.856 ± 0.180 &  0.24 ± 0.52 \\
              & MinMax &  0.839 ± 0.176 &  0.18 ± 0.31 \\
              & Standard &  0.844 ± 0.182 &  0.24 ± 0.51 \\
              & MaxAbs &  0.838 ± 0.176 &  0.19 ± 0.33 \\
              & Robust &  0.855 ± 0.179 &  0.22 ± 0.47 \\
FCTH & NaN &  0.677 ± 0.052 &  0.24 ± 0.42 \\
              & MinMax &  0.658 ± 0.074 &  0.22 ± 0.37 \\
              & Standard &  0.686 ± 0.056 &  0.34 ± 0.67 \\
              & MaxAbs &  0.658 ± 0.074 &  0.24 ± 0.39 \\
              & Robust &  0.680 ± 0.054 &  0.32 ± 0.60 \\
Gabor & NaN &  0.520 ± 0.021 &  0.11 ± 0.16 \\
              & MinMax &  0.530 ± 0.015 &  0.22 ± 0.40 \\
              & Standard &  0.536 ± 0.022 &  0.19 ± 0.38 \\
              & MaxAbs &  0.525 ± 0.016 &  0.17 ± 0.28 \\
              & Robust &  0.537 ± 0.021 &  0.18 ± 0.33 \\
G

<h5> Verificando o melhor classificador </h5>

Com os três melhores cenários para cada Dataset, verificou-se qual é o melhor cenário e classificador, pela métrica de acurácia e F1-Score.

In [7]:
# Three scenarios per dataset, each given as
# (normalization code, dataset path, descriptor index).
best_scenarios = [
    (None, path_alien, 1),
    (2, path_alien, 5),
    (4, path_alien, 5),
    (4, path_malaria, 0),
    (2, path_malaria, 3),
    (4, path_malaria, 3),
    (None, path_fruits, 0),
    (None, path_fruits, 4),
    (2, path_fruits, 4),
    (2, path_iris, 3),
    (3, path_iris, 3),
    (4, path_iris, 3),
    (4, path_game, 4),
    (2, path_game, 5),
    (4, path_game, 5),
]
grid_dict = [
    {"selectedNormalization": norm, "path_dataset": path, "arff_selected": arff}
    for norm, path, arff in best_scenarios
]

# Position of each classifier in the arrays returned by score()/score_f1()
# when just_these_classifiers is left as None.
clf_names = ["gnb", "logreg", "dectree", "knn", "lda", "svm", "rf", "nnet"]

all_clf = TrainAllClassifiers()
params = defaultdict(list)

for g in grid_dict:
    print(g)
    all_clf.set_params(**g)
    all_clf.fit()
    acc_test, _ = all_clf.score()
    f1score = all_clf.score_f1()

    # Scenario identification columns.
    for key in ("selectedNormalization", "path_dataset", "arff_selected"):
        params["param_" + key].append(g[key])

    # One accuracy column per classifier, followed by one F1 column per classifier.
    for pos, name in enumerate(clf_names):
        params["acuracia_" + name].append(acc_test[pos])
    for pos, name in enumerate(clf_names):
        params["f1score_" + name].append(f1score[pos])

{'selectedNormalization': None, 'path_dataset': 'arff_files/Alien-vs-Predator/', 'arff_selected': 1}
{'selectedNormalization': 2, 'path_dataset': 'arff_files/Alien-vs-Predator/', 'arff_selected': 5}
{'selectedNormalization': 4, 'path_dataset': 'arff_files/Alien-vs-Predator/', 'arff_selected': 5}
{'selectedNormalization': 4, 'path_dataset': 'arff_files/Malaria-Cell/', 'arff_selected': 0}
{'selectedNormalization': 2, 'path_dataset': 'arff_files/Malaria-Cell/', 'arff_selected': 3}
{'selectedNormalization': 4, 'path_dataset': 'arff_files/Malaria-Cell/', 'arff_selected': 3}
{'selectedNormalization': None, 'path_dataset': 'arff_files/Fruits/', 'arff_selected': 0}
{'selectedNormalization': None, 'path_dataset': 'arff_files/Fruits/', 'arff_selected': 4}
{'selectedNormalization': 2, 'path_dataset': 'arff_files/Fruits/', 'arff_selected': 4}
{'selectedNormalization': 2, 'path_dataset': 'arff_files/Iris/', 'arff_selected': 3}
{'selectedNormalization': 3, 'path_dataset': 'arff_files/Iris/', 'arff_s

In [9]:
# One row per scenario, indexed by dataset path; columns hold the per-classifier
# accuracy and F1 values collected in `params`.
results_clf = pd.DataFrame(params).set_index(["param_path_dataset"])
results_clf

Unnamed: 0_level_0,param_selectedNormalization,param_arff_selected,acuracia_gnb,acuracia_logreg,acuracia_dectree,acuracia_knn,acuracia_lda,acuracia_svm,acuracia_rf,acuracia_nnet,f1score_gnb,f1score_logreg,f1score_dectree,f1score_knn,f1score_lda,f1score_svm,f1score_rf,f1score_nnet
param_path_dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
arff_files/Alien-vs-Predator/,,1,0.568345,0.733813,0.690647,0.690647,0.733813,0.726619,0.697842,0.791367,0.512623,0.733592,0.69007,0.690583,0.733758,0.725924,0.695937,0.791324
arff_files/Alien-vs-Predator/,2.0,5,0.604317,0.726619,0.697842,0.741007,0.654676,0.741007,0.71223,0.776978,0.604235,0.726491,0.697826,0.737074,0.654515,0.740886,0.712215,0.776562
arff_files/Alien-vs-Predator/,4.0,5,0.604317,0.741007,0.690647,0.697842,0.654676,0.769784,0.705036,0.769784,0.604235,0.740672,0.690647,0.694282,0.654515,0.769486,0.705036,0.769486
arff_files/Malaria-Cell/,4.0,0,0.586538,0.835087,0.809507,0.82529,0.817852,0.850508,0.836176,0.853229,0.514995,0.834687,0.809506,0.825169,0.817123,0.850406,0.836158,0.853057
arff_files/Malaria-Cell/,2.0,3,0.721154,0.885341,0.938861,0.823476,0.735849,0.911284,0.960813,0.922533,0.708208,0.884435,0.93884,0.823144,0.734899,0.911043,0.960808,0.922421
arff_files/Malaria-Cell/,4.0,3,0.707729,0.884978,0.938498,0.870646,0.735849,0.863208,0.960994,0.936865,0.691151,0.884077,0.938476,0.870506,0.734899,0.86237,0.960989,0.936864
arff_files/Fruits/,,0,0.29386,0.72807,0.657895,0.811404,0.671053,0.741228,0.780702,0.828947,0.335585,0.689265,0.584109,0.78707,0.633501,0.657639,0.734503,0.818735
arff_files/Fruits/,,4,0.320175,0.75,0.649123,0.850877,0.688596,0.697368,0.736842,0.833333,0.333653,0.729505,0.597348,0.843036,0.640589,0.559771,0.672593,0.803545
arff_files/Fruits/,2.0,4,0.298246,0.70614,0.649123,0.758772,0.688596,0.675439,0.736842,0.833333,0.314843,0.661516,0.597348,0.700774,0.640589,0.565559,0.667246,0.802541
arff_files/Iris/,2.0,3,0.966667,0.933333,0.9,0.933333,0.933333,0.9,0.9,0.966667,0.961026,0.924603,0.901919,0.929563,0.924603,0.903103,0.8885,0.961026


<h5> Obtendo as métricas dos melhores classificadores nos melhores cenários </h5>

In [None]:
# One scenario and one classifier per dataset, each given as
# (normalization code, dataset path, descriptor index, classifier key).
best_models = [
    (None, path_alien, 1, "nnet"),
    (4, path_malaria, 3, "rf"),
    (None, path_fruits, 4, "knn"),
    (2, path_iris, 3, "nnet"),
    (2, path_game, 5, "nnet"),
]
grid_dict = [
    {"selectedNormalization": norm, "path_dataset": path, "arff_selected": arff,
     "just_these_classifiers": [clf_key]}
    for norm, path, arff, clf_key in best_models
]

all_clf = TrainAllClassifiers()
metrics = []

for g in grid_dict:
    # set_params() and fit() both return the estimator, so the calls chain.
    metrics.append(all_clf.set_params(**g).fit().all_metrics())

In [None]:
# Row labels, in the same order as the scenarios evaluated above.
nomes_datasets = dict(enumerate(
    ["Alien vs Predator", "Malaria Cell", "Fruits", "Iris", "Rock-Paper-Scissor"]
))

results = pd.DataFrame(metrics)
# Each cell holds a one-element container (a single classifier was trained per
# scenario); np.squeeze drops that length-1 dimension.
metricas = results.applymap(np.squeeze).rename(index=nomes_datasets)
metricas

<h5>Conversão da Tabela para LaTeX</h5>

In [None]:
# print() is intentional: the LaTeX source must be emitted as plain text.
latex_metricas = metricas.to_latex()
print(latex_metricas)

<h5>Geração das Matrizes de Confusão</h5>

In [None]:
# File-name suffix for each dataset, in the same order as the rows of `metricas`.
cm_file_tags = ["alien", "malaria", "fruits", "iris", "game"]

for (dataset, m), tag in zip(metricas["matriz_confusao"].items(), cm_file_tags):
    fig, ax = plt.subplots(figsize=(8, 6))
    # fmt="d" prints the integer counts as-is; seaborn's default ".2g" would
    # render counts of 100 or more in scientific notation (e.g. 1.3e+02).
    sns.heatmap(m, annot=True, fmt="d", cbar=False, ax=ax)
    # The matrices come from confusion_matrix(y_true, y_pred):
    # rows are true classes, columns are predicted classes.
    ax.set_xlabel("Classe predita")
    ax.set_ylabel("Classe verdadeira")
    ax.set_title(str(dataset))
    fig.savefig("cm_" + tag)

<h5>Cálculo dos falsos negativos</h5>

In [None]:
def cada_col(col):
    """Return the fraction of a vector's total that lies outside its largest entry."""
    maximo = np.max(col)
    return 1 - maximo/np.sum(col)

def cada_matriz(matriz):
    """Return the false-negative rate of every class of a confusion matrix.

    ``matriz`` must follow the ``confusion_matrix(y_true, y_pred)`` layout:
    row i holds the samples whose true class is i, and the diagonal holds the
    correct predictions. The false-negative rate of class i is therefore
    ``1 - matriz[i, i] / sum(matriz[i, :])``.

    The rate is taken along rows (true classes) and uses the diagonal rather
    than the largest entry, so a class that is mostly misclassified is not
    reported as mostly correct.

    Returns a 1-D float array with one entry per class; a class with no true
    samples yields NaN.

    Raises
    ------
    ValueError
        If ``matriz`` is not a square 2-D array.
    """
    matriz = np.asarray(matriz, dtype=float)
    if matriz.ndim != 2 or matriz.shape[0] != matriz.shape[1]:
        raise ValueError("matriz must be a square 2-D confusion matrix")

    acertos = np.diag(matriz)
    total_por_classe = matriz.sum(axis=1)
    # A class absent from the test split gives 0/0; report NaN without warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        return 1 - acertos / total_por_classe

# Apply cada_matriz to the confusion matrix of every dataset.
matrizes_confusao = metricas["matriz_confusao"]
falsos_negativos = matrizes_confusao.apply(cada_matriz)

# Change the key to inspect another dataset.
falsos_negativos["Iris"]