# Prática de Aprendizado Supervisionado

**Importando bibliotecas e funções**

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.io.arff import loadarff
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import time
import warnings

warnings.filterwarnings("ignore")

**Lendo o arquivo**

In [2]:
def leitura(dataset, nome):
    """Load one ARFF file into a pandas DataFrame.

    The file is read from ``Data/<dataset>/<nome>.arff``, relative to the
    current working directory.

    Parameters
    ----------
    dataset
        Value interpolated as the sub-folder of ``Data``.
    nome
        Value interpolated as the file name, without the ``.arff`` extension.

    Returns
    -------
    pandas.DataFrame
        The data records returned by ``loadarff``; the ARFF metadata is
        discarded.
    """
    caminho = 'Data/%s/%s.arff' % (dataset, nome)
    # loadarff returns a (data, metadata) pair; only the data part is used.
    registros, _meta = loadarff(caminho)
    return pd.DataFrame(registros)

**Separando em Conjunto de Treino e Teste**

In [3]:
def treinoTeste(df):
    """Split a feature DataFrame into train and test partitions.

    Every column except the last one is used as a feature and the ``'class'``
    column is the target. Class labels are replaced by integer codes
    1, 2, 3, ... assigned in order of first appearance, so the same label
    always maps to the same code regardless of row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the feature columns followed by a ``'class'`` column.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=327``.
    """
    # iloc selects by position: every row, every column but the last.
    # NOTE(review): assumes 'class' is the last column — confirm for new datasets.
    X = df.iloc[:, 0:-1].values

    # Map each distinct label to a stable integer code. A label seen again
    # must reuse the code it received the first time, not the most recently
    # issued one, otherwise interleaved classes are mislabelled.
    codigos = {}
    y = [codigos.setdefault(rotulo, len(codigos) + 1) for rotulo in df['class']]

    return train_test_split(X, y, test_size=0.2, random_state=327)

**Padronizando os dados com Técnicas de Normalização**

In [4]:
def normalizar(X_train, X_test, selectedNormalization):
    """Scale the train and test features with the selected scaler.

    The scaler is fitted on ``X_train`` only and then applied to both
    partitions.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the train and test partitions.
    selectedNormalization : int
        0 = no scaling (inputs are returned unchanged), 1 = MinMaxScaler,
        2 = StandardScaler, 3 = MaxAbsScaler, 4 = RobustScaler.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after scaling.

    Raises
    ------
    ValueError
        If ``selectedNormalization`` is not one of 0, 1, 2, 3, 4.
    """
    if selectedNormalization == 0:
        return X_train, X_test

    escaladores = {
        1: preprocessing.MinMaxScaler,
        2: preprocessing.StandardScaler,
        3: preprocessing.MaxAbsScaler,
        4: preprocessing.RobustScaler,
    }
    # Fail with a clear message instead of reaching fit_transform with no
    # scaler bound.
    if selectedNormalization not in escaladores:
        raise ValueError(
            "selectedNormalization must be one of 0, 1, 2, 3, 4; got %r"
            % (selectedNormalization,)
        )

    scaler = escaladores[selectedNormalization]()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test

**Treinando os Classificadores**

In [5]:
def treinarClassificadores(classificador, X_train, X_test, y_train, y_test):
    """Fit a classifier and evaluate it on the train and test partitions.

    ``classificador`` is fitted in place on the training data. The reported
    time does NOT include fitting: it covers the test-set prediction, the
    three macro-averaged metrics and the two ``score`` calls.

    Parameters
    ----------
    classificador
        Object exposing ``fit``, ``predict`` and ``score``.
    X_train, X_test
        Feature matrices for the train and test partitions.
    y_train, y_test
        Target labels for the train and test partitions.

    Returns
    -------
    tuple
        ``(acc_train, acc_test, f1score, precision, recall, tempoExe)`` where
        ``tempoExe`` is the elapsed time of the evaluation phase in seconds.
    """
    classificador.fit(X_train, y_train)

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which could yield negative durations.
    inicio = time.perf_counter()
    previsoes = classificador.predict(X_test)
    f1score = f1_score(y_test, previsoes, average='macro')
    precision = precision_score(y_test, previsoes, average='macro')
    recall = recall_score(y_test, previsoes, average='macro')
    acc_train = classificador.score(X_train, y_train)
    acc_test = classificador.score(X_test, y_test)
    tempoExe = time.perf_counter() - inicio

    return acc_train, acc_test, f1score, precision, recall, tempoExe

**Avaliando no Conjunto de Teste todas as combinações de extrator, normalizador e classificador**

In [20]:
def grid(dataset):
    """Evaluate every extractor x normaliser x classifier combination.

    For each feature extractor the matching ARFF file is loaded and split
    once; each of the five normalisation options is then applied and every
    classifier is fitted and scored on the result.

    Parameters
    ----------
    dataset
        Dataset folder name, forwarded to ``leitura``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, indexed by
        ``('Extrator', 'Normalizador', 'Classificador')``, with columns
        ``'Tempo'`` (the evaluation time returned by
        ``treinarClassificadores``), ``'Acurácia'`` (test accuracy),
        ``'Precisão'``, ``'Recall'`` and ``'F1 Score'``.
    """
    normDic = {'0': 'não aplicado', '1': 'MinMaxScaler', '2': 'StandardScaler',
               '3': 'MaxAbsScaler', '4': 'RobustScaler'}
    extratores = ['AutoColorCorrelogram', 'FCTH', 'Gabor', 'GCH', 'LBP', 'LCH',
                  'Moments', 'ReferenceColorSimilarity']
    nomes = ['Extrator', 'Normalizador', 'Classificador', 'Tempo', 'Acurácia', 'Precisão', 'Recall', 'F1 Score']
    # The estimator instances are created once and re-fitted for every
    # combination below.
    # NOTE(review): DecisionTreeClassifier and MLPClassifier are created
    # without random_state — consider fixing seeds for reproducibility.
    classificadores = [(GaussianNB(), 'Gaussian Naive Bayes'), (LogisticRegression(), 'Logistic Regression'),
                       (DecisionTreeClassifier(), 'Decision Tree'), (KNeighborsClassifier(n_neighbors = 3), 'KNN'),
                       (LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis'), (SVC(), 'SVM'),
                       (RandomForestClassifier(random_state=42), 'Random Forest'), (MLPClassifier(alpha=1), 'MLP')]

    analise = []

    for extrator in extratores:

        df = leitura(dataset, extrator)

        # The split uses a fixed random_state, so it is the same for every
        # normalisation option; compute it once per extractor instead of
        # once per (extractor, normalisation) pair.
        X_train_bruto, X_test_bruto, y_train, y_test = treinoTeste(df)

        for norm in range(len(normDic)):
            X_train, X_test = normalizar(X_train_bruto, X_test_bruto, norm)

            for cls, nome in classificadores:
                dados = treinarClassificadores(cls, X_train, X_test, y_train, y_test)
                acc_train, acc_test, f1score, precision, recall, tempoExe = dados

                resultados = [extrator, normDic[str(norm)], nome, tempoExe,
                              acc_test, precision, recall, f1score]

                analise.append(resultados)

    return pd.DataFrame(analise, columns=nomes).set_index(['Extrator', 'Normalizador', 'Classificador'])

In [19]:
# Single-configuration check of the pipeline: FCTH features of the
# Malaria-Cell dataset, normalisation option 1 and Gaussian Naive Bayes.
df = leitura('Malaria-Cell', 'FCTH')
X_train, X_test, y_train, y_test = treinoTeste(df)
X_train, X_test = normalizar(X_train, X_test, 1)
dados = treinarClassificadores(GaussianNB(), X_train, X_test, y_train, y_test)
acc_train, acc_test, f1score, precision, recall, tempoExe = dados
# Same row layout that grid() builds; 'nome' is a placeholder in the
# classifier-name slot.
# NOTE(review): `resultados` is rebound to the full grid DataFrame in a later cell.
resultados = ['FCTH', 'MinMaxScaler', 'nome', tempoExe, 
                              acc_test, precision, recall, f1score]
resultados

['FCTH',
 'MinMaxScaler',
 'nome',
 0.3478817939758301,
 0.5400943396226415,
 0.6262803092067518,
 0.5440514156580938,
 0.4532288463446086]

In [21]:
def selecao(resultados):
    """Select the best configurations from a results table.

    The quality score of a row is the mean of its ``'Acurácia'`` and
    ``'F1 Score'`` values. Two rows are selected: the one with the highest
    score per unit of ``'Tempo'`` and the one with the highest score.

    Parameters
    ----------
    resultados : pandas.DataFrame
        Table with ``'Acurácia'``, ``'F1 Score'`` and ``'Tempo'`` columns and
        a unique index (for example the frame returned by ``grid``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(best_by_score_per_time, best_by_score)``, each a one-row frame.
    """
    acc = (resultados['Acurácia'] + resultados['F1 Score']) / 2
    # NOTE(review): a 'Tempo' of exactly 0 yields inf/NaN here — confirm whether
    # that can happen for very fast classifiers.
    perf = acc / resultados['Tempo']

    # idxmax returns the index *label* of the maximum, which is what the
    # label-based .loc expects. np.argmax returns a *position* on current
    # pandas and fails against a non-integer (Multi)Index.
    melhor_perf = resultados.loc[perf.idxmax(), :]
    melhor_acc = resultados.loc[acc.idxmax(), :]

    return pd.DataFrame(melhor_perf).T, pd.DataFrame(melhor_acc).T

In [22]:
# Run the full evaluation grid on the Malaria-Cell dataset. The recorded
# output shows single combinations taking several hundred seconds
# (e.g. KNN: 555 s), so expect this cell to be slow.
resultados = grid('Malaria-Cell')

In [23]:
resultados

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tempo,Acurácia,Precisão,Recall,F1 Score
Extrator,Normalizador,Classificador,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AutoColorCorrelogram,não aplicado,Gaussian Naive Bayes,1.241665,0.628447,0.768363,0.631952,0.576037
AutoColorCorrelogram,não aplicado,Logistic Regression,0.113908,0.698295,0.699702,0.698669,0.697997
AutoColorCorrelogram,não aplicado,Decision Tree,0.180844,0.938135,0.938168,0.938099,0.938125
AutoColorCorrelogram,não aplicado,KNN,555.105502,0.722605,0.722590,0.722538,0.722551
AutoColorCorrelogram,não aplicado,Linear Discriminant Analysis,0.082135,0.839985,0.847375,0.840699,0.839315
AutoColorCorrelogram,não aplicado,SVM,521.662776,0.642598,0.652368,0.643776,0.637898
AutoColorCorrelogram,não aplicado,Random Forest,0.181783,0.955552,0.955576,0.955524,0.955545
AutoColorCorrelogram,não aplicado,MLP,0.148893,0.664369,0.665503,0.664724,0.664071
AutoColorCorrelogram,MinMaxScaler,Gaussian Naive Bayes,0.623256,0.598149,0.754574,0.601952,0.529383
AutoColorCorrelogram,MinMaxScaler,Logistic Regression,0.054749,0.728592,0.730230,0.728978,0.728309


In [24]:
# perform: best row by score per unit of time; accur: best row by score alone.
perform, accur = selecao(resultados)

In [30]:
import pickle

# Reload a previously saved results table from "resMalaria.pickle".
# Presumably the file was written by pickling `resultados` in an earlier
# session — TODO confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you
# created yourself.
# The context manager closes the file even if loading fails.
with open("resMalaria.pickle", "rb") as pickle_in:
    resultados2 = pickle.load(pickle_in)

In [31]:
resultados2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tempo,Acurácia,Precisão,Recall,F1 Score
Extrator,Normalizador,Classificador,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AutoColorCorrelogram,não aplicado,Gaussian Naive Bayes,1.241665,0.628447,0.768363,0.631952,0.576037
AutoColorCorrelogram,não aplicado,Logistic Regression,0.113908,0.698295,0.699702,0.698669,0.697997
AutoColorCorrelogram,não aplicado,Decision Tree,0.180844,0.938135,0.938168,0.938099,0.938125
AutoColorCorrelogram,não aplicado,KNN,555.105502,0.722605,0.722590,0.722538,0.722551
AutoColorCorrelogram,não aplicado,Linear Discriminant Analysis,0.082135,0.839985,0.847375,0.840699,0.839315
AutoColorCorrelogram,não aplicado,SVM,521.662776,0.642598,0.652368,0.643776,0.637898
AutoColorCorrelogram,não aplicado,Random Forest,0.181783,0.955552,0.955576,0.955524,0.955545
AutoColorCorrelogram,não aplicado,MLP,0.148893,0.664369,0.665503,0.664724,0.664071
AutoColorCorrelogram,MinMaxScaler,Gaussian Naive Bayes,0.623256,0.598149,0.754574,0.601952,0.529383
AutoColorCorrelogram,MinMaxScaler,Logistic Regression,0.054749,0.728592,0.730230,0.728978,0.728309


In [25]:
perform

Unnamed: 0,Unnamed: 1,Unnamed: 2,Tempo,Acurácia,Precisão,Recall,F1 Score
GCH,RobustScaler,Logistic Regression,0.018512,0.884615,0.89948,0.885568,0.883704


In [26]:
accur

Unnamed: 0,Unnamed: 1,Unnamed: 2,Tempo,Acurácia,Precisão,Recall,F1 Score
GCH,RobustScaler,Random Forest,0.073414,0.960994,0.961004,0.960978,0.960989
