# **APLICAÇÃO DOS MÉTODOS**
---
---

## **1.Importando as Bibliotecas**

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os

## **2.Importando os Datasets**

In [3]:
# HOG feature datasets. The targets on the left are bound, in order, to the
# CSV files listed on the right (one read per path).
(
    df_hog_128_16,
    df_hog_128_20,
    df_hog_256_16,
    df_hog_256_20,
) = [
    pd.read_csv(caminho)
    for caminho in (
        './datasets/hog_128_16.csv',
        './datasets/hog_128_20.csv',
        './datasets/hog_256_16.csv',
        './datasets/hog_256_20.csv',
    )
]

In [4]:
# CNN feature datasets. The targets on the left are bound, in order, to the
# CSV files listed on the right (one read per path).
(
    df_cnn_VGG16_AVG_128,
    df_cnn_VGG19_AVG_128,
    df_cnn_VGG16_MAX_128,
    df_cnn_VGG19_MAX_128,
    df_cnn_VGG16_AVG_256,
    df_cnn_VGG19_AVG_256,
    df_cnn_VGG16_MAX_256,
    df_cnn_VGG19_MAX_256,
) = [
    pd.read_csv(caminho)
    for caminho in (
        './datasets/cnn_VGG16_AVG_128.csv',
        './datasets/cnn_VGG19_AVG_128.csv',
        './datasets/cnn_VGG16_MAX_128.csv',
        './datasets/cnn_VGG19_MAX_128.csv',
        './datasets/cnn_VGG16_AVG_256.csv',
        './datasets/cnn_VGG19_AVG_256.csv',
        './datasets/cnn_VGG16_MAX_256.csv',
        './datasets/cnn_VGG19_MAX_256.csv',
    )
]

In [5]:
# Single ordered mapping from dataset name to DataFrame. Keeping each name
# next to its frame avoids two hand-maintained parallel lists drifting apart.
bases_dict = {
    'df_hog_128_16': df_hog_128_16,
    'df_hog_128_20': df_hog_128_20,
    'df_hog_256_16': df_hog_256_16,
    'df_hog_256_20': df_hog_256_20,
    'df_cnn_VGG16_AVG_128': df_cnn_VGG16_AVG_128,
    'df_cnn_VGG19_AVG_128': df_cnn_VGG19_AVG_128,
    'df_cnn_VGG16_MAX_128': df_cnn_VGG16_MAX_128,
    'df_cnn_VGG19_MAX_128': df_cnn_VGG19_MAX_128,
    'df_cnn_VGG16_AVG_256': df_cnn_VGG16_AVG_256,
    'df_cnn_VGG19_AVG_256': df_cnn_VGG19_AVG_256,
    'df_cnn_VGG16_MAX_256': df_cnn_VGG16_MAX_256,
    'df_cnn_VGG19_MAX_256': df_cnn_VGG19_MAX_256,
}

# Parallel lists derived from the mapping (dicts preserve insertion order, so
# position i of one list matches position i of the other).
nomes_bases = list(bases_dict.keys())
bases = list(bases_dict.values())

## **3.Definindo Funções**

In [6]:
# KNN accuracy on a single hold-out split, for several neighbour counts
def calcular_acuracia_knn(X, y, k_range=range(1, 11)):
    """Return the hold-out accuracy of KNN for every neighbour count in ``k_range``.

    The data is split once with ``train_test_split`` (30% held out for testing,
    ``random_state=42``); the same split is reused for every neighbour count.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split``.
    y : labels, forwarded to ``train_test_split``.
    k_range : iterable of int
        Values used as ``n_neighbors``. Defaults to 1..10.

    Returns
    -------
    list
        One ``accuracy_score`` per value of ``k_range``, in iteration order.
    """
    X_treino, X_teste, y_treino, y_teste = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    def _acuracia_para(n_vizinhos):
        # Fit on the training part and score on the held-out part.
        modelo = KNeighborsClassifier(n_neighbors=n_vizinhos)
        modelo.fit(X_treino, y_treino)
        return accuracy_score(y_teste, modelo.predict(X_teste))

    return [_acuracia_para(n_vizinhos) for n_vizinhos in k_range]

In [7]:
# KNN accuracy estimated with K-Fold cross-validation, for several neighbour counts
def calcular_acuracia_knn_kfold(X, y, k_range=range(1, 11), k=10):
    """Return the mean K-Fold accuracy of KNN for every neighbour count in ``k_range``.

    Parameters
    ----------
    X : array-like
        Feature matrix. NumPy arrays and pandas DataFrames are both accepted.
    y : array-like
        Labels, one per row of ``X``. NumPy arrays and pandas Series are both accepted.
    k_range : iterable of int
        Values used as ``n_neighbors``. Defaults to 1..10.
    k : int
        Number of folds (``n_splits`` of ``KFold``). Defaults to 10.

    Returns
    -------
    list
        For each value of ``k_range``, in order, the mean of the per-fold
        ``accuracy_score`` values.
    """
    # The folds are selected with positional index arrays. On a DataFrame,
    # ``X[indices]`` would be interpreted as a column lookup, so convert to
    # NumPy first (a no-op when arrays are passed in).
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    # With an integer random_state the split is the same on every call, so it is
    # computed once and reused for every neighbour count.
    divisoes = list(kf.split(X))

    acuracia_kfold = []
    for n_vizinhos in k_range:
        fold_accuracies = []
        for train_index, test_index in divisoes:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            knn = KNeighborsClassifier(n_neighbors=n_vizinhos)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            fold_accuracies.append(accuracy_score(y_test, y_pred))

        acuracia_kfold.append(np.mean(fold_accuracies))

    return acuracia_kfold

## **4.Aplicando o Código**

In [8]:
# `bases` and `nomes_bases` are already defined in section 2 ("Importando os
# Datasets"); they are reused here instead of being redefined, so there is a
# single place to edit when the set of datasets changes.

# Empty results table: one row per (dataset, evaluation method) pair and one
# column per neighbour count 1..10 (the default k_range of the functions in section 3).
acuracias = pd.DataFrame(
    columns=range(1, 11),
    index=pd.MultiIndex.from_product(
        [nomes_bases, ['train_test_split_70_30', 'k_fold_10']],
        names=["Base", "Método"],
    ),
)

In [9]:
# Evaluate every dataset with both protocols and fill the results table
for nome_base, tabela in zip(nomes_bases, bases):
    # First column holds the class label; the remaining columns are the features
    rotulos, atributos = tabela.iloc[:, 0], tabela.iloc[:, 1:]

    # Both evaluations are computed before anything is written to the table
    resultados_por_metodo = {
        'train_test_split_70_30': calcular_acuracia_knn(atributos, rotulos),
        'k_fold_10': calcular_acuracia_knn_kfold(atributos.values, rotulos.values),
    }

    # One row per (dataset, method) pair
    for metodo, valores in resultados_por_metodo.items():
        acuracias.loc[(nome_base, metodo)] = valores

# Persist the table and display it as the cell output
acuracias.to_excel('acuracias_resultados.xlsx', index=True)
acuracias



Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,7,8,9,10
Base,Método,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
df_hog_128_16,train_test_split_70_30,0.55,0.5875,0.533333,0.5625,0.554167,0.558333,0.516667,0.55,0.520833,0.516667
df_hog_128_16,k_fold_10,0.611614,0.632927,0.563908,0.59644,0.551377,0.561361,0.537627,0.553877,0.540095,0.543845
df_hog_128_20,train_test_split_70_30,0.575,0.6125,0.583333,0.616667,0.55,0.579167,0.5375,0.5625,0.529167,0.554167
df_hog_128_20,k_fold_10,0.634161,0.654177,0.610316,0.63413,0.586551,0.600269,0.570174,0.582674,0.567706,0.585222
df_hog_256_16,train_test_split_70_30,0.554167,0.579167,0.520833,0.541667,0.495833,0.533333,0.495833,0.495833,0.5,0.504167
df_hog_256_16,k_fold_10,0.56269,0.581582,0.551424,0.575237,0.535095,0.551377,0.525063,0.532611,0.518813,0.528845
df_hog_256_20,train_test_split_70_30,0.595833,0.579167,0.55,0.629167,0.545833,0.5875,0.541667,0.566667,0.5,0.520833
df_hog_256_20,k_fold_10,0.588924,0.599098,0.565301,0.602864,0.552753,0.577785,0.541408,0.560222,0.540127,0.548908
df_cnn_VGG16_AVG_128,train_test_split_70_30,0.558333,0.5375,0.575,0.579167,0.645833,0.591667,0.6375,0.6375,0.620833,0.6
df_cnn_VGG16_AVG_128,k_fold_10,0.548877,0.541297,0.59019,0.582674,0.616535,0.610285,0.609114,0.613972,0.639019,0.629114


## **5.Selecionando as 6 melhores bases**

Esta etapa foi realizada analisando a tabela no Excel.

In [10]:
# Name -> DataFrame mapping of the six datasets selected for the next steps.
# Each keyword below becomes a string key equal to the variable name.
melhores_bases_dict = dict(
    df_hog_128_16=df_hog_128_16,
    df_hog_128_20=df_hog_128_20,
    df_cnn_VGG16_AVG_128=df_cnn_VGG16_AVG_128,
    df_cnn_VGG19_AVG_128=df_cnn_VGG19_AVG_128,
    df_cnn_VGG16_MAX_128=df_cnn_VGG16_MAX_128,
    df_cnn_VGG19_MAX_128=df_cnn_VGG19_MAX_128,
)

## **6.Realizando o PCA com 10 Componentes**

In [11]:
# (name, DataFrame) pairs for the PCA-reduced copies created below
new_dataframes = []

# Iterate over a snapshot of the dictionary and skip entries that are already
# PCA outputs. This cell adds `<name>_PCA` keys to the same dictionary, so
# without the guard a second run would also reduce the `_PCA` frames and
# create `<name>_PCA_PCA` entries.
for base_name, base in list(melhores_bases_dict.items()):
    if base_name.endswith('_PCA'):
        continue

    # First column holds the class label; the remaining columns are the features
    y = base.iloc[:, 0]
    X = base.iloc[:, 1:]

    # Standardise the features before PCA.
    # NOTE(review): the scaler and the PCA are fitted on the whole dataset, i.e.
    # before the train/test and K-Fold splits applied later - confirm that this
    # is acceptable for the reported accuracies.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Keep 10 principal components. random_state makes the result reproducible
    # when scikit-learn selects a randomized SVD solver.
    pca = PCA(n_components=10, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    # Rebuild a DataFrame: label first, then PC_1..PC_n
    df_pca = pd.DataFrame(X_pca, columns=[f'PC_{i+1}' for i in range(X_pca.shape[1])])
    # Insert the labels by position (.values) so the result does not depend on
    # the index of `base` matching the fresh 0..n-1 index of df_pca.
    df_pca.insert(0, 'label', y.values)

    # Reduced datasets are stored under the original name plus the '_PCA' suffix
    df_name_pca = f'{base_name}_PCA'
    new_dataframes.append((df_name_pca, df_pca))

# Register the PCA-reduced frames next to the original ones
for df_name_pca, df_pca in new_dataframes:
    melhores_bases_dict[df_name_pca] = df_pca

In [13]:
# Parallel lists taken from the dictionary: names (keys) and frames (values),
# both in the dictionary's insertion order.
melhores_bases_nomes = [*melhores_bases_dict]
melhores_bases = [*melhores_bases_dict.values()]

In [18]:
# Display the dataset names that will be evaluated in the next section
melhores_bases_nomes

['df_hog_128_16',
 'df_hog_128_20',
 'df_cnn_VGG16_AVG_128',
 'df_cnn_VGG19_AVG_128',
 'df_cnn_VGG16_MAX_128',
 'df_cnn_VGG19_MAX_128',
 'df_hog_128_16_PCA',
 'df_hog_128_20_PCA',
 'df_cnn_VGG16_AVG_128_PCA',
 'df_cnn_VGG19_AVG_128_PCA',
 'df_cnn_VGG16_MAX_128_PCA',
 'df_cnn_VGG19_MAX_128_PCA']

## **7.Aplicando os Códigos**

In [14]:
# Empty results table: one row per (dataset, evaluation method) pair and one
# column per neighbour count 1..10
acuracias_2 = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [melhores_bases_nomes, ['train_test_split_70_30', 'k_fold_10']],
        names=["Base", "Método"],
    ),
    columns=range(1, 11),
)

In [15]:
# Evaluate every selected dataset with both protocols and fill the results table
for nome, dados in zip(melhores_bases_nomes, melhores_bases):
    # First column holds the class label; the remaining columns are the features
    classes = dados.iloc[:, 0]
    atributos = dados.iloc[:, 1:]

    # Hold-out (70/30) evaluation first, then 10-fold cross-validation
    resultado_holdout = calcular_acuracia_knn(atributos, classes)
    resultado_kfold = calcular_acuracia_knn_kfold(atributos.values, classes.values)

    # Write one row per evaluation method for this dataset
    for metodo, valores in (
        ('train_test_split_70_30', resultado_holdout),
        ('k_fold_10', resultado_kfold),
    ):
        acuracias_2.loc[(nome, metodo)] = valores

# Persist the table and display it as the cell output
acuracias_2.to_excel('acuracias_resultados_2.xlsx', index=True)
acuracias_2

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,7,8,9,10
Base,Método,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
df_hog_128_16,train_test_split_70_30,0.55,0.5875,0.533333,0.5625,0.554167,0.558333,0.516667,0.55,0.520833,0.516667
df_hog_128_16,k_fold_10,0.611614,0.632927,0.563908,0.59644,0.551377,0.561361,0.537627,0.553877,0.540095,0.543845
df_hog_128_20,train_test_split_70_30,0.575,0.6125,0.583333,0.616667,0.55,0.579167,0.5375,0.5625,0.529167,0.554167
df_hog_128_20,k_fold_10,0.634161,0.654177,0.610316,0.63413,0.586551,0.600269,0.570174,0.582674,0.567706,0.585222
df_cnn_VGG16_AVG_128,train_test_split_70_30,0.558333,0.5375,0.575,0.579167,0.645833,0.591667,0.6375,0.6375,0.620833,0.6
df_cnn_VGG16_AVG_128,k_fold_10,0.548877,0.541297,0.59019,0.582674,0.616535,0.610285,0.609114,0.613972,0.639019,0.629114
df_cnn_VGG19_AVG_128,train_test_split_70_30,0.616667,0.6125,0.558333,0.645833,0.6375,0.629167,0.629167,0.633333,0.6125,0.616667
df_cnn_VGG19_AVG_128,k_fold_10,0.602674,0.598766,0.627785,0.650332,0.640285,0.666614,0.655348,0.661551,0.672911,0.676614
df_cnn_VGG16_MAX_128,train_test_split_70_30,0.579167,0.595833,0.579167,0.6125,0.583333,0.583333,0.583333,0.579167,0.595833,0.604167
df_cnn_VGG16_MAX_128,k_fold_10,0.60769,0.615316,0.605237,0.606582,0.617801,0.624035,0.619035,0.634066,0.619082,0.629098


NameError: name 'acuracias_2' is not defined

##### **SALVANDO A BASE PCA PARA USO NO FUTURO**

In [24]:
# Output folder for the PCA-reduced datasets (created if it does not exist)
diretorio_saida = './datasets'
os.makedirs(diretorio_saida, exist_ok=True)

# Write each PCA-reduced dataset to '<name>.csv'. The file name is derived from
# the dictionary key, so key and file name cannot get out of sync.
for nome_pca in (
    'df_hog_128_16_PCA',
    'df_hog_128_20_PCA',
    'df_cnn_VGG16_AVG_128_PCA',
    'df_cnn_VGG19_AVG_128_PCA',
    'df_cnn_VGG16_MAX_128_PCA',
    'df_cnn_VGG19_MAX_128_PCA',
):
    melhores_bases_dict[nome_pca].to_csv(
        os.path.join(diretorio_saida, f'{nome_pca}.csv'), index=False
    )