# **Decision Tree**
---
---

### **0.Importando as Bibliotecas**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from openpyxl import Workbook

### **1.Importando as Bases**

Neste caso, vamos trabalhar com as 6 melhores bases obtidas com o método K-NN, além das versões dessas mesmas bases transformadas por PCA com 10 componentes.

In [4]:
# Original feature tables, read in the same order as they are listed.
(
    hog_128_16,
    hog_128_20,
    cnn_VGG16_AVG_128,
    cnn_VGG19_AVG_128,
    cnn_VGG16_MAX_128,
    cnn_VGG19_MAX_128,
) = map(pd.read_csv, (
    '../datasets/hog_128_16.csv',
    '../datasets/hog_128_20.csv',
    '../datasets/cnn_VGG16_AVG_128.csv',
    '../datasets/cnn_VGG19_AVG_128.csv',
    '../datasets/cnn_VGG16_MAX_128.csv',
    '../datasets/cnn_VGG19_MAX_128.csv',
))

# PCA-transformed counterparts of the six tables above (files suffixed "_PCA").
(
    hog_128_16_PCA,
    hog_128_20_PCA,
    cnn_VGG16_AVG_128_PCA,
    cnn_VGG19_AVG_128_PCA,
    cnn_VGG16_MAX_128_PCA,
    cnn_VGG19_MAX_128_PCA,
) = map(pd.read_csv, (
    '../datasets/hog_128_16_PCA.csv',
    '../datasets/hog_128_20_PCA.csv',
    '../datasets/cnn_VGG16_AVG_128_PCA.csv',
    '../datasets/cnn_VGG19_AVG_128_PCA.csv',
    '../datasets/cnn_VGG16_MAX_128_PCA.csv',
    '../datasets/cnn_VGG19_MAX_128_PCA.csv',
))


### **3.Código**

##### 3.1.Instanciando um dicionário com todos os DataFrames

In [None]:
# Name -> DataFrame mapping used to iterate over every dataset.
# Built from ordered (key, frame) pairs, so iteration follows the listing order:
# the six original tables first, then their PCA versions.
dataframes = dict([
    ('hog_128_16', hog_128_16),
    ('hog_128_20', hog_128_20),
    ('cnn_VGG16_AVG_128', cnn_VGG16_AVG_128),
    ('cnn_VGG19_AVG_128', cnn_VGG19_AVG_128),
    ('cnn_VGG16_MAX_128', cnn_VGG16_MAX_128),
    ('cnn_VGG19_MAX_128', cnn_VGG19_MAX_128),
    ('hog_128_16_PCA', hog_128_16_PCA),
    ('hog_128_20_PCA', hog_128_20_PCA),
    ('cnn_VGG16_AVG_128_PCA', cnn_VGG16_AVG_128_PCA),
    ('cnn_VGG19_AVG_128_PCA', cnn_VGG19_AVG_128_PCA),
    ('cnn_VGG16_MAX_128_PCA', cnn_VGG16_MAX_128_PCA),
    ('cnn_VGG19_MAX_128_PCA', cnn_VGG19_MAX_128_PCA),
])

##### 3.2.Definindo o DataFrame resultante

In [None]:
# Results table that will receive the accuracies.
# Rows: one (dataset name, evaluation scheme) pair per dataset and scheme,
# with '70/30' listed before '10-fold CV' for each dataset.
multi_index = [
    (name, scheme)
    for name in dataframes
    for scheme in ('70/30', '10-fold CV')
]

# Columns: the max_depth values 2..10 (inclusive).
columns = [*range(2, 11)]

# Cells start out empty (NaN) and are filled in by the evaluation loop.
accuracy_df = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(multi_index),
    columns=columns,
)


##### 3.3.Iterando sobre cada DataFrame

O que iremos fazer é:
* Realizar uma busca em grade (GridSearchCV) para achar o melhor `ccp_alpha` de cada árvore, isto é, o parâmetro da poda por custo-complexidade (pós-poda);
* Treinar uma árvore de decisão para cada uma das bases, avaliando-a com os métodos train_test_split (70/30) e k-fold com k=10;
* Realizar uma pré-poda, limitando a profundidade com o parâmetro `max_depth` indo de 2 até 10.

In [None]:
def _depth_sweep_accuracies(X_fit, y_fit, X_eval, y_eval, max_depths, seed=42, inner_cv=5):
    """Tune ccp_alpha on the fitting data, then score one tree per depth cap.

    Steps:
      1. Compute the cost-complexity pruning path of an unconstrained tree on
         (X_fit, y_fit) to obtain the candidate ccp_alpha values.
      2. Pick the best candidate with GridSearchCV (``inner_cv`` folds), using
         only the fitting data.
      3. For every value in ``max_depths``, fit a tree with that depth cap and
         the selected ccp_alpha on (X_fit, y_fit) and measure its accuracy on
         (X_eval, y_eval).

    Parameters
    ----------
    X_fit, y_fit : features / labels used for alpha selection and for training.
    X_eval, y_eval : held-out features / labels used only for scoring.
    max_depths : iterable of int depth caps to evaluate.
    seed : random_state passed to every DecisionTreeClassifier.
    inner_cv : number of folds used by GridSearchCV.

    Returns
    -------
    list of float
        One accuracy per entry of ``max_depths``, in the same order.
    """
    base_tree = DecisionTreeClassifier(random_state=seed)
    alphas = base_tree.cost_complexity_pruning_path(X_fit, y_fit).ccp_alphas
    # ccp_alpha must be non-negative, but the path can reportedly contain values
    # that are marginally below zero because of floating-point round-off.
    # Clamp them to 0 and drop the duplicates this may create; np.unique also
    # keeps the candidates in ascending order.
    alphas = np.unique(np.clip(alphas, 0.0, None))

    search = GridSearchCV(base_tree, param_grid={'ccp_alpha': alphas}, cv=inner_cv)
    search.fit(X_fit, y_fit)
    best_alpha = search.best_params_['ccp_alpha']

    scores = []
    for depth in max_depths:
        model = DecisionTreeClassifier(random_state=seed, ccp_alpha=best_alpha, max_depth=depth)
        model.fit(X_fit, y_fit)
        scores.append(accuracy_score(y_eval, model.predict(X_eval)))
    return scores


# The depth caps to evaluate are exactly the columns of the results table
# (2..10 as created in the setup cell), so the two can never drift apart.
max_depths = [int(depth) for depth in accuracy_df.columns]

# Evaluate every dataset under both schemes and fill in its two rows.
for name, df in dataframes.items():
    # The first column is taken as the label; all remaining columns are features.
    y = df.iloc[:, 0]
    X = df.iloc[:, 1:]

    # ---- Hold-out evaluation: 70 % train / 30 % test ----
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    holdout_scores = _depth_sweep_accuracies(X_train, y_train, X_test, y_test, max_depths)
    for max_depth, score in zip(max_depths, holdout_scores):
        accuracy_df.loc[(name, '70/30'), max_depth] = score

    # ---- 10-fold cross-validation ----
    # ccp_alpha is re-tuned inside every fold using that fold's training part only.
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)
    accuracies_kfold = [
        _depth_sweep_accuracies(
            X.iloc[train_index], y.iloc[train_index],
            X.iloc[test_index], y.iloc[test_index],
            max_depths,
        )
        for train_index, test_index in kfold.split(X)
    ]

    # Average over folds (axis 0) to get one mean accuracy per depth cap.
    mean_accuracies_kfold = np.mean(accuracies_kfold, axis=0)
    for max_depth, score in zip(max_depths, mean_accuracies_kfold):
        accuracy_df.loc[(name, '10-fold CV'), max_depth] = score

### **Salvando o DataFrame em um Arquivo Excel**

In [None]:
# Show the accuracy table: rows are (dataset, evaluation scheme) pairs,
# columns are the max_depth values.
accuracy_df

Unnamed: 0,Unnamed: 1,2,3,4,5,6,7,8,9,10
hog_128_16,70/30,0.6,0.570833,0.570833,0.570833,0.570833,0.6,0.6,0.6,0.6
hog_128_16,10-fold CV,0.563908,0.545127,0.541297,0.555142,0.555142,0.553892,0.553892,0.551392,0.552642
hog_128_20,70/30,0.533333,0.595833,0.604167,0.616667,0.6125,0.595833,0.625,0.608333,0.616667
hog_128_20,10-fold CV,0.624035,0.607769,0.609003,0.598972,0.59894,0.600222,0.598924,0.59894,0.602737
cnn_VGG16_AVG_128,70/30,0.629167,0.65,0.608333,0.641667,0.641667,0.641667,0.641667,0.641667,0.641667
cnn_VGG16_AVG_128,10-fold CV,0.649082,0.646582,0.652785,0.647785,0.659035,0.650285,0.660285,0.657785,0.656535
cnn_VGG19_AVG_128,70/30,0.629167,0.633333,0.645833,0.658333,0.658333,0.658333,0.658333,0.658333,0.65
cnn_VGG19_AVG_128,10-fold CV,0.63288,0.63913,0.641646,0.640396,0.644146,0.645396,0.642896,0.645396,0.644146
cnn_VGG16_MAX_128,70/30,0.6375,0.6375,0.6375,0.6375,0.6375,0.6375,0.6375,0.6375,0.6375
cnn_VGG16_MAX_128,10-fold CV,0.631551,0.645396,0.642848,0.641614,0.640348,0.640348,0.640348,0.644098,0.640348


In [None]:
# Write the accuracy table to an Excel workbook; the path is relative, so the
# file lands in the notebook's current working directory.
accuracy_df.to_excel('decision_tree_accuracies.xlsx')