# Avaliacao de Decision Tree
## Imports e funções

In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

def raca_para_especie(raca):
    if raca in ['basset_hound', 'saint_bernard']:
        return 'dog'
    elif raca in ['Birman', 'Persian']:
        return 'cat'
    else:
        return raca  # fallback

def separar_dataset(df):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    return X, y

def dtree_holdout(df, criterio, max_depth):
    X, y = separar_dataset(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    dtc = DecisionTreeClassifier(criterion=criterio, max_depth=max_depth)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    confusao = confusion_matrix(y_test, y_pred)
    return dtc, f1_score, confusao

def dtree_crossvalidation(df, criterio, max_depth):
    X, y = separar_dataset(df)
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    dtc = DecisionTreeClassifier(criterion=criterio, max_depth=max_depth)
    f1_score = cross_val_score(dtc, X, y, scoring='f1_weighted', cv=kf)
    y_pred = cross_val_predict(dtc, X, y, cv=kf)
    confusao = confusion_matrix(y, y_pred)
    return dtc, f1_score, confusao



## Lendo lista de arquivos a processar

In [2]:
datafiles = pd.read_csv('dataset_list.csv',encoding='utf-8')

datafiles.head(20)

Unnamed: 0,key,filename
0,hogfeat_128_16_4_9_pca,../Aula11/hogfeat_128_16_4_9_pca.csv.gz
1,hogfeat_256_64_2_18,../Aula11/hogfeat_256_64_2_18.csv.gz
2,hogfeat_256_64_2_9,../Aula11/hogfeat_256_64_2_9.csv.gz
3,hogfeat_128_16_4_9,../Aula11/hogfeat_128_16_4_9.csv.gz
4,hogfeat_256_32_2_9_pca,../Aula11/hogfeat_256_32_2_9_pca.csv.gz
5,hogfeat_128_16_2_9_pca,../Aula11/hogfeat_128_16_2_9_pca.csv.gz
6,hogfeat_128_32_2_9,../Aula11/hogfeat_128_32_2_9.csv.gz
7,lbpfeat_256_6_48,../Aula11/lbpfeat_256_6_48.csv.gz
8,lbpfeat_256_12_96,../Aula11/lbpfeat_256_12_96.csv.gz
9,hogfeat_256_32_2_9,../Aula11/hogfeat_256_32_2_9.csv.gz


## Lendo Dataframes e traduzindo Raça para especie

In [3]:

dfs = {}
shapes = []
last_cols = []

for metadata in datafiles.itertuples():
    # Imprime o arquivo que está sendo lido
    #print(f"Lendo arquivo: {metadata.key}")
    # Carrega o DataFrame
    df = pd.read_csv(metadata.filename)
    
    # Aplica a função raca_para_especie na coluna raca
    if 'raca' in df.columns:
        df['especie'] = df['raca'].apply(raca_para_especie)
        df = df.drop('raca', axis=1)
    
    # Elimina a coluna nome_arquivo se existir
    if 'nome_arquivo' in df.columns:
        df = df.drop('nome_arquivo', axis=1)
    
    dfs[metadata.key] = df
    shapes.append(df.shape)
    # Pega as últimas duas colunas
    last_cols.append(df.columns[-1:].tolist())

# Adiciona as colunas shape e last_two_columns ao datafiles
datafiles['shape'] = shapes
datafiles['last_column'] = last_cols

datafiles
    

Unnamed: 0,key,filename,shape,last_column
0,hogfeat_128_16_4_9_pca,../Aula11/hogfeat_128_16_4_9_pca.csv.gz,"(800, 104)",[especie]
1,hogfeat_256_64_2_18,../Aula11/hogfeat_256_64_2_18.csv.gz,"(800, 649)",[especie]
2,hogfeat_256_64_2_9,../Aula11/hogfeat_256_64_2_9.csv.gz,"(800, 325)",[especie]
3,hogfeat_128_16_4_9,../Aula11/hogfeat_128_16_4_9.csv.gz,"(800, 3601)",[especie]
4,hogfeat_256_32_2_9_pca,../Aula11/hogfeat_256_32_2_9_pca.csv.gz,"(800, 94)",[especie]
5,hogfeat_128_16_2_9_pca,../Aula11/hogfeat_128_16_2_9_pca.csv.gz,"(800, 118)",[especie]
6,hogfeat_128_32_2_9,../Aula11/hogfeat_128_32_2_9.csv.gz,"(800, 325)",[especie]
7,lbpfeat_256_6_48,../Aula11/lbpfeat_256_6_48.csv.gz,"(800, 51)",[especie]
8,lbpfeat_256_12_96,../Aula11/lbpfeat_256_12_96.csv.gz,"(800, 99)",[especie]
9,hogfeat_256_32_2_9,../Aula11/hogfeat_256_32_2_9.csv.gz,"(800, 1765)",[especie]


## Percorrendo espaço de busca para critérios, profundidades e tipos de treinamento pré-definidos

In [None]:
# Montando Gridsearch
par_criterios = ['gini', 'entropy', 'log_loss']
par_max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
par_training = ["holdout", "crossvalidation"]

# Estrutura para armazenar resultados
resultados = []

for criterio in par_criterios:
    for max_depth in par_max_depth:
        for training in par_training:
            for metadata in datafiles.itertuples():
                print(f"Criterio: {criterio}, Max Depth: {max_depth}, Training: {training}")
                print(f"Dataset: {metadata.key}")
                
                if training == "holdout":
                    dtc, f1_score, confusao = dtree_holdout(dfs[metadata.key], criterio, max_depth)
                else:
                    dtc, f1_score, confusao = dtree_crossvalidation(dfs[metadata.key], criterio, max_depth)
                
                # Armazena resultados no DataFrame
                resultado = {
                    'dataset': metadata.key,
                    'criterio': criterio,
                    'max_depth': max_depth,
                    'training_type': training,
                    'f1_score': f1_score if isinstance(f1_score, float) else f1_score.mean(),
                    'f1_std': f1_score.std() if hasattr(f1_score, 'std') else 0,
                    'confusion_matrix': confusao,
                    'model': dtc
                }
                resultados.append(resultado)
                print(f"f1_score: {resultado['f1_score']}, f1_std: {resultado['f1_std']}")
                print("-" * 50)

# Converte para DataFrame
df_resultados = pd.DataFrame(resultados)
                
df_resultados['shape'] = df_resultados['dataset'].apply(lambda x: datafiles.loc[datafiles['key'] == x, 'shape'].values[0])



Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_128_16_4_9_pca
f1_score: 0.6591079612818743, f1_std: 0
--------------------------------------------------
Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_256_64_2_18
f1_score: 0.6130401897052209, f1_std: 0
--------------------------------------------------
Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_256_64_2_9
f1_score: 0.6178713597318249, f1_std: 0
--------------------------------------------------
Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_128_16_4_9
f1_score: 0.6476869565217391, f1_std: 0
--------------------------------------------------
Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_256_32_2_9_pca
f1_score: 0.725, f1_std: 0
--------------------------------------------------
Criterio: gini, Max Depth: 2, Training: holdout
Dataset: hogfeat_128_16_2_9_pca
f1_score: 0.7128744074594988, f1_std: 0
-------------------------------------------------

## Salvando resultados em CSV e Joblib

In [13]:
# Adiciona coluna shape no df_resultados buscando no datafiles
df_resultados['shape'] = df_resultados['dataset'].apply(lambda x: datafiles.loc[datafiles['key'] == x, 'shape'].values[0])

import joblib
joblib.dump(df_resultados, 'dtree_avaliacao_resultados.joblib')

df_csv = df_resultados.drop(['confusion_matrix', 'model'], axis=1)
df_csv.to_csv('dtree_avaliacao_resultados.csv', index=False)

### Amostra do DataFrame de Resultados

In [14]:
# Visualiza os resultados
print(f"Total de combinações processadas: {len(df_resultados)}")
print(f"Colunas disponíveis: {df_resultados.columns.tolist()}")
df_resultados.head(10)


Total de combinações processadas: 1008
Colunas disponíveis: ['dataset', 'criterio', 'max_depth', 'training_type', 'f1_score', 'f1_std', 'confusion_matrix', 'model', 'shape']


Unnamed: 0,dataset,criterio,max_depth,training_type,f1_score,f1_std,confusion_matrix,model,shape
0,hogfeat_128_16_4_9_pca,gini,2,holdout,0.659108,0.0,"[[97, 9], [70, 64]]",DecisionTreeClassifier(max_depth=2),"(800, 104)"
1,hogfeat_256_64_2_18,gini,2,holdout,0.61304,0.0,"[[86, 20], [71, 63]]",DecisionTreeClassifier(max_depth=2),"(800, 649)"
2,hogfeat_256_64_2_9,gini,2,holdout,0.617871,0.0,"[[65, 41], [51, 83]]",DecisionTreeClassifier(max_depth=2),"(800, 325)"
3,hogfeat_128_16_4_9,gini,2,holdout,0.647687,0.0,"[[83, 23], [61, 73]]",DecisionTreeClassifier(max_depth=2),"(800, 3601)"
4,hogfeat_256_32_2_9_pca,gini,2,holdout,0.725,0.0,"[[73, 33], [33, 101]]",DecisionTreeClassifier(max_depth=2),"(800, 94)"
5,hogfeat_128_16_2_9_pca,gini,2,holdout,0.712874,0.0,"[[84, 22], [47, 87]]",DecisionTreeClassifier(max_depth=2),"(800, 118)"
6,hogfeat_128_32_2_9,gini,2,holdout,0.601956,0.0,"[[46, 60], [33, 101]]",DecisionTreeClassifier(max_depth=2),"(800, 325)"
7,lbpfeat_256_6_48,gini,2,holdout,0.692632,0.0,"[[78, 28], [46, 88]]",DecisionTreeClassifier(max_depth=2),"(800, 51)"
8,lbpfeat_256_12_96,gini,2,holdout,0.680257,0.0,"[[74, 32], [45, 89]]",DecisionTreeClassifier(max_depth=2),"(800, 99)"
9,hogfeat_256_32_2_9,gini,2,holdout,0.594916,0.0,"[[99, 7], [84, 50]]",DecisionTreeClassifier(max_depth=2),"(800, 1765)"
