# Decision Tree Model

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

## Dataset

In [2]:
# Dados Treinamento
X_train = pd.read_csv("/home/ds_deivisson/repos/datasets/X_training_classificacao.csv")
y_train = pd.read_csv("/home/ds_deivisson/repos/datasets/y_training_classificacao.csv")

# Dados Validacao
X_val = pd.read_csv("/home/ds_deivisson/repos/datasets/X_validation_classificacao.csv")
y_val = pd.read_csv("/home/ds_deivisson/repos/datasets/y_validation_classificacao.csv")


# Dados Teste
X_test = pd.read_csv("/home/ds_deivisson/repos/datasets/X_test_classificacao.csv")
y_test = pd.read_csv("/home/ds_deivisson/repos/datasets/y_test_classificacao.csv")

# Contagem
linhas = len(y_train) + len(y_val) + len(y_test)

treino_perc = len(y_train)/linhas*100
validacao_perc = len(y_val)/linhas*100
teste_perc = len(y_test)/linhas*100

print (f" Percentual Treino: {treino_perc:.2f}")
print (f" Percentual Validacao: {validacao_perc:.2f}")
print (f" Percentual Teste: {teste_perc:.2f}")

 Percentual Treino: 56.00
 Percentual Validacao: 24.00
 Percentual Teste: 20.00


## Features/ Label

In [3]:
features = [
 'customer_type',
 'age',
 'flight_distance',
 'class',
 'inflight_wifi_service',
 'departure_arrival_time_convenient',
 'ease_of_online_booking',
 'gate_location',
 'food_and_drink',
 'online_boarding',
 'seat_comfort',
 'inflight_entertainment',
 'on_board_service',
 'leg_room_service',
 'baggage_handling',
 'checkin_service',
 'inflight_service',
 'cleanliness',
 'departure_delay_in_minutes',
 'arrival_delay_in_minutes',
 'gender_Female',
 'type_of_travel_business_travel'
]

# Selecionando os dados

x_train_class = X_train.loc[:, features]
X_val_class = X_val.loc[:, features]
X_test_class = X_test.loc[:, features]

y_train_class = y_train.iloc[:, 0].to_numpy()
y_val_class = y_val.iloc[:, 0].to_numpy()
y_test_class = y_test.iloc[:, 0].to_numpy()


## Decision Tree Model - Dados Treino

In [4]:
resultados = []

for m in range(2, 21):
    dtc = DecisionTreeClassifier(
        max_depth=m,
        random_state=42
    )
    dtc.fit(x_train_class, y_train_class)

    y_pred_tr = dtc.predict(x_train_class)
    y_pred_val = dtc.predict(X_val_class)

    # metricas treino
    acc_tr = mt.accuracy_score(y_train_class, y_pred_tr)
    prec_tr = mt.precision_score(y_train_class, y_pred_tr)
    recall_tr = mt.recall_score(y_train_class, y_pred_tr)
    f1_tr = mt.f1_score(y_train_class, y_pred_tr, average="weighted")

    # metricas validação
    acc_val = mt.accuracy_score(y_val_class, y_pred_val)
    prec_val = mt.precision_score(y_val_class, y_pred_val)
    recall_val = mt.recall_score(y_val_class, y_pred_val)
    f1_val = mt.f1_score(y_val_class, y_pred_val, average="weighted")
    
    
    resultados.append({
        "max_depth": m,
        "acuracia_train": acc_tr,
        "precisao_train": prec_tr,
        "recall_train": recall_tr,
        "f1_train": f1_tr,
        "acuracia_val": acc_val,
        "precisao_val": prec_val,
        "recall_val": recall_val,
        "f1_val": f1_val
    })

df_results = pd.DataFrame(resultados)
df_results.sort_values("f1_val", ascending=False)

Unnamed: 0,max_depth,acuracia_train,precisao_train,recall_train,f1_train,acuracia_val,precisao_val,recall_val,f1_val
12,14,0.97373,0.981945,0.956981,0.973687,0.951028,0.954362,0.931547,0.950954
13,15,0.977412,0.989484,0.958063,0.977365,0.950352,0.955678,0.928502,0.950261
11,13,0.969827,0.980731,0.949026,0.969763,0.950352,0.956236,0.927909,0.950257
10,12,0.964642,0.976587,0.940976,0.964557,0.949419,0.956488,0.925384,0.949312
14,16,0.980845,0.989123,0.966431,0.980818,0.949291,0.950117,0.931918,0.94923
16,18,0.987851,0.99392,0.97795,0.987839,0.949162,0.946921,0.93511,0.949123
15,17,0.984693,0.991728,0.972795,0.984675,0.948711,0.94835,0.932437,0.948658
17,19,0.990609,0.995041,0.983232,0.990602,0.948583,0.943245,0.937783,0.948565
18,20,0.992608,0.99481,0.9881,0.992605,0.948132,0.940355,0.939936,0.948131
9,11,0.958588,0.963688,0.939863,0.958523,0.946073,0.948028,0.926349,0.945995


## Decision Tree model - predição para dados teste

In [5]:
# concatenacao dos dados treino e validação
X_train_full = np.vstack([x_train_class, X_val_class])
y_train_full = np.hstack([y_train_class, y_val_class])

# modelo final
dtc_final = DecisionTreeClassifier()
dtc_final.fit(X_train_full, y_train_full )

# Classes previstas
y_pred_test = dtc_final.predict(X_test_class)

# Probabilidades Previstas
y_proba_test = dtc_final.predict_proba(X_test_class)

print(mt.classification_report(y_test_class, y_pred_test))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95     14528
           1       0.94      0.94      0.94     11365

    accuracy                           0.95     25893
   macro avg       0.95      0.95      0.95     25893
weighted avg       0.95      0.95      0.95     25893



