# Árvore de decisão

## Importações

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as IMBPipeline
from scipy.stats import randint

from model_pipeline import *

import warnings

# Silence every warning raised from here on (blanket "ignore" filter).
warnings.filterwarnings("ignore")

# One seeded generator for the whole notebook; every random_state used below
# is drawn from it through rng_int().
seed = 777
rng = np.random.default_rng(seed)


def rng_int():
    """Draw the next integer from the notebook-wide generator.

    Returns:
        An integer in the half-open range [1, 10000). Each call advances
        ``rng``, so the sequence of values depends on the call order.
    """
    return rng.integers(low=1, high=10000)

## Preparação e separação do conjunto de dados

In [6]:
# Read the preprocessed dataset from disk.
df = pd.read_csv('../../data/preprocessed/_90_drp_ohe.csv')

# 'Dropout' is the target column; every other column is used as a feature.
y = df['Dropout']
X = df.drop(columns=['Dropout'])

# Keep 80% of the rows for training, stratified on the target. The held-out
# 20% is discarded in this cell (both test outputs are bound to `_`).
X_train, _, y_train, _ = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=rng_int(),
)

## Configurando pipeline

In [7]:
# 5-fold shuffled cross-validation; the shuffle seed comes from the seeded
# notebook generator.
kfold = KFold(n_splits=5, shuffle=True, random_state=rng_int())

# Resample -> scale -> classify.
# Both stochastic steps get an explicit random_state drawn from the seeded
# generator: SMOTETomek synthesises samples at random, and the tree is random
# whenever max_features < n_features or splitter='random' (both are part of
# the search space below). Without these seeds a fresh run would not reproduce
# the selected parameters or score. The seeds are drawn after the kfold seed,
# so the kfold seed itself is unchanged.
pipeline = IMBPipeline([
    ('smotetomek', SMOTETomek(random_state=rng_int())),
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier(random_state=rng_int()))
])

# Search space for the classifier step. The `classifier__` prefix routes each
# entry to the DecisionTreeClassifier inside the pipeline; randint(a, b)
# samples integers from [a, b).
param_dist = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_leaf': randint(1, 20),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__splitter': ['best', 'random'],
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# NOTE(review): get_best_params is not defined in this notebook (presumably it
# comes from the model_pipeline star import); confirm that it seeds its own
# search and which metric it reports as the score.
best_model, best_params, best_score = get_best_params(pipeline, param_dist, kfold, X_train, y_train)

print("Melhores parâmetros:", best_params)
print("Melhor acurácia:", best_score)

Melhores parâmetros: {'classifier__criterion': 'entropy', 'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 19, 'classifier__min_samples_split': 3, 'classifier__splitter': 'best'}
Melhor acurácia: 0.8012558869701726


## Armazenando melhor modelo

In [8]:
# Hand the tuned pipeline, a target path and its score to dump_model.
# NOTE(review): dump_model is not defined in this notebook (presumably it comes
# from the model_pipeline star import); confirm how it uses the path and the
# score (e.g. file naming or overwrite policy).
dump_model(best_model, '../dump/decision_tree', best_score)