# Gradient Boost

## Importações

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, train_test_split
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as IMBPipeline
from scipy.stats import randint

from model_pipeline import *

# Reproducibility: one seeded generator is the source of the random_state
# values handed out through rng_int() in this notebook.
seed = 777
rng = np.random.default_rng(seed=seed)

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")


def rng_int():
    """Draw the next integer from the shared seeded generator.

    Returns:
        An integer in the half-open range [1, 10000), suitable for use as a
        ``random_state`` value.

    Each call advances ``rng``, so the value obtained depends on how many
    draws were made before it (i.e. on cell execution order).
    """
    return rng.integers(low=1, high=10000)

## Preparação e separação do conjunto de dados

In [2]:
# Load the dataset from the preprocessed data directory.
df = pd.read_csv('../../data/preprocessed/_90_drp_ohe.csv')

# 'Dropout' is the target column; every other column is used as a feature.
y = df['Dropout']
X = df.drop(columns=['Dropout'])

# Keep 80% of the rows for training, stratified on the target. The remaining
# 20% is discarded here (bound to `_`) and is not used in this notebook.
X_train, _, y_train, _ = train_test_split(
    X,
    y,
    train_size=.8,
    stratify=y,
    random_state=rng_int(),
)

## Configurando pipeline

In [3]:
# 5-fold shuffled cross-validation; the shuffle is seeded from the shared generator.
kfold = KFold(n_splits=5, shuffle=True, random_state=rng_int())

# Resample -> scale -> classify. Both stochastic steps receive an explicit
# random_state: SMOTETomek draws synthetic samples at random, and gradient
# boosting is randomized whenever subsample < 1.0 or max_features restricts
# the candidate features (both happen in the search space below). Without
# these seeds the notebook is not reproducible under Restart & Run All.
pipeline = IMBPipeline([
    ('smotetomek', SMOTETomek(random_state=rng_int())),
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(random_state=rng_int()))
])

# Parameter distributions for the random search. Keys use the
# '<step name>__<parameter>' form so they target the 'classifier' step.
param_dist = {
    'classifier__n_estimators': randint(50, 400),  # integers in [50, 400)
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__min_samples_split': [10, 20, 50, 100],
    'classifier__min_samples_leaf': [5, 10, 20, 50],
}

# NOTE(review): get_best_params is not defined in this notebook (presumably it
# comes from `from model_pipeline import *`); confirm which scoring metric it
# optimizes, since the label printed below calls the score an accuracy.
best_model, best_params, best_score = get_best_params(pipeline, param_dist, kfold, X_train, y_train)

print("Melhores parâmetros:", best_params)
print("Melhor acurácia:", best_score)

  File "c:\Users\Felipe Castro\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Melhores parâmetros: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 4, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 50, 'classifier__n_estimators': 132, 'classifier__subsample': 0.8}
Melhor acurácia: 0.7848431381106863


## Armazenando melhor modelo

In [4]:
# Hand the tuned pipeline and its score to dump_model for storage.
# NOTE(review): dump_model is not defined in this notebook (presumably it comes
# from `from model_pipeline import *`); confirm what it writes and where.
model_dump_path = '../dump/gradient_boost'
dump_model(best_model, model_dump_path, best_score)