In [1]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score, cross_val_predict
import xgboost as xgb
import lightgbm as lgb
from tabpfn import TabPFNClassifier
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import optuna
import plotly.graph_objects as go
import seaborn as sns


In [2]:
df_completo = pd.read_csv(r"C:\Users\alefs\OneDrive\Documentos\GitHub\tcp\01_dados\data_mobilidade.csv")

list(df_completo.columns)

['Unnamed: 0',
 'id',
 'id_seq',
 'transicao',
 'renda_depois',
 'idade',
 'genero',
 'UF',
 'escolaridade',
 'raça',
 'posicao_familiar',
 'regiao']

In [3]:
df_completo = df_completo.drop(['Unnamed: 0', 'renda_depois', 'UF', 'id', 'id_seq'], axis = "columns")
df_completo['transicao'] = df_completo['transicao'].map({'Migrou para TCP': 1, "Migrou para o setor privado": 0})
df_completo['genero'] = df_completo['genero'].map({'Homem': 1, "Mulher": 0})
df_completo['escolaridade'] = df_completo['escolaridade'].map({'Ensino médio completo': 1, "Ensino médio incompleto": 0})
df_completo['raça'] = df_completo['raça'].map({'Branco': 1, 'Não branco': 0})
df_completo['posicao_familiar'] = df_completo['posicao_familiar'].map({'Responsável pelo domicílio': 1,
                                                                      'Não responsável pelo domicílio': 0})

# Encoder
drop_enc = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Ajuste e transformação
df_completo_encoded = pd.DataFrame(
    drop_enc.fit_transform(df_completo[['regiao']]),
    columns=drop_enc.get_feature_names_out(['regiao']),
    index=df_completo.index
)

# Substituir no original
df_encod = df_completo.drop(columns=['regiao']).join(df_completo_encoded)

df_encod

Unnamed: 0,transicao,idade,genero,escolaridade,raça,posicao_familiar,regiao_Nordeste,regiao_Norte,regiao_Sudeste,regiao_Sul
0,0,20,0,1,0,0,0.0,1.0,0.0,0.0
1,0,40,0,1,0,0,0.0,1.0,0.0,0.0
2,1,27,0,0,0,0,0.0,1.0,0.0,0.0
3,0,19,0,1,0,0,0.0,1.0,0.0,0.0
4,0,26,1,1,1,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
558,1,29,1,1,1,0,0.0,0.0,0.0,0.0
559,0,18,0,1,0,0,0.0,0.0,0.0,0.0
560,1,33,0,1,0,0,0.0,0.0,0.0,0.0
561,1,38,1,1,0,0,0.0,0.0,0.0,0.0


In [4]:
x = df_encod.drop(columns=['transicao'])
y = df_encod['transicao'] 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y, random_state = 42)

In [5]:
smote = SMOTE(sampling_strategy = 'auto', random_state = 42)
X_train_balanceado, y_train_balanceado = smote.fit_resample(X_train, y_train)

In [6]:
resultados_trials = []

def objective(trial):
    
    model_name = trial.suggest_categorical('model_name', [
        'Logistic Regression',
        'Decision Tree',
        'Random Forest',
        'XGBoost',
        'LightGBM',
        'AdaBoost'     
    ])
    
    # Inicialização do modelo
    model = None
    
    if model_name == 'Logistic Regression':
        C = trial.suggest_float('logreg', 1e-3, 10, log=True)
        model = LogisticRegression(C=C, solver='lbfgs', 
                                   max_iter=1000, 
                                   random_state=42)
        
    elif model_name == 'Decision Tree':
        max_depth = trial.suggest_int("dt_max_depth", 2, 32)
        min_samples_split = trial.suggest_int('dt_min_sample_split', 2, 10)
        model = DecisionTreeClassifier(max_depth=max_depth, 
                                       min_samples_split=min_samples_split, 
                                       random_state=42)
        
    elif model_name == 'Random Forest':
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
        max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, 
                                       max_depth=max_depth, 
                                       random_state=42)
        
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('xgb_n_estimators', 50, 300)
        max_depth = trial.suggest_int('xgb_max_depth', 3, 10)
        learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.3)
        model = xgb.XGBClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   use_label_encoder=False,
                                   eval_metric='logloss',
                                   random_state=42)
        
    elif model_name == "LightGBM":
        n_estimators = trial.suggest_int("lgb_n_estimators", 50, 300)
        max_depth = trial.suggest_int("lgb_max_depth", 3, 10)
        learning_rate = trial.suggest_float("lgb_learning_rate", 0.01, 0.3)
        model = lgb.LGBMClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    learning_rate=learning_rate,
                                    random_state=42)
        
    elif model_name == 'AdaBoost':
        n_estimators = trial.suggest_int('ada_n_estimators', 50, 300)
        learning_rate = trial.suggest_float('ada_learning_rate', 0.01, 1)
        model = AdaBoostClassifier(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   random_state=42)

    #Cross-validation
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    y_pred_proba = cross_val_predict(model, X_train_balanceado, y_train_balanceado, cv = skf, method='predict_proba')
    y_pred = np.argmax(y_pred_proba, axis = 1)
    
    #Métricas
    acc = accuracy_score(y_train_balanceado, y_pred)
    prec = precision_score(y_train_balanceado, y_pred, average = 'macro')
    rec = recall_score(y_train_balanceado, y_pred, average = 'macro')
    f1 = f1_score(y_train_balanceado, y_pred, average = 'macro')
    logloss = log_loss(y_train_balanceado, y_pred_proba)
    roc_auc = roc_auc_score(y_train_balanceado, y_pred_proba[:, 1])
    
    # Armazenar resultados
    resultados_trials.append({
        'model_name': model_name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'log_loss': logloss,
        'roc_auc': roc_auc,
        **trial.params
    })

    return -logloss  # porque optuna está maximizando

In [7]:
# Criando o objeto de estudo
study = optuna.create_study(direction='maximize',
                           sampler=optuna.samplers.TPESampler(seed=42))

# Iniciando a busca de hiperparâmetros
study.optimize(objective, n_trials=50)

# Mostra os resultados
print("Melhores hiperparâmetros:", study.best_trial.params)

#Transformando resultados em DataFrame
df_resultados = pd.DataFrame(resultados_trials)

#Ordenando pelo log-loss
df_resultados_ordenado = df_resultados.sort_values(by='log_loss')
print(df_resultados_ordenado.head(10))

[I 2025-04-16 15:07:10,400] A new study created in memory with name: no-name-9b4d93dd-fbbe-49eb-8ee1-30f0e91e0ec6
[I 2025-04-16 15:07:10,429] Trial 0 finished with value: -0.7101730469288752 and parameters: {'model_name': 'Decision Tree', 'dt_max_depth': 3, 'dt_min_sample_split': 9}. Best is trial 0 with value: -0.7101730469288752.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:10,620] Trial 1 finished with value: -0.3894043473293778 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 95, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.09823025045826593}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 

[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498938 -> initscore=-0.004246
[LightGBM] [Info] Start training from score -0.004246
[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFro

[I 2025-04-16 15:07:11,226] Trial 6 finished with value: -0.6506105930201872 and parameters: {'model_name': 'Logistic Regression', 'logreg': 0.002259279742015696}. Best is trial 1 with value: -0.3894043473293778.
[I 2025-04-16 15:07:12,168] Trial 7 finished with value: -0.6745328162277373 and parameters: {'model_name': 'AdaBoost', 'ada_n_estimators': 139, 'ada_learning_rate': 0.28812516459050697}. Best is trial 1 with value: -0.3894043473293778.
[I 2025-04-16 15:07:12,232] Trial 8 finished with value: -0.40313424498391415 and parameters: {'model_name': 'LightGBM', 'lgb_n_estimators': 99, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.2464838142519019}. Best is trial 1 with value: -0.3894043473293778.


[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498938 -> initscore=-0.004246
[LightGBM] [Info] Start training from score -0.004246
[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFro

[I 2025-04-16 15:07:13,591] Trial 9 finished with value: -0.444435252788154 and parameters: {'model_name': 'Random Forest', 'rf_n_estimators': 266, 'rf_max_depth': 21}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:13,734] Trial 10 finished with value: -0.5082692516321007 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 51, 'xgb_max_depth': 3, 'xgb_learning_rate': 0.034619903407986055}. Best is trial 1 with value: -0.3894043473293778.
[I 2025-04-16 15:07:13,839] Trial 11 finished with value: -0.4004988909840277 and parameters: {'model_name': 'LightGBM', 'lgb_n_estimators': 145, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.2062059901254062}. Best is trial 1 with value: -0.3894043473293778.


[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498938 -> initscore=-0.004246
[LightGBM] [Info] Start training from score -0.004246
[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFro

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:14,327] Trial 12 finished with value: -0.3946782689860008 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 299, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.08842479648255097}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:14,715] Trial 13 finished with value: -0.39219825946604275 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 273, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.08943905977938431}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_e

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:18,223] Trial 18 finished with value: -0.39323839727180715 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 190, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.05315018073184383}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:18,395] Trial 19 finished with value: -0.39915269343595505 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 86, 'xgb_max_depth': 3, 'xgb_learning_rate': 0.23079417685695425}. Best is trial 1 with value: -0.3894043473293778.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:18,784

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:20,846] Trial 24 finished with value: -0.40541970565840135 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 235, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.010655688978894056}. Best is trial 21 with value: -0.37917366695971777.
[I 2025-04-16 15:07:21,539] Trial 25 finished with value: -0.44830214968055515 and parameters: {'model_name': 'Random Forest', 'rf_n_estimators': 122, 'rf_max_depth': 32}. Best is trial 21 with value: -0.37917366695971777.
[I 2025-04-16 15:07:21,921] Trial 26 finished with value: -0.5941787957751409 and parameters: {'model_name': 'AdaBoost', 'ada_n_estimators': 53, 'ada_learning_rate': 0.02620506243249887}. Best is trial 21 with value: -0.37917366695971777.
[I 2025-04-16 15:07:21,953] Trial 27 finished with value: -4.005203120397

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:23,976] Trial 33 finished with value: -0.38631244570395196 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 210, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.03347633174481875}. Best is trial 32 with value: -0.3766648114039881.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:24,401] Trial 34 finished with value: -0.3952360636098269 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 202, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.06189192452345489}. Best is trial 32 with value: -0.3766648114039881.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label

[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498938 -> initscore=-0.004246
[LightGBM] [Info] Start training from score -0.004246
[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFro

[I 2025-04-16 15:07:25,459] Trial 38 finished with value: -0.6258639254466447 and parameters: {'model_name': 'Logistic Regression', 'logreg': 7.604331145003303}. Best is trial 35 with value: -0.37653098358893783.
[I 2025-04-16 15:07:25,494] Trial 39 finished with value: -4.762358588144103 and parameters: {'model_name': 'Decision Tree', 'dt_max_depth': 32, 'dt_min_sample_split': 6}. Best is trial 35 with value: -0.37653098358893783.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:25,893] Trial 40 finished with value: -0.4904149350936795 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 159, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.2869396190172007}. Best is trial 35 with value: -0.37653098358893783.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:29,837] Trial 45 finished with value: -0.3796027989341415 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 165, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.033231931982325054}. Best is trial 35 with value: -0.37653098358893783.
[I 2025-04-16 15:07:30,029] Trial 46 finished with value: -0.4291319338925528 and parameters: {'model_name': 'LightGBM', 'lgb_n_estimators': 297, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.07981771030583382}. Best is trial 35 with value: -0.37653098358893783.


[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498938 -> initscore=-0.004246
[LightGBM] [Info] Start training from score -0.004246
[LightGBM] [Info] Number of positive: 235, number of negative: 236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 9
[LightGBM] [Info] [binary:BoostFro

[I 2025-04-16 15:07:31,322] Trial 47 finished with value: -0.4877181707355847 and parameters: {'model_name': 'Random Forest', 'rf_n_estimators': 283, 'rf_max_depth': 4}. Best is trial 35 with value: -0.37653098358893783.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 15:07:31,660] Trial 48 finished with value: -0.3916958835741677 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 122, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.024086509221144733}. Best is trial 35 with value: -0.37653098358893783.
[I 2025-04-16 15:07:31,707] Trial 49 finished with value: -0.6519527075372276 and parameters: {'model_name': 'Logistic Regression', 'logreg': 0.0010735323637531588}. Best is trial 35 with value: -0.37653098358893783.


Melhores hiperparâmetros: {'model_name': 'XGBoost', 'xgb_n_estimators': 162, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.04469896453509295}
   model_name  accuracy  precision    recall  f1_score  log_loss   roc_auc  \
35    XGBoost  0.847134   0.847190  0.847134  0.847128  0.376531  0.914119   
32    XGBoost  0.842357   0.842527  0.842357  0.842337  0.376665  0.912557   
21    XGBoost  0.837580   0.837634  0.837580  0.837573  0.379174  0.910559   
45    XGBoost  0.842357   0.842638  0.842357  0.842324  0.379603  0.911031   
23    XGBoost  0.842357   0.842527  0.842357  0.842337  0.382726  0.908825   
41    XGBoost  0.840764   0.840820  0.840764  0.840758  0.383116  0.913146   
33    XGBoost  0.834395   0.834449  0.834395  0.834388  0.386312  0.908505   
22    XGBoost  0.829618   0.829621  0.829618  0.829617  0.387316  0.906441   
31    XGBoost  0.842357   0.842444  0.842357  0.842347  0.387919  0.911107   
43    XGBoost  0.829618   0.829782  0.829618  0.829597  0.388934  0.912801   

  

In [12]:
# Recuperar os melhores hiperparâmetros encontrados
melhores_params = study.best_trial.params

# Treinar o modelo vencedor (XGBoost) com os melhores hiperparâmetros
melhor_modelo = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    **melhores_params  # passa os hiperparâmetros otimizados aqui
)

melhor_modelo.fit(X_train_balanceado, y_train_balanceado)

# Fazer previsões no conjunto de teste
y_pred = melhor_modelo.predict(X_test)
y_prob = melhor_modelo.predict_proba(X_test)[:, 1]

# Avaliar o desempenho
print("Avaliação no conjunto de teste:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Log Loss:", log_loss(y_test, y_prob))


Avaliação no conjunto de teste:
Accuracy: 0.6568047337278107
Precision: 0.12903225806451613
Recall: 0.11428571428571428
F1 Score: 0.12121212121212122
ROC AUC: 0.5551172707889126
Log Loss: 0.9359190771241391


Parameters: { "model_name", "use_label_encoder", "xgb_learning_rate", "xgb_max_depth", "xgb_n_estimators" } are not used.

