In [1]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score, cross_val_predict
import xgboost as xgb
import lightgbm as lgb
from tabpfn import TabPFNClassifier
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import optuna
import plotly.graph_objects as go
import seaborn as sns


In [2]:
df_completo = pd.read_csv(r"C:\Users\alefs\OneDrive\Documentos\GitHub\tcp\01_dados\data_mobilidade.csv")

list(df_completo.columns)

['Unnamed: 0',
 'id',
 'id_seq',
 'transicao',
 'renda_depois',
 'idade',
 'genero',
 'UF',
 'escolaridade',
 'raça',
 'posicao_familiar',
 'regiao']

In [3]:
df_completo = df_completo.drop(['Unnamed: 0', 'renda_depois', 'UF', 'id', 'id_seq'], axis = "columns")
df_completo['transicao'] = df_completo['transicao'].map({'Migrou para TCP': 1, "Migrou para o setor privado": 0})
df_completo['genero'] = df_completo['genero'].map({'Homem': 1, "Mulher": 0})
df_completo['escolaridade'] = df_completo['escolaridade'].map({'Ensino médio completo': 1, "Ensino médio incompleto": 0})
df_completo['raça'] = df_completo['raça'].map({'Branco': 1, 'Não branco': 0})
df_completo['posicao_familiar'] = df_completo['posicao_familiar'].map({'Responsável pelo domicílio': 1,
                                                                      'Não responsável pelo domicílio': 0})

# Encoder
drop_enc = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Ajuste e transformação
df_completo_encoded = pd.DataFrame(
    drop_enc.fit_transform(df_completo[['regiao']]),
    columns=drop_enc.get_feature_names_out(['regiao']),
    index=df_completo.index
)

# Substituir no original
df_encod = df_completo.drop(columns=['regiao']).join(df_completo_encoded)

df_encod

Unnamed: 0,transicao,idade,genero,escolaridade,raça,posicao_familiar,regiao_Nordeste,regiao_Norte,regiao_Sudeste,regiao_Sul
0,0,20,0,1,0,0,0.0,1.0,0.0,0.0
1,0,40,0,1,0,0,0.0,1.0,0.0,0.0
2,1,27,0,0,0,0,0.0,1.0,0.0,0.0
3,0,19,0,1,0,0,0.0,1.0,0.0,0.0
4,0,26,1,1,1,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
558,1,29,1,1,1,0,0.0,0.0,0.0,0.0
559,0,18,0,1,0,0,0.0,0.0,0.0,0.0
560,1,33,0,1,0,0,0.0,0.0,0.0,0.0
561,1,38,1,1,0,0,0.0,0.0,0.0,0.0


In [4]:
x = df_encod.drop(columns=['transicao'])
y = df_encod['transicao'] 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y, random_state = 42)

In [5]:
smote = SMOTE(sampling_strategy = 'auto', random_state = 42)
X_train_balanceado, y_train_balanceado = smote.fit_resample(X_train, y_train)

In [6]:
resultados_trials = []

def objective(trial):
    
    model_name = trial.suggest_categorical('model_name', [
        'Logistic Regression',
        'Decision Tree',
        'Random Forest',
        'XGBoost',
        'LightGBM',
        'AdaBoost'     
    ])
    
    # Inicialização do modelo
    model = None
    
    if model_name == 'Logistic Regression':
        C = trial.suggest_float('logreg', 1e-3, 10, log=True)
        model = LogisticRegression(C=C, solver='lbfgs', 
                                   max_iter=1000, 
                                   random_state=42)
        
    elif model_name == 'Decision Tree':
        max_depth = trial.suggest_int("dt_max_depth", 2, 32)
        min_samples_split = trial.suggest_int('dt_min_sample_split', 2, 10)
        model = DecisionTreeClassifier(max_depth=max_depth, 
                                       min_samples_split=min_samples_split, 
                                       random_state=42)
        
    elif model_name == 'Random Forest':
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
        max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, 
                                       max_depth=max_depth, 
                                       random_state=42)
        
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('xgb_n_estimators', 50, 300)
        max_depth = trial.suggest_int('xgb_max_depth', 3, 10)
        learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.3)
        model = xgb.XGBClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   use_label_encoder=False,
                                   eval_metric='logloss',
                                   random_state=42)
        
    elif model_name == "LightGBM":
        n_estimators = trial.suggest_int("lgb_n_estimators", 50, 300)
        max_depth = trial.suggest_int("lgb_max_depth", 3, 10)
        learning_rate = trial.suggest_float("lgb_learning_rate", 0.01, 0.3)
        model = lgb.LGBMClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    learning_rate=learning_rate,
                                    random_state=42)
        
    elif model_name == 'AdaBoost':
        n_estimators = trial.suggest_int('ada_n_estimators', 50, 300)
        learning_rate = trial.suggest_float('ada_learning_rate', 0.01, 1)
        model = AdaBoostClassifier(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   random_state=42)

    #Cross-validation
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv = skf, method='predict_proba')
    y_pred = np.argmax(y_pred_proba, axis = 1)
    
    #Métricas
    acc = accuracy_score(y_train, y_pred)
    prec = precision_score(y_train, y_pred, average = 'macro')
    rec = recall_score(y_train, y_pred, average = 'macro')
    f1 = f1_score(y_train, y_pred, average = 'macro')
    logloss = log_loss(y_train, y_pred_proba)
    roc_auc = roc_auc_score(y_train, y_pred_proba[:, 1])
    
    # Armazenar resultados
    resultados_trials.append({
        'model_name': model_name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'log_loss': logloss,
        'roc_auc': roc_auc,
        **trial.params
    })

    return -logloss  # porque optuna está maximizando

In [None]:
# Criando o objeto de estudo
study = optuna.create_study(direction='maximize',
                           sampler=optuna.samplers.TPESampler(seed=42))

# Iniciando a busca de hiperparâmetros
study.optimize(objective, n_trials=50)

# Mostra os resultados
print("Melhores hiperparâmetros:", study.best_trial.params)

#Transformando resultados em DataFrame
df_resultados = pd.DataFrame(resultados_trials)

#Ordenando pelo log-loss
df_resultados_ordenado = df_resultados.sort_values(by='log_loss')
print(df_resultados_ordenado.head(10))

[I 2025-04-16 16:51:26,449] A new study created in memory with name: no-name-90493ebb-8c5f-4868-a7fe-f5544393bf65
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-16 16:51:26,475] Trial 0 finished with value: -1.0537410511395564 and parameters: {'model_name': 'Decision Tree', 'dt_max_depth': 3, 'dt_min_sample_split': 9}. Best is trial 0 with value: -1.0537410511395564.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-04-16 16:51:26,658] Trial 1 finished with value: -0.5222738173414797 and parameters: {'model_name': 'XGBoost', 'xgb_n_estimators': 95, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.09823025045826593}. Best is trial 1 with value: -0.5222738173414797.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder

[LightGBM] [Info] Number of positive: 60, number of negative: 235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 295, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203390 -> initscore=-1.365241
[LightGBM] [Info] Start training from score -1.365241
[LightGBM] [Info] Number of positive: 60, number of negative: 235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55
[LightGBM] [Info] Number of data points in the train set: 295, number of used features: 9
[LightGBM] [Info] [binary:BoostFromSco

[I 2025-04-16 16:51:28,066] Trial 7 finished with value: -0.668189654741698 and parameters: {'model_name': 'AdaBoost', 'ada_n_estimators': 139, 'ada_learning_rate': 0.28812516459050697}. Best is trial 6 with value: -0.4807505316306502.
[I 2025-04-16 16:51:28,129] Trial 8 finished with value: -0.5583658353345377 and parameters: {'model_name': 'LightGBM', 'lgb_n_estimators': 99, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.2464838142519019}. Best is trial 6 with value: -0.4807505316306502.


[LightGBM] [Info] Number of positive: 60, number of negative: 235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 295, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203390 -> initscore=-1.365241
[LightGBM] [Info] Start training from score -1.365241
[LightGBM] [Info] Number of positive: 60, number of negative: 235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55
[LightGBM] [Info] Number of data points in the train set: 295, number of used features: 9
[LightGBM] [Info] [binary:BoostFromSco

[I 2025-04-16 16:51:29,355] Trial 9 finished with value: -0.6850698404752172 and parameters: {'model_name': 'Random Forest', 'rf_n_estimators': 266, 'rf_max_depth': 21}. Best is trial 6 with value: -0.4807505316306502.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-16 16:51:29,400] Trial 10 finished with value: -0.4809651120519734 and parameters: {'model_name': 'Logistic Regression', 'logreg': 0.0010567900894501644}. Best is trial 6 with value: -0.4807505316306502.
[I 2025-04-16 16:51:29,457] Trial 11 finished with value: -0.4819572323784015 and parameters: {'model_name': 'Logistic Regression', 'logreg': 0.36240382475490507}. Best is trial 6 with value: -0.4807505316306502.
[I 2025-04-16 16:51:29,498] Trial 12 finished with value: -0.480723228231019 and parameters: {'model_name': 'Logistic Regression', 'logreg': 0.0033339567491445597}. Best is trial 12 with value: -0.480723228231019.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(re

In [12]:
# Recuperar os melhores hiperparâmetros encontrados
melhores_params = study.best_trial.params

# Treinar o modelo vencedor (XGBoost) com os melhores hiperparâmetros
melhor_modelo = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    **melhores_params  # passa os hiperparâmetros otimizados aqui
)

melhor_modelo.fit(X_train_balanceado, y_train_balanceado)

# Fazer previsões no conjunto de teste
y_pred = melhor_modelo.predict(X_test)
y_prob = melhor_modelo.predict_proba(X_test)[:, 1]

# Avaliar o desempenho
print("Avaliação no conjunto de teste:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Log Loss:", log_loss(y_test, y_prob))


Avaliação no conjunto de teste:
Accuracy: 0.6568047337278107
Precision: 0.12903225806451613
Recall: 0.11428571428571428
F1 Score: 0.12121212121212122
ROC AUC: 0.5551172707889126
Log Loss: 0.9359190771241391


Parameters: { "model_name", "use_label_encoder", "xgb_learning_rate", "xgb_max_depth", "xgb_n_estimators" } are not used.

