In [1]:
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping,Dataset
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_curve
from scipy.stats import randint, uniform
import pickle

from optuna.integration import LightGBMTunerCV


import sys
import os

PROJECT_DIRECTORY = os.path.join(os.getcwd(), '..')
sys.path.append(PROJECT_DIRECTORY)

from src.features_processing import *
from src.model_evaluation import *

  from .autonotebook import tqdm as notebook_tqdm


# Carregando bases de treino e teste

In [2]:
AUX_VAR_PATH     = PROJECT_DIRECTORY + '/models/artefacts/aux_vars.pickle'
TARGET_VARS_PATH = PROJECT_DIRECTORY + '/models/artefacts/target_vars.pickle'
NUM_VARS_PATH    = PROJECT_DIRECTORY + '/models/artefacts/num_vars.pickle'
CAT_VARS_PATH    = PROJECT_DIRECTORY + '/models/artefacts/cat_vars.pickle'

CAT_IMPUTER_PATH = PROJECT_DIRECTORY + '/models/artefacts/cat_imputer.pickle'
NUM_IMPUTER_PATH = PROJECT_DIRECTORY + '/models/artefacts/num_imputer.pickle'

CAT_ENCONDER_PATH = PROJECT_DIRECTORY + '/models/artefacts/cat_encoder.pickle'
NUM_SCALER_PATH   = PROJECT_DIRECTORY + '/models/artefacts/num_scaler.pickle'

SELECTED_VARS_PATH = PROJECT_DIRECTORY + '/models/artefacts/selected_vars.pickle'


with open(AUX_VAR_PATH,'rb') as f:
    aux_vars = pickle.load(f)

with open(TARGET_VARS_PATH,'rb') as f:
    target_vars = pickle.load(f)

with open(SELECTED_VARS_PATH,'rb') as f:
    selected_vars = pickle.load(f)

In [3]:
abt_treino = pd.read_parquet(PROJECT_DIRECTORY + '/data/processed/abt_treino.parquet')
abt_teste = pd.read_parquet(PROJECT_DIRECTORY + '/data/processed/abt_teste.parquet')


abt_treino = tratamento_completo(df = abt_treino,
                                 aux_vars_path    = AUX_VAR_PATH,
                                 target_var_path  = TARGET_VARS_PATH,
                                 num_vars_path    = NUM_VARS_PATH,
                                 cat_vars_path    = CAT_VARS_PATH,
                                 cat_imputer_path = CAT_IMPUTER_PATH,
                                 num_imputer_path = NUM_IMPUTER_PATH,
                                 cat_encoder_path = CAT_ENCONDER_PATH,
                                 num_scaler_path  = NUM_SCALER_PATH)


abt_teste = tratamento_completo(df = abt_teste,
                                 aux_vars_path    = AUX_VAR_PATH,
                                 target_var_path  = TARGET_VARS_PATH,
                                 num_vars_path    = NUM_VARS_PATH,
                                 cat_vars_path    = CAT_VARS_PATH,
                                 cat_imputer_path = CAT_IMPUTER_PATH,
                                 num_imputer_path = NUM_IMPUTER_PATH,
                                 cat_encoder_path = CAT_ENCONDER_PATH,
                                 num_scaler_path  = NUM_SCALER_PATH)

# Preparando dados para Tunning dos Parâmetros

In [4]:
X_train  = abt_treino[selected_vars]
y_train  = abt_treino[target_vars]

X_test = abt_teste[selected_vars]
y_test = abt_teste[target_vars]

In [5]:
# Crie o dataset LightGBM
train_set = Dataset(X_train, label=y_train)

# Parâmetros fixos (são usados como base e depois otimizados)
base_params = {
    # 1) Definição do problema
    'objective': 'binary',       # classificação binária
    'metric': 'auc',             # métrica principal

    # 2) Tipo de boosting e paralelização
    'boosting_type': 'gbdt',     # Gradient Boosting clássico
    'n_jobs': -1,                # usa todos os núcleos disponíveis

    # 3) Controle de overfitting “por padrão”
    'feature_fraction': 0.8,     # amostra 80% das features por iteração
    'bagging_fraction': 0.8,     # amostra 80% das instâncias por iteração
    'bagging_freq': 1,           # faz bagging em todas as iterações
    'min_child_samples': 20,     # mínimo de observações em cada leaf

    # 4) Balanceamento de classes (se aplicável)
    # Caso haja muito desbalanceamento, descomente a linha abaixo
    # 'is_unbalance': True,  
    # 'scale_pos_weight': 0.28,


    # 5) Outros controles
    'verbosity': -1,             # sem logs verbosos
    'seed': 42,                  # reprodutibilidade
    'force_row_wise': True,      # processamento em linha (melhora em alguns casos)
}


folds = StratifiedKFold(
    n_splits=5,      
    shuffle=True,
    random_state=42
)

tuner = LightGBMTunerCV(
    params=base_params,        
    train_set=train_set,       
    folds=folds,               
    num_boost_round=200,      
    return_cvbooster=True      
)

tuner.run()

[I 2025-05-21 11:46:26,164] A new study created in memory with name: no-name-9f5e88b5-f7e7-405d-9429-7d81327775ed
feature_fraction, val_score: 0.793912:  14%|#4        | 1/7 [00:01<00:11,  1.85s/it][I 2025-05-21 11:46:28,014] Trial 0 finished with value: 0.7939115484080568 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.7939115484080568.
feature_fraction, val_score: 0.794071:  29%|##8       | 2/7 [00:03<00:08,  1.79s/it][I 2025-05-21 11:46:29,769] Trial 1 finished with value: 0.7940709455009853 and parameters: {'feature_fraction': 0.6}. Best is trial 1 with value: 0.7940709455009853.
feature_fraction, val_score: 0.794071:  43%|####2     | 3/7 [00:05<00:07,  1.76s/it][I 2025-05-21 11:46:31,494] Trial 2 finished with value: 0.7934659991612358 and parameters: {'feature_fraction': 0.5}. Best is trial 1 with value: 0.7940709455009853.
feature_fraction, val_score: 0.794071:  57%|#####7    | 4/7 [00:07<00:05,  1.93s/it][I 2025-05-21 11:46:33,670] Trial 3 finished with

In [6]:
print("Melhor AUC (CV):", tuner.best_score)
print("Melhores parâmetros:")
for k, v in tuner.best_params.items():
    print(f"  - {k}: {v}")

Melhor AUC (CV): 0.8108504130935648
Melhores parâmetros:
  - objective: binary
  - metric: auc
  - boosting_type: gbdt
  - n_jobs: -1
  - feature_fraction: 0.784
  - bagging_fraction: 0.8367217576063575
  - bagging_freq: 5
  - min_child_samples: 20
  - verbosity: -1
  - seed: 42
  - force_row_wise: True
  - feature_pre_filter: False
  - lambda_l1: 7.537390252655616e-08
  - lambda_l2: 9.62602476927058
  - num_leaves: 6


# Salvando a lista dos melhores parâmetros encontrados

In [7]:
# 8.2. Ou treine um modelo final numa só vez:
final_params = {**base_params, **tuner.best_params}
FINAL_PARAMS_PATH = PROJECT_DIRECTORY + '/models/trained_model/final_params.pickle'

with open(FINAL_PARAMS_PATH,'wb') as f:
    pickle.dump(final_params,f)

# Treinamento Final

In [8]:
FINAL_PARAMS_PATH = PROJECT_DIRECTORY + '/models/trained_model/final_params.pickle'

with open(FINAL_PARAMS_PATH,'rb') as f:
    final_params = pickle.load(f)

In [9]:
final_params

{'objective': 'binary',
 'metric': 'auc',
 'boosting_type': 'gbdt',
 'n_jobs': -1,
 'feature_fraction': 0.784,
 'bagging_fraction': 0.8367217576063575,
 'bagging_freq': 5,
 'min_child_samples': 20,
 'verbosity': -1,
 'seed': 42,
 'force_row_wise': True,
 'feature_pre_filter': False,
 'lambda_l1': 7.537390252655616e-08,
 'lambda_l2': 9.62602476927058,
 'num_leaves': 6}

In [10]:
model_final = LGBMClassifier(**final_params)
model_final.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [11]:
y_prob_train = model_final.predict_proba(X_train)[:,1]
ks_t, auc_t, gini_t = performance_metrics(y_train, y_prob_train)

metrics_train = {'ks':[], 'auc':[], 'gini':[]}

metrics_train['ks'].append(ks_t)
metrics_train['auc'].append(auc_t)
metrics_train['gini'].append(gini_t)
metrics_train = pd.DataFrame(metrics_train).stack().reset_index().drop(columns='level_0')
metrics_train.columns = ['metrica','valor_treino']

In [12]:
y_prob_test = model_final.predict_proba(X_test)[:,1]
ks_t, auc_t, gini_t = performance_metrics(y_test, y_prob_test)

metrics_test = {'ks':[], 'auc':[], 'gini':[]}

metrics_test['ks'].append(ks_t)
metrics_test['auc'].append(auc_t)
metrics_test['gini'].append(gini_t)
metrics_test = pd.DataFrame(metrics_test).stack().reset_index().drop(columns='level_0')
metrics_test.columns = ['metrica','valor_teste']

In [13]:
metricas_finais = metrics_train.merge(metrics_test,how='left',on='metrica')

In [14]:
metricas_finais

Unnamed: 0,metrica,valor_treino,valor_teste
0,ks,52.922649,31.639188
1,auc,0.845314,0.717642
2,gini,69.062743,43.528415


# Salvando Modelo Final

In [15]:
FINAL_MODEL_PATH = PROJECT_DIRECTORY + '/models/trained_model/final_model.pickle'


with open(FINAL_MODEL_PATH,'wb') as f:
    pickle.dump(model_final,f)