# 1. Imports e configs e constantes 

In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import loguniform

import numpy as np
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer

In [2]:
import sys
from pathlib import Path

# Descobre o diretório raiz do projeto (onde fica a pasta src/)
ROOT = Path.cwd()
if not (ROOT / "src").exists():
    ROOT = ROOT.parent

sys.path.append(str(ROOT))
print("PROJECT ROOT:", ROOT)

from src.utils import (
    ks_score,
    performance_metrics
)

PROJECT ROOT: c:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge


# 2. Ler bases

In [3]:
# Base de treino e teste já com feature engineering
df_train_fe = pd.read_csv(r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge\data\processed\train_fe.csv')
df_test_fe = pd.read_csv(r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge\data\processed\test_fe.csv')

# Carrega as colunas selecionadas
selected_features = joblib.load(r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge\models\selected_features.pkl')


In [4]:

TARGET_COL = "y"
X_train = df_train_fe[selected_features].copy()
y_train = df_train_fe[TARGET_COL].copy()

X_test  = df_test_fe[selected_features].copy()
y_test  = df_test_fe[TARGET_COL].copy()

X_train.shape, X_test.shape

((8211, 20), (2527, 20))

# 3. Tunning

In [5]:
# scorer baseado no seu ks_score
ks_scorer = make_scorer(ks_score, needs_proba=True)

In [6]:
lgbm_base = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=400,       
    n_jobs=-1,
    random_state=42,
)

param_distributions = {
    "num_leaves":        [31, 63, 127],
    "max_depth":         [-1, 4, 6, 8],
    "learning_rate":     [0.01, 0.03, 0.05, 0.1],
    "n_estimators":      [200, 400, 600, 800],
    "min_child_samples": [20, 50, 100],
    "subsample":         [0.7, 0.8, 0.9],    
    "colsample_bytree":  [0.7, 0.8, 0.9],        
    "reg_alpha":         [0.0, 0.1, 0.5],
    "reg_lambda":        [0.0, 0.1, 0.5],
}

In [7]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search_lgbm = RandomizedSearchCV(
    estimator=lgbm_base,
    param_distributions=param_distributions,
    n_iter=40,                
    scoring=ks_scorer,         # otimizar KS
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

random_search_lgbm.fit(X_train, y_train)

print("Melhor KS (CV):", random_search_lgbm.best_score_)
print("Melhores parâmetros LightGBM:")
random_search_lgbm.best_params_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan]


[LightGBM] [Info] Number of positive: 2316, number of negative: 5895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3631
[LightGBM] [Info] Number of data points in the train set: 8211, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282061 -> initscore=-0.934263
[LightGBM] [Info] Start training from score -0.934263
Melhor KS (CV): nan
Melhores parâmetros LightGBM:


{'subsample': 0.7,
 'reg_lambda': 0.0,
 'reg_alpha': 0.0,
 'num_leaves': 31,
 'n_estimators': 800,
 'min_child_samples': 20,
 'max_depth': -1,
 'learning_rate': 0.01,
 'colsample_bytree': 0.8}

# 4. Teste com hiperparâmetros

In [8]:
best_lgbm = random_search_lgbm.best_estimator_

# Treinar de novo
best_lgbm.fit(X_train, y_train)

# Probabilidades no teste
y_proba_test_lgbm = best_lgbm.predict_proba(X_test)[:, 1]

ks_t, auc_t, gini_t = performance_metrics(y_test, y_proba_test_lgbm)

print(f"Desempenho LightGBM tunado - TESTE:")
print(f"  KS   = {ks_t:.6f}")
print(f"  AUC  = {auc_t:.6f}")
print(f"  Gini = {gini_t:.6f}")

[LightGBM] [Info] Number of positive: 2316, number of negative: 5895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3631
[LightGBM] [Info] Number of data points in the train set: 8211, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282061 -> initscore=-0.934263
[LightGBM] [Info] Start training from score -0.934263
Desempenho LightGBM tunado - TESTE:
  KS   = 0.313688
  AUC  = 0.708333
  Gini = 0.416665


# 5. Salvar modelo

In [9]:
joblib.dump(best_lgbm, r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge\models\final_model.pkl')

['C:\\Users\\Enrico\\OneDrive\\Documentos\\Python\\credit_scoring_challenge\\models\\final_model.pkl']