In [11]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from scipy.stats import loguniform
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
# Single project root: moving the project to another machine/folder now needs
# one edit here instead of one per file path.
PROJECT_ROOT = Path(r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge')
DATA_DIR = PROJECT_ROOT / "data" / "processed"
MODELS_DIR = PROJECT_ROOT / "models"

# Train and test sets, already feature-engineered (the vintage / "safra" split,
# etc. was done in an earlier step).
df_train_fe = pd.read_csv(DATA_DIR / "train_fe.csv")
df_test_fe = pd.read_csv(DATA_DIR / "test_fe.csv")

# Column names selected in the feature-selection notebook.
selected_features = joblib.load(MODELS_DIR / "selected_features.pkl")

TARGET_COL = "y"

# Fail early, with a readable message, if the saved feature list does not match
# the CSVs (plain column indexing would also raise KeyError, but less clearly).
missing_features = [
    col
    for col in selected_features
    if col not in df_train_fe.columns or col not in df_test_fe.columns
]
if missing_features:
    raise KeyError(f"selected features missing from train/test data: {missing_features}")

# Guard against target leakage: the label must never be used as a predictor.
if TARGET_COL in list(selected_features):
    raise ValueError(f"target column {TARGET_COL!r} is listed in selected_features")

X_train = df_train_fe[selected_features].copy()
y_train = df_train_fe[TARGET_COL].copy()

X_test  = df_test_fe[selected_features].copy()
y_test  = df_test_fe[TARGET_COL].copy()

# Last expression of the cell: displays both design-matrix shapes.
X_train.shape, X_test.shape

((8211, 20), (2527, 20))

# Funções

In [13]:
def ks_score(y_true, y_proba) -> float:
    """Kolmogorov-Smirnov separation between the scores of class 1 and class 0.

    Observations are ranked from highest to lowest score and the cumulative
    share of class-1 ("bad") and class-0 ("good") rows is compared; the KS is
    the largest absolute gap between the two curves.

    The gap is measured only at distinct score values. Rows sharing a score
    cannot be separated by any threshold, so measuring inside a run of ties
    would make the result depend on the arbitrary order of the tied rows and
    overstate the KS (e.g. two rows with the same score and opposite labels
    must give 0.0, not 1.0).

    Parameters
    ----------
    y_true : array-like
        Labels; rows equal to 1 count as "bad", rows equal to 0 as "good".
    y_proba : array-like
        Scores, paired with ``y_true`` by position.

    Returns
    -------
    float
        KS statistic in [0, 1]; 0.0 when either class is absent.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_proba, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and y_proba must have the same length "
            f"({labels.shape[0]} != {scores.shape[0]})"
        )

    is_bad = labels == 1
    is_good = labels == 0
    total_bad = int(is_bad.sum())
    total_good = int(is_good.sum())

    # KS is undefined with a single class; keep the historical 0.0 convention.
    if total_bad == 0 or total_good == 0:
        return 0.0

    # Stable sort, highest score first, so the result is deterministic.
    order = np.argsort(-scores, kind="mergesort")
    sorted_scores = scores[order]

    cum_bad = np.cumsum(is_bad[order]) / total_bad
    cum_good = np.cumsum(is_good[order]) / total_good

    # True on the last row of every run of equal scores (and on the final row):
    # those are the only positions that correspond to a real score threshold.
    last_of_tie = np.append(sorted_scores[1:] != sorted_scores[:-1], True)

    return float(np.abs(cum_bad - cum_good)[last_of_tie].max())


def performance_metrics(y_true, y_proba):
    """Compute the three headline discrimination metrics for a scored sample.

    Returns a tuple ``(ks, auc, gini)`` where ``auc`` comes from
    ``roc_auc_score``, ``ks`` from ``ks_score`` and ``gini = 2 * auc - 1``.
    """
    roc_auc = roc_auc_score(y_true, y_proba)
    # Tuple order is (KS, AUC, Gini) - callers unpack it positionally.
    return ks_score(y_true, y_proba), roc_auc, 2 * roc_auc - 1

# Tuning

In [14]:
# Median imputation -> standardisation -> class-weighted logistic regression.
logreg_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(
                solver="liblinear",  # this solver allows both l1 and l2 penalties
                class_weight="balanced",
                max_iter=2000,
                random_state=42,
            ),
        ),
    ]
)

In [15]:
# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space for the "clf" step of the pipeline.
param_distributions = dict(
    clf__C=loguniform(1e-3, 1e2),  # regularisation strength, sampled on a log scale
    clf__penalty=["l1", "l2"],     # regularisation type
)

# 50 random draws from the space above, each scored by ROC AUC on the folds.
random_search = RandomizedSearchCV(
    logreg_pipe,
    param_distributions,
    n_iter=50,
    scoring="roc_auc",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

print("Melhor AUC (CV):", random_search.best_score_)
print("Melhores parâmetros:")
# Last expression of the cell: displays the winning hyper-parameters.
random_search.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Melhor AUC (CV): 0.785739784407666
Melhores parâmetros:


{'clf__C': np.float64(0.0745934328572655), 'clf__penalty': 'l1'}

# Teste com hiperparâmetros

In [16]:
# The search above was created without overriding `refit`, and RandomizedSearchCV
# refits the winning pipeline on the full training set by default, so
# best_estimator_ is ready to use as-is. Calling .fit(X_train, y_train) again
# here would only repeat that same fit.
best_logreg = random_search.best_estimator_

# Evaluate on the TEST set using the predicted probability of class 1.
y_proba_test = best_logreg.predict_proba(X_test)[:, 1]
ks_t, auc_t, gini_t = performance_metrics(y_test, y_proba_test)

# NOTE(review): compare these numbers with the cross-validated AUC printed by
# the search cell; a large gap may point to drift between the train and test
# samples - TODO confirm how the test set was split.
print("Desempenho no TESTE:")
print(f"  AUC  = {auc_t:.3f}")
print(f"  KS   = {ks_t:.5f}")
print(f"  Gini = {gini_t:.3f}")

Desempenho no TESTE:
  AUC  = 0.701
  KS   = 0.29349
  Gini = 0.401


# Salvar modelo

In [17]:
# Persist the fitted pipeline to disk; as the last expression of the cell, the
# value returned by joblib.dump is displayed.
joblib.dump(
    value=best_logreg,
    filename=r'C:\Users\Enrico\OneDrive\Documentos\Python\credit_scoring_challenge\models\final_model.pkl',
)

['C:\\Users\\Enrico\\OneDrive\\Documentos\\Python\\credit_scoring_challenge\\models\\final_model.pkl']