In [7]:
import joblib
import sklearn

print('scikit-learn version: {}.'.format(sklearn.__version__))

bundle = joblib.load("../datasets/artifacts/ico_preproc_bundle_v2.pk1")

X_train_prep = bundle.X_train_prep   # numpy array listo para modelar
X_test_prep  = bundle.X_test_prep
y_train      = bundle.y_train
y_test       = bundle.y_test

feat_names   = bundle.feature_names   # nombres alineados a X_*_prep
preproc      = bundle.preprocessor    # ColumnTransformer ya fit
cat_cols     = bundle.categorical_cols
num_cols     = bundle.numeric_cols
bin_cols     = bundle.binary_cols

# Reconstituir preprocesador con las mismas listas
def build_preprocessor(cats, nums, bins):
    cat_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),  # usa sparse=False si tu sklearn es <1.2
    ])
    num_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    bin_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value=0)),
    ])
    transformers = []
    if nums: transformers.append(("num", num_pipe, nums))
    if bins: transformers.append(("bin", bin_pipe, bins))
    if cats: transformers.append(("cat", cat_pipe, cats))
    return ColumnTransformer(transformers, remainder="drop")

print("Bundle cargado. Todo listo para entrenar los modelos avanzados")



scikit-learn version: 1.7.2.
Bundle cargado. Todo listo para entrenar los modelos avanzados


### Random Forest
###### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [8]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

pre = build_preprocessor(bundle.categorical_cols, bundle.numeric_cols, bundle.binary_cols)

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1, 
    verbose=1
)

pipe_rf = Pipeline([
    ("pre", pre),
    ("clf", rf)
])

param_grid_rf = {
    "clf__n_estimators": [300, 600],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_leaf": [1, 3, 5],
    "clf__max_features": ["sqrt", 0.5],
}

gs_rf = GridSearchCV(
    pipe_rf, param_grid_rf, scoring="roc_auc", cv=5, n_jobs=-1, verbose=3
)
gs_rf.fit(bundle.X_train, bundle.y_train)

y_pred = gs_rf.predict(bundle.X_test)
y_proba = gs_rf.predict_proba(bundle.X_test)[:,1]

print("=== RandomForest (balanced_subsample) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}  |  ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}  |  PR-AUC: {average_precision_score(y_test, y_proba):.3f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nBest params LR:", gs_rf.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 22 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed:    0.2s


=== RandomForest (balanced_subsample) ===
Accuracy: 0.764  |  ROC-AUC: 0.828  |  PR-AUC: 0.713

Classification Report:
              precision    recall  f1-score   support

           0      0.779     0.892     0.832       751
           1      0.720     0.523     0.606       398

    accuracy                          0.764      1149
   macro avg      0.749     0.707     0.719      1149
weighted avg      0.759     0.764     0.753      1149

Confusion Matrix:
[[670  81]
 [190 208]]

Best params LR: {'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 1, 'clf__n_estimators': 300}
CPU times: total: 2.25 s
Wall time: 40.1 s


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=22)]: Using backend ThreadingBackend with 22 concurrent workers.
[Parallel(n_jobs=22)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=22)]: Done 156 tasks      | elapsed:    0.0s
[Parallel(n_jobs=22)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=22)]: Using backend ThreadingBackend with 22 concurrent workers.
[Parallel(n_jobs=22)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=22)]: Done 156 tasks      | elapsed:    0.0s
[Parallel(n_jobs=22)]: Done 300 out of 300 | elapsed:    0.0s finished


### XGBoost

In [9]:
%%time

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, average_precision_score
import numpy as np

pre = build_preprocessor(bundle.categorical_cols, bundle.numeric_cols, bundle.binary_cols)

# Desbalance (neg/pos)
pos = (bundle.y_train == 1).sum()
neg = (bundle.y_train == 0).sum()
spw = float(neg) / float(pos) if pos > 0 else 1.0

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    n_jobs=-1,
    scale_pos_weight=spw,
    random_state=42,
    #device="cuda",
    verbosity=1
)

pipe_xgb = Pipeline([
    ("pre", pre),
    ("clf", xgb)
])

param_grid_xgb = {
    "clf__n_estimators": [300, 600],
    "clf__max_depth": [4, 6, 8],
    "clf__learning_rate": [0.03, 0.05, 0.1],
    "clf__subsample": [0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.9, 1.0],
    "clf__reg_lambda": [0.0, 1.0],
    "clf__reg_alpha": [0.0, 0.5],
}

gs_xgb = GridSearchCV(
    pipe_xgb, param_grid_xgb, scoring="roc_auc", cv=5, n_jobs=-1, verbose=1
)
gs_xgb.fit(bundle.X_train, bundle.y_train)

y_pred  = gs_xgb.predict(bundle.X_test)
y_proba = gs_xgb.predict_proba(bundle.X_test)[:, 1]

print("=== XGBoost (GridSearchCV) ===")
print(f"Accuracy: {accuracy_score(bundle.y_test, y_pred):.3f}  |  ROC-AUC: {roc_auc_score(bundle.y_test, y_proba):.3f}  |  PR-AUC: {average_precision_score(bundle.y_test, y_proba):.3f}\n")
print("Classification Report:")
print(classification_report(bundle.y_test, y_pred, digits=3))
print("Confusion Matrix:")
print(confusion_matrix(bundle.y_test, y_pred))
print("\nBest params XGB:", gs_xgb.best_params_)


Fitting 5 folds for each of 432 candidates, totalling 2160 fits
=== XGBoost (GridSearchCV) ===
Accuracy: 0.766  |  ROC-AUC: 0.819  |  PR-AUC: 0.720

Classification Report:
              precision    recall  f1-score   support

           0      0.817     0.827     0.822       751
           1      0.666     0.651     0.658       398

    accuracy                          0.766      1149
   macro avg      0.741     0.739     0.740      1149
weighted avg      0.765     0.766     0.765      1149

Confusion Matrix:
[[621 130]
 [139 259]]

Best params XGB: {'clf__colsample_bytree': 0.6, 'clf__learning_rate': 0.03, 'clf__max_depth': 8, 'clf__n_estimators': 300, 'clf__reg_alpha': 0.0, 'clf__reg_lambda': 0.0, 'clf__subsample': 0.8}
CPU times: total: 14.3 s
Wall time: 1min 27s


### LightBGM

In [None]:
%%time

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, average_precision_score

pre = build_preprocessor(bundle.categorical_cols, bundle.numeric_cols, bundle.binary_cols)

pos = (bundle.y_train == 1).sum()
neg = (bundle.y_train == 0).sum()
spw = float(neg) / float(pos) if pos > 0 else 1.0

lgbm = LGBMClassifier(
    n_estimators=200,
    max_depth=-1,               # -1 = sin límite
    num_leaves=31,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary",
    class_weight=None,          # usamos scale_pos_weight
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=spw,
    #device="gpu",
    verbose=1
)

pipe_lgbm = Pipeline([
    ("pre", pre),
    ("clf", lgbm)
])

param_grid_lgbm = {
    "clf__n_estimators": [200, 400, 600],
    "clf__num_leaves": [31, 63],
    "clf__learning_rate": [0.03, 0.05, 0.1],
    "clf__subsample": [0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.9, 1.0],
    "clf__reg_lambda": [0.0, 1.0],
    "clf__reg_alpha": [0.0, 0.5],
    "clf__max_depth": [-1, 10, 20],
}

gs_lgbm = GridSearchCV(
    pipe_lgbm, param_grid_lgbm, scoring="roc_auc", cv=5, n_jobs=-1, verbose=1
)
gs_lgbm.fit(bundle.X_train, bundle.y_train)

y_pred  = gs_lgbm.predict(bundle.X_test)
y_proba = gs_lgbm.predict_proba(bundle.X_test)[:, 1]

print("=== LightGBM (GridSearchCV) ===")
print(f"Accuracy: {accuracy_score(bundle.y_test, y_pred):.3f}  |  ROC-AUC: {roc_auc_score(bundle.y_test, y_proba):.3f}  |  PR-AUC: {average_precision_score(bundle.y_test, y_proba):.3f}\n")
print("Classification Report:")
print(classification_report(bundle.y_test, y_pred, digits=3))
print("Confusion Matrix:")
print(confusion_matrix(bundle.y_test, y_pred))
print("\nBest params LGBM:", gs_lgbm.best_params_)


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


### CatBoost

In [6]:
%%time

from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, average_precision_score

pre = build_preprocessor(bundle.categorical_cols, bundle.numeric_cols, bundle.binary_cols)

pos = (bundle.y_train == 1).sum()
neg = (bundle.y_train == 0).sum()
spw = float(neg) / float(pos) if pos > 0 else 1.0

catb = CatBoostClassifier(
    iterations=600,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=3.0,
    subsample=0.9,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=None,       # usamos scale_pos_weight
    scale_pos_weight=spw,
    random_seed=42,
    verbose=False,            # silenciado para GridSearchCV
    task_type="GPU",
)

pipe_catb = Pipeline([
    ("pre", pre),
    ("clf", catb)
])

param_grid_catb = {
    "clf__iterations": [400, 600, 800],
    "clf__depth": [4, 6, 8],
    "clf__learning_rate": [0.03, 0.05, 0.1],
    "clf__l2_leaf_reg": [1.0, 3.0, 5.0],
    "clf__subsample": [0.8, 1.0],
}

gs_catb = GridSearchCV(
    pipe_catb, param_grid_catb, scoring="roc_auc", cv=5, n_jobs=-1, verbose=2
)
gs_catb.fit(bundle.X_train, bundle.y_train)

y_pred  = gs_catb.predict(bundle.X_test)
# CatBoost devuelve 2D para predict_proba
y_proba = gs_catb.predict_proba(bundle.X_test)[:, 1]

print("=== CatBoost (GridSearchCV) ===")
print(f"Accuracy: {accuracy_score(bundle.y_test, y_pred):.3f}  |  ROC-AUC: {roc_auc_score(bundle.y_test, y_proba):.3f}  |  PR-AUC: {average_precision_score(bundle.y_test, y_proba):.3f}\n")
print("Classification Report:")
print(classification_report(bundle.y_test, y_pred, digits=3))
print("Confusion Matrix:")
print(confusion_matrix(bundle.y_test, y_pred))
print("\nBest params CAT:", gs_catb.best_params_)


Fitting 5 folds for each of 162 candidates, totalling 810 fits


ValueError: 
All the 810 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dmigl\anaconda3\Lib\site-packages\catboost\core.py", line 2321, in _prepare_train_params
    _check_train_params(params)
  File "_catboost.pyx", line 6601, in _catboost._check_train_params
  File "_catboost.pyx", line 6623, in _catboost._check_train_params
_catboost.CatBoostError: catboost/private/libs/options/catboost_options.cpp:794: Error: default bootstrap type (bayesian) doesn't support 'subsample' option


Tabla comparativa de modelos

In [None]:
%%time
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, roc_auc_score, average_precision_score,
    classification_report, confusion_matrix, roc_curve, precision_recall_curve
)

# Recolectar los modelos entrenados disponibles automáticamente
candidatos = {
    "Logistic": "gs_lr",
    "RandomForest": "gs_rf",
    "XGBoost": "gs_xgb",
    "LightGBM": "gs_lgbm",
    "CatBoost": "gs_catb",
}
modelos = {}
for name, var in candidatos.items():
    if var in globals() and eval(var) is not None:
        modelos[name] = eval(var)

if not modelos:
    raise RuntimeError("No encontré objetos GridSearchCV (p.ej. gs_lr, gs_rf, gs_xgb...). Ejecútalos antes o cárgalos.")

# Helper para métricas
def eval_model(nombre, gs, X, y):
    est = gs.best_estimator_
    y_pred  = est.predict(X)
    # algunos estimadores no tienen predict_proba para ciertas configuraciones:
    if hasattr(est, "predict_proba"):
        y_proba = est.predict_proba(X)[:, 1]
    elif hasattr(est, "decision_function"):
        # normalizamos a [0,1] aprox para métricas PR/ROC si no hay proba
        dec = est.decision_function(X)
        dec_min, dec_max = dec.min(), dec.max()
        y_proba = (dec - dec_min) / (dec_max - dec_min + 1e-9)
    else:
        # fallback: usar predicción dura como probabilidad (menos informativa)
        y_proba = y_pred.astype(float)

    return {
        "model": nombre,
        "accuracy": accuracy_score(y, y_pred),
        "roc_auc": roc_auc_score(y, y_proba),
        "pr_auc": average_precision_score(y, y_proba),
        "y_pred": y_pred,
        "y_proba": y_proba,
        "best_params": gs.best_params_,
    }

rows = []
per_model_outputs = {}
for name, gs in modelos.items():
    res = eval_model(name, gs, bundle.X_test, bundle.y_test)
    rows.append({k: res[k] for k in ["model","accuracy","roc_auc","pr_auc"]})
    per_model_outputs[name] = res  # guardamos para las curvas

df_cmp = pd.DataFrame(rows).sort_values("roc_auc", ascending=False).reset_index(drop=True)
display(df_cmp.style.format({"accuracy": "{:.3f}", "roc_auc": "{:.3f}", "pr_auc": "{:.3f}"}))

# (Opcional) Mostrar mejores hiperparámetros
for name, gs in modelos.items():
    print(f"[{name}] best params:", gs.best_params_)


Gráficas comparativas (barras + ROC + PR)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve

# --- Barras comparativas de métricas ---
plt.figure(figsize=(8, 4))
plt.bar(df_cmp["model"], df_cmp["roc_auc"], alpha=0.8)
plt.title("Comparativa ROC-AUC")
plt.ylabel("ROC-AUC")
plt.ylim(0, 1)
plt.grid(axis="y", alpha=0.2)
plt.show()

plt.figure(figsize=(8, 4))
plt.bar(df_cmp["model"], df_cmp["pr_auc"], alpha=0.8)
plt.title("Comparativa PR-AUC")
plt.ylabel("PR-AUC")
plt.ylim(0, 1)
plt.grid(axis="y", alpha=0.2)
plt.show()

# --- Curvas ROC comparativas ---
plt.figure(figsize=(7, 6))
for name, res in per_model_outputs.items():
    fpr, tpr, _ = roc_curve(bundle.y_test, res["y_proba"])
    plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc_score(bundle.y_test, res['y_proba']):.3f})")
plt.plot([0,1],[0,1],"--", alpha=0.5)
plt.title("Curvas ROC comparativas")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid(alpha=0.2)
plt.show()

# --- Curvas Precision-Recall comparativas ---
plt.figure(figsize=(7, 6))
for name, res in per_model_outputs.items():
    prec, rec, _ = precision_recall_curve(bundle.y_test, res["y_proba"])
    ap = average_precision_score(bundle.y_test, res["y_proba"])
    plt.plot(rec, prec, label=f"{name} (AP={ap:.3f})")
plt.title("Curvas Precision-Recall comparativas")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid(alpha=0.2)
plt.show()


Importancias de características para modelos de árboles

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importances(gs, topn=20, title="Importancias"):
    # Paso 1: obtener el clasificador y las importancias
    est = gs.best_estimator_.named_steps.get("clf", None)
    if est is None or not hasattr(est, "feature_importances_"):
        print("Este modelo no expone feature_importances_.")
        return

    importances = est.feature_importances_

    # Paso 2: obtener nombres de features tras el preprocesamiento
    pre = gs.best_estimator_.named_steps.get("pre", None)
    if pre is not None and hasattr(pre, "get_feature_names_out"):
        feat_names = pre.get_feature_names_out()
    else:
        # fallback por si acaso
        feat_names = np.array([f"f{i}" for i in range(len(importances))])

    # Paso 3: armar DF ordenado y plot
    df_imp = pd.DataFrame({"feature": feat_names, "importance": importances})
    df_imp = df_imp.sort_values("importance", ascending=False).head(topn)
    plt.figure(figsize=(8, max(4, topn*0.35)))
    plt.barh(df_imp["feature"][::-1], df_imp["importance"][::-1])
    plt.title(title)
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.show()

# Intentar para cada uno de los árboles presentes
for name in ["RandomForest", "XGBoost", "LightGBM", "CatBoost"]:
    if name in modelos:
        plot_feature_importances(modelos[name], topn=20, title=f"Importancias - {name}")
