
# Semana 07 — (LightGBM) Importancia de variables, Calibración y Curvas de Aprendizaje
**Plantilla lista para entregar**: ranking de variables, calibración y curvas de aprendizaje con **LightGBM**.



## 0) Requisitos
```bash
pip install lightgbm scikit-learn pandas numpy matplotlib
# opcional
pip install shap
```


In [None]:

# Notebook configuration: edit these values to point at your own dataset.
from pathlib import Path

# CSV file read by the data-loading cell.
DATA_PATH: Path = Path("/mnt/data/tu_dataset.csv")
# Name of the target column inside the CSV.
TARGET_COL: str = "label"
# Columns removed from the frame before modelling (none by default).
DROP_COLS: list = []
# Seed reused by the train/test split, the classifier and the CV splitter.
RANDOM_STATE: int = 42

print("Datos:", DATA_PATH, "| Target:", TARGET_COL)



## 1) Carga y preprocesamiento


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier

# Load the dataset and fail early if the configured target column is missing.
df = pd.read_csv(DATA_PATH)
if TARGET_COL not in df.columns:
    raise ValueError(f"No encuentro '{TARGET_COL}'.")

# Drop the user-listed columns; names that are not in the CSV are ignored.
if DROP_COLS:
    df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])

# The target is cast to int; every remaining column is treated as a feature.
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL])

# Numeric dtypes go through the numeric branch, everything else is treated
# as categorical.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]
print("num:", len(num_cols), "cat:", len(cat_cols))

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Numeric branch: median imputation, then scaling without centering.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])
# Categorical branch: most-frequent imputation, then one-hot encoding.
# The encoder's `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4. Sparse output is the default under
# both names, so the argument is omitted to stay compatible with old and new
# releases.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

base_clf = LGBMClassifier(
    n_estimators=600, learning_rate=0.05, num_leaves=64,
    subsample=0.8, colsample_bytree=0.8, class_weight="balanced",
    random_state=RANDOM_STATE, n_jobs=-1
)

# Full pipeline: preprocessing ("prep") followed by the classifier ("clf").
pipe = Pipeline([("prep", preprocess), ("clf", base_clf)])
pipe



## 2) Entrenamiento y métricas


In [None]:

from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Fit the whole pipeline on the training split and score the held-out split.
pipe.fit(X_train, y_train)
proba_test = pipe.predict_proba(X_test)[:, 1]
# Hard labels are obtained with a fixed 0.5 probability threshold.
pred_test = (proba_test >= 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_test, proba_test))
print("PR  AUC:", average_precision_score(y_test, proba_test))
print("\n", classification_report(y_test, pred_test, digits=4))

# Confusion matrix drawn as an annotated image (ticks are hard-coded to 0/1).
cm = confusion_matrix(y_test, pred_test)
fig = plt.figure(figsize=(4, 4))
plt.imshow(cm, interpolation="nearest")
plt.title("Matriz de confusión")
plt.colorbar()
plt.xticks([0, 1], [0, 1])
plt.yticks([0, 1], [0, 1])
plt.xlabel("Predicho")
plt.ylabel("Real")
# Write each cell count at its (column, row) position on the image.
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, count, ha="center", va="center")
plt.tight_layout()
plt.show()



## 3) Importancia de variables (modelo + Permutation Importance)


In [None]:

import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance

def get_feature_names(preprocess):
    """Return the output feature names of a fitted preprocessing transformer.

    The names are taken from the "imputer" step of the transformer registered
    as "num" followed by the "onehot" step of the transformer registered as
    "cat", in that order.

    Transformers are looked up by name rather than by position, and a group
    with no columns is skipped: such a group is never fitted, so asking it
    for feature names would raise.

    Parameters
    ----------
    preprocess : fitted column transformer exposing ``transformers_`` and
        ``named_transformers_``.

    Returns
    -------
    numpy.ndarray
        1-D array of feature names (empty when both groups are empty).
    """
    columns_by_name = {name: cols for name, _, cols in preprocess.transformers_}

    pieces = []
    for group, step in (("num", "imputer"), ("cat", "onehot")):
        cols = list(columns_by_name.get(group, []))
        if not cols:
            # Nothing was routed to this branch, so there is nothing to name.
            continue
        fitted_step = preprocess.named_transformers_[group].named_steps[step]
        pieces.append(np.asarray(fitted_step.get_feature_names_out(cols), dtype=object))

    if not pieces:
        return np.array([], dtype=object)
    return np.concatenate(pieces)

# Fit the preprocessing step on the training split so the expanded
# (post-encoding) feature names can be read from it.
preprocess.fit(X_train, y_train)
feature_names = get_feature_names(preprocess)

# Model importance: fit the classifier step on the transformed training data
# and rank the transformed features by LightGBM's own importances.
X_train_t = preprocess.transform(X_train)
model = pipe.named_steps["clf"]
model.fit(X_train_t, y_train)
fi = (
    pd.DataFrame({"feature": feature_names, "importance": model.feature_importances_})
    .sort_values("importance", ascending=False)
)
display(fi.head(30))

# Permutation importance (test).
# The full pipeline is scored, so the columns being permuted are the RAW
# columns of X_test, not the one-hot-expanded ones. The results therefore
# have one entry per input column and must be labelled with X_test.columns;
# labelling them with `feature_names` gives a length mismatch or misaligned
# names.
perm = permutation_importance(
    pipe, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1
)
perm_df = (
    pd.DataFrame({
        "feature": list(X_test.columns),
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std,
    })
    .sort_values("importance_mean", ascending=False)
)
print("\nTop 30 por Permutation Importance (test):")
display(perm_df.head(30))

import matplotlib.pyplot as plt
TOP_N = 20
fig = plt.figure(figsize=(8, max(4, int(TOP_N*0.35))))
# Reverse the top-N slice so the most important feature is drawn at the top.
plt.barh(perm_df["feature"].head(TOP_N)[::-1], perm_df["importance_mean"].head(TOP_N)[::-1])
plt.title(f"Permutation Importance (Top {TOP_N}) — Test")
plt.xlabel("Disminución promedio de la métrica")
plt.tight_layout(); plt.show()


In [None]:

# (Optional) SHAP summary plot for the fitted classifier.
# Best-effort cell: any failure inside the block (including a missing `shap`
# package) is caught and reported with the same message.
try:
    import shap
    import matplotlib.pyplot as plt

    explainer = shap.TreeExplainer(model)
    X_test_t = preprocess.transform(X_test)
    shap_values = explainer.shap_values(X_test_t)
    shap.summary_plot(
        shap_values,
        X_test_t,
        feature_names=feature_names,
        show=False,
    )
    plt.tight_layout()
    plt.show()
except Exception as err:
    print("SHAP no disponible:", err)



## 4) Calibración (Brier, ECE, curvas de confiabilidad)


In [None]:

from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

def expected_calibration_error(y_true, y_prob, n_bins=15):
    """Compute the Expected Calibration Error (ECE) with equal-width bins.

    Probabilities are grouped into ``n_bins`` equal-width bins over [0, 1].
    For each non-empty bin the absolute gap between the mean label and the
    mean predicted probability is weighted by the fraction of samples in
    that bin, and the weighted gaps are summed.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of predicted probabilities, same length as ``y_true``.
    n_bins : int, number of equal-width bins (default 15).

    Returns
    -------
    float
        The ECE; ``0.0`` for empty input.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    total = len(y_true)
    if total == 0:
        return 0.0

    edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        left, right = edges[i], edges[i + 1]
        if i == n_bins - 1:
            # The last bin is closed on the right so that probabilities equal
            # to exactly 1.0 are counted instead of being silently dropped.
            mask = (y_prob >= left) & (y_prob <= right)
        else:
            mask = (y_prob >= left) & (y_prob < right)
        count = mask.sum()
        if count:
            acc = y_true[mask].mean()
            conf = y_prob[mask].mean()
            ece += abs(acc - conf) * (count / total)
    return ece

def plot_reliability(y_true, proba_dict, n_bins=15, title_suffix=""):
    """Draw reliability (calibration) curves for one or more probability sets.

    Parameters
    ----------
    y_true : true labels shared by every curve.
    proba_dict : mapping from legend label to predicted probabilities.
    n_bins : number of uniform bins passed to ``calibration_curve``.
    title_suffix : text appended to the plot title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(5, 5))
    # Dashed diagonal as the reference line.
    plt.plot([0, 1], [0, 1], "--")
    for label, probs in proba_dict.items():
        frac_pos, mean_pred = calibration_curve(
            y_true, probs, n_bins=n_bins, strategy="uniform"
        )
        plt.plot(mean_pred, frac_pos, marker="o", label=label)
    plt.xlabel("Probabilidad promedio por bin")
    plt.ylabel("Fracción de positivos")
    plt.title(f"Curva de confiabilidad {title_suffix}")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Baseline: calibration quality of the uncalibrated pipeline on the test split.
orig_brier = brier_score_loss(y_test, proba_test)
orig_ece = expected_calibration_error(y_test.values, proba_test, n_bins=15)
print(f"Brier (original): {orig_brier:.6f} | ECE (original): {orig_ece:.6f}")

# Wrap the pipeline in isotonic and sigmoid calibrators, each fitted with
# 5-fold cross-validation on the training split.
cal_iso = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
cal_iso.fit(X_train, y_train)
cal_sig = CalibratedClassifierCV(pipe, method="sigmoid", cv=5)
cal_sig.fit(X_train, y_train)

# Positive-class probabilities from each calibrated model.
proba_iso = cal_iso.predict_proba(X_test)[:, 1]
proba_sig = cal_sig.predict_proba(X_test)[:, 1]

iso_brier = brier_score_loss(y_test, proba_iso)
iso_ece = expected_calibration_error(y_test.values, proba_iso, 15)
sig_brier = brier_score_loss(y_test, proba_sig)
sig_ece = expected_calibration_error(y_test.values, proba_sig, 15)
print(f"Brier (isotonic): {iso_brier:.6f} | ECE (isotonic): {iso_ece:.6f}")
print(f"Brier (sigmoid):  {sig_brier:.6f} | ECE (sigmoid):  {sig_ece:.6f}")

# Compare the three probability sets on a single reliability plot.
plot_reliability(
    y_test.values,
    {"Original": proba_test, "Isotonic": proba_iso, "Sigmoid": proba_sig},
    15,
    "— LightGBM",
)



## 5) Curvas de aprendizaje


In [None]:

from sklearn.model_selection import learning_curve, StratifiedKFold
import matplotlib.pyplot as plt

# Learning curves: ROC AUC of the full pipeline at six training-set sizes
# (10% to 100%), evaluated with seeded stratified 5-fold CV.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X, y, cv=cv, scoring="roc_auc", n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 6), shuffle=True,
    # shuffle=True without a seed makes the training subsets (and so the
    # curves) differ between runs; reuse the notebook-wide seed.
    random_state=RANDOM_STATE,
)
# Aggregate across folds (axis 1): mean for the line, std for the band.
train_mean, val_mean = train_scores.mean(axis=1), val_scores.mean(axis=1)
train_std,  val_std  = train_scores.std(axis=1),  val_scores.std(axis=1)

fig = plt.figure(figsize=(6, 4))
# Shaded bands show +/- one standard deviation across folds.
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2)
plt.plot(train_sizes, train_mean, marker="o", label="Entrenamiento (AUC)")
plt.plot(train_sizes, val_mean, marker="o", label="Validación (AUC)")
plt.xlabel("Tamaño de entrenamiento")
plt.ylabel("ROC AUC")
plt.title("Curvas de aprendizaje — LightGBM")
plt.legend()
plt.tight_layout()
plt.show()
