In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    roc_auc_score, roc_curve,
    confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier


In [None]:
# Path to the raw CSV, resolved relative to the notebook's working directory.
DATA_PATH = os.path.join("..", "data", "Loan_default.csv")

# Load the whole file; `df` is the raw frame that the exploration cells read from.
df = pd.read_csv(DATA_PATH)

# Sanity check: dimensions first, then a peek at the first rows.
raw_shape = df.shape
print("Shape:", raw_shape)
display(df.head())


In [None]:
# Number of columns per dtype.
dtype_counts = df.dtypes.value_counts()
display(dtype_counts)

# Fraction of missing values per column, the 20 worst columns first.
missing_share = df.isna().mean()
display(missing_share.sort_values(ascending=False).head(20))


In [None]:

# Fail fast if the target column is missing from the loaded frame.
assert "Default" in df.columns, "No encuentro la columna 'Default' en el dataset."

# Separate the target from the features (every other column stays in X).
X = df.drop(columns=["Default"])
y = df["Default"]

print("X shape:", X.shape)
print("y shape:", y.shape)

# Class counts; missing labels, if any, get their own row.
target_counts = y.value_counts(dropna=False)
display(target_counts)


In [None]:
# Absolute count and percentage share of each target value (NaN kept as its own row).
counts = y.value_counts(dropna=False)
props = 100 * y.value_counts(normalize=True, dropna=False)

print("Counts:")
display(counts)

print("\nProportions (%):")
display(props)

# Bar chart of the percentage share, ordered by class label rather than by frequency.
props_by_label = props.sort_index()
props_by_label.plot.bar()
plt.title("Default distribution (0/1)")
plt.ylabel("Percentage (%)")
plt.xticks(rotation=0)
plt.show()


In [None]:
# Summary statistics of Income, one row per value of the target.
income_by_default = df.groupby("Default")["Income"]
display(income_by_default.describe())



In [None]:
# Credit score: summary statistics, one row per value of the target.
credit_score_by_default = df.groupby("Default")["CreditScore"]
display(credit_score_by_default.describe())


In [None]:
# DTIRatio: summary statistics, one row per value of the target.
dti_by_default = df.groupby("Default")["DTIRatio"]
display(dti_by_default.describe())

In [None]:
# EmploymentType: mean of the target within each employment type, highest first.
target_by_employment = df.groupby("EmploymentType")["Default"]
default_rate_emp = target_by_employment.mean().sort_values(ascending=False)

display(default_rate_emp)

# Same numbers as a bar chart; labels are tilted so long category names stay readable.
default_rate_emp.plot.bar()
plt.title("Default rate by EmploymentType")
plt.ylabel("Default rate")
plt.xticks(rotation=45, ha="right")
plt.show()



In [None]:
# Re-derive features and target from the raw frame so the modelling section
# starts from a known state regardless of what the exploration cells did.
X = df.drop(columns=["Default"])
y = df["Default"]

# Displayed as the cell output: (feature matrix shape, target shape).
(X.shape, y.shape)


In [None]:
# Columns with a numeric dtype are passed through as numbers ...
num_cols = list(X.select_dtypes(include=[np.number]).columns)

# ... and every remaining column is treated as categorical, in its original order.
numeric_lookup = set(num_cols)
cat_cols = [col for col in X.columns if col not in numeric_lookup]

print("Numeric columns:", num_cols)
print("\nCategorical columns:", cat_cols)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target so that both parts keep the
# same class balance; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# With stratification the two means printed below should be nearly equal.
train_rate = y_train.mean()
test_rate = y_test.mean()
print("\nDefault rate train:", train_rate)
print("Default rate test :", test_rate)


In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse

# Numeric block: cast to float32 and wrap as CSR so it can be stacked with the one-hot block.
train_numeric = X_train[num_cols].to_numpy(dtype=np.float32)
test_numeric = X_test[num_cols].to_numpy(dtype=np.float32)
X_train_num = sparse.csr_matrix(train_numeric)
X_test_num = sparse.csr_matrix(test_numeric)

# Categorical block: the encoder is fitted on the training rows only and then
# applied unchanged to the test rows.
# NOTE(review): every non-numeric column is encoded, so an identifier-like
# column (if the dataset has one) would add one indicator per distinct value — confirm.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category unseen
# during fit is presumably encoded as all zeros, i.e. the same as the dropped
# first category — verify against the scikit-learn docs.
ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=True)  # use sparse=True if needed
ohe.fit(X_train[cat_cols])

X_train_cat = ohe.transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

# Final design matrices: numeric columns first, one-hot columns after.
X_train_final = sparse.hstack((X_train_num, X_train_cat), format="csr")
X_test_final = sparse.hstack((X_test_num, X_test_cat), format="csr")

(X_train_final.shape, X_test_final.shape)


In [None]:
from sklearn.preprocessing import StandardScaler

# Scale every column by its standard deviation without centring
# (with_mean=False), since the input is a sparse matrix.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train_final)  # statistics come from the training rows only

X_train_scaled = scaler.transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

(X_train_scaled.shape, X_test_scaled.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix, classification_report

# Logistic-regression baseline on the scaled sparse matrix.
# class_weight="balanced" reweights the classes inversely to their frequency;
# max_iter is raised well above the default so the solver has room to converge.
logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    n_jobs=None
)

logreg.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the score of the second class in logreg.classes_
# (assumed to be the positive "default" class, label 1 — TODO confirm).
proba_lr = logreg.predict_proba(X_test_scaled)[:, 1]
auc_lr = roc_auc_score(y_test, proba_lr)

# A bare expression in the middle of a cell is never rendered by Jupyter, so
# the AUC and the confusion matrix are emitted explicitly.
print("ROC AUC (LogReg):", round(auc_lr, 4))

# Hard predictions at the conventional 0.5 cut-off.
pred_lr = (proba_lr >= 0.5).astype(int)

cm_lr = confusion_matrix(y_test, pred_lr)
display(cm_lr)

print(classification_report(y_test, pred_lr, digits=4))



In [None]:
from sklearn.metrics import roc_auc_score

# Test-set ROC AUC of the logistic regression, stored under the name that the
# plotting cells read.
roc_auc_lr = roc_auc_score(y_true=y_test, y_score=proba_lr)
roc_auc_lr


In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC curve of the logistic-regression baseline on the held-out test set.
fpr_lr, tpr_lr, _ = roc_curve(y_test, proba_lr)

curve_label_lr = f"LogReg (AUC = {roc_auc_lr:.3f})"
plt.plot(fpr_lr, tpr_lr, label=curve_label_lr)

# Diagonal = performance of a random scorer, for reference.
chance_line = [0, 1]
plt.plot(chance_line, chance_line, linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.legend()
plt.show()


In [None]:
from sklearn.decomposition import TruncatedSVD

# Dense, low-dimensional projection of the sparse design matrix, used as the
# input of the gradient-boosting model.
# NOTE(review): the SVD is fitted on the unscaled matrix (X_train_final), not
# on X_train_scaled — confirm this is intended.
SVD_COMPONENTS = 200

# TruncatedSVD (default randomized solver) rejects n_components larger than the
# number of input columns, so the target dimensionality is capped at the width
# of the encoded matrix. When the matrix is wider than 200 columns this is
# exactly the previous behaviour.
n_svd_components = min(SVD_COMPONENTS, X_train_final.shape[1])

svd = TruncatedSVD(n_components=n_svd_components, random_state=42)

# Fit on the training rows only; the test rows are projected with the same components.
X_train_svd = svd.fit_transform(X_train_final)
X_test_svd  = svd.transform(X_test_final)

X_train_svd.shape, X_test_svd.shape


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Histogram-based gradient boosting trained on the SVD projection.
hgb = HistGradientBoostingClassifier(learning_rate=0.1, random_state=42)
hgb.fit(X_train_svd, y_train)

# Second column of predict_proba = score of the second class in hgb.classes_.
test_scores_hgb = hgb.predict_proba(X_test_svd)
proba_hgb = test_scores_hgb[:, 1]

roc_auc_hgb = roc_auc_score(y_test, proba_hgb)
roc_auc_hgb


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Hard 0/1 predictions of the boosting model at the 0.5 cut-off.
above_cutoff_hgb = proba_hgb >= 0.5
pred_hgb = above_cutoff_hgb.astype(int)

# Displayed as the cell output: confusion matrix on the test set.
confusion_matrix(y_test, pred_hgb)


In [None]:
# Per-class precision / recall / F1 of the boosting model at the 0.5 cut-off.
report_hgb = classification_report(y_test, pred_hgb, digits=4)
print(report_hgb)


In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC points of both models on the same test set.
fpr_lr, tpr_lr, _ = roc_curve(y_test, proba_lr)
fpr_hgb, tpr_hgb, _ = roc_curve(y_test, proba_hgb)

# One (fpr, tpr, legend label) triple per model, drawn in order.
roc_series = [
    (fpr_lr, tpr_lr, f"LogReg (AUC={roc_auc_lr:.3f})"),
    (fpr_hgb, tpr_hgb, f"HistGB+SVD (AUC={roc_auc_hgb:.3f})"),
]
for curve_fpr, curve_tpr, curve_label in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()


In [None]:
#threshold analysis
from sklearn.metrics import confusion_matrix

def eval_threshold(y_true, proba, thr):
    """Confusion counts and positive-class precision/recall at one cut-off.

    Parameters
    ----------
    y_true : array-like
        True labels; 1 marks the positive class, anything else counts as negative.
    proba : array-like
        Scores for the positive class, aligned with ``y_true``.
    thr : float
        A row is predicted positive when its score is ``>= thr``.

    Returns
    -------
    dict
        ``thr``, the four counts ``tp``/``fn``/``fp``/``tn``, and
        ``recall_1`` / ``precision_1`` (0 when their denominator is 0).
    """
    actual_pos = np.asarray(y_true) == 1
    predicted_pos = (np.asarray(proba) >= thr).astype(int) == 1

    # Counting the four cells directly always yields all of them, including
    # when only one class is present in both the labels and the predictions
    # (where a label-inferred confusion matrix collapses to a single cell).
    tp = int(np.sum(actual_pos & predicted_pos))
    fn = int(np.sum(actual_pos & ~predicted_pos))
    fp = int(np.sum(~actual_pos & predicted_pos))
    tn = int(np.sum(~actual_pos & ~predicted_pos))

    return {
        "thr": thr,
        "tp": tp, "fn": fn, "fp": fp, "tn": tn,
        "recall_1": tp / (tp + fn) if (tp + fn) else 0,
        "precision_1": tp / (tp + fp) if (tp + fp) else 0
    }

# Evaluate the boosting model at a handful of cut-offs; one table row per threshold.
candidate_thresholds = [0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50]
threshold_rows = [
    eval_threshold(y_test.values, proba_hgb, cutoff)
    for cutoff in candidate_thresholds
]
pd.DataFrame(threshold_rows)


In [None]:
import time, numpy as np

# Rough cost check: time how long it takes to densify the first 2000 training
# rows as float32.
t0 = time.time()
_ = X_train_final[:2000].astype(np.float32).toarray()
elapsed_dense = time.time() - t0
print("dense slice seconds:", round(elapsed_dense, 2))


In [None]:
import numpy as np
import time
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# --- Feature reduction -------------------------------------------------------
# The EBM needs a dense matrix, so the sparse design matrix is first narrowed
# to the columns that are populated often enough to be learnable.
# NOTE(review): this reduction rule (keep columns that are non-zero in at least
# MIN_NONZERO_ROWS training rows) is an assumption — confirm the intended
# criterion and threshold.
MIN_NONZERO_ROWS = 50

# Column names in the same order as the stacked matrix: numeric columns first,
# then the one-hot columns produced by the fitted encoder.
feature_names = list(num_cols) + list(ohe.get_feature_names_out(cat_cols))
assert len(feature_names) == X_train_final.shape[1], "feature names out of sync with the design matrix"

# Stored entries per column, read from the CSC index pointer.
nonzero_rows_per_col = np.diff(X_train_final.tocsc().indptr)
keep_cols = np.flatnonzero(nonzero_rows_per_col >= MIN_NONZERO_ROWS)
assert len(keep_cols) > 0, "feature reduction removed every column"

# The selection is derived from the training rows only and applied to both parts.
X_train_red = X_train_final[:, keep_cols]
X_test_red = X_test_final[:, keep_cols]

# --- Dense inputs for the EBM -------------------------------------------------
X_train_ebm = X_train_red.astype(np.float32).toarray()
X_test_ebm  = X_test_red.astype(np.float32).toarray()

print("Reduced shapes:", X_train_ebm.shape, X_test_ebm.shape, flush=True)

# Main-effects-only model (interactions=0) with a single outer bag and no inner
# bagging; the remaining arguments set binning, step size and round budget.
ebm = ExplainableBoostingClassifier(
    random_state=42,
    interactions=0,
    max_bins=64,
    learning_rate=0.05,
    max_rounds=500,
    outer_bags=1,
    inner_bags=0,
    n_jobs=-1
)

t0 = time.time()
ebm.fit(X_train_ebm, y_train)
print("fit seconds:", round(time.time()-t0, 2), flush=True)

# Second column of predict_proba = score of the second class.
proba_ebm = ebm.predict_proba(X_test_ebm)[:, 1]
roc_auc_ebm = roc_auc_score(y_test, proba_ebm)
print("ROC AUC:", round(roc_auc_ebm, 4), flush=True)

fpr_ebm, tpr_ebm, _ = roc_curve(y_test, proba_ebm)

# All three models on one ROC chart; the LogReg / HistGB curves come from the
# earlier comparison cell.
plt.plot(fpr_lr, tpr_lr, label=f"LogReg (AUC={roc_auc_lr:.3f})")
plt.plot(fpr_hgb, tpr_hgb, label=f"HistGB+SVD (AUC={roc_auc_hgb:.3f})")
plt.plot(fpr_ebm, tpr_ebm, label=f"EBM (AUC={roc_auc_ebm:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Names of the columns in the reduced matrix, aligned with X_test_ebm.
# `feature_names` (names of all encoded columns) and `keep_cols` (indices of the
# columns kept in the reduced matrix) must already exist in the kernel —
# NOTE(review): confirm they are produced by the feature-reduction step.
feature_names_red = [feature_names[k] for k in keep_cols]
n_features = len(feature_names_red)

TOP_N_DRIVERS = 10       # rows shown in the table / bars in the chart
MAX_CANDIDATES = 2000    # how many of the riskiest test rows to scan


def _top_contributions(row_idx, top_n=TOP_N_DRIVERS):
    """Return the `top_n` largest-magnitude EBM contributions for one test row.

    Parameters
    ----------
    row_idx : int
        Positional index into X_test_ebm / y_test.
    top_n : int
        Number of features to keep, ranked by absolute contribution.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``feature``, ``contribution``, ``abs_contribution``; ``None``
        when the explanation does not line up with ``feature_names_red``.
    """
    explanation = ebm.explain_local(
        X_test_ebm[row_idx:row_idx + 1],
        y_test.iloc[row_idx:row_idx + 1],
    )
    contrib = np.array(explanation.data(0)["scores"], dtype=float)

    # Tolerate one trailing extra score (presumably an intercept-like term —
    # TODO confirm against the interpret docs); any other mismatch is unusable.
    if len(contrib) == n_features + 1:
        contrib = contrib[:-1]
    if len(contrib) != n_features:
        return None

    # A dedicated local name is used so the notebook-level `df` (the raw
    # dataset) is not overwritten by this table.
    contrib_df = pd.DataFrame({
        "feature": feature_names_red,
        "contribution": contrib,
    })
    contrib_df["abs_contribution"] = contrib_df["contribution"].abs()
    return (
        contrib_df
        .sort_values("abs_contribution", ascending=False)
        .head(top_n)
        .copy()
    )


# select a client with mixed contributions: walk the test rows from highest to
# lowest predicted probability and stop at the first one whose top drivers
# include both risk-increasing and risk-decreasing features.
order = np.argsort(-proba_ebm)

chosen_i = None
chosen_top = None

for i in order[:MAX_CANDIDATES]:
    top = _top_contributions(int(i))
    if top is None:
        continue

    signed = top["contribution"]
    if (signed > 0).any() and (signed < 0).any():
        chosen_i = int(i)
        chosen_top = top
        break

# fallback: no mixed-sign client found, use the single riskiest one
if chosen_top is None:
    chosen_i = int(order[0])
    chosen_top = _top_contributions(chosen_i)
    if chosen_top is None:
        raise ValueError(
            "EBM explanation length does not match the number of reduced features"
        )

chosen_pd = float(proba_ebm[chosen_i])

# relative impact: share of each driver in the summed absolute contribution of the top drivers
total_abs = chosen_top["abs_contribution"].sum()
chosen_top["relative_impact_pct"] = 100 * chosen_top["abs_contribution"] / total_abs

# clean table
chosen_top = chosen_top[[
    "feature",
    "contribution",
    "relative_impact_pct"
]]

print(f"Example client (test index): {chosen_i}")
print(f"Estimated PD: {chosen_pd:.2%}")

display(chosen_top)

# plot: horizontal bars, most negative contribution at the bottom
plot_df = chosen_top.sort_values("contribution")

# Red = pushes the score towards default, green = pushes it away.
colors = ["#DC2626" if v > 0 else "#16A34A" for v in plot_df["contribution"]]

# The styling is scoped to this figure with rc_context so it does not leak into
# other plots in the same kernel.
chart_style = {
    "figure.facecolor": "#F3F4F6",
    "axes.facecolor": "white",
    "axes.edgecolor": "#E5E7EB",
    "font.family": "sans-serif"
}

with plt.rc_context(chart_style):
    fig, ax = plt.subplots(figsize=(11, 6))

    ax.barh(
        plot_df["feature"],
        plot_df["contribution"],
        color=colors
    )

    ax.axvline(0, color="#9CA3AF", linewidth=1)
    ax.grid(axis="x", linestyle="--", alpha=0.25)

    for spine in ["top", "right", "left"]:
        ax.spines[spine].set_visible(False)

    # The dash is a real en dash; the previous literal held the mis-decoded
    # byte sequence of that character.
    fig.suptitle(
        "Credit Risk Drivers – Example Client",
        fontsize=16,
        fontweight="bold",
        x=0.02,
        ha="left"
    )

    ax.set_title(
        f"Estimated Probability of Default: {chosen_pd:.2%}",
        fontsize=11,
        loc="left",
        color="#4B5563",
        pad=12
    )

    ax.set_xlabel("Contribution to risk (log-odds)")

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.92])
    plt.savefig("ebm_waterfall.png", dpi=220, bbox_inches="tight")
    plt.show()
