In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [2]:
data_set_base = pd.read_parquet("../02_preparación de los datos/data_set_final.parquet")

print(data_set_base.shape)

(307511, 105)


# Separacion de los datos

In [3]:
X = data_set_base.drop(columns=["TARGET", "SK_ID_CURR"])
y = data_set_base["TARGET"]


# División del dataset

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)
print("Proporción default train:", y_train.mean())
print("Proporción default test:", y_test.mean())

Train: (230633, 103)
Test: (76878, 103)
Proporción default train: 0.08072999093798372
Proporción default test: 0.08072530502874685


# Entrenamiento con regresión logistica

In [5]:
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1)

log_reg.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.94      0.65      0.77     70672
           1       0.12      0.54      0.19      6206

    accuracy                           0.64     76878
   macro avg       0.53      0.59      0.48     76878
weighted avg       0.87      0.64      0.72     76878

ROC AUC: 0.6302893265122573


## Resultados de la Regresión Logística

Con el objetivo de validar la calidad del dataset preparado y verificar la existencia de señal predictiva, se entrenó un modelo base de **Regresión Logística** utilizando el conjunto de datos final (`data_set_base`).  
Debido al fuerte desbalance de la variable objetivo (`TARGET` ≈ 8% de incumplimiento), se utilizó el parámetro `class_weight="balanced"` para priorizar la detección de la clase minoritaria.

### Reporte de clasificación

- **Clase 0 (clientes sin incumplimiento):**
  - Alta precisión (0.94)
  - Recall moderado (0.65)

- **Clase 1 (clientes con incumplimiento):**
  - Recall elevado (0.54), lo que indica que el modelo logra identificar más del 50% de los clientes en riesgo.
  - Precisión baja (0.12), esperable en un contexto altamente desbalanceado.

### Análisis e interpretación

El valor de **ROC AUC = 0.63** confirma que el modelo presenta una capacidad de discriminación superior al azar, lo cual es especialmente relevante considerando que se trata de un modelo lineal simple y sin optimización de hiperparámetros.  
Este resultado valida que el proceso de preparación de datos fue exitoso y permitió capturar patrones significativos asociados al incumplimiento de pago.

Desde una perspectiva de negocio, el **recall de la clase positiva** adquiere mayor relevancia que la precisión, dado que el costo de aprobar a un cliente riesgoso es superior al de rechazar a un cliente solvente. En este sentido, el desempeño del modelo es consistente con los objetivos del análisis de riesgo crediticio.


# Entrenamiento con regresión logisita ajustado

## Entrenamiendo con class_weight

In [7]:
lr_balanced = LogisticRegression(
    max_iter=3000,
    class_weight="balanced",
    solver="lbfgs",
    n_jobs=-1
)

lr_balanced.fit(X_train, y_train)

y_proba_lr = lr_balanced.predict_proba(X_test)[:, 1]
y_pred_lr = (y_proba_lr >= 0.5).astype(int)

print("=== Logistic Regression (class_weight='balanced') ===")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_proba_lr))

=== Logistic Regression (class_weight='balanced') ===
              precision    recall  f1-score   support

           0       0.94      0.64      0.76     70672
           1       0.12      0.55      0.19      6206

    accuracy                           0.63     76878
   macro avg       0.53      0.59      0.48     76878
weighted avg       0.88      0.63      0.72     76878

ROC AUC: 0.6303116389005039


## Entrenamiento con SMOTE

In [8]:
smote = SMOTE(random_state=42)

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

lr_smote = LogisticRegression(
    max_iter=3000,
    solver="lbfgs",
    n_jobs=-1
)

lr_smote.fit(X_train_sm, y_train_sm)

y_proba_lr_sm = lr_smote.predict_proba(X_test)[:, 1]
y_pred_lr_sm = (y_proba_lr_sm >= 0.5).astype(int)

print("=== Logistic Regression + SMOTE ===")
print(classification_report(y_test, y_pred_lr_sm))
print("ROC AUC:", roc_auc_score(y_test, y_proba_lr_sm))

=== Logistic Regression + SMOTE ===
              precision    recall  f1-score   support

           0       0.94      0.63      0.76     70672
           1       0.12      0.56      0.19      6206

    accuracy                           0.63     76878
   macro avg       0.53      0.60      0.48     76878
weighted avg       0.88      0.63      0.71     76878

ROC AUC: 0.6301170815326861


# Entrenamiento con Random Forest

In [9]:
# Variables predictoras y objetivo
X = data_set_base.drop(columns=["TARGET", "SK_ID_CURR"])
y = data_set_base["TARGET"]

# Train / Test split estratificado
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

In [10]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=50,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Predicciones
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Métricas
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.96      0.75      0.84     70672
           1       0.18      0.63      0.28      6206

    accuracy                           0.74     76878
   macro avg       0.57      0.69      0.56     76878
weighted avg       0.90      0.74      0.80     76878

ROC AUC: 0.7577269811485536


## Resultados de Random Forest

El objetivo de este modelo es predecir la probabilidad de incumplimiento de pago (`TARGET = 1`) de un cliente, utilizando el dataset integrado y preparado (`data_set_base`), el cual incorpora información:

- Socioeconómica del cliente (application)
- Historial crediticio (bureau, bureau_balance)
- Créditos previos (previous_application)
- Comportamiento de pago (pos_cash, installments)
- Uso de tarjetas de crédito (credit_card_balance)

---

### Justificación del uso de Random Forest

Se seleccionó Random Forest como modelo debido a que:

- Captura relaciones no lineales entre variables explicativas y el riesgo crediticio.
- Es robusto frente a multicolinealidad, outliers y ruido residual.
- Maneja adecuadamente datasets con alta dimensionalidad.
- Es ampliamente utilizado en problemas reales de scoring crediticio.

Dado que el comportamiento de pago depende de múltiples interacciones complejas (historial, montos, antigüedad, atrasos previos), un modelo lineal resulta insuficiente, justificando el uso de Random Forest.

---

### Configuración general del modelo

El modelo fue entrenado utilizando:

- División train/test estratificada.
- Variable objetivo desbalanceada (~8% de incumplimiento).
- Dataset previamente limpiado, imputado, codificado y escalado.
- Sin técnicas de balanceo artificial, para preservar la distribución real del riesgo.

---


### Análisis de métricas

- **ROC AUC (0.758):** indica una buena capacidad discriminante del modelo.
- **Recall clase 1 (0.63):** el modelo identifica correctamente cerca del 63% de los clientes riesgosos.
- **Precision clase 1 (0.18):** valor esperado dado el fuerte desbalance de clases.
- **Accuracy (0.74):** desempeño global aceptable, aunque no es la métrica principal.

---

### Interpretación desde el punto de vista del negocio

En el contexto del riesgo crediticio:

- Es preferible identificar clientes con alto riesgo, aunque se generen falsos positivos.
- Se prioriza el Recall sobre la Precision para la clase positiva.
- El modelo cumple adecuadamente con este criterio de negocio.

# Entrenamiento con Random Forest ajustado

In [12]:
# Separar X e y
X = data_set_base.drop(columns=["TARGET", "SK_ID_CURR"])
y = data_set_base["TARGET"]

# Train / Test estratificado
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Modelo Random Forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=50,
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)

# Probabilidades
y_proba = rf.predict_proba(X_test)[:, 1]

# ROC AUC base
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC:", roc_auc)

ROC AUC: 0.766175074699304


In [13]:
def evaluar_umbral(y_true, y_proba, threshold):
    y_pred = (y_proba >= threshold).astype(int)
    
    return {
        "umbral": threshold,
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_1": precision_score(y_true, y_pred, zero_division=0),
        "recall_1": recall_score(y_true, y_pred),
        "f1_1": f1_score(y_true, y_pred)
    }


In [14]:
umbrales = [0.2, 0.3, 0.4, 0.5, 0.6]

resultados = []

for u in umbrales:
    res = evaluar_umbral(y_test, y_proba, u)
    resultados.append(res)

df_resultados = pd.DataFrame(resultados)
df_resultados


Unnamed: 0,umbral,accuracy,precision_1,recall_1,f1_1
0,0.2,0.904641,0.348178,0.207863,0.260317
1,0.3,0.919652,0.544615,0.028521,0.054203
2,0.4,0.919288,1.0,0.000161,0.000322
3,0.5,0.919275,0.0,0.0,0.0
4,0.6,0.919275,0.0,0.0,0.0


### Observaciones
El ajuste de umbral evidenció que el Random Forest, sin corrección por desbalance, presenta una fuerte tendencia a clasificar la clase mayoritaria. Aunque alcanza un ROC AUC elevado, su desempeño operativo en detección de morosidad es limitado si no se ajusta el umbral o el peso de clases. Esto demuestra la importancia de evaluar modelos más allá de la accuracy y de adaptar el criterio de decisión al contexto del negocio.

## Entrenamiendo de Random Forest con class_weight

In [15]:
rf_balanced = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=50,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

rf_balanced.fit(X_train, y_train)

y_proba_rf_bal = rf_balanced.predict_proba(X_test)[:, 1]
y_pred_rf_bal = (y_proba_rf_bal >= 0.5).astype(int)

print("=== Random Forest (class_weight='balanced') ===")
print(classification_report(y_test, y_pred_rf_bal))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf_bal))

=== Random Forest (class_weight='balanced') ===
              precision    recall  f1-score   support

           0       0.95      0.84      0.89     70672
           1       0.22      0.52      0.31      6206

    accuracy                           0.81     76878
   macro avg       0.59      0.68      0.60     76878
weighted avg       0.89      0.81      0.85     76878

ROC AUC: 0.7661736633598063


## Entrenamiento de Random Forest con SMOTE

In [16]:
# Aplicar SMOTE SOLO en entrenamiento
smote = SMOTE(
    sampling_strategy=0.5,   # no 1.0 para evitar overfitting
    random_state=42
)

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

print("Distribución original:", y_train.value_counts())
print("Distribución tras SMOTE:", pd.Series(y_train_sm).value_counts())

Distribución original: TARGET
0    212014
1     18619
Name: count, dtype: int64
Distribución tras SMOTE: TARGET
0    212014
1    106007
Name: count, dtype: int64


In [17]:
rf_smote = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=50,
    n_jobs=-1,
    random_state=42
)

rf_smote.fit(X_train_sm, y_train_sm)

y_proba_rf_smote = rf_smote.predict_proba(X_test)[:, 1]
y_pred_rf_smote = (y_proba_rf_smote >= 0.5).astype(int)

print("=== Random Forest + SMOTE ===")
print(classification_report(y_test, y_pred_rf_smote))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf_smote))

=== Random Forest + SMOTE ===
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     70672
           1       0.30      0.01      0.01      6206

    accuracy                           0.92     76878
   macro avg       0.61      0.50      0.48     76878
weighted avg       0.87      0.92      0.88     76878

ROC AUC: 0.7328299811155023


In [18]:
umbrales = [0.3, 0.4, 0.5]

resultados = []

for u in umbrales:
    y_pred_u = (y_proba_rf_smote >= u).astype(int)
    resultados.append({
        "umbral": u,
        "accuracy": accuracy_score(y_test, y_pred_u),
        "precision_1": precision_score(y_test, y_pred_u, zero_division=0),
        "recall_1": recall_score(y_test, y_pred_u),
        "f1_1": f1_score(y_test, y_pred_u)
    })

pd.DataFrame(resultados)

Unnamed: 0,umbral,accuracy,precision_1,recall_1,f1_1
0,0.3,0.874893,0.226865,0.228327,0.227594
1,0.4,0.913278,0.300433,0.055914,0.094281
2,0.5,0.918728,0.301887,0.005156,0.010139


# Entrenamiento de GradientBoostingClassifier, XGBoost y LightGBM

In [19]:
X = data_set_base.drop(columns=["TARGET", "SK_ID_CURR"])
y = data_set_base["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [36]:
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
y_proba = gb.predict_proba(X_test)[:, 1]

gb_auc = roc_auc_score(y_test, y_proba)

print("=== Gradient Boosting ===")
print(classification_report(y_test, y_pred))
print("ROC AUC:", gb_auc)

=== Gradient Boosting ===
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     70672
           1       0.66      0.02      0.03      6206

    accuracy                           0.92     76878
   macro avg       0.79      0.51      0.50     76878
weighted avg       0.90      0.92      0.88     76878

ROC AUC: 0.7681456056022673


In [37]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric="auc",
    random_state=42
)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:, 1]

xgb_auc = roc_auc_score(y_test, y_proba)

print("=== XGBoost ===")
print(classification_report(y_test, y_pred))
print("ROC AUC:", xgb_auc)

=== XGBoost ===
              precision    recall  f1-score   support

           0       0.96      0.72      0.82     70672
           1       0.18      0.69      0.28      6206

    accuracy                           0.72     76878
   macro avg       0.57      0.71      0.55     76878
weighted avg       0.90      0.72      0.78     76878

ROC AUC: 0.7757039145806082


In [38]:
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    class_weight="balanced",
    random_state=42
)

lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)
y_proba = lgbm.predict_proba(X_test)[:, 1]

lgbm_auc = roc_auc_score(y_test, y_proba)

print("=== LightGBM ===")
print(classification_report(y_test, y_pred))
print("ROC AUC:", lgbm_auc)

[LightGBM] [Info] Number of positive: 18619, number of negative: 212014
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,098900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13725
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 98
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0,500000 -> initscore=-0,000000
[LightGBM] [Info] Start training from score -0,000000
=== LightGBM ===
              precision    recall  f1-score   support

           0       0.96      0.73      0.83     70672
           1       0.18      0.68      0.29      6206

    accuracy                           0.73     76878
   macro avg       0.57      0.71      0.56     76878
weighted avg       0.90      0.73      0.79     76878

ROC AUC: 0.7770620586634229


In [39]:
results = pd.DataFrame([
    {"Modelo": "Logistic Regression", "ROC_AUC": 0.63},
    {"Modelo": "Random Forest", "ROC_AUC": 0.76},
    {"Modelo": "Gradient Boosting", "ROC_AUC": gb_auc},
    {"Modelo": "XGBoost", "ROC_AUC": xgb_auc},
    {"Modelo": "LightGBM", "ROC_AUC": lgbm_auc},
])

results.sort_values("ROC_AUC", ascending=False)

Unnamed: 0,Modelo,ROC_AUC
4,LightGBM,0.777062
3,XGBoost,0.775704
2,Gradient Boosting,0.768146
1,Random Forest,0.76
0,Logistic Regression,0.63


In [34]:
joblib.dump(lgbm, "modelo_riesgo_crediticio_lgb.pkl")


['modelo_riesgo_crediticio_lgb.pkl']

In [35]:
results.dtypes


Modelo     object
ROC_AUC    object
dtype: object

In [40]:
X.iloc[0].tolist()

[202500.0,
 406597.5,
 24700.5,
 0.018801,
 -9461.0,
 -637.0,
 -3648.0,
 -2120.0,
 9.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 2.0,
 10.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0830369673913225,
 0.2629485927471776,
 0.1393757800997895,
 0.0833,
 2.0,
 2.0,
 -1134.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.2556818181818182,
 0.5,
 1.0,
 108131.945625,
 450000.0,
 49156.2,
 245781.0,
 0.0,
 0.0,
 0.25,
 2.3945205479452056,
 3.936986301369863,
 0.9561643835616439,
 179055.0,
 179055.0,
 9251.775,
 9251.775,
 1.0,
 1.6602739726027398,
 1.6602739726027398,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 24.0,
 24.0,
 15.0,
 1.0,
 -20.42105263157895,
 -20.42105263157895,
 -12.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0]