In [5]:
#Hacemos nuevamente la carga al df desde el archivo limpio del parquet.

import pandas as pd
df = pd.read_parquet("../data/processed/creditos.parquet")

X = df.drop(columns=["estado"])
y = (df["estado"] == "DESISTIDA").astype(int)   # 1 = desistido


In [6]:
# Definimos las columnas

num_cols = X.select_dtypes("number").columns
cat_cols = X.select_dtypes("string").columns


In [7]:
# Verificamos las categorías que tenemos en estado..
df['estado'].unique()

<StringArray>
['APROBADA', 'DESISTIDA', 'ANULADA', 'NEGADA']
Length: 4, dtype: string

In [8]:
# Importamos las librerías y aplicamos los modelos de regresión logística y ramdon forest

from sklearn.compose      import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline     import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble     import RandomForestClassifier
import numpy as np

pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

models = {
    "LogReg": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RF":     RandomForestClassifier(n_estimators=300, class_weight="balanced")
}

for name, clf in models.items():
    pipe = Pipeline([("pre", pre), ("clf", clf)])
    auc  = cross_val_score(pipe, X, y, cv=5, scoring="roc_auc")
    print(f"{name}: {auc.mean():.3f} ± {auc.std():.3f}")



LogReg: 0.626 ± 0.006
RF: 0.613 ± 0.047


Meta razonable: ≥ 0.70 para que el modelo aporte valor real. Vamos a exprimir la logística, afinándola.

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "clf__penalty": ["l2", "l1", "elasticnet"],
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__solver": ["liblinear", "saga"],
    "clf__l1_ratio": [0, 0.5, 1]  # solo aplica con elasticnet
}

pipe_lr = Pipeline([("pre", pre),
                    ("clf", LogisticRegression(
                         max_iter=2000, class_weight="balanced", n_jobs=-1))])

gscv = GridSearchCV(pipe_lr, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
gscv.fit(X, y)

print("Mejor ROC-AUC:", gscv.best_score_.round(3))
print("Mejores hiperparámetros:", gscv.best_params_)


60 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\It_nova\OneDrive - Conconcreto\Escritorio\proyectos_personales\celerix\credito-digital-analytics\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\It_nova\OneDrive - Conconcreto\Escritorio\proyectos_personales\celerix\credito-digital-analytics\venv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\It_nova\OneDrive - Conconcreto\Escr

Mejor ROC-AUC: 0.627
Mejores hiperparámetros: {'clf__C': 0.1, 'clf__l1_ratio': 0.5, 'clf__penalty': 'l1', 'clf__solver': 'saga'}


In [10]:
# --- Agregamos un entrenamiento con el df -----------------------------
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_parquet("../data/processed/creditos.parquet")
X = df.drop(columns=["estado"])
y = (df["estado"].str.upper().str.strip() == "DESISTIDA").astype(int)

num_cols = X.select_dtypes("number").columns
cat_cols = X.select_dtypes("string").columns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)


In [11]:
# --- Celda B: modelo final (hiperparámetros del grid) ---
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    penalty="l1", solver="saga", C=0.1,
    max_iter=2000, class_weight="balanced", n_jobs=1
)
pipe = Pipeline([("pre", pre), ("clf", clf)])
pipe.fit(X_train, y_train)


0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,2000


In [12]:
# --- Celda C: métricas + umbral óptimo ------------------
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
import numpy as np

proba = pipe.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, proba)

# Umbral que maximiza F1 de la clase positiva (DESISTIDA)
prec, rec, thr = precision_recall_curve(y_test, proba)
f1 = 2*prec*rec/(prec+rec+1e-9)
best_idx = np.nanargmax(f1)
best_thr = float(thr[best_idx])

pred_def   = (proba >= 0.5).astype(int)
pred_best  = (proba >= best_thr).astype(int)

print("ROC-AUC:", round(roc, 3))
print("\nReporte @0.50\n", classification_report(y_test, pred_def))
print("Matriz @0.50\n", confusion_matrix(y_test, pred_def))
print("\nUmbral óptimo F1:", round(best_thr, 3))
print("\nReporte @umbral óptimo\n", classification_report(y_test, pred_best))
print("Matriz @umbral óptimo\n", confusion_matrix(y_test, pred_best))


ROC-AUC: 0.64

Reporte @0.50
               precision    recall  f1-score   support

           0       0.73      0.58      0.64      6421
           1       0.46      0.62      0.53      3680

    accuracy                           0.59     10101
   macro avg       0.59      0.60      0.58     10101
weighted avg       0.63      0.59      0.60     10101

Matriz @0.50
 [[3699 2722]
 [1392 2288]]

Umbral óptimo F1: 0.416

Reporte @umbral óptimo
               precision    recall  f1-score   support

           0       0.79      0.28      0.41      6421
           1       0.41      0.87      0.56      3680

    accuracy                           0.49     10101
   macro avg       0.60      0.57      0.48     10101
weighted avg       0.65      0.49      0.46     10101

Matriz @umbral óptimo
 [[1783 4638]
 [ 473 3207]]


In [13]:
# --- Celda D: guarda artefactos -------------------------
import joblib, json, pathlib

pathlib.Path("../models").mkdir(exist_ok=True)
joblib.dump(pipe, "../models/model_desist.pkl")

meta = {"threshold": best_thr, "roc_auc": float(roc)}
with open("../models/model_meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Guardado ../models/model_desist.pkl y model_meta.json")


Guardado ../models/model_desist.pkl y model_meta.json
