In [None]:
import csv

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

import Feature_Engineering

In [4]:
# `data` is the object exposed as `Feature_Engineering.data`. The cells below use
# it like a pandas DataFrame (`data.columns`, `data.drop(columns=...)`,
# `data['INGRESO']`).
# NOTE(review): presumably the already feature-engineered dataset — confirm in
# the Feature_Engineering module.
data = Feature_Engineering.data

In [None]:
# Columns to exclude from the feature matrix: the target 'INGRESO' plus the
# other names listed here. Only names actually present in `data` are kept in
# `drop_cols`, so the drop never fails on a missing column.
_drop_candidates = (
    'Cobra', 'INGRESO', 'P47T', 'logP47T', 'CODUSU', 'Q', 'Reg_rk', 'AGLO_rk',
    'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS',
    'P21', 'PP08D1', 'T_VI', 'V12_M', 'V2_M', 'V3_M', 'V5_M', 'TOT_P12',
    'PP07H', 'PP07I', 'PP07J', 'PP07G_59', 'PP07G1', 'PP07G2', 'PP07G3',
    'PP07G4', 'PP07K',
)
_present_columns = set(data.columns)
drop_cols = [name for name in _drop_candidates if name in _present_columns]

# Feature matrix and target.
X = data.drop(columns=drop_cols)
y = data['INGRESO']

# First split: hold out 10% of all rows for validation.
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X, y,
    test_size=0.10,
    random_state=42,
)

# Second split: 2/9 of the remaining 90% is 20% of the total for test,
# leaving 70% of the total for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test, y_train_test,
    test_size=2/9,
    random_state=42,
)

In [None]:
# Column-name lists by dtype, read from the training split. The preprocessors
# defined further down route columns using these two lists. A column whose
# dtype is none of the four listed here ends up in neither list.
_numeric_index = X_train.select_dtypes(include=['int64', 'float64']).columns
_categorical_index = X_train.select_dtypes(include=['object', 'category']).columns

numeric_cols = _numeric_index.tolist()
cat_cols = _categorical_index.tolist()

In [None]:
# Hyperparameter grids, one per model name (same keys as `pipelines`).
# Every parameter carries the `clf__` prefix because the estimator step of each
# pipeline is named 'clf'. Key order inside each grid is kept as-is: the
# training cell builds its `params_filtrados` column by iterating these keys.
param_grids = dict(
    # Logistic regression without regularisation.
    LogisticRegression=dict(
        clf__penalty=[None],
        clf__solver=["lbfgs", "saga"],
        clf__max_iter=[20, 50],
    ),
    # Same grid, searched on the pipeline registered under this name.
    LogisticRegression_Scaled=dict(
        clf__penalty=[None],
        clf__solver=["lbfgs", "saga"],
        clf__max_iter=[20, 50],
    ),
    # L1 / L2 regularised logistic regression, saga solver only.
    LogisticRegressionWithPenalty=dict(
        clf__penalty=["l1", "l2"],
        clf__C=[0.1, 10],
        clf__solver=["saga"],
        clf__max_iter=[20, 50],
    ),
    RandomForest=dict(
        clf__n_estimators=[10, 30],
        clf__max_depth=[None, 20],
        clf__min_samples_split=[2, 5],
        clf__min_samples_leaf=[1, 2],
        clf__max_features=["sqrt", "log2"],
    ),
    SVM=dict(
        clf__C=[0.1, 1, 10],
        clf__kernel=["linear", "rbf", "poly", "sigmoid"],
    ),
    KNN=dict(
        clf__n_neighbors=[3, 5],
        clf__weights=["uniform"],
        clf__metric=["euclidean"],
    ),
    HistGradientBoosting=dict(
        clf__max_iter=[100],
        clf__max_depth=[None, 20],
        clf__learning_rate=[0.1, 0.01],
        clf__min_samples_leaf=[20, 50],
    ),
    XGBClassifier=dict(
        clf__learning_rate=[0.01, 0.1],
        clf__max_depth=[5, 7, 10],
        clf__n_estimators=[10, 20],
        clf__subsample=[0.75],
        clf__colsample_bytree=[0.75],
    ),
)

In [None]:
# Preprocessors
#
# Each ColumnTransformer routes `numeric_cols` and/or `cat_cols`; any column not
# listed in a transformer is dropped (remainder='drop', which is also the
# library default). The comment on each one states what it really does, because
# some of the variable names suggest otherwise.

# Numeric columns only, standardised; categorical columns are discarded.
preprocessor_numonly_scaled = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric_cols)],
    remainder="drop",
)

# Numeric columns only, left untouched; categorical columns are discarded.
preprocessor_numonly_unscaled = ColumnTransformer(
    transformers=[("num", "passthrough", numeric_cols)],
    remainder="drop",
)

# Numeric columns standardised, categorical columns one-hot encoded.
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

# Numeric columns left untouched (NOT scaled), categorical columns one-hot encoded.
preprocessor_scaled_cat = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

# Numeric columns standardised, categorical columns passed through unencoded.
preprocessor_scaled_num = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", "passthrough", cat_cols),
    ],
    remainder="drop",
)

# Both column groups passed through unchanged.
preprocessor_unscaled = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", "passthrough", cat_cols),
    ],
    remainder="drop",
)

In [None]:
# Pipelines
#
# One pipeline per model, keyed by the same names as `param_grids`. The
# estimator step is always called 'clf' so the `clf__*` grid keys resolve.
# Dict order is the order in which the training cell fits the models and writes
# them to the CSV.
pipelines = {
    # Unscaled numeric features + one-hot categoricals.
    "LogisticRegression": Pipeline(steps=[
        ("preproc", preprocessor_scaled_cat),
        ("clf", LogisticRegression(random_state=42, max_iter=100)),
    ]),
    # Standardised numeric features + one-hot categoricals.
    "LogisticRegression_Scaled": Pipeline(steps=[
        ("preproc", preprocessor_scaled),
        ("clf", LogisticRegression(random_state=42, max_iter=100)),
    ]),
    # Same preprocessing; the penalty is chosen by the grid.
    "LogisticRegressionWithPenalty": Pipeline(steps=[
        ("preproc", preprocessor_scaled),
        ("clf", LogisticRegression(random_state=42, max_iter=100)),
    ]),
    # Numeric columns only, unscaled.
    "RandomForest": Pipeline(steps=[
        ("preproc", preprocessor_numonly_unscaled),
        ("clf", RandomForestClassifier(random_state=42)),
    ]),
    # probability=True so predict_proba is available; solver capped at 100 iterations.
    "SVM": Pipeline(steps=[
        ("preproc", preprocessor_scaled),
        ("clf", SVC(probability=True, random_state=42, max_iter=100)),
    ]),
    # Numeric columns only, standardised.
    "KNN": Pipeline(steps=[
        ("preproc", preprocessor_numonly_scaled),
        ("clf", KNeighborsClassifier()),
    ]),
    # No preprocessing step: the estimator receives the feature frame as-is.
    "HistGradientBoosting": Pipeline(steps=[
        ("clf", HistGradientBoostingClassifier(random_state=42)),
    ]),
    # Unscaled numeric features + one-hot categoricals.
    "XGBClassifier": Pipeline(steps=[
        ("preproc", preprocessor_scaled_cat),
        ("clf", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="auc")),
    ]),
}

In [None]:
# Training and evaluation
#
# For every pipeline: run a 5-fold grid search (refit on ROC AUC), report the
# refitted best estimator on the test split, write the per-candidate CV metrics
# to a CSV file, and keep the fitted search plus test predictions in `results`.

results = {}

# CSV file that collects the cross-validation metrics of every candidate.
archivo_evaluaciones = "resultados_modelos2.csv"
# True only until the first model of this run has been written. The first
# write truncates the file and emits the header; later writes append without a
# header. Re-running this cell therefore starts a clean file instead of
# stacking a second header row and duplicate rows onto the previous run's output.
escribir_encabezado = True

# Metrics scored for every candidate. GridSearchCV exposes each of them in
# cv_results_ under "mean_test_<metric>".
metricas_cv = ['roc_auc', 'accuracy', 'precision', 'recall', 'f1']
columnas_metricas = [f"mean_test_{metrica}" for metrica in metricas_cv]

# Columns (and their order) written to the CSV.
columnas_csv = [
    'modelo', 'params_filtrados', 'mean_test_precision', 'mean_test_accuracy',
    'mean_test_recall', 'mean_test_f1', 'mean_test_roc_auc', 'rank_test_roc_auc'
]

for name, pipe in pipelines.items():
    print(f"\n=== GridSearch: {name} ===")
    grid = GridSearchCV(
        pipe,
        param_grids[name],
        cv=5,
        refit='roc_auc',
        scoring=metricas_cv,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X_train, y_train)
    best = grid.best_estimator_

    # Test-split predictions of the refitted best candidate.
    y_pred = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:, 1] if hasattr(best, "predict_proba") else None

    print("Best params:", grid.best_params_)
    print("Best CV score (roc_auc):", grid.best_score_)
    print("\nClassification report (test):")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    if y_proba is not None:
        print("Test ROC AUC:", roc_auc_score(y_test, y_proba))

    # Full grid-search results, tagged with the model name.
    df_resultados = pd.DataFrame(grid.cv_results_)
    df_resultados["modelo"] = name

    # Keep only the parameters that belong to this model's grid, in grid order.
    param_keys = list(param_grids[name])
    df_resultados['params_filtrados'] = [
        {k: d[k] for k in param_keys if k in d}
        for d in df_resultados['params']
    ]

    # Round all mean CV metrics in one step, then keep the exported columns.
    df_resultados[columnas_metricas] = df_resultados[columnas_metricas].round(4)
    df_resultados = df_resultados[columnas_csv]

    # Write after every model so finished results survive a later failure:
    # overwrite + header for the first model, append without header afterwards.
    df_resultados.to_csv(
        archivo_evaluaciones,
        mode="w" if escribir_encabezado else "a",
        index=False,
        header=escribir_encabezado,
    )
    escribir_encabezado = False

    # Keep the fitted search and the test-split predictions for later cells.
    results[name] = {
        "grid": grid,
        "best_estimator": best,
        "test_pred": y_pred,
        "test_proba": y_proba
    }

# Summary: best cross-validated ROC AUC per model.
summary2 = [(name, results[name]["grid"].best_score_) for name in results]
print("\nResumen (mejor CV roc_auc por modelo):", summary2)

In [None]:
# Evaluation model
#
# Fit a HistGradientBoostingClassifier with fixed hyperparameters on the
# combined train+test portion and score it on the validation split, which no
# earlier cell shown here passes to a fit.
model_eval = HistGradientBoostingClassifier(
    random_state=42,
    max_iter=100,
    max_depth=20,
    learning_rate=0.1,
    min_samples_leaf=50,
)
model_eval.fit(X_train_test, y_train_test)

y_pred = model_eval.predict(X_val)
# Second predict_proba column, used as the score for ROC AUC.
y_proba = model_eval.predict_proba(X_val)[:, 1]

# Metrics: (label, scoring function, what it is scored on). Each metric is
# computed and printed in turn, rounded to 4 decimals.
_metric_table = (
    ("Accuracy:", accuracy_score, y_pred),
    ("Precision:", precision_score, y_pred),
    ("Recall:", recall_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
    ("ROC AUC:", roc_auc_score, y_proba),
)
for _label, _scorer, _scored in _metric_table:
    print(_label, round(_scorer(y_val, _scored), 4))

# Report and confusion matrix.
_report_text = classification_report(y_val, y_pred, digits=4)
print("\nClassification Report:\n", _report_text)
_conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:\n", _conf_matrix)