In [None]:
# =========================================
# Comparación justa SIN GridSearch (rápido)
# CV 5-fold + tuning de umbral por fold (max F1)
# Preprocesamiento unificado en Pipeline (sin leakage)
# =========================================
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    accuracy_score, f1_score, precision_score, recall_score
)
from sklearn.base import BaseEstimator, TransformerMixin

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier

# ---------- 1) Data ----------
# Load the processed dataset. Every column except the target ("Response") and
# the identifier ("ID", when present) is used as a feature.
DATA_PATH = "./data/processed/df_priority6_with_response_index.csv"
df = pd.read_csv(DATA_PATH)
X = df.drop(columns=[c for c in ["Response","ID"] if c in df.columns])
y = df["Response"].astype(int)

# Raw string on purpose: in a regular literal "\U" opens an 8-digit unicode
# escape (a SyntaxError for this Windows path) and "\n" would become a newline.
BASE = r"C:\Users\axime\Downloads\nueva_data\modelado\MODELO_FINAL\modelo_final.ipynb"
print(f"[INFO] Dataset shape: {X.shape}, Pos={int(y.sum())}, Neg={len(y)-int(y.sum())}")

# ---------- 2) Transformador: poda por correlación dentro del pipeline ----------
class CorrPruner(BaseEstimator, TransformerMixin):
    """Drop features that are highly correlated with an earlier feature.

    During ``fit`` the absolute correlation matrix (``DataFrame.corr``) of the
    training data is computed. A column is discarded when its absolute
    correlation with any column located before it exceeds ``thr``. The
    positions of the surviving columns are stored in ``keep_idx_`` and applied
    positionally in ``transform``.

    Parameters
    ----------
    thr : float, default=0.98
        Absolute-correlation threshold above which the later column of a pair
        is dropped.

    Attributes
    ----------
    keep_idx_ : list of int
        Positional indices of the columns kept. Only exists after ``fit``.
    """

    def __init__(self, thr=0.98):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) is created in ``fit`` so an unfitted instance is not
        # mistaken for a fitted one.
        self.thr = thr

    def fit(self, X, y=None):
        """Learn which column positions to keep; ``y`` is ignored."""
        Xdf = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        n_cols = Xdf.shape[1]
        # Relabel columns by position so duplicated or arbitrary column names
        # cannot make the label-based lookups ambiguous.
        Xpos = Xdf.set_axis(pd.RangeIndex(n_cols), axis=1)
        corr = Xpos.corr().abs()
        vals = corr.to_numpy()
        # Strict upper triangle: entry (i, j) with i < j compares column j
        # against an earlier column i. NaN correlations compare as False.
        upper = np.triu(np.ones(vals.shape, dtype=bool), k=1)
        flagged = ((vals > self.thr) & upper).any(axis=0)
        drop = set(corr.columns[flagged])
        # Columns absent from the correlation matrix are never in ``drop`` and
        # are therefore kept.
        self.keep_idx_ = [i for i in range(n_cols) if i not in drop]
        return self

    def transform(self, X):
        """Return the kept columns of ``X`` as a NumPy array.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if getattr(self, "keep_idx_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CorrPruner instance is not fitted yet; call 'fit' first."
            )
        Xdf = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return Xdf.iloc[:, self.keep_idx_].values

def base_steps():
    """Build a fresh list of the preprocessing steps shared by every pipeline.

    Returns a new list on each call (with new transformer instances), holding
    a zero-variance filter named "vt" followed by the correlation pruner
    named "corr".
    """
    steps = []
    steps.append(("vt", VarianceThreshold(0.0)))
    steps.append(("corr", CorrPruner(thr=0.98)))
    return steps

# ---------- 3) Fixed hyper-parameters (previous "winners" / sensible for AUCPR) ----------
# Negative-to-positive ratio, passed to XGBoost to compensate class imbalance.
scale_pos_weight = (len(y) - y.sum()) / y.sum()

# Logistic regression: previously reported winning configuration.
pipe_lr = Pipeline(steps=[
    *base_steps(),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l2",
        C=0.1,
        solver="saga",
        class_weight="balanced",
        max_iter=5000,
        n_jobs=-1,
        random_state=42,
    )),
])

# Linear SVM with a fixed C; probability=True so predict_proba is available.
pipe_svm = Pipeline(steps=[
    *base_steps(),
    ("scaler", StandardScaler()),
    ("svm", SVC(
        kernel="linear",
        C=0.5,
        class_weight="balanced",
        probability=True,
        random_state=42,
    )),
])

# Random forest: best settings from earlier runs (no scaling step).
pipe_rf = Pipeline(steps=[
    *base_steps(),
    ("rf", RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features="sqrt",
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42,
    )),
])

# XGBoost: fixed parameters aimed at AUCPR on imbalanced data.
pipe_xgb = Pipeline(steps=[
    *base_steps(),
    ("xgb", XGBClassifier(
        objective="binary:logistic",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=4,
        min_child_weight=1,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=3.0,
        reg_alpha=0.0,
        gamma=0.0,
        scale_pos_weight=scale_pos_weight,
        tree_method="hist",
        eval_metric="aucpr",
        n_jobs=-1,
        random_state=42,
    )),
])

# Gaussian naive Bayes on standardized features.
pipe_gnb = Pipeline(steps=[
    *base_steps(),
    ("scaler", StandardScaler()),
    ("gnb", GaussianNB(var_smoothing=1e-8)),
])

# Bernoulli naive Bayes: features binarized into 2 quantile bins, alpha=1.0.
pipe_bnb = Pipeline(steps=[
    *base_steps(),
    ("bin", KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="quantile")),
    ("bnb", BernoulliNB(alpha=1.0)),
])

# Display name -> pipeline; insertion order is the evaluation order.
models = {
    "LR(L2,C=0.1)": pipe_lr,
    "SVM(lineal,C=0.5)": pipe_svm,
    "RandomForest": pipe_rf,
    "XGBoost": pipe_xgb,
    "GaussianNB": pipe_gnb,
    "BernoulliNB(quantile)": pipe_bnb,
}

# ---------- 4) Util: umbral que maximiza F1 en el fold ----------
def best_threshold(y_true, score):
    """Return the decision threshold that maximizes F1 for the given scores.

    Candidate thresholds are 0.0, every distinct score rounded to 6 decimals
    (ascending), and 1.0. A sample is predicted positive when
    ``score >= threshold``. On ties the smallest candidate wins.
    """
    candidates = np.r_[0.0, np.unique(np.round(score, 6)), 1.0]
    f1_by_candidate = [
        f1_score(y_true, (score >= cut).astype(int)) for cut in candidates
    ]
    # argmax returns the first maximum, matching a strict ">" update scan.
    return candidates[int(np.argmax(f1_by_candidate))]

# ---------- 5) 5-fold evaluation for ALL models (no GridSearch) ----------
# NOTE(review): the threshold is tuned on the same validation fold that is
# then scored, so F1/Acc/Prec/Rec are optimistic; AUCPR and ROC-AUC do not
# depend on the threshold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
summary_rows = []

for name, pipe in models.items():
    print(f"\n===== {name} | 5-fold CV con umbral óptimo por fold =====")
    mets = {key: [] for key in ("AUCPR", "ROC-AUC", "Acc", "F1", "Prec", "Rec")}

    for fold, (i_tr, i_va) in enumerate(skf.split(X, y), start=1):
        X_tr, y_tr = X.iloc[i_tr], y.iloc[i_tr]
        X_va, y_va = X.iloc[i_va], y.iloc[i_va]

        # The whole pipeline (preprocessing + model) is refit on the training part.
        pipe.fit(X_tr, y_tr)

        if not hasattr(pipe, "predict_proba"):
            # Fallback: min-max scale the decision function into [0, 1].
            y_dec = pipe.decision_function(X_va)
            y_score = (y_dec - y_dec.min()) / (y_dec.max() - y_dec.min() + 1e-12)
        else:
            # Positive-class probability.
            y_score = pipe.predict_proba(X_va)[:, 1]

        thr = best_threshold(y_va, y_score)
        y_pred = (y_score >= thr).astype(int)

        fold_vals = {
            "AUCPR": average_precision_score(y_va, y_score),
            "ROC-AUC": roc_auc_score(y_va, y_score),
            "Acc": accuracy_score(y_va, y_pred),
            "F1": f1_score(y_va, y_pred),
            "Prec": precision_score(y_va, y_pred, zero_division=0),
            "Rec": recall_score(y_va, y_pred),
        }
        for key, val in fold_vals.items():
            mets[key].append(val)

        print(f"[Fold {fold}] thr={thr:.3f} | AUCPR={fold_vals['AUCPR']:.4f} | "
              f"ROC-AUC={fold_vals['ROC-AUC']:.4f} | "
              f"F1={fold_vals['F1']:.4f} | Acc={fold_vals['Acc']:.4f} | "
              f"Prec={fold_vals['Prec']:.4f} | Rec={fold_vals['Rec']:.4f}")

    print(f"\n=== {name} (PROMEDIO 5-FOLDS) ===")
    for m, v in mets.items():
        print(f"{m}: {np.mean(v):.4f} ± {np.std(v):.4f}")

    # One summary row per model: mean and (population) std of each metric.
    row = {"Modelo": name}
    for col, key in (("AUCPR", "AUCPR"), ("ROC", "ROC-AUC"), ("F1", "F1"),
                     ("Acc", "Acc"), ("Prec", "Prec"), ("Rec", "Rec")):
        row[f"{col}_mean"] = np.mean(mets[key])
        row[f"{col}_std"] = np.std(mets[key])
    summary_rows.append(row)

# ---------- 6) Save the combined summary ----------
summary = pd.DataFrame(summary_rows)
out_path = "./data/processed/comparacion_sin_gridsearch_summary.csv"
summary.to_csv(out_path, index=False)
print(f"\n[INFO] Resumen guardado en: {out_path}")


[INFO] Dataset shape: (2551, 730), Pos=635, Neg=1916

===== LR(L2,C=0.1) | 5-fold CV con umbral óptimo por fold =====
[Fold 1] thr=0.596 | AUCPR=0.7959 | ROC-AUC=0.9045 | F1=0.7344 | Acc=0.8669 | Prec=0.7287 | Rec=0.7402
[Fold 2] thr=0.548 | AUCPR=0.7768 | ROC-AUC=0.9182 | F1=0.7510 | Acc=0.8725 | Prec=0.7313 | Rec=0.7717
[Fold 3] thr=0.574 | AUCPR=0.8261 | ROC-AUC=0.9329 | F1=0.7630 | Acc=0.8745 | Prec=0.7203 | Rec=0.8110
[Fold 4] thr=0.612 | AUCPR=0.8502 | ROC-AUC=0.9332 | F1=0.7805 | Acc=0.8941 | Prec=0.8067 | Rec=0.7559
[Fold 5] thr=0.578 | AUCPR=0.8695 | ROC-AUC=0.9420 | F1=0.7984 | Acc=0.9000 | Prec=0.8016 | Rec=0.7953

=== LR(L2,C=0.1) (PROMEDIO 5-FOLDS) ===
AUCPR: 0.8237 ± 0.0340
ROC-AUC: 0.9261 ± 0.0132
Acc: 0.8816 ± 0.0130
F1: 0.7654 ± 0.0223
Prec: 0.7577 ± 0.0381
Rec: 0.7748 ± 0.0257

===== SVM(lineal,C=0.5) | 5-fold CV con umbral óptimo por fold =====
[Fold 1] thr=0.355 | AUCPR=0.7368 | ROC-AUC=0.8729 | F1=0.7077 | Acc=0.8513 | Prec=0.6917 | Rec=0.7244
[Fold 2] thr=0.453 | 