In [1]:
# Entorno, sin entrenamiento

#imports básicos
import sys
import numpy as np
import pandas as pd
from pathlib import Path

#importar shap e instalar
try:
    import shap  # noqa
except Exception:
    !pip -q install shap
    import shap  # noqa

import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 120

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Configuration: seed, input file and output folder layout
RANDOM_STATE = 42
DATA_PATH = Path("../data/stroke.csv")
OUTPUTS = Path("../outputs")
TABLAS, FIGURAS, MODELOS = (
    OUTPUTS / subdir for subdir in ("tablas", "figuras", "modelos")
)

# Create every output folder up front (no error if it already exists)
for p in (TABLAS, FIGURAS, MODELOS):
    p.mkdir(parents=True, exist_ok=True)

# Load the dataset and check that the target column exists and only holds 0/1
df = pd.read_csv(DATA_PATH)
assert "stroke" in df.columns, "No se encuentra la columna 'stroke' en el CSV."
assert set(df["stroke"].unique()).issubset({0, 1}), "La variable 'stroke' debe ser binaria 0/1."

# Split into the feature frame and the integer target
X = df.drop(columns="stroke")
y = df["stroke"].astype(int)

# Column lists, declared as in Notebook 03
num_cols = [
    "age",
    "avg_glucose_level",
    "bmi",
    "hypertension",
    "heart_disease",
]
cat_cols = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]

# Fail fast when the CSV lacks any declared column
missing_num = [col for col in num_cols if col not in X.columns]
missing_cat = [col for col in cat_cols if col not in X.columns]
assert not (missing_num or missing_cat), f"Faltan columnas -> num:{missing_num} | cat:{missing_cat}"

# Preprocessing: one sub-pipeline per column type, combined in a ColumnTransformer.
# Numeric columns: median imputation followed by standardisation.
num_tf = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])
# Categorical columns: most-frequent imputation followed by dense one-hot encoding.
cat_tf = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
# Columns not listed in num_cols / cat_cols are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", num_tf, num_cols),
        ("cat", cat_tf, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Summary of what was loaded and configured
print("Datos:", df.shape, "| X:", X.shape, "| y:", y.shape)
print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)
print("Preprocesador listo (igual al de Notebook 03). SHAP disponible:", "shap" in sys.modules)

  from .autonotebook import tqdm as notebook_tqdm


Datos: (5110, 12) | X: (5110, 11) | y: (5110,)
Numéricas: ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease']
Categóricas: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Preprocesador listo (igual al de Notebook 03). SHAP disponible: True


In [2]:
# entrenar y guardar RL, DT y RF (para interpretabilidad)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import joblib
import numpy as np
import pandas as pd

# Define the models with the same parameters as Notebook 03
from sklearn.base import clone

clf_rl = LogisticRegression(
    max_iter=5000, class_weight="balanced", solver="lbfgs", random_state=RANDOM_STATE
)
clf_dt = DecisionTreeClassifier(
    criterion="gini", class_weight="balanced", random_state=RANDOM_STATE
)
clf_rf = RandomForestClassifier(
    n_estimators=500, class_weight="balanced_subsample",
    random_state=RANDOM_STATE, n_jobs=-1
)

# Each pipeline gets its own copy of the preprocessor. Passing the same object
# to all three would make them share one fitted transformer, so refitting any
# pipeline would silently change the other two. The `preprocessor` template
# itself is left unfitted.
pipe_rl = Pipeline([("prepro", clone(preprocessor)), ("clf", clf_rl)])
pipe_dt = Pipeline([("prepro", clone(preprocessor)), ("clf", clf_dt)])
pipe_rf = Pipeline([("prepro", clone(preprocessor)), ("clf", clf_rf)])

# Fit the three pipelines on the full dataset
pipe_rl.fit(X, y)
pipe_dt.fit(X, y)
pipe_rf.fit(X, y)

# Persist the fitted pipelines under outputs/modelos
path_rl = MODELOS / "pipe_RL_classweight.joblib"
path_dt = MODELOS / "pipe_DT_classweight.joblib"
path_rf = MODELOS / "pipe_RF_classweight.joblib"

for fitted_pipe, dest in zip((pipe_rl, pipe_dt, pipe_rf), (path_rl, path_dt, path_rf)):
    joblib.dump(fitted_pipe, dest)

# Report where each model was written
print("Modelos guardados:")
print("  RL ->", path_rl)
print("  DT ->", path_dt)
print("  RF ->", path_rf)

# Transform X with the fitted preprocessor and collect the output column names for SHAP
prepro_fitted = pipe_rf["prepro"]
X_prepared = prepro_fitted.transform(X)

# Column names after imputation + one-hot encoding + scaling
try:
    feat_names = prepro_fitted.get_feature_names_out()
except AttributeError:
    # Compatibility fallback: positional names f0, f1, ...
    n_out = X_prepared.shape[1]
    feat_names = np.array([f"f{i}" for i in range(n_out)])

# Save the matrix and the names so later cells can reload them from disk
x_prepared_path = MODELOS / "X_prepared.npy"
feat_names_path = MODELOS / "feature_names_after_prepro.csv"
np.save(x_prepared_path, X_prepared)
pd.Series(feat_names).to_csv(feat_names_path, index=False, header=False)

print("X_prepared shape:", X_prepared.shape)
print("Ejemplo de variables codificadas:", feat_names[:10])
print("Archivos guardados para SHAP:",
      x_prepared_path, "y", feat_names_path)

Modelos guardados:
  RL -> ..\outputs\modelos\pipe_RL_classweight.joblib
  DT -> ..\outputs\modelos\pipe_DT_classweight.joblib
  RF -> ..\outputs\modelos\pipe_RF_classweight.joblib
X_prepared shape: (5110, 21)
Ejemplo de variables codificadas: ['age' 'avg_glucose_level' 'bmi' 'hypertension' 'heart_disease'
 'gender_Female' 'gender_Male' 'gender_Other' 'ever_married_No'
 'ever_married_Yes']
Archivos guardados para SHAP: ..\outputs\modelos\X_prepared.npy y ..\outputs\modelos\feature_names_after_prepro.csv


In [4]:
# SHAP para Random Forest

import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib

# Reload the fitted Random Forest pipeline and take its classifier step
pipe_rf = joblib.load(MODELOS / "pipe_RF_classweight.joblib")
rf = pipe_rf["clf"]

# Reload the preprocessed matrix and the matching column names (one name per CSV row)
X_prepared = np.load(MODELOS / "X_prepared.npy")
feat_names = (
    pd.read_csv(MODELOS / "feature_names_after_prepro.csv", header=None)
    .iloc[:, 0]
    .to_numpy()
)

print("RF cargado. X_prepared:", X_prepared.shape, "| n_features:", len(feat_names))


# Background sample passed to the explainer as masker: at most 1000 rows drawn
# without replacement. The notebook-wide RANDOM_STATE is used so the notebook
# has a single seed instead of a second hard-coded value.
rng = np.random.default_rng(RANDOM_STATE)
bg_size = min(1000, X_prepared.shape[0])
background = X_prepared[rng.choice(X_prepared.shape[0], size=bg_size, replace=False)]

# Tree explainer on the bare classifier, with output in probability space
explainer = shap.Explainer(
    rf,
    masker=background,
    feature_names=feat_names,
    algorithm="tree",
    model_output="probability"
)

# Compute SHAP values for every row (additivity check disabled)
shap_exp = explainer(X_prepared, check_additivity=False)

# Binary case: when values come back as (rows, features, 2), keep the last-axis
# slice at index 1 (the positive class); otherwise use the values as returned.
if shap_exp.values.ndim == 3 and shap_exp.values.shape[2] == 2:
    shap_pos = shap_exp.values[:, :, 1]
else:
    shap_pos = shap_exp.values

print("SHAP listo. Matriz:", np.shape(shap_pos))

# Global SHAP figures: beeswarm summary and mean-|SHAP| bar chart.
# Both follow the same recipe, so they are driven from one table of
# (extra summary_plot kwargs, title, output file).
beeswarm_png = FIGURAS / "SHAP_summary_beeswarm_RF.png"
bar_png = FIGURAS / "SHAP_importances_bar_RF.png"

for plot_kwargs, plot_title, png_path in (
    ({}, "SHAP Summary (beeswarm) - Random Forest (probability)", beeswarm_png),
    ({"plot_type": "bar"}, "SHAP Importancias (media |SHAP|) - Random Forest (probability)", bar_png),
):
    plt.figure()
    shap.summary_plot(shap_pos, X_prepared, feature_names=feat_names, show=False, **plot_kwargs)
    plt.title(plot_title)
    plt.tight_layout()
    plt.savefig(png_path, dpi=300)
    plt.close()

print("Figuras guardadas:",
      beeswarm_png, "|",
      bar_png)

# Global importance table: mean absolute SHAP value per feature, largest first
mean_abs = np.mean(np.abs(shap_pos), axis=0)
imp_df = (
    pd.DataFrame({"feature": feat_names, "mean_abs_shap": mean_abs})
    .sort_values("mean_abs_shap", ascending=False)
)

# Write the full table to outputs/tablas
imp_csv = TABLAS / "SHAP_importances_RF.csv"
imp_df.to_csv(imp_csv, index=False)
print("Tabla guardada:", imp_csv)

# Show the top 10
imp_df.head(10)


RF cargado. X_prepared: (5110, 21) | n_features: 21




SHAP listo. Matriz: (5110, 21)
Figuras guardadas: ..\outputs\figuras\SHAP_summary_beeswarm_RF.png | ..\outputs\figuras\SHAP_importances_bar_RF.png
Tabla guardada: ..\outputs\tablas\SHAP_importances_RF.csv


Unnamed: 0,feature,mean_abs_shap
0,age,0.040863
2,bmi,0.011969
1,avg_glucose_level,0.010972
3,hypertension,0.006428
19,smoking_status_never smoked,0.004362
18,smoking_status_formerly smoked,0.004067
9,ever_married_Yes,0.003846
8,ever_married_No,0.003678
5,gender_Female,0.003376
4,heart_disease,0.003249


In [5]:
#Interpretabilidad local con SHAP y 3 ejemplos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap, joblib

# Load the artifacts and compute the Random Forest probabilities
pipe_rf = joblib.load(MODELOS / "pipe_RF_classweight.joblib")
rf = pipe_rf.named_steps["clf"]

X_prepared = np.load(MODELOS / "X_prepared.npy")
feat_names = pd.read_csv(MODELOS / "feature_names_after_prepro.csv", header=None)[0].values

# Score the same preprocessed matrix that SHAP explains in this cell, so each
# probability shown next to an explanation belongs to exactly that row. This
# also avoids depending on the in-memory `X` from an earlier cell.
proba = rf.predict_proba(X_prepared)[:, 1]  # positive-class (stroke) probability

# Representative rows
idx_high = int(np.argmax(proba))                # highest predicted risk
idx_mid  = int(np.argmin(np.abs(proba - 0.5)))  # closest to 0.5 (decision zone)
idx_low  = int(np.argmin(proba))                # lowest predicted risk

casos = [("alto", idx_high), ("umbral", idx_mid), ("bajo", idx_low)]
# Convert to plain floats so the log does not show NumPy scalar reprs
print("Indices elegidos:", casos, " | probs:", [float(proba[i]) for _, i in casos])

# Explainer for the local plots. The background sample (at most 1000 rows,
# drawn without replacement) is seeded with the notebook-wide RANDOM_STATE so
# the notebook has a single seed instead of a second hard-coded value.
rng = np.random.default_rng(RANDOM_STATE)
bg_size = min(1000, X_prepared.shape[0])
background = X_prepared[rng.choice(X_prepared.shape[0], size=bg_size, replace=False)]

# Tree explainer on the bare classifier, with output in probability space
explainer = shap.Explainer(
    rf,
    masker=background,
    feature_names=feat_names,
    algorithm="tree",
    model_output="probability"
)

def explicar_y_guardar(idx: int, etiqueta: str) -> None:
    """Save a SHAP waterfall plot and a decision plot for one row of ``X_prepared``.

    Reads the notebook-level ``explainer``, ``X_prepared``, ``feat_names``,
    ``proba`` and ``FIGURAS``.

    Args:
        idx: Row position in ``X_prepared`` / ``proba``. Negative positions
            are accepted.
        etiqueta: Label used in the plot titles and in the output file names.

    Returns:
        None. Writes ``SHAP_waterfall_RF_<etiqueta>.png`` and, when the
        decision plot succeeds, ``SHAP_decision_RF_<etiqueta>.png`` into
        ``FIGURAS``, then prints the saved paths. A failing decision plot is
        reported in that message instead of raising.
    """
    fila = X_prepared[idx]
    # Index and reshape rather than slicing idx:idx+1, because that slice is
    # empty for idx == -1.
    exp = explainer(fila.reshape(1, -1), check_additivity=False)

    # Keep the positive class when values come back as (rows, features, 2)
    if exp.values.ndim == 3 and exp.values.shape[2] == 2:
        valores = exp.values[0, :, 1]
        base = exp.base_values[0, 1]
    else:
        valores = exp.values[0]
        base = np.atleast_1d(exp.base_values)[0]
    exp1d = shap.Explanation(
        values=valores,
        base_values=base,
        data=fila,
        feature_names=feat_names
    )

    p = proba[idx]

    # Waterfall plot; the figure is closed even if plotting or saving raises
    out_wf = FIGURAS / f"SHAP_waterfall_RF_{etiqueta}.png"
    plt.figure()
    try:
        shap.plots.waterfall(exp1d, show=False, max_display=15)
        plt.title(f"Waterfall SHAP RF – caso {etiqueta} (idx={idx}, p={p:.3f})")
        plt.tight_layout()
        plt.savefig(out_wf, dpi=300)
    finally:
        plt.close()

    # Decision plot (best effort): a failure is reported, not raised, and the
    # figure is closed either way so it does not leak.
    out_dec = FIGURAS / f"SHAP_decision_RF_{etiqueta}.png"
    try:
        plt.figure()
        try:
            shap.decision_plot(exp1d.base_values, exp1d.values, feature_names=feat_names, show=False)
            plt.title(f"Decision plot SHAP – caso {etiqueta} (idx={idx}, p={p:.3f})")
            plt.tight_layout()
            plt.savefig(out_dec, dpi=300)
        finally:
            plt.close()
        print(f"Guardado: {out_wf} | {out_dec}")
    except Exception as e:
        print(f"Guardado: {out_wf} | (decision plot no disponible: {e})")

# Generate the local explanations for the three selected cases
for etiqueta_caso, indice_caso in casos:
    explicar_y_guardar(idx=indice_caso, etiqueta=etiqueta_caso)

print("Listo. Revisa outputs/figuras/ para los PNG locales.")

Indices elegidos: [('alto', 50), ('umbral', 88), ('bajo', 249)]  | probs: [np.float64(0.852), np.float64(0.57), np.float64(0.0)]
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_alto.png | ..\outputs\figuras\SHAP_decision_RF_alto.png
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_umbral.png | ..\outputs\figuras\SHAP_decision_RF_umbral.png
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_bajo.png | ..\outputs\figuras\SHAP_decision_RF_bajo.png
Listo. Revisa outputs/figuras/ para los PNG locales.
