In [1]:
# Importación de librerías

import pandas as pd
import numpy as np
import joblib
from pathlib import Path

# Output folders, relative to the notebook's working directory.
OUTPUTS = Path("../outputs")
MODELOS = OUTPUTS.joinpath("modelos")

# Load the persisted Random Forest pipeline.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
pipe_rf = joblib.load(MODELOS / "pipe_RF_classweight.joblib")
print("Modelo Random Forest cargado desde:", MODELOS / "pipe_RF_classweight.joblib")

# Three hand-written example patients. The frame is declared column by column;
# position k in every list belongs to patient k+1.
pacientes_nuevos = pd.DataFrame({
    "gender": ["Male", "Female", "Male"],
    "age": [67, 45, 30],
    "hypertension": [1, 0, 0],
    "heart_disease": [1, 0, 0],
    "ever_married": ["Yes", "Yes", "No"],
    "work_type": ["Private", "Govt_job", "Self-employed"],
    "Residence_type": ["Urban", "Rural", "Urban"],
    "avg_glucose_level": [210.0, 85.0, 95.0],
    "bmi": [31.5, 22.0, 27.0],
    "smoking_status": ["formerly smoked", "never smoked", "smokes"],
})

print("Pacientes ejemplo:\n", pacientes_nuevos)

# Probability of the second class (column 1 of predict_proba) and the
# pipeline's own hard label for every example patient.
proba = pipe_rf.predict_proba(pacientes_nuevos)[:, 1]
pred = pipe_rf.predict(pacientes_nuevos)

# One summary line per patient (patients are numbered from 1).
for i, (p, pr) in enumerate(zip(pred, proba)):
    print(f"Paciente {i+1}: Probabilidad={pr:.3f} | Predicción={p}")

Modelo Random Forest cargado desde: ..\outputs\modelos\pipe_RF_classweight.joblib
Pacientes ejemplo:
    gender  age  hypertension  heart_disease ever_married      work_type  \
0    Male   67             1              1          Yes        Private   
1  Female   45             0              0          Yes       Govt_job   
2    Male   30             0              0           No  Self-employed   

  Residence_type  avg_glucose_level   bmi   smoking_status  
0          Urban              210.0  31.5  formerly smoked  
1          Rural               85.0  22.0     never smoked  
2          Urban               95.0  27.0           smokes  
Paciente 1: Probabilidad=0.382 | Predicción=0
Paciente 2: Probabilidad=0.000 | Predicción=0
Paciente 3: Probabilidad=0.010 | Predicción=0


In [2]:
# SHAP local para pacientes ejemplo
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib
from pathlib import Path

# Output folders under ../outputs; the figure and table folders are created
# if they do not exist yet.
OUTPUTS = Path("../outputs")
FIGURAS = OUTPUTS / "figuras"
TABLAS = OUTPUTS / "tablas"
MODELOS = OUTPUTS / "modelos"
FIGURAS.mkdir(parents=True, exist_ok=True)
TABLAS.mkdir(parents=True, exist_ok=True)

# Split the fitted pipeline into its preprocessing step and its classifier.
prepro_fitted = pipe_rf.named_steps["prepro"]
rf = pipe_rf.named_steps["clf"]

# Run the example patients through the fitted preprocessor only.
X_new = prepro_fitted.transform(pacientes_nuevos)

# Names of the preprocessed features (per the original note: after one-hot
# encoding + scaling). Fall back to f0, f1, ... when the preprocessor does not
# expose get_feature_names_out.
try:
    feat_names = prepro_fitted.get_feature_names_out()
except AttributeError:
    feat_names = np.array([f"f{i}" for i in range(X_new.shape[1])])

# Background sample for SHAP: the stored preprocessed matrix when available.
bg_path = MODELOS / "X_prepared.npy"
if bg_path.exists():
    background = np.load(bg_path)
    # A background built with a different preprocessing would give the tree
    # explainer the wrong columns; fail early with an explicit message instead.
    if background.ndim != 2 or background.shape[1] != X_new.shape[1]:
        raise ValueError(
            f"El background {background.shape} no coincide con las "
            f"{X_new.shape[1]} columnas de X_new; regenere {bg_path}"
        )
    # Subsample (fixed seed, without replacement) to keep the explainer fast.
    if background.shape[0] > 1000:
        rng = np.random.default_rng(42)
        idx = rng.choice(background.shape[0], size=1000, replace=False)
        background = background[idx]
else:
    # Fallback kept from the original, but no longer silent: explaining the
    # patients against themselves gives a baseline that is just their own mean
    # prediction, so the resulting SHAP values are hardly representative.
    print(
        f"AVISO: no se encontró {bg_path}; se usan los propios pacientes como "
        "background y los valores SHAP serán poco representativos."
    )
    background = X_new

print("Background para SHAP:", background.shape)

# Tree explainer on the classifier alone, using the background sample as
# masker and requesting explanations of the model's probability output.
explainer = shap.Explainer(
    rf,
    masker=background,
    feature_names=feat_names,
    algorithm="tree",
    model_output="probability"
)

# Explain the preprocessed example patients (additivity check disabled, as in
# the original call).
exp = explainer(X_new, check_additivity=False)


def to_pos_explanation(exp_row):
    """Reduce a two-output explanation row to its output at index 1.

    When ``exp_row.values`` is 2-D with exactly two columns, a new
    ``shap.Explanation`` is built from column 1 of the values and element 1 of
    the base values, keeping ``data`` and ``feature_names``. Index 1 is
    presumably the positive (stroke) class -- confirm against ``rf.classes_``.
    Any other row is returned unchanged.
    """
    vals = exp_row.values
    has_two_outputs = vals.ndim == 2 and vals.shape[1] == 2
    if not has_two_outputs:
        return exp_row

    return shap.Explanation(
        values=vals[:, 1],
        base_values=exp_row.base_values[1],
        data=exp_row.data,
        feature_names=exp_row.feature_names
    )

# Save the per-patient plots and collect the top contributions into one table.
rows = []
for i in range(len(pacientes_nuevos)):
    exp_i = to_pos_explanation(exp[i])

    # Waterfall plot for this patient.
    plt.figure()
    shap.plots.waterfall(exp_i, show=False, max_display=15)
    plt.title(f"Waterfall SHAP – Paciente {i+1}")
    plt.tight_layout()
    out_wf = FIGURAS / f"SHAP_waterfall_RF_paciente{i+1}.png"
    plt.savefig(out_wf, dpi=300)
    plt.close()

    # Decision plot is best effort: a failure is reported and the loop goes on.
    try:
        plt.figure()
        shap.decision_plot(exp_i.base_values, exp_i.values, feature_names=feat_names, show=False)
        plt.title(f"Decision plot SHAP – Paciente {i+1}")
        plt.tight_layout()
        out_dec = FIGURAS / f"SHAP_decision_RF_paciente{i+1}.png"
        plt.savefig(out_dec, dpi=300)
    except Exception as e:
        out_dec = None
        print(f"Decision plot no disponible para paciente {i+1}: {e}")
    finally:
        # Close the figure on the failure path too; previously a failed plot
        # left its figure open for every patient.
        plt.close()

    # Ten largest contributions by absolute SHAP value, largest first.
    vals = exp_i.values
    abs_idx = np.argsort(np.abs(vals))[::-1][:10]
    top_feats = [(feat_names[j], float(vals[j])) for j in abs_idx]

    for rank, (fname, s) in enumerate(top_feats, start=1):
        # A value of exactly 0 neither raises nor lowers the prediction, so it
        # gets its own label instead of being reported as a risk decrease.
        if s > 0:
            contribution = "↑ riesgo"
        elif s < 0:
            contribution = "↓ riesgo"
        else:
            contribution = "sin efecto"
        rows.append({
            "paciente": i+1,
            "rank": rank,
            "feature": fname,
            "shap_value": s,
            "contribution": contribution
        })

    print(f"Guardado: {out_wf}", ("| " + str(out_dec) if out_dec else ""))


# Write the collected contributions to CSV.
df_top = pd.DataFrame(rows)
csv_path = TABLAS / "SHAP_top10_pacientes_simulados_RF.csv"
df_top.to_csv(csv_path, index=False)
print("CSV de contribuciones guardado en:", csv_path)


# Preview of the first rows (last expression of the cell, rendered by Jupyter).
df_top.head(12)

  from .autonotebook import tqdm as notebook_tqdm


Background para SHAP: (1000, 21)
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_paciente1.png | ..\outputs\figuras\SHAP_decision_RF_paciente1.png
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_paciente2.png | ..\outputs\figuras\SHAP_decision_RF_paciente2.png
Guardado: ..\outputs\figuras\SHAP_waterfall_RF_paciente3.png | ..\outputs\figuras\SHAP_decision_RF_paciente3.png
CSV de contribuciones guardado en: ..\outputs\tablas\SHAP_top10_pacientes_simulados_RF.csv


Unnamed: 0,paciente,rank,feature,shap_value,contribution
0,1,1,age,0.089044,↑ riesgo
1,1,2,avg_glucose_level,0.084372,↑ riesgo
2,1,3,heart_disease,0.079773,↑ riesgo
3,1,4,bmi,0.024497,↑ riesgo
4,1,5,smoking_status_never smoked,0.014916,↑ riesgo
5,1,6,Residence_type_Rural,0.01087,↑ riesgo
6,1,7,Residence_type_Urban,0.010339,↑ riesgo
7,1,8,work_type_Private,0.009338,↑ riesgo
8,1,9,gender_Male,0.008463,↑ riesgo
9,1,10,work_type_Self-employed,0.006633,↑ riesgo


In [3]:
#Función de predicción 

import pandas as pd
import numpy as np
from pathlib import Path

# Folder for the exported tables; created (with parents) if it is missing.
OUTPUTS = Path("../outputs")
TABLAS = OUTPUTS.joinpath("tablas")
TABLAS.mkdir(parents=True, exist_ok=True)

# Youden threshold computed in Notebook 03
UMBRAL_YOUDEN = 0.4432

def predecir_pacientes(df_pacientes: pd.DataFrame, modelo_pipeline, umbral: float = UMBRAL_YOUDEN) -> pd.DataFrame:
    """
    Score patients with ``modelo_pipeline`` and label them with a threshold.

    Parameters
    ----------
    df_pacientes : pd.DataFrame
        Must contain the ten input columns listed in ``cols_esperadas``; any
        additional columns are carried through to the result but are NOT
        passed to the model.
    modelo_pipeline
        Object exposing ``predict_proba(X)`` whose column 1 is used as the
        probability of the positive class.
    umbral : float
        Decision threshold in [0, 1]; defaults to ``UMBRAL_YOUDEN``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_pacientes`` with three added columns:
        ``prob_stroke`` (column 1 of ``predict_proba``), ``pred_label``
        (1 if prob >= umbral, else 0) and ``umbral`` (the threshold used).

    Raises
    ------
    ValueError
        If an expected column is missing or ``umbral`` is outside [0, 1].
    """
    # Input schema the pipeline is fed with, in this exact order.
    cols_esperadas = [
        "gender","age","hypertension","heart_disease","ever_married",
        "work_type","Residence_type","avg_glucose_level","bmi","smoking_status"
    ]
    faltantes = [c for c in cols_esperadas if c not in df_pacientes.columns]
    if faltantes:
        raise ValueError(f"Faltan columnas en df_pacientes: {faltantes}")

    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= umbral <= 1.0:
        raise ValueError(f"El umbral debe estar en [0, 1]; se recibió: {umbral}")

    # Feed the model only the validated columns: extra columns (an id, or the
    # prob_stroke/pred_label/umbral columns of a previous call) must not reach
    # the pipeline, and the column order becomes independent of the caller.
    prob = modelo_pipeline.predict_proba(df_pacientes[cols_esperadas])[:, 1]
    pred = (prob >= umbral).astype(int)

    # Work on a copy so the caller's frame is left untouched.
    out = df_pacientes.copy()
    out["prob_stroke"] = prob
    out["pred_label"]  = pred
    out["umbral"]      = umbral
    return out

# Score the three simulated patients under both thresholds (Youden and 0.5).
res_youden = predecir_pacientes(pacientes_nuevos, pipe_rf, umbral=UMBRAL_YOUDEN)
res_05     = predecir_pacientes(pacientes_nuevos, pipe_rf, umbral=0.5)

# Write one CSV per threshold.
csv_youden = TABLAS.joinpath("predicciones_pacientes_simulados_umbral_youden.csv")
csv_05     = TABLAS.joinpath("predicciones_pacientes_simulados_umbral_0p5.csv")
for resultado, ruta in ((res_youden, csv_youden), (res_05, csv_05)):
    resultado.to_csv(ruta, index=False)

# Report where the files were written.
print("Guardado:")
for ruta in (csv_youden, csv_05):
    print("  -", ruta)

# Show both result tables, each under its own heading.
for titulo, resultado in (
    ("\nPredicciones con umbral Youden:", res_youden),
    ("\nPredicciones con umbral 0.5:", res_05),
):
    print(titulo)
    display(resultado)

Guardado:
  - ..\outputs\tablas\predicciones_pacientes_simulados_umbral_youden.csv
  - ..\outputs\tablas\predicciones_pacientes_simulados_umbral_0p5.csv

Predicciones con umbral Youden:


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,prob_stroke,pred_label,umbral
0,Male,67,1,1,Yes,Private,Urban,210.0,31.5,formerly smoked,0.382,0,0.4432
1,Female,45,0,0,Yes,Govt_job,Rural,85.0,22.0,never smoked,0.0,0,0.4432
2,Male,30,0,0,No,Self-employed,Urban,95.0,27.0,smokes,0.01,0,0.4432



Predicciones con umbral 0.5:


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,prob_stroke,pred_label,umbral
0,Male,67,1,1,Yes,Private,Urban,210.0,31.5,formerly smoked,0.382,0,0.5
1,Female,45,0,0,Yes,Govt_job,Rural,85.0,22.0,never smoked,0.0,0,0.5
2,Male,30,0,0,No,Self-employed,Urban,95.0,27.0,smokes,0.01,0,0.5
