In [10]:
# Librerias necesarias
import os, joblib, json
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer
from sklearn.cluster import KMeans 
import shap
from datetime import datetime
from PIL import Image
import pyfiglet
import tqdm 


In [11]:
# ====== CONFIG: locations of the data and the model ======
# Paths to the trained model and the input data
MODEL_PATH = r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\code_modelo\svm_rbf_pipeline.joblib" # SVM RBF model
# This is the model that gave the best result during training
SELECTED_COLS_PATH = r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\selected_columns.txt" # selected feature columns, one name per line
# Test CSV with the extracted features
TEST_CSV_PATH = r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\features_test_curado.csv" # test CSV with the extracted features
# Directories searched (in this order) for the image files
IMG_SEARCH_DIRS = [
    r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\train",
    r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\val",
    r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\test",
    r"C:\Users\GMADRO04\Documents\PALAS_EOLICAS_ML\processed_data\data_bin\no_etiquetadas"
]
# Output directory for the explanations
# It is created below if it does not exist
OUTPUT_DIR = r"./explicabilidad" # local directory, relative to the working directory
# NOTE(review): the input paths above are absolute and machine-specific;
# they must be edited before running this notebook on another machine.
# ===================================================
SAMPLE_IDX = 0  # row index (in the test CSV) of the sample to explain
# NOTE(review): N_BACKGROUND is not referenced anywhere else in the visible
# code; the SHAP background size is set separately where the KMeans centers
# are computed — confirm which value is intended.
N_BACKGROUND = 100 # number of background samples for SHAP
# Number of features to show in the LIME and SHAP outputs
N_FEATURES_SHOW = 12 
# =====================================

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1) Load the fitted pipeline and the list of selected feature columns
#    (one column name per line; blank lines are ignored).
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files that come from a trusted source.
pipe = joblib.load(MODEL_PATH)
with open(SELECTED_COLS_PATH, "r", encoding="utf-8") as f:
    stripped_lines = (raw_line.strip() for raw_line in f)
    selected_cols = [col_name for col_name in stripped_lines if col_name]

# 2) Load the test CSV and split it into features, optional labels and file names
df_test = pd.read_csv(TEST_CSV_PATH)
X_test = df_test.loc[:, selected_cols].copy()
if "label" in df_test.columns:
    y_test = df_test["label"]
else:
    y_test = None  # the CSV carries no ground truth
file_names = df_test["file"].tolist()

# Plain float32 array of the features: avoids warnings and lowers RAM usage
X_test_arr = X_test.to_numpy().astype(np.float32)

# 3) Resolve the full path of an image from its file name
def find_image_path(filename, search_dirs=None):
    """Return the first existing file named `filename` inside the search directories.

    Parameters
    ----------
    filename : str or os.PathLike
        File name (or relative path) to look for.
    search_dirs : iterable of str, optional
        Directories to search, in priority order. Defaults to the
        module-level ``IMG_SEARCH_DIRS`` when omitted.

    Returns
    -------
    str or None
        Joined path of the first match, or ``None`` when nothing matches or
        when `filename` is empty / not a path-like value (e.g. a NaN read
        from a CSV).
    """
    # A missing CSV value arrives as float NaN; os.path.join would raise on it.
    if not isinstance(filename, (str, os.PathLike)) or not str(filename).strip():
        return None

    directories = IMG_SEARCH_DIRS if search_dirs is None else search_dirs
    for base_dir in directories:
        candidate = os.path.join(base_dir, filename)
        # isfile (not exists): a directory with that name is not an image.
        if os.path.isfile(candidate):
            return candidate
    return None

# 4) Quick predictions over the whole test set (plain arrays go into the pipeline).
# Column 1 of predict_proba is taken as the positive-class probability;
# probas stays None when the pipeline exposes no predict_proba.
probas = pipe.predict_proba(X_test_arr)[:, 1] if hasattr(pipe, "predict_proba") else None
preds = pipe.predict(X_test_arr)

# 5) LIME tabular explainer built on the test features.
# random_state is fixed so the perturbation sampling — and therefore the
# explanation — is reproducible across runs; 42 matches the seed used for
# the KMeans background and the NumPy generator in this notebook.
# NOTE(review): class_names assumes the model orders its classes as
# [sana, defectuosa] — confirm against pipe.classes_.
explainer = LimeTabularExplainer(
    training_data=X_test_arr,
    feature_names=selected_cols,
    class_names=["sana", "defectuosa"],
    discretize_continuous=True,
    mode="classification",
    random_state=42,
)

# Prediction function handed to LIME
def _predict_fn_lime(x):
    """Return per-class probabilities from the global pipeline `pipe`.

    Parameters
    ----------
    x : array-like
        Feature rows, shape (n_samples, n_features). A single 1-D row is
        also accepted and treated as one sample.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, n_classes) whose rows are probabilities.
        Uses ``pipe.predict_proba`` when available. Otherwise the
        ``decision_function`` scores are squashed: a logistic sigmoid for
        1-D (binary) scores, a row-wise softmax for 2-D (multi-class)
        scores. These fallbacks are monotone pseudo-probabilities, not
        calibrated ones.
    """
    X_arr = np.asarray(x, dtype=np.float32)
    if X_arr.ndim == 1:
        # A single row: estimators expect a 2-D (n_samples, n_features) input.
        X_arr = X_arr.reshape(1, -1)

    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X_arr)

    # Local import: SciPy is only needed on this fallback path.
    from scipy.special import expit, softmax

    scores = np.asarray(pipe.decision_function(X_arr))
    if scores.ndim == 1:
        p_pos = expit(scores)
        return np.column_stack([1 - p_pos, p_pos])
    # Raw multi-class scores are unbounded and do not sum to 1; normalise
    # them so callers that expect probabilities receive valid rows.
    return softmax(scores, axis=1)

# Explain the selected test sample with LIME
exp = explainer.explain_instance(
    X_test_arr[SAMPLE_IDX],
    _predict_fn_lime,
    num_features=N_FEATURES_SHOW,
)

# Timestamp reused in the file name of every artifact written by this run
run_started = datetime.now()
ts = run_started.strftime("%Y%m%d_%H%M%S")

# Save the LIME explanation as an HTML file
lime_html_name = f"lime_idx{SAMPLE_IDX}_{ts}.html"
lime_html = os.path.join(OUTPUT_DIR, lime_html_name)
exp.save_to_file(lime_html)

# LIME bar chart: one horizontal bar per (feature description, weight) pair
weights = exp.as_list()
feat_names, feat_vals = zip(*weights)

bar_positions = range(len(feat_vals))
fig_lime, ax_lime = plt.subplots(figsize=(6, 4))
ax_lime.barh(bar_positions, feat_vals)
ax_lime.set_yticks(bar_positions)
ax_lime.set_yticklabels(feat_names)
ax_lime.set_title(f"LIME - idx {SAMPLE_IDX}")
fig_lime.tight_layout()

lime_bar_path = os.path.join(OUTPUT_DIR, f"lime_bar_idx{SAMPLE_IDX}_{ts}.png")
fig_lime.savefig(lime_bar_path, dpi=200)
plt.close(fig_lime)  # closed right after saving

# 6) Save the original image side by side with the LIME weights
img_path = find_image_path(file_names[SAMPLE_IDX])
# Defined up front so the name exists even when no image is found.
combined_path = None
if img_path:
    # Context manager: the underlying file handle is released as soon as the
    # figure has been written, instead of staying open for the kernel's lifetime.
    with Image.open(img_path) as img:
        plt.figure(figsize=(8, 4))

        # Left panel: the original image
        plt.subplot(1, 2, 1)
        plt.imshow(img)
        plt.axis("off")
        plt.title("Imagen original")

        # Right panel: the LIME weights
        plt.subplot(1, 2, 2)
        plt.barh(range(len(feat_vals)), feat_vals)
        plt.yticks(range(len(feat_vals)), feat_names)
        plt.title("LIME pesos")
        plt.tight_layout()

        combined_path = os.path.join(OUTPUT_DIR, f"lime_image_idx{SAMPLE_IDX}_{ts}.png")
        plt.savefig(combined_path, dpi=200)
        plt.close()

# 7) KernelSHAP background: summarise the test features with scikit-learn
#    KMeans and use the cluster centers as background rows.
rng = np.random.default_rng(42)

# At most 20 centers, capped by the number of available rows
# (the original author suggests lowering it to 10 if needed).
n_test_rows = len(X_test_arr)
bg_n = min(20, n_test_rows)

kmeans = KMeans(n_clusters=bg_n, n_init="auto", random_state=42)
kmeans.fit(X_test_arr)
centers = kmeans.cluster_centers_.astype(np.float32)

def predict_proba_local(X):
    """Return per-class probabilities for the rows in `X` (model function for SHAP).

    Delegates to ``_predict_fn_lime`` so LIME and SHAP query the pipeline
    through one implementation instead of two hand-copied ones.

    Parameters
    ----------
    X : array-like
        Feature rows, shape (n_samples, n_features).

    Returns
    -------
    numpy.ndarray
        Whatever ``_predict_fn_lime`` returns for `X`: class probabilities
        from ``predict_proba`` when the pipeline exposes it, otherwise
        values derived from ``decision_function``.
    """
    return _predict_fn_lime(X)

# KernelSHAP explainer over the probability function, using the KMeans
# centers as background data and the logit link.
kernel_explainer = shap.KernelExplainer(
    model=predict_proba_local,
    data=centers,
    link="logit",
)

# --- ADICIÓN --- helper para extraer la clase positiva de forma robusta
def _sv_pos(shap_values, class_index=1):
    """
    Devuelve un vector 1D (n_features,) con los SHAP de la clase positiva.
    Maneja:
        - lista de arrays por clase [ (n_samp,n_feat), (n_samp,n_feat) ]
        - array 3D (n_samp, n_feat, n_clases)
        - array 2D (n_samp, n_feat)
    """
    if isinstance(shap_values, list):
        return np.asarray(shap_values[class_index])[0]  # (n_feat,)
    arr = np.asarray(shap_values)
    if arr.ndim == 3:   # (n_samp, n_feat, n_classes)
        return arr[0, :, class_index]
    elif arr.ndim == 2: # (n_samp, n_feat)
        return arr[0, :]
    else:
        raise ValueError(f"Forma inesperada de shap_values: {arr.shape}")

# Local explanation (single sample) with a capped number of KernelSHAP evaluations
sample_row = X_test_arr[[SAMPLE_IDX]]  # list index keeps the 2-D (1, n_features) shape
shap_values_local = kernel_explainer.shap_values(sample_row, nsamples=400)
sv_local_pos = _sv_pos(shap_values_local, class_index=1)
# Per-feature contributions, largest absolute value first
contrib = (
    pd.Series(sv_local_pos, index=selected_cols)
    .sort_values(key=np.abs, ascending=False)
)

# Global SHAP summary: random subset of at most 200 rows, capped evaluations
n_available = len(X_test_arr)
subset_idx = rng.choice(n_available, size=min(200, n_available), replace=False)
X_subset = X_test_arr[subset_idx]
shap_values_subset = kernel_explainer.shap_values(X_subset, nsamples=300)

# --- ADICIÓN --- obtener matriz (n_samp, n_feat) para clase positiva
def _sv_matrix_pos(shap_values, class_index=1):
    if isinstance(shap_values, list):
        return np.asarray(shap_values[class_index])  # (n_samp, n_feat)
    arr = np.asarray(shap_values)
    if arr.ndim == 3:   # (n_samp, n_feat, n_classes)
        return arr[:, :, class_index]
    elif arr.ndim == 2: # (n_samp, n_feat)
        return arr
    else:
        raise ValueError(f"Forma inesperada de shap_values: {arr.shape}")

# SHAP matrix (n_samples, n_features) of class index 1 for the subset
sv_subset_pos = _sv_matrix_pos(shap_values_subset, class_index=1)


def _save_shap_summary(file_name, **summary_kwargs):
    """Draw one SHAP summary plot of the subset on a fresh figure, save it under OUTPUT_DIR and return its path."""
    plt.figure()
    shap.summary_plot(
        sv_subset_pos,
        X_subset,
        feature_names=selected_cols,
        show=False,
        max_display=N_FEATURES_SHOW,
        **summary_kwargs,
    )
    plt.tight_layout()
    out_path = os.path.join(OUTPUT_DIR, file_name)
    plt.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close()
    return out_path


# Global SHAP plots: bar chart first, then the default summary plot
shap_summary_bar_path = _save_shap_summary(f"shap_summary_bar_{ts}.png", plot_type="bar")
shap_summary_dot_path = _save_shap_summary(f"shap_summary_dot_{ts}.png")

# 8) Save the run metadata next to the generated artifacts
def _json_scalar(value, cast):
    """Convert `value` with `cast` for JSON output without ever raising.

    Returns None for missing values (None / NaN), ``cast(value)`` when the
    conversion works, and ``str(value)`` otherwise (e.g. a text label), so a
    non-numeric or missing label cannot abort the run before the metadata
    file is written.
    """
    if value is None:
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # pd.isna on a non-scalar has no single truth value; fall through.
        pass
    try:
        return cast(value)
    except (TypeError, ValueError):
        return str(value)


meta = {
    "model_path": MODEL_PATH,
    "sample_idx": int(SAMPLE_IDX),
    "pred_sample": _json_scalar(preds[SAMPLE_IDX], int),
    "prob_sample": float(probas[SAMPLE_IDX]) if probas is not None else None,
    "y_true_sample": _json_scalar(y_test.iloc[SAMPLE_IDX], int) if y_test is not None else None,
    "image_path": img_path,
    "lime_html": lime_html,
    "lime_bar_png": lime_bar_path,
    # combined_path is only assigned when the image was found
    "lime_combined_png": combined_path if img_path else None,
    "shap_summary_bar_png": shap_summary_bar_path,
    "shap_summary_dot_png": shap_summary_dot_path,
    # SHAP settings used in this run (kept in sync by hand with the calls above)
    "bg_method": f"sklearn.KMeans_{bg_n}",
    "shap_nsamples_local": 400,
    "shap_nsamples_global": 300,
}
meta_path = os.path.join(OUTPUT_DIR, f"explain_meta_{ts}.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

print(pyfiglet.figlet_format("Explicaciones Generadas", font="digital"))
print("Listo. Explicaciones en:", OUTPUT_DIR)

100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
100%|██████████| 94/94 [04:19<00:00,  2.76s/it]


+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+
|E|x|p|l|i|c|a|c|i|o|n|e|s| |G|e|n|e|r|a|d|a|s|
+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+

Listo. Explicaciones en: ./explicabilidad
