In [1]:
import os
import json
import gc
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Global configuration
# Floating-point dtype that clean_nan() casts the cleaned frame to.
DTYPE = np.float32
# Fraction of rows sampled from each dataset before scoring features.
SAMPLE_FRACTION = 0.2
# Upper bound on the number of rows used to fit the RandomForest selector.
SAMPLE_SIZE_MODEL = 10000
# Cumulative-importance level used for the cutoff and drawn on the plots.
CUMULATIVE_THRESHOLD = 0.95

def get_memory_usage():
    """Return the resident set size of the current process, in GiB."""
    import psutil

    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    # 1024 ** 3 bytes per GiB.
    return rss_bytes / 1024 ** 3

def clean_nan(df, dtype=None):
    """Drop all-NaN columns, impute the remaining NaNs and cast the result.

    The input frame is never modified: every step returns a new object, which
    also avoids the chained-assignment warning raised when writing into a
    column-sliced frame.

    Args:
        df: pandas DataFrame to clean.
        dtype: dtype the cleaned frame is cast to. Defaults to the
            module-level ``DTYPE`` when omitted.

    Returns:
        A new DataFrame without the columns that were entirely NaN, in which
        numeric columns have their NaNs replaced by the column median, any
        other remaining NaN is replaced by ``0.0``, and every column is cast
        to ``dtype``.
    """
    target_dtype = DTYPE if dtype is None else dtype

    # Keep only the columns that hold at least one non-null value. A frame
    # with zero rows therefore loses all of its columns.
    has_values = df.notna().any(axis=0).to_numpy()
    cleaned = df.loc[:, has_values]

    # Filling with a Series maps each median onto the column of the same
    # name, so non-numeric columns are left for the 0.0 fallback below.
    medians = cleaned.select_dtypes(include=np.number).median()
    cleaned = cleaned.fillna(medians).fillna(0.0)

    return cleaned.astype(target_dtype)

def find_optimal_cutoff(cumulative_importance, min_features=10, threshold=None):
    """Return how many top-ranked features to keep.

    Two candidate counts are computed and the smaller one is used:

    * the number of features needed for the cumulative importance to reach
      ``threshold``;
    * the position of the largest absolute second difference of the curve
      (its "elbow"), which needs at least three points to exist.

    The result is raised to ``min_features`` and finally capped at the number
    of available features, so it is always a valid slice length.

    Args:
        cumulative_importance: 1-D array-like of cumulative importance values
            ordered from the most to the least important feature.
        min_features: Lower bound for the returned count (still capped at the
            number of available features).
        threshold: Cumulative importance level to reach. Defaults to the
            module-level ``CUMULATIVE_THRESHOLD`` when omitted.

    Returns:
        int: Number of leading features to keep; ``0`` for empty input.
    """
    if threshold is None:
        threshold = CUMULATIVE_THRESHOLD

    cumulative = np.asarray(cumulative_importance, dtype=float)
    n_features = cumulative.size
    if n_features == 0:
        return 0

    # Position i holds the importance of the first i + 1 features, so the
    # count that reaches the threshold is the first matching index plus one.
    reached = np.flatnonzero(cumulative >= threshold)
    threshold_count = int(reached[0]) + 1 if reached.size else n_features

    if n_features >= 3:
        # Points are evenly spaced (step 1), so the change in slope is just
        # the second difference of the curve.
        curvature = np.abs(np.diff(cumulative, n=2))
        elbow_count = int(np.argmax(curvature)) + 1
        candidate = min(threshold_count, elbow_count)
    else:
        # Fewer than three points: no second difference, threshold only.
        candidate = threshold_count

    return int(min(n_features, max(min_features, candidate)))

def analyze_feature_importance_enhanced(scores, feature_names=None, min_features=10):
    """Rank features by score and choose how many of the top ones to keep.

    Args:
        scores: Sequence of importance scores, one per feature.
        feature_names: Labels matching ``scores``. When omitted, names of the
            form ``feature_<i>`` are generated.
        min_features: Lower bound forwarded to ``find_optimal_cutoff``.

    Returns:
        tuple: ``(selected_features, analysis_df, cutoff_idx)`` where
        ``selected_features`` is the list of kept feature names,
        ``analysis_df`` has the columns ``feature``, ``importance_score``,
        ``cumulative_importance`` and ``rank`` (sorted by descending score),
        and ``cutoff_idx`` is the number of kept features. When there are no
        scores or they sum to zero, the result is an empty list, an empty
        frame with the same columns, and ``0``.
    """
    report_columns = ['feature', 'importance_score', 'cumulative_importance', 'rank']

    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(len(scores))]

    # Undefined scores are treated as "no importance"; otherwise a single NaN
    # would turn the tail of the cumulative curve into NaN.
    ranked = (
        pd.Series(scores, index=feature_names)
        .fillna(0.0)
        .sort_values(ascending=False)
    )

    total = ranked.sum()
    if ranked.empty or total == 0:
        # Keep the report schema so callers can still write or plot it.
        return [], pd.DataFrame(columns=report_columns), 0

    cumulative = np.cumsum(ranked.values) / total
    # Never report more kept features than actually exist.
    cutoff_idx = min(int(find_optimal_cutoff(cumulative, min_features)), len(ranked))

    analysis_df = pd.DataFrame({
        'feature': ranked.index,
        'importance_score': ranked.values,
        'cumulative_importance': cumulative,
        'rank': np.arange(1, len(ranked) + 1),
    })

    selected_features = analysis_df['feature'].iloc[:cutoff_idx].tolist()
    return selected_features, analysis_df, cutoff_idx

def plot_importance_analysis(analysis_df, cutoff_idx, plot_path):
    """Save a two-panel importance figure to ``plot_path``.

    The top panel is a bar chart of the highest-ranked features (about 25%
    more than ``cutoff_idx``, at least 10, at most every row). The bottom
    panel shows the cumulative importance curve with the cutoff position and
    the ``CUMULATIVE_THRESHOLD`` level marked.

    Args:
        analysis_df: DataFrame with ``importance_score`` and
            ``cumulative_importance`` columns, ordered by descending score.
        cutoff_idx: Number of selected features; drawn as a vertical line.
        plot_path: Destination file for the figure.
    """
    dynamic_num = min(max(int(cutoff_idx * 1.25), 10), len(analysis_df))
    top_features = analysis_df.head(dynamic_num)

    fig, (ax_bars, ax_cumulative) = plt.subplots(2, 1, figsize=(15, 10))
    try:
        ax_bars.bar(
            range(len(top_features)),
            top_features['importance_score'],
            color='darkblue',
        )
        if cutoff_idx < dynamic_num:
            ax_bars.axvline(x=cutoff_idx, color='r', linestyle='--', linewidth=2,
                            label=f'Punto de corte: {cutoff_idx}')
            # Only build a legend when there is a labelled artist to show.
            ax_bars.legend()
        ax_bars.set_title(f'Top {dynamic_num} Características por Importancia')
        ax_bars.set_xlabel('Ranking')
        ax_bars.set_ylabel('Score de Importancia')
        ax_bars.tick_params(axis='x', labelrotation=45)

        ax_cumulative.plot(analysis_df['cumulative_importance'], 'g-', linewidth=2)
        ax_cumulative.axvline(cutoff_idx, color='r', linestyle='--', linewidth=2,
                              label=f'Punto de corte: {cutoff_idx}')
        ax_cumulative.axhline(CUMULATIVE_THRESHOLD, color='orange', linestyle=':',
                              linewidth=2, label=f'Umbral {CUMULATIVE_THRESHOLD}')
        ax_cumulative.set_title('Importancia Acumulada de Características')
        ax_cumulative.set_xlabel('Número de Características')
        ax_cumulative.set_ylabel('Importancia Acumulada')
        ax_cumulative.legend()

        fig.tight_layout()
        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

def save_single_parquet(ddf, path):
    """Write ``ddf`` to ``path`` as one Parquet file, without the index.

    A Dask DataFrame is computed first, which loads it fully into memory;
    any other object is written as-is through its ``to_parquet`` method.
    """
    frame = ddf.compute() if isinstance(ddf, dd.DataFrame) else ddf
    frame.to_parquet(path, index=False)

def _compute_selector_scores(sel_name, X, y):
    """Return one importance score per column of ``X`` for the given selector."""
    if sel_name == "Model-Based":
        # train_test_split rejects a train_size equal to the number of rows,
        # so only subsample when there are more rows than the cap.
        if len(X) > SAMPLE_SIZE_MODEL:
            X_fit, _, y_fit, _ = train_test_split(
                X, y,
                train_size=SAMPLE_SIZE_MODEL,
                stratify=y,
                random_state=42
            )
        else:
            X_fit, y_fit = X, y

        model = RandomForestClassifier(
            n_estimators=50,
            n_jobs=-1,
            random_state=42,
            max_depth=10
        )
        model.fit(X_fit, y_fit)
        return model.feature_importances_

    # Silence floating-point warnings raised while scoring.
    with np.errstate(all='ignore'):
        if sel_name == "Linear":
            scores, _ = f_classif(X, y)
            return scores
        # Any other name is handled as the "Nonlinear" selector; the seed
        # keeps the estimate reproducible between runs.
        return mutual_info_classif(X, y, random_state=42)


def process_dataset(dataset_path, norm_name, experiment_folder):
    """Run the three feature selectors on one dataset and save the results.

    A seeded ``SAMPLE_FRACTION`` sample of the dataset is cleaned with
    ``clean_nan`` and scored by the "Linear" (``f_classif``), "Nonlinear"
    (``mutual_info_classif``) and "Model-Based" (RandomForest importances)
    selectors. Under ``experiment_folder/norm_name`` this writes:

    * ``<norm_name>_all_features.parquet`` with the full dataset;
    * per selector, a CSV report, a PNG plot and a Parquet file holding the
      selected columns plus the target column.

    Errors are printed instead of raised: a failure in one selector does not
    stop the remaining ones, and a failure before the selector loop aborts
    only this dataset.

    Args:
        dataset_path: Path of the Parquet dataset to read with Dask.
        norm_name: Label used for the output sub-folder and file names.
        experiment_folder: Root folder of the current experiment.
    """
    print(f"\nProcesando {norm_name}")

    output_folder = os.path.join(experiment_folder, norm_name)
    os.makedirs(output_folder, exist_ok=True)

    try:
        ddf = dd.read_parquet(dataset_path)
        target_col = 'nivel_triage'

        # Seeded so that repeated runs score the same rows.
        sample = ddf.sample(frac=SAMPLE_FRACTION, random_state=42).compute()
        X = clean_nan(sample.drop(columns=[target_col]))
        y = sample[target_col].astype(int)

        if X.empty:
            raise ValueError("No hay características válidas después de la limpieza")

        feature_names = X.columns.tolist()

        # Save every column before any feature selection is applied.
        save_single_parquet(
            ddf,
            os.path.join(output_folder, f"{norm_name}_all_features.parquet")
        )

        for sel_name in ["Linear", "Nonlinear", "Model-Based"]:
            try:
                scores = _compute_selector_scores(sel_name, X, y)

                selected_features, analysis_df, cutoff_idx = analyze_feature_importance_enhanced(
                    scores,
                    feature_names=feature_names
                )

                analysis_df.to_csv(
                    os.path.join(output_folder, f"{norm_name}_{sel_name}_report.csv"),
                    index=False
                )

                plot_importance_analysis(
                    analysis_df,
                    cutoff_idx,
                    os.path.join(output_folder, f"{norm_name}_{sel_name}_plot.png")
                )

                selected_cols = selected_features + [target_col]
                save_single_parquet(
                    ddf[selected_cols],
                    os.path.join(output_folder, f"{norm_name}_{sel_name}_selected.parquet")
                )

            except Exception as e:
                # Best effort: report the failure and continue with the next selector.
                print(f"❌ Error en {sel_name}: {str(e)}")
            finally:
                gc.collect()

    except Exception as e:
        print(f"❌ Error crítico: {str(e)}")

def main():
    """Process every normalized dataset listed below into a new experiment folder.

    ``config.json`` is read from the parent of the current working directory
    and provides the normalized-data and output locations. Results go to a
    timestamped folder under ``<outputs>/experiments``. Datasets whose file
    does not exist are reported and skipped.
    """
    experiment_name = "experimento_final"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    base_path = os.path.dirname(os.getcwd())
    config_file_path = os.path.join(base_path, "config.json")
    with open(config_file_path) as config_file:
        config = json.load(config_file)

    normalized_path = os.path.join(base_path, config["paths"]["intermediate"]["normalized"])
    run_folder_name = f"{experiment_name}_{timestamp}"
    experiment_folder = os.path.join(
        base_path, config["paths"]["outputs"], "experiments", run_folder_name
    )
    os.makedirs(experiment_folder, exist_ok=True)

    # (normalization label, file name) pairs, processed in this order.
    datasets = (
        ("MaxAbs", "df_feateng_MaxAbs.parquet"),
        ("MinMax", "df_feateng_MinMax.parquet"),
        ("NoNorm", "df_feateng_None.parquet"),
        ("Robust", "df_feateng_Robust.parquet"),
        ("Standard", "df_feateng_Standard.parquet"),
    )

    for norm_name, file_name in datasets:
        dataset_path = os.path.join(normalized_path, file_name)
        if not os.path.exists(dataset_path):
            print(f"❌ Dataset no encontrado: {dataset_path}")
            continue
        process_dataset(dataset_path, norm_name, experiment_folder)

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


Procesando MaxAbs

Procesando MinMax

Procesando NoNorm

Procesando Robust

Procesando Standard
