In [1]:
import os
import json
import gc
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Configuración global
DTYPE = np.float32
COLS_PER_GROUP = 50
SAMPLE_FRACTION = 0.2
SAMPLE_SIZE_MODEL = 10000
CUMULATIVE_THRESHOLD = 0.95

def get_memory_usage():
    """Obtiene el uso de memoria actual en GB."""
    import psutil
    return psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3

def is_boolean_column(series):
    """Verifica si una columna es booleana (0/1)."""
    unique_values = series.dropna().unique()
    return set(unique_values).issubset({0, 1})

def safe_transform(data, transform_fn):
    """Aplica transformaciones numéricas seguras."""
    with np.errstate(all='ignore'):
        result = transform_fn(np.abs(data))
    return np.nan_to_num(result, nan=0.0, posinf=0.0, neginf=0.0).astype(DTYPE)

def process_feature_group(df, cols_group, target_col):
    """Procesa características incluyendo transformaciones."""
    features = []
    feature_names = []
    
    for col in cols_group:
        if col == target_col:
            continue
            
        col_data = df[col].astype(DTYPE).values
        
        if not is_boolean_column(pd.Series(col_data)):
            target_data = df[target_col].astype(DTYPE).values
            
            transformations = [
                (col_data, f"{col}_orig"),
                (safe_transform(col_data, np.log1p), f"{col}_log"),
                (safe_transform(col_data, np.sqrt), f"{col}_sqrt"),
                (col_data ** 2, f"{col}_squared"),
                (col_data ** 3, f"{col}_cubic"),
                (col_data * target_data, f"{col}_interact")
            ]
            
            for data, name in transformations:
                features.append(data)
                feature_names.append(name)
        else:
            features.append(col_data)
            feature_names.append(f"{col}_orig")
    
    return np.column_stack(features), feature_names

def clean_nan(df):
    """Limpieza agresiva de NaN."""
    df = df.dropna(axis=1, how='all')
    for col in df.columns:
        if df[col].isna().any():
            median_val = df[col].median(skipna=True)
            df[col] = df[col].fillna(median_val if not np.isnan(median_val) else 0.0)
    return df.astype(DTYPE)

def find_optimal_cutoff(cumulative_importance, min_features=10):
    """Encuentra el punto de corte óptimo combinando métodos."""
    # Método 1: Umbral acumulado
    threshold_idx = np.where(cumulative_importance >= CUMULATIVE_THRESHOLD)[0]
    threshold_idx = threshold_idx[0] if len(threshold_idx) > 0 else len(cumulative_importance)
    
    # Método 2: Punto de inflexión
    npoints = len(cumulative_importance)
    if npoints < 2:
        return min_features
    
    all_coords = np.vstack((range(npoints), cumulative_importance)).T
    vec_line = all_coords[-1] - all_coords[0]
    vec_norm = np.array([-vec_line[1], vec_line[0]])
    vec_norm = vec_norm / np.sqrt(np.sum(vec_norm**2))
    vec_from_first = all_coords - all_coords[0]
    distances = np.abs(np.dot(vec_from_first, vec_norm))
    
    try:
        elbow_idx = np.nanargmax(distances)
    except ValueError:
        elbow_idx = min_features
    
    return max(min_features, min(threshold_idx, elbow_idx))

def analyze_feature_importance_enhanced(scores, min_features=10):
    """Analiza la importancia de características con método combinado."""
    sorted_scores = scores.sort_values(ascending=False)
    
    if sorted_scores.empty or sorted_scores.sum() == 0:
        return [], pd.DataFrame(), 0
    
    cumulative = sorted_scores.cumsum() / sorted_scores.sum()
    cutoff_idx = find_optimal_cutoff(cumulative.values, min_features)
    
    analysis_df = pd.DataFrame({
        'feature': sorted_scores.index,
        'importance_score': sorted_scores.values,
        'cumulative_importance': cumulative.values,
        'rank': range(1, len(sorted_scores)+1)
    })
    
    selected_features = analysis_df.iloc[:cutoff_idx]['feature'].tolist()
    return selected_features, analysis_df, cutoff_idx

def plot_importance_analysis(analysis_df, cutoff_idx, plot_path):
    """Genera gráficos de importancia mejorados."""
    plt.figure(figsize=(15, 8))
    
    # Gráfico de barras para top 50
    plt.subplot(2, 1, 1)
    top_features = analysis_df.head(50)
    plt.bar(range(len(top_features)), top_features['importance_score'], color='darkblue')
    if cutoff_idx < 50:
        plt.axvline(x=cutoff_idx, color='r', linestyle='--', label=f'Punto de corte: {cutoff_idx}')
    plt.title('Top 50 Características por Importancia')
    plt.xlabel('Ranking')
    plt.ylabel('Score de Importancia')
    plt.legend()
    
    # Gráfico de importancia acumulada
    plt.subplot(2, 1, 2)
    plt.plot(analysis_df['cumulative_importance'], 'g-')
    plt.axvline(cutoff_idx, color='r', linestyle='--', label=f'Punto de corte: {cutoff_idx}')
    plt.axhline(CUMULATIVE_THRESHOLD, color='orange', linestyle=':', label=f'Umbral {CUMULATIVE_THRESHOLD}')
    plt.title('Importancia Acumulada')
    plt.xlabel('Número de Características')
    plt.ylabel('Importancia Acumulada')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.close()

def process_dataset(dataset_path, norm_name, experiment_folder):
    print(f"\nProcesando {norm_name}")
    print(f"Memoria inicial: {get_memory_usage():.2f} GB")

    output_folder = os.path.join(experiment_folder, norm_name)
    os.makedirs(output_folder, exist_ok=True)

    try:
        ddf = dd.read_parquet(dataset_path).reset_index(drop=True)
        target_col = 'nivel_triage'

        numeric_cols = [col for col in ddf.columns
                        if (col != target_col) and (np.issubdtype(ddf[col].dtype, np.number))]

        non_boolean = []
        for col in numeric_cols:
            sample = ddf[col].head(500)
            if not is_boolean_column(sample):
                non_boolean.append(col)

        print(f"Columnas válidas: {len(non_boolean)}/{len(numeric_cols)}")

        col_groups = [non_boolean[i:i+COLS_PER_GROUP]
                      for i in range(0, len(non_boolean), COLS_PER_GROUP)]

        full_partition_paths = []  # List to store paths of partition files

        for partition_idx in range(ddf.npartitions):
            print(f"\nPartición {partition_idx+1}/{ddf.npartitions}")
            partition_df = ddf.get_partition(partition_idx).compute().reset_index(drop=True)

            all_features = []
            all_names = []

            for cols in col_groups:
                features, names = process_feature_group(partition_df, cols, target_col)
                all_features.append(features)
                all_names.extend(names)

            full_df = pd.DataFrame(
                np.hstack(all_features),
                columns=all_names,
                dtype=DTYPE
            )
            full_df[target_col] = partition_df[target_col].astype(DTYPE).values

            full_path = os.path.join(output_folder, f"{norm_name}_full_part_{partition_idx}.parquet")
            full_df.to_parquet(full_path, index=False)
            full_partition_paths.append(full_path)  # Add path to list

            del full_df, partition_df
            gc.collect()

        # ***NEW CODE TO CONSOLIDATE FULL DATASETS***
        print("\nConsolidating full datasets...")
        full_ddf = dd.read_parquet(full_partition_paths)
        final_full_path = os.path.join(output_folder, f"{norm_name}_full.parquet")
        full_ddf.to_parquet(final_full_path, write_index=False) # Save the combined full dataset
        print(f"✅ Consolidated full dataset saved to: {final_full_path}")


        selectors = {
            "Linear": f_classif,
            "Nonlinear": mutual_info_classif,
            "Model-Based": RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
        }

        for sel_name, selector in selectors.items():
            print(f"\n[SELECTOR] {sel_name}")

            try:
                # Load the *consolidated* full dataset
                full_ddf = dd.read_parquet(final_full_path)  # Changed this line

                sample = full_ddf.sample(frac=SAMPLE_FRACTION).compute()
                X = clean_nan(sample.drop(target_col, axis=1))
                y = sample[target_col].astype(int)

                if X.empty:
                    raise ValueError("No hay características válidas después de la limpieza")

                if sel_name == "Model-Based":
                    X_sample, _, y_sample, _ = train_test_split(
                        X, y,
                        stratify=y,
                        train_size=SAMPLE_SIZE_MODEL,
                        random_state=42
                    )
                    model = selector.fit(X_sample, y_sample)
                    scores = pd.Series(model.feature_importances_, index=X.columns)
                else:
                    with np.errstate(all='ignore'):
                        scores_values = selector(X, y)
                        scores = pd.Series(scores_values[0] if isinstance(scores_values, tuple) else scores_values,
                                          index=X.columns)

                scores = scores.replace([np.inf, -np.inf], np.nan)
                scores = scores.fillna(scores.mean() if not scores.isna().all() else 0)

                selected_features, analysis_df, cutoff_idx = analyze_feature_importance_enhanced(scores)

                analysis_df.to_csv(os.path.join(output_folder, f"{norm_name}_{sel_name}_report.csv"), index=False)
                plot_importance_analysis(analysis_df, cutoff_idx, os.path.join(output_folder, f"{norm_name}_{sel_name}_plot.png"))

                selected_df = full_ddf[selected_features + [target_col]].compute()
                selected_path = os.path.join(output_folder, f"{norm_name}_{sel_name}_selected.parquet")
                selected_df.to_parquet(selected_path, index=False)
                print(f"✅ Dataset seleccionado guardado: {selected_path}")

            except Exception as e:
                print(f"❌ Error en {sel_name}: {str(e)}")
            finally:
                gc.collect()

        # Limpiar archivos temporales *después* de consolidar
        for f in os.listdir(output_folder):
            if "_full_part_" in f:
                os.remove(os.path.join(output_folder, f))

    except Exception as e:
        print(f"❌ Error crítico: {str(e)}")
def main():
    experiment_name = "experimento_final"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Configuración de paths
    base_path = os.path.dirname(os.getcwd())
    config_path = os.path.join(base_path, "config.json")
    
    with open(config_path, "r") as f:
        config = json.load(f)
    
    normalized_path = os.path.join(base_path, config["paths"]["intermediate"]["normalized"])
    outputs_path = os.path.join(base_path, config["paths"]["outputs"])
    experiment_folder = os.path.join(outputs_path, "experiments", f"{experiment_name}_{timestamp}")
    os.makedirs(experiment_folder, exist_ok=True)
    
    datasets = {
        "MaxAbs": "02_df_Maxabs.parquet",
        "MinMax": "02_df_MinMax.parquet",
        "NoNorm": "02_df_None.parquet",
        "Robust": "02_df_Robust.parquet",
        "Standard": "02_df_Standard.parquet"
    }
    
    for norm_name, file in datasets.items():
        dataset_path = os.path.join(normalized_path, file)
        if os.path.exists(dataset_path):
            process_dataset(dataset_path, norm_name, experiment_folder)
        else:
            print(f"❌ Dataset no encontrado: {dataset_path}")

if __name__ == "__main__":
    main()


Procesando MaxAbs
Memoria inicial: 0.21 GB
Columnas válidas: 106/567

Partición 1/1

Consolidating full datasets...
✅ Consolidated full dataset saved to: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MaxAbs\MaxAbs_full.parquet

[SELECTOR] Linear




✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MaxAbs\MaxAbs_Linear_selected.parquet

[SELECTOR] Nonlinear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MaxAbs\MaxAbs_Nonlinear_selected.parquet

[SELECTOR] Model-Based
✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MaxAbs\MaxAbs_Model-Based_selected.parquet

Procesando MinMax
Memoria inicial: 4.32 GB
Columnas válidas: 106/567

Partición 1/1

Consolidating full datasets...
✅ Consolidated full dataset saved to: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MinMax\MinMax_full.parquet

[SELECTOR] Linear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MinMax\MinMax_Linear_selected.parquet

[SELECTOR] Nonlinear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MinMax\MinMax_Nonlinear_selected.parquet

[SELECTOR] Model-Based
✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\MinMax\MinMax_Model-Based_selected.parquet

Procesando NoNorm
Memoria inicial: 4.82 GB
Columnas válidas: 90/567

Partición 1/1

Consolidating full datasets...
✅ Consolidated full dataset saved to: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\NoNorm\NoNorm_full.parquet

[SELECTOR] Linear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\NoNorm\NoNorm_Linear_selected.parquet

[SELECTOR] Nonlinear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\NoNorm\NoNorm_Nonlinear_selected.parquet

[SELECTOR] Model-Based
✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\NoNorm\NoNorm_Model-Based_selected.parquet

Procesando Robust
Memoria inicial: 4.97 GB
Columnas válidas: 90/567

Partición 1/1

Consolidating full datasets...
✅ Consolidated full dataset saved to: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Robust\Robust_full.parquet

[SELECTOR] Linear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Robust\Robust_Linear_selected.parquet

[SELECTOR] Nonlinear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Robust\Robust_Nonlinear_selected.parquet

[SELECTOR] Model-Based
✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Robust\Robust_Model-Based_selected.parquet

Procesando Standard
Memoria inicial: 4.74 GB
Columnas válidas: 112/567

Partición 1/1

Consolidating full datasets...
✅ Consolidated full dataset saved to: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Standard\Standard_full.parquet

[SELECTOR] Linear




✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Standard\Standard_Linear_selected.parquet

[SELECTOR] Nonlinear


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Standard\Standard_Nonlinear_selected.parquet

[SELECTOR] Model-Based
✅ Dataset seleccionado guardado: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\experimento_final_20250201_151905\Standard\Standard_Model-Based_selected.parquet
