In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# Normalized datasets to process: normalization label -> parquet file path.
datasets = dict(
    MaxAbs="02_df_Maxabs.parquet",
    MinMax="02_df_MinMax.parquet",
    No_Norm="02_df_No_Norm.parquet",
    Robust="02_df_Robust.parquet",
    Standard="02_df_Std.parquet",
)

# Feature-selection methods: display label -> scoring function.
# The RandomForest entry maps to a plain string instead of a callable; the
# pipeline loop recognises that method by its label and ranks features with
# the model's importances rather than with a SelectKBest score function.
selectors = {
    "Linear (f_classif)": f_classif,
    "Nonlinear (mutual_info_classif)": mutual_info_classif,
    "Model-Based (RandomForest)": "RandomForest",
}

# Feature-engineering helper
def feature_engineering(X):
    """Expand every column of a 2-D array into six derived features.

    For each input column, in order, the output holds these columns side by
    side: the original values, log1p(|x|), sqrt(|x|), x**2, x**3 and
    1 / (x + 1e-6).

    Args:
        X: 2-D NumPy array of shape (n_samples, n_columns).

    Returns:
        NumPy array of shape (n_samples, 6 * n_columns) with the derived
        features of column 0 first, then those of column 1, and so on.
    """
    expanded = []
    for idx in range(X.shape[1]):
        column = X[:, idx]
        magnitude = np.abs(column)  # log and sqrt are applied to |x|
        expanded.extend((
            column,                # original
            np.log1p(magnitude),   # log
            np.sqrt(magnitude),    # square root
            column ** 2,           # squared
            column ** 3,           # cubic
            1 / (column + 1e-6),   # reciprocal; the offset avoids 1/0 at x == 0
        ))
    return np.column_stack(expanded)

# Automated pipeline: for every normalized dataset, expand its features, save
# the expanded table, then run each selection method and export what it kept.

# Upper bound on the number of features kept by each selection method.
MAX_SELECTED_FEATURES = 500

for norm_name, dataset_path in datasets.items():
    # Load the normalized dataset; every non-target column is used as input.
    df = pd.read_parquet(dataset_path)
    target_column = 'nivel_triage'
    numerical_cols = df.drop(columns=[target_column]).columns
    X = df[numerical_cols].values
    y = df[target_column].values

    # Generate the derived features.
    X_features = feature_engineering(X)

    # feature_engineering emits the six variants of each input column side by
    # side (original, log, sqrt, squared, cubic, reciprocal), so the labels
    # must be interleaved per column in that same order to match the data.
    feature_names = []
    for col in numerical_cols:
        feature_names.extend([
            f"original_{col}",
            f"{col}_log",
            f"{col}_sqrt",
            f"{col}_squared",
            f"{col}_cubic",
            f"{col}_reciprocal",
        ])

    # Save the full expanded table before any selection.
    all_features_df = pd.DataFrame(X_features, columns=feature_names)
    all_features_df.to_csv(f"{norm_name}_all_features.csv", index=False)
    all_features_df.to_parquet(f"{norm_name}_all_features.parquet", index=False)

    # Never ask for more features than exist: SelectKBest rejects (or, in
    # newer releases, warns about) a k larger than the feature count.
    n_keep = min(MAX_SELECTED_FEATURES, X_features.shape[1])

    for sel_name, score_func in selectors.items():
        if sel_name == "Model-Based (RandomForest)":
            # Model-based selection: rank features by Random Forest importance.
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_features, y)
            importance_scores = model.feature_importances_

            # Top n_keep features, most important first, so the exported data
            # columns follow the same order as the rows of the report.
            selected_indices = np.argsort(importance_scores)[-n_keep:][::-1]

            selected_features = pd.DataFrame({
                "Feature": all_features_df.columns[selected_indices],
                "Score": importance_scores[selected_indices]
            })
        else:
            # Univariate selection with SelectKBest.
            selector = SelectKBest(score_func=score_func, k=n_keep)
            selector.fit(X_features, y)
            selected_indices = selector.get_support(indices=True)

            selected_features = pd.DataFrame({
                "Feature": all_features_df.columns,
                "Score": selector.scores_,
                "Selected": selector.get_support()
            })
            selected_features = selected_features[selected_features["Selected"]]

        # Keep the feature names on the selected columns: the exported files
        # stay self-describing, and string labels avoid the non-string column
        # name error that some pandas versions raise in to_parquet.
        X_selected = all_features_df.iloc[:, selected_indices]

        # Export the selection report and the selected feature matrix.
        result_prefix = f"{norm_name}_{sel_name.replace(' ', '_')}"
        selected_features.to_csv(f"{result_prefix}_selected_features_report.csv", index=False)
        selected_features.to_parquet(f"{result_prefix}_selected_features_report.parquet", index=False)
        X_selected.to_csv(f"{result_prefix}_selected_features.csv", index=False)
        X_selected.to_parquet(f"{result_prefix}_selected_features.parquet", index=False)

        print(f"Processed: Normalization={norm_name}, Selection={sel_name}")