In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def process_feature_selection(feature_importance_file, selection_results_file, output_file) -> None:
    """Rank features by combining aggregated importance with model metrics.

    Reads the per-station feature importances and the feature-selection
    results, collapses the importances to one row per base feature, derives a
    ``Model Score`` from min-max-normalised metrics, inner-joins the two
    tables on the feature name, and writes the rows sorted by ``Final Score``
    (descending) to ``output_file``. Debug information and the first 20 ranked
    rows are printed.

    Args:
        feature_importance_file: CSV passed to ``pd.read_csv``; the code reads
            its ``Feature`` and ``Importance`` columns.
        selection_results_file: CSV passed to ``pd.read_csv``; the code reads
            its ``Feature Rimossa``, ``True Positive Rate``,
            ``False Discovery Rate``, ``FTA`` and ``avg_lead_time`` columns.
        output_file: Destination passed to ``DataFrame.to_csv`` (no index).
    """
    metric_columns = ['True Positive Rate', 'False Discovery Rate', 'FTA', 'avg_lead_time']
    normalized_columns = ['TPR_norm', 'FDR_norm', 'FTA_norm', 'Lead_Time_norm']

    # Load both input tables.
    df_importances = pd.read_csv(feature_importance_file)
    df_selection = pd.read_csv(selection_results_file)

    # Reduce every feature name to its base form. The ".pkl" marker and the
    # surrounding whitespace are removed BEFORE the station suffix because the
    # suffix pattern is anchored at the end of the string: with the opposite
    # order a name such as "X_ECNE.pkl" or "X_ECNE " would keep its station
    # suffix and would neither aggregate nor merge correctly.
    base_names = df_importances['Feature'].str.replace('.pkl', '', regex=False).str.strip()
    base_names = base_names.str.replace(r'_(ECNE|ECPN|EPDN|ESLN|AIO|ECBD|EPD|EPLC)$', '', regex=True)
    df_importances['Base Feature'] = base_names.str.strip()

    # Sum the importance of each base feature over all stations.
    df_importances_aggregated = df_importances.groupby('Base Feature', as_index=False).agg({'Importance': 'sum'})

    # Normalise the feature names of the selection table the same way.
    df_selection['Feature Rimossa'] = df_selection['Feature Rimossa'].str.replace('_cleaned', '', regex=False).str.strip()

    # Print the unique names of both tables for debugging.
    print("Base Feature (df_importances_aggregated):", df_importances_aggregated['Base Feature'].unique())
    print("Feature Rimossa (df_selection):", df_selection['Feature Rimossa'].unique())

    # Scale every metric column to [0, 1].
    scaler = MinMaxScaler()
    df_selection[normalized_columns] = scaler.fit_transform(df_selection[metric_columns])

    # Invert FDR so that a low false discovery rate yields a high value.
    df_selection['FDR_norm'] = 1 - df_selection['FDR_norm']

    # Keep the normalised values strictly positive.
    epsilon = 1e-6
    df_selection[normalized_columns] = df_selection[normalized_columns].clip(lower=epsilon)

    # Combined model score: plain mean of the four normalised metrics.
    df_selection['Model Score'] = df_selection[normalized_columns].mean(axis=1)

    # Join aggregated importance and model score on the feature name.
    df_comparison = df_importances_aggregated.merge(
        df_selection, left_on='Base Feature', right_on='Feature Rimossa', how='inner'
    )

    # Print the row count after the merge as a sanity check.
    print("Numero di righe dopo il merge:", len(df_comparison))

    # An inner join drops unmatched names silently; list them so that naming
    # mismatches between the two files are visible instead of just lowering
    # the row count.
    matched = set(df_comparison['Base Feature'])
    only_in_importances = sorted(set(df_importances_aggregated['Base Feature']) - matched)
    only_in_selection = sorted(set(df_selection['Feature Rimossa'].dropna()) - matched)
    if only_in_importances or only_in_selection:
        print("Feature escluse dal merge (solo importanze):", only_in_importances)
        print("Feature escluse dal merge (solo selezione):", only_in_selection)

    # Final score: mean of aggregated importance and model score.
    # NOTE(review): 'Importance' is not rescaled while 'Model Score' lies in
    # [epsilon, 1], so the ranking appears to be dominated by 'Model Score' --
    # confirm whether 'Importance' should be normalised before averaging.
    df_comparison['Final Score'] = (df_comparison['Importance'] + df_comparison['Model Score']) / 2

    # Order all features by final score, best first.
    df_all_features_sorted = df_comparison.sort_values(by='Final Score', ascending=False)

    # Persist the ranking.
    df_all_features_sorted.to_csv(output_file, index=False)

    # Show the top of the ranking together with the raw metrics.
    print(df_all_features_sorted[['Base Feature', 'Importance', 'Model Score', 'Final Score'] + metric_columns].head(20))

# Example usage: build the ranking for the INFRA files, then for the SEISMIC files.
process_feature_selection(
    feature_importance_file="data/Analysis/feature_importances_INFRA.csv",
    selection_results_file="data/Analysis/feature_selection_results_infra.csv",
    output_file="data/Analysis/feature_ranking_infra.csv",
)
# NOTE(review): the SEISMIC selection file is spelled "result" (singular) while
# the INFRA one is "results" -- confirm this matches the file on disk.
process_feature_selection(
    feature_importance_file="data/Analysis/feature_importances_SEISMIC.csv",
    selection_results_file="data/Analysis/feature_selection_result_seismic.csv",
    output_file="data/Analysis/feature_ranking_seismic.csv",
)



Base Feature (df_importances_aggregated): ['Crest Factor' 'Decay Time' 'Dominant Frequency' 'Kurtosis'
 'Mean Amplitude' 'Peak Frequency' 'Peak-to-Peak Amplitude'
 'RMS Amplitude' 'Rise Time' 'Skewness' 'Spectral Bandwidth'
 'Spectral Centroid' 'Spectral Entropy' 'Spectral Flatness' 'Variance'
 'Zero Crossing Rate']
Feature Rimossa (df_selection): ['Rise Time' 'Spectral Bandwidth' 'Spectral Entropy' 'Peak Frequency'
 'Spectral Centroid' 'Spectral Flatness' 'Skewness' 'Mean Amplitude'
 'Kurtosis' 'Zero Crossing Rate' 'Decay Time' 'Variance'
 'Dominant Frequency' 'Crest Factor' 'Peak-to-Peak Amplitude'
 'RMS Amplitude']
Numero di righe dopo il merge: 16
              Base Feature  Importance  Model Score  Final Score  \
0             Crest Factor    0.041808     0.863342     0.452575   
15      Zero Crossing Rate    0.022412     0.878198     0.450305   
6   Peak-to-Peak Amplitude    0.117269     0.775501     0.446385   
10      Spectral Bandwidth    0.058882     0.811210     0.435046   
