# Chapter 3.3. - Feature Selection

In this JupyterNotebook, three feature selection technique are conjointly applied to all four training data subsets, in order to enable the comparison of models trained on all features and models trained on only selected features in chapter 4.1.

#### Feature Selection

In [26]:
# Import required Python modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from matplotlib import pyplot

"""
Estimate the most important features of a dataset based on chi-squared, ANOVA F-value and Random Forest feature selection
@param parameter_values: dataset containing numerical features and the corresponding categorical target in the last column
@param k: desired number of best features to be selected
"""
def most_important_features(parameter_values, k):
    
    # Store normalized features and target in X and y
    X = MinMaxScaler().fit_transform(parameter_values.iloc[:, :-1])
    y = parameter_values.iloc[:,-1]
    
    # Create a DataFrame for the result
    feature_selection_results = pd.DataFrame(index=parameter_values.iloc[:, :-1].columns)
    
    # Apply chi-squared feature selection
    feature_selection_results['Chi-Squared'] = SelectKBest(score_func=chi2, k=k).fit(X,y).get_support()
    
    # Apply ANOVA F-Value feature selection
    feature_selection_results['ANOVA F-Value'] = SelectKBest(score_func=f_classif, k=k).fit(X,y).get_support()
    
    # Apply Random Forest feature selection 
    # Random Forest Estimator is trained on 150 estimators and with a fixed random seed to ensure reproducability
    feature_selection_results['Random Forest'] = SelectFromModel(RandomForestClassifier(n_estimators=150, random_state=42)).fit(X,y).get_support()
    
    # Sum up the scores ("is among the k best"=1, "is not among the k best"=0) across all feature selection methods
    feature_selection_results['Total'] = feature_selection_results.sum(axis='columns')
    
    # Sort descending to identify the best features
    feature_selection_results.sort_values(by='Total', ascending=False, inplace=True)
    
    # Return the feature names of the k best features
    return feature_selection_results.iloc[:k,:].index.to_list()

# Import all four data subsets
pfas_profit = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')
pfas_gain = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')
afps_profit = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')
afps_gain = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

# For each data subset, compute the feature selection utuility function and store the results in a DataFrame 
result = pd.DataFrame(columns = range(1,21),
                      index = ['PFAS Profit', 'PFAS Gain', 'AFPS Profit', 'AFPS Gain'],
                      data = [most_important_features(pfas_profit, 20),
                              most_important_features(pfas_gain, 20),
                              most_important_features(afps_profit, 20),
                              most_important_features(afps_gain, 20)]).T

# Save the feature selection results
result.to_excel('Feature_Selection_Results.xlsx')

result

Unnamed: 0,PFAS Profit,PFAS Gain,AFPS Profit,AFPS Gain
1,median_daily_maximum_demand_winter_weekend,maximum_tou_summer_workday,median_daily_maximum_demand_spring/autumn_workday,variance_summer_weekend
2,night_impact_spring/autumn_weekend,end_of_work_impact_summer_weekend,daily_range_factor_winter_weekend,kurtosis_summer_weekend
3,fft_peak_summer_workday,lunch_impact_spring/autumn_weekend,daily_range_factor_winter_workday,median_daily_minimum_demand_summer_weekend
4,maximum_tou_winter_weekend,minimum_tou_spring/autumn_workday,daily_nonuniformity_coefficient_winter_workday,median_daily_minimum_demand_summer_workday
5,maximum_tou_winter_workday,kurtosis_summer_workday,daily_range_factor_spring/autumn_workday,end_of_work_impact_summer_workday
6,maximum_tou_spring/autumn_weekend,kurtosis_summer_weekend,daily_load_factor_spring/autumn_workday,daily_nonuniformity_coefficient_summer_weekend
7,variance_summer_workday,fft_peak_summer_weekend,variance_winter_workday,pv_correlation_spring/autumn_weekend
8,variance_summer_weekend,morning_slope_summer_workday,morning_slope_summer_weekend,skewness_summer_weekend
9,variance_winter_workday,end_of_work_impact_summer_workday,median_daily_maximum_demand_summer_workday,lunch_impact_spring/autumn_weekend
10,variance_winter_weekend,median_daily_maximum_demand_spring/autumn_workday,median_daily_maximum_demand_winter_workday,kurtosis_summer_workday


In [30]:
print('PFAS Profit: ', result['PFAS Profit'].to_list(), '\n\n')
print('PFAS Gain: ', result['PFAS Gain'].to_list(), '\n\n')
print('AFPS Profit: ', result['AFPS Profit'].to_list(), '\n\n')
print('AFPS Gain: ', result['AFPS Gain'].to_list(), '\n\n')

PFAS Profit:  ['median_daily_maximum_demand_winter_weekend', 'night_impact_spring/autumn_weekend', 'fft_peak_summer_workday', 'maximum_tou_winter_weekend', 'maximum_tou_winter_workday', 'maximum_tou_spring/autumn_weekend', 'variance_summer_workday', 'variance_summer_weekend', 'variance_winter_workday', 'variance_winter_weekend', 'end_of_work_impact_spring/autumn_weekend', 'minimum_tou_summer_weekend', 'morning_slope_winter_weekend', 'night_impact_winter_weekend', 'minimum_tou_winter_weekend', 'fft_peak_winter_workday', 'daily_load_factor_winter_weekend', 'fft_peak_winter_weekend', 'median_daily_maximum_demand_winter_workday', 'night_slope_spring/autumn_weekend'] 


PFAS Gain:  ['maximum_tou_summer_workday', 'end_of_work_impact_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'minimum_tou_spring/autumn_workday', 'kurtosis_summer_workday', 'kurtosis_summer_weekend', 'fft_peak_summer_weekend', 'morning_slope_summer_workday', 'end_of_work_impact_summer_workday', 'median_daily_maximum

#### Analysis of the Feature Selection Results

In [34]:
# Remove seasonal and weekly differentiation criteria from the parameter names in the feature selection results
result['PFAS Profit'] = result['PFAS Profit'].map(lambda str : '_'.join(str.split('_')[:-2]))
result['PFAS Gain'] = result['PFAS Gain'].map(lambda str : '_'.join(str.split('_')[:-2]))
result['AFPS Profit'] = result['AFPS Profit'].map(lambda str : '_'.join(str.split('_')[:-2]))
result['AFPS Gain'] = result['AFPS Gain'].map(lambda str : '_'.join(str.split('_')[:-2]))

In [44]:
# Count, without seasonal and weekly differentiation criteria, to inspect how often each parameter was selected across all four data subsets
pd.Series(np.append([], [result['PFAS Profit'].to_list(), 
                         result['PFAS Gain'].to_list(), 
                         result['AFPS Profit'].to_list(), 
                         result['AFPS Gain'].to_list()])).value_counts()

minimum_tou                        11
variance                           11
fft_peak                            6
maximum_tou                         6
median_daily_maximum_demand         6
morning_slope                       6
kurtosis                            5
end_of_work_impact                  4
skewness                            3
daily_nonuniformity_coefficient     3
median_daily_minimum_demand         3
daily_load_factor                   3
daily_range_factor                  3
night_impact                        3
lunch_impact                        2
night_slope                         2
pv_correlation                      2
summer_winter                       1
dtype: int64