# PreProcessing: Feature Selection

Feature Selection is an important step in data pre-processing. It consists of selecting the subset of input variables that is most relevant to the target. Discarding irrelevant data is essential before applying a Machine Learning algorithm in order to:
* *Reduce Overfitting*: fewer opportunities to make decisions based on noise;
* *Improve Accuracy*: less misleading data means modelling accuracy improves, since predictions can be greatly distorted by redundant attributes;
* *Reduce Training Time*: with less data the algorithms train faster.


### Import Libraries

In [1]:
import scipy.stats as stats
import geopandas as gpd
import warnings
import numpy as np
from fs import methods as m
import ipywidgets as widgets
import pandas as pd
warnings.filterwarnings("ignore")
from IPython.core.display import display
from sklearn import preprocessing
import os
from sklearn.preprocessing import MinMaxScaler
FILTERED_ARPA = True

## Results Feature Selection
In this section the feature selection (fs) results are computed for each geopackage contained in the folder [grids/](https://github.com/opengeolab/D-DUST/tree/thesis_MB/notebooks/grids).<br />
The results are stored in a dictionary of dataframes (one for each dataset, keyed by geopackage file name) and are displayed in n bar plots. <br />
Each subplot refers to the method chosen with the dropdown widget, with the option to normalize the results or not. <br />
The methods used are:

* Pearson correlation;
* Spearman rank correlation;
* Kendall tau; 
* F-Test;
* Random Forest importance; 

<br />
In addition, an average score for these methods is added.



In [None]:
# Collect every file in grids/, skipping the macOS Finder metadata file.
# Filtering (instead of list.remove) avoids a ValueError when '.DS_Store'
# is not present, e.g. on Linux/Windows or on a fresh checkout.
geopackages = [name for name in os.listdir('grids') if name != '.DS_Store']
grid_data = []            # one (optionally filtered) GeoDataFrame per geopackage
dataframes_results = {}   # geopackage file name -> feature selection scores
for grid in geopackages:
    # read gpkg file
    data = gpd.read_file('grids/' + grid)

    if FILTERED_ARPA:
        # keep only the rows where the target 'pm25_st' is available
        data = data[~data['pm25_st'].isnull()]

    # presumably the names of the columns without null values
    # (m.check_NotNull is defined in fs.methods; verify there)
    labels = m.check_NotNull(data)
    grid_data.append(data)

    # Store dataset in X (predictors) and Y (target) variables
    X = pd.DataFrame(data=data, columns=labels)
    Y = X[['pm25_st']].values.ravel()

    # coordinates definition used for mgwr
    coords = list(zip(X['lat_cen'], X['lng_cen']))

    # Drop the target, the geometry, the cell bounds and the centroid
    # coordinates so that only candidate predictors remain.
    X = X.drop(columns=['pm25_st', 'geometry', 'bottom', 'top',
                        'left', 'right', 'lat_cen', 'lng_cen'])

    # Standardize every predictor; columns whose z-score contains NaN
    # (e.g. constant columns) are removed.
    X = X.apply(stats.zscore)
    X = X.dropna(axis=1)

    # Capture the labels only AFTER dropna so they match the columns that are
    # actually scored. `labels` is read later by fs_manager for plotting.
    # NOTE(review): only the labels of the last grid survive the loop; this
    # assumes every grid ends up with the same predictor columns.
    labels = X.columns.tolist()
    score_results = m.fs_results_computation(X, Y)

    # Average of the min-max scaled scores of every method (all columns
    # except 'Features').
    x = score_results.loc[:, score_results.columns != 'Features'].values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    # Re-use the index of score_results so the row-wise mean is assigned
    # positionally even if that index is not a default RangeIndex.
    temp = pd.DataFrame(x_scaled, index=score_results.index)
    mean_results = temp.mean(axis=1)
    score_results['Average Scores'] = mean_results
    dataframes_results[grid] = score_results

# Checkbox toggling whether the plotted scores are normalized
results_norm = widgets.Checkbox(
    description='Results normalized',
    value=True,
    indent=True,
    disabled=False,
)

# Radio buttons selecting a regular or a logarithmic axis scale
scale = widgets.RadioButtons(
    description='Scale:',
    options=['Regular', 'Logaritmic'],
    disabled=False,
)

# Radio buttons selecting whether bars are ordered by label or by score
order = widgets.RadioButtons(
    description='Order by:',
    options=['Labels', 'Scores'],
    disabled=False,
)

# Dropdown selecting which method's scores are displayed
method_choosen = widgets.Dropdown(
    description='Method:',
    options=[
        'Pearson',
        'Spearmanr',
        'Kendall',
        'Fisher',
        'Random Forest Importance',
        'Average Scores',
    ],
    value='Pearson',
    disabled=False,
)



def fs_manager(change_scale, method, normalized_results):
    """Plot the scores of one feature selection method for every geopackage.

    Reads the module-level ``geopackages``, ``dataframes_results`` and
    ``labels`` that the notebook builds before this callback runs.

    Args:
        change_scale: Value of the scale radio buttons; ``'Logaritmic'``
            selects ``m.show_bars_log``, any other value ``m.show_bars``.
        method: Name of the score column to read from each results table.
        normalized_results: When truthy, each score column is passed through
            ``m.NormalizeData1D`` before plotting.
    """
    res = []
    for grid in geopackages:
        scores = dataframes_results[grid][method]
        res.append(m.NormalizeData1D(scores) if normalized_results else scores)

    # Pick the plotting helper matching the requested axis scale.
    plot = m.show_bars_log if change_scale == 'Logaritmic' else m.show_bars
    plot(labels, res, method, geopackages)

# Layout: method selection and normalization on the left, scale on the right.
container2 = widgets.VBox([method_choosen, results_norm])
container1 = widgets.VBox([scale])
ui = widgets.HBox([container2, container1])

# Re-run fs_manager whenever one of the bound widgets changes value.
out = widgets.interactive_output(
    fs_manager,
    {
        'method': method_choosen,
        'change_scale': scale,
        'normalized_results': results_norm,
    },
)
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Method:', options=('Pearson', 'Spearmanr', 'Kendall', 'Fis…

Output()

### Other methods
This section groups the methods that are not included in the previous feature selection results. These methods are:
* Variance Threshold;
* Exhaustive feature selection;
* Recursive feature selection;
* Multiscale Geographically Weighted Regression (MGWR);


In [3]:
for data in grid_data:
    # Use a dedicated name for the column list: fs_manager reads the global
    # `labels` every time a widget changes, so overwriting it here (with a
    # list that still contains the target, geometry and coordinate columns)
    # would break the bar plots after this cell has run.
    notnull_labels = m.check_NotNull(data)

    # Store dataset in X (predictors) and Y (target) variables
    X = pd.DataFrame(data=data, columns=notnull_labels)
    Y = X[['pm25_cams']].values.ravel()

    # coordinates definition used for mgwr
    coords = list(zip(X['lat_cen'], X['lng_cen']))

    # Drop the target, the geometry, the cell bounds and the centroid
    # coordinates so that only candidate predictors remain.
    X = X.drop(columns=['pm25_cams', 'geometry', 'bottom', 'top',
                        'left', 'right', 'lat_cen', 'lng_cen'])

    # Keep the non-standardized predictors: the variance threshold call below
    # refers to X_notStand, which was previously never defined.
    X_notStand = X
    X = X_notStand.apply(stats.zscore)

    # Other methods computation (uncomment the ones to run)
    #m.variance_threshold(X_notStand, 0.1) #0.1 represents the threshold value for the variances
    #m.exhaustive_feature_selection(X, Y)
    #m.recursive_feature_selection(X, Y.astype(int), 5) #5 represents the cardinality of the subset selected
    # NOTE(review): the call below passes 'pm25_st' while Y in this cell is
    # 'pm25_cams' -- confirm which target the regression should use.
    #m.mgwr(data, 'pm25_st') #second argument is the target variable used for the regression

