# Pre Processing: Feature Selection

Feature Selection is an important step in data pre-processing. It consists in selecting the best subset of input variable as the most pertinent. Discarding irrelevant data is essential before applying Machine Learning algorithm in order to:
* *Reduce Overfitting*: less opportunity to make decisions based on noise;
* *Improve Accuracy*: less misleading data means modelling accuracy improves. Predictions can be greatly distorted by redundant attributes. 
* *Reduce Training Time*: With less data the algorithms will train faster;


### Import Libraries

In [1]:
from tokenize import String

import scipy.stats as stats
import geopandas as gpd
import numpy as np
from numpy import arange
from fs import methods as m
from fs import model as ml
import ipywidgets as widgets
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from IPython.core.display import display, clear_output
from sklearn import preprocessing
import os
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import AppLayout, Layout

pd.set_option('display.max_rows', 500)


### Dataframe 

In [2]:
RESOLUTION= '0_01'
KNN = True
knn_value = 10
NO_MOUNTAINS = True
geopackages = os.listdir('assets/grids_'+RESOLUTION)
grid_data = []
dataframes_results = {}
var_t = 'empty'
target_labels = []
target_labels.append('pm25_cams')





## Results Feature Selection
In this section fs results are evaluated for each geopackages contained in the folder [grids/](https://github.com/opengeolab/D-DUST/tree/thesis_MB/notebooks/grids).<br />
The results are stored in a list of dataframe (one for each dataset) and are displayed in n bar plot. <br />
Each subplot refers to the method choosen with the dropdown widgets, with the possiblity to normalized results or not. <br />
The methods used are:

* Pearson correlation;
* Spearmanr correlation;
* Kendall tau; 
* F-Test;
* Random Forest importance; 

<br />
In addition, an average score for these methods is added.



In [3]:
labels = list(gpd.read_file('assets/grids_'+RESOLUTION+'/'+ geopackages[0]).dropna(axis=0).dropna(axis=1).columns)
frequencies_tables = []
method_list = ['Pearson', 'Spearmanr', 'Kendall', 'Fisher', 'RF Importance', 'RFS']
for l in labels:
    if(l.endswith('_st')):
        target_labels.append(l)

results_norm = widgets.Checkbox(
    value=True,
    description='Results normalized',
    disabled=False,
    indent=True
)

#Radiobutton used to display data in regular or logaritmic scale
scale = widgets.RadioButtons(
    options=['Regular', 'Logaritmic'],
    description='Scale:',
    disabled=False
)

compute_button = widgets.Button(
    description='Compute',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    #tooltip='Compute',
    icon='', # (FontAwesome names without the `fa-` prefix)
    layout = Layout(width='60%', margin='10px 150px 10px 80px')


)

#Radiobutton used to display data ordered by score or by labels
order = widgets.RadioButtons(
    options=['Labels', 'Scores'],
    description='Order by:',
    disabled=False
)
#Dropdown widgets used to choose the scores of the method selected
method_choosen = widgets.Dropdown(
    options=['---']+ method_list + ['Borda Count Voting', 'Final Score'],
    value='---',
    description='Method:',
    disabled=False,
    layout = Layout(width='90%')
)




target_variable = widgets.Dropdown(
    options=target_labels,
    value=target_labels[0],
    description='Target:',
    layout = Layout(width='90%')
)

variance_TH= widgets.Checkbox(
    value=True,
    description='Apply',
    disabled=False,
    indent=False,
    layout = Layout(margin='10px 10px 10px 80px')

)

value_th = widgets.Dropdown(
    options=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    value=0,
    description='Variance TH:',
    disabled=False,
    layout = Layout(width='90%')

)

labels_list = []
def compute_button_f(b):
    clear_output()
    global features_deleted
    global th
    global frequencies_tables

    global var_t
    global labels
    global dataframes_results
    global grid_data

    labels_list.clear()
    dataframes_results.clear()

    var_t = target_variable.value
    grid_data = []
    to_print = '<h3>Features deleted</h3>'

    for index, grid in enumerate(geopackages):
        #read gpkg file
        data = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)
        if KNN:
            data = m.process_data(data, knn_value, target_variable.value, NO_MOUNTAINS)
    
        data = data[~data[target_variable.value].isnull()]
        data = data.dropna(axis=1).dropna(axis=0)
        data.pop('geometry')

        grid_data.append(data)

        labels = list(data.columns)

        #read variables which are not null
        score_results = pd.DataFrame()

        #Store dataset in x and y variables
        X = pd.DataFrame(data=data, columns=labels )
        Y = X[target_variable.value]
        Y = Y.values.ravel()
        X.pop(target_variable.value)
        X.pop('lat_cen')
        X.pop('lng_cen')

        if value_th.disabled == False:
            scores_th = m.variance_threshold(X, value_th.value)
            to_print = to_print + '<ul>' + geopackages[index] + '</ul>'

            for i, label in enumerate(scores_th['Features']):
                if scores_th['Scores'].tolist()[i] != 1:
                    to_print = to_print + '<li>' + label + '</li>'
                    X = X.drop(label, 1)
        else:
            to_print = to_print + 'None'
            
        features_deleted = widgets.HTML(to_print)

        X = X.apply(stats.zscore)
        X = X.dropna(axis=1)
        Y = (Y - Y.mean(axis=0)) / Y.std(axis=0)
        
        labels = X.columns.tolist()
        labels_list.append(labels)
        score_results['Features'] = labels

        score_results = m.fs_results_computation(X, Y)
        frequency = pd.DataFrame()

        frequency['Features'] = labels

        for i in method_list:
            frequency[i] = score_results['Features'].isin(list(score_results.nlargest(30,i)['Features'])).astype(int)
        frequency['Final Score'] = frequency.sum(axis=1)
        
        frequencies_tables.append(frequency)

        var_t = target_variable.value
        score_results['Final Score']=frequency['Final Score']
        score_results['Borda Count Voting'] = m.borda_voting(score_results)

        dataframes_results[grid] = score_results
        method_choosen.value = '---'






def fs_manager(change_scale, method, normalized_results, target, order, filter_variance, th_value):

    if filter_variance == True:
        value_th.disabled = False
    else :
        value_th.disabled = True

    if method == '---':
        return
    res = []

    if(normalized_results):
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            temp = m.NormalizeData1D(temp)
            res.append(temp)
    else:
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            res.append(temp)



    if (change_scale == 'Logaritmic'):
        m.show_bars_log(labels_list, res, method, geopackages, order)
        return

    else:
        m.show_bars(labels_list, res, method, geopackages, order)

title= widgets.HTML('<h2 style="text-align:center;">Options</h2><hr><h3 style="padding: 10px;">Input</h3>')
features_deleted = widgets.HTML('')
title2 = widgets.HTML('<h2 style="text-align:center;">Feature Selection scores</h2><hr>')
plots = widgets.interactive_output(fs_manager, {'method':method_choosen, 'change_scale': scale, 'order':order, 'normalized_results': results_norm, 'target': target_variable, 'filter_variance':variance_TH,'th_value':value_th })
plot = widgets.VBox([title2, plots], layout=Layout(border='solid'))
output = widgets.VBox([title, target_variable, value_th, compute_button, features_deleted,variance_TH,widgets.HTML('<br><h3 style="padding: 10px;">Output</h3'),
                           method_choosen, scale, order, results_norm], layout=Layout(border='solid'))

ui = AppLayout(header=None,
          left_sidebar=output,
          center=plot,
          right_sidebar=None,
          footer=None,
              layout=Layout(border='solid'))

compute_button.on_click(compute_button_f)

container = widgets.Box([ui], )
display(container)

Box(children=(AppLayout(children=(VBox(children=(HTML(value='<h2 style="text-align:center;">Options</h2><hr><h…

### Export Feature Selection
By running this section, a dataframe containing the list of feature selected ordered by its average score value is exported as .csv file.

In [5]:
for index, grid in enumerate(geopackages):
    print(grid)
    display(dataframes_results[grid])
    print('\n')


for grid in geopackages:
    dataframes_results[grid].to_csv(r'assets/Votes/'+RESOLUTION+grid[:-5]+'.csv', index = False)


general_fs = pd.DataFrame()
general_fs['Features'] = list(dataframes_results.values())[0]['Features']
for index, grid in enumerate(geopackages):

    dataframes_results[grid].sort_values(by ='Borda Count Voting', axis=0, ascending=False, inplace=True, kind='quicksort', na_position='last')
    labels_selected = pd.DataFrame()
    labels_selected['Features'] = dataframes_results[grid]['Features']
    labels_selected['Borda Count Voting'] = dataframes_results[grid]['Borda Count Voting'].round(decimals = 3)

    labels_selected.to_csv(r'assets/features_'+RESOLUTION+'/'+grid[:-5]+'.csv', index = False)
    labels_selected.to_excel('assets/features_'+RESOLUTION+'/'+grid[:-5]+'.xlsx')
    general_fs[index] = dataframes_results[grid]['Borda Count Voting']


general_fs['Scores'] = m.borda_voting(general_fs)
general_fs.to_csv(r'assets/features_'+RESOLUTION+'general'+'.csv', index = False)






grid_0_01_1007_1014_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.034474,-0.0356,-0.05668,1.358365,0.013813,30,6,129
1,aspect_major,-0.235217,-0.216736,-0.148298,13.384746,0.00245,29,6,101
2,slope_mean,-0.231093,-0.38857,-0.273381,20.206129,0.005026,17,6,64
3,pop,-0.036344,-0.162696,-0.101875,1.085769,0.002871,24,6,70
4,int_prim_sec,0.066886,0.032021,0.027415,0.181005,0.000821,23,6,92
5,int_sec,0.009108,0.022251,0.018895,0.152489,0.000537,4,6,65
6,prim_road,0.053389,0.087622,0.065213,0.437412,0.00169,12,6,94
7,sec_road,-0.069301,-0.070212,-0.044669,0.645852,0.00122,11,6,61
8,farm_pigs,-0.102267,-0.09626,-0.076935,2.918209,0.000317,2,6,52
9,farm_sheep,-0.069115,-0.004477,-0.004944,1.929752,0.000519,3,6,72




grid_0_01_0418_0425_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.088326,-0.010135,-0.039308,4.261505,0.023375,25,6,113
1,aspect_major,-0.070446,-0.022098,-0.018264,2.046072,0.000681,24,6,93
2,slope_mean,-0.160124,-0.347436,-0.248334,5.842625,0.00571,23,6,61
3,pop,0.077594,0.112444,0.076779,0.542366,0.001866,8,6,63
4,int_prim_sec,0.018761,-0.051324,-0.036267,0.950231,0.000189,21,6,62
5,int_sec,0.183422,0.226049,0.157607,3.885476,0.000986,10,6,102
6,prim_road,-0.122301,-0.142892,-0.099097,3.325619,0.000599,2,6,40
7,sec_road,0.270928,0.270104,0.182369,14.240182,0.000829,3,6,114
8,farm_pigs,-0.032737,-0.026703,-0.021424,0.890555,9.5e-05,4,6,46
9,farm_sheep,-0.001805,0.076829,0.055216,0.381921,0.000175,5,6,52




grid_0_01_0903_0910_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.297397,-0.367714,-0.308421,21.189701,0.004991,30,6,106
1,aspect_major,-0.227431,-0.264667,-0.176113,8.830648,0.000325,29,6,101
2,slope_mean,-0.208045,-0.401522,-0.272606,8.015984,0.001338,28,6,73
3,pop,-0.126072,-0.195349,-0.127234,5.681173,0.001906,27,6,94
4,int_prim_sec,-0.036469,-0.072236,-0.055638,0.011798,6.1e-05,26,5,79
5,int_sec,0.027962,-0.007719,-0.006969,0.547241,0.00057,24,6,115
6,prim_road,-0.059944,-0.007978,-0.006637,0.404928,0.000147,23,6,99
7,sec_road,-0.01829,-0.049459,-0.034246,1.739401,0.00057,22,6,106
8,farm_pigs,-0.069067,-0.079582,-0.065706,1.365138,0.000199,21,6,89
9,farm_sheep,-0.045346,-0.035133,-0.030467,0.128279,4.5e-05,2,5,64




grid_0_01_0717_0724_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,0.302485,0.128941,0.085121,12.319887,0.028452,11,6,153
1,aspect_major,0.031323,-0.016324,-0.006384,0.003992,0.00522,7,6,81
2,slope_mean,-0.022003,-0.275026,-0.19436,1.558232,0.008255,21,6,63
3,pop,0.02894,-0.024118,-0.008244,0.088493,0.00361,2,6,46
4,int_prim_sec,-0.045022,-0.04206,-0.030428,0.436389,0.001623,3,6,41
5,int_sec,0.075144,0.092578,0.067486,2.089102,0.002929,4,6,82
6,prim_road,-0.053031,-0.014684,-0.010383,0.317894,0.002799,5,6,48
7,sec_road,0.04089,-0.007672,0.002313,0.970707,0.00296,6,6,71
8,farm_pigs,0.030822,0.058384,0.048634,0.040676,0.000763,23,6,81
9,farm_sheep,0.075795,0.096423,0.074752,0.313472,0.000165,25,6,96




grid_0_01_0324_0331_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.081486,-0.0029,0.010948,3.463503,0.042052,1,6,96
1,aspect_major,0.102419,0.124309,0.081948,0.265968,0.000408,22,6,103
2,slope_mean,-0.085137,-0.249448,-0.171252,0.719019,0.005632,18,6,54
3,pop,-0.061047,0.064965,0.043158,0.865277,0.000849,17,6,71
4,int_prim_sec,0.023584,-0.053892,-0.040309,1.415676,0.000116,5,6,45
5,int_sec,0.161335,0.20134,0.13181,1.677743,0.000447,4,6,77
6,prim_road,-0.136135,-0.184898,-0.13546,5.618466,0.00029,2,6,41
7,sec_road,0.258884,0.268981,0.17888,10.405466,0.00051,9,6,100
8,farm_pigs,-0.013162,-0.036973,-0.028774,0.825257,3.6e-05,20,6,59
9,farm_sheep,0.019284,0.056062,0.041397,0.255248,6.4e-05,25,6,71






## Other methods
In this sections are grouped method which are not included in the previous feature selection results. These methods are:
* Exhaustive feature selection;
* Recursive feature selection;
* Multiscale Geographically Weighted Regression (MGWR);



### MGWR bandwidth and Betas computation
bandwidths = []

for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    
    

    res = m.mgwr_beta(grid_data[index], target_variable.value, 50, geopackages[index])
    list(dataframes_results.values())[index]['MGWR Median Betas'] = m.NormalizeData(res['Betas Median'])
    bandwidths.append(res['Bandwidthds'])
    
    x = list(dataframes_results.values())[index].loc[:, list(dataframes_results.values())[index].columns != 'Features'].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    temp = pd.DataFrame(x_scaled)
    mean_results = temp.mean(axis=1)
    list(dataframes_results.values())[index]['Average Scores'] = mean_results

# MGWR Bandwidths bar plots
m.show_bars(labels_list, bandwidths, 'MGWR Bandwidths', geopackages)

res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['MGWR Median Betas']
        temp = m.NormalizeData1D(temp)
        res.append(temp)

# MGWR Median(Betas) bar plots
m.show_bars(labels_list, res, 'MGWR Median Betas', geopackages)


res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['Average Scores']
        res.append(temp)

# Average scores bar plots (including mgwr results)
m.show_bars(labels_list, res, 'Average Scores', geopackages)

### Exhaustive feature selection


In [None]:
efs_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    efs_results.append(m.exhaustive_feature_selection(X, Y))

### Recursive feature selection


In [13]:
rfe_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    m.recursive_feature_selection(X, Y.astype(int), 20)
    
for index, grid in enumerate(geopackages):
    rfe_results[index].sort_values(by =['Ranking'], axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
    rfe_results[index].to_csv(r'RFS'+grid[:-5]+'.csv', index = False)
    


IndexError: list index out of range

In [8]:
rfe_results

[          Features  isSelected  Ranking
 47      farm_sheep        True        1
 65        no2_cams        True        1
 73          o3_int        True        1
 74        pm10_int        True        1
 60         no_cams        True        1
 ..             ...         ...      ...
 2             dsf3       False       59
 81  wind_speed_int       False       60
 1             dsf2       False       61
 41         highway       False       62
 0         dusafSum       False       63
 
 [82 rows x 3 columns],
         Features  isSelected  Ranking
 79  rad_glob_int        True        1
 72       no2_int        True        1
 70        co_int        True        1
 63     dust_cams        True        1
 59        o3_s5p        True        1
 ..           ...         ...      ...
 43      sec_road       False       60
 1           dsf2       False       61
 0       dusafSum       False       62
 40       int_sec       False       63
 41       highway       False       64
 
 [83 rows x 