# Pre Processing: Feature Selection

Feature Selection is an important step in data pre-processing. It consists in selecting the best subset of input variable as the most pertinent. Discarding irrelevant data is essential before applying Machine Learning algorithm in order to:
* *Reduce Overfitting*: less opportunity to make decisions based on noise;
* *Improve Accuracy*: less misleading data means modelling accuracy improves. Predictions can be greatly distorted by redundant attributes. 
* *Reduce Training Time*: With less data the algorithms will train faster;


### Import Libraries

In [1]:
from tokenize import String

import scipy.stats as stats
import geopandas as gpd
import numpy as np
from numpy import arange
from fs import methods as m
from fs import model as ml
import ipywidgets as widgets
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from IPython.core.display import display, clear_output
from sklearn import preprocessing
import os
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import AppLayout, Layout

pd.set_option('display.max_rows', 500)


### Dataframe 

In [2]:
RESOLUTION= '0_01'
KNN = True
knn_value = 30
NO_MOUNTAINS = False
geopackages = os.listdir('assets/grids_'+RESOLUTION)
grid_data = []
dataframes_results = {}
var_t = 'empty'
target_labels = []
target_labels.append('pm25_cams')





## Results Feature Selection
In this section fs results are evaluated for each geopackages contained in the folder [grids/](https://github.com/opengeolab/D-DUST/tree/thesis_MB/notebooks/grids).<br />
The results are stored in a list of dataframe (one for each dataset) and are displayed in n bar plot. <br />
Each subplot refers to the method choosen with the dropdown widgets, with the possiblity to normalized results or not. <br />
The methods used are:

* Pearson correlation;
* Spearmanr correlation;
* Kendall tau; 
* F-Test;
* Random Forest importance; 

<br />
In addition, an average score for these methods is added.



In [3]:
labels = list(gpd.read_file('assets/grids_'+RESOLUTION+'/'+ geopackages[0]).dropna(axis=0).dropna(axis=1).columns)
frequencies_tables = []
method_list = ['Pearson', 'Spearmanr', 'Kendall', 'Fisher', 'RF Importance', 'RFS']
for l in labels:
    if(l.endswith('_st')):
        target_labels.append(l)

results_norm = widgets.Checkbox(
    value=True,
    description='Results normalized',
    disabled=False,
    indent=True
)

#Radiobutton used to display data in regular or logaritmic scale
scale = widgets.RadioButtons(
    options=['Regular', 'Logaritmic'],
    description='Scale:',
    disabled=False
)

compute_button = widgets.Button(
    description='Compute',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    #tooltip='Compute',
    icon='', # (FontAwesome names without the `fa-` prefix)
    layout = Layout(width='60%', margin='10px 150px 10px 80px')


)

#Radiobutton used to display data ordered by score or by labels
order = widgets.RadioButtons(
    options=['Labels', 'Scores'],
    description='Order by:',
    disabled=False
)
#Dropdown widgets used to choose the scores of the method selected
method_choosen = widgets.Dropdown(
    options=['---']+ method_list + ['Borda Count Voting', 'Final Score'],
    value='---',
    description='Method:',
    disabled=False,
    layout = Layout(width='90%')
)




target_variable = widgets.Dropdown(
    options=target_labels,
    value=target_labels[0],
    description='Target:',
    layout = Layout(width='90%')
)

variance_TH= widgets.Checkbox(
    value=True,
    description='Apply',
    disabled=False,
    indent=False,
    layout = Layout(margin='10px 10px 10px 80px')

)

value_th = widgets.Dropdown(
    options=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    value=0,
    description='Variance TH:',
    disabled=False,
    layout = Layout(width='90%')

)

labels_list = []
def compute_button_f(b):
    clear_output()
    global features_deleted
    global th
    global frequencies_tables

    global var_t
    global labels
    global dataframes_results
    global grid_data

    labels_list.clear()
    dataframes_results.clear()

    var_t = target_variable.value
    grid_data = []
    to_print = '<h3>Features deleted</h3>'

    for index, grid in enumerate(geopackages):
        #read gpkg file
        data = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)
        if KNN:
            data = m.process_data(data, knn_value, target_variable.value, NO_MOUNTAINS)
    
        data = data[~data[target_variable.value].isnull()]
        data = data.dropna(axis=1).dropna(axis=0)
        data.pop('geometry')

        grid_data.append(data)

        labels = list(data.columns)

        #read variables which are not null
        score_results = pd.DataFrame()

        #Store dataset in x and y variables
        X = pd.DataFrame(data=data, columns=labels )
        Y = X[target_variable.value]
        Y = Y.values.ravel()
        X.pop(target_variable.value)
        X.pop('lat_cen')
        X.pop('lng_cen')

        if value_th.disabled == False:
            scores_th = m.variance_threshold(X, value_th.value)
            to_print = to_print + '<ul>' + geopackages[index] + '</ul>'

            for i, label in enumerate(scores_th['Features']):
                if scores_th['Scores'].tolist()[i] != 1:
                    to_print = to_print + '<li>' + label + '</li>'
                    X = X.drop(label, 1)
        else:
            to_print = to_print + 'None'
            
        features_deleted = widgets.HTML(to_print)

        X = X.apply(stats.zscore)
        X = X.dropna(axis=1)
        Y = (Y - Y.mean(axis=0)) / Y.std(axis=0)
        
        labels = X.columns.tolist()
        labels_list.append(labels)
        score_results['Features'] = labels

        score_results = m.fs_results_computation(X, Y)
        frequency = pd.DataFrame()

        frequency['Features'] = labels

        for i in method_list:
            frequency[i] = score_results['Features'].isin(list(score_results.nlargest(30,i)['Features'])).astype(int)
        frequency['Final Score'] = frequency.sum(axis=1)
        
        frequencies_tables.append(frequency)

        var_t = target_variable.value
        score_results['Final Score']=frequency['Final Score']
        score_results['Borda Count Voting'] = m.borda_voting(score_results)

        dataframes_results[grid] = score_results
        method_choosen.value = '---'






def fs_manager(change_scale, method, normalized_results, target, order, filter_variance, th_value):

    if filter_variance == True:
        value_th.disabled = False
    else :
        value_th.disabled = True

    if method == '---':
        return
    res = []

    if(normalized_results):
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            temp = m.NormalizeData1D(temp)
            res.append(temp)
    else:
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            res.append(temp)



    if (change_scale == 'Logaritmic'):
        m.show_bars_log(labels_list, res, method, geopackages, order)
        return

    else:
        m.show_bars(labels_list, res, method, geopackages, order)

title= widgets.HTML('<h2 style="text-align:center;">Options</h2><hr><h3 style="padding: 10px;">Input</h3>')
features_deleted = widgets.HTML('')
title2 = widgets.HTML('<h2 style="text-align:center;">Feature Selection scores</h2><hr>')
plots = widgets.interactive_output(fs_manager, {'method':method_choosen, 'change_scale': scale, 'order':order, 'normalized_results': results_norm, 'target': target_variable, 'filter_variance':variance_TH,'th_value':value_th })
plot = widgets.VBox([title2, plots], layout=Layout(border='solid'))
output = widgets.VBox([title, target_variable, value_th, variance_TH, compute_button, features_deleted,widgets.HTML('<br><h3 style="padding: 10px;">Output</h3'),
                           method_choosen, scale, order, results_norm], layout=Layout(border='solid'))

ui = AppLayout(header=None,
          left_sidebar=output,
          center=plot,
          right_sidebar=None,
          footer=None,
              layout=Layout(border='solid'))

compute_button.on_click(compute_button_f)

container = widgets.Box([ui], )
display(container)

Box(children=(AppLayout(children=(VBox(children=(HTML(value='<h2 style="text-align:center;">Options</h2><hr><hâ€¦

### Export Feature Selection
By running this section, a dataframe containing the list of feature selected ordered by its average score value is exported as .csv file.

In [4]:
for index, grid in enumerate(geopackages):
    print(grid)
    display(dataframes_results[grid])
    print('\n')


for grid in geopackages:
    dataframes_results[grid].to_csv(r'assets/Votes/'+RESOLUTION+grid[:-5]+'.csv', index = False)


general_fs = pd.DataFrame()
general_fs['Features'] = list(dataframes_results.values())[0]['Features']
for index, grid in enumerate(geopackages):

    dataframes_results[grid].sort_values(by ='Borda Count Voting', axis=0, ascending=False, inplace=True, kind='quicksort', na_position='last')
    labels_selected = pd.DataFrame()
    labels_selected['Features'] = dataframes_results[grid]['Features']
    labels_selected['Borda Count Voting'] = dataframes_results[grid]['Borda Count Voting'].round(decimals = 3)

    labels_selected.to_csv(r'assets/features_'+RESOLUTION+'/'+grid[:-5]+'.csv', index = False)
    labels_selected.to_excel('assets/features_'+RESOLUTION+'/'+grid[:-5]+'.xlsx')
    general_fs[index] = dataframes_results[grid]['Borda Count Voting']


general_fs['Scores'] = m.borda_voting(general_fs)
general_fs.to_csv(r'assets/features_'+RESOLUTION+'general'+'.csv', index = False)






grid_0_01_1007_1014_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.372262,-0.848274,-0.649837,26.371451,0.01990642,36,3,104
1,aspect_major,-0.116248,-0.151543,-0.101929,3.809752,4.29291e-05,35,6,132
2,slope_mean,-0.262041,-0.479858,-0.337419,13.527967,8.120214e-05,34,3,82
3,pop,-0.182228,-0.330739,-0.214137,6.648049,2.402336e-05,33,4,85
4,int_prim_sec,0.025501,0.082687,0.066469,3.805403,1.094583e-05,32,5,133
5,int_sec,-0.150808,-0.164389,-0.130469,3.816568,2.377014e-05,31,6,113
6,highway,-0.071306,-0.051741,-0.041503,0.581198,3.750996e-06,30,4,95
7,prim_road,0.028512,0.109459,0.090488,4.005471,7.618228e-06,29,5,137
8,sec_road,-0.272046,-0.244877,-0.172164,13.921753,4.310596e-05,28,5,102
9,farm_pigs,0.152942,-0.086216,-0.065915,0.943207,3.026352e-06,27,4,102




grid_0_01_0418_0425_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.456633,-0.636271,-0.431632,38.807637,0.01099308,34,3,93
1,aspect_major,-0.015889,-0.034963,-0.032576,0.34717,3.586975e-05,33,6,116
2,slope_mean,-0.339636,-0.328373,-0.230583,22.345768,4.300398e-05,32,3,75
3,pop,-0.152535,-0.089994,-0.059264,5.523183,4.835186e-05,31,6,103
4,int_prim_sec,0.014959,-0.003144,-0.001374,2.076906,5.162302e-06,30,6,111
5,int_sec,-0.201331,-0.106183,-0.080867,8.999041,1.760583e-05,29,6,95
6,highway,-0.094005,-0.047455,-0.038303,1.393551,1.269735e-06,28,5,79
7,prim_road,0.037693,-0.037757,-0.030508,2.869342,1.279278e-05,27,6,109
8,sec_road,-0.282709,-0.175516,-0.122263,16.429382,4.293183e-05,26,6,94
9,farm_pigs,0.116341,0.025147,0.025704,1.298604,1.137018e-06,25,5,100




grid_0_01_0903_0910_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.362635,-0.601555,-0.428424,27.213435,0.001030471,35,3,88
1,aspect_major,-0.130998,-0.142648,-0.101289,5.01063,8.402292e-05,34,6,116
2,slope_mean,-0.253697,-0.211794,-0.132619,13.39089,9.225616e-05,33,6,107
3,pop,-0.260026,-0.266707,-0.196785,15.298049,0.000485088,32,6,105
4,int_prim_sec,0.055066,0.107484,0.094777,0.565842,1.044498e-05,31,4,116
5,int_sec,-0.180285,-0.126158,-0.093436,6.293819,1.091974e-05,30,6,105
6,highway,-0.096607,-0.064094,-0.051331,1.542684,2.529114e-06,29,4,96
7,prim_road,0.068052,0.108633,0.081105,1.12078,1.077179e-05,28,5,124
8,sec_road,-0.295474,-0.169706,-0.125523,17.566656,9.007487e-05,27,6,106
9,farm_pigs,0.150413,-0.052063,-0.039867,3.751532,1.084665e-06,26,5,114




grid_0_01_0717_0724_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.310493,-0.511162,-0.412385,15.411452,0.000851,32,4,80
1,aspect_major,-0.081611,-0.10279,-0.076849,2.040014,7.8e-05,31,6,109
2,slope_mean,-0.208306,-0.112213,-0.060008,7.263462,5.3e-05,30,6,96
3,pop,-0.263634,-0.400458,-0.278808,15.27178,4.6e-05,29,6,84
4,int_prim_sec,0.04102,0.071948,0.058084,4.022177,2.3e-05,28,6,113
5,int_sec,-0.277937,-0.352125,-0.2737,14.344423,2.5e-05,27,6,82
6,highway,-0.145089,-0.205864,-0.164047,3.544374,7e-06,26,6,75
7,prim_road,0.081997,0.147982,0.118653,5.498264,2.4e-05,25,6,125
8,sec_road,-0.382515,-0.395241,-0.287827,31.123959,2.9e-05,24,5,72
9,farm_pigs,0.166786,-0.001373,0.00413,2.244633,8e-06,23,6,107




grid_0_01_0324_0331_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
0,h_mean,-0.325321,-0.534716,-0.339302,19.482006,0.00022,34,3,79
1,aspect_major,-0.01817,-0.006457,-0.013303,0.717819,2.3e-05,33,5,86
2,slope_mean,-0.246537,-0.288986,-0.191112,11.877027,2.9e-05,32,3,73
3,pop,-0.170366,0.05196,0.023754,7.298529,3e-05,31,6,108
4,int_prim_sec,0.040002,0.005079,0.008196,4.777836,7e-06,30,5,92
5,int_sec,-0.123879,0.038283,0.034513,4.515268,2.5e-05,29,6,104
6,highway,-0.072235,0.038589,0.029505,0.64405,1e-05,28,5,83
7,prim_road,0.026297,-0.079331,-0.058953,3.324469,2e-05,27,6,89
8,sec_road,-0.21132,-0.015725,-0.013231,9.516057,3.1e-05,26,5,85
9,farm_pigs,0.171482,0.014498,0.015102,2.483959,1e-06,25,5,87






## Other methods
In this sections are grouped method which are not included in the previous feature selection results. These methods are:
* Exhaustive feature selection;
* Recursive feature selection;
* Multiscale Geographically Weighted Regression (MGWR);



### MGWR bandwidth and Betas computation
bandwidths = []

for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    
    

    res = m.mgwr_beta(grid_data[index], target_variable.value, 50, geopackages[index])
    list(dataframes_results.values())[index]['MGWR Median Betas'] = m.NormalizeData(res['Betas Median'])
    bandwidths.append(res['Bandwidthds'])
    
    x = list(dataframes_results.values())[index].loc[:, list(dataframes_results.values())[index].columns != 'Features'].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    temp = pd.DataFrame(x_scaled)
    mean_results = temp.mean(axis=1)
    list(dataframes_results.values())[index]['Average Scores'] = mean_results

# MGWR Bandwidths bar plots
m.show_bars(labels_list, bandwidths, 'MGWR Bandwidths', geopackages)

res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['MGWR Median Betas']
        temp = m.NormalizeData1D(temp)
        res.append(temp)

# MGWR Median(Betas) bar plots
m.show_bars(labels_list, res, 'MGWR Median Betas', geopackages)


res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['Average Scores']
        res.append(temp)

# Average scores bar plots (including mgwr results)
m.show_bars(labels_list, res, 'Average Scores', geopackages)

### Exhaustive feature selection


In [None]:
efs_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    efs_results.append(m.exhaustive_feature_selection(X, Y))

### Recursive feature selection


In [13]:
rfe_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    m.recursive_feature_selection(X, Y.astype(int), 20)
    
for index, grid in enumerate(geopackages):
    rfe_results[index].sort_values(by =['Ranking'], axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
    rfe_results[index].to_csv(r'RFS'+grid[:-5]+'.csv', index = False)
    


IndexError: list index out of range

In [8]:
rfe_results

[          Features  isSelected  Ranking
 47      farm_sheep        True        1
 65        no2_cams        True        1
 73          o3_int        True        1
 74        pm10_int        True        1
 60         no_cams        True        1
 ..             ...         ...      ...
 2             dsf3       False       59
 81  wind_speed_int       False       60
 1             dsf2       False       61
 41         highway       False       62
 0         dusafSum       False       63
 
 [82 rows x 3 columns],
         Features  isSelected  Ranking
 79  rad_glob_int        True        1
 72       no2_int        True        1
 70        co_int        True        1
 63     dust_cams        True        1
 59        o3_s5p        True        1
 ..           ...         ...      ...
 43      sec_road       False       60
 1           dsf2       False       61
 0       dusafSum       False       62
 40       int_sec       False       63
 41       highway       False       64
 
 [83 rows x 