# Pre Processing: Feature Selection

Feature Selection is an important step in data pre-processing. It consists in selecting the best subset of input variable as the most pertinent. Discarding irrelevant data is essential before applying Machine Learning algorithm in order to:
* *Reduce Overfitting*: less opportunity to make decisions based on noise;
* *Improve Accuracy*: less misleading data means modelling accuracy improves. Predictions can be greatly distorted by redundant attributes. 
* *Reduce Training Time*: With less data the algorithms will train faster;


### Import Libraries

In [1]:
from tokenize import String

import scipy.stats as stats
import geopandas as gpd
import numpy as np
from numpy import arange
from fs import methods as m
from fs import model as ml
import ipywidgets as widgets
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from IPython.core.display import display, clear_output
from sklearn import preprocessing
import os
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import AppLayout, Layout

pd.set_option('display.max_rows', 500)


### Dataframe 

In [2]:
RESOLUTION= '0_1'
KNN = True
knn_value = 10

geopackages = os.listdir('assets/grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')
grid_data = []
dataframes_results = {}
var_t = 'empty'
target_labels = []
target_labels.append('pm25_cams')





## Results Feature Selection
In this section fs results are evaluated for each geopackages contained in the folder [grids/](https://github.com/opengeolab/D-DUST/tree/thesis_MB/notebooks/grids).<br />
The results are stored in a list of dataframe (one for each dataset) and are displayed in n bar plot. <br />
Each subplot refers to the method choosen with the dropdown widgets, with the possiblity to normalized results or not. <br />
The methods used are:

* Pearson correlation;
* Spearmanr correlation;
* Kendall tau; 
* F-Test;
* Random Forest importance; 

<br />
In addition, an average score for these methods is added.



In [3]:
labels = list(gpd.read_file('assets/grids_'+RESOLUTION+'/'+ geopackages[0]).dropna(axis=0).dropna(axis=1).columns)
frequencies_tables = []
method_list = ['Pearson', 'Spearmanr', 'Kendall', 'Fisher', 'RF Importance', 'RFS']
for l in labels:
    if(l.endswith('_st')):
        target_labels.append(l)

results_norm = widgets.Checkbox(
    value=True,
    description='Results normalized',
    disabled=False,
    indent=True
)

#Radiobutton used to display data in regular or logaritmic scale
scale = widgets.RadioButtons(
    options=['Regular', 'Logaritmic'],
    description='Scale:',
    disabled=False
)

compute_button = widgets.Button(
    description='Compute',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    #tooltip='Compute',
    icon='', # (FontAwesome names without the `fa-` prefix)
    layout = Layout(width='60%', margin='10px 150px 10px 80px')


)

#Radiobutton used to display data ordered by score or by labels
order = widgets.RadioButtons(
    options=['Labels', 'Scores'],
    description='Order by:',
    disabled=False
)
#Dropdown widgets used to choose the scores of the method selected
method_choosen = widgets.Dropdown(
    options=['---']+ method_list + ['Borda Count Voting', 'Final Score'],
    value='---',
    description='Method:',
    disabled=False,
    layout = Layout(width='90%')
)




target_variable = widgets.Dropdown(
    options=target_labels,
    value=target_labels[0],
    description='Target:',
    layout = Layout(width='90%')
)

variance_TH= widgets.Checkbox(
    value=True,
    description='Apply',
    disabled=False,
    indent=False,
    layout = Layout(margin='10px 10px 10px 80px')

)

value_th = widgets.Dropdown(
    options=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    value=0,
    description='Variance TH:',
    disabled=False,
    layout = Layout(width='90%')

)

labels_list = []
def compute_button_f(b):
    clear_output()
    global features_deleted
    global th
    global frequencies_tables

    global var_t
    global labels
    global dataframes_results
    global grid_data

    labels_list.clear()
    dataframes_results.clear()

    var_t = target_variable.value
    grid_data = []
    to_print = '<h3>Features deleted</h3>'

    for index, grid in enumerate(geopackages):
        #read gpkg file
        data = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)
        if KNN:
            data = m.process_data(data, knn_value, target_variable.value)
    
        data = data[~data[target_variable.value].isnull()]
        data = data.dropna(axis=1).dropna(axis=0)
        data.pop('geometry')

        grid_data.append(data)

        labels = list(data.columns)

        #read variables which are not null
        score_results = pd.DataFrame()

        #Store dataset in x and y variables
        X = pd.DataFrame(data=data, columns=labels )
        Y = X[target_variable.value]
        Y = Y.values.ravel()
        X.pop(target_variable.value)
        X.pop('lat_cen')
        X.pop('lng_cen')

        if value_th.disabled == False:
            scores_th = m.variance_threshold(X, value_th.value)
            to_print = to_print + '<ul>' + geopackages[index] + '</ul>'

            for i, label in enumerate(scores_th['Features']):
                if scores_th['Scores'].tolist()[i] != 1:
                    to_print = to_print + '<li>' + label + '</li>'
                    X = X.drop(label, 1)
        else:
            to_print = to_print + 'None'
            
        features_deleted = widgets.HTML(to_print)

        X = X.apply(stats.zscore)
        X = X.dropna(axis=1)
        Y = (Y - Y.mean(axis=0)) / Y.std(axis=0)
        
        labels = X.columns.tolist()
        labels_list.append(labels)
        score_results['Features'] = labels

        score_results = m.fs_results_computation(X, Y)
        frequency = pd.DataFrame()

        frequency['Features'] = labels

        for i in method_list:
            frequency[i] = score_results['Features'].isin(list(score_results.nlargest(30,i)['Features'])).astype(int)
        frequency['Final Score'] = frequency.sum(axis=1)
        
        frequencies_tables.append(frequency)

        var_t = target_variable.value
        score_results['Final Score']=frequency['Final Score']
        score_results['Borda Count Voting'] = m.borda_voting(score_results)

        dataframes_results[grid] = score_results
        method_choosen.value = '---'






def fs_manager(change_scale, method, normalized_results, target, order, filter_variance, th_value):

    if filter_variance == True:
        value_th.disabled = False
    else :
        value_th.disabled = True

    if method == '---':
        return
    res = []

    if(normalized_results):
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            temp = m.NormalizeData1D(temp)
            res.append(temp)
    else:
        for grid in geopackages:
            temp = (dataframes_results[grid])[method]
            res.append(temp)



    if (change_scale == 'Logaritmic'):
        m.show_bars_log(labels_list, res, method, geopackages, order)
        return

    else:
        m.show_bars(labels_list, res, method, geopackages, order)

title= widgets.HTML('<h2 style="text-align:center;">Options</h2><hr><h3 style="padding: 10px;">Input</h3>')
features_deleted = widgets.HTML('')
title2 = widgets.HTML('<h2 style="text-align:center;">Feature Selection scores</h2><hr>')
plots = widgets.interactive_output(fs_manager, {'method':method_choosen, 'change_scale': scale, 'order':order, 'normalized_results': results_norm, 'target': target_variable, 'filter_variance':variance_TH,'th_value':value_th })
plot = widgets.VBox([title2, plots], layout=Layout(border='solid'))
output = widgets.VBox([title, target_variable, value_th, compute_button, features_deleted,variance_TH,widgets.HTML('<br><h3 style="padding: 10px;">Output</h3'),
                           method_choosen, scale, order, results_norm], layout=Layout(border='solid'))

ui = AppLayout(header=None,
          left_sidebar=output,
          center=plot,
          right_sidebar=None,
          footer=None,
              layout=Layout(border='solid'))

compute_button.on_click(compute_button_f)

container = widgets.Box([ui], )
display(container)

Box(children=(AppLayout(children=(VBox(children=(HTML(value='<h2 style="text-align:center;">Options</h2><hr><h…

### Export Feature Selection
By running this section, a dataframe containing the list of feature selected ordered by its average score value is exported as .csv file.

In [10]:
for index, grid in enumerate(geopackages):
    print(grid)
    display(dataframes_results[grid])
    print('\n')


for grid in geopackages:
    dataframes_results[grid].to_csv(r'assets/Votes/'+RESOLUTION+grid[:-5]+'.csv', index = False)


general_fs = pd.DataFrame()
general_fs['Features'] = list(dataframes_results.values())[0]['Features']
for index, grid in enumerate(geopackages):

    dataframes_results[grid].sort_values(by ='Final Score', axis=0, ascending=False, inplace=True, kind='quicksort', na_position='last')
    labels_selected = pd.DataFrame()
    labels_selected['Features'] = dataframes_results[grid]['Features']
    labels_selected['Final Score'] = dataframes_results[grid]['Final Score'].round(decimals = 3)

    labels_selected.to_csv(r'assets/features_'+RESOLUTION+'/'+grid[:-5]+'.csv', index = False)
    labels_selected.to_excel('assets/features_'+RESOLUTION+'/'+grid[:-5]+'.xlsx')
    general_fs[index] = dataframes_results[grid]['Borda Count Voting']


general_fs['Scores'] = m.borda_voting(general_fs)
general_fs.to_csv(r'assets/features_'+RESOLUTION+'general'+'.csv', index = False)






grid_0_1_0418_0425_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
74,temp_int,0.360634,0.436961,0.292647,13.402403,0.158393,62,6,488
20,soil7,0.304011,0.372421,0.245089,12.974776,0.005446,53,6,429
13,siarl9,0.392753,0.33248,0.228504,16.743258,0.035491,60,6,470
47,n_wind,0.359384,0.388593,0.251781,20.808644,0.040621,52,6,469
52,co_s5p,0.326554,0.236672,0.175024,12.086782,0.00553,3,5,353
69,pm10_int,0.593462,0.55956,0.404221,52.600425,0.201221,16,5,459
27,soil_text4,0.498101,0.499208,0.344634,35.96261,0.014596,19,5,454
66,no2_int,0.360625,0.401111,0.258906,25.361867,0.013705,23,5,433
59,pm25_cams,0.414061,0.338,0.234843,19.585334,0.007535,38,5,432
54,nh3_cams,0.3495,0.328731,0.234575,12.395001,0.013556,34,5,414




grid_0_1_0903_0910_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
4,dsf2,0.502068,0.497923,0.355556,48.094602,0.004925,72,6,501
67,no2_int,0.318732,0.352147,0.249286,15.065729,0.015521,51,6,443
39,farms,0.251502,0.349384,0.225025,12.773995,0.005028,49,6,416
44,temp_2m,0.456138,0.583201,0.403104,28.443886,0.042602,54,6,498
59,pm10_cams,0.393754,0.430077,0.274364,29.722228,0.010975,21,5,430
52,co_s5p,0.397186,0.421476,0.27486,21.91135,0.003975,37,5,426
70,pm10_int,0.505926,0.469457,0.330726,57.148092,0.036243,20,5,453
58,dust_cams,0.661991,0.723098,0.504283,71.816573,0.504262,8,5,463
60,pm25_cams,0.438138,0.491067,0.316946,36.131965,0.025649,9,5,437
55,nh3_cams,0.383502,0.445765,0.293855,27.447507,0.008669,31,5,431




grid_0_1_1007_1014_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
46,press,0.614938,0.506711,0.381281,60.207238,0.01759,66,6,507
52,aod_055,0.56546,0.579718,0.401905,43.629922,0.004472,55,6,484
73,pm10_int,0.801932,0.799751,0.622069,223.372758,0.587835,17,5,490
57,o3_s5p,0.466252,0.426464,0.317701,20.869167,0.020827,45,5,435
44,temp_2m,0.590016,0.473392,0.359475,50.378052,0.001852,71,5,464
53,aod_047,0.597906,0.602736,0.417931,55.932649,0.001137,53,5,455
4,dsf2,0.611666,0.573753,0.422529,51.179631,0.001537,76,5,488
55,co_s5p,0.626132,0.64882,0.46798,60.940263,0.003013,11,5,452
58,nh3_cams,0.402668,0.509722,0.352512,21.976646,0.002619,42,5,414
13,siarl9,0.468295,0.559381,0.393667,31.591017,0.005804,6,5,411




grid_0_1_0717_0724_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
68,co_int,0.335901,0.365546,0.236188,15.483203,0.100485,70,6,546
0,area,0.118819,0.044823,0.025813,7.153557,0.011309,80,6,477
14,siarl12,0.200156,0.08639,0.061769,5.79107,0.0007,64,5,414
73,pm10_int,0.059862,0.048241,0.032531,0.247903,0.00785,51,5,379
45,prec,0.12059,0.041802,0.047587,7.149026,0.021502,14,5,410
65,nmvocs_cams,0.089624,0.029384,0.024197,6.397522,0.006589,42,5,413
23,soil_text,0.105003,0.179622,0.134327,3.976862,0.001056,55,5,414
27,soil_text4,0.238796,0.239502,0.163354,8.173661,0.007084,50,5,481
70,no2_int,0.151835,0.127409,0.087781,6.355806,0.00718,24,5,431
48,e_wind,0.189704,0.180136,0.12905,8.582395,0.036867,2,5,443




grid_0_1_0324_0331_2021.gpkg


Unnamed: 0,Features,Pearson,Spearmanr,Kendall,Fisher,RF Importance,RFS,Final Score,Borda Count Voting
50,aod_047,0.459829,0.492108,0.345788,24.91242,0.005507,49,6,457
43,ndvi,0.361592,0.347138,0.235516,16.916531,0.002753,65,6,428
4,dsf2,0.314746,0.273361,0.196666,11.116204,0.004802,71,6,423
46,press,0.373592,0.288058,0.210378,15.718466,0.00237,57,6,410
49,soil_moist,0.387149,0.368936,0.252991,19.252324,0.002102,51,6,417
44,temp_2m,0.320367,0.280827,0.204463,10.834156,0.002679,63,6,407
74,temp_int,0.382792,0.45051,0.30125,16.401189,0.00434,41,5,420
67,nox_int,0.450387,0.514124,0.356634,31.258617,0.007234,11,5,420
65,nh3_int,0.799003,0.741785,0.54752,228.439309,0.414947,10,5,447
13,siarl9,0.636764,0.506262,0.365009,71.570058,0.038808,16,5,438






NameError: name 'i' is not defined

## Other methods
In this sections are grouped method which are not included in the previous feature selection results. These methods are:
* Exhaustive feature selection;
* Recursive feature selection;
* Multiscale Geographically Weighted Regression (MGWR);



### MGWR bandwidth and Betas computation
bandwidths = []

for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    
    

    res = m.mgwr_beta(grid_data[index], target_variable.value, 50, geopackages[index])
    list(dataframes_results.values())[index]['MGWR Median Betas'] = m.NormalizeData(res['Betas Median'])
    bandwidths.append(res['Bandwidthds'])
    
    x = list(dataframes_results.values())[index].loc[:, list(dataframes_results.values())[index].columns != 'Features'].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    temp = pd.DataFrame(x_scaled)
    mean_results = temp.mean(axis=1)
    list(dataframes_results.values())[index]['Average Scores'] = mean_results

# MGWR Bandwidths bar plots
m.show_bars(labels_list, bandwidths, 'MGWR Bandwidths', geopackages)

res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['MGWR Median Betas']
        temp = m.NormalizeData1D(temp)
        res.append(temp)

# MGWR Median(Betas) bar plots
m.show_bars(labels_list, res, 'MGWR Median Betas', geopackages)


res = []
for grid in geopackages:
        temp = (dataframes_results[grid])['Average Scores']
        res.append(temp)

# Average scores bar plots (including mgwr results)
m.show_bars(labels_list, res, 'Average Scores', geopackages)

### Exhaustive feature selection


In [None]:
efs_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    efs_results.append(m.exhaustive_feature_selection(X, Y))

### Recursive feature selection


In [13]:
rfe_results = []
for index in range(0,len(dataframes_results)):
    X = grid_data[index].loc[:, grid_data[index].columns != target_variable.value]
    coords = list(zip(X['lat_cen'], X['lng_cen']))
    X.pop('lat_cen')
    X.pop('lng_cen')
    Y = grid_data[index][target_variable.value]
    m.recursive_feature_selection(X, Y.astype(int), 20)
    
for index, grid in enumerate(geopackages):
    rfe_results[index].sort_values(by =['Ranking'], axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
    rfe_results[index].to_csv(r'RFS'+grid[:-5]+'.csv', index = False)
    


IndexError: list index out of range

In [8]:
rfe_results

[          Features  isSelected  Ranking
 47      farm_sheep        True        1
 65        no2_cams        True        1
 73          o3_int        True        1
 74        pm10_int        True        1
 60         no_cams        True        1
 ..             ...         ...      ...
 2             dsf3       False       59
 81  wind_speed_int       False       60
 1             dsf2       False       61
 41         highway       False       62
 0         dusafSum       False       63
 
 [82 rows x 3 columns],
         Features  isSelected  Ranking
 79  rad_glob_int        True        1
 72       no2_int        True        1
 70        co_int        True        1
 63     dust_cams        True        1
 59        o3_s5p        True        1
 ..           ...         ...      ...
 43      sec_road       False       60
 1           dsf2       False       61
 0       dusafSum       False       62
 40       int_sec       False       63
 41       highway       False       64
 
 [83 rows x 