# Dimensionality reduction using correlation filter

This notebook describes a simple filter for dimensionality reduction: it computes the Spearman correlation of each
attribute with the target and keeps the $x\%$ of attributes with the greatest absolute correlations. Here, $x$ takes
the values $40$, $50$ and $60$.

The next sections show the implementation of that filter.

## Implementation

### Importing the dataset

In [1]:
from pathlib import Path

import pandas as pd

# Read the training CSV into a DataFrame
train_csv = 'datasets/covertype_norm_train.csv'
dataset = pd.read_csv(train_csv)
# Preview the first rows
dataset.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
0,-0.573753,-0.518424,-0.428658,0.436024,-0.475092,-0.979056,0.927864,0.14452,-0.534162,-0.220768,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,3
1,1.656009,-0.010549,0.868502,-0.516497,-0.280544,1.81761,0.862413,0.665801,-0.534162,2.273548,...,-0.214265,4.938531,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,7
2,0.169501,-0.799569,0.632655,0.45517,1.89191,-0.388051,0.796962,-1.245563,-1.335438,-0.687429,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,5
3,-1.205043,1.268208,1.576043,0.23499,1.648725,-0.649457,-2.933743,-0.15956,1.956291,-0.501856,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,6
4,-1.057345,0.152697,0.986425,0.134472,0.530073,-1.041945,0.404256,1.056762,-0.014415,-0.79477,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,3


### Extract the Spearman correlations

In [2]:
# Pairwise Spearman rank correlations between all columns of the dataset
# (the target column included); NaN entries are replaced by 0
correlations = dataset.corr(method='spearman').fillna(0)
# Preview the first rows of the correlation matrix
correlations.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
elevation,1.0,-0.016745,-0.303552,0.389941,0.092544,0.61113,0.029151,0.197107,0.082507,0.519928,...,0.182936,0.129073,0.030389,0.123065,0.027421,0.068329,0.314189,0.307674,0.273301,0.005367
aspect,-0.016745,1.0,0.043698,0.031944,0.072968,0.058852,-0.4226,0.398678,0.626935,-0.074828,...,0.01874,0.025531,0.014841,-0.013428,0.006475,-0.039875,0.037878,-0.039789,-0.013643,-0.008699
slope,-0.303552,0.043698,1.0,0.015481,0.329505,-0.289422,-0.013672,-0.541395,-0.291422,-0.248588,...,-0.160236,0.088168,-0.034586,-0.047623,-0.000787,0.001067,-0.159869,0.062411,-0.040356,0.097802
horiz_dist_hydro,0.389941,0.031944,0.015481,1.0,0.689235,0.137323,-0.04719,0.024537,0.047078,0.187362,...,0.14685,0.065262,0.050453,-0.014999,0.028396,-0.012943,0.077177,0.078974,0.188081,-0.048423
vert_dist_hydro,0.092544,0.072968,0.329505,0.689235,1.0,-0.048246,-0.052126,-0.154444,-0.033204,-0.009792,...,0.043237,0.089233,0.024244,-0.027403,0.014893,-0.023816,-0.039279,0.053291,0.142,0.042138


Now, take the absolute values of the correlations and sort the attributes in descending order according to them:

In [3]:
# Correlation of every attribute with the class; the class's correlation
# with itself is dropped so it cannot end up among the selected attributes
class_correlations = correlations['cover_type'].drop('cover_type')
# Rank by strength only: the sign of the correlation is irrelevant for the filter
abs_class_correlations = class_correlations.abs()
# Strongest correlations first
# (the former mid-cell `.head()` "check before sorting" was removed: only the
# last expression of a cell is rendered, so it never displayed anything)
sorted_class_corr = abs_class_correlations.sort_values(ascending=False)
# Check after sorting
sorted_class_corr.head()

soil_type_37    0.260666
soil_type_38    0.251631
wild_area_0     0.223721
soil_type_28    0.215834
soil_type_21    0.197356
Name: cover_type, dtype: float64

### Select the $x$%

In [4]:
# Selection rates: keep the x% of attributes with the greatest |correlation|
rates = [40, 50, 60]
# Candidate attributes are every column of the dataset except the target
attr_names = dataset.columns.drop('cover_type')
# One boolean row per rate. Rows are collected in a list and the DataFrame is
# built once: DataFrame.append was removed in pandas 2.0 and, where it still
# exists, copies the whole frame on every call.
selection_rows = []
for rate in rates:
    # Number of attributes to select
    qtd_attrs = round(rate / 100 * len(attr_names))
    # Names of the qtd_attrs attributes most correlated with the class
    selected_attrs = sorted_class_corr.index[:qtd_attrs]
    # True where the attribute was selected, kept in the dataset's column order
    selection_rows.append(attr_names.isin(selected_attrs))
# Row i holds the selection mask for rates[i]
filtered_attrs = pd.DataFrame(selection_rows, columns=attr_names)
# Check result
filtered_attrs.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,False,False,True,False,False,True,False,True,True,True,...,True,True,True,False,True,False,True,True,True,True
1,False,False,True,True,True,True,False,True,True,True,...,True,True,True,False,True,False,True,True,True,True
2,False,False,True,True,True,True,False,True,True,True,...,True,True,True,False,True,False,True,True,True,True


### Save the results

In [5]:
# Save the result dataset (one row of booleans per selection rate).
# to_csv does not create missing directories, so make sure the output
# folder exists before writing; index=False leaves the row index out of the file.
output_path = Path('results/filter_selected_attrs.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_attrs.to_csv(output_path, index=False)