In [1]:
import pandas as pd

# Read the normalized cover-type training set into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='datasets/covertype_norm_train.csv')
# Preview the first five rows
dataset.head(5)

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
0,-1.929805,1.477831,1.116461,-0.948019,-0.487945,-1.219708,-2.28235,-0.643397,1.322887,-0.934126,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,3
1,1.644997,1.640937,-0.184168,1.692577,0.475531,0.161461,-0.87414,-0.07569,0.761317,-0.947689,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,4.580746,-0.174891,7
2,1.404774,-0.642539,0.998222,-0.182629,-0.096023,0.350281,1.123554,-1.298443,-1.679356,0.476446,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,4.580746,-0.174891,7
3,-0.412357,1.668121,-0.302407,-1.091529,-0.830878,-0.281626,-0.710395,0.01165,0.69652,3.189085,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,2
4,0.429612,1.713428,-1.248319,-0.316572,-0.504275,2.519079,-0.120911,0.535686,0.545328,0.169918,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,5


In [2]:
# Pairwise Spearman (rank) correlation between all columns; any NaN entry
# in the resulting matrix is replaced by 0
correlations = dataset.corr(method='spearman').fillna(0)
# Preview the first rows of the correlation matrix
correlations.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
elevation,1.0,-0.016384,-0.303229,0.389676,0.089603,0.606557,0.030053,0.196423,0.07874,0.514144,...,0.179987,0.129674,0.037665,0.117644,0.023706,0.064193,0.320187,0.305118,0.276427,0.007626
aspect,-0.016384,1.0,0.0502,0.041931,0.082135,0.048015,-0.420868,0.396251,0.623822,-0.082601,...,0.012814,0.023966,0.014129,-0.009395,0.006179,-0.040254,0.037581,-0.039046,-0.013243,-0.003919
slope,-0.303229,0.0502,1.0,0.015822,0.32925,-0.288155,-0.008505,-0.543268,-0.291258,-0.245658,...,-0.156827,0.088366,-0.03554,-0.045205,-0.001725,0.002807,-0.158889,0.065222,-0.026764,0.108354
horiz_dist_hydro,0.389676,0.041931,0.015822,1.0,0.686035,0.135994,-0.043684,0.031208,0.048538,0.185827,...,0.146646,0.067165,0.055272,-0.019414,0.024555,-0.017001,0.094243,0.073317,0.18803,-0.03735
vert_dist_hydro,0.089603,0.082135,0.32925,0.686035,1.0,-0.049081,-0.054949,-0.146757,-0.025431,-0.008971,...,0.044027,0.082122,0.016534,-0.032277,0.012872,-0.029646,-0.023454,0.053057,0.148865,0.060277


In [3]:
# Correlation of every attribute with the class column; the class's own
# entry is dropped so that only the attributes are ranked
class_correlations = correlations['cover_type'].drop('cover_type')
# Rank by magnitude only: the sign of the correlation is discarded
abs_class_correlations = class_correlations.abs()
# Strongest correlations first
sorted_class_corr = abs_class_correlations.sort_values(ascending=False)
# Check the top-ranked attributes (last expression, so it is the cell output)
sorted_class_corr.head()

soil_type_37    0.260195
soil_type_38    0.257571
wild_area_0     0.233420
soil_type_28    0.214971
soil_type_21    0.191335
Name: cover_type, dtype: float64

In [4]:
# Selection rates: keep the x% of attributes with the greatest correlations
rates = [40, 50, 60]
# Candidate attributes: every dataset column except the class
feature_columns = dataset.columns.drop('cover_type')

# One boolean mask per rate, collected in a list so the result frame is built
# in a single step (DataFrame.append no longer exists in pandas >= 2.0)
selection_rows = []
for rate in rates:
    # Number of attributes to select at this rate
    qtd_attrs = round(rate / 100 * len(feature_columns))
    # Names of the qtd_attrs attributes with the greatest correlations
    # (sorted_class_corr is ordered from strongest to weakest)
    selected_attrs = sorted_class_corr.index[:qtd_attrs]
    # Mask aligned with feature_columns: True where the attribute is selected
    selection_rows.append(feature_columns.isin(selected_attrs))

# Rows follow the order of `rates` (index 0..len(rates)-1), columns are the attributes
filtered_attrs = pd.DataFrame(selection_rows, columns=feature_columns)
# Check result
filtered_attrs.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,False,False,True,False,True,True,False,True,False,True,...,True,True,True,False,True,False,True,True,True,True
1,False,False,True,True,True,True,False,True,True,True,...,True,True,True,False,True,False,True,True,True,True
2,False,False,True,True,True,True,False,True,True,True,...,True,True,True,False,True,False,True,True,True,True


In [5]:
# Persist the selection masks as CSV; the row index is not written to the file
filtered_attrs.to_csv(
    path_or_buf='results/filter_selected_attrs.csv',
    index=False,
)