In Chapter 9, we discussed how to reduce the dimensionality of our feature matrix by creating new features with (ideally) similar ability to train quality models but with significantly fewer dimensions. This is called feature extraction. In this chapter we will cover an alternative approach: selecting high-quality, informative features and dropping less useful features. This is called feature selection.


One problem we often run into in machine learning is highly correlated features. If two features are highly correlated, then the information they contain is very similar, and it is likely redundant to include both features. The solution to highly correlated features is simple: remove one of them from the feature set.



In [53]:
from sklearn import datasets, linear_model
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFECV
from sklearn.feature_selection import chi2, f_classif
import warnings
from sklearn.datasets import make_regression


In [41]:
# Load the iris dataset once; the feature matrix and target vector are shared
# by the feature-selection cells below.
iris_data_global = datasets.load_iris()
iris_features_global, iris_targets_global = (
    iris_data_global.data,
    iris_data_global.target,
)


In [42]:
# Problem: you have a set of numerical features and want to remove those with
# low variance (i.e., likely containing little information).

# Learn each feature's variance, then keep only the features above the 0.5 threshold
thresholder = VarianceThreshold(threshold=0.5)
thresholder.fit(iris_features_global)
features_high_variance = thresholder.transform(iris_features_global)

# View the first ten rows of the high variance feature matrix
features_high_variance[:10]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1]])

In [43]:
# Use a correlation matrix to check for highly correlated features.
# If highly correlated features exist, consider dropping one of each pair.

# Create feature matrix with two highly correlated features (columns 0 and 1)
corr_features = np.array([[1, 1, 1],
                          [2, 2, 0],
                          [3, 3, 1],
                          [4, 4, 0],
                          [5, 5, 1],
                          [6, 6, 0],
                          [7, 7, 1],
                          [8, 7, 0],
                          [9, 7, 1]])

# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(corr_features)

# Create correlation matrix; take absolute values because a strong negative
# correlation is just as redundant as a strong positive one
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair of features is considered once and no feature is compared with itself.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Find labels of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop the flagged columns by label (here: column 1) and preview the result.
# Dropping by label stays correct even when labels differ from positions.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [61]:
# Problem: you have a categorical target vector and want to remove
# uninformative features.

# Convert to categorical data by converting the feature values to integers
skb_features = iris_features_global.astype(int)

# Keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
chi2_selector.fit(skb_features, iris_targets_global)
features_kbest = chi2_selector.transform(skb_features)

print("Original number of features:", skb_features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

# Alternative: instead of a fixed k, keep the top 75 percent of features,
# this time scored with the ANOVA F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
fvalue_selector.fit(skb_features, iris_targets_global)
features_kbest = fvalue_selector.transform(skb_features)

print("Original number of features:", skb_features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2
Original number of features: 4
Reduced number of features: 3


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [62]:
# Problem: you want to automatically select the best features to keep.
# Use scikit-learn's RFECV to conduct recursive feature elimination (RFE)
# using cross-validation (CV): repeatedly train a model, each time removing
# a feature, until model performance (e.g., accuracy) becomes worse.

# Suppress an annoying but harmless warning
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd",
)

# Generate the features matrix and target vector
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Linear regression is the estimator refit during elimination
ols = linear_model.LinearRegression()

# Recursively eliminate features one at a time (step=1), scoring each
# candidate feature set by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Reduce the feature matrix to the selected features
rfecv.transform(features)

array([[ 0.00850799,  1.74709864,  1.04317962,  0.7031277 , -0.11351748],
       [-1.07500204,  0.91013086,  0.59106459,  2.56148527,  1.22873415],
       [ 1.37940721,  0.4256341 , -0.17219685, -1.77039484, -0.10640419],
       ...,
       [-0.80331656,  1.4399392 ,  0.52178503, -1.60648007,  1.76891426],
       [ 0.39508844, -1.36091146,  1.66201389, -1.34564911, -1.46393843],
       [-0.55383035, -0.91100345,  1.01945291,  0.82880112,  2.80989138]])

In [63]:
# Number of features kept by the fitted RFECV selector
rfecv.n_features_

5

In [64]:
# Boolean mask over the original features: True marks a feature that was kept
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [65]:
# Rank of every original feature; the kept features are the ones ranked 1
rfecv.ranking_

array([55, 83, 35, 80, 49,  1, 38,  8, 50, 59, 54,  1, 26, 90, 44, 68, 58,
       22, 94, 70, 53, 79, 82, 88, 37, 84, 27, 61, 11, 81,  9,  1, 89, 13,
       36,  7, 66, 12, 43,  1, 40, 51, 93, 76, 63, 48, 78, 87, 28, 85, 39,
       14, 47, 41, 77, 92, 31, 91, 73, 65, 33, 17,  1,  6, 42, 24, 46, 67,
        2, 56,  5, 34, 64, 23, 74, 25, 69,  3, 15, 29, 71, 60, 32, 86, 16,
       95, 30, 75,  4, 21, 19, 72, 52, 20, 10, 57, 62, 96, 45, 18])