In [1]:
# Feature Selection
#   Feature selection is a process where you automatically select those features in your data that
#   contribute most to the prediction variable or output in which you are interested.
#   Irrelevant or partially relevant features can negatively impact model performance. 
#   Benefits of feature selection:
#   - Reduces overfitting
#   - Improves accuracy
#   - Reduces training time

#   Automatic feature selection techniques using scikit-learn:
#   1) Remove features of low variance
#   2) Univariate Selection
#   3) Recursive feature elimination
#   4) Principal Component Analysis (strictly a dimensionality-reduction / feature-extraction technique rather than selection)
#   5) Feature Importance

In [2]:
#   1) Remove features of low variance
#      VarianceThreshold by default removes features with zero variance, i.e. features that have the same value in every sample

from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Toy data: 5 samples (rows) x 4 features (columns).
# The third column holds the value 3 in every row, so its variance is zero
# and it is the one feature the selector drops.
X = [
    [1, 2, 3, 4],
    [2, 3, 3, 5],
    [3, 4, 3, 6],
    [4, 5, 3, 7],
    [5, 6, 3, 8],
]
# threshold=0.0 is the VarianceThreshold default, written out for clarity.
sel = VarianceThreshold(threshold=0.0)
sel.fit_transform(X)

array([[1, 2, 4],
       [2, 3, 5],
       [3, 4, 6],
       [4, 5, 7],
       [5, 6, 8]])

In [3]:
# Suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero
# in more than 80% of the samples. Boolean features are Bernoulli random variables, and the variance of
# such variables is given by Var[X] = p(1 - p), so the cut-off is .8 * (1 - .8) = 0.16.

# Use a dedicated boolean dataset here: the integer matrix X from the previous cell is not made of
# Bernoulli variables, so the p(1 - p) threshold would not carry its intended meaning for it.
# The 1st feature is 0 in 5 of the 6 samples (~83% > 80%); its variance is (1/6) * (5/6) ~= 0.139 < 0.16,
# so it is removed. The other two features have variances ~0.222 and 0.25 and are kept.
X_bool = [[0, 0, 1],
          [0, 1, 0],
          [1, 0, 0],
          [0, 1, 1],
          [0, 1, 0],
          [0, 1, 1]]
threshold = (.8 * (1 - .8))
print('Threshold = ', threshold)
sel = VarianceThreshold(threshold=threshold)
sel.fit_transform(X_bool)

Threshold =  0.15999999999999998


array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [4]:
# To remove features that are either one or zero in more than 60% of samples (Var[X] = .4 * (1 - .4) = .6 * (1 - .6) = 0.24)
#threshold = (.4 * (1 - .4))
#print('Threshold = ', threshold)
#sel = VarianceThreshold(threshold)
#sel.fit_transform(X)