#### sklearn.feature_selection
* feature selection 
* dimensionality reduction

#### Removing features with low variance
* Removes all features whose variance doesn't meet a given threshold

In [7]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np

In [16]:
# Toy dataset: six samples (rows) by three binary features (columns).
X = np.array(
    [
        [0, 0, 1],
        [0, 1, 0],
        [0, 0, 0],
        [1, 1, 1],
        [0, 1, 0],
        [0, 1, 1],
    ]
)

In [17]:
# First feature column: it is 1 for only one of the six samples.
X[:, 0]

array([0, 0, 0, 1, 0, 0])

In [18]:
# Variance of feature 0 (the column that is almost always 0).
X[:, 0].var()

0.13888888888888892

In [19]:
# Variance of feature 1.
X[:, 1].var()

0.22222222222222224

In [20]:
# Variance of feature 2.
X[:, 2].var()

0.25

In [21]:
# Selector that removes features whose variance is lower than 0.2.
sel = VarianceThreshold(threshold=0.2)

In [22]:
# Fit the selector on X and return X without the low-variance columns.
# Feature 0 (variance ~0.139) is below the 0.2 threshold, so two columns remain.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

#### Univariate Feature selection
* SelectKBest - removes all but k-highest scoring features
* SelectPercentile - removes all but a user-specified percentage of the highest-scoring features

In [30]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

In [24]:
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

In [26]:
# Preview only the first five samples; dumping the full array buries the
# narrative (columns are the four measurements listed in iris.feature_names).
iris.data[:5]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [27]:
# Names of the four measured features.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [28]:
# Names of the three classes (iris species).
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

In [29]:
# Integer class label (0, 1 or 2) of every sample.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [32]:
# Score every feature against the class labels and keep the two best.
# No score_func is passed, so SelectKBest uses its default (f_classif).
X_new = SelectKBest(k=2).fit_transform(X=iris.data, y=iris.target)

In [33]:
# Preview the first five rows of the reduced matrix instead of the whole array.
# The two surviving columns match the last two columns of iris.data
# (petal length and petal width).
X_new[:5]

array([[ 1.4,  0.2],
       [ 1.4,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.4,  0.2]])

#### Feature selection using SelectFromModel
* Machine learning models can estimate the importance of each feature
* Features the model considers unimportant can then be dropped

In [36]:
from sklearn.svm import LinearSVC

In [37]:
from sklearn.feature_selection import SelectFromModel

In [38]:
# NOTE: iris was already loaded earlier in this notebook; this repeats that call.
iris = load_iris()

In [39]:
# Feature matrix and class labels for the model-based selector.
# Note: X now refers to the iris data, replacing the toy array defined earlier.
X, y = iris.data, iris.target

In [40]:
# Fit a linear SVM; SelectFromModel will use its learned coefficients as
# feature importances. random_state is pinned so the fit is reproducible
# when the notebook is restarted and re-run.
# Tree ensembles such as RandomForestClassifier can be used the same way.
lsvc = LinearSVC(random_state=0).fit(X, y)

In [44]:
# Wrap the already-fitted SVM; prefit=True tells the selector not to train it again.
model = SelectFromModel(estimator=lsvc, prefit=True)

In [45]:
# Show only the first five rows; the selector kept three of the four features.
model.transform(X)[:5]

array([[ 3.5,  1.4,  0.2],
       [ 3. ,  1.4,  0.2],
       [ 3.2,  1.3,  0.2],
       [ 3.1,  1.5,  0.2],
       [ 3.6,  1.4,  0.2]])

### SelectKBest score functions
* regression - f_regression, mutual_info_regression
* classification - chi2, f_classif, mutual_info_classif
* chi2 - requires non-negative features (e.g. counts or encoded categorical features)