#### sklearn.feature_selection
* feature selection 
* dimensionality reduction

#### Removing features with low variance
* Removes all features whose variance doesn't meet a given threshold

In [9]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np

In [10]:
# Toy dataset: 6 samples (rows) x 3 binary features (columns)
X = np.array(
    [
        [0, 0, 1],
        [0, 1, 0],
        [0, 0, 0],
        [1, 1, 1],
        [0, 1, 0],
        [0, 1, 1],
    ]
)

In [11]:
# First column (feature 0) across all samples
X[:, 0]

array([0, 0, 0, 1, 0, 0])

In [12]:
# Variance of feature 0 (first column of X)
X[:, 0].var()

0.13888888888888892

In [13]:
# Variance of feature 1 (second column of X)
X[:, 1].var()

0.22222222222222224

In [14]:
# Variance of feature 2 (third column of X)
X[:, 2].var()

0.25

In [15]:
# Selector configured with a variance threshold of 0.2
sel = VarianceThreshold(threshold=0.2)

In [16]:
# Fit the selector on X and return X restricted to the columns it keeps
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

#### Univariate Feature selection
* SelectKBest - removes all but k-highest scoring features
* SelectPercentile - removes all but a user-specified percentage of the highest-scoring features

In [17]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

In [18]:
# Load the iris dataset that ships with scikit-learn
iris = load_iris()

In [19]:
# Feature matrix, one row per sample.
# NOTE(review): this displays the whole array; a slice such as iris.data[:5] would be enough.
iris.data

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

In [20]:
# Names of the feature columns
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [21]:
# Names of the target classes
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

In [22]:
# Class label of every sample, encoded as integers
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [23]:
# Keep only the 2 highest-scoring features, scored against the class labels
X_new = SelectKBest(k=2).fit_transform(
    iris.data,
    iris.target,
)

In [33]:
# Show the reduced array: only the two selected columns remain
X_new

array([[ 1.4,  0.2],
       [ 1.4,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.7,  0.4],
       [ 1.4,  0.3],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.5,  0.1],
       [ 1.5,  0.2],
       [ 1.6,  0.2],
       [ 1.4,  0.1],
       [ 1.1,  0.1],
       [ 1.2,  0.2],
       [ 1.5,  0.4],
       [ 1.3,  0.4],
       [ 1.4,  0.3],
       [ 1.7,  0.3],
       [ 1.5,  0.3],
       [ 1.7,  0.2],
       [ 1.5,  0.4],
       [ 1. ,  0.2],
       [ 1.7,  0.5],
       [ 1.9,  0.2],
       [ 1.6,  0.2],
       [ 1.6,  0.4],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.6,  0.2],
       [ 1.6,  0.2],
       [ 1.5,  0.4],
       [ 1.5,  0.1],
       [ 1.4,  0.2],
       [ 1.5,  0.1],
       [ 1.2,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.1],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.3,  0.3],
       [ 1.3,  0.3],
       [ 1.3,  0.2],
       [ 1.6,  0.6],
       [ 1.9,  0.4],
       [ 1.4,  0.3],
       [ 1.6,  0.2],
       [ 1.4,

#### Feature selection using SelectFromModel
* Machine learning models can estimate the importance of each feature
* SelectFromModel uses those importances to drop features that might not be required

In [36]:
from sklearn.svm import LinearSVC

In [37]:
from sklearn.feature_selection import SelectFromModel

In [38]:
# Load the iris dataset again; this rebinds the existing `iris` name
iris = load_iris()

In [39]:
# Feature matrix and class labels (X rebinds the toy array defined earlier)
X, y = iris.data, iris.target

In [40]:
# The same feature-selection approach can be used with a RandomForest as well.
# random_state is fixed so that the fitted model, and therefore the features
# selected from it, are reproducible across kernel restarts.
lsvc = LinearSVC(random_state=0).fit(X, y)

In [44]:
# Wrap the LinearSVC fitted above; prefit=True tells SelectFromModel it is already fitted
model = SelectFromModel(estimator=lsvc, prefit=True)

In [45]:
# Reduce X to the columns selected by the model
model.transform(X)

array([[ 3.5,  1.4,  0.2],
       [ 3. ,  1.4,  0.2],
       [ 3.2,  1.3,  0.2],
       [ 3.1,  1.5,  0.2],
       [ 3.6,  1.4,  0.2],
       [ 3.9,  1.7,  0.4],
       [ 3.4,  1.4,  0.3],
       [ 3.4,  1.5,  0.2],
       [ 2.9,  1.4,  0.2],
       [ 3.1,  1.5,  0.1],
       [ 3.7,  1.5,  0.2],
       [ 3.4,  1.6,  0.2],
       [ 3. ,  1.4,  0.1],
       [ 3. ,  1.1,  0.1],
       [ 4. ,  1.2,  0.2],
       [ 4.4,  1.5,  0.4],
       [ 3.9,  1.3,  0.4],
       [ 3.5,  1.4,  0.3],
       [ 3.8,  1.7,  0.3],
       [ 3.8,  1.5,  0.3],
       [ 3.4,  1.7,  0.2],
       [ 3.7,  1.5,  0.4],
       [ 3.6,  1. ,  0.2],
       [ 3.3,  1.7,  0.5],
       [ 3.4,  1.9,  0.2],
       [ 3. ,  1.6,  0.2],
       [ 3.4,  1.6,  0.4],
       [ 3.5,  1.5,  0.2],
       [ 3.4,  1.4,  0.2],
       [ 3.2,  1.6,  0.2],
       [ 3.1,  1.6,  0.2],
       [ 3.4,  1.5,  0.4],
       [ 4.1,  1.5,  0.1],
       [ 4.2,  1.4,  0.2],
       [ 3.1,  1.5,  0.1],
       [ 3.2,  1.2,  0.2],
       [ 3.5,  1.3,  0.2],
 

### SelectKBest score functions
* regression - f_regression, mutual_info_regression
* classification - chi2, f_classif, mutual_info_classif
* chi2 - requires non-negative features (e.g. counts, booleans or encoded categorical values)

In [24]:
# Print the built-in documentation of SelectKBest (parameters, attributes, notes)
help(SelectKBest)

Help on class SelectKBest in module sklearn.feature_selection.univariate_selection:

class SelectKBest(_BaseFilter)
 |  Select features according to the k highest scores.
 |  
 |  Read more in the :ref:`User Guide <univariate_feature_selection>`.
 |  
 |  Parameters
 |  ----------
 |  score_func : callable
 |      Function taking two arrays X and y, and returning a pair of arrays
 |      (scores, pvalues) or a single array with scores.
 |      Default is f_classif (see below "See also"). The default function only
 |      works with classification tasks.
 |  
 |  k : int or "all", optional, default=10
 |      Number of top features to select.
 |      The "all" option bypasses selection, for use in a parameter search.
 |  
 |  Attributes
 |  ----------
 |  scores_ : array-like, shape=(n_features,)
 |      Scores of features.
 |  
 |  pvalues_ : array-like, shape=(n_features,)
 |      p-values of feature scores, None if `score_func` returned only scores.
 |  
 |  Notes
 |  -----
 |  Ties 

### PCA

In [3]:
from sklearn.decomposition import PCA

In [4]:
from sklearn.datasets import load_iris

In [5]:
# Load the iris dataset used for the PCA example
iris = load_iris()

In [8]:
# Preview the first five rows of the feature matrix
iris.data[:5]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [11]:
# PCA configured to keep 4 components.
# NOTE(review): the preview of iris.data shows 4 columns, so n_components=4 keeps
# every component; use a smaller value to actually reduce dimensionality.
pca = PCA(n_components=4)

In [12]:
# Fit PCA on the iris features and return the samples projected onto the components
pca.fit_transform(iris.data)

array([[ -2.68420713e+00,   3.26607315e-01,  -2.15118370e-02,
          1.00615724e-03],
       [ -2.71539062e+00,  -1.69556848e-01,  -2.03521425e-01,
          9.96024240e-02],
       [ -2.88981954e+00,  -1.37345610e-01,   2.47092410e-02,
          1.93045428e-02],
       [ -2.74643720e+00,  -3.11124316e-01,   3.76719753e-02,
         -7.59552741e-02],
       [ -2.72859298e+00,   3.33924564e-01,   9.62296998e-02,
         -6.31287327e-02],
       [ -2.27989736e+00,   7.47782713e-01,   1.74325619e-01,
         -2.71468037e-02],
       [ -2.82089068e+00,  -8.21045110e-02,   2.64251085e-01,
         -5.00996251e-02],
       [ -2.62648199e+00,   1.70405349e-01,  -1.58015103e-02,
         -4.62817610e-02],
       [ -2.88795857e+00,  -5.70798026e-01,   2.73354061e-02,
         -2.66154143e-02],
       [ -2.67384469e+00,  -1.06691704e-01,  -1.91533300e-01,
         -5.58909660e-02],
       [ -2.50652679e+00,   6.51935014e-01,  -6.92749958e-02,
         -1.66082478e-02],
       [ -2.61314272e

In [15]:
# Integer class labels of the samples
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])