# Feature Selection
- Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
- Improves Accuracy: Less misleading data means modeling accuracy improves.
- Reduces Training Time: Less data means that algorithms train faster.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data

In [2]:
# Column labels applied to the file; they are handed to read_csv via `names`.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
filename = "pima-indians-diabetes.data.csv"

# Load the CSV into a DataFrame using the labels above.
data = pd.read_csv(
    filename,
    names=names,
)

# Preview the top of the table (at most 20 rows).
data.head(20)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
# Size of the loaded table as a (rows, columns) tuple.
data.shape

(768, 9)

In [4]:
# Data type of each column in the loaded table.
data.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object

### Separate dataset

In [5]:
# Convert the DataFrame into a plain NumPy array for the slicing below.
# np.asarray is used rather than `.values` so this cell stays idempotent:
# on the first run it yields the frame's underlying array, and on a re-run
# (when `data` is already an ndarray, which has no `.values`) it simply
# returns the array unchanged instead of raising AttributeError.
data = np.asarray(data)

In [6]:
# Column-wise split of the array: columns 0-7 become the input matrix X,
# column 8 becomes the target vector Y.
X, Y = data[:, :8], data[:, 8]

### Univariate Selection
- Select the features that have the strongest statistical relationship with the output variable.

In [7]:
from sklearn.feature_selection import SelectKBest, chi2

# Feature selection: score each input column against the target with the
# chi-squared (chi2) test and keep the 4 highest-scoring columns.
test = SelectKBest(chi2, k=4)

# `fit` holds the result of fitting the selector; later cells read
# `fit.scores_` and call `fit.transform` on it.
fit = test.fit(X, Y)

In [9]:
# summarize scores
# set_printoptions changes NumPy's global print settings, so arrays printed
# by the cells that follow are also rendered with 3 decimal places.
np.set_printoptions(precision=3)
# Chi-squared scores computed by the fitted selector (one value per input column).
print(fit.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [11]:
# Reduce X to the columns kept by the fitted selector.
features = fit.transform(X)

# Show the first ten rows of the reduced matrix.
print(features[:10])

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]
 [116.    0.   25.6  30. ]
 [ 78.   88.   31.   26. ]
 [115.    0.   35.3  29. ]
 [197.  543.   30.5  53. ]
 [125.    0.    0.   54. ]]


### Recursive Feature Elimination
- Works by recursively removing attributes and building a model on the attributes that remain.
- Uses the fitted model's coefficients (or feature importances) to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Feature selection: recursive feature elimination wrapped around a logistic
# regression model, keeping 3 features.
# n_features_to_select is passed by keyword because current scikit-learn
# releases accept only the estimator positionally; `RFE(model, 3)` raises a
# TypeError there. The keyword form works on older releases as well.
# NOTE(review): LogisticRegression() relies on the library defaults, which
# have changed between scikit-learn versions; pin the solver explicitly if the
# exact ranking shown below must be reproduced - TODO confirm.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# Report how many features were kept, the boolean mask of kept columns, and
# the rank assigned to every column.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


### Principal Component Analysis
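- Uses linear algebra to transform the dataset into a compressed form (a data reduction technique); the resulting components are combinations of the original features rather than a subset of them.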
- can choose the number of dimensions or principal components in the transformed result

In [14]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps the first 3 principal components.
# X is passed in as-is; no scaling step is applied before the fit.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Print the share of variance captured by each component, followed by the
# component matrix (one row per component, one column per input feature).
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


### Feature Importance
- Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance of features

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

# Estimate feature importance with an ensemble of extremely randomized trees.
# random_state is fixed because the trees are built using random draws;
# without a seed the printed importances change on every run, which breaks
# Restart & Run All reproducibility.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, Y)

# One importance value per input column.
print(model.feature_importances_)

[0.111 0.22  0.099 0.077 0.073 0.153 0.12  0.146]
