In [1]:
### --- VERY IMPORTANT --- ###

# WHILE REVIEWING, CHECK OUT BELOW LINKS AND MAKE ADDITIONS HERE:
# http://scikit-learn.org/stable/modules/feature_selection.html#feature-selection
# http://scikit-learn.org/stable/modules/decomposition.html#pca
# http://scikit-learn.org/stable/modules/ensemble.html#forest (for 8.5 feature importance)

In [2]:
# 8.1 Feature Selection

In [3]:
# - Feature selection is a process where you automatically select those features in your data that 
# contribute most to the prediction variable or output in which you are interested. 

# - Having irrelevant features in your data can decrease the accuracy of many models, especially 
# linear algorithms like linear and logistic regression. 

# - Three benefits of performing feature selection before modeling your data are:
    # - Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
    # - Improves Accuracy: Less misleading data means modeling accuracy improves.
    # - Reduces Training Time: Less data means that algorithms train faster.

In [4]:
from pandas import read_csv

In [5]:
import numpy

In [6]:
import sys

In [7]:
def print_data(_data):
    return numpy.savetxt(sys.stdout, _data[:5,:], '%5.3f')

In [8]:
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [9]:
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [10]:
_dataframe = read_csv(_uri, names=_col_names)

In [11]:
_array = _dataframe.values

In [12]:
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [13]:
_X = _array[:,0:8]

In [14]:
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [15]:
_Y = _array[:,8:]

In [16]:
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [17]:
# 8.2 Univariate Selection

In [18]:
# - Statistical tests can be used to select those features that have the strongest relationship 
# with the output variable.

# - The example below uses the chi-squared (chi2) statistical test for non-negative features to 
# select 4 of the best features from the Pima Indians onset of diabetes dataset.

In [19]:
from sklearn.feature_selection import SelectKBest

In [20]:
from sklearn.feature_selection import chi2

In [21]:
_select = SelectKBest(score_func=chi2, k=4)

In [22]:
_fit = _select.fit(_X,_Y)

In [23]:
_features = _fit.transform(_X)

In [24]:
_dataframe.head(5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [25]:
_fit.scores_

array([  111.51969064,  1411.88704064,    17.60537322,    53.10803984,
        2175.56527292,   127.66934333,     5.39268155,   181.30368904])

In [26]:
print_data(_features)

148.000 0.000 33.600 50.000
85.000 0.000 26.600 31.000
183.000 0.000 23.300 32.000
89.000 94.000 28.100 21.000
137.000 168.000 43.100 33.000


In [27]:
# Looking at the dataframe head(5) records and considering the highest scores of _fit
# we get 4 features: plas, test, mass, and age

In [28]:
# 8.3 Recursive Feature Elimination

In [29]:
# - The Recursive Feature Elimination (or RFE) works by recursively removing attributes and building a 
# model on those attributes that remain. 

# - It uses the model accuracy to identify which attributes (and combination of attributes) contribute 
# the most to predicting the target attribute.

# - The choice of algorithm does not matter too much as long as it is skillful and consistent.

In [30]:
from sklearn.feature_selection import RFE

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
_model = LogisticRegression()

In [33]:
_rfe = RFE(_model, 3)

In [34]:
# To avoid warning: 
# DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
# Please change the shape of y to (n_samples, ), for example using ravel().
# y = column_or_1d(y, warn=True)

from sklearn.utils import column_or_1d
_y = numpy.ravel(_Y)
column_or_1d(_y[:5], warn=True)

array([ 1.,  0.,  1.,  0.,  1.])

In [35]:
_fit = _rfe.fit(_X, _y)

In [36]:
_fit.n_features_

3

In [37]:
_fit.support_

array([ True, False, False, False, False,  True,  True, False], dtype=bool)

In [38]:
_fit.ranking_

array([1, 2, 3, 5, 6, 1, 1, 4])

In [39]:
_dataframe.head(1)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1


In [40]:
# features chosen = preg, mass, pedi

In [41]:
# Principal Component Analysis

In [42]:
# - Principal Component Analysis (or PCA) uses linear algebra to transform the dataset into a 
# compressed form. 

# - Generally this is called a data reduction technique. 

# - A property of PCA is that you can choose the number of dimensions or principal components 
# in the transformed result.

In [43]:
from sklearn.decomposition import PCA

In [44]:
_pca = PCA(n_components=3)

In [45]:
_fit = _pca.fit(_X)

In [46]:
_fit.explained_variance_ratio_

array([ 0.88854663,  0.06159078,  0.02579012])

In [47]:
_fit.explained_variance_

array([ 13439.05140161,    931.54560089,    390.06926626])

In [48]:
print_data(_fit.components_)

-0.002 0.098 0.016 0.061 0.993 0.014 0.001 -0.004
-0.023 -0.972 -0.142 0.058 0.095 -0.047 -0.001 -0.140
-0.022 0.143 -0.922 -0.307 0.021 -0.132 -0.001 -0.125


In [49]:
# - You can see that the transformed dataset (3 principal components) bare little resemblance to the source data.

In [50]:
# 8.5 Feature Importance

In [51]:
# - Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance of features

In [52]:
from sklearn.ensemble import ExtraTreesClassifier

In [53]:
_model = ExtraTreesClassifier()

In [54]:
_model.fit(_X,_y) # using _y instead of _Y to avoid columnar warning as above

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [55]:
_model.feature_importances_

array([ 0.1104795 ,  0.22500665,  0.1089014 ,  0.07569528,  0.06823946,
        0.15209357,  0.12306662,  0.13651752])

In [56]:
_dataframe.head(1)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1


In [57]:
# - You can see that we are given an importance score for each attribute where the larger the score, 
# the more important the attribute. 

# - The scores suggest at the importance of plas, mass and age.