In [4]:
# Univariate Selection

from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, f_classif

# Read the Pima Indians diabetes CSV; column names are supplied via `names`.
filename = 'pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi',
         'age', 'class']
dataframe = read_csv(filename, names=names)

# Split the raw values: every column but the last is an input feature,
# the last column ('class') is the target.
array = dataframe.values
X, y = array[:, :-1], array[:, -1]

# Feature extraction: score each input against the target with f_classif
# and keep the k=4 best-scoring columns.
test = SelectKBest(score_func=f_classif, k=4)
fit = test.fit(X, y)

# Summarize the per-feature scores (3 decimal places for all numpy printing
# from here on, since set_printoptions is global).
set_printoptions(precision=3)
print(fit.scores_)

# Reduce X to the selected columns and preview the first five rows.
features = fit.transform(X)
print(features[:5])

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]
[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


In [6]:
# Recursive Feature Elimination

from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Data: reuses X and y created by the "Univariate Selection" cell, so that
# cell must be run first on a fresh kernel.

# Feature extraction: recursively eliminate features, using a logistic
# regression as the estimator, until 3 features remain.
model = LogisticRegression(solver='liblinear')
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

# Summarize the selection. The "Num Features" line is restored so the cell
# reproduces the output recorded beneath it.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


In [7]:
# Principal Component Analysis

from pandas import read_csv
from sklearn.decomposition import PCA

# Data: X comes from the "Univariate Selection" cell; run that cell first.

# Feature extraction: fit a PCA that keeps the first 3 principal components.
# Only X is used here; the target y plays no part in the fit.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Summarize: the explained-variance ratio of each kept component, followed by
# the component matrix itself (one row per component).
variance_ratio = fit.explained_variance_ratio_
print("Explained Variance: %s" % variance_ratio)

components = fit.components_
print(components)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


In [9]:
# Feature Importance with Extra Trees Classifier

# Bagged decision trees like Random Forest and Extra Trees can be used to 
# estimate the importance of features.

from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# Data: reuses X and y created by the "Univariate Selection" cell, so that
# cell must be run first on a fresh kernel.

# Feature extraction: fit an ensemble of 100 extremely randomized trees and
# read off the importance it assigns to each input column.
# random_state is fixed because the ensemble is randomized; without a seed the
# printed importances differ on every run and the cell is not reproducible.
model = ExtraTreesClassifier(n_estimators=100, random_state=7)
model.fit(X, y)

# One importance value per column of X, in column order.
print(model.feature_importances_)

[0.11  0.237 0.098 0.08  0.075 0.14  0.12  0.14 ]
