In [1]:
# Import important libraries
import pandas as pd
import numpy as np

# Import the necessary sklearn libraries for feature selection
# The libraries below are used for Filter Based Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# The libraries below are used for Wrapper Based Selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [3]:
# Remote CSV holding the Pima Indians diabetes data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
# Column labels are supplied here and handed to read_csv through `names`
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
# Read the CSV straight from the URL into a DataFrame
dataframe = pd.read_csv(filepath_or_buffer=url, names=names)
# Preview the first five rows (last expression, so the notebook renders it)
dataframe.head(n=5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Print numpy arrays with 3 digits of precision from here on
np.set_printoptions(precision=3)

# Separate the inputs (columns 0-7) from the label (column 8)
array = dataframe.values
X, y = array[:, 0:8], array[:, 8]

# Filter-based selection: score every feature with chi-squared and keep the top 4
test = SelectKBest(chi2, k=4)
fit = test.fit(X, y)
# Note: SelectKBest(chi2, k=4).fit_transform(X, y) is a shortcut that returns
# the reduced feature array directly; it does not return the fitted selector,
# so it cannot be used when the scores below are needed.

# One chi-squared score per input column, in column order
print(fit.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [5]:
# Reduce X to the columns kept by the chi-squared selector.
# `test` is the SelectKBest object fitted in the previous cell (`fit` was just
# the return value of `test.fit`). It is used here instead of `fit` because the
# RFE cell further down rebinds `fit`; re-running this cell afterwards would
# otherwise transform X with the RFE selector and return the wrong columns.
features = test.transform(X)
# Summarize selected features: first five rows of the reduced matrix
print(features[0:5,:])
# Comparing with the scores printed by the previous cell, the four highest
# scoring attributes (and therefore the columns shown here) are
# plas, test, mass and age.

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [7]:
# Recursive Feature Elimination (RFE) -- a wrapper method.
# RFE fits the supplied estimator, ranks the features by the estimator's
# importance weights (coef_ for LogisticRegression), discards the `step`
# weakest features, and repeats on the remaining ones until only
# n_features_to_select are left. It does not score candidate subsets by
# model accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
# NOTE(review): coefficient magnitudes depend on feature scale and X is passed
# in unscaled here -- consider standardizing X before trusting the ranking.

# RFE wrapped around a logistic regression, keeping 3 features
model = LogisticRegression(max_iter=500)
rfe = RFE(estimator=model, n_features_to_select=3, step=1)
fit = rfe.fit(X, y)

# support_ is a boolean mask over the input columns; ranking_ gives 1 to every
# selected feature and larger numbers to features eliminated earlier.
print("\n".join([
    "Num Features: %s" % (fit.n_features_,),
    "Selected Features: %s" % (fit.support_,),
    "Feature Ranking: %s" % (fit.ranking_,),
]))

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 4 6 5 1 1 3]
