In [24]:
# Feature Selection
#   Feature selection is a process where you automatically select those features in your data that
#   contribute most to the prediction variable or output in which you are interested.
#   Irrelevant or partially relevant features can negatively impact model performance. 
#   Benefits of feature selection:
#   - Reduces overfitting
#   - Improves accuracy
#   - Reduces training time

#   Automatic feature selection techniques using scikit-learn:
#   1) Remove features of low variance
#   2) Univariate Selection
#   3) Recursive feature elimination
#   4) Principal Component Analysis (strictly feature extraction: it builds new components instead of selecting original features)
#   5) Feature Importance

In [25]:
# 3) Recursive feature elimination
# Feature selection with RFE (Recursive Feature Elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load data.
# header=0 tells read_csv that the first row of the file is a header row;
# names= then replaces those header labels with the column names listed here.
filename = 'diabetes.csv'
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
dataframe = read_csv(filename, header=0, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns: the candidate features
Y = array[:, 8]    # ninth column ('Outcome'): the prediction target

# Feature selection with RFE, using logistic regression as the estimator
# whose fitted model drives the elimination.
# n_features_to_select must be passed by keyword: scikit-learn deprecated the
# positional form in 0.23 and rejects it (TypeError) from 1.0 onwards.
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# Show how many features were kept
print("Num Features: ", fit.n_features_)

# Show which features are selected.
# support_ is a boolean mask in the same order as the feature columns of X.
# In the recorded run:
#   Selected features  - Pregnancies, BMI, DiabetesPedigreeFunction
#   Discarded features - Glucose, BloodPressure, SkinThickness, Insulin, Age
print("Selected Features: ", fit.support_)

# Show the feature ranking.
# ranking_ is in the same column order as X; rank 1 marks the selected features.
# In the recorded run:
#   1 - Pregnancies, BMI, DiabetesPedigreeFunction
#   2 - Glucose
#   3 - Age
#   4 - BloodPressure
#   5 - SkinThickness
#   6 - Insulin
print("Feature Ranking: ", fit.ranking_)

Num Features:  3
Selected Features:  [ True False False False False  True  True False]
Feature Ranking:  [1 2 4 5 6 1 1 3]


