In [1]:
# Feature Selection
#   Feature selection is a process where you automatically select those features in your data that
#   contribute most to the prediction variable or output in which you are interested.
#   Irrelevant or partially relevant features can negatively impact model performance. 
#   Benefits of feature selection:
#   - Reduces overfitting
#   - Improves accuracy
#   - Reduces training time

#   Automatic feature selection techniques using scikit-learn:
#   1) Remove features of low variance
#   2) Univariate Selection
#   3) Recursive feature elimination
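#   4) Principal Component Analysis (strictly a feature-extraction technique: it builds new components rather than selecting original features)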
#   5) Feature Importance

In [2]:
#   2) Univariate Selection
#      Statistical tests can be used to select those features that have the strongest relationship with the output variable.
#      The scikit-learn library provides the "SelectKBest" class, which can be used with a suite of different statistical tests
#      to select a specific number of features.
#      For example, use the chi-squared statistical test (which requires non-negative features) to select the 4 best features from the dataset.

import numpy as np
from pandas import read_csv
from numpy import set_printoptions

# Read the diabetes CSV into a DataFrame with explicit column names:
# eight predictor columns followed by the 'Outcome' label.
filename = 'diabetes.csv'
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
# header=0 combined with names= replaces the file's own header row with the names above.
df = read_csv(filename, names=names, header=0)
array = df.values

# Columns 0-7 are the input features, column 8 is the target.
X, Y = array[:, :8], array[:, 8]

# Global NumPy print setting: floats are shown with zero decimal places from here on.
set_printoptions(precision=0)

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every feature against the target with the chi-squared test and keep the 4 best.
test = SelectKBest(chi2, k=4)
# fit() returns the fitted selector, so `fit` and `test` refer to the same object.
fit = test.fit(X, Y)

# Per-feature chi-squared scores, in the same column order as X.
print(fit.scores_)

[ 112. 1412.   18.   53. 2176.  128.    5.  181.]


In [4]:
# Preview the first five rows of the full feature matrix, rounded to whole numbers.
print(np.around(X[:5], decimals=0))

[[  6. 148.  72.  35.   0.  34.   1.  50.]
 [  1.  85.  66.  29.   0.  27.   0.  31.]
 [  8. 183.  64.   0.   0.  23.   1.  32.]
 [  1.  89.  66.  23.  94.  28.   0.  21.]
 [  0. 137.  40.  35. 168.  43.   2.  33.]]


In [5]:
# Keep only the columns chosen by the fitted selector, then preview the first five rows.
features = fit.transform(X)
print(np.around(features[:5], 1))
# The selected features are Glucose, Insulin, BMI and Age (shown in their original column order);
# ranked by chi-squared score they are Insulin, Glucose, Age and BMI.

[[148.   0.  34.  50.]
 [ 85.   0.  27.  31.]
 [183.   0.  23.  32.]
 [ 89.  94.  28.  21.]
 [137. 168.  43.  33.]]


In [6]:
# Another example using Iris Data
# Load libraries 
from sklearn.datasets import load_iris 
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
  
# Load iris data
iris_dataset = load_iris()

# Create features and target
# NOTE: this rebinds X, which earlier cells used for the diabetes feature matrix.
X = iris_dataset.data
y = iris_dataset.target

# Treat the measurements as categorical by converting them to integers.
# astype(int) drops the fractional part (e.g. 5.1 -> 5), so distinct values are merged.
X = X.astype(int)

# Select the two features with the highest chi-squared statistics.
# The selector is fitted once and then reused for the transform; calling
# fit_transform() after fit() would repeat the same fitting work.
chi2_features = SelectKBest(chi2, k=2)
fit = chi2_features.fit(X, y)  # fit() returns the selector itself; fit.scores_ holds the per-feature scores
X_kbest_features = fit.transform(X)

# Reduced features
print('Original feature number:', X.shape[1])
print('Reduced feature number:', X_kbest_features.shape[1])

Original feature number: 4
Reduced feature number: 2


In [7]:
# Display the integer-converted iris feature matrix (prints the whole array, one row per sample).
print(X)

[[5 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [4 2 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [5 4 1 0]
 [5 4 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 4 1 0]
 [5 4 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 2 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [7 3 4 1]
 [6 3 4 1]
 [6 3 4 1]
 [5 2 4 1]
 [6 2 4 1]
 [5 2 4 1]
 [6 3 4 1]
 [4 2 3 1]
 [6 2 4 1]
 [5 2 3 1]
 [5 2 3 1]
 [5 3 4 1]
 [6 2 4 1]
 [6 2 4 1]
 [5 2 3 1]
 [6 3 4 1]
 [5 3 4 1]
 [5 2 4 1]
 [6 2 4 1]
 [5 2 3 1]
 [5 3 4 1]
 [6 2 4 1]
 [6 2 4 1]
 [6 2 4 1]
 [6 2 4 1]
 [6 3 4 1]
 [6 2 4 1]
 [6 3 5 1]
 [6 2 4 1]
 [5 2 3 1]
 [5 2 3 1]
 [5 2 3 1]
 [5 2 3 1]
 [6 2 5 1]
 [5 3 4 1]
 [6 3 4 1]
 [6 3 4 1]
 [6 2 4 1]
 [5 3 4 1]
 [5 2 4 1]
 [5 2 4 1]

In [8]:
# Preview the first five rows of the two selected features
# (compare with the rows of X printed above: they match columns 2 and 3).
print(X_kbest_features[:5])

[[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]


In [9]:
# Notice that the features with the top 2 scores (133.06854839 and 74.27906977), at column
# indices 2 and 3, are the ones selected into "X_kbest_features".
# The printed scores appear rounded because of the earlier set_printoptions(precision=0).
print(fit.scores_ )

[ 10.   5. 133.  74.]
