In [1]:
# Feature Selection
#   Feature selection is a process where you automatically select those features in your data that
#   contribute most to the prediction variable or output in which you are interested.
#   Irrelevant or partially relevant features can negatively impact model performance. 
#   Benefits of feature selection:
#   - Reduces overfitting
#   - Improves accuracy
#   - Reduces training time

#   Automatic feature selection techniques using scikit-learn:
#   1) Remove features of low variance
#   2) Univariate Selection
#   3) Recursive feature elimination
#   4) Principal Component Analysis (strictly feature extraction: it builds new components rather than selecting original features)
#   5) Feature Importance

In [2]:
#   4) Principal Component Analysis
# Feature Extraction with PCA
from pandas import read_csv
from sklearn.decomposition import PCA
# --- Load the dataset --------------------------------------------------------
# Column labels are supplied explicitly through `names` when reading the CSV.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima.csv'
dataframe = read_csv(filename, names=names)

# Work on the underlying array: columns 0-7 become the inputs, column 8 the target.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# --- Feature extraction ------------------------------------------------------
# Fit a PCA that keeps three components of the eight input columns.
# NOTE(review): X is handed to PCA without any scaling step; PCA is driven by
# variance, so columns with larger numeric spread will dominate the components.
# Consider standardizing the inputs first -- confirm whether this is intended.
pca = PCA(n_components=3)
fit = pca.fit(X)  # fit() returns the fitted estimator, so `fit` refers to `pca`

# --- Summarize components ----------------------------------------------------
# Share of total variance captured by each component, a blank line, then the
# component loadings (one row per component, one column per input feature).
print("Explained Variance:\n", fit.explained_variance_ratio_, end="\n\n")
print(fit.components_)

Explained Variance:
 [0.88854663 0.06159078 0.02579012]

[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [-2.26488861e-02 -9.72210040e-01 -1.41909330e-01  5.78614699e-02
   9.46266913e-02 -4.69729766e-02 -8.16804621e-04 -1.40168181e-01]
 [-2.24649003e-02  1.43428710e-01 -9.22467192e-01 -3.07013055e-01
   2.09773019e-02 -1.32444542e-01 -6.39983017e-04 -1.25454310e-01]]
