In [1]:
# pip install mlxtend 

-----------------------------
#### Wrapper - backward method
--------------------------

In [9]:
import pandas as pd
import numpy as np

from mlxtend.feature_selection import SequentialFeatureSelector as sfs

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [14]:
# read data

In [2]:
location = r"D:\AI-DATASETS\01-MISC\winequality-white.csv"

In [3]:
# load the training data from data set
df = pd.read_csv(location, sep=',')

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df.values[:,  : -1],
    df.values[:,-1:   ],
    test_size=0.25,
    random_state=42)

In [6]:
y_train = y_train.ravel()
y_test  = y_test.ravel()

In [7]:
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


In [21]:
# Next, we will define a classifier, as well as a step forward feature selector, 
# and then perform our feature selection. 

# The feature feature selector in mlxtend has some parameters we can define, 
# so here's how we will proceed:

In [22]:
# First, we pass our classifier, the Random Forest classifier defined above the feature selector
# Next, we define the subset of features we are looking to select (k_features=5)
# We then set floating to False; see the documentation for more info on floating:

# We set he desired level of verbosity for mlxtend to report
# Importantly, we set our scoring to accuracy; this is but one metric which could be used 
# to score our resulting models built on the selected features
# mlxtend feature selector uses cross validation internally, and we set our desired folds to 
# 5 for our demonstration

In [15]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

In [16]:
# Build step forward feature selection
sfs1 = sfs(clf,
           k_features=3,
           forward   =False,
           floating  =False,
           verbose   =3,
           scoring   ='accuracy',
           cv        =5)

In [17]:
%%time
# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:  1.5min finished

[2024-01-19 08:33:54] Features: 10/3 -- score: 0.6572221913288476[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   16.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.4min finished

[2024-01-19 08:35:19] Features: 9/3 -- score: 0.6550442084190624[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.1s remaining:    0.0s
[Paral

CPU times: total: 1min 14s
Wall time: 7min 19s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   20.8s finished

[2024-01-19 08:39:34] Features: 3/3 -- score: 0.6093036015496118

In [26]:
# Our best performing model, given our scoring metric, is some subset of 5 features, 
# with a score of 0.638  

# But which subset of 5 features were selected?

In [18]:
# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[1, 3, 7]


In [19]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [20]:
cols = df.columns

In [21]:
for feature_selected in feat_cols:
    print(cols[feature_selected-1])

fixed acidity
citric acid
total sulfur dioxide


In [22]:
sfs1.k_feature_names_

('1', '3', '7')

In [23]:
# use subsets_ attribute, we can take a look at the selected feature indices 
sfs1.subsets_

{11: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  'cv_scores': array([0.65306122, 0.68027211, 0.65034014, 0.62942779, 0.65122616]),
  'avg_score': 0.6528654840682867,
  'feature_names': ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10')},
 10: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
  'cv_scores': array([0.67210884, 0.66666667, 0.65578231, 0.63215259, 0.65940054]),
  'avg_score': 0.6572221913288476,
  'feature_names': ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')},
 9: {'feature_idx': (0, 1, 3, 4, 5, 6, 7, 8, 9),
  'cv_scores': array([0.66530612, 0.67755102, 0.64489796, 0.63896458, 0.64850136]),
  'avg_score': 0.6550442084190624,
  'feature_names': ('0', '1', '3', '4', '5', '6', '7', '8', '9')},
 8: {'feature_idx': (0, 1, 3, 4, 5, 7, 8, 9),
  'cv_scores': array([0.66938776, 0.67346939, 0.64761905, 0.626703  , 0.64168937]),
  'avg_score': 0.6517737122096795,
  'feature_names': ('0', '1', '3', '4', '5', '7', '8', '9')},
 7: {'feature_idx': (0, 1, 3, 4, 5, 7, 8