In [3]:
# First step: install the mlxtend library, which provides step-forward (sequential) feature selection.
# A Random Forest classifier is used both for feature selection and for model building.

In [22]:
conda install -c conda-forge mlxtend 

Retrieving notices: ...working... done
Note: you may need to restart the kernel to use updated packages.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [6]:
# The first steps are to make the imports, load the dataset, and split it into training and testing sets.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import os

In [7]:
# Load the dataset from the semicolon-delimited CSV file
df = pd.read_csv(
    filepath_or_buffer='winequality-white.csv',
    sep=';',
)


In [8]:
# Summary statistics: compute them once, then persist the table to disk as CSV
FF_Description = df.describe()
FF_Description.to_csv(path_or_buf="FF_Description.csv")

In [15]:
# Train/test split.
# Train dataset: used to fit the machine learning model.
# Test dataset: used to evaluate the fitted machine learning model.
X_train, X_test, y_train, y_test = train_test_split(
    df.to_numpy()[:, :-1],   # every column except the last one is a feature
    df.to_numpy()[:, -1],    # the last column is the target, taken directly as a 1-D vector
    random_state=42,
    test_size=0.25)

# Show the training/test dataset shapes
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


In [16]:
# Define the classifier.
# A random forest is a meta-estimator that fits a number of decision tree classifiers on
# various sub-samples of the dataset and averages their predictions.
# n_estimators (int, default=100): the number of trees in the forest.
# random_state is fixed (42, as in the train/test split) so that the forest -- and therefore
# the feature selection built on top of it -- gives the same result on every run.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [25]:
# Configure step-forward feature selection; k_features=5 is the size of the feature subset to select.
# Floating variants add a conditional exclusion/inclusion step that can revisit features
# after they were included (or excluded); floating=False keeps the plain forward search.
# verbose=2 makes mlxtend report progress while fitting.
# scoring='accuracy' is the metric used to score the models built on each candidate subset.
# The selector cross-validates internally; cv=5 sets the number of folds.
sfs1 = sfs(
    estimator=clf,
    cv=5,
    scoring='accuracy',
    verbose=2,
    floating=False,
    forward=True,
    k_features=5)


In [None]:
# Perform SFS (plain sequential forward selection, since the selector was built with floating=False)
# The reported score is the cross-validated score of the selected feature subset (up to 5 features)
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   16.4s finished

[2023-02-17 12:15:24] Features: 1/5 -- score: 0.49686370460990936[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.2s finished

[2023-02-17 12:15:40] Features: 2/5 -- score: 0.5412519231125692[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   19.5s finished

[2023-02-17 12:16:00] Features: 3/5 -- score: 0.6084917236649428[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1

In [None]:
# Show which features (as column indices) were selected for the model
feat_cols = [*sfs1.k_feature_idx_]
print(feat_cols)

In [None]:
# With the selected features it is now possible to build a full model on the training and test sets.
# The classifier is fitted on only the subset of selected features.
X_train_sel = X_train[:, feat_cols]
X_test_sel = X_test[:, feat_cols]

clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train_sel, y_train)

y_train_pred = clf.predict(X_train_sel)
train_acc = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_acc)

y_test_pred = clf.predict(X_test_sel)
test_acc = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_acc)

In [None]:
# Baseline for comparison: the same classifier settings, trained on all features,
# so its accuracies can be set against those of the selected-feature model.
clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
train_acc = acc(y_train, y_train_pred)
print('Training accuracy on all features: %.3f' % train_acc)

y_test_pred = clf.predict(X_test)
test_acc = acc(y_test, y_test_pred)
print('Testing accuracy on all features: %.3f' % test_acc)

In [None]:
# It is important to find the feature subset that works best for the data and to compare it with the model built on the full feature set.
# Comparing the two models, the accuracy is not very high in either case, and the two are very similar.

In [17]:
# With six features
# Build step forward feature selection
# It is important to choose the right number of features and to check the result, because
# a poor choice can lead to a sub-optimal number and combination of features being selected.
# The base estimator is built explicitly here instead of reusing `clf`: by this point `clf`
# has been re-assigned to the depth-limited 1000-tree model fitted in the cells above, so a
# top-to-bottom run would otherwise search with different settings than the 5-feature search.
# random_state is fixed so the selection is reproducible.
sfs1 = sfs(RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
           k_features=6,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)


In [18]:
# Perform SFS (plain sequential forward selection, since the selector was built with floating=False)
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   25.1s finished

[2023-02-17 13:38:52] Features: 1/6 -- score: 0.495775639956255[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.2s finished

[2023-02-17 13:39:10] Features: 2/6 -- score: 0.5407073347049991[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.4s finished

[2023-02-17 13:39:28] Features: 3/6 -- score: 0.6063118871526813[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 o

In [19]:
# Which features? Collect the selected column indices into a list and show them
feat_cols = [*sfs1.k_feature_idx_]
print(feat_cols)

[1, 3, 4, 6, 7, 10]


In [20]:
# Build the full model using only the selected feature columns
X_train_sel = X_train[:, feat_cols]
X_test_sel = X_test[:, feat_cols]

clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train_sel, y_train)

y_train_pred = clf.predict(X_train_sel)
train_acc = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_acc)

y_test_pred = clf.predict(X_test_sel)
test_acc = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_acc)

Training accuracy on selected features: 0.559
Testing accuracy on selected features: 0.508


In [21]:
# The score with 6 features (0.64) is a little higher than with 5 features (0.62).
# The full models built with the 5 and 6 selected features have very similar training and testing accuracies.