In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Load the semicolon-delimited CSV into a DataFrame
df = pd.read_csv('winequality-white.csv', sep=';')

# Every column except the last is used as a predictor; the last column is the
# target. The target is kept as a single-column 2-D slice for the split and
# flattened right after it.
X_all = df.values[:, :-1]
y_all = df.values[:, -1:]

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.25, random_state=42)

# Collapse the (n, 1) target slices into 1-D label vectors
y_train, y_test = y_train.ravel(), y_test.ravel()

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

('Training dataset shape:', (3673L, 11L), (3673L,))
('Testing dataset shape:', (1225L, 11L), (1225L,))


  from numpy.core.umath_tests import inner1d


In [7]:
# Estimator that the selector evaluates for each candidate feature subset:
# a logistic regression with C=1e5
clf = LogisticRegression(C=1e5)

# Settings for sequential forward selection
sfs_settings = dict(
    k_features=6,        # stop once six features have been selected
    forward=True,        # False would run backward selection instead
    floating=False,      # plain sequential selection, not the floating variant
    n_jobs=10,           # number of CPUs used to evaluate candidate subsets
    verbose=2,           # log progress for every selection step
    scoring='accuracy',  # metric used to rank candidate subsets
    cv=5,                # cross-validation setting passed to the selector
)

# Run the selection on the training split only
sfs1 = sfs(clf, **sfs_settings).fit(X_train, y_train)

[Parallel(n_jobs=10)]: Done   4 out of  11 | elapsed:    1.3s remaining:    2.4s
[Parallel(n_jobs=10)]: Done  11 out of  11 | elapsed:    3.7s finished

[2018-10-07 12:31:55] Features: 1/6 -- score: 0.4990383272434632[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.9s remaining:    2.2s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    3.1s finished

[2018-10-07 12:31:58] Features: 2/6 -- score: 0.5227297590506889[Parallel(n_jobs=9)]: Done   2 out of   9 | elapsed:    0.7s remaining:    2.7s
[Parallel(n_jobs=9)]: Done   7 out of   9 | elapsed:    2.1s remaining:    0.5s
[Parallel(n_jobs=9)]: Done   9 out of   9 | elapsed:    2.7s finished

[2018-10-07 12:32:01] Features: 3/6 -- score: 0.5341606517893626[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:    1.1s remaining:    1.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s finished

[2018-10-07 12:32:05] Features: 4/6 -- s

In [8]:
# Which features?
# Read the selected column indices off the fitted selector and turn them into
# a list, which the following cell uses to index columns of X_train / X_test.
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[0, 1, 3, 4, 9, 10]


In [9]:
# Restrict both splits to the columns chosen by the selector
X_train_sel = X_train[:, feat_cols]
X_test_sel = X_test[:, feat_cols]

# Refit the same logistic regression configuration on the reduced feature set
clf = LogisticRegression(C=1e5)
clf.fit(X_train_sel, y_train)

# Accuracy on the rows the model was fitted to
y_train_pred = clf.predict(X_train_sel)
train_acc = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_acc)

# Accuracy on the held-out rows
y_test_pred = clf.predict(X_test_sel)
test_acc = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_acc)

Training accuracy on selected features: 0.538
Testing accuracy on selected features: 0.522


In [10]:
# Baseline for comparison: the same logistic regression configuration,
# fitted on every feature column instead of the selected subset
clf = LogisticRegression(C=1e5)
clf.fit(X_train, y_train)

# Predict both splits first, then report the two accuracies
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print('Training accuracy on all features: %.3f'
      % acc(y_train, y_train_pred))
print('Testing accuracy on all features: %.3f'
      % acc(y_test, y_test_pred))

Training accuracy on all features: 0.542
Testing accuracy on all features: 0.519
