In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable from one run of the notebook to the next.
np.random.seed(0)

In [None]:
from sklearn.datasets import load_iris

# Load the iris dataset and unpack it into the feature matrix X and the
# class-label vector y used by the following cells.
iris = load_iris()
X, y = iris.data, iris.target

# Show the labels as the cell output.
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on y is requested so
# both partitions follow the label distribution; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
# Display the held-out labels produced by the train/test split.
y_test

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# Human-readable labels handed to the selector so that its results can be
# reported by name as well as by column index.
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')

# Base estimator whose cross-validated accuracy scores each candidate subset.
logmodel = LogisticRegression(max_iter=1000)

# Exhaustive search over subsets of 1 to 4 features, scored with 5-fold CV.
efs = EFS(
    logmodel,
    min_features=1,
    max_features=4,
    scoring='accuracy',
    print_progress=True,
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
# NOTE(review): newer mlxtend releases appear to have dropped the
# custom_feature_names argument -- confirm against the installed version.
efs.fit(X_train, y_train, custom_feature_names=feature_names)

print('Best accuracy score: %.2f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)

In [None]:
# The subsets_ attribute lets us inspect the feature indices selected at each
# step of the search.
efs.subsets_

In [None]:
# Tabulate the exhaustive selector's metric dictionary; the transpose puts one
# dictionary entry on each row.
pd.DataFrame.from_dict(efs.get_metric_dict()).T

In [None]:
# Refit a fresh model on the subset chosen by the exhaustive search and score
# it on the held-out test set. The column indices are read from
# efs.best_idx_ instead of being typed in by hand, so this cell cannot drift
# out of sync with the search result if the data, split or estimator changes.
best_idx = list(efs.best_idx_)

# Indexing with a list keeps the result two-dimensional even when a single
# feature is selected, so no reshape is needed before fitting.
X_train_selected = X_train[:, best_idx]
X_test_selected = X_test[:, best_idx]

logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_selected, y_train)

# Mean accuracy of the refitted model on the test partition.
score = logmodel.score(X_test_selected, y_test)
print(score)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Labels used to report the selected features by name.
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')

logmodel = LogisticRegression(max_iter=1000)

# Forward (non-floating) sequential selection up to 4 features,
# scored by 5-fold cross-validated accuracy.
sfs = SFS(
    logmodel,
    k_features=4,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
# NOTE(review): newer mlxtend releases appear to have dropped the
# custom_feature_names argument -- confirm against the installed version.
sfs.fit(X_train, y_train, custom_feature_names=feature_names)

# Show the subset recorded at each selection step.
sfs.subsets_


In [None]:
# Tabulate the forward selector's metric dictionary; the transpose puts one
# dictionary entry on each row.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Labels used to report the selected features by name.
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')

logmodel = LogisticRegression(max_iter=1000)

# Backward (non-floating) sequential selection: forward=False with
# k_features=1, scored by 5-fold cross-validated accuracy.
sbs = SFS(
    logmodel,
    k_features=1,
    forward=False,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
sbs.fit(X_train, y_train, custom_feature_names=feature_names)

# Show the subset recorded at each selection step.
sbs.subsets_

In [None]:
# Tabulate the backward selector's metric dictionary; the transpose puts one
# dictionary entry on each row.
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
# Read the diabetes data with pandas; the relative path is resolved against
# the current working directory.
df = pd.read_csv('diabetes.csv')
# Display the frame to check that the data has been read in properly.
df

In [None]:
# 'Outcome' is the target column; every remaining column is kept as a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y is requested so both
# partitions follow the label distribution; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# Labels handed to the selector so its results can be reported by name.
feature_names = (
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
)

# Base estimator whose cross-validated accuracy scores each candidate subset.
logmodel = LogisticRegression(max_iter=1000)

# Exhaustive search over subsets of 1 to 8 features, scored with 5-fold CV.
efs = EFS(
    logmodel,
    min_features=1,
    max_features=8,
    scoring='accuracy',
    print_progress=True,
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
efs.fit(X_train, y_train, custom_feature_names=feature_names)

print('Best accuracy score: %.2f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)


In [None]:
# Tabulate the exhaustive selector's metric dictionary; the transpose puts one
# dictionary entry on each row.
pd.DataFrame.from_dict(efs.get_metric_dict()).T

In [None]:
# Same metric table, ordered from the highest avg_score to the lowest.
(
    pd.DataFrame.from_dict(efs.get_metric_dict())
    .T
    .sort_values(by='avg_score', ascending=False)
)

In [None]:
# Keep only the columns picked by the exhaustive search. The positions are
# read from efs.best_idx_ instead of being copied by hand from the table
# output, so the selection always matches the fitted selector.
X_train_selected = X_train.iloc[:, list(efs.best_idx_)]
X_train_selected

In [None]:
# Refit a fresh model on the subset chosen by the exhaustive search and score
# it on the held-out test set. The column positions are taken once from
# efs.best_idx_ and applied to both partitions, which removes the duplicated
# hand-typed index list and keeps train and test selections identical.
best_idx = list(efs.best_idx_)
X_train_selected = X_train.iloc[:, best_idx]
X_test_selected = X_test.iloc[:, best_idx]

logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_selected, y_train)

# Mean accuracy of the refitted model on the test partition.
score = logmodel.score(X_test_selected, y_test)
print(score)

In [None]:
# Labels used to report the selected features by name.
feature_names = (
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
)

logmodel = LogisticRegression(max_iter=1000)

# Forward (non-floating) sequential selection up to all 8 features,
# scored by 5-fold cross-validated accuracy.
sfs = SFS(
    logmodel,
    k_features=8,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
sfs.fit(X_train, y_train, custom_feature_names=feature_names)

# Tabulate the metric dictionary, one entry per row.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

In [None]:
# Forward-selection metric table, ordered from the highest avg_score down.
(
    pd.DataFrame.from_dict(sfs.get_metric_dict())
    .T
    .sort_values(by='avg_score', ascending=False)
)

In [None]:
# Refit a fresh model on the best-scoring subset found by forward selection
# and score it on the held-out test set. The subset is looked up in
# sfs.subsets_ (the entry with the highest avg_score) instead of being copied
# by hand from the sorted table, so it cannot go stale when the search is
# re-run on different data or with a different split.
best_subset = max(sfs.subsets_.values(), key=lambda step: step['avg_score'])
best_idx = list(best_subset['feature_idx'])

X_train_selected = X_train.iloc[:, best_idx]
X_test_selected = X_test.iloc[:, best_idx]

logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_selected, y_train)

# Mean accuracy of the refitted model on the test partition.
score = logmodel.score(X_test_selected, y_test)
print(score)

In [None]:
# Labels used to report the selected features by name.
feature_names = (
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
)

logmodel = LogisticRegression(max_iter=1000)

# Backward (non-floating) sequential selection: forward=False with
# k_features=1, scored by 5-fold cross-validated accuracy.
sbs = SFS(
    logmodel,
    k_features=1,
    forward=False,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
)

# fit() returns the selector itself, so rebinding the name is unnecessary.
sbs.fit(X_train, y_train, custom_feature_names=feature_names)

# Tabulate the metric dictionary, one entry per row.
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
# Backward-selection metric table, ordered from the highest avg_score down.
(
    pd.DataFrame.from_dict(sbs.get_metric_dict())
    .T
    .sort_values(by='avg_score', ascending=False)
)