In [1]:
# compare different numbers of features selected using the ANOVA F-test
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# load the dataset
def load_dataset(filename):
    """Read a CSV file and split it into input features and an output column.

    Args:
        filename: passed straight to ``pandas.read_csv``; the first row of
            the file is treated as the column header.

    Returns:
        A ``(X, y)`` tuple of numpy arrays, where ``X`` holds every column
        except the last one (2-D) and ``y`` holds the last column (1-D).
    """
    # parse the file; row 0 supplies the column names
    frame = read_csv(filename, header=0)
    # view the whole table as a single 2-D numpy array
    table = frame.values
    # inputs (X) are all columns but the last; the output (y) is the last column
    return table[:, :-1], table[:, -1]

In [3]:
# read 'diabetes.csv' (path relative to the working directory) and split it
# into the feature matrix X and the target vector y
X, y = load_dataset(filename='diabetes.csv')

In [4]:
# evaluation scheme: stratified 10-fold cross-validation repeated 3 times,
# with a fixed random_state so the fold assignment is repeatable
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [5]:
# pipeline to evaluate: ANOVA F-test feature selection followed by logistic regression
# the selector's k is not set here; it is supplied through the 'anova__k' grid entry
fs = SelectKBest(score_func=f_classif)
model = LogisticRegression(solver='liblinear')
pipeline = Pipeline(
    steps=[
        ('anova', fs),
        ('lr', model),
    ]
)

In [6]:
# search grid: try keeping k = 1, 2, ..., n_features columns,
# where n_features is the column count of X
grid = {'anova__k': list(range(1, X.shape[1] + 1))}

In [7]:
# grid search over the pipeline: each candidate k is scored by accuracy
# under the cross-validation scheme `cv`, using all available cores (n_jobs=-1)
search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
)

In [8]:
# run the search on the full dataset; the returned object is what the
# summary below reads best_score_ and best_params_ from
results = search.fit(X, y=y)

In [9]:
# report the best mean accuracy and the parameter setting that achieved it
# (operands are wrapped in 1-tuples so each is formatted as a single value)
print('BEST MEAN ACCURACY : %.3f' % (results.best_score_,))
print('BEST CONFIG : %s' % (results.best_params_,))

BEST MEAN ACCURACY : 0.770
BEST CONFIG : {'anova__k': 7}
