In [1]:
from sklearn.feature_selection import SelectKBest

In [2]:
from sklearn.datasets import load_iris

In [4]:
iris = load_iris()

In [8]:
import numpy as np

In [11]:
d = iris.data

In [12]:
from sklearn.model_selection import train_test_split

In [30]:
trainX, testX, trainY, testY = train_test_split(iris.data, iris.target)

In [31]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
lr = LogisticRegression()

In [34]:
lr.fit(trainX,trainY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
lr.score(testX,testY)

0.97368421052631582

In [36]:
lr.predict(testX[:3])

array([2, 0, 1])

In [37]:
sel = SelectKBest(k=3)

In [38]:
sel.fit(trainX,trainY)

SelectKBest(k=3, score_func=<function f_classif at 0x0000012DB2C4DD08>)

In [39]:
trainX_new = sel.transform(trainX)

In [40]:
lr = LogisticRegression()

In [41]:
lr.fit(trainX_new, trainY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
testX_new = sel.transform(testX)

In [43]:
lr.score(testX_new,testY)

0.94736842105263153

In [44]:
from sklearn.pipeline import Pipeline

In [54]:
pipeline = Pipeline([
    ('select',SelectKBest()), 
    ('lr',LogisticRegression())
])

In [48]:
pipeline.fit(trainX,trainY)

Pipeline(memory=None,
     steps=[('select', SelectKBest(k=3, score_func=<function f_classif at 0x0000012DB2C4DD08>)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [49]:
pipeline.predict(testX[:3])

array([2, 0, 1])

In [50]:
pipeline.score(testX,testY)

0.94736842105263153

In [51]:
from sklearn.model_selection import GridSearchCV

In [66]:
params = {
    'select__k':[2,3,4],
    'lr__C':[0.1,0.5,1.0]
}

In [67]:
grid = GridSearchCV(pipeline, params, cv=5)

In [68]:
grid.fit(trainX,trainY)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('select', SelectKBest(k=10, score_func=<function f_classif at 0x0000012DB2C4DD08>)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'lr__C': [0.1, 0.5, 1.0], 'select__k': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [69]:
grid.best_score_

0.9553571428571429

In [70]:
grid.best_params_

{'lr__C': 1.0, 'select__k': 4}

In [60]:
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model.logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model.base.LinearClassifierMixin, sklearn.linear_model.base.SparseCoefMixin)
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
 |  entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It can handle
 |  both dense and sparse input. Use C-ordered arrays or CSR matrices
 |  containing 64-bit floats for optimal performance; any other input format
 |  will be converted (and copied).
 |  
 |  The 'newton-cg', 'sag', and 'lbfgs' solve