In [1]:
from sklearn.datasets import make_classification

In [2]:
X, y = make_classification(n_samples=10000,
                           n_features=500,
                           n_classes=2,
                           n_redundant=250,
                           random_state=42)

In [3]:
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline

In [25]:
logistic = linear_model.LogisticRegression(solver='saga')
pca = decomposition.PCA()

In [26]:
pipe = Pipeline(steps=[('pca', pca),
                       ('logistic', logistic)])

In [27]:
grid = dict(pca__n_components=[50, 100, 250],
            logistic__C=[1e-4, 1.0, 1e4],
            logistic__penalty=['l1', 'l2'])

In [28]:
# from warnings import simplefilter
# simplefilter(action='ignore', category=FutureWarning)

In [29]:
from sklearn.model_selection import GridSearchCV

estimator = GridSearchCV(pipe, grid, n_jobs=1, cv=5, error_score='raise')

In [30]:
%time estimator.fit(X, y)

CPU times: user 2min 22s, sys: 5.34 s, total: 2min 27s
Wall time: 1min 27s


GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('logistic',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                

In [31]:
estimator.best_score_

0.8923

In [32]:
estimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

In [33]:
estimator.best_estimator_

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=50,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logistic',
                 LogisticRegression(C=0.0001, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

Now we will use estimators with many jobs

In [34]:
estimator = GridSearchCV(pipe, grid, n_jobs=-1, cv=5, error_score='raise')

In [35]:
%time estimator.fit(X, y)

CPU times: user 1.36 s, sys: 345 ms, total: 1.7 s
Wall time: 1min 8s


GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('logistic',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                

In [36]:
estimator.best_score_

0.8922

In [37]:
estimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

In [38]:
estimator.best_estimator_

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=50,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logistic',
                 LogisticRegression(C=0.0001, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)