# Dask-learn

Original blog post by Jim Crist: http://jcrist.github.io/dask-sklearn-part-1.html

## Grid search with scikit-learn

In [1]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 10,000 samples with 500 features, 250 of them
# flagged as redundant. The fixed random_state keeps the data reproducible.
X, y = make_classification(
    n_samples=10000,
    n_features=500,
    n_classes=2,
    n_redundant=250,
    random_state=42,
)

In [2]:
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline

# Two-step pipeline: PCA followed by logistic regression.
#
# The solver is pinned to 'liblinear' because the parameter grid searched
# later in this notebook includes the 'l1' penalty. Newer scikit-learn
# releases default to 'lbfgs', which rejects 'l1'; on releases where
# 'liblinear' is already the default (as in the recorded output) this is a
# no-op.
logistic = linear_model.LogisticRegression(solver='liblinear')
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca),
                       ('logistic', logistic)])

In [3]:
# Search space: 3 PCA sizes x 3 values of C x 2 penalties = 18 combinations.
# Each key is a pipeline step name ('pca' / 'logistic'), a double underscore,
# then the parameter of that step to vary.
grid = {
    'pca__n_components': [50, 100, 250],
    'logistic__C': [1e-4, 1.0, 1e4],
    'logistic__penalty': ['l1', 'l2'],
}

In [4]:
from sklearn.grid_search import GridSearchCV

estimator = GridSearchCV(pipe, grid, n_jobs=-1)

%time estimator.fit(X, y)

CPU times: user 5.55 s, sys: 228 ms, total: 5.78 s
Wall time: 23.2 s


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 1.0, 10000.0], 'pca__n_components': [50, 100, 250]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [5]:
# Best score reported by the fitted scikit-learn grid search.
estimator.best_score_

0.89290000000000003

In [6]:
# Parameter combination from `grid` that the scikit-learn search reports as best.
estimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

## Grid search with dask-learn

In [7]:
from dklearn.grid_search import GridSearchCV as DaskGridSearchCV

# Build the dask-learn search over the same `pipe` and `grid` used for the
# scikit-learn search; note that no n_jobs argument is passed here.
destimator = DaskGridSearchCV(pipe, grid)

In [8]:
# Time the dask-learn fit on the same X, y so it can be compared with the
# scikit-learn timing recorded earlier in the notebook.
%time destimator.fit(X, y)

CPU times: user 15.1 s, sys: 1.89 s, total: 17 s
Wall time: 5.19 s


GridSearchCV(cv=None,
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, get=None, iid=True,
       param_grid={'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 1.0, 10000.0], 'pca__n_components': [50, 100, 250]},
       refit=True, scoring=None)

In [9]:
# Check that the dask-learn search reports exactly the same best score as the
# scikit-learn search (exact float equality, not a tolerance comparison).
destimator.best_score_ == estimator.best_score_

True

In [10]:
# Check that both searches selected the same parameter combination.
destimator.best_params_ == estimator.best_params_

True