# Scale your Machine Learning Models for Faster Training with Sklearn Joblib

In [5]:
import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Synthetic classification dataset: 100 million samples with 4 features each
# (2 informative, 0 redundant). The fixed random_state keeps it reproducible.
X, y = make_classification(
    n_samples=100_000_000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    shuffle=False,
    random_state=0,
)

In [3]:
# Total number of elements in X (samples x features), not its size in bytes.
X.size

400000000

`X.size` counts elements, not bytes: `X` holds 400 million values, and at 8 bytes per 64-bit float that makes `X` about 3.2GB.

In [4]:
# Shallow random forest (trees limited to depth 2) with a fixed seed;
# n_jobs=-1 asks for all available cores when fitting.
clf = RandomForestClassifier(n_jobs=-1, max_depth=2, random_state=0)

In [None]:
%%time
clf.fit(X,y)

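### Spin up Coiled Cluster

Before running the cells below, start a Coiled cluster and connect a `dask.distributed.Client` to it; joblib's `"dask"` backend sends its work to the active client.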

### Fit Model with Dask

In [None]:
%%time
with joblib.parallel_backend(“dask”):
    clf.fit(X, y)

## GridSearch CV

In [8]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [12]:
# Search space for the grid search, chosen from the results of an earlier
# random search. 1 * 4 * 2 * 3 * 3 * 4 = 288 candidate combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

In [13]:
# Echo the grid so the search space is visible before the search is run.
print(param_grid)

{'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [100, 200, 300, 1000]}


In [None]:
# Grid search over every combination in `param_grid`, using the random
# forest `clf` as the base estimator and 5-fold cross-validation.
grid_search = GridSearchCV(clf, param_grid, cv=5)

In [None]:
%%time
with joblib.parallel_backend(“dask”):
    grid_search.fit(X, y)