# Scale your Machine Learning Models for Faster Training with Sklearn Joblib

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
# Synthetic binary-classification data: 10 million rows, 4 features, of which
# 2 are informative and none redundant. The fixed seed keeps it reproducible.
X, y = make_classification(
    n_samples=10_000_000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=False,
)

In [3]:
# Total number of elements in X (n_samples * n_features).
X.size

40000000

In [16]:
import pandas as pd

# Wrap the feature matrix in a DataFrame so it can be written out with pandas.
dfX = pd.DataFrame(X)

In [18]:
# Write the full feature matrix to a CSV file in the working directory.
# NOTE(review): to_csv also writes the row index as an extra column by default,
# and the name "X_test.csv" suggests a test split although this is all of X —
# confirm both are intended.
dfX.to_csv("X_test.csv")

This makes `X` about 320 MB in memory (40,000,000 float64 values × 8 bytes each).

In [4]:
# Shallow random forest (depth-2 trees) with a fixed seed; n_jobs=-1 asks for
# every core that the active joblib backend makes available.
clf = RandomForestClassifier(max_depth=2, random_state=0, n_jobs=-1)

In [10]:
%%time
# Baseline: fit on the local machine only. clf was created with n_jobs=-1, so
# the trees are built in parallel across the local cores.
clf.fit(X,y)

CPU times: user 13min 21s, sys: 17.8 s, total: 13min 38s
Wall time: 2min 6s


RandomForestClassifier(max_depth=2, n_jobs=-1, random_state=0)

### Spin up Coiled Cluster

In [5]:
import coiled

In [6]:
# Provision a 20-worker Dask cluster on Coiled for the distributed runs below.
cluster = coiled.Cluster(
    name="sklearn",
    # Name of the Coiled software environment installed on scheduler and workers.
    software="dask-nlp",
    n_workers=20,
    # Let the scheduler shut itself down after this much idle time.
    scheduler_options={'idle_timeout': '2 hours'},
    # Request spot instances. This must be a real boolean: any non-empty string
    # (including 'False') is truthy, so a string here cannot express "off".
    backend_options={'spot': True}
)

Output()

In [7]:
# Connect this notebook session to the Coiled cluster. Creating a Client also
# makes it the one that joblib's "dask" backend will send work to.
from distributed import Client
client = Client(cluster)
# Display the client summary (dashboard link, workers, threads, memory).
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://44.198.168.30:8787,

0,1
Dashboard: http://44.198.168.30:8787,Workers: 20
Total threads: 40,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.10.86:8786,Workers: 20
Dashboard: http://10.4.10.86:8787/status,Total threads: 40
Started: 3 minutes ago,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.3.17:38575,Total threads: 2
Dashboard: http://10.4.3.17:36461/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.17:32943,
Local directory: /dask-worker-space/worker-8r2st0g2,Local directory: /dask-worker-space/worker-8r2st0g2

0,1
Comm: tls://10.4.13.105:38297,Total threads: 2
Dashboard: http://10.4.13.105:37903/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.105:35317,
Local directory: /dask-worker-space/worker-d7thvns2,Local directory: /dask-worker-space/worker-d7thvns2

0,1
Comm: tls://10.4.3.92:37921,Total threads: 2
Dashboard: http://10.4.3.92:35709/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.92:37757,
Local directory: /dask-worker-space/worker-l98o3s43,Local directory: /dask-worker-space/worker-l98o3s43

0,1
Comm: tls://10.4.0.230:37047,Total threads: 2
Dashboard: http://10.4.0.230:44791/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.230:46173,
Local directory: /dask-worker-space/worker-b8wjmfql,Local directory: /dask-worker-space/worker-b8wjmfql

0,1
Comm: tls://10.4.10.16:36431,Total threads: 2
Dashboard: http://10.4.10.16:41241/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.16:39413,
Local directory: /dask-worker-space/worker-ef22o_lx,Local directory: /dask-worker-space/worker-ef22o_lx

0,1
Comm: tls://10.4.10.97:45515,Total threads: 2
Dashboard: http://10.4.10.97:40755/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.97:43395,
Local directory: /dask-worker-space/worker-it3cywtw,Local directory: /dask-worker-space/worker-it3cywtw

0,1
Comm: tls://10.4.5.102:44571,Total threads: 2
Dashboard: http://10.4.5.102:38249/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.102:44899,
Local directory: /dask-worker-space/worker-d5i_qnbq,Local directory: /dask-worker-space/worker-d5i_qnbq

0,1
Comm: tls://10.4.2.4:39907,Total threads: 2
Dashboard: http://10.4.2.4:35477/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.4:45429,
Local directory: /dask-worker-space/worker-1rv05exp,Local directory: /dask-worker-space/worker-1rv05exp

0,1
Comm: tls://10.4.3.243:35851,Total threads: 2
Dashboard: http://10.4.3.243:46041/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.243:35187,
Local directory: /dask-worker-space/worker-ozu_cqyk,Local directory: /dask-worker-space/worker-ozu_cqyk

0,1
Comm: tls://10.4.11.120:36797,Total threads: 2
Dashboard: http://10.4.11.120:36371/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.120:34165,
Local directory: /dask-worker-space/worker-i3dr60bv,Local directory: /dask-worker-space/worker-i3dr60bv

0,1
Comm: tls://10.4.8.76:45919,Total threads: 2
Dashboard: http://10.4.8.76:35111/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.76:44337,
Local directory: /dask-worker-space/worker-uhqjnche,Local directory: /dask-worker-space/worker-uhqjnche

0,1
Comm: tls://10.4.8.249:42707,Total threads: 2
Dashboard: http://10.4.8.249:46537/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.249:43893,
Local directory: /dask-worker-space/worker-rn_1siox,Local directory: /dask-worker-space/worker-rn_1siox

0,1
Comm: tls://10.4.13.195:34215,Total threads: 2
Dashboard: http://10.4.13.195:43467/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.195:43363,
Local directory: /dask-worker-space/worker-4dtsrqpi,Local directory: /dask-worker-space/worker-4dtsrqpi

0,1
Comm: tls://10.4.7.139:34131,Total threads: 2
Dashboard: http://10.4.7.139:39863/status,Memory: 7.67 GiB
Nanny: tls://10.4.7.139:43519,
Local directory: /dask-worker-space/worker-tjbfeezl,Local directory: /dask-worker-space/worker-tjbfeezl

0,1
Comm: tls://10.4.2.128:45717,Total threads: 2
Dashboard: http://10.4.2.128:36231/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.128:37227,
Local directory: /dask-worker-space/worker-dp0jdv8q,Local directory: /dask-worker-space/worker-dp0jdv8q

0,1
Comm: tls://10.4.0.33:42365,Total threads: 2
Dashboard: http://10.4.0.33:41581/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.33:38849,
Local directory: /dask-worker-space/worker-mb5ditp9,Local directory: /dask-worker-space/worker-mb5ditp9

0,1
Comm: tls://10.4.5.48:44057,Total threads: 2
Dashboard: http://10.4.5.48:38399/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.48:45953,
Local directory: /dask-worker-space/worker-s7q9o_l5,Local directory: /dask-worker-space/worker-s7q9o_l5

0,1
Comm: tls://10.4.2.193:46237,Total threads: 2
Dashboard: http://10.4.2.193:33505/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.193:37193,
Local directory: /dask-worker-space/worker-9w0gezs5,Local directory: /dask-worker-space/worker-9w0gezs5

0,1
Comm: tls://10.4.9.111:36631,Total threads: 2
Dashboard: http://10.4.9.111:44615/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.111:35241,
Local directory: /dask-worker-space/worker-aancngyb,Local directory: /dask-worker-space/worker-aancngyb

0,1
Comm: tls://10.4.0.173:43965,Total threads: 2
Dashboard: http://10.4.0.173:33801/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.173:45855,
Local directory: /dask-worker-space/worker-ja4fuymn,Local directory: /dask-worker-space/worker-ja4fuymn


In [21]:
# Restart the worker processes; presumably done so the timed fit below starts
# from a clean cluster state — confirm intent.
client.restart()

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://44.198.168.30:8787,

0,1
Dashboard: http://44.198.168.30:8787,Workers: 20
Total threads: 40,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.10.86:8786,Workers: 20
Dashboard: http://10.4.10.86:8787/status,Total threads: 40
Started: 13 minutes ago,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.3.17:34597,Total threads: 2
Dashboard: http://10.4.3.17:39193/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.17:32943,
Local directory: /dask-worker-space/worker-vg0lz646,Local directory: /dask-worker-space/worker-vg0lz646

0,1
Comm: tls://10.4.13.105:38029,Total threads: 2
Dashboard: http://10.4.13.105:42171/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.105:35317,
Local directory: /dask-worker-space/worker-cnrbkff5,Local directory: /dask-worker-space/worker-cnrbkff5

0,1
Comm: tls://10.4.3.92:46875,Total threads: 2
Dashboard: http://10.4.3.92:37999/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.92:37757,
Local directory: /dask-worker-space/worker-h_af5htf,Local directory: /dask-worker-space/worker-h_af5htf

0,1
Comm: tls://10.4.0.230:45121,Total threads: 2
Dashboard: http://10.4.0.230:41387/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.230:46173,
Local directory: /dask-worker-space/worker-x0ueoud5,Local directory: /dask-worker-space/worker-x0ueoud5

0,1
Comm: tls://10.4.10.16:35273,Total threads: 2
Dashboard: http://10.4.10.16:38451/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.16:39413,
Local directory: /dask-worker-space/worker-52fm91wi,Local directory: /dask-worker-space/worker-52fm91wi

0,1
Comm: tls://10.4.10.97:36947,Total threads: 2
Dashboard: http://10.4.10.97:45989/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.97:43395,
Local directory: /dask-worker-space/worker-q9991p3b,Local directory: /dask-worker-space/worker-q9991p3b

0,1
Comm: tls://10.4.5.102:37175,Total threads: 2
Dashboard: http://10.4.5.102:40013/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.102:44899,
Local directory: /dask-worker-space/worker-ye9d4hqc,Local directory: /dask-worker-space/worker-ye9d4hqc

0,1
Comm: tls://10.4.2.4:37117,Total threads: 2
Dashboard: http://10.4.2.4:42233/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.4:45429,
Local directory: /dask-worker-space/worker-uhj8_kpw,Local directory: /dask-worker-space/worker-uhj8_kpw

0,1
Comm: tls://10.4.3.243:39937,Total threads: 2
Dashboard: http://10.4.3.243:40843/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.243:35187,
Local directory: /dask-worker-space/worker-j681t62d,Local directory: /dask-worker-space/worker-j681t62d

0,1
Comm: tls://10.4.11.120:42159,Total threads: 2
Dashboard: http://10.4.11.120:44297/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.120:34165,
Local directory: /dask-worker-space/worker-a581s8y1,Local directory: /dask-worker-space/worker-a581s8y1

0,1
Comm: tls://10.4.8.76:35897,Total threads: 2
Dashboard: http://10.4.8.76:42233/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.76:44337,
Local directory: /dask-worker-space/worker-hz0jvg4e,Local directory: /dask-worker-space/worker-hz0jvg4e

0,1
Comm: tls://10.4.8.249:46637,Total threads: 2
Dashboard: http://10.4.8.249:36867/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.249:43893,
Local directory: /dask-worker-space/worker-hdc5x1su,Local directory: /dask-worker-space/worker-hdc5x1su

0,1
Comm: tls://10.4.13.195:34729,Total threads: 2
Dashboard: http://10.4.13.195:40687/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.195:43363,
Local directory: /dask-worker-space/worker-ehw_u02r,Local directory: /dask-worker-space/worker-ehw_u02r

0,1
Comm: tls://10.4.7.139:34141,Total threads: 2
Dashboard: http://10.4.7.139:37175/status,Memory: 7.67 GiB
Nanny: tls://10.4.7.139:43519,
Local directory: /dask-worker-space/worker-nz5noplc,Local directory: /dask-worker-space/worker-nz5noplc

0,1
Comm: tls://10.4.2.128:39929,Total threads: 2
Dashboard: http://10.4.2.128:43233/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.128:37227,
Local directory: /dask-worker-space/worker-88qmx7i3,Local directory: /dask-worker-space/worker-88qmx7i3

0,1
Comm: tls://10.4.0.33:37965,Total threads: 2
Dashboard: http://10.4.0.33:46625/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.33:38849,
Local directory: /dask-worker-space/worker-7yo7v08a,Local directory: /dask-worker-space/worker-7yo7v08a

0,1
Comm: tls://10.4.5.48:35789,Total threads: 2
Dashboard: http://10.4.5.48:39903/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.48:45953,
Local directory: /dask-worker-space/worker-as1v1mrj,Local directory: /dask-worker-space/worker-as1v1mrj

0,1
Comm: tls://10.4.2.193:44651,Total threads: 2
Dashboard: http://10.4.2.193:45221/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.193:37193,
Local directory: /dask-worker-space/worker-hytp5emb,Local directory: /dask-worker-space/worker-hytp5emb

0,1
Comm: tls://10.4.9.111:33565,Total threads: 2
Dashboard: http://10.4.9.111:40573/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.111:35241,
Local directory: /dask-worker-space/worker-ihlpahpd,Local directory: /dask-worker-space/worker-ihlpahpd

0,1
Comm: tls://10.4.0.173:35255,Total threads: 2
Dashboard: http://10.4.0.173:37019/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.173:45855,
Local directory: /dask-worker-space/worker-nbdix9ae,Local directory: /dask-worker-space/worker-nbdix9ae


### Fit Model with Dask

In [22]:
import joblib

In [23]:
%%time
# Same fit as the local baseline, but with joblib's parallel work routed to the
# Dask cluster through the active client instead of the local cores.
# NOTE(review): joblib's Dask backend accepts scatter=[X, y] to ship large
# inputs to the workers up front; consider it for data of this size.
with joblib.parallel_backend("dask"):
    clf.fit(X, y)

CPU times: user 1.93 s, sys: 601 ms, total: 2.53 s
Wall time: 1min 1s


## GridSearch CV

In [24]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [25]:
# Create a parameter grid
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [26]:
# Instantiate the grid search model
grid_search = GridSearchCV(
    estimator=clf, 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1
)

In [None]:
%%time
# Run the grid search with joblib's parallel work dispatched to the Dask
# cluster. With 288 parameter combinations and cv=5 this is 1,440 model fits
# (plus the final refit), so expect a long run on the full 10M-row dataset.
with joblib.parallel_backend("dask"):
    grid_search.fit(X, y)