# Scale your Machine Learning Models for Faster Training with Sklearn Joblib

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
# Build a synthetic classification problem: 10 million rows x 4 features.
# Two of the features are informative and none are redundant; shuffle=False
# keeps samples and features in generation order, and random_state pins the draw.
X, y = make_classification(
    n_samples=10_000_000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=False,
)

In [3]:
# Total number of elements in X (rows x columns) -- an element count, not a size in bytes.
X.size

40000000

In [16]:
import pandas as pd
# Wrap the feature matrix in a DataFrame (default integer labels) so it can be exported.
dfX = pd.DataFrame(X)

In [18]:
# Export the features to a CSV file in the working directory. `index` is left
# at its default, so the row index is written as the first column.
dfX.to_csv(path_or_buf="X_test.csv")

This makes `X` about 320 MB in memory (40,000,000 float64 values × 8 bytes each).

In [4]:
# Shallow random forest: every tree is capped at depth 2. n_jobs=-1 asks for all
# available cores, and random_state=0 keeps the fit repeatable.
clf = RandomForestClassifier(max_depth=2, random_state=0, n_jobs=-1)

In [10]:
%%time
# Baseline: train on the local machine, before any cluster is involved.
clf.fit(X, y)

CPU times: user 13min 21s, sys: 17.8 s, total: 13min 38s
Wall time: 2min 6s


RandomForestClassifier(max_depth=2, n_jobs=-1, random_state=0)

### Spin up Coiled Cluster

In [7]:
import coiled

In [8]:
# Launch a Coiled-managed Dask cluster for the distributed runs below.
cluster = coiled.Cluster(
    name="sklearn",
    software="dask-nlp",  # name of an existing Coiled software environment
    n_workers=100,
    worker_memory="30GiB",  # canonical unit spelling (was '30Gib')
    # Let the scheduler shut down after two hours without activity.
    scheduler_options={"idle_timeout": "2 hours"},
    # Pass a real boolean, not the string 'True': any non-empty string
    # (including 'False') is truthy, so a string cannot reliably toggle
    # spot instances.
    backend_options={"spot": True},
)

Output()

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-rrpelgr71-124847-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-124847-firewall -> coiled-dask-rrpelgr71-124847-firewall]
Created FW rules: coiled-dask-rrpelgr71-124847-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-124847-cluster-firewall -> coiled-dask-rrpelgr71-124847-cluster-firewall]
Created scheduler VM: coiled-dask-rrpelgr71-124847-scheduler (type: t3.medium, ip: ['3.224.147.186'])


In [9]:
from distributed import Client
# Attach a Dask client to the Coiled cluster. Ending the cell with `client`
# renders its summary (dashboard link, worker count, threads, memory).
client = Client(address=cluster)
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.224.147.186:8787,

0,1
Dashboard: http://3.224.147.186:8787,Workers: 35
Total threads: 280,Total memory: 1.05 TiB

0,1
Comm: tls://10.4.6.167:8786,Workers: 35
Dashboard: http://10.4.6.167:8787/status,Total threads: 280
Started: Just now,Total memory: 1.05 TiB

0,1
Comm: tls://10.4.5.13:44511,Total threads: 8
Dashboard: http://10.4.5.13:40723/status,Memory: 30.57 GiB
Nanny: tls://10.4.5.13:46459,
Local directory: /dask-worker-space/worker-iy089u6n,Local directory: /dask-worker-space/worker-iy089u6n

0,1
Comm: tls://10.4.12.226:39327,Total threads: 8
Dashboard: http://10.4.12.226:44165/status,Memory: 30.57 GiB
Nanny: tls://10.4.12.226:42391,
Local directory: /dask-worker-space/worker-7ext8p70,Local directory: /dask-worker-space/worker-7ext8p70

0,1
Comm: tls://10.4.15.253:33851,Total threads: 8
Dashboard: http://10.4.15.253:36133/status,Memory: 30.57 GiB
Nanny: tls://10.4.15.253:46083,
Local directory: /dask-worker-space/worker-wh0pytlh,Local directory: /dask-worker-space/worker-wh0pytlh

0,1
Comm: tls://10.4.1.0:38943,Total threads: 8
Dashboard: http://10.4.1.0:43459/status,Memory: 30.57 GiB
Nanny: tls://10.4.1.0:42301,
Local directory: /dask-worker-space/worker-jj6hksej,Local directory: /dask-worker-space/worker-jj6hksej

0,1
Comm: tls://10.4.7.60:38287,Total threads: 8
Dashboard: http://10.4.7.60:36773/status,Memory: 30.57 GiB
Nanny: tls://10.4.7.60:33819,
Local directory: /dask-worker-space/worker-t229zpy_,Local directory: /dask-worker-space/worker-t229zpy_

0,1
Comm: tls://10.4.9.185:33377,Total threads: 8
Dashboard: http://10.4.9.185:41205/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.185:32837,
Local directory: /dask-worker-space/worker-d7ph3dk_,Local directory: /dask-worker-space/worker-d7ph3dk_

0,1
Comm: tls://10.4.13.106:41765,Total threads: 8
Dashboard: http://10.4.13.106:35073/status,Memory: 30.57 GiB
Nanny: tls://10.4.13.106:39707,
Local directory: /dask-worker-space/worker-d_sryu6b,Local directory: /dask-worker-space/worker-d_sryu6b

0,1
Comm: tls://10.4.14.26:34597,Total threads: 8
Dashboard: http://10.4.14.26:34949/status,Memory: 30.74 GiB
Nanny: tls://10.4.14.26:44655,
Local directory: /dask-worker-space/worker-2lt7x4d1,Local directory: /dask-worker-space/worker-2lt7x4d1

0,1
Comm: tls://10.4.4.246:40497,Total threads: 8
Dashboard: http://10.4.4.246:46603/status,Memory: 30.57 GiB
Nanny: tls://10.4.4.246:35897,
Local directory: /dask-worker-space/worker-w475nkk0,Local directory: /dask-worker-space/worker-w475nkk0

0,1
Comm: tls://10.4.12.198:35115,Total threads: 8
Dashboard: http://10.4.12.198:38289/status,Memory: 30.57 GiB
Nanny: tls://10.4.12.198:40305,
Local directory: /dask-worker-space/worker-yh2nk0o7,Local directory: /dask-worker-space/worker-yh2nk0o7

0,1
Comm: tls://10.4.11.25:40501,Total threads: 8
Dashboard: http://10.4.11.25:43869/status,Memory: 30.57 GiB
Nanny: tls://10.4.11.25:42699,
Local directory: /dask-worker-space/worker-xrcwruoc,Local directory: /dask-worker-space/worker-xrcwruoc

0,1
Comm: tls://10.4.7.181:43013,Total threads: 8
Dashboard: http://10.4.7.181:41617/status,Memory: 30.57 GiB
Nanny: tls://10.4.7.181:34943,
Local directory: /dask-worker-space/worker-9qtubmx0,Local directory: /dask-worker-space/worker-9qtubmx0

0,1
Comm: tls://10.4.12.107:33897,Total threads: 8
Dashboard: http://10.4.12.107:34017/status,Memory: 30.57 GiB
Nanny: tls://10.4.12.107:38119,
Local directory: /dask-worker-space/worker-y7yznl75,Local directory: /dask-worker-space/worker-y7yznl75

0,1
Comm: tls://10.4.2.164:36675,Total threads: 8
Dashboard: http://10.4.2.164:35005/status,Memory: 30.57 GiB
Nanny: tls://10.4.2.164:34611,
Local directory: /dask-worker-space/worker-1kc1sw11,Local directory: /dask-worker-space/worker-1kc1sw11

0,1
Comm: tls://10.4.6.230:36053,Total threads: 8
Dashboard: http://10.4.6.230:39645/status,Memory: 30.57 GiB
Nanny: tls://10.4.6.230:40163,
Local directory: /dask-worker-space/worker-ly91a9g0,Local directory: /dask-worker-space/worker-ly91a9g0

0,1
Comm: tls://10.4.1.224:37815,Total threads: 8
Dashboard: http://10.4.1.224:35321/status,Memory: 30.57 GiB
Nanny: tls://10.4.1.224:35421,
Local directory: /dask-worker-space/worker-_ve8nabi,Local directory: /dask-worker-space/worker-_ve8nabi

0,1
Comm: tls://10.4.15.70:40647,Total threads: 8
Dashboard: http://10.4.15.70:46261/status,Memory: 30.74 GiB
Nanny: tls://10.4.15.70:37111,
Local directory: /dask-worker-space/worker-1pd4k3s9,Local directory: /dask-worker-space/worker-1pd4k3s9

0,1
Comm: tls://10.4.8.175:45195,Total threads: 8
Dashboard: http://10.4.8.175:33607/status,Memory: 30.57 GiB
Nanny: tls://10.4.8.175:43545,
Local directory: /dask-worker-space/worker-fwpoc323,Local directory: /dask-worker-space/worker-fwpoc323

0,1
Comm: tls://10.4.7.170:41967,Total threads: 8
Dashboard: http://10.4.7.170:36947/status,Memory: 30.74 GiB
Nanny: tls://10.4.7.170:42185,
Local directory: /dask-worker-space/worker-345zzbmk,Local directory: /dask-worker-space/worker-345zzbmk

0,1
Comm: tls://10.4.8.181:34413,Total threads: 8
Dashboard: http://10.4.8.181:37353/status,Memory: 30.74 GiB
Nanny: tls://10.4.8.181:44721,
Local directory: /dask-worker-space/worker-hae2i9r1,Local directory: /dask-worker-space/worker-hae2i9r1

0,1
Comm: tls://10.4.7.210:33877,Total threads: 8
Dashboard: http://10.4.7.210:44665/status,Memory: 31.01 GiB
Nanny: tls://10.4.7.210:40735,
Local directory: /dask-worker-space/worker-pwpa3dev,Local directory: /dask-worker-space/worker-pwpa3dev

0,1
Comm: tls://10.4.10.202:40695,Total threads: 8
Dashboard: http://10.4.10.202:33367/status,Memory: 30.74 GiB
Nanny: tls://10.4.10.202:33425,
Local directory: /dask-worker-space/worker-jpnvcvqk,Local directory: /dask-worker-space/worker-jpnvcvqk

0,1
Comm: tls://10.4.9.183:36659,Total threads: 8
Dashboard: http://10.4.9.183:36113/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.183:36059,
Local directory: /dask-worker-space/worker-e4p1d7x8,Local directory: /dask-worker-space/worker-e4p1d7x8

0,1
Comm: tls://10.4.13.110:34119,Total threads: 8
Dashboard: http://10.4.13.110:40253/status,Memory: 30.57 GiB
Nanny: tls://10.4.13.110:33609,
Local directory: /dask-worker-space/worker-0oe202fp,Local directory: /dask-worker-space/worker-0oe202fp

0,1
Comm: tls://10.4.1.38:43451,Total threads: 8
Dashboard: http://10.4.1.38:41813/status,Memory: 30.57 GiB
Nanny: tls://10.4.1.38:36437,
Local directory: /dask-worker-space/worker-k_bvfgu8,Local directory: /dask-worker-space/worker-k_bvfgu8

0,1
Comm: tls://10.4.1.231:46157,Total threads: 8
Dashboard: http://10.4.1.231:33279/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.231:46843,
Local directory: /dask-worker-space/worker-00r29qsz,Local directory: /dask-worker-space/worker-00r29qsz

0,1
Comm: tls://10.4.15.166:42459,Total threads: 8
Dashboard: http://10.4.15.166:43177/status,Memory: 30.57 GiB
Nanny: tls://10.4.15.166:41221,
Local directory: /dask-worker-space/worker-3s3zrgr0,Local directory: /dask-worker-space/worker-3s3zrgr0

0,1
Comm: tls://10.4.4.87:36541,Total threads: 8
Dashboard: http://10.4.4.87:41407/status,Memory: 30.74 GiB
Nanny: tls://10.4.4.87:39901,
Local directory: /dask-worker-space/worker-x9s0q3m8,Local directory: /dask-worker-space/worker-x9s0q3m8

0,1
Comm: tls://10.4.5.53:38227,Total threads: 8
Dashboard: http://10.4.5.53:37465/status,Memory: 31.01 GiB
Nanny: tls://10.4.5.53:37965,
Local directory: /dask-worker-space/worker-0n1ql_8w,Local directory: /dask-worker-space/worker-0n1ql_8w

0,1
Comm: tls://10.4.6.35:40573,Total threads: 8
Dashboard: http://10.4.6.35:39771/status,Memory: 30.57 GiB
Nanny: tls://10.4.6.35:41935,
Local directory: /dask-worker-space/worker-85a12fo6,Local directory: /dask-worker-space/worker-85a12fo6

0,1
Comm: tls://10.4.1.66:34905,Total threads: 8
Dashboard: http://10.4.1.66:46279/status,Memory: 30.57 GiB
Nanny: tls://10.4.1.66:34367,
Local directory: /dask-worker-space/worker-lykion0w,Local directory: /dask-worker-space/worker-lykion0w

0,1
Comm: tls://10.4.11.249:35239,Total threads: 8
Dashboard: http://10.4.11.249:39685/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.249:40949,
Local directory: /dask-worker-space/worker-ufi8ijz6,Local directory: /dask-worker-space/worker-ufi8ijz6

0,1
Comm: tls://10.4.15.216:36957,Total threads: 8
Dashboard: http://10.4.15.216:37407/status,Memory: 30.57 GiB
Nanny: tls://10.4.15.216:39817,
Local directory: /dask-worker-space/worker-wgdn7wsa,Local directory: /dask-worker-space/worker-wgdn7wsa

0,1
Comm: tls://10.4.1.215:33867,Total threads: 8
Dashboard: http://10.4.1.215:39209/status,Memory: 30.57 GiB
Nanny: tls://10.4.1.215:43265,
Local directory: /dask-worker-space/worker-js51li86,Local directory: /dask-worker-space/worker-js51li86

0,1
Comm: tls://10.4.13.13:40035,Total threads: 8
Dashboard: http://10.4.13.13:35427/status,Memory: 30.57 GiB
Nanny: tls://10.4.13.13:33655,
Local directory: /dask-worker-space/worker-rhshc3dk,Local directory: /dask-worker-space/worker-rhshc3dk


### Fit Model with Dask

In [10]:
import joblib

In [23]:
%%time
# Same fit as the local baseline, but inside this context joblib hands its
# parallel work to the Dask cluster instead of local workers.
with joblib.parallel_backend(backend="dask"):
    clf.fit(X, y)

CPU times: user 1.93 s, sys: 601 ms, total: 2.53 s
Wall time: 1min 1s


## GridSearch CV

In [11]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [12]:
# Hyperparameter grid for the random forest.
# 1 x 2 x 2 x 2 x 2 x 4 = 64 candidate combinations in total.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90],
    max_features=[2, 3],
    min_samples_leaf=[3, 4],
    min_samples_split=[5, 10],
    n_estimators=[100, 200, 300, 1000],
)

In [13]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, using `clf`
# as the base estimator. n_jobs=-1 runs the candidate fits in parallel on
# whichever joblib backend is active when .fit() is called.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)

In [12]:
# Ask the Coiled cluster for 100 workers before starting the grid search
# (the client summary earlier in the notebook reported 35 workers connected).
cluster.scale(100)

In [15]:
%%time
# Run the grid search on the Dask cluster. X and y are large and are passed to
# every fit-and-score task, so pre-scatter them to the workers once; the tasks
# then refer to the copies already on the cluster instead of shipping the
# arrays along with each task.
with joblib.parallel_backend("dask", scatter=[X, y]):
    grid_search.fit(X, y)

TypeError: 'CancelledError' object is not iterable


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


### Run GridSearch Locally

In [None]:
%%time
# Local comparison run: the same search outside any `parallel_backend` context,
# so joblib falls back to its default (local) backend.
grid_search.fit(X, y)