# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter-optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads took a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  Also something in Coiled to allow package_sync to be thread-safe, should be released by 2022-12-07

In [1]:
# Parquet feature table on S3 that every training run reads.
FILEPATH = "s3://prefect-dask-examples/nyc-uber-lyft/feature_table_fixed_upper_bound.parquet"

In [2]:
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

from distributed import Client
import dask.dataframe as dd
from coiled import Cluster
import coiled

import optuna
from dask_ml.metrics import mean_squared_error as lazy_mse
import xgboost as xgb
from xgboost.dask import DaskDMatrix

from dask_ml.datasets import make_classification_df
from dask_ml.model_selection import train_test_split, KFold
from dask_ml.preprocessing import OneHotEncoder
import dask.array as da
import dask.dataframe as dd
from s3fs import S3FileSystem
from xgboost.core import XGBoostError
import numpy as np

import pandas as pd

In [3]:
import dask
import coiled

# Report the versions of the libraries this notebook was run against.
print(f"coiled: {coiled.__version__}")
print(f"dask: {dask.__version__}")
print(f"dask.distributed: {dask.distributed.__version__}")
print(f"optuna: {optuna.__version__}")
print(f"xgboost: {xgb.__version__}")
print(f"coiled: {coiled.__version__}")

coiled: 0.2.58
dask: 2022.12.1
dask.distributed: 2022.12.1
optuna: 3.1.0.dev
xgboost: 1.7.2
coiled: 0.2.58


### Train Model

In [4]:
# Here we subset data for cross-validation

def _make_cv(df, num_folds):
    frac = [1 / num_folds]*num_folds
    splits = df.random_split(frac, shuffle=True)
    for i in range(num_folds):
        train = [splits[j] for j in range(num_folds) if j != i]
        test = splits[i]
        yield train, test


In [5]:
def train_model(trial_number, study_params, n_splits=5, cluster_name=None, teardown_cluster=False):
    """Cross-validate one XGBoost configuration on a Coiled Dask cluster.

    Parameters
    ----------
    trial_number : int
        Number of the calling trial; only used to label log messages.
    study_params : dict
        XGBoost parameters, merged over the fixed base parameters
        (``verbosity``, ``tree_method``, ``objective``).
    n_splits : int
        Number of cross-validation folds.
    cluster_name : str, optional
        Name passed to ``coiled.Cluster``.  When omitted, a name derived from
        the calling thread's id is used, so each thread of a pool addresses
        its own cluster.
    teardown_cluster : bool
        When true, shut the cluster down once the run ends (successfully or
        not).  By default the cluster is left running for reuse.

    Returns
    -------
    The mean of the per-fold RMSE scores.  Folds that raise ``XGBoostError``
    are logged and skipped; if every fold is skipped, ``np.mean`` of the
    empty score list is returned (NaN).

    Raises
    ------
    RuntimeError
        If a categorical column does not have known categories.  The cluster
        is shut down in this case.
    """
    if cluster_name is None:
        thread_id = threading.get_ident()
        cluster_name = "xgb-nyc-taxi-" + str(thread_id)

    cluster = coiled.Cluster(
        worker_vm_types=["m6i.4xlarge"],
        scheduler_vm_types=["m6i.2xlarge"],
        package_sync=True,  # copy local packages
        name=cluster_name,
        shutdown_on_close=False,  # reuse cluster across runs
        show_widget=False,
        n_workers=10,
        use_best_zone=True,
        account="dask-engineering",
        backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True},
    )

    # Single shutdown point: set when the caller asked for it, or when the
    # data fails validation below.
    shutdown_cluster = teardown_cluster
    val_scores = []

    print("starting run")
    try:
        with Client(cluster) as client:
            print(client.id)
            with client.as_current():

                # Load and pre-process the DataFrame
                ddf = dd.read_parquet(FILEPATH)
                ddf = ddf.repartition(partition_size="128MB").persist()
                # Work with the column *names*; the categorical sub-frame
                # itself is not needed.
                categorical_vars = ddf.select_dtypes(include="category").columns.tolist()
                ddf = ddf.categorize(columns=categorical_vars)

                for i, (train, test) in enumerate(_make_cv(ddf, n_splits)):
                    print(f"Starting training run {i}")
                    train = dd.concat(train)

                    # Explicit check instead of `assert`, which is stripped
                    # when Python runs with -O.
                    unknown = [
                        c for c in categorical_vars
                        if not (train[c].cat.known and test[c].cat.known)
                    ]
                    if unknown:
                        shutdown_cluster = True
                        raise RuntimeError(f"Categorical_vars are not known: {unknown}")

                    # Make the training data
                    y_train = train['trip_time'].to_frame()
                    y_train = y_train.astype(float).persist()
                    X_train = train.drop(columns=['trip_time']).persist()

                    # Make the test data
                    y_test = test['trip_time'].to_frame().astype(float).persist()
                    X_test = test.drop(columns=['trip_time']).persist()

                    try:
                        print("Make dtrain")
                        dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

                        print("Make dtest")
                        dtest = DaskDMatrix(client, X_test, y_test, enable_categorical=True)

                        print("Training model")

                        # NOTE(review): study_params may carry 'n_estimators';
                        # the number of boosting rounds here is fixed by
                        # num_boost_round -- confirm this is intended.
                        model = xgb.dask.train(
                            client,
                            {
                                'verbosity': 2,
                                'tree_method': 'hist',
                                "objective": "reg:squarederror",
                                **study_params,
                            },
                            dtrain,
                            num_boost_round=4,
                            evals=[(dtrain, "train")],
                        )

                        print("Make predictions")
                        # It's faster to run the prediction directly on X_test DataFrame
                        # We also need to confirm that predictions on dtest when it
                        # contains categoricals performs as expected
                        predictions = xgb.dask.predict(client, model, X_test)

                        print("Score the model")
                        # squared=False makes this a root-mean-squared error.
                        score = lazy_mse(
                            y_test.astype("float32").to_dask_array(lengths=True).reshape(-1,),
                            predictions.to_dask_array(lengths=True),
                            squared=False,
                        )
                        print(f"rmse_score:  {score}")
                        val_scores.append(score)
                        print(f"val_scores:  {val_scores}")

                    except XGBoostError as e:
                        # Best effort: a failed fold is reported and skipped.
                        print(f"Trial {trial_number}, fold {i} failed with {e}")
    finally:
        if shutdown_cluster:
            cluster.shutdown()

    return np.mean(val_scores)


In [6]:
# Options forwarded to train_model by the Optuna objective.
train_options = {
    "n_splits": 5,
}

In [7]:
def objective(trial):
    """Optuna objective: sample XGBoost parameters and return the CV RMSE.

    The sampled parameters are passed to ``train_model`` together with the
    trial number and the fold count from ``train_options``; the value it
    returns (mean RMSE over the folds) is what Optuna optimizes.
    """
    # NOTE(review): the sampling and regularization ranges start at 0 --
    # confirm the lower bounds are valid for XGBoost.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 75, 125),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.6),
        'subsample': trial.suggest_float('subsample', 0, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    rmse = train_model(
        trial_number=trial.number,
        study_params=params,
        n_splits=train_options["n_splits"],
    )
    # The value is a root-mean-squared error, so label it as such.
    print(f"final rmse:  {rmse}")
    return rmse

In [8]:
# One shared in-memory study; every pool thread adds its trials to it.
study = optuna.create_study(study_name="nyc-taxi-study")

executor = ThreadPoolExecutor(max_workers=4)

# Queue 20 optimize calls of 3 trials each; the futures are collected but
# not waited on here.
futures = []
for _ in range(20):
    futures.append(executor.submit(study.optimize, objective, n_trials=3))

[32m[I 2023-01-02 20:30:30,622][0m A new study created in memory with name: nyc-taxi-study[0m


Collecting git+https://github.com/optuna/optuna.git@e8a010bb58aea943866e5f7addf0de953228de99
  Cloning https://github.com/optuna/optuna.git (to revision e8a010bb58aea943866e5f7addf0de953228de99) to /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-pkqzhdxt


  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-pkqzhdxt
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=5d89ef0b74f9b336d86f0b718acfae88cdf56ee1a5046c3c3b78eb2269a82e36
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-c1vxl3tg/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna
Collecting git+https://github.com/op

  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-7c54crbl
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=747f42de9ba4f4c2cadd1df4c5657adab63841fd15ac6214c4ade49cc2da33d9
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-htn1t_iv/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna
Collecting git+https://github.com/op

  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-t5m0b31m
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=fcbcb647c53ddeb8cee8937d09a7f6558a0665bf4a4443c16ae9ee6c07d2a914
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-krawzqxr/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna
Collecting git+https://github.com/op

  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-z9h8l1_j
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=9d0f51b104c6c3936e93be656b2bb8367e2c2b3fa99a6dac51a681aeadd51b39
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-d6zqricm/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna
starting run
Client-d72198ea-8b0e-11

[32m[I 2023-01-02 20:41:16,291][0m Trial 3 finished with value: 636.1239013671875 and parameters: {'n_estimators': 96, 'learning_rate': 0.48429041139580564, 'subsample': 0.9280380978294209, 'max_depth': 5, 'colsample_bytree': 0.03277630897332473, 'min_child_weight': 2, 'colsample_bynode': 0.29413793703398705, 'colsample_bylevel': 0.00846779634375594, 'reg_alpha': 0.1909365213154109, 'reg_lambda': 0.7640935114729467}. Best is trial 3 with value: 636.1239013671875.[0m


rmse_score:  636.0891723632812
val_scores:  [636.1154, 636.06714, 636.20856, 636.1394, 636.0892]
final mse:  636.1239013671875
starting run
Client-1a44ee8c-8b10-11ed-8ab7-1a6d039d6526
Make dtest
Training model
Make predictions
Score the model
Starting training run 0
Make dtrain


[32m[I 2023-01-02 20:41:42,939][0m Trial 2 finished with value: 591.5584716796875 and parameters: {'n_estimators': 80, 'learning_rate': 0.28333529382175426, 'subsample': 0.5725751047647893, 'max_depth': 4, 'colsample_bytree': 0.9206774403003845, 'min_child_weight': 1, 'colsample_bynode': 0.8857567484851185, 'colsample_bylevel': 0.15305225009867907, 'reg_alpha': 0.06682705721655752, 'reg_lambda': 0.6424118821965061}. Best is trial 2 with value: 591.5584716796875.[0m


rmse_score:  591.5662231445312
val_scores:  [591.5572, 591.57446, 591.5185, 591.5761, 591.5662]
final mse:  591.5584716796875
starting run
Client-2a10d13c-8b10-11ed-8ab7-1a6d039d6526
Make predictions
Make dtest
Score the model
Training model
Starting training run 0
Make dtrain


[32m[I 2023-01-02 20:42:14,961][0m Trial 1 finished with value: 773.958740234375 and parameters: {'n_estimators': 110, 'learning_rate': 0.15431343742264633, 'subsample': 0.28663057117093416, 'max_depth': 7, 'colsample_bytree': 0.19614662484903378, 'min_child_weight': 1, 'colsample_bynode': 0.4113592955603219, 'colsample_bylevel': 0.4229011647903148, 'reg_alpha': 0.2828606270344499, 'reg_lambda': 0.6190043954467033}. Best is trial 2 with value: 591.5584716796875.[0m


rmse_score:  773.9912719726562
val_scores:  [774.0113, 773.8255, 773.99854, 773.967, 773.9913]
final mse:  773.958740234375
starting run
Client-3d16ff9a-8b10-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Make dtest
Training model


[32m[I 2023-01-02 20:42:33,175][0m Trial 0 finished with value: 464.9927673339844 and parameters: {'n_estimators': 88, 'learning_rate': 0.45746128547941023, 'subsample': 0.9379880803034546, 'max_depth': 8, 'colsample_bytree': 0.7491254882380097, 'min_child_weight': 3, 'colsample_bynode': 0.00039622352757640034, 'colsample_bylevel': 0.5246962114530576, 'reg_alpha': 0.2551925624084617, 'reg_lambda': 0.12328529427008306}. Best is trial 0 with value: 464.9927673339844.[0m


rmse_score:  458.21014404296875
val_scores:  [434.36697, 529.71344, 431.86838, 470.80484, 458.21014]
final mse:  464.9927673339844
starting run
Client-48174f9e-8b10-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Make predictions
Score the model
Starting training run 0
Make dtrain
rmse_score:  836.4600830078125
val_scores:  [836.4601]
Starting training run 1
Make dtrain
Make dtest
Training model
Make predictions
Score the model
rmse_score:  597.5789794921875
val_scores:  [597.579]
Starting training run 1
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
rmse_score:  724.9520874023438
val_scores:  [724.9521]
Starting training run 1
Make dtrain
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  836.3684692382812
val_scores:  [836.4601, 836.36847]
Starting training run 2
Make dtrain
rmse_score:  512.0010986328125
val_scores:  [512.0011]
Starting training run 1
Make dtrain
Make d

[32m[I 2023-01-02 20:50:02,544][0m Trial 4 finished with value: 836.3922119140625 and parameters: {'n_estimators': 84, 'learning_rate': 0.1595100130762319, 'subsample': 0.3982349772252106, 'max_depth': 5, 'colsample_bytree': 0.10020675673279278, 'min_child_weight': 2, 'colsample_bynode': 0.8991488510080123, 'colsample_bylevel': 0.7569028125171563, 'reg_alpha': 0.37385905745218195, 'reg_lambda': 0.7127785227357619}. Best is trial 0 with value: 464.9927673339844.[0m


rmse_score:  836.42919921875
val_scores:  [836.4601, 836.36847, 836.39575, 836.3074, 836.4292]
final mse:  836.3922119140625
Make predictions
Make predictions
Score the model
Score the model
starting run
rmse_score:  724.79736328125
val_scores:  [724.9521, 724.8096, 724.98816, 724.79736]
Starting training run 4
Make dtrain
Client-54553234-8b11-11ed-8ab7-1a6d039d6526
rmse_score:  577.6465454101562
val_scores:  [512.0011, 512.5899, 512.69604, 577.64655]
Starting training run 4
Make dtrain


[32m[I 2023-01-02 20:50:18,254][0m Trial 5 finished with value: 596.3497314453125 and parameters: {'n_estimators': 106, 'learning_rate': 0.5444188885472007, 'subsample': 0.012558259938552863, 'max_depth': 4, 'colsample_bytree': 0.39105994972239977, 'min_child_weight': 2, 'colsample_bynode': 0.5214643896628075, 'colsample_bylevel': 0.4367816065304412, 'reg_alpha': 0.23702398981027134, 'reg_lambda': 0.9183258418277093}. Best is trial 0 with value: 464.9927673339844.[0m


rmse_score:  595.417236328125
val_scores:  [597.579, 596.56287, 595.44977, 596.73987, 595.41724]
final mse:  596.3497314453125
starting run
Client-5d133dda-8b11-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  725.028076171875
val_scores:  [724.9521, 724.8096, 724.98816, 724.79736, 725.0281]


[32m[I 2023-01-02 20:52:08,965][0m Trial 6 finished with value: 724.9151000976562 and parameters: {'n_estimators': 118, 'learning_rate': 0.2518366047176144, 'subsample': 0.06415601813399252, 'max_depth': 8, 'colsample_bytree': 0.12211094102001585, 'min_child_weight': 2, 'colsample_bynode': 0.1605665053236749, 'colsample_bylevel': 0.04068780278323203, 'reg_alpha': 0.2209636416286832, 'reg_lambda': 0.9273652109410293}. Best is trial 0 with value: 464.9927673339844.[0m


final mse:  724.9151000976562
starting run


[32m[I 2023-01-02 20:52:13,440][0m Trial 7 finished with value: 525.5382080078125 and parameters: {'n_estimators': 88, 'learning_rate': 0.34852101187603235, 'subsample': 0.014308218729476518, 'max_depth': 5, 'colsample_bytree': 0.4586078244036227, 'min_child_weight': 2, 'colsample_bynode': 0.7296590543606478, 'colsample_bylevel': 0.65592353387077, 'reg_alpha': 0.06249602601150556, 'reg_lambda': 0.018161651590688166}. Best is trial 0 with value: 464.9927673339844.[0m


rmse_score:  512.7572631835938
val_scores:  [512.0011, 512.5899, 512.69604, 577.64655, 512.75726]
Client-9f9c8922-8b11-11ed-8ab7-1a6d039d6526
final mse:  525.5382080078125
starting run
Client-a1fa4ea2-8b11-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  557.3681640625
val_scores:  [557.36816]
Starting training run 1
Make dtrain
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
rmse_score:  690.3851928710938
val_scores:  [690.3852]
Starting training run 1
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
rmse_score:  557.553955078125
val_scores:  [557.36816, 557.55396]
Starting training run 2
Make dtrain
Make predictions
Score the model
Make predictions
Score the model
Make predictions
rmse_score:  659.3095703125
val_scores:  [659.3096]
Starting training run 1
Score the model
Make dtrain
Make dtest
rmse_score:  690

[32m[I 2023-01-02 20:59:16,487][0m Trial 9 finished with value: 557.5410766601562 and parameters: {'n_estimators': 125, 'learning_rate': 0.481918893228928, 'subsample': 0.6619127852155322, 'max_depth': 8, 'colsample_bytree': 0.32776482588545086, 'min_child_weight': 2, 'colsample_bynode': 0.9686391784525165, 'colsample_bylevel': 0.18125315814950593, 'reg_alpha': 0.26622315288849, 'reg_lambda': 0.17786465156778997}. Best is trial 0 with value: 464.9927673339844.[0m


final mse:  557.5410766601562
starting run
Client-9eb8684a-8b12-11ed-8ab7-1a6d039d6526
rmse_score:  659.267333984375
val_scores:  [659.3096, 659.13696, 659.26733]
Starting training run 3
Make dtrain
Make dtest
Training model
Make dtest
Training model
Starting training run 0
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model


[32m[I 2023-01-02 21:00:40,718][0m Trial 8 finished with value: 689.6456298828125 and parameters: {'n_estimators': 75, 'learning_rate': 0.2268558746786056, 'subsample': 0.6874537367056962, 'max_depth': 9, 'colsample_bytree': 0.7690191792990109, 'min_child_weight': 3, 'colsample_bynode': 0.11916333279385816, 'colsample_bylevel': 0.36861769468907013, 'reg_alpha': 0.1820550115730848, 'reg_lambda': 0.3068604007916611}. Best is trial 0 with value: 464.9927673339844.[0m


rmse_score:  687.1214599609375
val_scores:  [690.3852, 690.7855, 698.89844, 681.0373, 687.12146]
final mse:  689.6456298828125
Make predictions
Score the model
starting run
Client-d09b1e52-8b12-11ed-8ab7-1a6d039d6526
rmse_score:  601.2316284179688
val_scores:  [655.0987, 698.3666, 660.7486, 601.2316]
Starting training run 4
Make dtrain
Make predictions
Score the model
Starting training run 0
Make dtrain
Make predictions
Score the model
rmse_score:  659.3897094726562
val_scores:  [659.3096, 659.13696, 659.26733, 659.3897]
Starting training run 4
Make dtrain
rmse_score:  411.0120849609375
val_scores:  [411.0121]
Starting training run 1
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  700.4060668945312
val_scores:  [655.0987, 698.3666, 660.7486, 601.2316, 700.40607]


[32m[I 2023-01-02 21:03:06,220][0m Trial 10 finished with value: 663.1702880859375 and parameters: {'n_estimators': 97, 'learning_rate': 0.20629912441245016, 'subsample': 0.5330199752052417, 'max_depth': 9, 'colsample_bytree': 0.4576896473337655, 'min_child_weight': 1, 'colsample_bynode': 0.9182838710203303, 'colsample_bylevel': 0.6963234424853249, 'reg_alpha': 0.19579227231436613, 'reg_lambda': 0.09507620562299102}. Best is trial 0 with value: 464.9927673339844.[0m


final mse:  663.1702880859375
starting run
rmse_score:  561.0812377929688
val_scores:  [561.08124]
Starting training run 1
Make dtrain
Make predictions
Client-27a8f1e2-8b13-11ed-8ab7-1a6d039d6526
Score the model
rmse_score:  415.9669189453125
val_scores:  [411.0121, 415.96692]
Starting training run 2
Make dtrain
Starting training run 0
Make dtrain
Make predictions
Score the model
Make dtest
Training model
rmse_score:  659.2091674804688
val_scores:  [659.3096, 659.13696, 659.26733, 659.3897, 659.20917]


[32m[I 2023-01-02 21:03:57,085][0m Trial 11 finished with value: 659.2625732421875 and parameters: {'n_estimators': 101, 'learning_rate': 0.3724674364658853, 'subsample': 0.2808878968711631, 'max_depth': 9, 'colsample_bytree': 0.04732514390634723, 'min_child_weight': 2, 'colsample_bynode': 0.1774204454070526, 'colsample_bylevel': 0.6069068282342884, 'reg_alpha': 0.012258163243724973, 'reg_lambda': 0.6382248824429771}. Best is trial 0 with value: 464.9927673339844.[0m


final mse:  659.2625732421875
starting run
Client-4798fcd6-8b13-11ed-8ab7-1a6d039d6526
Make dtest
Training model
Make dtest
Training model
Starting training run 0
Make dtrain
Make predictions
Score the model
rmse_score:  494.2613525390625
val_scores:  [561.08124, 494.26135]
Starting training run 2
Make dtrain
Make dtest
Training model
Make predictions
Score the model
Make dtest
Training model
rmse_score:  405.0583190917969
val_scores:  [411.0121, 415.96692, 405.05832]
Starting training run 3
Make dtrain
Make predictions
Score the model
rmse_score:  415.52239990234375
val_scores:  [415.5224]
Starting training run 1
Make dtrain
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  444.35479736328125
val_scores:  [444.3548]
Starting training run 1
Make dtrain
Make dtest
Training model
rmse_score:  494.506103515625
val_scores:  [561.08124, 494.26135, 494.5061]
Starting training run 3
Make dtrain
Make dtest
Make predictions
Training model
S

[32m[I 2023-01-02 21:08:33,632][0m Trial 12 finished with value: 459.183837890625 and parameters: {'n_estimators': 102, 'learning_rate': 0.4659181769277019, 'subsample': 0.011187601709420014, 'max_depth': 8, 'colsample_bytree': 0.5723377763095354, 'min_child_weight': 3, 'colsample_bynode': 0.8348220542144026, 'colsample_bylevel': 0.8231506420457397, 'reg_alpha': 0.47481907843465326, 'reg_lambda': 0.7997133480684852}. Best is trial 12 with value: 459.183837890625.[0m


final mse:  459.183837890625
Make predictions
Score the model
starting run
Client-eafe7a04-8b13-11ed-8ab7-1a6d039d6526
rmse_score:  415.0246887207031
val_scores:  [415.5224, 415.5791, 415.0247]
Starting training run 3
Make dtrain
Starting training run 0
Make dtrain
Make dtest
Make predictions
Score the model
Make predictions
Score the model
Training model
rmse_score:  444.84600830078125
val_scores:  [444.3548, 444.84573, 444.846]
Starting training run 3
Make dtest
Make dtrain
Training model


[32m[I 2023-01-02 21:09:30,963][0m Trial 13 finished with value: 507.80023193359375 and parameters: {'n_estimators': 95, 'learning_rate': 0.4134475728487026, 'subsample': 0.9976452265228007, 'max_depth': 7, 'colsample_bytree': 0.6920938124973038, 'min_child_weight': 3, 'colsample_bynode': 0.0024068930444446357, 'colsample_bylevel': 0.9918281340527958, 'reg_alpha': 0.49472442548769696, 'reg_lambda': 0.37785308864804823}. Best is trial 12 with value: 459.183837890625.[0m


rmse_score:  494.56817626953125
val_scores:  [561.08124, 494.26135, 494.5061, 494.58435, 494.56818]
final mse:  507.80023193359375
starting run
Client-0c497e16-8b14-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Make dtest
Training model
Make predictions
Score the model
Make dtest
Training model
rmse_score:  415.40606689453125
val_scores:  [415.5224, 415.5791, 415.0247, 415.40607]
Starting training run 4
Make predictions
Make dtrain
Score the model
rmse_score:  360.0314025878906
val_scores:  [360.0314]
Starting training run 1
Make dtrain
Make predictions
Score the model
Make dtest
rmse_score:  444.45416259765625
val_scores:  [444.3548, 444.84573, 444.846, 444.45416]
Starting training run 4
Make dtrain
Training model
Make dtest
Training model
Make predictions
Score the model
rmse_score:  498.3792724609375
val_scores:  [498.37927]
Starting training run 1
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the

[32m[I 2023-01-02 21:12:22,473][0m Trial 14 finished with value: 415.4104919433594 and parameters: {'n_estimators': 87, 'learning_rate': 0.3787669061229719, 'subsample': 0.9915480658296956, 'max_depth': 6, 'colsample_bytree': 0.6524444718118151, 'min_child_weight': 3, 'colsample_bynode': 0.6626537450967072, 'colsample_bylevel': 0.9996770084906302, 'reg_alpha': 0.013778418904812656, 'reg_lambda': 0.014774526513788022}. Best is trial 14 with value: 415.4104919433594.[0m


final mse:  415.4104919433594


[33m[W 2023-01-02 21:12:24,885][0m Trial 16 failed with parameters: {'n_estimators': 90, 'learning_rate': 0.4241225131512978, 'subsample': 0.9857993645602374, 'max_depth': 7, 'colsample_bytree': 0.6851670758292807, 'min_child_weight': 3, 'colsample_bynode': 0.6276066959140595, 'colsample_bylevel': 0.99637042707859, 'reg_alpha': 0.4809702269397312, 'reg_lambda': 0.4369595716893648} because of the following error: AttributeError("'Future' object has no attribute 'astype'").[0m
Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/optuna/study/_optimize.py", line 199, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/ipykernel_35511/1928247946.py", line 14, in objective
    rmse = train_model(
  File "/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/ipykernel_35511/2400131305.py", line 83, in train_model
    score = lazy_mse(y_test.astype("float32").to_

starting run
Client-728e651a-8b14-11ed-8ab7-1a6d039d6526
starting run
Client-73bfd716-8b14-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
rmse_score:  444.635986328125
val_scores:  [444.3548, 444.84573, 444.846, 444.45416, 444.636]


[32m[I 2023-01-02 21:12:40,160][0m Trial 15 finished with value: 444.6273498535156 and parameters: {'n_estimators': 89, 'learning_rate': 0.39516672276220277, 'subsample': 0.9358579531325141, 'max_depth': 6, 'colsample_bytree': 0.7055029652680684, 'min_child_weight': 3, 'colsample_bynode': 0.6686440026436333, 'colsample_bylevel': 0.9446045839184026, 'reg_alpha': 0.4987914021341327, 'reg_lambda': 0.34905926654230834}. Best is trial 14 with value: 415.4104919433594.[0m


final mse:  444.6273498535156
starting run
Client-7ce2432e-8b14-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Make predictions
Score the model
Starting training run 0
Make dtrain
rmse_score:  497.9462585449219
val_scores:  [498.37927, 497.94626]
Starting training run 2
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  349.0849914550781
val_scores:  [349.085]
Starting training run 1
rmse_score:  349.8620910644531
val_scores:  [349.8621]
Starting training run 1
Make dtrain
Make dtrain
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  422.227783203125
val_scores:  [422.22778]
Starting training run 1
Make dtrain
rmse_score:  498.10430908203125
val_scores:  [498.37927, 497.94626, 498.1043]
Starting training run 3
Make dtrain
Make dtest
Training model
Make dtest
Training m

[32m[I 2023-01-02 21:18:55,012][0m Trial 17 finished with value: 498.2499084472656 and parameters: {'n_estimators': 89, 'learning_rate': 0.5809842839081358, 'subsample': 0.8241635451667542, 'max_depth': 7, 'colsample_bytree': 0.6532446321323719, 'min_child_weight': 3, 'colsample_bynode': 0.629187423319895, 'colsample_bylevel': 0.879899169598678, 'reg_alpha': 0.49739368995385513, 'reg_lambda': 0.4736245975724719}. Best is trial 14 with value: 415.4104919433594.[0m


rmse_score:  498.2366027832031
val_scores:  [498.37927, 497.94626, 498.1043, 498.58292, 498.2366]
final mse:  498.2499084472656
starting run
Client-5d1c4ad4-8b15-11ed-8ab7-1a6d039d6526
rmse_score:  423.088134765625
val_scores:  [422.22778, 423.17883, 423.08813]
Starting training run 3
Make dtrain
Starting training run 0
Make dtrain
Make dtest
Make dtest
Training model
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  349.3538818359375
val_scores:  [349.8621, 349.11227, 349.02823, 349.35388]
Starting training run 4
Make dtrain
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  349.64013671875
val_scores:  [349.085, 349.6649, 349.738, 349.64014]
Starting training run 4
Make dtrain
rmse_score:  422.1744384765625
val_scores:  [422.22778, 423.17883, 423.08813, 422.17444]
Starting training run 4
Make dtrain
rmse_score:  416.4865417480469
val_scores:  [416.48654]
Startin

[32m[I 2023-01-02 21:22:45,497][0m Trial 19 finished with value: 349.36517333984375 and parameters: {'n_estimators': 111, 'learning_rate': 0.5973213766032188, 'subsample': 0.7937042358727737, 'max_depth': 6, 'colsample_bytree': 0.6256120671859474, 'min_child_weight': 3, 'colsample_bynode': 0.6719821020977526, 'colsample_bylevel': 0.9848329120293066, 'reg_alpha': 0.49601346192828766, 'reg_lambda': 0.4768295671695967}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  349.4694519042969
val_scores:  [349.8621, 349.11227, 349.02823, 349.35388, 349.46945]
final mse:  349.36517333984375
Make predictions
Score the model
Make predictions
Score the model
starting run
Client-e6c68146-8b15-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
rmse_score:  416.3855285644531
val_scores:  [416.48654, 416.38553]
Starting training run 2
Make dtrain
rmse_score:  422.2442932128906
val_scores:  [422.22778, 423.17883, 423.08813, 422.17444, 422.2443]


[32m[I 2023-01-02 21:23:02,353][0m Trial 20 finished with value: 422.58270263671875 and parameters: {'n_estimators': 90, 'learning_rate': 0.31505505681651047, 'subsample': 0.8302486602547442, 'max_depth': 6, 'colsample_bytree': 0.9489181195779588, 'min_child_weight': 3, 'colsample_bynode': 0.6052840027364976, 'colsample_bylevel': 0.9506122183250834, 'reg_alpha': 0.38743273793043653, 'reg_lambda': 0.306408134995029}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  422.58270263671875
starting run
Client-efd1086a-8b15-11ed-8ab7-1a6d039d6526


[32m[I 2023-01-02 21:23:08,975][0m Trial 18 finished with value: 349.599609375 and parameters: {'n_estimators': 108, 'learning_rate': 0.597704898870598, 'subsample': 0.8388534702024085, 'max_depth': 6, 'colsample_bytree': 0.6229406914879257, 'min_child_weight': 3, 'colsample_bynode': 0.6446579120232461, 'colsample_bylevel': 0.9992338146231808, 'reg_alpha': 0.45678719089924275, 'reg_lambda': 0.48669337489744363}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  349.86993408203125
val_scores:  [349.085, 349.6649, 349.738, 349.64014, 349.86993]
final mse:  349.599609375
Starting training run 0
Make dtrain
starting run
Client-f44d83be-8b15-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Make dtest
Training model
Starting training run 0
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  416.4392395019531
val_scores:  [416.48654, 416.38553, 416.43924]
Starting training run 3
Make dtrain
rmse_score:  490.1156005859375
val_scores:  [490.1156]
Starting training run 1
Make dtrain
Make predictions
Score the model
Make dtest
Training model
Make predictions
Score the model
rmse_score:  352.33001708984375
val_scores:  [352.33002]
Starting training run 1
Make dtrain
Make dtest
Training model
rmse_score:  353.2980041503906
val_scores:  [353.298]
Starting training run 1
Make dtrain
Make dtest
Training model
Make dtest


[32m[I 2023-01-02 21:27:52,284][0m Trial 21 finished with value: 416.40386962890625 and parameters: {'n_estimators': 75, 'learning_rate': 0.31868042599446467, 'subsample': 0.804643565876754, 'max_depth': 6, 'colsample_bytree': 0.9774410645294822, 'min_child_weight': 3, 'colsample_bynode': 0.6717755434327176, 'colsample_bylevel': 0.9951341424262625, 'reg_alpha': 0.3765997303439763, 'reg_lambda': 0.28914045625262763}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  416.40386962890625
starting run
Client-9d71b442-8b16-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Starting training run 0
Make dtrain
rmse_score:  490.23150634765625
val_scores:  [490.1156, 492.79193, 490.2315]
Starting training run 3
Make dtrain
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  353.21063232421875
val_scores:  [352.33002, 353.11905, 353.21063]
Starting training run 3
Make dtrain
rmse_score:  353.27337646484375
val_scores:  [353.298, 353.3491, 353.27338]
Starting training run 3
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Make dtest
Training model
Training model
Make predictions
Make predictions
Score the model
Score the model
Make predictions
Score the model
rmse_score:  428.3814697265625
val_scores:  [428.38147]
Starting training run 1
Make dtrain
rmse_score:  492.6895446777344
val_scores:  [490.1156, 492.79193, 490.2315, 492.68954]
Starting training run 4
Make dtrain
Make predictions
rm

[32m[I 2023-01-02 21:32:07,750][0m Trial 22 finished with value: 491.1979064941406 and parameters: {'n_estimators': 114, 'learning_rate': 0.30700983352754896, 'subsample': 0.7820711114939567, 'max_depth': 3, 'colsample_bytree': 0.8968842821148204, 'min_child_weight': 3, 'colsample_bynode': 0.5018712359570427, 'colsample_bylevel': 0.8550690814181371, 'reg_alpha': 0.37919074761262717, 'reg_lambda': 0.5019291671722065}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  491.1979064941406
starting run
Make predictions
Client-35610dd4-8b17-11ed-8ab7-1a6d039d6526
Score the model


In [9]:
# Display the thread-pool futures to check each run's state
# (finished / running / pending, and whether it returned or raised).
futures

[<Future at 0x17fe19fd0 state=finished returned NoneType>,
 <Future at 0x17fe3b640 state=finished returned NoneType>,
 <Future at 0x17fe31310 state=finished returned NoneType>,
 <Future at 0x17fe4e850 state=finished returned NoneType>,
 <Future at 0x17fe6b580 state=finished raised AttributeError>,
 <Future at 0x17fe73580 state=finished returned NoneType>,
 <Future at 0x17fe73670 state=running>,
 <Future at 0x17fe73760 state=running>,
 <Future at 0x17fe73850 state=running>,
 <Future at 0x17fe73940 state=running>,
 <Future at 0x17fe73a30 state=pending>,
 <Future at 0x17fe73b20 state=pending>,
 <Future at 0x17fe73c10 state=pending>,
 <Future at 0x17fe73d00 state=pending>,
 <Future at 0x17fe73df0 state=pending>,
 <Future at 0x17fe73ee0 state=pending>,
 <Future at 0x17fe73fd0 state=pending>,
 <Future at 0x17fe77100 state=pending>,
 <Future at 0x17fe771f0 state=pending>,
 <Future at 0x17fe772e0 state=pending>]

[32m[I 2023-01-02 21:32:18,920][0m Trial 24 finished with value: 353.31903076171875 and parameters: {'n_estimators': 114, 'learning_rate': 0.5971836137652087, 'subsample': 0.7660021686165911, 'max_depth': 3, 'colsample_bytree': 0.8381323163880818, 'min_child_weight': 3, 'colsample_bynode': 0.779075792509333, 'colsample_bylevel': 0.8095377405277713, 'reg_alpha': 0.417914034753841, 'reg_lambda': 0.5105683257390563}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  353.33984375
val_scores:  [353.298, 353.3491, 353.27338, 353.33496, 353.33984]
final mse:  353.31903076171875
starting run
rmse_score:  428.38128662109375
val_scores:  [428.38147, 428.3813]
Starting training run 2
Make dtrain
Client-3c48dbf4-8b17-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Starting training run 0
Make dtrain
rmse_score:  352.180908203125
val_scores:  [352.33002, 353.11905, 353.21063, 352.34763, 352.1809]


[32m[I 2023-01-02 21:32:46,272][0m Trial 23 finished with value: 352.63763427734375 and parameters: {'n_estimators': 114, 'learning_rate': 0.5941571109656975, 'subsample': 0.7938708117120348, 'max_depth': 6, 'colsample_bytree': 0.5822965888810061, 'min_child_weight': 3, 'colsample_bynode': 0.7643325918779996, 'colsample_bylevel': 0.8272833753693533, 'reg_alpha': 0.11793487073956084, 'reg_lambda': 0.5071619039627417}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  352.63763427734375
Starting training run 0
Make dtrain
starting run
Client-4ca087fe-8b17-11ed-8ab7-1a6d039d6526
Make dtest
Training model
Starting training run 0
Make dtrain
Make dtest
Training model
Make dtest
Training model
Make dtest
Training model
Make predictions
Score the model
rmse_score:  428.467041015625
val_scores:  [428.38147, 428.3813, 428.46704]
Starting training run 3
Make dtrain
Make predictions
Score the model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  451.24737548828125
val_scores:  [451.24738]
Starting training run 1
Make dtrain
rmse_score:  359.6910705566406
val_scores:  [359.69107]
Starting training run 1
Make dtrain
rmse_score:  457.3029479980469
val_scores:  [457.30295]
Starting training run 1
Make dtrain
Make predictions
Score the model


2023-01-02 21:36:05,604 - distributed.client - ERROR - 
Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/utils.py", line 742, in wrapper
    return await func(*args, **kwargs)
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/client.py", line 1301, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/client.py", line 1331, in _ensure_connected
    comm = await connect(
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/core.py", line 328, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 466, in wait_for
    await waiter
asynci

starting run
Client-c2c11aca-8b17-11ed-8ab7-1a6d039d6526
starting run
starting run
Client-c3803720-8b17-11ed-8ab7-1a6d039d6526
Client-c39ab35c-8b17-11ed-8ab7-1a6d039d6526
starting run
Client-c42fc44c-8b17-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Make dtest
Training model
Make dtest
Make dtest
Training model
Training model
Make dtest
Training model
Make predictions
Score the model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  458.81512451171875
val_scores:  [458.81512]
Starting training run 1
Make dtrain
rmse_score:  473.9134216308594
val_scores:  [473.91342]
Starting training run 1
Make predictions
Make dtrain
Score the model
rmse_score:  366.13629150390625
val_scores:  [366.1363]
Starting training run 1
Make dtrain
rmse_score:  453.89984130859375
val_scores:  [453.89984]
Starting training run 1
Make dtrain
Make dtest
Training mo

[32m[I 2023-01-02 21:46:46,019][0m Trial 32 finished with value: 366.0806579589844 and parameters: {'n_estimators': 122, 'learning_rate': 0.5356822396500232, 'subsample': 0.6681078426277278, 'max_depth': 5, 'colsample_bytree': 0.5701583796768239, 'min_child_weight': 3, 'colsample_bynode': 0.7909750393356416, 'colsample_bylevel': 0.8921402527291182, 'reg_alpha': 0.13378866535715503, 'reg_lambda': 0.4486876337009982}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  366.1526184082031
val_scores:  [366.1363, 365.8003, 366.18002, 366.13416, 366.15262]
final mse:  366.0806579589844
rmse_score:  473.1295166015625
val_scores:  [458.81512, 458.87335, 473.08737, 458.9679, 473.12952]


[32m[I 2023-01-02 21:46:49,304][0m Trial 31 finished with value: 464.57464599609375 and parameters: {'n_estimators': 121, 'learning_rate': 0.5433495543333521, 'subsample': 0.6759462925016122, 'max_depth': 5, 'colsample_bytree': 0.5351040180449416, 'min_child_weight': 3, 'colsample_bynode': 0.41994876140743476, 'colsample_bylevel': 0.8879423507566231, 'reg_alpha': 0.32375766286780483, 'reg_lambda': 0.4211309325715948}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  464.57464599609375
starting run
Client-40b1d2ca-8b19-11ed-8ab7-1a6d039d6526
starting run
Client-42a4aca6-8b19-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  473.9473571777344
val_scores:  [473.91342, 473.95435, 473.87305, 473.88284, 473.94736]


[32m[I 2023-01-02 21:47:10,484][0m Trial 29 finished with value: 473.9142150878906 and parameters: {'n_estimators': 121, 'learning_rate': 0.5372683321784807, 'subsample': 0.7369039769379033, 'max_depth': 5, 'colsample_bytree': 0.556834906363876, 'min_child_weight': 3, 'colsample_bynode': 0.42152213223043006, 'colsample_bylevel': 0.8851608015932635, 'reg_alpha': 0.1299023975261071, 'reg_lambda': 0.4374655961759971}. Best is trial 19 with value: 349.36517333984375.[0m


final mse:  473.9142150878906
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain


[32m[I 2023-01-02 21:47:15,542][0m Trial 30 finished with value: 465.7566833496094 and parameters: {'n_estimators': 121, 'learning_rate': 0.5390809758948473, 'subsample': 0.6666002818730254, 'max_depth': 5, 'colsample_bytree': 0.5674494706427554, 'min_child_weight': 3, 'colsample_bynode': 0.3895848204648972, 'colsample_bylevel': 0.8989331437581225, 'reg_alpha': 0.32432041972423065, 'reg_lambda': 0.4204487177776992}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  453.8991394042969
val_scores:  [453.89984, 473.64337, 473.66193, 473.67917, 453.89914]
final mse:  465.7566833496094
starting run
Client-50e7a282-8b19-11ed-8ab7-1a6d039d6526
starting run
Client-51d2e0e4-8b19-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Make dtest
Make dtest
Training model
Training model
Make dtest
Make dtest
Training model
Training model
Make predictions
Score the model
Make predictions
Score the model
rmse_score:  568.7406616210938
val_scores:  [568.74066]
Starting training run 1
Make dtrain
rmse_score:  397.9715881347656
val_scores:  [397.9716]
Starting training run 1
Make dtrain
Make predictions
Score the model
Make predictions
Score the model
Make dtest
Training model
rmse_score:  413.42437744140625
val_scores:  [413.42438]
Starting training run 1
Make dtrain
Make dtest
Training model
rmse_score:  550.3267211914062
val_scores:  [550.3267]
Starting training run 1
Make dtrain
Make predictions
Score the mo

[32m[I 2023-01-02 21:55:13,002][0m Trial 34 finished with value: 483.2798767089844 and parameters: {'n_estimators': 107, 'learning_rate': 0.5148531045934989, 'subsample': 0.8803004423921772, 'max_depth': 7, 'colsample_bytree': 0.3300559185309505, 'min_child_weight': 3, 'colsample_bynode': 0.5620403760145586, 'colsample_bylevel': 0.745216916892747, 'reg_alpha': 0.4427244212596995, 'reg_lambda': 0.5657740991134119}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  462.562744140625
val_scores:  [568.74066, 460.07236, 462.46774, 462.55603, 462.56274]
final mse:  483.2798767089844
starting run
Client-6ecb9078-8b1a-11ed-8ab7-1a6d039d6526
Make predictions
Score the model
Make dtest
Training model


[32m[I 2023-01-02 21:55:28,779][0m Trial 33 finished with value: 469.315185546875 and parameters: {'n_estimators': 107, 'learning_rate': 0.5342354659097144, 'subsample': 0.7020110126889543, 'max_depth': 7, 'colsample_bytree': 0.5876861679668803, 'min_child_weight': 3, 'colsample_bynode': 0.41515028187504854, 'colsample_bylevel': 0.7677992300120016, 'reg_alpha': 0.4586841249030368, 'reg_lambda': 0.5501493394412652}. Best is trial 19 with value: 349.36517333984375.[0m


rmse_score:  519.7348022460938
val_scores:  [397.9716, 518.85754, 387.0785, 522.9336, 519.7348]
final mse:  469.315185546875
starting run
Client-77f49f50-8b1a-11ed-8ab7-1a6d039d6526
Starting training run 0
Make dtrain
Starting training run 0
Make dtrain
Make predictions
Score the model


2023-01-02 21:56:07,044 - distributed.client - ERROR - 
Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/utils.py", line 742, in wrapper
    return await func(*args, **kwargs)
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/client.py", line 1301, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/client.py", line 1331, in _ensure_connected
    comm = await connect(
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/core.py", line 291, in connect
    comm = await asyncio.wait_for(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 466, in wait_for
    await waiter
asyncio.exceptions.CancelledError

T

rmse_score:  465.1383361816406
val_scores:  [413.42438, 465.1849, 467.56918, 431.29062, 465.13834]
final mse:  448.521484375


2023-01-02 21:56:15,241 - distributed.client - ERROR - 
ConnectionRefusedError: [Errno 61] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/core.py", line 291, in connect
    comm = await asyncio.wait_for(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 479, in wait_for
    return fut.result()
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/tcp.py", line 511, in connect
    convert_stream_closed_error(self, e)
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/tcp.py", line 142, in convert_stream_closed_error
    raise CommClosedError(f"in {obj}: {exc.__class__.__name__}: {exc}") from exc
distributed.comm.core.CommClose

Collecting git+https://github.com/optuna/optuna.git@e8a010bb58aea943866e5f7addf0de953228de99
  Cloning https://github.com/optuna/optuna.git (to revision e8a010bb58aea943866e5f7addf0de953228de99) to /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-0voz_fes


  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-0voz_fes
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=1eab71bbde30ab05de02d1ce6737c42cd8810509a2d2dca06dac4f48d7e62c45
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-yllfnhbg/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna


In [10]:
# Hyper-parameter values of the best trial recorded on the study so far.
study.best_params

{'n_estimators': 111,
 'learning_rate': 0.5973213766032188,
 'subsample': 0.7937042358727737,
 'max_depth': 6,
 'colsample_bytree': 0.6256120671859474,
 'min_child_weight': 3,
 'colsample_bynode': 0.6719821020977526,
 'colsample_bylevel': 0.9848329120293066,
 'reg_alpha': 0.49601346192828766,
 'reg_lambda': 0.4768295671695967}

In [11]:
# Objective value of the best trial (the "final mse" value printed by the runs).
study.best_value

349.36517333984375

In [12]:
# Full record of the best trial: number, state, timestamps, params and
# the search distributions they were sampled from.
study.best_trial

FrozenTrial(number=19, state=TrialState.COMPLETE, values=[349.36517333984375], datetime_start=datetime.datetime(2023, 1, 2, 21, 12, 24, 891411), datetime_complete=datetime.datetime(2023, 1, 2, 21, 22, 45, 496896), params={'n_estimators': 111, 'learning_rate': 0.5973213766032188, 'subsample': 0.7937042358727737, 'max_depth': 6, 'colsample_bytree': 0.6256120671859474, 'min_child_weight': 3, 'colsample_bynode': 0.6719821020977526, 'colsample_bylevel': 0.9848329120293066, 'reg_alpha': 0.49601346192828766, 'reg_lambda': 0.4768295671695967}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=125, log=False, low=75, step=1), 'learning_rate': FloatDistribution(high=0.6, log=False, low=0.1, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=9, log=False, low=3, step=1), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'min_child_weight': Int

In [13]:
# Number of trials recorded on the study so far.
# NOTE(review): presumably counts trials in every state, not only completed ones — confirm.
len(study.trials)

42

In [14]:
import joblib

In [15]:
# Persist the study object to disk so it can be reloaded later with joblib.load.
# joblib files are pickle-based: only load them from a trusted source.
# NOTE(review): assumes the "data/" directory already exists — confirm.
joblib.dump(study, "data/second_study.pkl")

['data/second_study.pkl']

Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/tcp.py", line 498, in connect
    stream = await self.client.connect(
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/greghayes/Documents/projects/dask-xgboost-nyctaxi/.venv/lib/python3.9/site-packages/distributed/comm/core.py", line 291, in connect
   

In [None]:
# Block until the first submitted run has finished and fetch its return value;
# if that run raised, .result() re-raises the exception here.
f = futures[0].result()

In [None]:
# Tear down the cluster once the experiment is over.
# NOTE(review): `cluster` is presumably the Coiled cluster created earlier in the notebook — confirm.
cluster.shutdown()