# Running hyperparameter optimization on Chemprop model using RayTune

## Import packages

In [1]:
from pathlib import Path

import pandas as pd
from lightning import pytorch as pl
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler

from chemprop import data, featurizers, models, nn

  from .autonotebook import tqdm as notebook_tqdm
2024-10-22 09:03:28,414	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-10-22 09:03:28,801	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-10-22 09:03:29,333	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
chemprop_dir = Path.cwd().parent
input_path = chemprop_dir / "tests" / "data" / "regression" / "mol" / "mol.csv" # path to your data .csv file
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'smiles' # name of the column containing SMILES strings
target_columns = ['lipo'] # list of names of the columns containing targets

hpopt_save_dir = Path.cwd() / "hpopt" # directory to save hyperopt results
hpopt_save_dir.mkdir(exist_ok=True)

## Load data

In [3]:
df_input = pd.read_csv(input_path)
df_input

Unnamed: 0,smiles,lipo
0,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,3.54
1,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,-1.18
2,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,3.69
3,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,3.37
4,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,3.10
...,...,...
95,CC(C)N(CCCNC(=O)Nc1ccc(cc1)C(C)(C)C)C[C@H]2O[C...,2.20
96,CCN(CC)CCCCNc1ncc2CN(C(=O)N(Cc3cccc(NC(=O)C=C)...,2.04
97,CCSc1c(Cc2ccccc2C(F)(F)F)sc3N(CC(C)C)C(=O)N(C)...,4.49
98,COc1ccc(Cc2c(N)n[nH]c2N)cc1,0.20


In [4]:
smis = df_input.loc[:, smiles_column].values
ys = df_input.loc[:, target_columns].values

## Make data points, splits, and datasets

In [5]:
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]

In [6]:
mols = [d.mol for d in all_data]  # RDkit Mol objects are use for structure based splits
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)

In [7]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

train_dset = data.MoleculeDataset(train_data[0], featurizer)
scaler = train_dset.normalize_targets()

val_dset = data.MoleculeDataset(val_data[0], featurizer)
val_dset.normalize_targets(scaler)

test_dset = data.MoleculeDataset(test_data[0], featurizer)


# Define helper function to train the model

In [8]:
def train_model(config, train_dset, val_dset, num_workers, scaler):

    # config is a dictionary containing hyperparameters used for the trial
    depth = int(config["depth"])
    ffn_hidden_dim = int(config["ffn_hidden_dim"])
    ffn_num_layers = int(config["ffn_num_layers"])
    message_hidden_dim = int(config["message_hidden_dim"])

    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    mp = nn.BondMessagePassing(d_h=message_hidden_dim, depth=depth)
    agg = nn.MeanAggregation()
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(output_transform=output_transform, input_dim=message_hidden_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers)
    batch_norm = True
    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
    model = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=20, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)


## Define parameter search space

In [9]:
search_space = {
    "depth": tune.qrandint(lower=2, upper=6, q=1),
    "ffn_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
    "ffn_num_layers": tune.qrandint(lower=1, upper=3, q=1),
    "message_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
}

In [10]:
ray.init()

scheduler = FIFOScheduler()

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    use_gpu=False, # change to True if you want to use GPU
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=hpopt_save_dir / "ray_results", # directory to save the results
)

ray_trainer = TorchTrainer(
    lambda config: train_model(
        config, train_dset, val_dset, num_workers, scaler
    ),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=1, # number of random evaluations before tree parzen estimators
    random_state_seed=42,
)

# OptunaSearch is another search algorithm that can be used
# search_alg = OptunaSearch() 

tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    num_samples=2, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id), # shorten filepaths
    
)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()


0,1
Current time:,2024-10-22 09:05:01
Running for:,00:01:23.70
Memory:,10.9/15.3 GiB

Trial name,status,loc,train_loop_config/de pth,train_loop_config/ff n_hidden_dim,train_loop_config/ff n_num_layers,train_loop_config/me ssage_hidden_dim,iter,total time (s),train_loss,train_loss_step,val/rmse
TorchTrainer_f1a6e41a,TERMINATED,172.31.231.162:24873,2,2000,2,500,20,49.8815,0.0990423,0.168217,0.861368
TorchTrainer_d775c15d,TERMINATED,172.31.231.162:24953,2,2200,2,400,20,56.6533,0.069695,0.119898,0.90258


[36m(RayTrainWorker pid=24952)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=24873)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=24873)[0m - (ip=172.31.231.162, pid=24952) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=24952)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=24952)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=24952)[0m HPU available: False, using: 0 HPUs


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             


[36m(RayTrainWorker pid=24952)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=24952)[0m /home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=24952)[0m /home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
[36m(RayTrainWorker pid=24952)[0m 
[36m(RayTrainWorker pid=24952)[0m   | Name            | Type               | Params | Mode 
[36m(RayTrainWorker pid=24952)[0m ------------------------------

Epoch 0:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s, v_num=0, train_loss_step=0.987]
Epoch 0: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s, v_num=0, train_loss_step=1.040]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=24952)[0m 
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.60it/s][A
Epoch 0: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s, v_num=0, train_loss_step=1.040, val_loss=0.848]


[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000000)


Epoch 0: 100%|██████████| 2/2 [00:01<00:00,  1.26it/s, v_num=0, train_loss_step=1.040, val_loss=0.848, train_loss_epoch=0.997]
Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=1.040, val_loss=0.848, train_loss_epoch=0.997]        
Epoch 1:  50%|█████     | 1/2 [00:00<00:00,  2.22it/s, v_num=0, train_loss_step=0.984, val_loss=0.848, train_loss_epoch=0.997]
Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  3.32it/s, v_num=0, train_loss_step=0.406, val_loss=0.848, train_loss_epoch=0.997]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.53it/s][A
Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  2.97it/s, v_num=0, train_loss_step=0.406, val_loss=0.904, train_loss_epoch=0.997]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000001)


Epoch 1: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s, v_num=0, train_loss_step=0.406, val_loss=0.904, train_loss_epoch=0.869]
Epoch 2:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.406, val_loss=0.904, train_loss_epoch=0.869]        
Epoch 2:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s, v_num=0, train_loss_step=1.190, val_loss=0.904, train_loss_epoch=0.869]
Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s, v_num=0, train_loss_step=1.290, val_loss=0.904, train_loss_epoch=0.869]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=24952)[0m 
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.01it/s][A
Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.66it/s, v_num=0, train_loss_step=1.290, val_loss=0.842, train_loss_epoch=0.869]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000002)


Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s, v_num=0, train_loss_step=1.290, val_loss=0.842, train_loss_epoch=1.210]
Epoch 3:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=1.290, val_loss=0.842, train_loss_epoch=1.210]        
Epoch 3:  50%|█████     | 1/2 [00:00<00:00,  1.80it/s, v_num=0, train_loss_step=0.890, val_loss=0.842, train_loss_epoch=1.210]


[36m(TorchTrainer pid=24953)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=24953)[0m - (ip=172.31.231.162, pid=25062) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=25062)[0m Setting up process group for: env:// [rank=0, world_size=1]


Epoch 3: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s, v_num=0, train_loss_step=0.749, val_loss=0.842, train_loss_epoch=1.210]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=24952)[0m 
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 10.81it/s][A
Epoch 3: 100%|██████████| 2/2 [00:00<00:00,  2.15it/s, v_num=0, train_loss_step=0.749, val_loss=0.912, train_loss_epoch=1.210]


[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000003)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 3: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s, v_num=0, train_loss_step=0.749, val_loss=0.912, train_loss_epoch=0.861]
Epoch 4:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.749, val_loss=0.912, train_loss_epoch=0.861]        
Epoch 4:  50%|█████     | 1/2 [00:00<00:00,  1.41it/s, v_num=0, train_loss_step=0.845, val_loss=0.912, train_loss_epoch=0.861]
Epoch 4: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s, v_num=0, train_loss_step=0.578, val_loss=0.912, train_loss_epoch=0.861]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=24952)[0m 
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 10.38it/s][A
Epoch 4: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s, v_num=0, train_loss_step=0.578, val_loss=0.912, train_loss_epoch=0.861]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000004)


Epoch 4: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s, v_num=0, train_loss_step=0.578, val_loss=0.912, train_loss_epoch=0.792]
Epoch 5:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.578, val_loss=0.912, train_loss_epoch=0.792]        
Epoch 5:  50%|█████     | 1/2 [00:00<00:00,  1.60it/s, v_num=0, train_loss_step=0.584, val_loss=0.912, train_loss_epoch=0.792]
Epoch 5: 100%|██████████| 2/2 [00:00<00:00,  2.58it/s, v_num=0, train_loss_step=0.751, val_loss=0.912, train_loss_epoch=0.792]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=24952)[0m 
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 12.17it/s][A
Epoch 5: 100%|██████████| 2/2 [00:00<00:00,  2.26it/s, v_num=0, train_loss_step=0.751, val_loss=0.887, train_loss_epoch=0.792]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000005)


Epoch 5: 100%|██████████| 2/2 [00:01<00:00,  1.59it/s, v_num=0, train_loss_step=0.751, val_loss=0.887, train_loss_epoch=0.618]
Epoch 6:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.751, val_loss=0.887, train_loss_epoch=0.618]        


[36m(RayTrainWorker pid=25062)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=25062)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=25062)[0m HPU available: False, using: 0 HPUs


Epoch 6:  50%|█████     | 1/2 [00:00<00:00,  1.64it/s, v_num=0, train_loss_step=0.421, val_loss=0.887, train_loss_epoch=0.618]
Epoch 6: 100%|██████████| 2/2 [00:00<00:00,  2.56it/s, v_num=0, train_loss_step=0.569, val_loss=0.887, train_loss_epoch=0.618]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 13.13it/s][A
Epoch 6: 100%|██████████| 2/2 [00:00<00:00,  2.28it/s, v_num=0, train_loss_step=0.569, val_loss=0.876, train_loss_epoch=0.618]
Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 12.06it/s]


[36m(RayTrainWorker pid=25062)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=25062)[0m /home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=25062)[0m /home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
[36m(RayTrainWorker pid=25062)[0m 
[36m(RayTrainWorker pid=25062)[0m   | Name            | Type               | Params | Mode 
[36m(RayTrainWorker pid=25062)[0m ------------------------------

Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             


[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000006)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 6: 100%|██████████| 2/2 [00:01<00:00,  1.53it/s, v_num=0, train_loss_step=0.569, val_loss=0.876, train_loss_epoch=0.450]
Epoch 7:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.569, val_loss=0.876, train_loss_epoch=0.450]        


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 7:  50%|█████     | 1/2 [00:00<00:00,  2.28it/s, v_num=0, train_loss_step=0.339, val_loss=0.876, train_loss_epoch=0.450][32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  3.75it/s, v_num=0, train_loss_step=0.335, val_loss=0.854, train_loss_epoch=1.010][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 3x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 16.17it/s][A[32m [repeated 3x across cluster][0m
Epoch 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s, v_num=0, train_loss_step=0.335, val_loss=0.893, train_loss_epoch=0.703][32m [repeated 3x across cluster][0m
Epoch 2:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.335, val_loss=0.893, train_loss_epoch=0.703][32m [repeated 3x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000009)[32m [repeated 6x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this er

Epoch 11:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s, v_num=0, train_loss_step=0.175, val_loss=0.897, train_loss_epoch=0.258][32m [repeated 8x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [00:01<00:00,  1.79it/s, v_num=0, train_loss_step=0.312, val_loss=0.897, train_loss_epoch=0.258][32m [repeated 7x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


[36m(RayTrainWorker pid=24952)[0m [32m [repeated 11x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  7.84it/s][A[32m [repeated 7x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s, v_num=0, train_loss_step=0.312, val_loss=0.869, train_loss_epoch=0.258][32m [repeated 7x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s, v_num=0, train_loss_step=0.312, val_loss=0.869, train_loss_epoch=0.203][32m [repeated 7x across cluster][0m
Epoch 12:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.312, val_loss=0.869, train_loss_epoch=0.203][32m [repeated 7x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, pa

Epoch 14:  50%|█████     | 1/2 [00:01<00:01,  0.88it/s, v_num=0, train_loss_step=0.131, val_loss=0.841, train_loss_epoch=0.141] [32m [repeated 6x across cluster][0m
Epoch 7: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s, v_num=0, train_loss_step=0.368, val_loss=0.836, train_loss_epoch=0.399][32m [repeated 5x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 5x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 5x across cluster][0m
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 4x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  7.76it/s][A[32m [repeated 5x across cluster][0m
Epoch 7: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s, v_num=0, train_loss_step=0.368, val_loss=0.843, train_loss_epoch=0.399][32m [repeated 5x across cluster][0m
Epoch 7: 100%|██████████| 2/2 [00:

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000015)[32m [repeated 4x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this er

Epoch 9:  50%|█████     | 1/2 [00:01<00:01,  0.72it/s, v_num=0, train_loss_step=0.216, val_loss=0.889, train_loss_epoch=0.254][32m [repeated 3x across cluster][0m
Epoch 9: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s, v_num=0, train_loss_step=0.322, val_loss=0.889, train_loss_epoch=0.254][32m [repeated 4x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


[36m(RayTrainWorker pid=25062)[0m [32m [repeated 9x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  4.73it/s][A[32m [repeated 4x across cluster][0m
Epoch 9: 100%|██████████| 2/2 [00:02<00:00,  0.90it/s, v_num=0, train_loss_step=0.322, val_loss=0.910, train_loss_epoch=0.254][32m [repeated 4x across cluster][0m
Epoch 9: 100%|██████████| 2/2 [00:02<00:00,  0.70it/s, v_num=0, train_loss_step=0.322, val_loss=0.910, train_loss_epoch=0.237][32m [repeated 4x across cluster][0m
Epoch 16:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.105, val_loss=0.809, train_loss_epoch=0.128][32m [repeated 3x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=25062)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, pa

Epoch 18:  50%|█████     | 1/2 [00:01<00:01,  0.98it/s, v_num=0, train_loss_step=0.0962, val_loss=0.781, train_loss_epoch=0.116][32m [repeated 5x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s, v_num=0, train_loss_step=0.263, val_loss=0.889, train_loss_epoch=0.219][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  9.49it/s][A[32m [repeated 4x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s, v_num=0, train_loss_step=0.263, val_loss=0.861, train_loss_epoch=0.219][32m [repeated 4x across cluster][0m
Epoch 11: 100%|██████████| 2/2 [

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=24952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000019)[32m [repeated 3x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this er

Epoch 13:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s, v_num=0, train_loss_step=0.118, val_loss=0.849, train_loss_epoch=0.122][32m [repeated 3x across cluster][0m
Epoch 13: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s, v_num=0, train_loss_step=0.0846, val_loss=0.849, train_loss_epoch=0.122][32m [repeated 4x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  7.32it/s][A[32m [repeated 4x across cluster][0m
Epoch 13: 100%|██████████| 2/2 [00:01<00:00,  1.42it/s, v_num=0, train_loss_step=0.0846, val_loss=0.842, train_loss_epoch=0.122][32m [repeated 4x across cluster][0m
Epoch 19: 100%|██████████| 2/2 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 15:  50%|█████     | 1/2 [00:01<00:01,  0.64it/s, v_num=0, train_loss_step=0.0923, val_loss=0.839, train_loss_epoch=0.0974][32m [repeated 2x across cluster][0m
Epoch 15: 100%|██████████| 2/2 [00:02<00:00,  0.94it/s, v_num=0, train_loss_step=0.0867, val_loss=0.839, train_loss_epoch=0.0974][32m [repeated 2x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 2x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 3x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 10.51it/s][A
Epoch 14: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s, v_num=0, train_loss_step=0.126, val_loss=0.839, train_loss_epoch=0.112]
Epoch 14: 100%|██████████| 2/2 [00:02<00:00,  0.87it/s, v_num=

[36m(RayTrainWorker pid=25062)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/d775c15d/checkpoint_000015)[32m [repeated 3x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 15: 100%|██████████| 2/2 [00:03<00:00,  0.54it/s, v_num=0, train_loss_step=0.0867, val_loss=0.837, train_loss_epoch=0.0912]
Epoch 16:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.0867, val_loss=0.837, train_loss_epoch=0.0912]        
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 16:  50%|█████     | 1/2 [00:02<00:02,  0.35it/s, v_num=0, train_loss_step=0.0792, val_loss=0.837, train_loss_epoch=0.0912]
Epoch 16: 100%|██████████| 2/2 [00:03<00:00,  0.61it/s, v_num=0, train_loss_step=0.0703, val_loss=0.837, train_loss_epoch=0.0912]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  4.23it/s][A
Epoch 16: 100%|██████████| 2/2 [00:03<00:00,  0.56it/s, v_num=0, train_loss_step=0.0703, val_loss=0.837, train_loss_epoch=0.0912]
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 2x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 16: 100%|██████████| 2/2 [00:04<00:00,  0.41it/s, v_num=0, train_loss_step=0.0703, val_loss=0.837, train_loss_epoch=0.0774]
Epoch 17:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.0703, val_loss=0.837, train_loss_epoch=0.0774]        
Epoch 17:  50%|█████     | 1/2 [00:01<00:01,  0.90it/s, v_num=0, train_loss_step=0.0711, val_loss=0.837, train_loss_epoch=0.0774]
Epoch 17: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s, v_num=0, train_loss_step=0.156, val_loss=0.837, train_loss_epoch=0.0774] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 11.47it/s][A
Epoch 17: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s, v_num=0, train_loss_step=0.156, val_loss=0.836, train_loss_epoch=0.0774]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 17: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s, v_num=0, train_loss_step=0.156, val_loss=0.836, train_loss_epoch=0.0882]





Epoch 18:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.156, val_loss=0.836, train_loss_epoch=0.0882]        
Epoch 18:  50%|█████     | 1/2 [00:00<00:00,  1.43it/s, v_num=0, train_loss_step=0.0684, val_loss=0.836, train_loss_epoch=0.0882]
[36m(RayTrainWorker pid=25062)[0m [32m [repeated 3x across cluster][0m
Epoch 18: 100%|██████████| 2/2 [00:00<00:00,  2.20it/s, v_num=0, train_loss_step=0.064, val_loss=0.836, train_loss_epoch=0.0882] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 12.20it/s][A
Epoch 18: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s, v_num=0, train_loss_step=0.064, val_loss=0.830, train_loss_epoch=0.0882]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 18: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s, v_num=0, train_loss_step=0.064, val_loss=0.830, train_loss_epoch=0.0675]
Epoch 19:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss_step=0.064, val_loss=0.830, train_loss_epoch=0.0675]        
Epoch 19:  50%|█████     | 1/2 [00:00<00:00,  1.64it/s, v_num=0, train_loss_step=0.0571, val_loss=0.830, train_loss_epoch=0.0675]
Epoch 19: 100%|██████████| 2/2 [00:00<00:00,  2.53it/s, v_num=0, train_loss_step=0.120, val_loss=0.830, train_loss_epoch=0.0675] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s][A
Epoch 19: 100%|██████████| 2/2 [00:00<00:00,  2.23it/s, v_num=0, train_loss_step=0.120, val_loss=0.815, train_loss_epoch=0.0675]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 19: 100%|██████████| 2/2 [00:01<00:00,  1.55it/s, v_num=0, train_loss_step=0.120, val_loss=0.815, train_loss_epoch=0.0697]
Epoch 19: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s, v_num=0, train_loss_step=0.120, val_loss=0.815, train_loss_epoch=0.0697]


[36m(RayTrainWorker pid=25062)[0m `Trainer.fit` stopped: `max_epochs=20` reached.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-10-22 09:05:01,823	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37' in 0.0305s.
2024-10-22 09:05:01,873	INFO tune.py:1048 -- Total run time: 83.87 seconds (83.66 seconds for the tuning loop).


## Hyperparameter optimization results

In [11]:
results

ResultGrid<[
  Result(
    metrics={'train_loss': 0.09904231131076813, 'train_loss_step': 0.16821686923503876, 'val/rmse': 0.8613682389259338, 'val/mae': 0.7006751298904419, 'val_loss': 0.7419552206993103, 'train_loss_epoch': 0.09904231131076813, 'epoch': 19, 'step': 40},
    path='/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000019)
  ),
  Result(
    metrics={'train_loss': 0.06969495117664337, 'train_loss_step': 0.11989812552928925, 'val/rmse': 0.902579665184021, 'val/mae': 0.7176367044448853, 'val_loss': 0.8146500587463379, 'train_loss_epoch': 0.06969495117664337, 'epoch': 19, 'step': 40},
    path='/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/d775c15d',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, pat

In [12]:
# results of all trials
result_df = results.get_dataframe()
result_df

Unnamed: 0,train_loss,train_loss_step,val/rmse,val/mae,val_loss,train_loss_epoch,epoch,step,timestamp,checkpoint_dir_name,...,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/depth,config/train_loop_config/ffn_hidden_dim,config/train_loop_config/ffn_num_layers,config/train_loop_config/message_hidden_dim,logdir
0,0.099042,0.168217,0.861368,0.700675,0.741955,0.099042,19,40,1729602279,checkpoint_000019,...,24873,Knathan-Laptop,172.31.231.162,49.881516,20,2,2000,2,500,f1a6e41a
1,0.069695,0.119898,0.90258,0.717637,0.81465,0.069695,19,40,1729602299,checkpoint_000019,...,24953,Knathan-Laptop,172.31.231.162,56.653336,20,2,2200,2,400,d775c15d


In [13]:
# best configuration
best_result = results.get_best_result()
best_config = best_result.config
best_config['train_loop_config']

{'depth': 2,
 'ffn_hidden_dim': 2000,
 'ffn_num_layers': 2,
 'message_hidden_dim': 500}

In [None]:
# best model checkpoint path
best_result = results.get_best_result()
best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
print(f"Best model checkpoint path: {best_checkpoint_path}")

Best model checkpoint path: /home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000019/checkpoint.ckpt


In [15]:
ray.shutdown()