# Running hyperparameter optimization on a Chemprop model using Ray Tune and Hyperopt

## Import packages

In [1]:
from pathlib import Path

from lightning import pytorch as pl
import pandas as pd
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
import torch

from chemprop import data, featurizers, models, nn

In [2]:
# Parent of the current working directory, used as the root for locating the example data
chemprop_dir = Path.cwd().parent

# Input data: path to your data .csv file and the columns to read from it
input_path = chemprop_dir.joinpath("tests", "data", "regression", "mol", "mol.csv")
smiles_column = "smiles"  # name of the column containing SMILES strings
target_columns = ["lipo"]  # list of names of the columns containing targets

# Number of workers for the dataloader; 0 means the main process does the data loading
num_workers = 0

# Directory to save hyperopt results (created when it does not exist yet)
hpopt_save_dir = Path.cwd().joinpath("hpopt")
hpopt_save_dir.mkdir(exist_ok=True)

## Load data

In [3]:
# Read the input table; the bare name on the last line renders the frame below the cell
df_input = pd.read_csv(filepath_or_buffer=input_path)
df_input

Unnamed: 0,smiles,lipo
0,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,3.54
1,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,-1.18
2,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,3.69
3,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,3.37
4,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,3.10
...,...,...
95,CC(C)N(CCCNC(=O)Nc1ccc(cc1)C(C)(C)C)C[C@H]2O[C...,2.20
96,CCN(CC)CCCCNc1ncc2CN(C(=O)N(Cc3cccc(NC(=O)C=C)...,2.04
97,CCSc1c(Cc2ccccc2C(F)(F)F)sc3N(CC(C)C)C(=O)N(C)...,4.49
98,COc1ccc(Cc2c(N)n[nH]c2N)cc1,0.20


In [4]:
# A single column label selects a Series (1-D values); a list of labels selects a
# DataFrame (2-D values, one column per target)
smis = df_input[smiles_column].values
ys = df_input[target_columns].values

## Make data points, splits, and datasets

In [5]:
# One MoleculeDatapoint per (SMILES, targets) pair
all_data = [
    data.MoleculeDatapoint.from_smi(smiles, targets)
    for smiles, targets in zip(smis, ys)
]

In [6]:
# RDKit Mol objects are used for structure-based splits
mols = [datapoint.mol for datapoint in all_data]

# "random" split with (train, val, test) fractions of (0.8, 0.1, 0.1)
train_indices, val_indices, test_indices = data.make_split_indices(
    mols, "random", (0.8, 0.1, 0.1)
)
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)

In [7]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Training and validation datasets share the featurizer and the target scaler:
# the scaler returned for the training set is reused for the validation set
train_dset, val_dset = (
    data.MoleculeDataset(split, featurizer) for split in (train_data, val_data)
)
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)

# The test targets are left unnormalized
test_dset = data.MoleculeDataset(test_data, featurizer)


## Define helper function to train the model

In [8]:
def train_model(config, train_dset, val_dset, num_workers, scaler):
    """Build and fit one Chemprop MPNN using the hyperparameters of a single trial.

    Parameters
    ----------
    config : dict
        Hyperparameters used for the trial. The keys ``depth``, ``ffn_hidden_dim``,
        ``ffn_num_layers`` and ``message_hidden_dim`` are read and cast to ``int``.
    train_dset, val_dset
        Datasets passed to ``data.build_dataloader`` for training and validation.
    num_workers : int
        Number of workers used by both dataloaders.
    scaler
        Target scaler from which the ``UnscaleTransform`` of the output layer is built.
    """
    # The search space hands the values over as floats (e.g. 2.0), so cast them once here
    hparams = {
        name: int(config[name])
        for name in ("depth", "ffn_hidden_dim", "ffn_num_layers", "message_hidden_dim")
    }

    train_batches = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_batches = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    # Message passing -> aggregation -> feed-forward predictor
    message_passing = nn.BondMessagePassing(
        d_h=hparams["message_hidden_dim"], depth=hparams["depth"]
    )
    aggregation = nn.MeanAggregation()
    predictor = nn.RegressionFFN(
        output_transform=nn.UnscaleTransform.from_standard_scaler(scaler),
        input_dim=hparams["message_hidden_dim"],
        hidden_dim=hparams["ffn_hidden_dim"],
        n_layers=hparams["ffn_num_layers"],
    )
    metrics = [nn.metrics.RMSEMetric(), nn.metrics.MAEMetric()]
    # The fourth positional argument (True) is the batch-norm switch
    model = models.MPNN(message_passing, aggregation, predictor, True, metrics)

    # These three arguments are needed for the Ray and Lightning integration
    ray_integration = dict(
        strategy=RayDDPStrategy(find_unused_parameters=True),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
    )
    trainer = prepare_trainer(
        pl.Trainer(
            accelerator="auto",
            devices=1,
            max_epochs=20,  # number of epochs to train for
            **ray_integration,
        )
    )
    trainer.fit(model, train_batches, val_batches)


## Define parameter search space

In [9]:
# Quantized uniform distributions: values between `lower` and `upper` in increments of `q`
search_space = dict(
    depth=tune.quniform(lower=2, upper=6, q=1),
    ffn_hidden_dim=tune.quniform(lower=300, upper=2400, q=100),
    ffn_num_layers=tune.quniform(lower=1, upper=3, q=1),
    message_hidden_dim=tune.quniform(lower=300, upper=2400, q=100),
)

In [10]:
# ASHA stops unpromising trials early. `max_t` is the same number (20) as `max_epochs`
# in `train_model`; no trial is stopped before `grace_period` iterations.
scheduler = ASHAScheduler(
    max_t=20,
    grace_period=10,
    reduction_factor=2,
)

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    # Request a GPU only when CUDA is usable on this machine. A hard-coded
    # `use_gpu=True` leaves the trials waiting for a GPU that never becomes
    # available on a CPU-only machine.
    use_gpu=torch.cuda.is_available(),
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=hpopt_save_dir / "ray_results", # directory to save the results
)

# The trainer calls the training function with the trial's `train_loop_config` only,
# so the datasets, worker count and scaler are bound through the lambda
ray_trainer = TorchTrainer(
    lambda config: train_model(
        config, train_dset, val_dset, num_workers, scaler
    ),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=1, # number of random evaluations before tree parzen estimators
    random_state_seed=42,
)

tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    num_samples=2, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()


0,1
Current time:,2024-05-23 13:00:44
Running for:,00:00:14.34
Memory:,56.1/503.5 GiB

Trial name,status,loc,train_loop_config/de pth,train_loop_config/ff n_hidden_dim,train_loop_config/ff n_num_layers,train_loop_config/me ssage_hidden_dim,iter,total time (s),train_loss,val/rmse,val/mae
TorchTrainer_61728432,TERMINATED,10.114.0.73:4139123,2,2000,3,500,20,11.314,0.114935,0.869372,0.686786
TorchTrainer_f7121652,TERMINATED,10.114.0.73:4139221,2,2200,3,400,10,8.54016,0.438999,0.946479,0.765502


[36m(RayTrainWorker pid=4139220)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=4139123)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=4139123)[0m - (ip=10.114.0.73, pid=4139220) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=4139220)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=4139220)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=4139220)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=4139220)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=4139220)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/pyt ...
[36m(RayTrainW

Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=4139220)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val/rmse', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=4139220)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val/mae', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=4139220)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate th

Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 40.34it/s, v_num=0, train_loss=1.200]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 534.44it/s][A
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 34.37it/s, v_num=0, train_loss=1.200, val_loss=0.941]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000000)


Epoch 0: 100%|██████████| 2/2 [00:00<00:00,  8.94it/s, v_num=0, train_loss=1.200, val_loss=0.941]
Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss=1.200, val_loss=0.941]        


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 43.49it/s, v_num=0, train_loss=0.718, val_loss=0.941]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 517.75it/s][A
Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 36.46it/s, v_num=0, train_loss=0.718, val_loss=0.938]
Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  9.33it/s, v_num=0, train_loss=0.718, val_loss=0.938]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 2: 100%|██████████| 2/2 [00:00<00:00, 43.88it/s, v_num=0, train_loss=1.080, val_loss=0.938]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 549.06it/s][A
Epoch 2: 100%|██████████| 2/2 [00:00<00:00, 37.01it/s, v_num=0, train_loss=1.080, val_loss=0.933]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000002)


Epoch 2: 100%|██████████| 2/2 [00:00<00:00,  9.08it/s, v_num=0, train_loss=1.080, val_loss=0.933]
Epoch 3: 100%|██████████| 2/2 [00:00<00:00, 44.83it/s, v_num=0, train_loss=0.633, val_loss=0.933]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 577.73it/s][A
Epoch 3: 100%|██████████| 2/2 [00:00<00:00, 37.93it/s, v_num=0, train_loss=0.633, val_loss=0.935]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000003)


Epoch 3: 100%|██████████| 2/2 [00:00<00:00,  9.43it/s, v_num=0, train_loss=0.633, val_loss=0.935]
Epoch 4: 100%|██████████| 2/2 [00:00<00:00, 42.85it/s, v_num=0, train_loss=0.284, val_loss=0.935]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 570.73it/s][A
Epoch 4: 100%|██████████| 2/2 [00:00<00:00, 36.41it/s, v_num=0, train_loss=0.284, val_loss=0.956]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000004)


Epoch 4: 100%|██████████| 2/2 [00:00<00:00,  9.20it/s, v_num=0, train_loss=0.284, val_loss=0.956]
Epoch 5:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss=0.284, val_loss=0.956]        
Epoch 5:  50%|█████     | 1/2 [00:00<00:00, 28.57it/s, v_num=0, train_loss=0.381, val_loss=0.956]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 5: 100%|██████████| 2/2 [00:00<00:00, 41.61it/s, v_num=0, train_loss=0.462, val_loss=0.956]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 432.89it/s][A
Epoch 5: 100%|██████████| 2/2 [00:00<00:00, 34.69it/s, v_num=0, train_loss=0.462, val_loss=0.953]
Epoch 5: 100%|██████████| 2/2 [00:00<00:00,  9.03it/s, v_num=0, train_loss=0.462, val_loss=0.953]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000005)


Epoch 6:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss=0.462, val_loss=0.953]        
Epoch 6: 100%|██████████| 2/2 [00:00<00:00, 44.26it/s, v_num=0, train_loss=0.554, val_loss=0.953]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 456.20it/s][A
Epoch 6: 100%|██████████| 2/2 [00:00<00:00, 36.99it/s, v_num=0, train_loss=0.554, val_loss=0.951]


[33m(raylet)[0m [2024-05-23 13:00:39,839 E 4135615 4135644] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-05-23_13-00-28_275966_4135308 is over 95% full, available space: 95394922496; capacity: 2002947665920. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=4139457)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000006)
[36m(RayTrainWorker pid=4139457)[0m 2 | bn              | BatchNorm1d        | 800   
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environ

Epoch 6: 100%|██████████| 2/2 [00:00<00:00,  9.33it/s, v_num=0, train_loss=0.554, val_loss=0.951]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 7: 100%|██████████| 2/2 [00:00<00:00, 43.07it/s, v_num=0, train_loss=0.175, val_loss=0.951]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 467.02it/s][A
Epoch 7: 100%|██████████| 2/2 [00:00<00:00, 36.06it/s, v_num=0, train_loss=0.175, val_loss=0.935]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000007)
[36m(TorchTrainer pid=4139221)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=4139221)[0m - (ip=10.114.0.73, pid=4139457) world_rank=0, local_rank=0, node_rank=0


Epoch 7: 100%|██████████| 2/2 [00:00<00:00,  8.82it/s, v_num=0, train_loss=0.175, val_loss=0.935]
Epoch 8: 100%|██████████| 2/2 [00:00<00:00, 44.96it/s, v_num=0, train_loss=0.278, val_loss=0.935]
Validation: |          | 0/? [00:00<?, ?it/s][A


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


[36m(RayTrainWorker pid=4139220)[0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 587.60it/s][A
Epoch 8: 100%|██████████| 2/2 [00:00<00:00, 38.00it/s, v_num=0, train_loss=0.278, val_loss=0.927]
Epoch 8: 100%|██████████| 2/2 [00:00<00:00,  9.39it/s, v_num=0, train_loss=0.278, val_loss=0.927]
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 42.14it/s, v_num=0, train_loss=1.200]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000008)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 9:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss=0.278, val_loss=0.927]        
Epoch 9: 100%|██████████| 2/2 [00:00<00:00, 44.92it/s, v_num=0, train_loss=0.363, val_loss=0.927]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 10: 100%|██████████| 2/2 [00:00<00:00, 45.61it/s, v_num=0, train_loss=0.228, val_loss=0.917]


[36m(RayTrainWorker pid=4139457)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=4139457)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=4139457)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=4139457)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/pyt ...
[36m(RayTrainWorker pid=4139457)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pyt

Epoch 11:  50%|█████     | 1/2 [00:00<00:00, 31.66it/s, v_num=0, train_loss=0.116, val_loss=0.904]
Epoch 11: 100%|██████████| 2/2 [00:00<00:00, 47.12it/s, v_num=0, train_loss=0.132, val_loss=0.904]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=4139457)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
[36m(RayTrainWorker pid=4139457)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many wor

Epoch 13: 100%|██████████| 2/2 [00:00<00:00, 45.75it/s, v_num=0, train_loss=0.0633, val_loss=0.894]
Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]
[36m(RayTrainWorker pid=4139220)[0m 


[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 14: 100%|██████████| 2/2 [00:00<00:00, 47.22it/s, v_num=0, train_loss=0.0554, val_loss=0.892]


[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val/rmse', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val/mae', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=4139457)[0m /home/hwpang/miniforge3/envs/chemprop_v2_dev/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate th

Epoch 15: 100%|██████████| 2/2 [00:00<00:00, 45.45it/s, v_num=0, train_loss=0.073, val_loss=0.890] 
Epoch 5: 100%|██████████| 2/2 [00:00<00:00, 45.49it/s, v_num=0, train_loss=0.681, val_loss=0.939][32m [repeated 5x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 13x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 13x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 13x across cluster][0m
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 580.37it/s][A[32m [repeated 13x across cluster][0m
Epoch 15: 100%|██████████| 2/2 [00:00<00:00, 38.45it/s, v_num=0, train_loss=0.073, val_loss=0.887][32m [repeated 13x across cluster][0m
Epoch 5: 100%|██████████| 2/2 [00:00<00:00,  7.65it/s, v_num=0, train_loss=0.681, val_loss=0.957][32m [repeated 12x across cluster][0m


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000015)[32m [repeated 13x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 5:   0%|          | 0/2 [00:00<?, ?it/s, v_num=0, train_loss=0.374, val_loss=0.939][32m [repeated 5x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


                                                                       [A
Epoch 7: 100%|██████████| 2/2 [00:00<00:00, 44.90it/s, v_num=0, train_loss=0.197, val_loss=0.967]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 17: 100%|██████████| 2/2 [00:00<00:00, 45.82it/s, v_num=0, train_loss=0.0735, val_loss=0.884]
[36m(RayTrainWorker pid=4139220)[0m 
[36m(RayTrainWorker pid=4139220)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of 

Epoch 19: 100%|██████████| 2/2 [00:00<00:00,  8.26it/s, v_num=0, train_loss=0.115, val_loss=0.869]


[36m(RayTrainWorker pid=4139220)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000019)


## Hyperparameter optimization results

In [11]:
# Display the ResultGrid returned by tuner.fit(); it holds one Result per trial
results

ResultGrid<[
  Result(
    metrics={'train_loss': 0.11493527144193649, 'val/rmse': 0.8693722486495972, 'val/mae': 0.6867863535881042, 'val_loss': 0.8693722486495972, 'epoch': 19, 'step': 40},
    path='/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000019)
  ),
  Result(
    metrics={'train_loss': 0.4389992952346802, 'val/rmse': 0.9464786648750305, 'val/mae': 0.7655020952224731, 'val_loss': 0.9464786648750305, 'epoch': 9, 'step': 20},
    path='/home/hwpang/Projects/chemprop_v2_

In [12]:
# results of all trials
result_df = results.get_dataframe()
result_df

Unnamed: 0,train_loss,val/rmse,val/mae,val_loss,epoch,step,timestamp,checkpoint_dir_name,should_checkpoint,done,...,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/depth,config/train_loop_config/ffn_hidden_dim,config/train_loop_config/ffn_num_layers,config/train_loop_config/message_hidden_dim,logdir
0,0.114935,0.869372,0.686786,0.869372,19,40,1716483644,checkpoint_000019,True,True,...,4139123,estes,10.114.0.73,11.314006,20,2.0,2000.0,3.0,500.0,61728432
1,0.438999,0.946479,0.765502,0.946479,9,20,1716483644,checkpoint_000009,True,True,...,4139221,estes,10.114.0.73,8.540158,10,2.0,2200.0,3.0,400.0,f7121652


In [13]:
# best configuration
best_result = results.get_best_result()
best_config = best_result.config
best_config['train_loop_config']

{'depth': 2.0,
 'ffn_hidden_dim': 2000.0,
 'ffn_num_layers': 3.0,
 'message_hidden_dim': 500.0}

In [14]:
# best model checkpoint path
best_result = results.get_best_result()
best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
print(f"Best model checkpoint path: {best_checkpoint_path}")

Best model checkpoint path: /home/hwpang/Projects/chemprop_v2_dev/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-05-23_13-00-28/TorchTrainer_61728432_1_depth=2.0000,ffn_hidden_dim=2000.0000,ffn_num_layers=3.0000,message_hidden_dim=500.0000_2024-05-23_13-00-30/checkpoint_000019/checkpoint.ckpt


In [15]:
# Shut down Ray now that the search results have been collected
ray.shutdown()