In [1]:
from ase.db import connect
import pyro
from bnn import BNN
import torch
import lightning as L
import torch.utils.data as data
import numpy as np
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from optuna import Study
import optuna
from optuna.trial import Trial

from datamodel import AenetDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datamodele = AenetDataModule('/home/riccardo/bin/repos/aenet-bayesian/examples/liquid_water/train.in')
net = datamodele.get_model()
model_kwargs = {'net': net,
        'lr': 0.00055,
        'pretrain_epochs': 0,
        'mc_samples_train': 1,
        'mc_samples_eval': 20, 
        'dataset_size': datamodele.dataset_size, 
        'fit_context': 'lrt', 
        'prior_loc': 1, 
        'prior_scale': 1, 
        'guide': 'normal', 
        'q_scale': 1.,
        'obs_scale' : 10.}

In [3]:
datamodele.dataset_size

8270

In [4]:
early_stopping = EarlyStopping(monitor='elbo/val', min_delta = 0., # minimum change in the monitored quantity to qualify as an improvement
  patience= 3, # number of checks with no improvement after which training will be stopped
  verbose= False, # verbosity mode
  mode= "min", # "max" means higher metric value is better, can be also "min"
  strict= True, # whether to crash the training if monitor is not found in the validation metrics
  check_finite= True,) # when set True, stops training when the monitor becomes NaN or infinite
#   'stopping_threshold'= null, # stop training immediately once the monitored quantity reaches this threshold
#   'divergence_threshold'= null, # stop training as soon as the monitored quantity becomes worse than this threshold
#   'check_on_train_epoch_end'= null,)

early_stopping = EarlyStopping(
    monitor='elbo/val',
    patience=100,
    mode='min' )

In [5]:
default_root_dir = '../examples/liquid_water/'

def objective(trial: Trial, model_kwargs : dict, output_dir: str):
    # model_kwargs['pretrain_epochs'] = trial.suggest_categorical(
    #         "pretrain_epochs", [0, 5]
    #     )
    model_kwargs['lr'] = trial.suggest_float("lr", 1e-5, 1e-3)
    #model_kwargs['prior_loc'] = trial.suggest_float("prior_loc", 0., 1., log=True)
    model_kwargs['prior_scale'] = trial.suggest_float("prior_scale", 0.1, 5, log=True)
    model_kwargs['q_scale'] = trial.suggest_float("q_scale", 1e-4, 5, log=True)
    model_kwargs['obs_scale'] = trial.suggest_float("obs_scale", 0.1, 5, log=True)

    model = BNN(**model_kwargs)
    trainer = L.Trainer(min_epochs = 1, max_epochs = 20, default_root_dir=default_root_dir, callbacks=early_stopping)
    trainer.fit(model=model, datamodule=datamodele)

    return trainer.callback_metrics['elbo/val'].item()

study = optuna.create_study(study_name='bnn_hyperparams', storage=f"sqlite:///{default_root_dir}/bnn_hparams_big.sqlite3", directions=["minimize"])
study.optimize(
        lambda trial: objective(trial, model_kwargs=model_kwargs, output_dir='optuna_log'),
        n_trials=60,
        timeout=None,
        catch=(RuntimeError,),
    )

[I 2024-03-04 14:24:46,926] A new study created in RDB with name: bnn_hyperparams
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: ../examples/liquid_water/lightning_logs
/home/riccardo/anaconda3/envs/bayesian/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:181: `LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer

  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

/home/riccardo/anaconda3/envs/bayesian/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/home/riccardo/anaconda3/envs/bayesian/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/riccardo/anaconda3/envs/bayesian/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (8) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:  12%|█▎        | 1/8 [00:01<00:10,  0.67it/s, v_num=0]



Epoch 19: 100%|██████████| 8/8 [00:12<00:00,  0.66it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:12<00:00,  0.65it/s, v_num=0]

[I 2024-03-04 14:30:14,964] Trial 0 finished with value: 12873508864.0 and parameters: {'lr': 0.0007199468657569009, 'prior_scale': 0.10225820217663267, 'q_scale': 0.07802701124751607, 'obs_scale': 0.25334807359648653}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.68it/s, v_num=1]           

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.68it/s, v_num=1]

[I 2024-03-04 14:35:01,978] Trial 1 finished with value: 172226225635328.0 and parameters: {'lr': 0.00019436469764910332, 'prior_scale': 1.2626126492131449, 'q_scale': 0.002430902419695284, 'obs_scale': 1.156716997950242}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.71it/s, v_num=2]           

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.71it/s, v_num=2]

[I 2024-03-04 14:39:46,743] Trial 2 finished with value: 2554800898048000.0 and parameters: {'lr': 0.0002601952086633604, 'prior_scale': 0.13765307977701544, 'q_scale': 0.006144291477574801, 'obs_scale': 3.9322959145371232}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.69it/s, v_num=3]           

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.69it/s, v_num=3]

[I 2024-03-04 14:44:22,915] Trial 3 finished with value: 1.1609295226698465e+18 and parameters: {'lr': 0.0007509964530488403, 'prior_scale': 0.5896561998318629, 'q_scale': 0.00011262290761080629, 'obs_scale': 2.2481811038893236}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.70it/s, v_num=4]           

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.70it/s, v_num=4]

[I 2024-03-04 14:49:07,377] Trial 4 finished with value: 5.0922983838957175e+19 and parameters: {'lr': 0.0004679098012769008, 'prior_scale': 0.22525628910529075, 'q_scale': 0.00024929633383857547, 'obs_scale': 4.109720020842615}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.69it/s, v_num=5]           

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 8/8 [00:11<00:00,  0.69it/s, v_num=5]

[I 2024-03-04 14:53:39,702] Trial 5 finished with value: 5.313508249382356e+23 and parameters: {'lr': 0.0005476261859273093, 'prior_scale': 0.6864277036138321, 'q_scale': 0.24231894856796715, 'obs_scale': 0.4881046582064587}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 8/8 [00:11<00:00,  0.67it/s, v_num=6]            

[I 2024-03-04 14:54:36,456] Trial 6 finished with value: 1.9695009880904376e+25 and parameters: {'lr': 0.000715226773082453, 'prior_scale': 0.852314908426756, 'q_scale': 0.003350174301426805, 'obs_scale': 0.9665357378833582}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 8/8 [00:11<00:00,  0.70it/s, v_num=7]            

[I 2024-03-04 14:55:28,608] Trial 7 finished with value: 6.757822134965293e+27 and parameters: {'lr': 0.0003025083904412125, 'prior_scale': 0.3697289978173995, 'q_scale': 1.0571798261831868, 'obs_scale': 0.6334874561840249}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name | Type    | Params
---------------------------------
0 | net  | NetAtom | 2.1 K 
---------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 8/8 [00:12<00:00,  0.65it/s, v_num=8]            

[I 2024-03-04 14:56:33,814] Trial 8 finished with value: 2.7829328807587684e+29 and parameters: {'lr': 2.8930669898145903e-05, 'prior_scale': 0.11247446937748473, 'q_scale': 0.010763766109824777, 'obs_scale': 1.1889885105778863}. Best is trial 0 with value: 12873508864.0.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs





: 