In [None]:
#| default_exp hpopt
#| default_cls_lvl 3

In [None]:
#| export
from seqdata.core import *
from seqdata.models.core import *
from seqdata.learner import *
from fastai.basics import *
from fastai.callback.schedule import *
from fastai.callback.rnn import *
from fastai.callback.tracker import *

import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import *
from ray.tune.experiment.trial import ExportFormat
from ray import train
from ray.train import Checkpoint

Library "haste_pytorch" not found


In [None]:
# collect the Wiener-Hammerstein HDF5 files, leaving out every path that contains '_test.hdf5'
f_paths = Path.cwd() / 'test_data/WienerHammerstein/'
hdf_files = L([f for f in get_hdf_files(f_paths) if '_test.hdf5' not in str(f)])
# presumably cuts every file into windows of 400 samples with a step of 100, measured on column 'u'
# -- TODO confirm against `DfHDFCreateWindows`
tfm_src = CreateDict([DfHDFCreateWindows(win_sz=400,stp_sz=100,clm='u')])
# 'u' is the input sequence and 'y' the output sequence; the splitter function flags every item
# whose string form contains 'valid'
dls = DataBlock(blocks=(SequenceBlock.from_hdf(['u'],TensorSequencesInput),
                        SequenceBlock.from_hdf(['y'],TensorSequencesOutput)),
                get_items=tfm_src,
                splitter=ApplyToDict(FuncSplitter(lambda o: 'valid' in str(o)))).dataloaders(hdf_files)

## optimizer core

First we need a log-uniform distribution for variables whose values span several orders of magnitude.

In [None]:
#| export
def log_uniform(min_bound, max_bound, base=10):
    '''Build a sampler that draws uniformly in an exponential (log) range.

    The exponent is drawn uniformly between `log_base(min_bound)` and
    `log_base(max_bound)` and the sample is `base` raised to that exponent.

    Args:
        min_bound: one end of the sampling range, must be > 0.
        max_bound: other end of the sampling range, must be > 0.
        base: base of the logarithm, must be > 0 and different from 1.

    Returns:
        A function without arguments that returns one sample per call.

    Raises:
        ValueError: if a bound is not strictly positive or `base` is not a valid
            logarithm base. Without this check the sampler would silently
            produce NaN or infinite values.
    '''
    if np.any(np.less_equal(min_bound, 0)) or np.any(np.less_equal(max_bound, 0)):
        raise ValueError(f'log_uniform bounds must be positive, got {min_bound} and {max_bound}')
    if np.any(np.less_equal(base, 0)) or np.any(np.equal(base, 1)):
        raise ValueError(f'log_uniform base must be positive and different from 1, got {base}')

    # the exponent limits are computed once, the sampler only draws and exponentiates
    logmin = np.log(min_bound) / np.log(base)
    logmax = np.log(max_bound) / np.log(base)
    def _sample():
        return base**(np.random.uniform(logmin, logmax))
    return _sample

In [None]:
# draw five samples between 1e-8 and 1e-2 to check that they spread over the orders of magnitude
[log_uniform(1e-8, 1e-2)() for _ in range(5)]

[8.707423528633258e-06,
 1.1678226754983236e-07,
 1.8065566484337061e-06,
 7.65402711625238e-05,
 9.501829452482763e-07]

In [None]:
#| export
class LearnerTrainable(tune.Trainable):
    "Class-based tune trainable that wraps a fastai learner and trains it for one epoch per step"

    def setup(self, config):
        "Fetch the learner factory and the dataloaders from the object store and build the learner"
        # both config entries are object references, `ray.get` resolves them to the real objects
        self.create_lrn = ray.get(config['create_lrn'])
        self.dls = ray.get(config['dls'])

        self.lrn = self.create_lrn(self.dls,config)

    def step(self):
        "Fit a single epoch and return the latest recorder row as a metrics dictionary"
        with self.lrn.no_bar():
            self.lrn.fit(1)
        # the unpacking requires the recorder row to hold exactly three values
        trn_loss, val_loss, metric_value = self.lrn.recorder.values[-1]
        return dict(train_loss=trn_loss, valid_loss=val_loss, mean_loss=metric_value)

    def save_checkpoint(self, tmp_checkpoint_dir):
        "Write the model weights to `model.pth` inside `tmp_checkpoint_dir` and return that directory"
        weights_file = os.path.join(tmp_checkpoint_dir, "model.pth")
        torch.save(self.lrn.model.state_dict(), weights_file)
        return tmp_checkpoint_dir

    def load_checkpoint(self, tmp_checkpoint_dir):
        "Load the model weights from `model.pth` inside `tmp_checkpoint_dir` into the current model"
        weights = torch.load(os.path.join(tmp_checkpoint_dir, "model.pth"))
        self.lrn.model.load_state_dict(weights)

    def _export_model(self, export_formats, export_dir):
        "Save the model weights to `export_dir`, only the plain `MODEL` export format is accepted"
        if export_formats != [ExportFormat.MODEL]:
            raise ValueError("unexpected formats: " + str(export_formats))
        target = os.path.join(export_dir, "exported_model")
        torch.save(self.lrn.model.state_dict(), target)
        return {ExportFormat.MODEL: target}

    def reset_config(self, new_config):
        "Rebuild the learner from `new_config` so that changed hyperparameters take effect"
        # the learner, and with it the model, is created from scratch here
        # NOTE(review): presumably the weights are restored afterwards via `load_checkpoint` -- confirm
        self.lrn = self.create_lrn(self.dls,new_config)
        self.config = new_config
        return True

In [None]:
#| export
from fastai.callback.tracker import SaveModelCallback 
class CBRaySaveModel(SaveModelCallback):
    "A `TrackerCallback` that saves the model's best during training in a checkpoint directory that outlives the save call"

    def _save(self, name):
        "Save the model weights (without optimizer state) as `<name>.pth` and remember the path in `last_saved_path`"
        # Saving inside a `tempfile.TemporaryDirectory()` context deletes the file as soon as the
        # context exits, which leaves `last_saved_path` pointing to a file that no longer exists.
        # `mkdtemp` creates a directory that persists until somebody removes it explicitly.
        checkpoint_dir = getattr(self, '_checkpoint_dir', None)
        if checkpoint_dir is None or not os.path.isdir(checkpoint_dir):
            # one directory per callback instance, repeated saves overwrite the same file
            checkpoint_dir = tempfile.mkdtemp()
            self._checkpoint_dir = checkpoint_dir
        file = os.path.join(checkpoint_dir,name+'.pth')
        save_model(file, self.learn.model,opt=None)
        self.last_saved_path = file

    #final checkpoint
    def after_fit(self, **kwargs):
        "Save the model once more when fitting ends"
        self._save(f'{self.fname}')

In [None]:
#| export
def learner_optimize(config):
    """Function trainable for ray tune: build a learner from `config` and train it.

    Keys read from `config`:
        create_lrn: object reference to a function `create_lrn(dls, config)` returning a learner.
        dls: object reference to the dataloaders.
        fit_method: optional object reference to the fit function, called as
            `fit_method(lrn, **lrn_kwargs)`. Falls back to `Learner.fit_flat_cos`.
        n_epoch, pct_start: optional scheduling parameters (defaults 100 and 0.5).
        lr: optional learning rate (default 3e-3).
        reporter: optional object reference to a callback class used instead of `CBRayReporter`.

    If tune provides a checkpoint, the model weights are restored from its `model.pth`
    before training starts.
    """
    create_lrn = ray.get(config['create_lrn'])
    dls = ray.get(config['dls'])

    #Scheduling Parameters for training the Model
    lrn_kwargs = {'n_epoch':100,'pct_start':0.5}
    for attr in ['n_epoch','pct_start']:
        if attr in config: lrn_kwargs[attr] = config[attr]

    lrn = create_lrn(dls,config)

    # load checkpoint data if provided
    checkpoint: train.Checkpoint = train.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            # join with a separator: plain string concatenation would look for
            # '<checkpoint_dir>model.pth' next to the directory instead of inside it
            lrn.model.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'model.pth')))

    lrn.lr = config['lr'] if 'lr' in config else 3e-3
    lrn.add_cb(CBRayReporter() if 'reporter' not in config else ray.get(config['reporter'])())
    # lrn.add_cb(CBRaySaveModel()) #the model saving now has to be done by the reporter callback

    # same default as `HPOptimizer.optimize`, so the function also works with a hand-built config
    fit_method = ray.get(config['fit_method']) if 'fit_method' in config else Learner.fit_flat_cos
    with lrn.no_bar():
        fit_method(lrn,**lrn_kwargs)

The mutation config dictionary consists of functions that each sample from a distribution. To obtain a dictionary holding one concrete realisation of every entry, we need the function `sample_config`.

In [None]:
#| export
def sample_config(config):
    "Return a copy of `config` in which every sampling function is replaced by one value drawn from it"
    realisation = config.copy()
    # every entry is expected to be callable without arguments, the input dictionary stays untouched
    for key, sampler in config.items():
        realisation[key] = sampler()
    return realisation

In [None]:
#| export
class CBRayReporter(Callback):
    "`Callback` that reports the metrics and a model checkpoint to ray tune after every epoch"

    order=70 #has to be >50, so that it runs after the recorder callback

    def after_epoch(self):
        "Collect the latest recorder row, save the model weights and report both to ray"
        row = self.learn.recorder.values[-1]
        # the first two recorder values are the losses, the remaining ones belong to the metrics
        report = dict(train_loss=row[0], valid_loss=row[1])
        for m, score in zip(self.learn.metrics, row[2:]):
            key = m.name if hasattr(m,'name') else str(m)
            report[key] = score

        # the weights have to be inside the checkpoint directory at the moment it is reported,
        # which is why saving is done here and not in a separate callback
        with tempfile.TemporaryDirectory() as ckpt_dir:
            weights_file = os.path.join(ckpt_dir,'model.pth')
            save_model(weights_file, self.learn.model,opt=None)
            ray.train.report(report, checkpoint=Checkpoint.from_directory(ckpt_dir))

In [None]:
#| export
class HPOptimizer():
    """Run ray tune hyperparameter searches for learners built by `create_lrn(dls, config)`.

    `create_lrn` and `dls` are placed in the ray object store for every run and handed to the
    trials through the config keys 'create_lrn' and 'dls'. The result of the last run is kept
    in `self.analysis`.
    """
    def __init__(self,create_lrn,dls):
        self.create_lrn = create_lrn
        self.dls = dls
        self.analysis = None
        # only set by `optimize_pbt`, defined here so the attribute always exists
        self.mut_conf = None

    @delegates(ray.init)
    def start_ray(self,**kwargs):
        "Shut down a running ray instance, if any, and start a new one with `kwargs`"
        ray.shutdown()
        ray.init(**kwargs)

    def stop_ray(self):
        "Shut down the ray instance"
        ray.shutdown()

    @delegates(tune.run, keep=True)
    def optimize(self,config,optimize_func=learner_optimize,resources_per_trial={"gpu": 1.0},verbose=1,**kwargs):
        """Run `tune.run` with the function trainable `optimize_func` over the search space `config`.

        `config` itself is not modified. The object references are added to a copy, so that a
        config dictionary can be reused after ray has been restarted without carrying references
        to objects of the previous ray session. Remaining `kwargs` are passed to `tune.run`.
        Returns the analysis object, which is also stored in `self.analysis`.
        """
        config = dict(config)
        config['create_lrn'] = ray.put(self.create_lrn)
        #dls are large objects, letting ray handle the copying process makes it much faster
        config['dls'] = ray.put(self.dls)
        if 'fit_method' not in config: config['fit_method'] = ray.put(Learner.fit_flat_cos)

        kwargs.setdefault('keep_checkpoints_num', 1)#keep only the last checkpoint

        self.analysis = tune.run(
            optimize_func,
            config=config,
            resources_per_trial=resources_per_trial,
            verbose=verbose,
            **kwargs)
        return self.analysis

    @delegates(tune.run, keep=True)
    def optimize_pbt(self,opt_name,num_samples,config,mut_conf,perturbation_interval=2,
                 stop={"training_iteration": 40 },
                 resources_per_trial={"gpu": 1 },
                 resample_probability=0.25,
                 quantile_fraction=0.25,
                 **kwargs):
        """Run population based training with `LearnerTrainable`, minimising the metric 'mean_loss'.

        `config` is the initial search space and `mut_conf` the mutation distributions handed to
        `PopulationBasedTraining` as `hyperparam_mutations`. `config` itself is not modified, the
        object references are added to a copy. Remaining `kwargs` are passed to `tune.run`.
        Returns the analysis object, which is also stored in `self.analysis`.
        """
        self.mut_conf = mut_conf

        config = dict(config)
        config['create_lrn'] = ray.put(self.create_lrn)
        #dls are large objects, letting ray handle the copying process makes it much faster
        config['dls'] = ray.put(self.dls)

        kwargs.setdefault('keep_checkpoints_num', 2)

        scheduler = PopulationBasedTraining(
            time_attr="training_iteration",
            metric="mean_loss",
            mode="min",
            perturbation_interval=perturbation_interval,
            resample_probability=resample_probability,
            quantile_fraction=quantile_fraction,
            hyperparam_mutations=mut_conf)

        self.analysis = tune.run(
            LearnerTrainable,
            name=opt_name,
            scheduler=scheduler,
            reuse_actors=True,
            verbose=1,
            stop=stop,
            # the 'min-' prefix ranks checkpoints by the lowest loss, a plain attribute name
            # would keep the checkpoints with the highest loss
            checkpoint_score_attr="min-mean_loss",
            num_samples=num_samples,
            resources_per_trial=resources_per_trial,
            config=config,
            **kwargs)
        return self.analysis

    def best_model(self, metric='mean_loss', mode='min'):
        """Recreate the model of the best trial of the last run and load its checkpointed weights.

        `metric` and `mode` select the best trial. The defaults match the metric used by
        `optimize_pbt`; runs that report other metric names have to pass the name explicitly.
        The model is built from the configuration of the best trial, so that its architecture
        matches the stored weights.

        Raises:
            RuntimeError: if no run has been executed yet, or no trial with a checkpoint is
                found for `metric`.
        """
        if self.analysis is None:
            raise RuntimeError('no results available, run `optimize` or `optimize_pbt` first')
        trial = self.analysis.get_best_trial(metric,mode=mode)
        if trial is None or trial.checkpoint is None:
            raise RuntimeError(f'no trial with a checkpoint found for metric {metric!r}')
        model = self.create_lrn(self.dls,trial.config).model
        # checkpoints are directories holding the weights as 'model.pth', as written by
        # `LearnerTrainable.save_checkpoint` and `CBRayReporter`
        with trial.checkpoint.as_directory() as checkpoint_dir:
            model.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'model.pth')))
        return model

### Test Population Based Training

In [None]:
def create_lrn(dls,config):
    "Build an `RNNLearner` on `dls` and set its learning rate from `config`"
    # 'alpha', 'beta' and 'weight_p' are looked up as well, so they have to be present in `config`,
    # but they are not applied to the learner
    learning_rate, _alpha, _beta, _weight_p = (config[key] for key in ('lr','alpha','beta','weight_p'))

    learner = RNNLearner(dls)
    learner.lr = learning_rate
    return learner

In [None]:
# initial search space, every trial starts with one sample of each entry
config={
            # bounds are given as (lower, upper)
            "lr": tune.loguniform(1e-4, 1e-2),
            "alpha": tune.loguniform(1e-5, 10),
            "beta": tune.loguniform(1e-5, 10),
            "weight_p": tune.uniform(0, 0.5)}
mut_conf = {# distribution for resampling
            "lr": log_uniform(1e-8, 1e-2),
            "alpha": log_uniform(1e-5, 10),
            "beta": log_uniform(1e-5, 10),
            "weight_p": lambda: np.random.uniform(0, 0.5)}

hp_opt = HPOptimizer(create_lrn,dls)
hp_opt.start_ray()
hp_opt.optimize_pbt('pbt_test',4,config,mut_conf,perturbation_interval=1,
                 stop={"training_iteration": 3 },
                 resources_per_trial={"gpu": 0.5},
                 storage_path=str(Path.home() / 'ray_results'))#no cpu count is necessary

0,1
Current time:,2024-02-27 09:08:51
Running for:,00:00:09.91
Memory:,51.0/251.7 GiB

Trial name,status,loc,alpha,beta,lr,weight_p,loss,iter,total time (s),train_loss,valid_loss
LearnerTrainable_6ba23_00000,TERMINATED,141.23.125.123:2274922,0.000754759,0.147041,0.00206852,0.427199,0.243669,3,4.09341,0.054592,0.0594217
LearnerTrainable_6ba23_00001,TERMINATED,141.23.125.123:2274980,6.6786e-05,1.14297e-05,0.00229839,0.28322,0.24408,3,3.57251,0.0548091,0.0596214
LearnerTrainable_6ba23_00002,TERMINATED,141.23.125.123:2274951,0.000943449,0.122534,0.00258565,0.389235,0.242528,3,4.3396,0.0545417,0.0588672
LearnerTrainable_6ba23_00003,TERMINATED,141.23.125.123:2274980,8.31592e-05,0.000128321,0.00130677,0.15309,0.245304,3,3.69751,0.0551621,0.0602236


[36m(LearnerTrainable pid=2274951)[0m Library "haste_pytorch" not found


2024-02-27 09:08:47,999	INFO pbt.py:716 -- [pbt]: no checkpoint for trial LearnerTrainable_6ba23_00001. Skip exploit for Trial LearnerTrainable_6ba23_00000


[36m(LearnerTrainable pid=2274922)[0m [0, 0.056544214487075806, 0.0614364854991436, 0.24775400757789612, '00:01']


[36m(LearnerTrainable pid=2274980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/pbt_test/LearnerTrainable_6ba23_00003_3_alpha=0.0001,beta=0.0001,lr=0.0013,weight_p=0.1531_2024-02-27_09-08-42/checkpoint_000000)
2024-02-27 09:08:49,183	INFO pbt.py:878 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 6ba23_00002 (score = -0.245733) into trial 6ba23_00000 (score = -0.260895)

2024-02-27 09:08:49,184	INFO pbt.py:905 -- 

[PopulationBasedTraining] [Explore] Perturbed the hyperparameter config of trial6ba23_00000:
lr : 0.002585650659524256 --- (* 0.8) --> 0.002068520527619405
alpha : 0.0009434489459286575 --- (* 0.8) --> 0.0007547591567429261
beta : 0.12253412616726472 --- (* 1.2) --> 0.14704095140071766
weight_p : 0.38923479562285357 --- (resample) --> 0.42719880741350896

[36m(LearnerTrainable pid=2274922)[0m Restored on 141.23.125.123 from checkpoint: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/pbt_test/Learner

[36m(LearnerTrainable pid=2274921)[0m Library "haste_pytorch" not found[32m [repeated 3x across cluster][0m
[36m(LearnerTrainable pid=2274980)[0m [0, 0.05516214296221733, 0.06022358313202858, 0.24530412256717682, '00:00'][32m [repeated 12x across cluster][0m


2024-02-27 09:08:54,335	INFO tune.py:1042 -- Total run time: 12.32 seconds (9.90 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis>

In [None]:
#hp_opt.best_model()

### Test Grid Search

In [None]:
# dls.cpu()

<fastai.data.core.DataLoaders>

In [None]:
def create_lrn(dls,config):
    "Build an `RNNLearner` whose hidden size is taken from `config`, tracked with the metrics `fun_rmse` and `mse`"
    return RNNLearner(dls,hidden_size=config['hidden_size'],metrics=[fun_rmse,mse])

In [None]:
# class CustomReporter(Callback):
#     "`Callback` reports progress after every epoch to the ray tune logger"
    
#     order=70 #order has to be >50, to be executed after the recorder callback

#     def after_epoch(self):
#         train_loss,valid_loss,rmse,mse = self.learn.recorder.values[-1]
#         print(self.learn.recorder.values[-1])
#         metrics = {
#             'train_loss': train_loss,
#             'valid_loss': valid_loss,
#             'mean_loss': rmse,
#             'mse': mse,
#         }
#         with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
#             ray.train.report(metrics, checkpoint=Checkpoint.from_directory(temp_checkpoint_dir))

In [None]:
# new optimizer instance that uses the `create_lrn` of the grid search example
hp_opt = HPOptimizer(create_lrn,dls)

In [None]:
# shuts down a running ray instance and starts a fresh one
hp_opt.start_ray()

2024-02-27 09:08:57,880	INFO worker.py:1724 -- Started a local Ray instance.


In [None]:
# one trial per listed `hidden_size`; `n_epoch` is read by `learner_optimize` as the number of epochs
search_space = {
    "hidden_size": tune.grid_search([10,20,50,100]),
    'n_epoch':10,
    # 'reporter':ray.put(CustomReporter)
}

In [None]:
# every trial requests half a GPU
hp_opt.optimize(resources_per_trial={"gpu": 0.5},
                config=search_space)

0,1
Current time:,2024-02-27 09:09:21
Running for:,00:00:19.05
Memory:,49.6/251.7 GiB

Trial name,status,loc,hidden_size,iter,total time (s),train_loss,valid_loss,fun_rmse
learner_optimize_77a19_00000,TERMINATED,141.23.125.123:2313691,10,10,15.3151,0.0959706,0.0715517,0.267347
learner_optimize_77a19_00001,TERMINATED,141.23.125.123:2313693,20,10,15.1866,0.0574443,0.0602697,0.245406
learner_optimize_77a19_00002,TERMINATED,141.23.125.123:2313694,50,10,15.1335,0.0306168,0.017711,0.13303
learner_optimize_77a19_00003,TERMINATED,141.23.125.123:2313695,100,10,15.388,0.0178903,0.00585899,0.0765191


[36m(learner_optimize pid=2313693)[0m Library "haste_pytorch" not found
[36m(learner_optimize pid=2313693)[0m [0, 0.08589156717061996, 0.08840160816907883, 0.2971020042896271, 0.08840160816907883, '00:02']


[36m(learner_optimize pid=2313694)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/learner_optimize_2024-02-27_09-09-02/learner_optimize_77a19_00002_2_hidden_size=50_2024-02-27_09-09-02/checkpoint_000000)


[36m(learner_optimize pid=2313694)[0m Library "haste_pytorch" not found[32m [repeated 3x across cluster][0m
[36m(learner_optimize pid=2313693)[0m [4, 0.0641716793179512, 0.061240144073963165, 0.2473810464143753, 0.061240144073963165, '00:01'][32m [repeated 16x across cluster][0m


[36m(learner_optimize pid=2313693)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/learner_optimize_2024-02-27_09-09-02/learner_optimize_77a19_00001_1_hidden_size=20_2024-02-27_09-09-02/checkpoint_000004)[32m [repeated 16x across cluster][0m


[36m(learner_optimize pid=2313694)[0m [8, 0.0348917655646801, 0.017931213602423668, 0.13385559618473053, 0.017931213602423668, '00:01'][32m [repeated 16x across cluster][0m


[36m(learner_optimize pid=2313694)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/learner_optimize_2024-02-27_09-09-02/learner_optimize_77a19_00002_2_hidden_size=50_2024-02-27_09-09-02/checkpoint_000008)[32m [repeated 16x across cluster][0m
2024-02-27 09:09:21,214	INFO tune.py:1042 -- Total run time: 19.07 seconds (19.05 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis>

In [None]:
# `CBRayReporter` reports every metric under its own name, this run has no 'mean_loss' entry,
# so the best configuration is selected by the reported `fun_rmse`
hp_opt.analysis.get_best_config('fun_rmse',mode='min')



### Test Random Search

In [None]:
#| include: false
# write the cells marked with `#| export` to the library module
import nbdev; nbdev.nbdev_export()

[36m(learner_optimize pid=2313695)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/pheenix/ray_results/learner_optimize_2024-02-27_09-09-02/learner_optimize_77a19_00003_3_hidden_size=100_2024-02-27_09-09-02/checkpoint_000009)


[36m(learner_optimize pid=2313695)[0m [9, 0.017890259623527527, 0.005858990829437971, 0.07651908695697784, 0.005858990829437971, '00:01']
