In [None]:
#| default_exp hpopt
#| default_cls_lvl 3

In [None]:
#| export
from seqdata.core import *
from seqdata.models.core import *
from seqdata.learner import *
from fastai.basics import *
from fastai.callback.schedule import *
from fastai.callback.rnn import *
from fastai.callback.tracker import *

import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import *
from ray.tune.experiment.trial import ExportFormat
from ray import train
from ray.train import Checkpoint

In [None]:
# Collect the HDF5 files below ./test_data/WienerHammerstein/, leaving out every file
# whose path contains '_test.hdf5'.
f_paths = Path.cwd() / 'test_data/WienerHammerstein/'
hdf_files = L([f for f in get_hdf_files(f_paths) if '_test.hdf5' not in str(f)])
# Item transform applied to the file list: windows with win_sz=400 and stp_sz=100 on column 'u'.
tfm_src = CreateDict([DfHDFCreateWindows(win_sz=400,stp_sz=100,clm='u')])
# Dataloaders with 'u' as input and 'y' as output sequence. The splitter selects the items
# whose string representation contains 'valid' (presumably the validation set - confirm).
dls = DataBlock(blocks=(SequenceBlock.from_hdf(['u'],TensorSequencesInput),
                        SequenceBlock.from_hdf(['y'],TensorSequencesOutput)),
                get_items=tfm_src,
                splitter=ApplyToDict(FuncSplitter(lambda o: 'valid' in str(o)))).dataloaders(hdf_files)

## optimizer core

First, we need a log-uniform distribution for hyperparameters whose plausible values span several orders of magnitude.

In [None]:
#| export
def log_uniform(min_bound, max_bound, base=10):
    '''Create a sampler that draws uniformly in an exponential (log) range.

    The exponent is drawn uniformly between `log_base(min_bound)` and
    `log_base(max_bound)` and the sampler returns `base**exponent`.

    Args:
        min_bound: first bound of the range, must be > 0.
        max_bound: second bound of the range, must be > 0.
        base: base of the logarithm, must be > 0 and different from 1.

    Returns:
        A zero-argument function; every call returns one new sample.

    Raises:
        ValueError: if a bound is not strictly positive or `base` is not a valid
            logarithm base. Without this check the logarithm silently becomes
            nan/inf and the error only surfaces later, far away from its cause.
    '''
    if np.any(np.less_equal(min_bound, 0)) or np.any(np.less_equal(max_bound, 0)):
        raise ValueError(f"log_uniform needs strictly positive bounds, got ({min_bound}, {max_bound})")
    if base <= 0 or base == 1:
        raise ValueError(f"log_uniform needs a base > 0 and != 1, got {base}")
    log_base = np.log(base)  # computed once, shared by both bounds
    logmin = np.log(min_bound) / log_base
    logmax = np.log(max_bound) / log_base
    def _sample():
        return base**(np.random.uniform(logmin, logmax))
    return _sample

In [None]:
[log_uniform(1e-8, 1e-2)() for _ in range(5)]

[9.69419005016273e-07,
 4.1156426625834497e-07,
 6.122196421320673e-08,
 7.288171684370689e-07,
 3.974506352285226e-06]

In [None]:
#| export
class LearnerTrainable(tune.Trainable):
    """Class-based Tune trainable that trains a fastai learner one epoch per `step`.

    `config` must contain the Ray object references `create_lrn` and `dls`
    (stored there by `HPOptimizer.optimize_pbt`); the learner is built with
    `create_lrn(dls, config)`.
    """

    def setup(self, config):
        "Fetch the learner factory and the dataloaders from the object store and build the learner."
        self.create_lrn = ray.get(config['create_lrn'])
        self.dls = ray.get(config['dls'])

        self.lrn = self.create_lrn(self.dls,config)

    def step(self):
        """Train for one epoch and return the values recorded for it.

        Returns:
            dict with `train_loss`, `valid_loss` and `mean_loss`, where `mean_loss`
            is the first metric of the learner (the value PBT optimizes).

        Raises:
            ValueError: if the learner records no metric besides the two losses.
        """
        with self.lrn.no_bar(): self.lrn.fit(1)
        scores = self.lrn.recorder.values[-1]
        # Index instead of unpacking exactly three values, so that learners with
        # more than one metric work as well (same approach as `CBRayReporter`).
        if len(scores) < 3:
            raise ValueError("LearnerTrainable needs a learner with at least one metric "
                             f"to report 'mean_loss', but only {len(scores)} values were recorded")
        return {'train_loss': scores[0],
                'valid_loss': scores[1],
                'mean_loss': scores[2]}

    def save_checkpoint(self, tmp_checkpoint_dir):
        "Write the model weights to `model.pth` inside `tmp_checkpoint_dir` and return that directory."
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        torch.save(self.lrn.model.state_dict(), checkpoint_path)
        return tmp_checkpoint_dir

    def load_checkpoint(self, tmp_checkpoint_dir):
        "Load the model weights from `model.pth` inside `tmp_checkpoint_dir`."
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        self.lrn.model.load_state_dict(torch.load(checkpoint_path))

    def _export_model(self, export_formats, export_dir):
        "Save the model weights to `export_dir/exported_model`; only `[ExportFormat.MODEL]` is supported."
        if export_formats == [ExportFormat.MODEL]:
            path = os.path.join(export_dir, "exported_model")
            torch.save(self.lrn.model.state_dict(), path)
            return {ExportFormat.MODEL: path}
        else:
            raise ValueError("unexpected formats: " + str(export_formats))

    def reset_config(self, new_config):
        """Rebuild the learner with `new_config` so that perturbed hyperparameters take effect.

        The learner is recreated from scratch; the trained weights are presumably
        restored afterwards by Tune through `load_checkpoint` - TODO confirm.
        Returns True to signal that the actor can be reused with the new config.
        """
        self.lrn = self.create_lrn(self.dls,new_config)
        self.config = new_config
        return True

In [None]:
#| export
from fastai.callback.tracker import SaveModelCallback 
class CBRaySaveModel(SaveModelCallback):
    "A `TrackerCallback` that saves the model's best during training in a tune checkpoint directory"
    
    def _save(self, name):
        # Writes `<name>.pth` (model weights only, no optimizer state) into a fresh temporary directory.
        # NOTE(review): the temporary directory is removed as soon as the `with` block exits,
        # so `last_saved_path` points to a file that no longer exists and nothing is handed
        # over to Ray here. `learner_optimize` has the use of this callback commented out and
        # states that saving is done by the reporter callback instead.
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            file = os.path.join(temp_checkpoint_dir,name+'.pth')
            save_model(file, self.learn.model,opt=None)
            self.last_saved_path = file
            
    #final checkpoint
    def after_fit(self, **kwargs):
        # Saves once more under the configured file name when fitting has finished.
        self._save(f'{self.fname}')

In [None]:
#| export
from multiprocessing.managers import SharedMemoryManager
def stop_shared_memory_managers(obj):
    """
    Iteratively finds and stops all SharedMemoryManager instances contained within the provided object.

    The object graph is walked with an explicit stack: dict keys and values, the items of
    lists, tuples, sets and frozensets, and the attributes (`vars`) of any other object
    that has a `__dict__`. Every object is inspected once, so reference cycles are safe.

    Args:
        obj: root object to inspect; may be any Python object, including None.
    """
    visited = set()  # ids of objects already inspected, guards against cycles
    stack = [obj]  # objects that still have to be inspected

    while stack:
        current_obj = stack.pop()
        obj_id = id(current_obj)

        if obj_id in visited:
            continue  # Skip already visited objects
        visited.add(obj_id)

        # Stop managers and do not descend into their internals
        if isinstance(current_obj, SharedMemoryManager):
            # This function is used for cleanup; a manager that offers no callable
            # `shutdown` (presumably one that was never started) is skipped instead
            # of aborting the cleanup of the remaining managers.
            shutdown = getattr(current_obj, 'shutdown', None)
            if callable(shutdown):
                shutdown()
            continue

        # If it's a collection, add its items to the stack. Otherwise, add its attributes.
        if isinstance(current_obj, dict):
            stack.extend(current_obj.keys())
            stack.extend(current_obj.values())
        elif isinstance(current_obj, (list, set, frozenset, tuple)):
            stack.extend(current_obj)
        elif hasattr(current_obj, '__dict__'):  # Check for custom objects with attributes
            stack.extend(vars(current_obj).values())

In [None]:
#| export
import gc
def learner_optimize(config):
    """Function trainable for Ray Tune: build a learner from `config` and fit it.

    Keys read from `config`:
        create_lrn: object ref of a factory called as `create_lrn(dls, config)`.
        dls: object ref of the dataloaders passed to the factory.
        fit_method: object ref of the fit function, called as `fit_method(lrn, **lrn_kwargs)`.
        n_epoch, pct_start: optional scheduling parameters (defaults 100 and 0.5).
        lr: optional learning rate (default 3e-3).
        reporter: optional object ref of a callback class; `CBRayReporter` is used otherwise.

    If Tune provides a checkpoint, the model weights are restored from its `model.pth`
    before training starts.
    """
    # Bound before the try block so that the cleanup below also works when the
    # learner could not be created; otherwise the `finally` clause would raise an
    # UnboundLocalError that hides the original error.
    lrn = None
    try:
        create_lrn = ray.get(config['create_lrn'])
        dls = ray.get(config['dls'])
        
        #Scheduling Parameters for training the Model
        lrn_kwargs = {'n_epoch':100,'pct_start':0.5}
        for attr in ['n_epoch','pct_start']:
            if attr in config: lrn_kwargs[attr] = config[attr]
    
        lrn = create_lrn(dls,config)
        
        # load checkpoint data if provided
        checkpoint: train.Checkpoint = train.get_checkpoint()
        if checkpoint:
            with checkpoint.as_directory() as checkpoint_dir:
                # join the path properly; plain concatenation drops the separator
                lrn.model.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'model.pth')))
        
        lrn.lr = config['lr'] if 'lr' in config else 3e-3
        lrn.add_cb(CBRayReporter() if 'reporter' not in config else ray.get(config['reporter'])())
        # lrn.add_cb(CBRaySaveModel()) #the model saving now has to be done by the reporter callback
        with lrn.no_bar(): 
            ray.get(config['fit_method'])(lrn,**lrn_kwargs)
    finally:
        #cleanup shared memory even when earlystopping occurs
        if lrn is not None:
            stop_shared_memory_managers(lrn)
        del lrn
        gc.collect()

In [None]:
from fastai.callback.core import Callback
class TrainSpecificEpoch(Callback):
    "Intended to train only epoch `epoch` of a fit; currently it only prints the epoch index (skip logic is commented out)"
    order = 70
    
    def __init__(self, epoch:int):
        # epoch index that is meant to be trained; stored but not used by the active code
        self._skip_to = epoch

    def before_epoch(self):
        # Debug output of the current epoch index.
        print(self.epoch)
        # NOTE(review): the exceptions below are disabled, so every epoch of the fit is trained.
        # if self.epoch < self._skip_to:
        #     raise CancelEpochException
        # if self.epoch > self._skip_to:
        # raise CancelFitException

In [None]:
CancelEpochException??

[0;31mInit signature:[0m [0mCancelEpochException[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      Skip the rest of this epoch and go to `after_epoch`
[0;31mFile:[0m           ~/miniconda3/envs/env_fastai/lib/python3.10/site-packages/fastcore/basics.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     

In [None]:
SkipToEpoch??

[0;31mInit signature:[0m [0mSkipToEpoch[0m[0;34m([0m[0mepoch[0m[0;34m:[0m [0;34m'int'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mSkipToEpoch[0m[0;34m([0m[0mCallback[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Skip training up to `epoch`"[0m[0;34m[0m
[0;34m[0m    [0morder[0m [0;34m=[0m [0;36m70[0m[0;34m[0m
[0;34m[0m    [0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mepoch[0m[0;34m:[0m[0mint[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mself[0m[0;34m.[0m[0m_skip_to[0m [0;34m=[0m [0mepoch[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0mbefore_epoch[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mepoch[0m [0;34m<[0m [0mself[0m[0;34m.[0m[0m_skip_to[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0;32mraise[0m

In [None]:
# NOTE(review): draft of a class-based trainable. It references several names that are not
# defined in the visible source (marked below) and is replaced by the class of the same
# name defined in the next cell.
class TrainableModel(tune.Trainable):
    def setup(self, config):
        # Assuming create_lrn and dls are accessible here or passed in config
        self.create_lrn = ray.get(config['create_lrn'])
        self.dls = ray.get(config['dls'])
        self.config = config

        self.lrn = self.create_lrn(self.dls, config)

        # learning rate defaults to 3e-3, weight decay is only set when given
        self.lrn.lr = config['lr'] if 'lr' in config else 3e-3
        if 'wd' in config: self.lrn.wd = config['wd']
        # NOTE(review): `_setup_callbacks` is not defined in this class - confirm where it should come from
        self._setup_callbacks()

        # report to Tune with the default reporter unless the config provides a callback class
        if 'reporter' not in self.config:
            self.lrn.add_cb(CBRayReporter())
        else:
            self.lrn.add_cb(ray.get(self.config['reporter'])())

        if self.lrn.opt is None: self.lrn.create_opt()
        self.lrn.opt.set_hyper('lr', self.lrn.lr)
        lr = np.array([h['lr'] for h in self.lrn.opt.hypers])
        pct_start = config['pct_start'] if 'pct_start' in config else 0.3
        self.n_epoch = config['n_epoch'] if 'n_epoch' in config else 10
        # NOTE(review): `div_final` is not defined here and `lr_scheds` is not used afterwards
        lr_scheds = {'lr': combined_cos(pct_start, lr, lr, lr/div_final)}
        self.steps=0

    def step(self):

        # NOTE(review): `self.fit`, `scheds`, `cbs` and `wd` are not defined in this class/cell
        self.fit(self.n_epoch, cbs=TrainSpecificEpoch(self.steps)+ParamScheduler(scheds)+L(cbs), wd=wd)
        self.steps += 1

        
        # last recorded values: train loss, valid loss, then one value per metric
        scores = self.lrn.recorder.values[-1]
        metrics = {
            'train_loss': scores[0],
            'valid_loss': scores[1]
        }        
        # NOTE(review): `self.learn` is never assigned, the learner is stored as `self.lrn`
        for metric,value in zip(self.learn.metrics,scores[2:]):
            m_name = metric.name if hasattr(metric,'name') else str(metric)
            metrics[m_name] = value
        return metrics

    def save_checkpoint(self, checkpoint_dir):
        # NOTE(review): uses `temp_checkpoint_dir` and `self.learn`, neither of which exists here
        file = os.path.join(temp_checkpoint_dir,'model.pth')
        save_model(file, self.learn.model,opt=None) 

    def load_checkpoint(self, checkpoint_path):
        # NOTE(review): uses `checkpoint_dir` although the parameter is `checkpoint_path`,
        # and concatenates the file name without a path separator
        self.lrn.model.load_state_dict(torch.load(checkpoint_dir + 'model.pth'))


In [None]:
class TrainableModel(tune.Trainable):
    """Experimental class-based trainable that drives a fastai learner from Tune's `step` loop.

    `config` must contain the object refs `create_lrn` and `dls`; `n_epoch`,
    `pct_start` and `lr` are optional (defaults 100, 0.5 and 3e-3).
    """
    def setup(self, config):
        "Fetch factory and dataloaders from the object store, collect the fit arguments and build the learner."
        # Assuming create_lrn and dls are accessible here or passed in config
        self.create_lrn = ray.get(config['create_lrn'])
        self.dls = ray.get(config['dls'])
        self.config = config
        self.lrn_kwargs = {'n_epoch': 100, 'pct_start': 0.5}

        for attr in ['n_epoch', 'pct_start']:
            if attr in config:
                self.lrn_kwargs[attr] = config[attr]

        self.lrn = self.create_lrn(self.dls, config)
        self.lrn.lr = config['lr'] if 'lr' in config else 3e-3


    def step(self):
        """Run one Tune iteration and return the result dict.

        NOTE(review): work in progress - every call runs a `fit_flat_cos` over `n_epoch`
        epochs with a `TrainSpecificEpoch` callback, and the reported losses are the
        placeholder value 1 instead of the recorded scores. The trial is marked as done
        once `self.iteration` reaches `n_epoch - 1`.
        """
        print(self.iteration)
        with self.lrn.no_bar(): 
            self.lrn.fit_flat_cos(self.lrn_kwargs['n_epoch'],cbs=TrainSpecificEpoch(self.iteration))

        metrics = {
            'train_loss': 1,#scores[0],
            'valid_loss': 1,#scores[1],
             tune.result.DONE: self.iteration >= self.lrn_kwargs['n_epoch']-1
        }  
        return metrics

    def save_checkpoint(self, checkpoint_dir):
        "Write the model weights (no optimizer state) to `model.pth` inside `checkpoint_dir`."
        # use the directory handed in by Tune and the learner stored in `self.lrn`;
        # the previous names `temp_checkpoint_dir` and `self.learn` do not exist here
        file = os.path.join(checkpoint_dir,'model.pth')
        save_model(file, self.lrn.model,opt=None) 

    def load_checkpoint(self, checkpoint_path):
        "Load the model weights from `model.pth` inside the checkpoint directory `checkpoint_path`."
        # `checkpoint_path` is the parameter that is actually passed in; join the file
        # name properly instead of concatenating without a separator
        self.lrn.model.load_state_dict(torch.load(os.path.join(checkpoint_path, 'model.pth')))


In [None]:
# Rebuilds the same dataloaders as the setup cell at the top of the notebook
# (relies on `tfm_src` and `hdf_files` defined there).
dls = DataBlock(blocks=(SequenceBlock.from_hdf(['u'],TensorSequencesInput),
                    SequenceBlock.from_hdf(['y'],TensorSequencesOutput)),
            get_items=tfm_src,
            splitter=ApplyToDict(FuncSplitter(lambda o: 'valid' in str(o)))).dataloaders(hdf_files)

In [None]:
# Experiment: call `fit_flat_cos` once per epoch index on the same learner, passing a
# `TrainSpecificEpoch(i)` callback each time, to mimic step-wise training.
n_epoch = 6
lrn = RNNLearner(dls,metrics=[fun_rmse,mse])
for i in range(n_epoch):
    
    # every call asks for the full `n_epoch` schedule; the callback receives the epoch index
    fit_kwargs = {**{'n_epoch':n_epoch},**{'cbs':TrainSpecificEpoch(i)}}
    lrn.fit_flat_cos(**fit_kwargs)

epoch,train_loss,valid_loss,fun_rmse,mse,time
0,0.057185,0.06222,0.24931,0.06222,00:01


epoch,train_loss,valid_loss,fun_rmse,mse,time
0,00:00,,,,
1,0.056086,0.061215,0.247315,0.061215,00:00


epoch,train_loss,valid_loss,fun_rmse,mse,time
0,00:00,,,,
1,00:00,,,,
2,0.055413,0.060589,0.246052,0.060589,00:00


epoch,train_loss,valid_loss,fun_rmse,mse,time
0,00:00,,,,
1,00:00,,,,
2,00:00,,,,


Exception ignored in: <function _releaseLock>
Traceback (most recent call last):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
Exception ignored in: <function _releaseLock>
Traceback (most recent call last):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 2177850, 2177923, 2177991, 2178058, 2178125, 2178192) exited unexpectedly

Exception ignored in: <function Socket.__del__>
Traceback (most recent call last):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/site-packages/zmq/sugar/socket.py", line 178, in __del__
    def __del__(self):
KeyboardInterrupt: 
Exception ignored in: <function Socket.__del__>
Traceback (most recent call last):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/site-packages/zmq/sugar/socket.py", line 178, in __del__
    def __del__(self):
KeyboardInterrupt: 
Exception ignored in sys.unraisablehook: <built-in function unraisablehook>
Traceback (most recent call last):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/site-packages/ipykernel/iostream.py", line 526, in flush
    if not evt.wait(self.flush_timeout):
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/threading.py", line 607, in wait
    signaled = self._cond.wait(timeout)
  File "/home/pheenix/miniconda3/envs/env_fastai/lib/python3.10/threading.py", line 324, in wait

In [None]:
lrn.recorder.lrs

[0.0007500026081097779,
 0.0006710134397661091,
 0.0005868282615718807,
 0.000500005,
 0.0004131817384281193,
 0.0003289965602338909,
 0.0002500073918902221,
 0.00017861428158983696,
 0.00011698671565211225,
 6.699669065202683e-05,
 3.0163416534082117e-05,
 7.606054758419649e-06]

In [None]:
# Reference run: a fresh learner trained in a single `fit_flat_cos` call
# (`n_epoch` comes from the per-epoch experiment cell above).
lrn = RNNLearner(dls,metrics=[fun_rmse,mse])
lrn.fit_flat_cos(n_epoch)

epoch,train_loss,valid_loss,fun_rmse,mse,time
0,0.067118,0.07046,0.265249,0.07046,00:00
1,0.063514,0.063554,0.251939,0.063554,00:00
2,0.060287,0.060276,0.245403,0.060276,00:00
3,0.058461,0.060354,0.245595,0.060354,00:00
4,0.057388,0.059777,0.24442,0.059777,00:01
5,0.056491,0.059315,0.243469,0.059315,00:00


In [None]:
lrn.recorder.values[-1]

(#4) [0.039791040122509,0.042779725044965744,0.20677869021892548,0.042779725044965744]

The mutation config dictionary consists of functions that each sample from a distribution. To obtain a dictionary that holds one concrete realisation per hyperparameter, we need the function `sample_config`.

In [None]:
#| export
def sample_config(config):
    "Return a copy of `config` in which every sampler function is replaced by one value drawn from it"
    sampled = config.copy()
    # call the samplers in key order and overwrite the copied entries with the drawn values
    sampled.update({name: sampler() for name, sampler in config.items()})
    return sampled

In [None]:
#| export
class CBRayReporter(Callback):
    "`Callback` that sends the recorded losses, the metrics and a model checkpoint to ray tune after every epoch"

    order=70 #has to be >50, so that the recorder callback has already stored the epoch values

    def after_epoch(self):
        "Report the values of the finished epoch together with a checkpoint of the current weights."
        last_values = self.learn.recorder.values[-1]
        metrics = {
            'train_loss': last_values[0],
            'valid_loss': last_values[1]
        }
        # the remaining values belong to the learner's metrics, in the same order;
        # a metric without a `name` attribute is reported under its string representation
        metrics.update({(m.name if hasattr(m,'name') else str(m)): value
                        for m,value in zip(self.learn.metrics,last_values[2:])})

        # the checkpoint has to exist on disk at the moment of reporting, which is why
        # saving is done here and not in a separate callback
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            save_model(os.path.join(temp_checkpoint_dir,'model.pth'), self.learn.model,opt=None)
            checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
            ray.train.report(metrics, checkpoint=checkpoint)

In [None]:
#| export
class HPOptimizer():
    """Runs hyperparameter searches with Ray Tune for learners built by `create_lrn(dls, config)`.

    `create_lrn` and `dls` are placed in the Ray object store and handed to the trials
    through the config keys `create_lrn` and `dls`. The result of the last search is
    kept in `analysis`.
    """
    def __init__(self,create_lrn,dls):
        self.create_lrn = create_lrn
        self.dls = dls
        self.analysis = None
        # set by `optimize_pbt`, needed by `best_model` to rebuild a learner
        self.mut_conf = None
    
    @delegates(ray.init)
    def start_ray(self,**kwargs):
        "Restart Ray: shut down a running instance and call `ray.init(**kwargs)`."
        ray.shutdown()
        ray.init(**kwargs)
        
    def stop_ray(self):
        "Shut Ray down."
        ray.shutdown()

    @delegates(tune.run, keep=True)
    def optimize(self,config,optimize_func=learner_optimize,resources_per_trial={"gpu": 1.0},verbose=1,**kwargs):
        """Run `tune.run` with `optimize_func` on the search space `config`.

        `config` itself is left untouched: the object refs are added to a copy.
        `fit_method` defaults to `Learner.fit_flat_cos`, and only the last
        checkpoint is kept unless `keep_checkpoints_num` is given.
        Returns the analysis object returned by `tune.run`.
        """
        # Work on a copy: writing object refs into the caller's dict would leave refs of
        # a previous Ray session behind (e.g. a stale 'fit_method' after `start_ray`).
        config = dict(config)
        config['create_lrn'] = ray.put(self.create_lrn)
        #dls are large objects, letting ray handle the copying process makes it much faster
        config['dls'] = ray.put(self.dls) 
        if 'fit_method' not in config: config['fit_method'] = ray.put(Learner.fit_flat_cos)

        kwargs.setdefault('keep_checkpoints_num', 1)#keep only the last checkpoint

        self.analysis = tune.run(
            optimize_func,
            config=config,
            resources_per_trial=resources_per_trial,
            verbose=verbose,
            **kwargs)
        return self.analysis
        
    @delegates(tune.run, keep=True)
    def optimize_pbt(self,opt_name,num_samples,config,mut_conf,perturbation_interval=2,
                 stop={"training_iteration": 40 },
                 resources_per_trial={"gpu": 1 },
                 resample_probability=0.25,
                 quantile_fraction=0.25,
                 **kwargs):
        """Run population based training with `LearnerTrainable`, minimizing `mean_loss`.

        `config` is the initial search space, `mut_conf` maps hyperparameter names to the
        distributions used for resampling. `config` itself is left untouched.
        Returns the analysis object returned by `tune.run`.
        """
        self.mut_conf = mut_conf
        
        # copy for the same reason as in `optimize`
        config = dict(config)
        config['create_lrn'] = ray.put(self.create_lrn)
        #dls are large objects, letting ray handle the copying process makes it much faster
        config['dls'] = ray.put(self.dls) 
        
        kwargs.setdefault('keep_checkpoints_num', 2)
        
        scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="mean_loss",
        mode="min",
        perturbation_interval=perturbation_interval,
        resample_probability=resample_probability,
        quantile_fraction=quantile_fraction,
        hyperparam_mutations=mut_conf)
        
        self.analysis = tune.run(
            LearnerTrainable,
            name=opt_name,
            scheduler=scheduler,
            reuse_actors=True,
            verbose=1,
            stop=stop,
            checkpoint_score_attr="mean_loss",
            num_samples=num_samples,
            resources_per_trial=resources_per_trial,
            config=config,
            **kwargs)
        return self.analysis
    
    def best_model(self):
        """Return a model that carries the weights of the trial with the lowest `mean_loss`.

        The model is created with one sample of the mutation config and the weights are
        read from `model.pth` in the checkpoint of the best trial.

        Raises:
            RuntimeError: if no search has been run, if no mutation config is available
                (only `optimize_pbt` sets it) or if no trial with a checkpoint is found.
        """
        if self.analysis is None:
            raise RuntimeError("No analysis available, run optimize_pbt() first")
        if self.mut_conf is None:
            raise RuntimeError("best_model() needs the mutation config that is set by optimize_pbt()")
        best_trial = self.analysis.get_best_trial('mean_loss',mode='min')
        if best_trial is None or best_trial.checkpoint is None:
            raise RuntimeError("No trial with a 'mean_loss' result and a checkpoint was found")
        model = self.create_lrn(self.dls,sample_config(self.mut_conf)).model
        # the checkpoint is a directory handle; the weights live in its 'model.pth' file,
        # which is the file name written by `LearnerTrainable.save_checkpoint`
        with best_trial.checkpoint.as_directory() as checkpoint_dir:
            model.load_state_dict(torch.load(os.path.join(checkpoint_dir,'model.pth')))
        return model

In [None]:
# #| export
# from ray.tune import Tuner
# class HPOptimizer():
#     def __init__(self, create_lrn, dls):
#         self.create_lrn = create_lrn
#         self.dls = dls
#         self.analysis = None
    
#     @delegates(ray.init)
#     def start_ray(self, **kwargs):
#         ray.shutdown()
#         ray.init(**kwargs)
        
#     def stop_ray(self):
#         ray.shutdown()
    
#     @delegates(Tuner.fit, keep=True)
#     def optimize(self, config, trainable=learner_optimize,, resources_per_trial={"gpu": 1.0}, **kwargs):
#         config['create_lrn'] = ray.put(self.create_lrn)
#         config['dls'] = ray.put(self.dls)
#         if 'fit_method' not in config:
#             config['fit_method'] = ray.put(Learner.fit_flat_cos)

#         tuner = Tuner(
#             tune.with_resources(trainable, resources_per_trial),
#             param_space=config,
#             run_config=train.RunConfig(
#                 checkpoint_config=train.CheckpointConfig(
#                     num_to_keep=1,
#                     checkpoint_at_end=True
#                 ),
#                 stop=
#             ),
#             **kwargs
            
#         )
#         self.analysis = tuner.fit()
#         return self.analysis
        
#    @delegates(tune.run, keep=True)
#     def optimize_pbt(self,opt_name,num_samples,config,mut_conf,perturbation_interval=2,
#                  stop={"training_iteration": 40 },
#                  resources_per_trial={"gpu": 1 },
#                  resample_probability=0.25,
#                  quantile_fraction=0.25,
#                  **kwargs):
#         self.mut_conf = mut_conf
        
#         config['create_lrn'] = ray.put(self.create_lrn)
#         #dls are large objects, letting ray handle the copying process makes it much faster
#         config['dls'] = ray.put(self.dls) 
        
#         kwargs.setdefault('keep_checkpoints_num', 2)
        

        
#         scheduler = PopulationBasedTraining(
#         time_attr="training_iteration",
#         metric="mean_loss",
#         mode="min",
#         perturbation_interval=perturbation_interval,
#         resample_probability=resample_probability,
#         quantile_fraction=quantile_fraction,
#         hyperparam_mutations=mut_conf)
        
#         self.analysis = tune.run(
#             LearnerTrainable,
#             name=opt_name,
#             scheduler=scheduler,
#             reuse_actors=True,
#             verbose=1,
#             stop=stop,
#             checkpoint_score_attr="mean_loss",
#             num_samples=num_samples,
#             resources_per_trial=resources_per_trial,
#             config=config,
#             **kwargs)
#         return self.analysis
    
#     def best_model(self):
#         if self.analysis is None:
#             raise Exception("No analysis data. Run optimize() first.")
#         best_checkpoint = self.analysis.get_best_checkpoint(
#             metric="mean_loss",
#             mode="min"
#         )
#         model = self.create_lrn(self.dls, sample_config(self.mut_conf)).model
#         model.load_state_dict(torch.load(best_checkpoint))
#         return model

### Test Population Based Training

In [None]:
def create_lrn(dls,config):
    "Learner factory for the PBT test: an `RNNLearner` on `dls` with the learning rate taken from `config`"
    # all four hyperparameters are looked up (a missing key raises a KeyError),
    # but only the learning rate is applied to the learner
    hparams = {key: config[key] for key in ('lr', 'alpha', 'beta', 'weight_p')}
    
    learner = RNNLearner(dls)
    learner.lr = hparams['lr']
    return learner

In [None]:
# Initial search space: Tune draws one value per hyperparameter for every trial.
# `tune.loguniform` expects (lower, upper); the learning rate bounds were swapped.
config={
            "lr": tune.loguniform(1e-4, 1e-2),
            "alpha": tune.loguniform(1e-5, 10),
            "beta": tune.loguniform(1e-5, 10),
            "weight_p": tune.uniform(0, 0.5)}
mut_conf = {# distribution for resampling
            "lr": log_uniform(1e-8, 1e-2),
            "alpha": log_uniform(1e-5, 10),
            "beta": log_uniform(1e-5, 10),
            "weight_p": lambda: np.random.uniform(0, 0.5)}

# Short PBT run: 4 trials, perturbation after every iteration, stop after 3 iterations.
hp_opt = HPOptimizer(create_lrn,dls)
hp_opt.start_ray()
hp_opt.optimize_pbt('pbt_test',4,config,mut_conf,perturbation_interval=1,
                 stop={"training_iteration": 3 },
                 resources_per_trial={"gpu": 0.5},
                 storage_path=str(Path.home() / 'ray_results'))#no cpu count is necessary


KeyboardInterrupt



In [None]:
#hp_opt.best_model()

### Test Grid Search

In [None]:
# dls.cpu()

<fastai.data.core.DataLoaders>

In [None]:
def create_lrn(dls,config):
    "Learner factory for the grid search test: builds its own dataloaders and an `RNNLearner` with `config['hidden_size']`"
    # the `dls` argument is ignored and replaced by dataloaders created here
    # (the optimizer of this test is constructed with `dls=None`)
    blocks = (SequenceBlock.from_hdf(['u'],TensorSequencesInput),
              SequenceBlock.from_hdf(['y'],TensorSequencesOutput))
    splitter = ApplyToDict(FuncSplitter(lambda o: 'valid' in str(o)))
    dls = DataBlock(blocks=blocks,get_items=tfm_src,splitter=splitter).dataloaders(hdf_files)
    return RNNLearner(dls,hidden_size=config['hidden_size'],metrics=[fun_rmse,mse])

In [None]:
hp_opt = HPOptimizer(create_lrn,None)

In [None]:
# Grid over four hidden sizes; `n_epoch` is a fixed value passed on to every trial.
search_space = {
    "hidden_size": tune.grid_search([10,20,50,100]),
    'n_epoch':10,
    # 'reporter':ray.put(CustomReporter)
}

In [None]:
# Grid search with the class-based `TrainableModel` instead of the default function
# trainable; every trial requests 4 GPUs.
# NOTE(review): in the recorded output below all four trials end with status ERROR.
hp_opt.optimize(optimize_func=TrainableModel,
                resources_per_trial={"gpu": 4},
                config=search_space)

0,1
Current time:,2024-04-07 14:09:11
Running for:,00:00:18.22
Memory:,21.8/251.7 GiB

Trial name,# failures,error file
TrainableModel_9a0c8_00000,1,/home/pheenix/ray_results/TrainableModel_2024-04-07_14-08-53/TrainableModel_9a0c8_00000_0_hidden_size=10_2024-04-07_14-08-53/error.txt
TrainableModel_9a0c8_00001,1,/home/pheenix/ray_results/TrainableModel_2024-04-07_14-08-53/TrainableModel_9a0c8_00001_1_hidden_size=20_2024-04-07_14-08-53/error.txt
TrainableModel_9a0c8_00002,1,/home/pheenix/ray_results/TrainableModel_2024-04-07_14-08-53/TrainableModel_9a0c8_00002_2_hidden_size=50_2024-04-07_14-08-53/error.txt
TrainableModel_9a0c8_00003,1,/home/pheenix/ray_results/TrainableModel_2024-04-07_14-08-53/TrainableModel_9a0c8_00003_3_hidden_size=100_2024-04-07_14-08-53/error.txt

Trial name,status,loc,hidden_size
TrainableModel_9a0c8_00000,ERROR,141.23.125.123:2671349,10
TrainableModel_9a0c8_00001,ERROR,141.23.125.123:2671649,20
TrainableModel_9a0c8_00002,ERROR,141.23.125.123:2672100,50
TrainableModel_9a0c8_00003,ERROR,141.23.125.123:2672425,100


[36m(TrainableModel pid=2671349)[0m Library "haste_pytorch" not found
[36m(TrainableModel pid=2671349)[0m 0


2024-04-07 14:08:57,673	ERROR tune_controller.py:1374 -- Trial task failed for trial TrainableModel_9a0c8_00000
Traceback (most recent call last):
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(IndexError): [36mray::TrainableModel.train()[39m (pid=2671349, ip=141.23.125.123, actor_id=32207edcd7f579fd9a05bb0501000000, repr=<__main__.TrainableModel object>)
  File "/home/pheenix/.local/lib/python3.10/site-packages/r

[36m(TrainableModel pid=2671349)[0m 0


2024-04-07 14:09:01,821	ERROR tune_controller.py:1374 -- Trial task failed for trial TrainableModel_9a0c8_00001
Traceback (most recent call last):
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(IndexError): [36mray::TrainableModel.train()[39m (pid=2671649, ip=141.23.125.123, actor_id=9e48da88ea13ca77f833bf9201000000, repr=<__main__.TrainableModel object>)
  File "/home/pheenix/.local/lib/python3.10/site-packages/r

[36m(TrainableModel pid=2672100)[0m Library "haste_pytorch" not found[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(TrainableModel pid=2671649)[0m 0[32m [repeated 2x across cluster][0m


2024-04-07 14:09:06,935	ERROR tune_controller.py:1374 -- Trial task failed for trial TrainableModel_9a0c8_00002
Traceback (most recent call last):
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/pheenix/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(IndexError): [36mray::TrainableModel.train()[39m (pid=2672100, ip=141.23.125.123, actor_id=9fe97a1615aec6c4d1dc838b01000000, repr=<__main__.TrainableModel object>)
  File "/home/pheenix/.local/lib/python3.10/site-packages/r

[36m(TrainableModel pid=2672425)[0m Library "haste_pytorch" not found
[36m(TrainableModel pid=2672425)[0m 0[32m [repeated 4x across cluster][0m


TuneError: ('Trials did not complete', [TrainableModel_9a0c8_00000, TrainableModel_9a0c8_00001, TrainableModel_9a0c8_00002, TrainableModel_9a0c8_00003])

In [None]:
hp_opt.analysis.get_best_config('mean_loss',mode='min')



### Test Random Search

In [None]:
#| include: false
import nbdev; nbdev.nbdev_export()