In [None]:
# default_exp hpopt
# default_cls_lvl 3

In [None]:
#hide
%load_ext line_profiler
%matplotlib notebook

# Hyperparameter Optimization Module
> Hyperparameter optimization with Ray Tune for PyTorch models of sequential data

In [None]:
#export
from seqdata.core import *
from seqdata.model import *
from seqdata.learner import *
from fastai2.basics import *
from fastai2.callback.schedule import *
from fastai2.callback.rnn import *
from fastai2.callback.tracker import *

import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import *
from ray.tune.trial import ExportFormat

In [None]:
# Location of the Wiener-Hammerstein dataset on the development machine
f_paths = '/mnt/data/Systemidentification/WienerHammerstein/'
# Keep every hdf file whose path does not contain 'test'
hdf_files = L([f for f in get_hdf_files(f_paths) if 'test' not in str(f)])
# Cut the files into windows of 400 samples with a step of 100, based on column 'u'
tfm_src = CreateDict([DfHDFCreateWindows(win_sz=400,stp_sz=100,clm='u')])
# Input sequence 'u', output sequence 'y'; items with 'valid' in their path are split off
# (presumably as the validation set - confirm against FuncSplitter)
dls = DataBlock(blocks=(SequenceBlock.from_hdf(['u'],TensorSequencesInput),
                        SequenceBlock.from_hdf(['y'],TensorSequencesOutput)),
                get_items=tfm_src,
                splitter=ApplyToDict(FuncSplitter(lambda o: 'valid' in str(o)))).dataloaders(hdf_files)

## optimizer core

First we need a log-uniform distribution for variables whose values span several orders of magnitude.

In [None]:
#export
def log_uniform(min_bound, max_bound, base=10):
    '''Return a sampler that draws values uniformly in an exponential range.

    The exponent is drawn uniformly between `log_base(min_bound)` and
    `log_base(max_bound)`, so every order of magnitude is equally likely.

    min_bound, max_bound: positive limits of the sampled values
    base: base of the logarithm, has to be positive and different from 1
    returns: a function without arguments that yields one sample per call
    raises: ValueError if a bound is not positive or the base is unusable
    '''
    # without these checks np.log silently produces nan/-inf and the sampler returns garbage
    if not (min_bound > 0 and max_bound > 0):
        raise ValueError(f'log_uniform needs positive bounds, got {min_bound} and {max_bound}')
    if not base > 0 or base == 1:
        raise ValueError(f'log_uniform needs a positive base other than 1, got {base}')

    # change of base: log_base(x) = ln(x) / ln(base)
    logmin = np.log(min_bound) / np.log(base)
    logmax = np.log(max_bound) / np.log(base)
    def _sample():
        return base**(np.random.uniform(logmin, logmax))
    return _sample

In [None]:
# draw five samples from the sampler returned by `log_uniform` for the range 1e-8 to 1e-2
[log_uniform(1e-8, 1e-2)() for _ in range(5)]

[0.0005906777065587701,
 1.3568088162713915e-07,
 0.0063611200947972365,
 6.182820530108811e-07,
 7.353573100037116e-06]

In [None]:
#export
class LearnerTrainable(tune.Trainable):
    "Class based ray tune `Trainable` that trains a learner for one epoch per training step"

    def _setup(self, config):
        "Keep the learner factory, fetch the dataloaders with `ray.get` and build the learner"
        self.create_lrn = config['create_lrn']
        # config['dls'] is handed to `ray.get`, so it holds a ray object reference, not the data
        self.dls = ray.get(config['dls'])
        self.lrn = self.create_lrn(self.dls,config)

    def _train(self):
        "Fit one epoch without progress bar and return the last recorder row as metrics dict"
        with self.lrn.no_bar():
            self.lrn.fit(1)
        # the unpacking requires exactly three recorded values per epoch
        train_loss,valid_loss,rmse = self.lrn.recorder.values[-1]
        return dict(train_loss=train_loss, valid_loss=valid_loss, mean_loss=rmse)

    def _save(self, checkpoint_dir):
        "Store the model weights as `model.pth` in `checkpoint_dir` and return the file path"
        target = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.lrn.model.state_dict(), target)
        return target

    def _restore(self, checkpoint_path):
        "Load the model weights stored at `checkpoint_path` into the current model"
        stored_weights = torch.load(checkpoint_path)
        self.lrn.model.load_state_dict(stored_weights)

    def _export_model(self, export_formats, export_dir):
        "Save the model weights to `export_dir`, only `[ExportFormat.MODEL]` is accepted"
        if export_formats != [ExportFormat.MODEL]:
            raise ValueError("unexpected formats: " + str(export_formats))
        target = os.path.join(export_dir, "exported_model")
        torch.save(self.lrn.model.state_dict(), target)
        return {ExportFormat.MODEL: target}

    def reset_config(self, new_config):
        "Rebuild the learner with `new_config` and carry over the weights of the current model"
        # the learner is recreated with every perturbation so that the new hyperparameters apply,
        # therefore the weights have to be taken from the old learner before it is replaced
        trained_weights = self.lrn.model.state_dict()
        self.lrn = self.create_lrn(self.dls,new_config)

        # only the trainable parameters are restored, hyperparameters of the new model
        # like dropout are kept
        self.lrn.model.load_state_dict(trained_weights)

        self.config = new_config
        return True

In [None]:
#export
def learner_optimize(config):
    "Function trainable for ray tune: build a learner from `config` and run its fit method"
    create_lrn = config['create_lrn']
    # config['dls'] is handed to `ray.get`, so it holds a ray object reference, not the data
    dls = ray.get(config['dls'])

    #Scheduling Parameters for training the Model, entries of the config take precedence
    fit_defaults = {'n_epoch':100,'pct_start':0.5}
    lrn_kwargs = {name: config.get(name, default) for name,default in fit_defaults.items()}

    lrn = create_lrn(dls,config)
    lrn.lr = config.get('lr', 3e-3)
    # reports the recorder values to ray tune after every epoch
    lrn.add_cb(CBRayReporter())
    with lrn.no_bar():
        fit_method = config['fit_method']
        fit_method(lrn,**lrn_kwargs)

The mutation config dictionary consists of functions that sample from a distribution. To retrieve a dictionary with one realisation of every entry, we need the function `sample_config`.

In [None]:
#export
def sample_config(config):
    '''Return a new dict with one realisation for every entry of a mutation config.

    config: dict that maps hyperparameter names to their specification
        - callable: called without arguments, the result is used
        - list: one element is picked at random (PBT mutations may be given as lists)
        - anything else: used unchanged as a fixed value
    returns: dict with the same keys and the realised values, `config` is left untouched
    '''
    import random

    def _realise(spec):
        if callable(spec): return spec()
        if isinstance(spec, list): return random.choice(spec)
        return spec

    return {name: _realise(spec) for name,spec in config.items()}

In [None]:
#export
class CBRayReporter(Callback):
    "`Callback` that hands the latest recorder values to the ray tune logger after every epoch"

    def after_epoch(self):
        "Log the last recorder row as `train_loss`, `valid_loss` and `mean_loss`"
        # the unpacking requires exactly three recorded values per epoch
        train_loss,valid_loss,rmse = self.learn.recorder.values[-1]
        metrics = dict(train_loss=train_loss, valid_loss=valid_loss, mean_loss=rmse)
        tune.track.log(**metrics)


In [None]:
#export
class HPOptimizer():
    "Runs ray tune hyperparameter searches for learners that are built by `create_lrn(dls,config)`"
    def __init__(self,create_lrn,dls):
        self.create_lrn = create_lrn
        self.dls = dls
        # result of the last `optimize`/`optimize_pbt` call
        self.analysis = None
        # mutation config of the last `optimize_pbt` call, needed by `best_model`
        self.mut_conf = None

    @delegates(ray.init)
    def start_ray(self,**kwargs):
        "Shut down a possibly running ray instance and call `ray.init` with `kwargs`"
        ray.shutdown()
        ray.init(**kwargs)

    def stop_ray(self):
        "Shut down ray"
        ray.shutdown()

    def _trial_config(self,config):
        "Copy of `config` extended by the learner factory and a ray reference to the dataloaders"
        # work on a copy so that the dict of the caller is not modified
        trial_config = dict(config)
        trial_config['create_lrn'] = self.create_lrn
        #dls are large objects, letting ray handle the copying process makes it much faster
        trial_config['dls'] = ray.put(self.dls)
        return trial_config

    @delegates(tune.run, keep=True)
    def optimize(self,config,resources_per_trial={"gpu": 1.0},verbose=1,**kwargs):
        '''Run `tune.run` with the function trainable `learner_optimize`.

        config: search space, may contain `fit_method`, `n_epoch`, `pct_start` and `lr`
                in addition to the entries read by `create_lrn`
        resources_per_trial, verbose, kwargs: forwarded to `tune.run`
        returns: the analysis object of `tune.run`, also stored in `self.analysis`
        '''
        config = self._trial_config(config)
        config.setdefault('fit_method', Learner.fit_flat_cos)

        self.analysis = tune.run(
            learner_optimize,
            config=config,
            resources_per_trial=resources_per_trial,
            verbose=verbose,
            **kwargs)
        return self.analysis

    @delegates(tune.run, keep=True)
    def optimize_pbt(self,opt_name,num_samples,config,mut_conf,freq=2,
                 stop={"training_iteration": 40 },
                 resources_per_trial={"gpu": 1 },
                 resample_probability=0.25,
                 quantile_fraction=0.25,
                 **kwargs):
        '''Run population based training with `LearnerTrainable`, minimizing `mean_loss`.

        opt_name: name of the experiment
        num_samples: number of trials in the population
        config: initial hyperparameter distributions
        mut_conf: `hyperparam_mutations` of the scheduler, also used by `best_model`
        freq: perturbation interval and default checkpoint frequency in training iterations
        stop, resources_per_trial: forwarded to `tune.run`
        resample_probability, quantile_fraction: forwarded to `PopulationBasedTraining`
        kwargs: forwarded to `tune.run`, may override `reuse_actors`, `verbose`,
                `checkpoint_score_attr`, `checkpoint_freq` and `keep_checkpoints_num`
        returns: the analysis object of `tune.run`, also stored in `self.analysis`
        '''
        self.mut_conf = mut_conf
        config = self._trial_config(config)

        scheduler = PopulationBasedTraining(
            time_attr="training_iteration",
            metric="mean_loss",
            mode="min",
            perturbation_interval=freq,
            resample_probability=resample_probability,
            quantile_fraction=quantile_fraction,
            hyperparam_mutations=mut_conf)

        # defaults that may be overridden by the caller, passing them explicitly next to
        # **kwargs would raise a TypeError for duplicated keyword arguments
        run_kwargs = dict(
            reuse_actors=True,
            verbose=1,
            # the 'min-' prefix ranks checkpoints by lowest loss, without it the
            # checkpoints with the highest loss would be kept
            checkpoint_score_attr="min-mean_loss",
            checkpoint_freq=freq,
            keep_checkpoints_num=4)
        run_kwargs.update(kwargs)

        self.analysis = tune.run(
            LearnerTrainable,
            name=opt_name,
            scheduler=scheduler,
            stop=stop,
            num_samples=num_samples,
            resources_per_trial=resources_per_trial,
            config=config,
            **run_kwargs)
        return self.analysis

    def best_model(self):
        '''Return a model with the checkpointed weights of the trial with the lowest `mean_loss`.

        Only available after `optimize_pbt`, because it relies on the mutation config and
        on the checkpoints written by `LearnerTrainable`.
        raises: RuntimeError if no suitable optimization result is available
        '''
        if self.analysis is None:
            raise RuntimeError('no analysis available, run `optimize_pbt` first')
        if self.mut_conf is None:
            raise RuntimeError('`best_model` needs the mutation config of an `optimize_pbt` run')

        # the loss has to be minimized, so the mode must be given explicitly
        best_trial = self.analysis.get_best_trial('mean_loss',mode='min')
        f_path = None if best_trial is None else best_trial.checkpoint.value
        if f_path is None:
            raise RuntimeError('the best trial has no checkpoint to load the model from')

        # the sampled hyperparameters only serve to build a model, its weights are replaced below
        model = self.create_lrn(self.dls,sample_config(self.mut_conf)).model
        model.load_state_dict(torch.load(f_path))
        return model

### Test Population Based Training

In [None]:
def create_lrn(dls,config):
    "Learner factory for the PBT test: `RNNLearner` with regularizer, weight dropout and lr from `config`"
    # read all hyperparameters up front, a missing key fails before anything is built
    lr,alpha,beta,weight_p = (config[k] for k in ('lr','alpha','beta','weight_p'))

    regularizer = TimeSeriesRegularizer(alpha,beta)
    lrn = RNNLearner(dls,cbs=[regularizer],weight_p=weight_p)
    lrn.lr = lr
    return lrn

In [None]:
# initial hyperparameter distributions of the trials, bounds are given as (lower, upper)
config={
            "lr": tune.loguniform(1e-4, 1e-2),
            "alpha": tune.loguniform(1e-5, 10),
            "beta": tune.loguniform(1e-5, 10),
            "weight_p": tune.uniform(0, 0.5)}
mut_conf = {# distribution for resampling
            "lr": log_uniform(1e-8, 1e-2),
            "alpha": log_uniform(1e-5, 10),
            "beta": log_uniform(1e-5, 10),
            "weight_p": lambda: np.random.uniform(0, 0.5)}

hp_opt = HPOptimizer(create_lrn,dls)
hp_opt.start_ray()
# short smoke test: 4 trials, perturbation after every iteration, stop after 3 iterations
hp_opt.optimize_pbt('pbt_test',4,config,mut_conf,freq=1,
                 stop={"training_iteration": 3 },
                 resources_per_trial={"gpu": 0.5},
                 local_dir='/mnt/data/ray_results')#no cpu count is necessary

Trial name,status,loc,beta,weight_p,alpha,lr,loss,total time (s),iter
LearnerTrainable_cee4ebbc,TERMINATED,,0.0415577,0.0070431,4.89958e-05,0.00651406,0.241739,5.57046,3
LearnerTrainable_cee4ebbd,TERMINATED,,1.40531,0.465232,3.95704e-05,0.00119041,0.245669,5.65941,3
LearnerTrainable_cee4ebbe,TERMINATED,,0.228819,0.325653,0.00022368,0.0028742,0.24258,5.79186,3
LearnerTrainable_cee4ebbf,TERMINATED,,0.0346314,0.00880387,4.08298e-05,0.00814257,0.232083,5.72665,3


2020-03-24 19:04:00,980	INFO tune.py:352 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f0f630a75d0>

In [None]:
# build a model and load the checkpointed weights of the trial selected by `HPOptimizer.best_model`
hp_opt.best_model()

SimpleRNN(
  (rnn): RNN(
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): GRU(1, 100, batch_first=True)
      )
    )
    (res_gate0): Conv1d(1, 100, kernel_size=(1,), stride=(1,))
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
    )
    (norm_layers): ModuleList(
      (0): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    )
  )
  (final): SeqLinear(
    (lin): Sequential(
      (0): Sequential(
        (0): Conv1d(100, 100, kernel_size=(1,), stride=(1,))
        (1): Mish()
      )
      (1): Conv1d(100, 1, kernel_size=(1,), stride=(1,))
    )
  )
)

### Test Grid Search

In [None]:
def create_lrn(dls,config):
    "Learner factory for the grid search test: `RNNLearner` whose `hidden_size` comes from `config`"
    return RNNLearner(dls,hidden_size=config['hidden_size'])

In [None]:
# new optimizer that uses the `create_lrn` of the grid search test defined above
hp_opt = HPOptimizer(create_lrn,dls)

In [None]:
# `start_ray` shuts down a running ray instance before it initializes a new one
hp_opt.start_ray(local_mode=False)

2020-03-24 19:04:02,666	INFO resource_spec.py:212 -- Starting Ray with 83.89 GiB memory available for workers and up to 18.63 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-24 19:04:03,287	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m


In [None]:
# search space of the grid search over `hidden_size`
search_space = {
    "hidden_size": tune.grid_search([10,20,50,100]),
    # called by `learner_optimize` as fit_method(lrn, n_epoch=..., pct_start=...)
    "fit_method": Learner.fit_flat_cos,
    # overrides the default of 100 epochs in `learner_optimize`
    'n_epoch':2
}

In [None]:
# run the search with `learner_optimize`, every trial requests half a GPU
hp_opt.optimize(resources_per_trial={"gpu": 0.5},
                config=search_space)

Trial name,status,loc,hidden_size,loss,total time (s),iter
learner_optimize_d8c6f738,TERMINATED,,10,0.383846,7.90006,1
learner_optimize_d8c6f739,TERMINATED,,20,0.282948,8.37529,1
learner_optimize_d8c6f73a,TERMINATED,,50,0.246655,7.4327,1
learner_optimize_d8c6f73b,TERMINATED,,100,0.247453,8.33767,1


2020-03-24 19:04:15,085	INFO tune.py:352 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


[2m[36m(pid=3350)[0m (#5) [1,0.061710622161626816,0.06129293888807297,0.24745303392410278,'00:01']


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f0f552f0450>

In [None]:
# configuration of the trial that is best with respect to `mean_loss` in mode 'min'
hp_opt.analysis.get_best_config('mean_loss',mode='min')

{'hidden_size': 50,
 'fit_method': <function fastai2.callback.schedule.Learner.fit_flat_cos(self: fastai2.learner.Learner, n_epoch, lr=None, div_final=100000.0, pct_start=0.75, wd=None, cbs=None, reset_opt=False)>,
 'n_epoch': 2,
 'create_lrn': <function __main__.create_lrn(dls, config)>,
 'dls': ObjectID(ffffffffffffffffffffffff0100008002000000)}

### Test Random Search

In [None]:
#hide
# convert the notebooks of the project with nbdev, see the 'Converted ...' log below
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_model.ipynb.
Converted 02_learner.ipynb.
Converted 03_tbptt_dl.ipynb.
Converted 11_dualrnn.ipynb.
Converted 12_TensorQuaternions.ipynb.
Converted 13_HPOpt.ipynb.
Converted index.ipynb.
