# Hyperparameter optimization for Deep RL using Optuna
Diego Minguzzi ([profile](https://www.linkedin.com/in/diego-minguzzi-2775b78/))

Online course [Deep Reinforcement Learning](https://huggingface.co/learn/deep-rl-course), by Hugging Face: exercise of the [Bonus Unit 2](https://huggingface.co/learn/deep-rl-course/unitbonus2/introduction).
<br>
Adapted from Antonin Raffin's ICRA 2022 presentations:

*  [The tutorial on YouTube](https://youtu.be/ihP7E76KGOI)
*  [The related Colab Notebook](https://colab.research.google.com/github/araffin/tools-for-robotic-rl-icra2022/blob/main/notebooks/optuna_lab.ipynb)


See [Optuna](https://optuna.org/) and the [documentation](https://optuna.readthedocs.io/en/stable/index.html)

The Gymnasium `LunarLander-v2` environment is used.<br>

## Lunar Lander environment

## Imports and package installation

In [None]:
# Mount Google Drive at /content/drive in the Colab runtime.
# NOTE(review): the output files named later in this notebook use relative
# paths, so they are presumably not written under this mount point unless the
# working directory is changed — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!apt install swig cmake
!pip install gymnasium[box2d]
!pip install huggingface_sb3
!pip install stable-baselines3
!pip install sb3-contrib
!pip install optuna
!pip install optuna-dashboard
!pip install jupyterlab jupyterlab-optuna

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
swig is already the newest version (4.0.2-1ubuntu1).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
import gymnasium as gym
import collections
import json
import math
import numpy as np
import os.path
import pickle
import torch
from typing import Any, Dict
import torch.nn as nn

In [None]:
from stable_baselines3 import PPO, A2C, SAC, TD3, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback, CallbackList

In [None]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_intermediate_values

  and should_run_async(code)


## Set all parameters

In [None]:
import logging as log

# ---- Logging -------------------------------------------------------------
LOG_FORMAT_STRING = "%(asctime)s [%(levelname)-7s] %(message)s"
LOG_LEVEL = log.DEBUG
LOG_ROOT_LOGGER = ''

# ---- Optuna study budget -------------------------------------------------
N_TRIALS = 15            # Maximum number of trials (alternative value: 50)
N_JOBS = 1               # Number of jobs to run in parallel
N_STARTUP_TRIALS = 3     # Stop random sampling after N_STARTUP_TRIALS
TIMEOUT = 60 * 60 * 5    # Expressed in seconds

# ---- Training and evaluation budget of a single trial ---------------------
N_EVALUATIONS = 6
N_TIMESTEPS = 300_000                       # Training budget
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS    # Steps between two evaluations
N_EVAL_ENVS = 4
N_EVAL_EPISODES = 8
N_WARMUP_STEPS = N_EVALUATIONS // 3         # Do not prune before the warmup steps.

# ---- Learning-rate decay --------------------------------------------------
NUM_STEPS_LR_UPDATE = 5000   # After this many steps the learning rate is updated
LR_UPDATE_FACTOR = 0.95      # Learning rate update factor

# ---- Model, environment and output files ----------------------------------
POLICY = 'MlpPolicy'
IS_DETERMINISTIC = True
IS_VERBOSE = 0
SEED = 95
ENV_ID = 'LunarLander-v2'
SAVED_PARAMS_FILE = 'lunar_lander_params.json'
SAVED_SNAPSHOT_FILE = 'opt_lunar_lander_study.pkl'
BEST_MODEL_FILE = f'{ENV_ID}.model'
RESULT_OBJECTIVE = 375.0

# One complete set of PPO hyperparameters.
HyperParams = collections.namedtuple(
    'HyperParams',
    [
        'gamma',
        'learning_rate',
        'max_grad_norm',
        'n_epochs',
        'n_steps',
        'ent_coef',
        'batch_size',
    ],
)

# Index of the first trial that uses an entry of guess_hyperparams instead of
# sampled values.
GUESS_HYPERPARAMS_AFTER_N_TRIAL = 10

# Hand-picked hyperparameter sets, consumed one per trial starting at trial
# GUESS_HYPERPARAMS_AFTER_N_TRIAL.
guess_hyperparams = [
    HyperParams(
        gamma=1.0 - 0.0016617315216418454,
        learning_rate=math.pow(10., -2.682161433845602),
        max_grad_norm=0.3733377073391161,
        n_epochs=5,
        n_steps=2 ** 12,
        ent_coef=0.01,
        batch_size=64,
    ),
    HyperParams(
        gamma=1.0 - 0.00010534109595607163,
        learning_rate=math.pow(10., -2.7388359550396095),
        max_grad_norm=0.30386280137092286,
        n_epochs=5,
        n_steps=2 ** 12,
        ent_coef=0.01,
        batch_size=64,
    ),
]

In [None]:
# Configure the root logger. The level and the formatter of its first handler
# are also set explicitly, because basicConfig() leaves a root logger that
# already has handlers untouched.
log.basicConfig(format=LOG_FORMAT_STRING, level=LOG_LEVEL)
log.getLogger().setLevel(LOG_LEVEL)
roothandler = log.getLogger().handlers[0]
roothandler.setFormatter(log.Formatter(fmt=LOG_FORMAT_STRING))

In [None]:
# Keyword arguments common to every PPO model built in this notebook; the
# sampled hyperparameters are merged into a copy of this dict.
DEFAULT_HYPERPARAMS = dict(policy=POLICY, env=ENV_ID)

In [None]:
# Seed NumPy's global random generator.
# NOTE(review): PPO(...) and TPESampler(...) are created elsewhere in this
# notebook without a seed argument, so this call alone presumably does not make
# a run reproducible — confirm.
np.random.seed(SEED)

In [None]:
class DecreaseLearningRateCallback(BaseCallback):
    """Callback that scales ``model.learning_rate`` down at a fixed step interval.

    :param decay_interval: Number of timesteps between two consecutive decays.
    :param decay_factor: Multiplier applied to the learning rate at each decay.
    """

    def __init__(self, decay_interval, decay_factor):
        super().__init__()
        self.decay_factor = decay_factor
        self.decay_interval = decay_interval

    def _on_step(self) -> bool:
        """Apply the decay on multiples of ``decay_interval``.

        :return: Always ``True``: this callback never stops the training.
        """
        # Nothing to do between two decay points.
        if self.num_timesteps % self.decay_interval != 0:
            return True

        # NOTE(review): only the ``learning_rate`` attribute of the model is
        # rewritten here; whether the optimizer actually picks up the new value
        # depends on how the algorithm derives its schedule — confirm.
        current_lr = self.model.learning_rate
        new_lr = current_lr * self.decay_factor
        self.model.learning_rate = new_lr

        # Only one decay out of ten is logged, to keep the output short.
        if self.num_timesteps % (10 * self.decay_interval) == 0:
            log.debug(f'Num steps:{self.num_timesteps} current_lr:{current_lr} new_lr:{new_lr}')

        return True

### Define the sampling function



In [None]:
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Return the PPO hyperparameters to use for the given trial.

    Trials numbered from ``GUESS_HYPERPARAMS_AFTER_N_TRIAL`` onwards take, one
    per trial, the entries of ``guess_hyperparams``. Every other trial samples:

    * ``gamma`` as ``1 - x``, ``x`` log-uniform in [0.0001, 0.2];
    * ``learning_rate`` as ``10 ** e``, ``e`` uniform in [-4, -1];
    * ``max_grad_norm`` log-uniform in [0.3, 10];
    * ``n_epochs`` as an integer in [2, 16];
    * ``n_steps`` as ``2 ** k``, ``k`` an integer in [6, 13].

    ``ent_coef`` and ``batch_size`` are not searched: sampled trials reuse the
    values of the last entry of ``guess_hyperparams``.

    The effective values are also stored in the trial user attribute
    ``"hyperparams"``, because a guessed trial makes no ``suggest_*`` call and
    would otherwise be recorded by Optuna with empty parameters.

    :param trial: Optuna trial object
    :return: The hyperparameters for the given trial, as PPO keyword arguments.
    """
    indx_trial = trial.number
    indx_guess = indx_trial - GUESS_HYPERPARAMS_AFTER_N_TRIAL

    if 0 <= indx_guess < len(guess_hyperparams):
        guess = guess_hyperparams[indx_guess]
        log.info(f'indx_trial:{indx_trial} indx_guess:{indx_guess}: using guessed params.')
        gamma = guess.gamma
        learning_rate = guess.learning_rate
        max_grad_norm = guess.max_grad_norm
        n_epochs = guess.n_epochs
        n_steps = guess.n_steps
        # The guessed entry's own values are kept (they were previously
        # overwritten by those of the last entry).
        ent_coef = guess.ent_coef
        batch_size = guess.batch_size
    else:
        # The order of the suggest_* calls is kept as in the original code.
        gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.2, log=True)
        learning_rate = math.pow(10., trial.suggest_float("learning_rate_exp", -4.0, -1.0))
        max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 10.0, log=True)
        n_epochs = trial.suggest_int("n_epochs", 2, 16)
        n_steps = 2 ** trial.suggest_int("exponent_n_steps", 6, 13)
        ent_coef = guess_hyperparams[-1].ent_coef
        batch_size = guess_hyperparams[-1].batch_size

    result = {
        "ent_coef": ent_coef,
        "n_steps": n_steps,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        "n_epochs": n_epochs,
        "batch_size": batch_size,
    }
    # Keep the effective values with the trial so that they can be recovered
    # from the study even when no parameter was suggested.
    trial.set_user_attr("hyperparams", result)

    log.info(f'sample_ppo_params() {trial.number}\nParams:{result}')
    return result

### Integrate the performance evaluation in stable baselines.

In [None]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports each evaluation to an Optuna trial and
    stops the training when the trial should be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to the parent callback.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = IS_DETERMINISTIC,
        verbose: int = IS_VERBOSE,
    ):
        parent_kwargs = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_kwargs)
        self.is_pruned = False  # Set to True once Optuna asks to prune the trial
        self.eval_idx = 0       # Number of evaluations reported so far
        self.trial = trial

    def _on_step(self) -> bool:
        """Evaluate every ``eval_freq`` calls and report the result to Optuna.

        :return: ``False`` when the trial must be pruned, ``True`` otherwise.
        """
        is_eval_step = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not is_eval_step:
            return True

        # The parent class runs the evaluation and updates last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        self.is_pruned = True
        return False

### Define the objective function: it evaluates a set of hyperparameters.

In [None]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it samples the hyperparameters, trains a PPO model
    with them, evaluates it and returns the score. The model is saved to
    ``BEST_MODEL_FILE`` when its score beats the best value of the study, or
    when no trial has completed yet.

    :param trial: Optuna trial object
    :return: Mean episodic reward minus its standard deviation after training,
        or NaN when the training raised an ``AssertionError``.
    :raises optuna.exceptions.TrialPruned: when the evaluation callback asked
        to prune the trial.
    """
    print(f'objective started trial:{trial.number}.')
    kwargs = DEFAULT_HYPERPARAMS.copy()

    # 1. Sample hyperparameters and update the keyword arguments
    sampled_params = sample_ppo_params(trial)
    kwargs.update(sampled_params)

    # Create the RL model
    model = PPO(**kwargs)

    # 2. Create envs used for evaluation using `make_vec_env`, `ENV_ID` and `N_EVAL_ENVS`
    eval_env = make_vec_env(ENV_ID, N_EVAL_ENVS)

    # 3. Create the callbacks: periodic evaluation/pruning and learning-rate decay
    eval_callback = TrialEvalCallback(eval_env, trial, N_EVAL_EPISODES, EVAL_FREQ, deterministic=IS_DETERMINISTIC)
    lr_callback = DecreaseLearningRateCallback(decay_interval=NUM_STEPS_LR_UPDATE, decay_factor=LR_UPDATE_FACTOR)
    list_callbacks = CallbackList([eval_callback, lr_callback])

    try:
        try:
            # Train the model
            model.learn(N_TIMESTEPS, callback=list_callbacks)
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN.
            log.error(f'Exception in objective:{e}')
            # Tell the optimizer that the trial failed
            return float("nan")

        if eval_callback.is_pruned:
            raise optuna.exceptions.TrialPruned()

        # The final evaluation runs before the environments are closed.
        mean_reward, std_reward = evaluate_policy(model, eval_env, N_EVAL_EPISODES, deterministic=IS_DETERMINISTIC)
    finally:
        # Free memory on every exit path: success, failure or pruning.
        model.env.close()
        eval_env.close()

    result = mean_reward - std_reward
    log.info(f'objective: trial:{trial.number}\nMean reward:{mean_reward} std_reward:{std_reward} result:{result}')

    try:
        is_new_best = result > trial.study.best_value
    except ValueError:
        # No trial of the study has completed yet, so this one is the best so far.
        is_new_best = True

    if is_new_best:
        model.save(BEST_MODEL_FILE)
        log.info(f'Saved best model to file:{BEST_MODEL_FILE}')

    # Stored on the callback object; nothing in this function reads it back.
    eval_callback.mean_minus_std = result
    return result

### Run the Optimization loop

The optimization loop runs for a given number of trials, or until the TIMEOUT expires.<br>
Then, a snapshot file is saved.  If the snapshot exists, it is loaded before starting the optimization.

In [None]:
import torch as th

# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)

# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_WARMUP_STEPS)

# Create the study, or resume it from the snapshot file when one exists.
# The sampler and the pruner created above are only passed to a new study.
if not os.path.exists(SAVED_SNAPSHOT_FILE):
    log.info('Started the study from scratch.')
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
else:
    log.info(f'Resuming the study from the file: {SAVED_SNAPSHOT_FILE}')
    # NOTE: unpickling can execute arbitrary code; only load a snapshot file
    # that was written by this notebook.
    with open(SAVED_SNAPSHOT_FILE, 'rb') as snapshot_file:
        study = pickle.load(snapshot_file)

# Run the optimization. Errors and manual interruption are deliberately not
# propagated, so that the trials completed so far are still saved below.
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except Exception as e:
    log.error(f'Exception during study: {e}')
except KeyboardInterrupt:
    pass

# Save the snapshot; the context manager closes the file even if dump() fails.
with open(SAVED_SNAPSHOT_FILE, 'wb') as snapshot_file:
    pickle.dump(study, snapshot_file)

log.info(f'Saved the study to the file: {os.path.abspath(SAVED_SNAPSHOT_FILE)}')

2023-12-31 13:48:24,630 [INFO   ] Started the study from scratch.
[I 2023-12-31 13:48:24,632] A new study created in memory with name: no-name-74537ef2-8629-4266-94d6-a838dc9030b1
2023-12-31 13:48:24,637 [INFO   ] sample_ppo_params() 0
Params:{'ent_coef': 0.01, 'n_steps': 8192, 'gamma': 0.999896735134487, 'learning_rate': 0.00017432784789502684, 'max_grad_norm': 1.309833970119273, 'n_epochs': 6, 'batch_size': 64}


objective started trial:0.


2023-12-31 13:50:26,827 [DEBUG  ] Num steps:50000 current_lr:0.00010987002323440212 new_lr:0.000104376522072682
2023-12-31 13:52:39,101 [DEBUG  ] Num steps:100000 current_lr:6.578324142541546e-05 new_lr:6.249407935414468e-05
2023-12-31 13:55:22,901 [DEBUG  ] Num steps:150000 current_lr:3.9386856624232566e-05 new_lr:3.741751379302094e-05
2023-12-31 13:58:07,630 [DEBUG  ] Num steps:200000 current_lr:2.358236598141386e-05 new_lr:2.2403247682343167e-05
2023-12-31 14:00:56,411 [DEBUG  ] Num steps:250000 current_lr:1.4119633627711e-05 new_lr:1.341365194632545e-05
2023-12-31 14:04:00,847 [DEBUG  ] Num steps:300000 current_lr:8.45394622142297e-06 new_lr:8.031248910351821e-06
2023-12-31 14:04:28,063 [INFO   ] objective: trial:0
Mean reward:-19.94559 std_reward:70.77520029192992 result:-90.72079029192992
2023-12-31 14:04:28,075 [INFO   ] Saved best model to file:LunarLander-v2.model
[I 2023-12-31 14:04:28,077] Trial 0 finished with value: -90.72079029192992 and parameters: {'gamma': 0.0001032648

objective started trial:1.


2023-12-31 14:07:17,882 [DEBUG  ] Num steps:50000 current_lr:0.0004823142468631785 new_lr:0.00045819853452001955
2023-12-31 14:10:03,417 [DEBUG  ] Num steps:100000 current_lr:0.00028877935591792325 new_lr:0.00027434038812202706
2023-12-31 14:12:47,225 [DEBUG  ] Num steps:150000 current_lr:0.00017290286767752773 new_lr:0.00016425772429365135
2023-12-31 14:15:33,253 [DEBUG  ] Num steps:200000 current_lr:0.00010352333377878134 new_lr:9.834716708984227e-05
2023-12-31 14:18:13,930 [DEBUG  ] Num steps:250000 current_lr:6.198324400646059e-05 new_lr:5.888408180613756e-05
2023-12-31 14:20:53,250 [DEBUG  ] Num steps:300000 current_lr:3.7111657800493794e-05 new_lr:3.52560749104691e-05
2023-12-31 14:20:54,743 [INFO   ] objective: trial:1
Mean reward:192.18145425 std_reward:113.87917190943759 result:78.30228234056241
2023-12-31 14:20:54,756 [INFO   ] Saved best model to file:LunarLander-v2.model
[I 2023-12-31 14:20:54,758] Trial 1 finished with value: 78.30228234056241 and parameters: {'gamma': 0.0

objective started trial:2.


2023-12-31 14:22:40,536 [DEBUG  ] Num steps:50000 current_lr:0.03131907156136073 new_lr:0.029753117983292693
2023-12-31 14:24:26,683 [DEBUG  ] Num steps:100000 current_lr:0.01875188504643687 new_lr:0.017814290794115027
2023-12-31 14:26:02,072 [DEBUG  ] Num steps:150000 current_lr:0.011227446257653532 new_lr:0.010666073944770855
2023-12-31 14:27:38,067 [DEBUG  ] Num steps:200000 current_lr:0.006722286807770864 new_lr:0.006386172467382321
2023-12-31 14:29:14,421 [DEBUG  ] Num steps:250000 current_lr:0.004024881427967259 new_lr:0.003823637356568896
2023-12-31 14:30:49,724 [DEBUG  ] Num steps:300000 current_lr:0.002409845186978512 new_lr:0.002289352927629586
2023-12-31 14:30:50,150 [INFO   ] objective: trial:2
Mean reward:-459.76505875 std_reward:105.2366261300919 result:-565.001684880092
[I 2023-12-31 14:30:50,153] Trial 2 finished with value: -565.001684880092 and parameters: {'gamma': 0.0038714619250760415, 'learning_rate_exp': -1.303703568471525, 'max_grad_norm': 0.4851526529835961, 'n

objective started trial:3.


2023-12-31 14:34:15,472 [DEBUG  ] Num steps:50000 current_lr:0.0005730575002967387 new_lr:0.0005444046252819017
2023-12-31 14:37:47,907 [DEBUG  ] Num steps:100000 current_lr:0.00034311069373526556 new_lr:0.0003259551590485023
2023-12-31 14:41:28,698 [DEBUG  ] Num steps:150000 current_lr:0.00020543304658700965 new_lr:0.00019516139425765916
2023-12-31 14:44:50,216 [DEBUG  ] Num steps:200000 current_lr:0.0001230003535319214 new_lr:0.00011685033585532533
[I 2023-12-31 14:44:50,218] Trial 3 pruned. 
2023-12-31 14:44:50,245 [INFO   ] sample_ppo_params() 4
Params:{'ent_coef': 0.01, 'n_steps': 2048, 'gamma': 0.9971271318704582, 'learning_rate': 0.007136115040912182, 'max_grad_norm': 9.054632736910168, 'n_epochs': 16, 'batch_size': 64}


objective started trial:4.


2023-12-31 14:47:42,344 [DEBUG  ] Num steps:50000 current_lr:0.004497532292261807 new_lr:0.004272655677648717
2023-12-31 14:50:33,684 [DEBUG  ] Num steps:100000 current_lr:0.0026928387187946036 new_lr:0.0025581967828548734
2023-12-31 14:53:39,449 [DEBUG  ] Num steps:150000 current_lr:0.001612302012353678 new_lr:0.001531686911735994
2023-12-31 14:56:32,147 [DEBUG  ] Num steps:200000 current_lr:0.0009653447720045196 new_lr:0.0009170775334042936
2023-12-31 14:59:17,539 [DEBUG  ] Num steps:250000 current_lr:0.0005779875740997564 new_lr:0.0005490881953947686
2023-12-31 15:02:05,874 [DEBUG  ] Num steps:300000 current_lr:0.0003460625110343038 new_lr:0.0003287593854825886
2023-12-31 15:02:15,194 [INFO   ] objective: trial:4
Mean reward:149.336494375 std_reward:135.04616511878442 result:14.290329256215585
[I 2023-12-31 15:02:15,196] Trial 4 finished with value: 14.290329256215585 and parameters: {'gamma': 0.0028728681295418263, 'learning_rate_exp': -2.1465381573384024, 'max_grad_norm': 9.054632

objective started trial:5.


2023-12-31 15:05:14,061 [DEBUG  ] Num steps:50000 current_lr:0.0007480570025193146 new_lr:0.0007106541523933488
2023-12-31 15:08:34,730 [DEBUG  ] Num steps:100000 current_lr:0.0004478893600642505 new_lr:0.00042549489206103794
2023-12-31 15:11:52,286 [DEBUG  ] Num steps:150000 current_lr:0.00026816790456230547 new_lr:0.0002547595093341902
2023-12-31 15:14:56,607 [DEBUG  ] Num steps:200000 current_lr:0.00016056203037960443 new_lr:0.0001525339288606242
[I 2023-12-31 15:14:56,608] Trial 5 pruned. 
2023-12-31 15:14:56,637 [INFO   ] sample_ppo_params() 6
Params:{'ent_coef': 0.01, 'n_steps': 2048, 'gamma': 0.9994712125885031, 'learning_rate': 0.00014254522460254367, 'max_grad_norm': 3.0692943872787026, 'n_epochs': 11, 'batch_size': 64}


objective started trial:6.


2023-12-31 15:17:27,618 [DEBUG  ] Num steps:50000 current_lr:8.983904366481497e-05 new_lr:8.534709148157422e-05
2023-12-31 15:20:27,544 [DEBUG  ] Num steps:100000 current_lr:5.3789954027974374e-05 new_lr:5.110045632657565e-05
2023-12-31 15:23:14,924 [DEBUG  ] Num steps:150000 current_lr:3.2206032436482465e-05 new_lr:3.059573081465834e-05
2023-12-31 15:26:02,026 [DEBUG  ] Num steps:200000 current_lr:1.9282941286031456e-05 new_lr:1.8318794221729883e-05
[I 2023-12-31 15:26:02,027] Trial 6 pruned. 
2023-12-31 15:26:02,053 [INFO   ] sample_ppo_params() 7
Params:{'ent_coef': 0.01, 'n_steps': 128, 'gamma': 0.9819243187492308, 'learning_rate': 0.007767729195691527, 'max_grad_norm': 4.930927469590014, 'n_epochs': 8, 'batch_size': 64}


objective started trial:7.


2023-12-31 15:28:10,695 [DEBUG  ] Num steps:50000 current_lr:0.004895606740485198 new_lr:0.004650826403460938
2023-12-31 15:30:17,671 [DEBUG  ] Num steps:100000 current_lr:0.0029311805955128827 new_lr:0.0027846215657372385
[I 2023-12-31 15:30:17,677] Trial 7 pruned. 
2023-12-31 15:30:17,718 [INFO   ] sample_ppo_params() 8
Params:{'ent_coef': 0.01, 'n_steps': 1024, 'gamma': 0.8356771989851821, 'learning_rate': 0.0009029530376629181, 'max_grad_norm': 1.0179702645814035, 'n_epochs': 13, 'batch_size': 64}


objective started trial:8.


2023-12-31 15:33:21,086 [DEBUG  ] Num steps:50000 current_lr:0.0005690856189960969 new_lr:0.000540631338046292
2023-12-31 15:36:53,060 [DEBUG  ] Num steps:100000 current_lr:0.0003407325816823011 new_lr:0.0003236959525981861
2023-12-31 15:40:31,693 [DEBUG  ] Num steps:150000 current_lr:0.0002040091830552518 new_lr:0.00019380872390248918
2023-12-31 15:44:10,699 [DEBUG  ] Num steps:200000 current_lr:0.00012214783383902357 new_lr:0.00011604044214707238
[I 2023-12-31 15:44:10,701] Trial 8 pruned. 
2023-12-31 15:44:10,732 [INFO   ] sample_ppo_params() 9
Params:{'ent_coef': 0.01, 'n_steps': 8192, 'gamma': 0.9992458419798158, 'learning_rate': 0.07732046696445953, 'max_grad_norm': 0.31908888045902456, 'n_epochs': 4, 'batch_size': 64}


objective started trial:9.


2023-12-31 15:45:52,625 [DEBUG  ] Num steps:50000 current_lr:0.04873117866398175 new_lr:0.04629461973078266
2023-12-31 15:47:34,579 [DEBUG  ] Num steps:100000 current_lr:0.029177156758751016 new_lr:0.027718298920813463
[I 2023-12-31 15:47:34,581] Trial 9 pruned. 
2023-12-31 15:47:34,584 [INFO   ] indx_trial:0 indx_guess:0: using guessed params.
2023-12-31 15:47:34,588 [INFO   ] sample_ppo_params() 10
Params:{'ent_coef': 0.01, 'n_steps': 4096, 'gamma': 0.9983382684783582, 'learning_rate': 0.002078923775963909, 'max_grad_norm': 0.3733377073391161, 'n_epochs': 5, 'batch_size': 64}


objective started trial:10.


2023-12-31 15:49:38,497 [DEBUG  ] Num steps:50000 current_lr:0.0013102404826637094 new_lr:0.001244728458530524
2023-12-31 15:52:16,507 [DEBUG  ] Num steps:100000 current_lr:0.0007844893762562853 new_lr:0.000745264907443471
2023-12-31 15:54:41,378 [DEBUG  ] Num steps:150000 current_lr:0.000469702768004713 new_lr:0.00044621762960447733
2023-12-31 15:57:11,165 [DEBUG  ] Num steps:200000 current_lr:0.0002812283976669362 new_lr:0.00026716697778358933
2023-12-31 15:59:27,221 [DEBUG  ] Num steps:250000 current_lr:0.00016838183004601494 new_lr:0.00015996273854371418
2023-12-31 16:01:32,573 [DEBUG  ] Num steps:300000 current_lr:0.00010081642154510783 new_lr:9.577560046785243e-05
2023-12-31 16:01:45,039 [INFO   ] objective: trial:10
Mean reward:137.912733125 std_reward:263.61088234449807 result:-125.69814921949808
[I 2023-12-31 16:01:45,041] Trial 10 finished with value: -125.69814921949808 and parameters: {}. Best is trial 1 with value: 78.30228234056241.
2023-12-31 16:01:45,044 [INFO   ] indx_

objective started trial:11.


2023-12-31 16:03:39,213 [DEBUG  ] Num steps:50000 current_lr:0.0011499434730113207 new_lr:0.0010924462993607547
2023-12-31 16:06:02,653 [DEBUG  ] Num steps:100000 current_lr:0.0006885136353279492 new_lr:0.0006540879535615517
2023-12-31 16:08:21,905 [DEBUG  ] Num steps:150000 current_lr:0.0004122385466401455 new_lr:0.00039162661930813825
2023-12-31 16:10:30,232 [DEBUG  ] Num steps:200000 current_lr:0.0002468224456513983 new_lr:0.00023448132336882836
2023-12-31 16:12:39,769 [DEBUG  ] Num steps:250000 current_lr:0.00014778171564464927 new_lr:0.0001403926298624168
2023-12-31 16:15:09,267 [DEBUG  ] Num steps:300000 current_lr:8.848237210047371e-05 new_lr:8.405825349545001e-05
2023-12-31 16:15:21,938 [INFO   ] objective: trial:11
Mean reward:246.986910125 std_reward:19.57058966273891 result:227.4163204622611
2023-12-31 16:15:21,951 [INFO   ] Saved best model to file:LunarLander-v2.model
[I 2023-12-31 16:15:21,953] Trial 11 finished with value: 227.4163204622611 and parameters: {}. Best is tr

objective started trial:12.


2023-12-31 16:17:48,918 [DEBUG  ] Num steps:50000 current_lr:0.002135448000046126 new_lr:0.00202867560004382
2023-12-31 16:20:45,132 [DEBUG  ] Num steps:100000 current_lr:0.0012785715994503345 new_lr:0.0012146430194778176
[I 2023-12-31 16:20:45,134] Trial 12 pruned. 
2023-12-31 16:20:45,162 [INFO   ] sample_ppo_params() 13
Params:{'ent_coef': 0.01, 'n_steps': 64, 'gamma': 0.9991888679252675, 'learning_rate': 0.00037150030997262063, 'max_grad_norm': 0.8055580951601621, 'n_epochs': 14, 'batch_size': 64}


objective started trial:13.


2023-12-31 16:23:40,290 [DEBUG  ] Num steps:50000 current_lr:0.00023413785107275346 new_lr:0.00022243095851911577
2023-12-31 16:26:36,507 [DEBUG  ] Num steps:100000 current_lr:0.00014018698031115172 new_lr:0.00013317763129559412
[I 2023-12-31 16:26:36,511] Trial 13 pruned. 
2023-12-31 16:26:36,550 [INFO   ] sample_ppo_params() 14
Params:{'ent_coef': 0.01, 'n_steps': 2048, 'gamma': 0.9908686275040927, 'learning_rate': 0.018124075726541428, 'max_grad_norm': 2.103731430237438, 'n_epochs': 10, 'batch_size': 64}


objective started trial:14.


2023-12-31 16:28:51,061 [DEBUG  ] Num steps:50000 current_lr:0.011422688028456852 new_lr:0.010851553627034008
2023-12-31 16:30:56,265 [DEBUG  ] Num steps:100000 current_lr:0.0068391852680331254 new_lr:0.006497226004631469
[I 2023-12-31 16:30:56,266] Trial 14 pruned. 
2023-12-31 16:30:56,272 [INFO   ] Saved the study to the file: /content/opt_lunar_lander_study.pkl


In [None]:
# Write report
# Export one row per trial of the study to a CSV file.
trials_df = study.trials_dataframe()
trials_df.to_csv("study_results_ppo_lunar_lander.csv")

In [None]:
# Build the three Optuna figures first, then display them in the same order.
fig1, fig2, fig3 = (
    plot_optimization_history(study),
    plot_param_importances(study),
    plot_intermediate_values(study),
)

for figure in (fig1, fig2, fig3):
    figure.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Save the parameters of the best trial as JSON.
with open(SAVED_PARAMS_FILE, 'w') as fp:
    fp.write(json.dumps(study.best_params))

In [None]:
# Display the parameters of the best trial as the cell result.
# NOTE(review): only values drawn through trial.suggest_* seem to be listed
# here; a trial that used an entry of guess_hyperparams makes no such call,
# which would explain an empty dict — confirm.
study.best_params

{}

In [None]:
# Print the best trial of the study and its parameters.
# study.best_trial raises ValueError when no trial has completed, which can
# happen even if the study holds trials (all of them pruned or failed), so the
# lookup itself is guarded instead of testing the number of trials.
try:
    trial = study.best_trial
except ValueError:
    trial = None

if trial is None:
    print('No completed trial: nothing to report.')
else:
    print(f'Best trial: {trial.number} Value: {trial.value}')

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

Best trial: 11 Value: 227.4163204622611
  Params: 
