### Installs and Imports

In [1]:
!pip install -q stable-baselines3[extra]
!pip install -q sb3-contrib
!pip install -q optuna

[K     |████████████████████████████████| 177 kB 31.9 MB/s 
[K     |████████████████████████████████| 1.5 MB 81.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 63.8 MB/s 
[K     |████████████████████████████████| 1.6 MB 66.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Building wheel for AutoROM.accept-rom-license (PEP 517) ... [?25l[?25hdone
[K     |████████████████████████████████| 78 kB 6.7 MB/s 
[K     |████████████████████████████████| 308 kB 40.3 MB/s 
[K     |████████████████████████████████| 81 kB 10.7 MB/s 
[K     |████████████████████████████████| 209 kB 75.9 MB/s 
[K     |████████████████████████████████| 78 kB 8.7 MB/s 
[K     |████████████████████████████████| 49 kB 5.7 MB/s 
[K     |████████████████████████████████| 147 kB 50.5 MB/s 
[K     |███████████████████████

In [2]:
import torch
import torch.nn as nn

import gym
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

### Config

In [33]:
# --- Study-level budget ------------------------------------------------------
N_TRIALS = 20             # upper bound on the number of trials
N_JOBS = 1                # how many jobs run in parallel
N_STARTUP_TRIALS = 5      # trials drawn by random sampling before the sampler/pruner kick in
TIMEOUT = 15 * 60         # wall-clock limit for the study, in seconds (15 minutes)

# --- Per-trial training and evaluation budget --------------------------------
N_TIMESTEPS = 20_000      # training budget of a single trial
N_EVALUATIONS = 2         # evaluations performed while a trial is training
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # steps between two evaluations
N_EVAL_ENVS = 5           # number of environments used for evaluation
N_EVAL_EPISODES = 10      # episodes played per evaluation

# --- Task and fixed model arguments ------------------------------------------
ENV_ID = "CartPole-v1"

# Arguments shared by every trial; the sampled hyperparameters are merged on top.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

### Search Space

In [34]:
from typing import Any, Dict

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
  """
  Draw one A2C hyperparameter configuration from the search space.

  The discount factor is searched as ``1 - gamma`` on a log scale and the
  rollout length as a power of two; the derived ``gamma`` and ``n_steps``
  values are stored on the trial as user attributes so they can be read back
  directly.

  :param trial: Optuna trial object the values are suggested from
  :return: The sampled hyperparameters dictionary for the given trial
  """
  # Raw suggestions (the call order is kept stable on purpose)
  one_minus_gamma = trial.suggest_float("gamma_", 1e-4, 0.1, log=True)
  learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
  n_steps_exponent = trial.suggest_int("exponent_n_steps", 3, 10)
  grad_clip = trial.suggest_float("max_grad_norm", 0.3, 5, log=True)
  arch_name = trial.suggest_categorical("arch", ["small", "tiny"])
  activation_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

  # Convert the raw suggestions into the values the model consumes
  discount = 1 - one_minus_gamma
  rollout_length = 2 ** n_steps_exponent

  # Expose the derived values alongside the raw search parameters
  trial.set_user_attr("gamma", discount)
  trial.set_user_attr("n_steps", rollout_length)

  # "tiny" uses one hidden layer per head, anything else two hidden layers
  if arch_name == "tiny":
    layers = {"pi": [64], "vf": [64]}
  else:
    layers = {"pi": [64, 64], "vf": [64, 64]}

  # Map the activation name onto the corresponding torch module class
  activation_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU}

  policy_kwargs = {
      "net_arch": [layers],
      "activation_fn": activation_by_name[activation_name],
  }

  return {
      "gamma": discount,
      "learning_rate": learning_rate,
      "n_steps": rollout_length,
      "max_grad_norm": grad_clip,
      "policy_kwargs": policy_kwargs,
  }

### Evaluation Callback Class

In [35]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
  """
  Callback used for evaluating a trial and reporting the result to Optuna.

  On every evaluation step the parent class evaluates the policy; this
  subclass then reports the resulting mean reward to the Optuna trial and
  asks training to stop when Optuna decides the trial should be pruned.

  :param eval_env: Evaluation environment
  :param trial: Optuna trial object
  :param n_eval_episodes: Number of episodes used to evaluate the policy
  :param eval_freq: Number of successive steps after which the policy is
      evaluated during training
  :param deterministic: Whether the evaluation should use a deterministic or
      stochastic policy
  :param verbose: Verbosity
  """
  def __init__(
      self,
      eval_env: gym.Env,
      trial: optuna.Trial,
      n_eval_episodes: int = 5,
      eval_freq: int = 10_000,
      deterministic: bool = True,
      verbose: int = 0):

    super().__init__(
        eval_env=eval_env,
        n_eval_episodes=n_eval_episodes,
        eval_freq=eval_freq,
        deterministic=deterministic,
        verbose=verbose)
    self.trial = trial
    # Set to True once Optuna has asked for this trial to be pruned
    self.is_pruned = False
    # Number of evaluations reported so far (used as the Optuna report step)
    self.eval_idx = 0

  def _on_step(self) -> bool:
    """
    Evaluate and report to Optuna on evaluation steps.

    :return: False when the trial should be pruned (training should stop),
        True otherwise
    """
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
      # Evaluate the policy. Done in the parent class
      super()._on_step()
      self.eval_idx += 1

      # Send report to Optuna
      self.trial.report(self.last_mean_reward, self.eval_idx)

      # Prune trial if needed
      if self.trial.should_prune():
        self.is_pruned = True
        return False

    # Always return a bool. This return used to sit inside the `if` above, so
    # every non-evaluation step fell through and returned None; a caller that
    # tests the result for truthiness would treat that as a stop request.
    return True

### Objective Function

In [36]:
def objective(trial: optuna.Trial) -> float:
  """
  Objective function used by Optuna to evaluate one configuration (trial, i.e.
  a set of parameters)

  Given a trial object, it will sample one set of hyperparameters, train an
  A2C model with it, and report the result (mean episodic reward)

  :param trial: Optuna trial object
  :return: Mean reward of the last evaluation run during training, or NaN
      when training failed numerically
  :raises optuna.exceptions.TrialPruned: when the evaluation callback flagged
      the trial as pruned
  """

  # Start from the default hyperparameters and overlay the sampled ones
  kwargs = DEFAULT_HYPERPARAMS.copy()
  kwargs.update(sample_a2c_params(trial))

  # Create the model
  model = A2C(**kwargs)

  # Create the evaluation env
  eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)

  # Create the callback that evaluates the policy and reports to Optuna
  eval_callback = TrialEvalCallback(
      eval_envs,
      trial,
      n_eval_episodes=N_EVAL_EPISODES,
      eval_freq=EVAL_FREQ,
      deterministic=True,
      verbose=1)

  # Train the model
  nan_encountered = False

  try:
    model.learn(N_TIMESTEPS, callback=eval_callback)
  except (AssertionError, ValueError) as e:
    # Sometimes, random parameters can generate NaN. Besides AssertionError,
    # ValueError is caught as well: PyTorch distributions raise it when their
    # parameters fail validation, and letting it propagate would abort the
    # whole study instead of just failing this trial.
    print(e)
    nan_encountered = True
  finally:
    # Release the environments whether or not training succeeded
    model.env.close()
    eval_envs.close()

  # Tell the optimizer that the trial failed
  if nan_encountered:
    return float("nan")

  if eval_callback.is_pruned:
    raise optuna.exceptions.TrialPruned()

  return eval_callback.last_mean_reward

### The Optimization Loop

In [37]:
# Limit PyTorch to a single thread for a faster training
torch.set_num_threads(1)

# Sampler (could also be RandomSampler, CMAES, ...): TPE, preceded by
# N_STARTUP_TRIALS trials of random sampling
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Pruner: median pruning, with the same number of startup trials and a warm-up
# of N_EVALUATIONS // 3 evaluations, i.e. one third of the evaluation budget
# rounded down (which is 0 when N_EVALUATIONS is 2)
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)

# Build the study; the objective (mean reward) is to be maximized
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
)

# Run the hyperparameter optimization; a manual interrupt ends the search
# early without raising
try:
  study.optimize(
      objective,
      n_trials=N_TRIALS,
      timeout=TIMEOUT,
      n_jobs=N_JOBS,
  )
except KeyboardInterrupt:
  pass

[32m[I 2022-08-26 15:15:50,998][0m A new study created in memory with name: no-name-e9239167-b171-40bc-b007-d2a8353d2311[0m


Eval num_timesteps=10000, episode_reward=9.20 +/- 0.60
Episode length: 9.20 +/- 0.60
New best mean reward!


[32m[I 2022-08-26 15:16:04,354][0m Trial 0 finished with value: 9.2 and parameters: {'gamma_': 0.08713821351614517, 'lr': 0.18721093064885544, 'exponent_n_steps': 3, 'max_grad_norm': 1.5370488337131525, 'arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 9.2.[0m


Eval num_timesteps=20000, episode_reward=9.20 +/- 0.98
Episode length: 9.20 +/- 0.98
Eval num_timesteps=10000, episode_reward=9.40 +/- 0.66
Episode length: 9.40 +/- 0.66
New best mean reward!


[32m[I 2022-08-26 15:16:16,392][0m Trial 1 finished with value: 9.6 and parameters: {'gamma_': 0.00854047343750428, 'lr': 0.594188308348942, 'exponent_n_steps': 3, 'max_grad_norm': 0.3123929494111596, 'arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 1 with value: 9.6.[0m


Eval num_timesteps=20000, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
New best mean reward!
Eval num_timesteps=10000, episode_reward=9.20 +/- 0.60
Episode length: 9.20 +/- 0.60
New best mean reward!


[32m[I 2022-08-26 15:16:25,212][0m Trial 2 finished with value: 9.2 and parameters: {'gamma_': 0.004583391226094223, 'lr': 0.6009911555478358, 'exponent_n_steps': 6, 'max_grad_norm': 3.388513650297223, 'arch': 'small', 'activation_fn': 'tanh'}. Best is trial 1 with value: 9.6.[0m


Eval num_timesteps=20000, episode_reward=9.20 +/- 0.75
Episode length: 9.20 +/- 0.75
Eval num_timesteps=10000, episode_reward=86.00 +/- 25.12
Episode length: 86.00 +/- 25.12
New best mean reward!


[32m[I 2022-08-26 15:16:33,028][0m Trial 3 finished with value: 97.1 and parameters: {'gamma_': 0.03555632247004915, 'lr': 6.374647547851127e-05, 'exponent_n_steps': 10, 'max_grad_norm': 0.7350516040289287, 'arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 3 with value: 97.1.[0m


Eval num_timesteps=20000, episode_reward=97.10 +/- 29.65
Episode length: 97.10 +/- 29.65
New best mean reward!
Eval num_timesteps=10000, episode_reward=74.60 +/- 20.27
Episode length: 74.60 +/- 20.27
New best mean reward!


[32m[I 2022-08-26 15:16:42,074][0m Trial 4 finished with value: 139.2 and parameters: {'gamma_': 0.0008137634297241209, 'lr': 1.2326374422718447e-05, 'exponent_n_steps': 6, 'max_grad_norm': 3.876091518654808, 'arch': 'small', 'activation_fn': 'relu'}. Best is trial 4 with value: 139.2.[0m


Eval num_timesteps=20000, episode_reward=139.20 +/- 117.92
Episode length: 139.20 +/- 117.92
New best mean reward!
Eval num_timesteps=10000, episode_reward=70.10 +/- 15.56
Episode length: 70.10 +/- 15.56
New best mean reward!


[32m[I 2022-08-26 15:16:51,051][0m Trial 5 finished with value: 99.2 and parameters: {'gamma_': 0.00012170306986051758, 'lr': 1.0763500665848529e-05, 'exponent_n_steps': 7, 'max_grad_norm': 4.8328220980692285, 'arch': 'small', 'activation_fn': 'relu'}. Best is trial 4 with value: 139.2.[0m


Eval num_timesteps=20000, episode_reward=99.20 +/- 23.25
Episode length: 99.20 +/- 23.25
New best mean reward!
Eval num_timesteps=10000, episode_reward=496.30 +/- 11.10
Episode length: 496.30 +/- 11.10
New best mean reward!


[32m[I 2022-08-26 15:17:00,620][0m Trial 6 finished with value: 424.5 and parameters: {'gamma_': 0.00035425095834031094, 'lr': 0.0011777414046982924, 'exponent_n_steps': 6, 'max_grad_norm': 2.2918187605436025, 'arch': 'small', 'activation_fn': 'relu'}. Best is trial 6 with value: 424.5.[0m


Eval num_timesteps=20000, episode_reward=424.50 +/- 70.76
Episode length: 424.50 +/- 70.76
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:17:09,732][0m Trial 7 finished with value: 472.4 and parameters: {'gamma_': 0.00030245802824650683, 'lr': 0.0021686230389775427, 'exponent_n_steps': 8, 'max_grad_norm': 1.5706425984216532, 'arch': 'small', 'activation_fn': 'relu'}. Best is trial 7 with value: 472.4.[0m


Eval num_timesteps=20000, episode_reward=472.40 +/- 48.33
Episode length: 472.40 +/- 48.33
Eval num_timesteps=10000, episode_reward=438.90 +/- 66.54
Episode length: 438.90 +/- 66.54
New best mean reward!


[32m[I 2022-08-26 15:17:18,189][0m Trial 8 finished with value: 500.0 and parameters: {'gamma_': 0.0009943486624725889, 'lr': 0.010135413604588724, 'exponent_n_steps': 9, 'max_grad_norm': 0.8375117706725724, 'arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=87.30 +/- 68.00
Episode length: 87.30 +/- 68.00
New best mean reward!


[32m[I 2022-08-26 15:17:25,830][0m Trial 9 pruned. [0m


Eval num_timesteps=20000, episode_reward=93.30 +/- 16.35
Episode length: 93.30 +/- 16.35
New best mean reward!
Eval num_timesteps=10000, episode_reward=139.10 +/- 7.23
Episode length: 139.10 +/- 7.23
New best mean reward!


[32m[I 2022-08-26 15:17:33,999][0m Trial 10 finished with value: 500.0 and parameters: {'gamma_': 0.013062618031968294, 'lr': 0.02561953380020253, 'exponent_n_steps': 8, 'max_grad_norm': 0.7523320337249939, 'arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:17:37,880][0m Trial 11 pruned. [0m


Eval num_timesteps=10000, episode_reward=46.10 +/- 17.63
Episode length: 46.10 +/- 17.63
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:17:46,313][0m Trial 12 finished with value: 500.0 and parameters: {'gamma_': 0.002049290751687854, 'lr': 0.023549869624928523, 'exponent_n_steps': 9, 'max_grad_norm': 0.4107005725622915, 'arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
Eval num_timesteps=10000, episode_reward=93.00 +/- 37.99
Episode length: 93.00 +/- 37.99
New best mean reward!


[32m[I 2022-08-26 15:17:54,147][0m Trial 13 pruned. [0m


Eval num_timesteps=20000, episode_reward=82.80 +/- 28.06
Episode length: 82.80 +/- 28.06
Eval num_timesteps=10000, episode_reward=421.00 +/- 103.62
Episode length: 421.00 +/- 103.62
New best mean reward!


[32m[I 2022-08-26 15:18:02,660][0m Trial 14 finished with value: 500.0 and parameters: {'gamma_': 0.003924000990219366, 'lr': 0.008198698471866274, 'exponent_n_steps': 9, 'max_grad_norm': 0.45438775723043523, 'arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:18:06,608][0m Trial 15 pruned. [0m


Eval num_timesteps=10000, episode_reward=9.00 +/- 0.63
Episode length: 9.00 +/- 0.63
New best mean reward!
Eval num_timesteps=10000, episode_reward=468.00 +/- 62.41
Episode length: 468.00 +/- 62.41
New best mean reward!


[32m[I 2022-08-26 15:18:15,047][0m Trial 16 finished with value: 500.0 and parameters: {'gamma_': 0.056186779287576996, 'lr': 0.004904482414114254, 'exponent_n_steps': 9, 'max_grad_norm': 0.626257852634348, 'arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:18:19,493][0m Trial 17 pruned. [0m


Eval num_timesteps=10000, episode_reward=100.60 +/- 37.64
Episode length: 100.60 +/- 37.64
New best mean reward!
Eval num_timesteps=10000, episode_reward=328.10 +/- 82.27
Episode length: 328.10 +/- 82.27
New best mean reward!


[32m[I 2022-08-26 15:18:27,877][0m Trial 18 finished with value: 500.0 and parameters: {'gamma_': 0.0009394398266475002, 'lr': 0.007549043894468425, 'exponent_n_steps': 9, 'max_grad_norm': 0.46607452438223707, 'arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 8 with value: 500.0.[0m


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 15:18:31,688][0m Trial 19 pruned. [0m


Eval num_timesteps=10000, episode_reward=66.50 +/- 21.86
Episode length: 66.50 +/- 21.86
New best mean reward!


In [38]:
def _print_section(header, mapping):
  """Print ``header`` followed by one indented ``key: value`` line per item."""
  print(header)
  for name, entry in mapping.items():
    print(f"    {name}: {entry}")


# Summary of the study and of its best trial
print("Number of finished trials", len(study.trials))

trial = study.best_trial
print("Best trial:")
print(f"  Mean Reward: {trial.value}")
_print_section("  Params: ", trial.params)
_print_section("  User Attributes:", trial.user_attrs)


# Save every trial of the study as a CSV report
study.trials_dataframe().to_csv("study_results_a2c_cartpole.csv")

# Optimization history and parameter importance figures
fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

Number of finished trials 20
Best trial:
  Mean Reward: 500.0
  Params: 
    gamma_: 0.0009943486624725889
    lr: 0.010135413604588724
    exponent_n_steps: 9
    max_grad_norm: 0.8375117706725724
    arch: tiny
    activation_fn: relu
  User Attributes:
    gamma: 0.9990056513375274
    n_steps: 512
