
### Installs and Imports

In [17]:
!pip install -q gym[box2d]
!pip install -q stable-baselines3[extra]
!pip install -q sb3-contrib
!pip install -q optuna

In [18]:
import torch
import torch.nn as nn

import gym
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

In [19]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

### Config

In [20]:
# --- Study budget ---
N_TRIALS = 20  # upper bound on the number of trials
N_JOBS = 1  # trials run in parallel
N_STARTUP_TRIALS = 5  # initial random-sampling trials (also handed to the pruner)
TIMEOUT = 15 * 60  # study time limit: 15 minutes, expressed in seconds

# --- Training budget of a single trial ---
N_TIMESTEPS = 20_000  # timesteps passed to `model.learn`
N_EVALUATIONS = 2  # evaluations performed during one training run
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # steps between two evaluations

# --- Evaluation settings ---
N_EVAL_ENVS = 5  # environments in the evaluation vec env
N_EVAL_EPISODES = 10  # episodes per evaluation

# --- Task ---
ENV_ID = "Pendulum-v1"

# Constructor arguments shared by every trial; sampled values are merged on top.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

### Search Space

In [25]:
from typing import Any, Dict

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
  """
  Draw one PPO hyperparameter configuration from the search space.

  ``gamma`` and ``gae_lambda`` are searched as their distance to 1 on a log
  scale, and ``n_steps`` as a power-of-two exponent. The derived values are
  stored as user attributes on the trial so they can be read back later.

  :param trial: Optuna trial object used to draw the suggestions
  :return: Dictionary of sampled hyperparameters, including ``policy_kwargs``
  """
  # Raw suggestions, drawn in a fixed order.
  one_minus_gamma = trial.suggest_float("gamma_", 1e-4, 0.1, log=True)
  learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
  n_steps_exponent = trial.suggest_int("exponent_n_steps", 3, 12)
  one_minus_lambda = trial.suggest_float("gae_lambda_", 1e-4, 0.01, log=True)
  grad_clip = trial.suggest_float("max_grad_norm", 0.3, 5, log=True)
  arch_name = trial.suggest_categorical("arch", ["small", "tiny"])
  activation_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
  normalize_name = trial.suggest_categorical("normalize_advantage", ["True", "False"])

  # Convert the raw suggestions into the values the algorithm actually uses.
  gamma = 1 - one_minus_gamma
  n_steps = 2 ** n_steps_exponent
  gae_lambda = 1 - one_minus_lambda

  # Record the derived values on the trial next to the raw parameters.
  derived_values = (("gamma", gamma), ("n_steps", n_steps), ("gae_lambda", gae_lambda))
  for attr_name, attr_value in derived_values:
    trial.set_user_attr(attr_name, attr_value)

  # "tiny" uses one hidden layer per head; anything else uses two.
  if arch_name == "tiny":
    layer_sizes = {"pi": [64], "vf": [64]}
  else:
    layer_sizes = {"pi": [64, 64], "vf": [64, 64]}

  activation_lookup = {"tanh": nn.Tanh, "relu": nn.ReLU}
  flag_lookup = {"True": True, "False": False}

  policy_kwargs = {
      "net_arch": [layer_sizes],
      "activation_fn": activation_lookup[activation_name],
  }

  return {
      "gamma": gamma,
      "learning_rate": learning_rate,
      "n_steps": n_steps,
      "gae_lambda": gae_lambda,
      "normalize_advantage": flag_lookup[normalize_name],
      "max_grad_norm": grad_clip,
      "policy_kwargs": policy_kwargs,
  }

### Evaluation Callback Class

In [26]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
  """
  Callback used for evaluating and reporting a trial

  Every ``eval_freq`` calls, the parent ``EvalCallback`` evaluates the policy;
  the resulting mean reward is then reported to Optuna, and training is asked
  to stop if Optuna decides the trial should be pruned.

  :param eval_env: Evaluation environment
  :param trial: Optuna trial object
  :param n_eval_episodes: Number of episodes used to evaluate the policy
  :param eval_freq: Number of successive steps after which the policy is
      evaluated during training
  :param deterministic: Whether the evaluation should use a deterministic or
      stochastic policy
  :param verbose: Verbosity
  """
  def __init__(
      self,
      eval_env: gym.Env,
      trial: optuna.Trial,
      n_eval_episodes: int = 5,
      eval_freq: int = 10_000,
      deterministic: bool = True,
      verbose: int = 0):

    super().__init__(
        eval_env=eval_env,
        n_eval_episodes=n_eval_episodes,
        eval_freq=eval_freq,
        deterministic=deterministic,
        verbose=verbose)
    self.trial = trial
    # Set to True once Optuna has asked for this trial to be pruned
    self.is_pruned = False
    # Number of evaluations reported so far; used as the Optuna report step
    self.eval_idx = 0

  def _on_step(self) -> bool:
    """
    Evaluate and report to Optuna every ``eval_freq`` calls.

    :return: False when the trial should be pruned (training should stop),
        True otherwise
    """
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
      # Evaluate the policy. Done in the parent class
      super()._on_step()
      self.eval_idx += 1

      # Send report to Optuna
      self.trial.report(self.last_mean_reward, self.eval_idx)

      # Prune trial if needed
      if self.trial.should_prune():
        self.is_pruned = True
        return False

    # Must be reached on every call, not only on evaluation steps: falling
    # through would return None, which violates the bool contract and can be
    # interpreted as "stop training" by the caller.
    return True

### Objective Function

In [27]:
def objective(trial: optuna.Trial) -> float:
  """
  Objective function used by Optuna to evaluate one configuration (trial, i.e.
  a set of parameters)

  Given a trial object, it will sample one set of hyperparameters, train a PPO
  model with them while periodically evaluating it, and report the result
  (mean episodic reward of the last evaluation)

  :param trial: Optuna trial object
  :return: Mean episodic reward of the last evaluation, or NaN when training
      was aborted by a numerical error
  :raises optuna.exceptions.TrialPruned: if the evaluation callback flagged
      the trial as pruned
  """

  # Start from the shared defaults, then overlay the sampled hyperparameters
  kwargs = DEFAULT_HYPERPARAMS.copy()
  kwargs.update(sample_ppo_params(trial))

  # Create the model
  model = PPO(**kwargs)

  # Create the evaluation env
  eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)

  # Create the evaluation callback; keyword arguments keep the call readable
  eval_callback = TrialEvalCallback(
      eval_envs,
      trial,
      n_eval_episodes=N_EVAL_EPISODES,
      eval_freq=EVAL_FREQ,
      deterministic=True,
      verbose=1)

  # Train the model
  nan_encountered = False

  try:
    model.learn(N_TIMESTEPS, callback=eval_callback)
  except (AssertionError, ValueError) as e:
    # Sometimes, random parameters can generate NaN. Depending on where the
    # NaN is detected it surfaces as an AssertionError or as a ValueError
    # (PyTorch distribution argument validation), so both are treated as a
    # failed trial instead of aborting the whole study.
    print(e)
    nan_encountered = True
  finally:
    # Always release the training and evaluation environments
    model.env.close()
    eval_envs.close()

  # Tell the optimizer that the trial failed
  if nan_encountered:
    return float("nan")

  if eval_callback.is_pruned:
    raise optuna.exceptions.TrialPruned()
  return eval_callback.last_mean_reward

### The Optimization Loop

In [None]:
# Restrict PyTorch to a single thread for a faster training
torch.set_num_threads(1)

# Sampler choice (could also be RandomSampler, CMAES, ...)
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Pruner choice: median rule, with N_EVALUATIONS // 3 warm-up steps
# before a trial may be pruned
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)

# Build the study (higher mean reward is better) and run the search
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
)

try:
  study.optimize(
      objective,
      n_trials=N_TRIALS,
      n_jobs=N_JOBS,
      timeout=TIMEOUT,
  )
except KeyboardInterrupt:
  # Let the search be stopped by hand without raising out of the cell
  pass

[32m[I 2022-08-26 16:33:33,980][0m A new study created in memory with name: no-name-6c26cc06-ff5f-4911-aa20-7ed9ef79ac1a[0m


Eval num_timesteps=10000, episode_reward=-1472.42 +/- 47.67
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1482.44 +/- 62.53
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:34:04,322][0m Trial 0 finished with value: -1482.4371759 and parameters: {'gamma_': 0.026007345873336345, 'lr': 0.6375169849173647, 'exponent_n_steps': 11, 'gae_lambda_': 0.0030884824359835045, 'max_grad_norm': 0.35024907628223534, 'arch': 'small', 'activation_fn': 'relu', 'normalize_advantage': 'False'}. Best is trial 0 with value: -1482.4371759.[0m


Eval num_timesteps=10000, episode_reward=-1240.66 +/- 216.91
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1393.92 +/- 167.77
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:34:34,315][0m Trial 1 finished with value: -1393.9244159 and parameters: {'gamma_': 0.00021066629126039035, 'lr': 0.032030162656981206, 'exponent_n_steps': 9, 'gae_lambda_': 0.006180113265782246, 'max_grad_norm': 0.4509718097693385, 'arch': 'small', 'activation_fn': 'tanh', 'normalize_advantage': 'False'}. Best is trial 1 with value: -1393.9244159.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"


Eval num_timesteps=10000, episode_reward=-1565.03 +/- 230.10
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:36:22,440][0m Trial 2 finished with value: -1826.0837007999999 and parameters: {'gamma_': 0.01975947390077405, 'lr': 0.00012784545854165776, 'exponent_n_steps': 3, 'gae_lambda_': 0.00816279726526012, 'max_grad_norm': 1.3922991445418573, 'arch': 'small', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 1 with value: -1393.9244159.[0m


Eval num_timesteps=20000, episode_reward=-1826.08 +/- 83.36
Episode length: 200.00 +/- 0.00
Eval num_timesteps=10000, episode_reward=-1399.56 +/- 280.43
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1194.49 +/- 312.98
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:36:49,143][0m Trial 3 finished with value: -1194.4876064999999 and parameters: {'gamma_': 0.00015226974522460345, 'lr': 0.0007220584872409857, 'exponent_n_steps': 12, 'gae_lambda_': 0.0035188424608009207, 'max_grad_norm': 0.5136550074077985, 'arch': 'tiny', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 3 with value: -1194.4876064999999.[0m


Eval num_timesteps=10000, episode_reward=-1322.85 +/- 199.42
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:37:15,980][0m Trial 4 finished with value: -1185.0895925999998 and parameters: {'gamma_': 0.016122971918859936, 'lr': 0.0021961632439701515, 'exponent_n_steps': 7, 'gae_lambda_': 0.0023638868953019436, 'max_grad_norm': 2.2954013500921446, 'arch': 'tiny', 'activation_fn': 'relu', 'normalize_advantage': 'True'}. Best is trial 4 with value: -1185.0895925999998.[0m


Eval num_timesteps=20000, episode_reward=-1185.09 +/- 215.86
Episode length: 200.00 +/- 0.00
New best mean reward!


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"


Eval num_timesteps=10000, episode_reward=-1336.92 +/- 153.39
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:37:51,742][0m Trial 5 finished with value: -1442.7867286 and parameters: {'gamma_': 0.00450247368878552, 'lr': 0.007232685038656514, 'exponent_n_steps': 5, 'gae_lambda_': 0.0002964864266286789, 'max_grad_norm': 4.785209983163338, 'arch': 'tiny', 'activation_fn': 'relu', 'normalize_advantage': 'True'}. Best is trial 4 with value: -1185.0895925999998.[0m


Eval num_timesteps=20000, episode_reward=-1442.79 +/- 77.27
Episode length: 200.00 +/- 0.00
Eval num_timesteps=10000, episode_reward=-1296.36 +/- 337.26
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:38:18,093][0m Trial 6 finished with value: -1244.2103943000002 and parameters: {'gamma_': 0.09748535054077735, 'lr': 1.1127691511840384e-05, 'exponent_n_steps': 7, 'gae_lambda_': 0.0006944763479084139, 'max_grad_norm': 2.654287914590901, 'arch': 'tiny', 'activation_fn': 'relu', 'normalize_advantage': 'True'}. Best is trial 4 with value: -1185.0895925999998.[0m


Eval num_timesteps=20000, episode_reward=-1244.21 +/- 280.26
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:38:31,125][0m Trial 7 pruned. [0m


Eval num_timesteps=10000, episode_reward=-1459.78 +/- 69.90
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1322.80 +/- 208.35
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1564.15 +/- 240.51
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:38:57,135][0m Trial 8 finished with value: -1564.1465950000002 and parameters: {'gamma_': 0.0016181032722194701, 'lr': 0.0007487584388668523, 'exponent_n_steps': 9, 'gae_lambda_': 0.00011543897053967674, 'max_grad_norm': 0.8398350505146229, 'arch': 'tiny', 'activation_fn': 'relu', 'normalize_advantage': 'False'}. Best is trial 4 with value: -1185.0895925999998.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-08-26 16:39:15,043][0m Trial 9 pruned. [0m


Eval num_timesteps=10000, episode_reward=-1359.14 +/- 179.05
Episode length: 200.00 +/- 0.00
New best mean reward!


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-08-26 16:39:36,062][0m Trial 10 pruned. [0m


Eval num_timesteps=10000, episode_reward=-1397.38 +/- 141.89
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1152.15 +/- 224.49
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1167.25 +/- 332.09
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:40:02,407][0m Trial 11 finished with value: -1167.2527505000003 and parameters: {'gamma_': 0.00013251576375382216, 'lr': 0.00048335560344894916, 'exponent_n_steps': 12, 'gae_lambda_': 0.0028907100040943983, 'max_grad_norm': 0.7418640502748501, 'arch': 'tiny', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 11 with value: -1167.2527505000003.[0m


Eval num_timesteps=10000, episode_reward=-1280.63 +/- 162.75
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1332.05 +/- 65.07
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:40:28,904][0m Trial 12 finished with value: -1332.0499849 and parameters: {'gamma_': 0.0004950849377566995, 'lr': 0.0006210959529831925, 'exponent_n_steps': 9, 'gae_lambda_': 0.0020502410332880676, 'max_grad_norm': 0.8647754318890651, 'arch': 'tiny', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 11 with value: -1167.2527505000003.[0m


Eval num_timesteps=10000, episode_reward=-1008.95 +/- 90.39
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1272.30 +/- 242.86
Episode length: 200.00 +/- 0.00


[32m[I 2022-08-26 16:40:55,315][0m Trial 13 finished with value: -1272.2974849 and parameters: {'gamma_': 0.005483854743828143, 'lr': 0.0001645777555208797, 'exponent_n_steps': 10, 'gae_lambda_': 0.0005665372753554061, 'max_grad_norm': 1.0013622997785006, 'arch': 'tiny', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 11 with value: -1167.2527505000003.[0m


Eval num_timesteps=10000, episode_reward=-1185.77 +/- 276.30
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1161.87 +/- 198.58
Episode length: 200.00 +/- 0.00
New best mean reward!


[32m[I 2022-08-26 16:41:21,650][0m Trial 14 finished with value: -1161.8690614 and parameters: {'gamma_': 0.0007298739365178624, 'lr': 0.01838256851153824, 'exponent_n_steps': 12, 'gae_lambda_': 0.005664490677963549, 'max_grad_norm': 0.749765913625302, 'arch': 'tiny', 'activation_fn': 'tanh', 'normalize_advantage': 'True'}. Best is trial 14 with value: -1161.8690614.[0m


In [None]:
# Print Results
print("Number of finished trials", len(study.trials))

trial = study.best_trial
print("Best trial:")
print(f"  Mean Reward: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
  print(f"    {key}: {value}")
print("  User Attributes:")
for key, value in trial.user_attrs.items():
  print(f"    {key}: {value}")


# Write Report
# Name the file after the algorithm and environment actually tuned in this
# notebook (PPO on ENV_ID) so the report is not mislabeled.
study.trials_dataframe().to_csv(f"study_results_ppo_{ENV_ID}.csv")

# Visualize how the best value evolved and which parameters mattered most
fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()