## Hyperparameter tuning with Callbacks

In [3]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C, SAC
from stable_baselines3.common.evaluation import evaluate_policy

In [7]:
# Single Pendulum environment, reused for training and for the evaluations below.
eval_env = gym.make('Pendulum-v1')

# Baseline agent: batches of 64 samples and a 64x64 network, trained for 8_000 timesteps.
# `learn` is chained into the assignment, so the cell produces no output.
default_model = SAC(
    'MlpPolicy',
    eval_env,
    batch_size=64,
    policy_kwargs={'net_arch': [64, 64]},
    seed=0,
    verbose=0,
).learn(total_timesteps=8_000)

In [8]:
# Mean and standard deviation of the reward over 100 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    default_model,
    eval_env,
    n_eval_episodes=100,
)
print(f'Default Mean reward: {mean_reward} +/- {std_reward}')

Default Mean reward: -609.9966125883581 +/- 77.17879235387625


In [10]:
# Same seed and training budget as the baseline, but with batches of 128 samples,
# an explicit learning rate and a 256x256 network.
tuned_model = SAC(
    'MlpPolicy',
    eval_env,
    batch_size=128,
    learning_rate=3e-4,
    policy_kwargs={'net_arch': [256, 256]},
    seed=0,
    verbose=0,
).learn(total_timesteps=8_000)

# Evaluate over 100 episodes, exactly like the baseline, so the two numbers are comparable.
mean_reward, std_reward = evaluate_policy(
    tuned_model,
    eval_env,
    n_eval_episodes=100,
)
print(f'Tuned Mean reward: {mean_reward} +/- {std_reward}')

Tuned Mean reward: -162.31068963069788 +/- 99.2422378410119


## Callbacks

To build a custom callback, create a class derived from `BaseCallback` and override the methods for the events you want to handle (for example `_on_training_start` or `_on_step`). Inside these methods you also have access to useful variables such as `self.model`, the RL model being trained.

Access to variables like `self.model` allows you to change hyperparameters on the fly. For example, we can implement a learning rate scheduler that decreases the learning rate linearly over time.

In [12]:
from stable_baselines3.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    """
    Skeleton callback that lists the event hooks a custom callback can override.

    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_training_start(self) -> None:
        """Called before the first rollout starts."""
        return None

    def _on_rollout_start(self) -> None:
        """
        Called when a rollout starts.

        A rollout is the collection of environment interactions using the current policy.
        """
        return None

    def _on_step(self) -> bool:
        """
        Called by the model after each call to ``env.step()``.

        :return: (bool) whether training should continue.
        """
        continue_training = True
        return continue_training

    def _on_rollout_end(self) -> None:
        """Triggered before the policy is updated."""
        return None

    def _on_training_end(self) -> None:
        """Called at the end of training."""
        return None

> A simple callback implementation

In [15]:
class SimpleCallback(BaseCallback):
    """
    Minimal callback: training continues after its first call and is stopped by its second.

    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Becomes True once `_on_step` has run for the first time.
        self._called = False

    def _on_step(self) -> bool:
        if self._called:
            # Second invocation: returning False stops training.
            print('callback - second call')
            return False

        # First invocation: remember it and return True to continue training.
        print('callback - first call')
        self._called = True
        return True

In [14]:
# SimpleCallback returns False on its second call, which stops training
# long before the requested 10_000 timesteps are reached.
model = SAC(
    'MlpPolicy',
    'Pendulum-v1',
    verbose=1,
).learn(total_timesteps=10_000, callback=SimpleCallback())

Using cpu device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
callback - first call
callback - second call


## First Example: Auto Saving best model

In [16]:
import os

import numpy as np

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int) Number of callback calls between two checks of the
      training reward.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """

    def __init__(self, check_freq, log_dir, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # Base path handed to ``model.save``; the messages below report the
        # resulting archive as ``<save_path>.zip``.
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Ensure the folder that will contain the saved model exists.
        # ``save_path`` is the model's base file name, not a folder, so only
        # its parent directory is created (an empty parent means the current
        # working directory, which needs no creation).
        parent_dir = os.path.dirname(self.save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def _on_step(self) -> bool:
        # Only look at the monitor logs every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), "timesteps")
        if len(x) == 0:
            # Nothing has been logged yet, so there is no reward to compare.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                    self.best_mean_reward, mean_reward
                )
            )

        # New best model: remember the score and overwrite the saved model.
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model at {} timesteps".format(x[-1]))
                print("Saving new best model to {}.zip".format(self.save_path))
            self.model.save(self.save_path)

        # This callback never interrupts training.
        return True

In [17]:
# Folder that receives both the Monitor logs and the saved best model.
log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

# Build a single monitored CartPole environment that logs into `log_dir`.
env = make_vec_env(
    "CartPole-v1",
    n_envs=1,
    monitor_dir=log_dir,
)
# The call above is equivalent to:
# env = gym.make('CartPole-v1')
# env = Monitor(env, log_dir)
# env = DummyVecEnv([lambda: env])

# Check the training reward every 20 calls and report progress.
callback = SaveOnBestTrainingRewardCallback(
    check_freq=20,
    log_dir=log_dir,
    verbose=1,
)

model = A2C("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=5000, callback=callback)

Num timesteps: 20
Best mean reward: -inf - Last mean reward per episode: 10.00
Saving new best model at 10 timesteps
Saving new best model to /tmp/gym/best_model.zip
Num timesteps: 40
Best mean reward: 10.00 - Last mean reward per episode: 11.00
Saving new best model at 22 timesteps
Saving new best model to /tmp/gym/best_model.zip
Num timesteps: 60
Best mean reward: 11.00 - Last mean reward per episode: 11.00
Num timesteps: 80
Best mean reward: 11.00 - Last mean reward per episode: 23.33
Saving new best model at 70 timesteps
Saving new best model to /tmp/gym/best_model.zip
Num timesteps: 100
Best mean reward: 23.33 - Last mean reward per episode: 23.33
Num timesteps: 120
Best mean reward: 23.33 - Last mean reward per episode: 23.33
Num timesteps: 140
Best mean reward: 23.33 - Last mean reward per episode: 30.75
Saving new best model at 123 timesteps
Saving new best model to /tmp/gym/best_model.zip
Num timesteps: 160
Best mean reward: 30.75 - Last mean reward per episode: 29.00
Num time

<stable_baselines3.a2c.a2c.A2C at 0x124ef2ba0>

## Second Example: Plotting the training reward in real-time

In [18]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook


class PlottingCallback(BaseCallback):
    """
    Callback for plotting the performance in realtime.

    The monitor data is read from the module-level ``log_dir`` variable, so
    that variable must be defined before training starts.

    :param verbose: (int)
    """
    def __init__(self, verbose=1):
        super().__init__(verbose)
        # ``(line, ax, fig)`` once the figure has been created, ``None`` before.
        self._plot = None

    def _on_step(self) -> bool:
        """
        Refresh the reward curve with the latest monitor data.

        :return: (bool) always ``True`` so that training continues.
        """
        # get the monitor's data
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if self._plot is None: # make the plot
            plt.ion()
            fig = plt.figure(figsize=(6,3))
            ax = fig.add_subplot(111)
            line, = ax.plot(x, y)
            self._plot = (line, ax, fig)
            plt.show()
        else: # update and rescale the plot
            line, ax, fig = self._plot
            line.set_data(x, y)
            ax.relim()
            # Pad the x axis by 2% of the training budget on both sides.
            total_timesteps = self.locals["total_timesteps"]
            ax.set_xlim([total_timesteps * -0.02, total_timesteps * 1.02])
            ax.autoscale_view(True,True,True)
            fig.canvas.draw()
        # The return value is the "continue training" flag. Falling off the end
        # would return None, which is falsy and can be taken as a request to
        # stop training.
        return True

# Folder in which the Monitor wrapper writes the episode logs read by the callback.
log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

# Build a single monitored MountainCarContinuous environment that logs into `log_dir`.
env = make_vec_env(
    'MountainCarContinuous-v0',
    n_envs=1,
    monitor_dir=log_dir,
)

plotting_callback = PlottingCallback()

# Train PPO for 10000 timesteps while the callback redraws the reward curve.
model = PPO('MlpPolicy', env, verbose=0)
model.learn(10000, callback=plotting_callback)

<IPython.core.display.Javascript object>

<stable_baselines3.ppo.ppo.PPO at 0x1273caf90>