In [None]:
import gym

%load_ext tensorboard

import os

import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines import results_plotter
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.vec_env import DummyVecEnv

In [None]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Checkpoint the model whenever the mean training reward reaches a new high.

    Every ``check_freq`` calls, the results stored in ``log_dir`` are loaded,
    the rewards of the latest 100 episodes are averaged, and the model is saved
    when that average beats the best value seen so far (in practice,
    ``EvalCallback`` is the recommended tool for this).

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.best_mean_reward = -np.inf
        self.log_dir = log_dir
        self.check_freq = check_freq
        self.save_path = os.path.join(log_dir, 'best_model')

    def _init_callback(self) -> None:
        # Nothing to prepare when no save location is configured
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        # Only inspect the logs on every check_freq-th call
        if self.n_calls % self.check_freq != 0:
            return True

        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(timesteps) == 0:
            # No results have been logged yet
            return True

        # Average the training reward over the last 100 episodes
        recent_mean = np.mean(episode_rewards[-100:])
        chatty = self.verbose > 0
        if chatty:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, recent_mean))

        # A new record: remember it and save the agent
        if recent_mean > self.best_mean_reward:
            self.best_mean_reward = recent_mean
            if chatty:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True

In [None]:
# Folder used for the training logs; created up front if it is missing
log_dir = "tmp/"
os.makedirs(name=log_dir, exist_ok=True)

In [None]:
# Build the CartPole environment and wrap it in a Monitor pointed at log_dir
env = Monitor(gym.make('CartPole-v1'), log_dir)

# To start from an untrained agent instead of loading a saved one:
# model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./ppo_cartpole_tensorboard/")

In [None]:
# Vectorize the monitored environment. The isinstance guard keeps this cell
# safe to re-run: without it, a second run would wrap the DummyVecEnv again.
if not isinstance(env, DummyVecEnv):
    env = DummyVecEnv([lambda: env])

# Resume from the saved agent when a checkpoint exists; otherwise start from a
# fresh PPO2 model so the notebook also runs before any agent has been saved.
# The ".zip" variant is checked as well -- presumably the extension that save()
# appends when none is given; verify against the installed stable-baselines.
if os.path.exists("ppo2_CartPole") or os.path.exists("ppo2_CartPole.zip"):
    model = PPO2.load("ppo2_CartPole", env=env, tensorboard_log="./ppo_cartpole_tensorboard/")
else:
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./ppo_cartpole_tensorboard/")

In [None]:
# Optional: save the best model during training (check every 1000 steps)
# callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# model.learn(total_timesteps=time_steps, callback=callback)

# Training budget, in timesteps
time_steps = 10 ** 7

# Train the agent (no callback attached)
model.learn(total_timesteps=time_steps)

In [None]:
# Persist the trained agent under the name "ppo2_CartPole"
model.save(save_path="ppo2_CartPole")

In [None]:
# Drop the in-memory agent so the load below demonstrates a reload from disk
del model

# Bring the trained agent back from the saved file
model = PPO2.load(load_path="ppo2_CartPole")

In [None]:
# Plot the logged training results against timesteps
results_plotter.plot_results(
    dirs=[log_dir],
    num_timesteps=time_steps,
    xaxis=results_plotter.X_TIMESTEPS,
    task_name="PPO2 CartPole",
)
plt.show()

In [None]:
# Open TensorBoard on the same log directory that is passed as tensorboard_log when the model is loaded
%tensorboard --logdir ./ppo_cartpole_tensorboard/

In [None]:
# Enjoy trained agent: roll the policy out for 1000 rendered steps.
# try/finally guarantees env.close() still runs when the loop is interrupted
# (e.g. the kernel is stopped mid-rollout) or a step/render call raises.
try:
    obs = env.reset()
    for i in range(1000):
        # deterministic=False is passed through to predict() unchanged
        action, _states = model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            # Episode finished: start a new one before the next step
            obs = env.reset()
finally:
    env.close()