In [46]:
import gym
import numpy as np

"A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center." [CartPole-v1](https://gym.openai.com/envs/CartPole-v1/), 

[The maximum episode length is 500 timesteps](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py). Because the agent receives a reward of +1 for every timestep, the maximum reward per episode is also 500.

In [47]:
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy

In [48]:
# Fixed seed so that a fresh "Restart & Run All" reproduces the training and
# evaluation numbers as closely as the underlying libraries allow.
SEED = 42

# Training environment.
env = gym.make('CartPole-v1')

# PPO agent using the MlpPolicy class imported above; verbose=0 keeps
# training silent so only the evaluation printouts appear in the notebook.
model = PPO(MlpPolicy, env, verbose=0, seed=SEED)

In [49]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [50]:
# Training schedule: N_ROUNDS calls to learn(), with an evaluation and a
# checkpoint every EVAL_EVERY rounds.
N_ROUNDS = 25
STEPS_PER_ROUND = 1000   # the "{i}'000" label and "{i}K" file name below assume 1000
EVAL_EVERY = 5
N_EVAL_EPISODES = 100

# Root directory for saved models (hoisted out of the loop; value unchanged).
save_dir = "./models/"

# Evaluate on a dedicated environment, wrapped in Monitor exactly once, so that
# evaluation episodes never step the environment object the agent trains on and
# a fresh Monitor is not created around `env` at every evaluation.
# For info on why the env is wrapped with Monitor check the evaluate_policy function.
eval_env = Monitor(gym.make('CartPole-v1'))

for i in range(1, N_ROUNDS + 1):

    # Request 1'000 training steps for this round.
    # NOTE(review): PPO collects experience in whole rollouts of `n_steps`
    # transitions, so each call presumably trains on more than 1'000
    # environment steps with the default settings. The "{i}'000 timesteps"
    # label is therefore nominal; confirm against the PPO docs.
    model.learn(total_timesteps=STEPS_PER_ROUND)

    if i % EVAL_EVERY == 0:

        # Evaluate the current policy on the separate evaluation environment.
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=N_EVAL_EPISODES)
        print(f"mean_reward (agent trained over {i}'000 timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")

        # Save a checkpoint. `save_dir` already ends with "/", so the
        # sub-path must not start with one (the original produced "./models//...").
        model.save(save_dir + f"cartpole-v1/cartpole_v1_ppo_{i}K")

# Release the evaluation environment's resources.
eval_env.close()

mean_reward (agent trained over 5'000 timesteps): 290.61 +/- 101.96
mean_reward (agent trained over 10'000 timesteps): 490.51 +/- 30.90
mean_reward (agent trained over 15'000 timesteps): 500.00 +/- 0.00
mean_reward (agent trained over 20'000 timesteps): 500.00 +/- 0.00
mean_reward (agent trained over 25'000 timesteps): 500.00 +/- 0.00
