In [19]:
import gym
import numpy as np

"A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center." — [CartPole-v1](https://gym.openai.com/envs/CartPole-v1/)

<s>[Max episode length is 500](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py), therefore the max reward is also 500.</s>

This notebook uses a custom version of CartPole whose maximum episode length is 1000 steps instead of 500, so the maximum reward per episode is also 1000.

[Registering custom environments](https://stackoverflow.com/questions/42787924/why-is-episode-done-after-200-time-steps-gym-environment-mountaincar)

In [20]:
# Register the 1000-step CartPole variant with gym.
#
# The registration is held in gym's in-process registry (it is not written to disk),
# so this cell must run once per kernel session. Registering an id that already
# exists makes gym emit
#   UserWarning: WARN: Overriding environment CartPole-v1k
# from PATH\TO\ENV\lib\site-packages\gym\envs\registration.py, so the call is
# guarded to keep the cell safe to re-run.
ENV_ID = "CartPole-v1k"

# Depending on the gym release, the registered specs are either stored in
# `registry.env_specs` (older releases) or `registry` is itself a dict keyed by
# env id (newer releases); support both layouts.
_registered_specs = getattr(gym.envs.registry, "env_specs", gym.envs.registry)

if ENV_ID not in _registered_specs:
    gym.envs.register(
        id=ENV_ID,
        entry_point="gym.envs.classic_control:CartPoleEnv",
        max_episode_steps=1000, # CartPole-v1 uses 500
        reward_threshold=975.0, # CartPole-v1 uses 475.0
    )

In [21]:
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy

In [22]:
# Build the custom 1000-step CartPole environment from its registered id.
env = gym.make("CartPole-v1k")

# PPO agent using the MLP policy imported above, attached to that environment.
model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [23]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [24]:
for i in range(1, 21):

    # Continue training the same agent for another (nominal) 1'000 steps.
    # NOTE(review): PPO collects whole rollouts of n_steps (SB3 default: 2048), so a
    # single learn() call may consume more than 1'000 steps - confirm before relying
    # on the step counts used in the labels and file names below.
    model.learn(total_timesteps=1000)

    # Score the current agent over 100 episodes. The env is wrapped in Monitor for
    # evaluation (the evaluate_policy function explains why); a fresh wrapper is
    # created on every iteration.
    eval_env = Monitor(env)
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
    print(f"mean_reward (agent trained over {i}'000 timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")

    # Checkpoint the agent reached after this iteration.
    save_dir = "./models/"
    checkpoint_path = save_dir + f"/cartpole-v1k/cartpole_v1k_ppo_{i}K"
    model.save(checkpoint_path)

mean_reward (agent trained over 1'000 timesteps): 330.41 +/- 270.76
mean_reward (agent trained over 2'000 timesteps): 399.02 +/- 264.97
mean_reward (agent trained over 3'000 timesteps): 349.32 +/- 229.78
mean_reward (agent trained over 4'000 timesteps): 413.11 +/- 246.12
mean_reward (agent trained over 5'000 timesteps): 421.51 +/- 228.71
mean_reward (agent trained over 6'000 timesteps): 454.90 +/- 217.08
mean_reward (agent trained over 7'000 timesteps): 475.73 +/- 256.68
mean_reward (agent trained over 8'000 timesteps): 601.47 +/- 253.89
mean_reward (agent trained over 9'000 timesteps): 764.44 +/- 250.46
mean_reward (agent trained over 10'000 timesteps): 599.67 +/- 219.13
mean_reward (agent trained over 11'000 timesteps): 823.81 +/- 212.27
mean_reward (agent trained over 12'000 timesteps): 896.98 +/- 150.79
mean_reward (agent trained over 13'000 timesteps): 902.53 +/- 154.34
mean_reward (agent trained over 14'000 timesteps): 951.84 +/- 96.14
mean_reward (agent trained over 15'000 times