In [23]:
import gym
import numpy as np

"A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center." — [CartPole-v1](https://gym.openai.com/envs/CartPole-v1/)

<s>[Max episode length is 500](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py), therefore the max reward is also 500.</s>

This notebook uses a custom version of CartPole whose maximum episode length is 1000 steps, so the maximum reward is also 1000.

[Registering custom environments](https://stackoverflow.com/questions/42787924/why-is-episode-done-after-200-time-steps-gym-environment-mountaincar)

In [24]:
# Register a CartPole variant with a longer episode limit under the id "CartPole-v1k".
# Running this cell a second time in the same kernel emits
#   UserWarning: WARN: Overriding environment ...
# from PATH\TO\ENV\lib\site-packages\gym\envs\registration.py (line 595 in the install used here).
# NOTE(review): the registry presumably lives only in the running process, so this cell
# likely has to be executed again after a kernel restart — confirm.
cartpole_v1k_spec = dict(
    id="CartPole-v1k",
    entry_point="gym.envs.classic_control:CartPoleEnv",
    max_episode_steps=1000,  # CartPole-v1 uses 500
    reward_threshold=975.0,  # CartPole-v1 uses 475.0
)
gym.envs.register(**cartpole_v1k_spec)

In [25]:
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy

In [26]:
# Build the environment registered under the custom "CartPole-v1k" id.
env_id = 'CartPole-v1k'
env = gym.make(env_id)

# PPO agent with an MLP policy acting on that environment, verbosity level 0.
# NOTE(review): no seed is passed, so runs are presumably not reproducible — confirm
# whether that is intended.
model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [27]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [28]:
# Train the PPO agent in 30 consecutive chunks and checkpoint the model after each one,
# so that the learning progress can be evaluated afterwards from the saved files.

# Checkpoint root; constant across iterations (also read by the evaluation cell),
# so it is defined once instead of being reassigned on every pass.
save_dir = "./models/"

for i in range(1, 31):

    # Nominal cumulative training budget after this chunk; only used to label the checkpoint.
    iterations = i * 500

    # Continue training the same model with a requested budget of 500 timesteps.
    # NOTE(review): SB3's PPO collects whole rollouts of `n_steps` environment steps
    # (default 2048, not overridden where the model is built), so each call presumably
    # trains on more than 500 steps and the checkpoint labels understate the real
    # amount of training — confirm.
    model.learn(total_timesteps=500)

    # Save model under ./models/cartpole-v1k/ (single separator; the previous
    # concatenation produced "./models//cartpole-v1k/...", which names the same file).
    model.save(f"{save_dir}cartpole-v1k/cartpole_v1k_ppo_{iterations}")

In [38]:
# Evaluate every saved checkpoint over 100 episodes and collect the statistics
# (one entry per checkpoint) for later export.
data = {'timesteps': [], 'mean_reward': [], 'std_reward': []}

# Same location the training loop writes to. Defined here so that this cell does not
# depend on a variable leaked from the (expensive) training cell.
save_dir = "./models/"

for i in range(1, 31):

    # Label of the checkpoint to evaluate (matches the names used when saving).
    timesteps = i * 500

    # Load model. Rebinding the name drops the previous model; the former explicit
    # `del model` was redundant and raised NameError when no model was bound yet.
    model = PPO.load(f"{save_dir}cartpole-v1k/cartpole_v1k_ppo_{timesteps}")

    # Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
    # A fresh Monitor wrapper is created for every checkpoint on purpose.
    # NOTE(review): reusing a single Monitor across evaluate_policy calls presumably trips
    # its early-reset guard — confirm before hoisting it out of the loop.
    mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)
    print(f"mean_reward (agent trained over {timesteps} timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")
    data['timesteps'].append(timesteps)
    data['mean_reward'].append(mean_reward)
    data['std_reward'].append(std_reward)

mean_reward (agent trained over 500 timesteps): 113.46 +/- 84.18
mean_reward (agent trained over 1000 timesteps): 280.72 +/- 190.36
mean_reward (agent trained over 1500 timesteps): 282.94 +/- 222.04
mean_reward (agent trained over 2000 timesteps): 245.23 +/- 168.19
mean_reward (agent trained over 2500 timesteps): 377.14 +/- 197.35
mean_reward (agent trained over 3000 timesteps): 472.94 +/- 232.75
mean_reward (agent trained over 3500 timesteps): 509.95 +/- 231.40
mean_reward (agent trained over 4000 timesteps): 700.82 +/- 254.36
mean_reward (agent trained over 4500 timesteps): 492.70 +/- 151.78
mean_reward (agent trained over 5000 timesteps): 793.59 +/- 246.56
mean_reward (agent trained over 5500 timesteps): 869.34 +/- 194.30
mean_reward (agent trained over 6000 timesteps): 881.09 +/- 185.32
mean_reward (agent trained over 6500 timesteps): 922.75 +/- 131.46
mean_reward (agent trained over 7000 timesteps): 915.06 +/- 155.20
mean_reward (agent trained over 7500 timesteps): 870.82 +/- 168.

In [39]:
import pandas as pd

In [40]:
# Turn data into pandas.DataFrame and serialize as csv file
from pathlib import Path

# Columns follow the insertion order of the `data` dict; 'timesteps' becomes the index.
# A new frame is returned instead of mutating in place, so the cell can be re-run safely.
df = pd.DataFrame(data).set_index('timesteps')

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
out_dir = Path("./out/cartpole-v1k")
out_dir.mkdir(parents=True, exist_ok=True)

# The file name carries the last timestep value present in the index.
df.to_csv(out_dir / f"cartpole_v1k_ppo_{df.index[-1]}.csv")