In [23]:
import os

import gym
import numpy as np
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

"A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center." [CartPole-v1](https://gym.openai.com/envs/CartPole-v1/), 

<s>[Max episode length is 500](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py), therefore the max reward is also 500.</s>

This notebook uses a custom version of CartPole whose maximum episode length is 1000 steps; since every timestep yields a reward of +1, the maximum reward per episode is also 1000.

[Registering custom environments](https://stackoverflow.com/questions/42787924/why-is-episode-done-after-200-time-steps-gym-environment-mountaincar)

In [24]:
# Register a CartPole variant under the id "CartPole-v1k" with a longer episode limit.
# Running this cell a second time in the same kernel emits a UserWarning from
# gym's registration.py about overriding the already-registered environment.
# NOTE(review): the registry is presumably held in memory per Python process, so
# this cell likely has to be run again after a kernel restart — confirm.
cartpole_v1k_spec = {
    "id": "CartPole-v1k",
    "entry_point": "gym.envs.classic_control:CartPoleEnv",
    "max_episode_steps": 1000,  # CartPole-v1 uses 500
    "reward_threshold": 975.0,  # CartPole-v1 uses 475.0
}
gym.envs.register(**cartpole_v1k_spec)

In [26]:
# Instantiate the custom 1000-step CartPole environment registered above.
ENV_ID = 'CartPole-v1k'
env = gym.make(ENV_ID)

# Fresh (untrained) PPO agent with an MLP policy; verbose=0 silences training logs.
model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [28]:
# Train the agent in 30 rounds of nominally 500 timesteps each (15000 in total)
# and save a checkpoint after every round.
# NOTE(review): PPO collects complete rollouts of `n_steps` transitions (2048 by
# default in stable-baselines3) before each update, so a learn() call that asks for
# 500 timesteps presumably trains for more than that. Compare the labels used in
# the checkpoint names with model.num_timesteps before trusting them — confirm.
save_dir = "./models"
os.makedirs(f"{save_dir}/cartpole-v1k", exist_ok=True)

for i in range(1, 31):

    # Nominal number of timesteps trained so far; used as the checkpoint label
    iterations = i*500

    # Continue the same training run instead of starting a new one each round:
    # reset_num_timesteps=False keeps the model's timestep counter running
    # across successive learn() calls.
    model.learn(total_timesteps=500, reset_num_timesteps=False)

    # Save model
    model.save(f"{save_dir}/cartpole-v1k/cartpole_v1k_ppo_{iterations}")

In [None]:
# Evaluate every saved checkpoint and collect the mean / std episode reward.

# Defined in this cell too, so evaluation does not depend on the training cell
# having been run in the current kernel session (checkpoints may already be on disk).
save_dir = "./models"

data = {'timesteps': [], 'mean_reward': [], 'std_reward': []}

for i in range(1, 31):

    # Label of the checkpoint to evaluate (matches the names used when saving)
    timesteps = i*500

    # Load model; rebinding `model` drops the reference to the previous checkpoint
    model = PPO.load(f"{save_dir}/cartpole-v1k/cartpole_v1k_ppo_{timesteps}")

    # Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
    # NOTE(review): a new Monitor wrapper is created per checkpoint on purpose;
    # reusing a single wrapper across evaluate_policy calls may trip its
    # early-reset check — confirm before hoisting it out of the loop.
    mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)

    data['timesteps'].append(timesteps)
    data['mean_reward'].append(mean_reward)
    data['std_reward'].append(std_reward)

In [40]:
# Turn the collected evaluation results into a pandas.DataFrame indexed by
# timesteps and serialize it as a csv file.
df = pd.DataFrame(data).set_index('timesteps')

# to_csv does not create missing directories, so make sure the target exists
out_dir = "./out/training-log"
os.makedirs(out_dir, exist_ok=True)

# The file name carries the largest timestep label that was evaluated
last_timesteps = df.index[-1]
df.to_csv(f"{out_dir}/cartpole_v1k_ppo_{last_timesteps}.csv")