In [5]:
from pathlib import Path

import gym
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

"Easiest continuous control task to learn from pixels, a top-down racing environment. Discrete control is reasonable in this environment as well, on/off discretisation is fine. State consists of 96x96 pixels. Reward is -0.1 every frame and +1000/N for every track tile visited, where N is the total number of tiles in the track. For example, if you have finished in 732 frames, your reward is 1000 - 0.1*732 = 926.8 points. Episode finishes when all tiles are visited. Some indicators are shown at the bottom of the window and the state RGB buffer. From left to right: true speed, four ABS sensors, steering wheel position, gyroscope." - [CarRacing-v0](https://gym.openai.com/envs/CarRacing-v0/) ([Source](https://github.com/openai/gym/blob/master/gym/envs/box2d/car_racing.py))

In [6]:
# Build the racing environment and a fresh, untrained PPO learner on top of it.
# NOTE(review): the environment description states the observation is 96x96 pixels,
# yet an MLP policy is used here - confirm this is intentional (Stable-Baselines3
# also ships a CNN policy meant for image inputs).
env = gym.make("CarRacing-v1")
model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [None]:
# Train agent for 500000 timesteps, saving a checkpoint every 5000.
#
# PPO only stops at rollout boundaries: learn() always collects whole rollouts
# of `n_steps` environment steps (2048 by default in Stable-Baselines3), so
# requesting 1000 fresh steps per call with the counter reset each time trains
# for roughly twice as many steps as the progress/checkpoint labels claim.
# Instead, keep the model's own counter running (reset_num_timesteps=False) and
# only request the steps still missing to reach each target. Every label is then
# a lower bound that is off by less than one rollout.
TOTAL_TIMESTEPS = 500000
REPORT_EVERY = 1000
CHECKPOINT_EVERY = 5000
save_dir = "./models/"

for i in range(1, TOTAL_TIMESTEPS // REPORT_EVERY + 1):

    iterations = i * REPORT_EVERY

    # Train only as far as needed to reach this target; a previous rollout
    # may already have carried the counter past it.
    remaining = iterations - model.num_timesteps
    if remaining > 0:
        model.learn(total_timesteps=remaining, reset_num_timesteps=False)

    print(f"\n-----\nProgress:{iterations}/{TOTAL_TIMESTEPS}\n-----\n")

    if iterations % CHECKPOINT_EVERY == 0:
        # Save model (same file names as before, without the doubled slash)
        model.save(f"{save_dir}carracing-v1/carracing_v1_ppo_{iterations}")

In [11]:
# Load the checkpoint saved at the 500000-timestep mark.
# Rebinding `model` is enough to release the previous instance; an explicit
# `del model` first would leave the name unbound if the load fails, so that
# re-running this cell would raise NameError instead of the real error.
save_dir = "./models/"
model = PPO.load(f"{save_dir}carracing-v1/carracing_v1_ppo_500000")

# Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=50)

print(f"mean_reward (agent trained over 500000 timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")

Track generation: 1284..1609 -> 325-tiles track
Track generation: 1100..1373 -> 273-tiles track
Track generation: 1127..1413 -> 286-tiles track
Track generation: 1188..1489 -> 301-tiles track
Track generation: 1120..1404 -> 284-tiles track
Track generation: 1029..1292 -> 263-tiles track
Track generation: 1278..1602 -> 324-tiles track
Track generation: 1131..1418 -> 287-tiles track
Track generation: 1090..1371 -> 281-tiles track
Track generation: 1187..1488 -> 301-tiles track
Track generation: 1107..1388 -> 281-tiles track
Track generation: 1172..1469 -> 297-tiles track
Track generation: 1064..1334 -> 270-tiles track
Track generation: 1110..1392 -> 282-tiles track
Track generation: 1030..1294 -> 264-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1026..1295 -> 269-tiles track
Track generation: 1192..1494 -> 302-tiles track
Track generation: 1084..1359 -> 275-tiles track
Track generation: 1032..1294 -> 262-tiles track
Track ge

In [None]:
# Evaluate every saved checkpoint (one per 5000 training timesteps) and collect
# the reward statistics, keyed by column name, for export.
data = {'timesteps': [], 'mean_reward': [], 'std_reward': []}

# Step directly over the checkpointed timestep counts instead of looping over
# all 500 progress marks and filtering.
for timesteps in range(5000, 500001, 5000):
    # Load model. Rebinding replaces the previous instance; no `del` is needed,
    # and `model` stays defined if a checkpoint fails to load.
    model = PPO.load(save_dir + f"/carracing-v1/carracing_v1_ppo_{timesteps}")

    # Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
    # NOTE(review): a fresh Monitor wrapper is created per evaluation on purpose;
    # reusing one wrapper across evaluate_policy calls may trip Monitor's
    # early-reset check - confirm before hoisting it out of the loop.
    mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)

    print(f"mean_reward (agent trained over {timesteps} timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")

    data['timesteps'].append(timesteps)
    data['mean_reward'].append(mean_reward)
    data['std_reward'].append(std_reward)

In [None]:
# Turn data into pandas.DataFrame (indexed by timesteps) and serialize as csv file
df = pd.DataFrame(data).set_index('timesteps')

# The file name is derived from the last evaluated checkpoint, so there must be one.
if df.empty:
    raise ValueError("No evaluation results to export; run the evaluation cell first.")

# to_csv does not create missing directories, so make sure the target exists.
out_dir = Path("./out/training-log")
out_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(out_dir / f"carracing_v1_ppo_{df.index[-1]}.csv")