In [2]:
import gym
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor




"Easiest continuous control task to learn from pixels, a top-down racing environment. Discreet control is reasonable in this environment as well, on/off discretisation is fine. State consists of 96x96 pixels. Reward is -0.1 every frame and +1000/N for every track tile visited, where N is the total number of tiles in track. For example, if you have finished in 732 frames, your reward is 1000 - 0.1*732 = 926.8 points. Episode finishes when all tiles are visited. Some indicators shown at the bottom of the window and the state RGB buffer. From left to right: true speed, four ABS sensors, steering wheel position, gyroscope." - [CarRacing-v0](https://gym.openai.com/envs/CarRacing-v0/) ([Source](https://github.com/openai/gym/blob/master/gym/envs/box2d/car_racing.py))

In [8]:
env = gym.make('CarRacing-v1')

model = PPO(MlpPolicy, env, verbose=0)

In [None]:
# Train agent for 1500 timesteps
for i in range(1,31):

    iterations = i*500

    # Train the agent for 500 steps at a time
    model.learn(total_timesteps=500)

    # Save model
    save_dir = "./models/"
    model.save(save_dir + f"/carracing-v1/carracing_v1_ppo_{iterations}")

In [9]:
# Load model
save_dir = "./models/"
del model
model = PPO.load(save_dir + f"/carracing-v1/carracing_v1_ppo_15000")

# Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=10)

print(f"mean_reward (agent trained over {7500} timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")

Track generation: 1224..1534 -> 310-tiles track
Track generation: 1144..1434 -> 290-tiles track
Track generation: 1136..1433 -> 297-tiles track
Track generation: 1271..1593 -> 322-tiles track
Track generation: 1101..1388 -> 287-tiles track
Track generation: 1182..1481 -> 299-tiles track
Track generation: 1064..1337 -> 273-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1159..1453 -> 294-tiles track
Track generation: 1165..1461 -> 296-tiles track
Track generation: 1098..1376 -> 278-tiles track
Track generation: 1053..1323 -> 270-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1128..1414 -> 286-tiles track
Track generation: 1128..1421 -> 293-tiles track
mean_reward (agent trained over 7500 timesteps): -69.43 +/- 1.21


In [None]:
data = {'timesteps': [], 'mean_reward': [], 'std_reward': []}

for i in range(1,31):

    timesteps = i*500

    # Load model
    del model
    model = PPO.load(save_dir + f"/carracing-v1/carracing_v1_ppo_{timesteps}")    
    
    # Evaluate the trained agent - for info on why the env is wrapped with Monitor check the evaluate_policy function
    mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)
    
    print(f"mean_reward (agent trained over {timesteps} timesteps): {mean_reward:.2f} +/- {std_reward:.2f}")
    
    data['timesteps'].append(timesteps)
    data['mean_reward'].append(mean_reward)
    data['std_reward'].append(std_reward)

In [None]:
# Turn data into pandas.DataFrame and serialize as csv file
df = pd.DataFrame(data, columns=data.keys())
df.set_index('timesteps', inplace=True)

df.to_csv(f"./out/training-log/carracing_v1_ppo_{df.index.to_list()[-1]}.csv")