# Imports

In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

# Load Environment

In [2]:
# Gym environment id; reused later when the environment is re-created for training.
env_name = 'CarRacing-v0'
# Instantiate the environment used for the inspection and random-rollout cells below.
env = gym.make(env_name)

We inspect the action and observation spaces to decide which algorithm and policy type to use.

In [3]:
# Display the action space (a 3-component float32 Box, per the output below).
env.action_space

Box([-1.  0.  0.], [1. 1. 1.], (3,), float32)

In [4]:
# Display the observation space (a Box with values in 0..255, per the output below).
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

We render the environment while the agent takes random actions to get a feel for how it behaves.

In [5]:
# Play a few episodes with uniformly random actions to see what the environment looks like
# and what score an untrained agent gets.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Start a new episode; reset() returns the initial observation.
        obs = env.reset()
        done = False
        score = 0

        while not done:
            # Draw the current frame so the rollout can be watched.
            env.render()
            # Sample a random action from the action space.
            action = env.action_space.sample()
            # step() returns four values: next observation, reward, done flag, info.
            obs, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the environment even if the loop is interrupted or raises,
    # so the render window is not left hanging.
    env.close()

Track generation: 1296..1624 -> 328-tiles track




Episode:1 Score:-41.89602446483242
Track generation: 1221..1531 -> 310-tiles track
Episode:2 Score:-35.275080906149434
Track generation: 1183..1483 -> 300-tiles track
Episode:3 Score:-33.11036789297714
Track generation: 1291..1618 -> 327-tiles track
Episode:4 Score:-38.65030674846684
Track generation: 1085..1360 -> 275-tiles track
Episode:5 Score:-27.007299270073318


# Modeling

## Model 1: PPO, 100k timesteps

In [6]:
# Build a fresh environment for training (the one used for the random rollouts was closed)
# and wrap it in a DummyVecEnv, the vectorized interface Stable-Baselines3 works with.
# The factory creates the environment itself instead of closing over the global `env`,
# so it does not depend on what `env` is bound to at the moment the factory is called.
env = DummyVecEnv([lambda: gym.make(env_name)])

In [7]:
# Directory that receives the TensorBoard logs: Training/Logs.
log_path = os.path.join('Training', 'Logs')
# PPO agent with the CNN policy on the vectorized environment; verbose=0 silences training output.
model = PPO(
    policy='CnnPolicy',
    env=env,
    verbose=0,
    tensorboard_log=log_path,
)

In [8]:
%%time
# First training run: 100,000 environment timesteps. %%time reports how long the cell took.
model.learn(total_timesteps=100_000)

Track generation: 1188..1489 -> 301-tiles track
Track generation: 1076..1349 -> 273-tiles track
Track generation: 1116..1399 -> 283-tiles track
Track generation: 1084..1363 -> 279-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1295..1624 -> 329-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 981..1230 -> 249-tiles track
Track generation: 1123..1408 -> 285-tiles track
Track generation: 978..1230 -> 252-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1186..1487 -> 301-tiles track
Track generation: 1043..1308 -> 265-tiles track
Track generation: 1164..1459 -> 295-tiles track
Track generation: 1164..1463 -> 299-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1096..1374 -> 278-tiles track
Track generation: 1143..1440 -> 297-tiles track
Track gene

<stable_baselines3.ppo.ppo.PPO at 0x232399ac5e0>

### Evaluation

With the first round of training done, we evaluate the model. We aren't sure how good this score is on its own, so we'll train the model further and compare.

In [9]:
# Run the current policy for 10 rendered episodes. evaluate_policy returns the mean and
# standard deviation of the episode reward; keep them under model-specific names so the
# three training stages can be compared later instead of living only in cell output.
mean_reward_100k, std_reward_100k = evaluate_policy(model, env, n_eval_episodes=10, render=True)
mean_reward_100k, std_reward_100k



Track generation: 927..1168 -> 241-tiles track
Track generation: 999..1258 -> 259-tiles track
Track generation: 1168..1469 -> 301-tiles track
Track generation: 1121..1413 -> 292-tiles track
Track generation: 1148..1439 -> 291-tiles track
Track generation: 1155..1448 -> 293-tiles track
Track generation: 1001..1261 -> 260-tiles track
Track generation: 1080..1354 -> 274-tiles track
Track generation: 1259..1578 -> 319-tiles track
Track generation: 1243..1558 -> 315-tiles track
Track generation: 1077..1357 -> 280-tiles track


(-44.17115752696991, 21.888285325715312)

In [10]:
# Close the environment after the rendered evaluation.
# NOTE(review): `env` and `model` are used again by the training and evaluation cells
# below — confirm that reusing the environment after close() is intended.
env.close()

## Model 2: 300k timesteps

In [11]:
%%time
model.learn(total_timesteps=200_000)

Track generation: 1314..1647 -> 333-tiles track
Track generation: 1200..1504 -> 304-tiles track
Track generation: 1119..1403 -> 284-tiles track
Track generation: 1056..1334 -> 278-tiles track
Track generation: 1152..1444 -> 292-tiles track
Track generation: 1355..1698 -> 343-tiles track
Track generation: 1104..1391 -> 287-tiles track
Track generation: 1026..1295 -> 269-tiles track
Track generation: 1237..1557 -> 320-tiles track
Track generation: 1172..1469 -> 297-tiles track
Track generation: 1040..1304 -> 264-tiles track
Track generation: 1232..1552 -> 320-tiles track
Track generation: 1061..1330 -> 269-tiles track
Track generation: 1137..1425 -> 288-tiles track
Track generation: 1151..1443 -> 292-tiles track
Track generation: 1056..1324 -> 268-tiles track
Track generation: 907..1140 -> 233-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1096..1374 -> 278-tiles track
Track generation: 970..1224 -> 254-tiles track
Track gene

Track generation: 1074..1349 -> 275-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1197..1500 -> 303-tiles track
Track generation: 1175..1478 -> 303-tiles track
Track generation: 1013..1272 -> 259-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1076..1349 -> 273-tiles track
Track generation: 1196..1499 -> 303-tiles track
Track generation: 1084..1359 -> 275-tiles track
Track generation: 1324..1659 -> 335-tiles track
Track generation: 1108..1389 -> 281-tiles track
Track generation: 1335..1679 -> 344-tiles track
Track generation: 1152..1444 -> 292-tiles track
Track generation: 1056..1324 -> 268-tiles track
Track generation: 1091..1376 -> 285-tiles track
Track generation: 1240..1554 -> 314-tiles track
Track generation: 1153..1446 -> 293-tiles track
Track generation: 1033..1304 -> 271-tiles track
Track generation: 1113..1395 -> 282-tiles track
Track generation: 1071..

<stable_baselines3.ppo.ppo.PPO at 0x232399ac5e0>

### Evaluation

We might be on the wrong track here. The mean score decreased (from about -44 to about -63) and the standard deviation decreased as well (from about 22 to about 10). The car seems to be consistently making the same wrong movements. Perhaps more training could fix it.

In [12]:
# Run the policy after 300k timesteps for 10 rendered episodes. evaluate_policy returns the
# mean and standard deviation of the episode reward; keep them under model-specific names
# so they can be compared with the other training stages.
mean_reward_300k, std_reward_300k = evaluate_policy(model, env, n_eval_episodes=10, render=True)
mean_reward_300k, std_reward_300k

Track generation: 1180..1479 -> 299-tiles track
Track generation: 1281..1605 -> 324-tiles track
Track generation: 1128..1414 -> 286-tiles track
Track generation: 1045..1313 -> 268-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1136..1428 -> 292-tiles track
Track generation: 1058..1327 -> 269-tiles track
Track generation: 1363..1708 -> 345-tiles track
Track generation: 1135..1433 -> 298-tiles track
Track generation: 1273..1595 -> 322-tiles track
Track generation: 1047..1313 -> 266-tiles track
Track generation: 1074..1350 -> 276-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1127..1413 -> 286-tiles track
Track generation: 1130..1415 -> 285-tiles track


(-63.48859500139952, 9.98924971748378)

In [13]:
# Optional checkpoint (disabled): uncomment to save the 300k-timestep model under
# Training/Saved Models/ppo_car_300k. The next training cell keeps updating this same
# `model` object, so this is the only point at which the 300k state can be saved.
# path = os.path.join('Training', 'Saved Models', 'ppo_car_300k')
# model.save(path)

## Model 3: 500k timesteps

In [14]:
%%time
model.learn(total_timesteps=200_000)

Track generation: 1224..1534 -> 310-tiles track
Track generation: 1057..1325 -> 268-tiles track
Track generation: 999..1253 -> 254-tiles track
Track generation: 1007..1269 -> 262-tiles track
Track generation: 1197..1510 -> 313-tiles track
Track generation: 1265..1585 -> 320-tiles track
Track generation: 971..1218 -> 247-tiles track
Track generation: 1183..1483 -> 300-tiles track
Track generation: 1093..1379 -> 286-tiles track
Track generation: 1066..1337 -> 271-tiles track
Track generation: 1123..1446 -> 323-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1079..1353 -> 274-tiles track
Track generation: 1169..1465 -> 296-tiles track
Track generation: 1111..1399 -> 288-tiles track
Track generation: 1259..1578 -> 319-tiles track
Track generation: 1148..1439 -> 291-tiles track
Track generation: 1188..1489 -> 301-tiles track
Track generation: 1122..1407 -> 285-tiles track
Track generation: 1068..1339 -> 271-tiles track
Track gene

Track generation: 1025..1292 -> 267-tiles track
Track generation: 1157..1450 -> 293-tiles track
Track generation: 1213..1520 -> 307-tiles track
Track generation: 1107..1388 -> 281-tiles track
Track generation: 1190..1501 -> 311-tiles track
Track generation: 1315..1648 -> 333-tiles track
Track generation: 1199..1503 -> 304-tiles track
Track generation: 1195..1498 -> 303-tiles track
Track generation: 1228..1540 -> 312-tiles track
Track generation: 937..1178 -> 241-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1095..1373 -> 278-tiles track
Track generation: 1063..1337 -> 274-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1247..1563 -> 316-tiles track
Track generation: 1162..1457 -> 295-tiles track
Track generation: 1077..1350 -> 273-tiles track
Track generation: 1215..1523 -> 308-tiles track
Track generation: 940..1186 -> 246-tiles track
Track generation: 1103..13

<stable_baselines3.ppo.ppo.PPO at 0x232399ac5e0>

### Evaluation

By far the worst-performing model. It seems to have solidified its bad behavior, since the standard deviation has decreased again. More time will be needed to tune the model, as well as to come up with alternative training approaches, such as creating custom classes.

In [15]:
# Run the policy after 500k timesteps for 10 rendered episodes. evaluate_policy returns the
# mean and standard deviation of the episode reward; keep them under model-specific names
# so they can be compared with the other training stages.
mean_reward_500k, std_reward_500k = evaluate_policy(model, env, n_eval_episodes=10, render=True)
mean_reward_500k, std_reward_500k

Track generation: 1075..1349 -> 274-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1072..1351 -> 279-tiles track
Track generation: 1219..1528 -> 309-tiles track
Track generation: 1188..1489 -> 301-tiles track
Track generation: 1099..1378 -> 279-tiles track
Track generation: 1311..1643 -> 332-tiles track
Track generation: 1127..1413 -> 286-tiles track
Track generation: 1149..1441 -> 292-tiles track
Track generation: 1204..1509 -> 305-tiles track
Track generation: 1098..1382 -> 284-tiles track
Track generation: 1116..1399 -> 283-tiles track
Track generation: 1120..1404 -> 284-tiles track


(-77.73669624701142, 6.151285143364425)

In [16]:
# Close the environment now that the 500k-timestep evaluation has finished.
env.close()

In [17]:
# Optional checkpoint (disabled): uncomment to save the 500k-timestep model under
# Training/Saved Models/ppo_car_500k.
# path = os.path.join('Training', 'Saved Models', 'ppo_car_500k')
# model.save(path)