In [1]:
import gym
import random
import csv
import time
from itertools import product
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


# these are the hyperparameters we can edit
learning_rates = [0.1, 0.01, 0.001]
n_steps = [512]
batch_sizes = [128]
gammas = [0.9, 0.95, 0.99, 0.999]
gae_lambdas = [0.95, 0.90, 0.85]
clip_ranges = [0.1, 0.2, 0.3]
ent_coefs = [0.0, 0.1, 0.01]
epsilons = [1e-5, 1e-6]

#648 combinations x 45.5 seconds should be 29484

best_hyperparams = None
best_score = -float('inf')

with open('hyperparam_results_grid_search_bipedal_hardcore_100,000.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['learning_rate', 'n_steps', 'batch_size', 'gamma', 'gae_lambda', 'clip_range', 'ent_coef', 'epsilon', 'mean_reward', 'training_time', 'standard deviation'])

    for hyperparams in product(learning_rates, n_steps, batch_sizes, gammas, gae_lambdas, clip_ranges, ent_coefs, epsilons):
        learning_rate, n_steps, batch_size, gamma, gae_lambda, clip_range, ent_coef, epsilon = hyperparams

        from gym.envs.box2d import BipedalWalker

        class BipedalWalkerHardcore(BipedalWalker):
            hardcore = True

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)


        environment_name = 'BipedalWalkerHardcore-v3'
        gym.register(
            id=environment_name,
            entry_point='__main__:BipedalWalkerHardcore',
            max_episode_steps=1600,
            reward_threshold=300,
        )

        env = gym.make(environment_name)


        model = PPO('MlpPolicy', env,
                    learning_rate=learning_rate,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    clip_range=clip_range,
                    ent_coef=ent_coef,
                    clip_range_vf=epsilon,
                    verbose=1)

        print("Training with hyperparameters:", hyperparams)

        start_time = time.time()
        model.learn(total_timesteps=100000) #100 thousand steps
        end_time = time.time()

        training_time = end_time - start_time

        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
        print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))

        if mean_reward > best_score:
            best_hyperparams = hyperparams
            best_score = mean_reward
            print("New best hyperparameters found:", best_hyperparams)
            print("New best score found:", best_score)

        with open('hyperparam_results_grid_search_bipedal_hardcore_100,000.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow((learning_rate, n_steps, batch_size, gamma, gae_lambda, clip_range, ent_coef, epsilon, mean_reward,training_time, std_reward))

    print("Best hyperparameters:", best_hyperparams)
    print("Best score:", best_score)


  logger.warn("Overriding environment {}".format(id))


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training with hyperparameters: (0.1, 512, 128, 0.9, 0.95, 0.1, 0.0, 1e-05)
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 74.8     |
|    ep_rew_mean     | -112     |
| time/              |          |
|    fps             | 1524     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 512      |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 74.8      |
|    ep_rew_mean          | -112      |
| time/                   |           |
|    fps                  | 1388      |
|    iterations           | 2         |
|    time_elapsed         | 0         |
|    total_timesteps      | 1024      |
| train/                  |           |
|    approx_kl            | 255.04086 |
|    clip_fraction        | 0.974     |
|    clip_r

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 125       |
|    ep_rew_mean          | -120      |
| time/                   |           |
|    fps                  | 1282      |
|    iterations           | 11        |
|    time_elapsed         | 4         |
|    total_timesteps      | 5632      |
| train/                  |           |
|    approx_kl            | 126.10528 |
|    clip_fraction        | 0.954     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -11.3     |
|    explained_variance   | -8.26e-05 |
|    learning_rate        | 0.1       |
|    loss                 | 328       |
|    n_updates            | 100       |
|    policy_gradient_loss | 0.193     |
|    std                  | 11.6      |
|    value_loss           | 642       |
---------------------------------------
---------------------------------------
| rollout/                |           |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 233       |
|    ep_rew_mean          | -125      |
| time/                   |           |
|    fps                  | 1254      |
|    iterations           | 20        |
|    time_elapsed         | 8         |
|    total_timesteps      | 10240     |
| train/                  |           |
|    approx_kl            | 7727.2114 |
|    clip_fraction        | 0.967     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -7.24     |
|    explained_variance   | -2.38e-07 |
|    learning_rate        | 0.1       |
|    loss                 | 6.47      |
|    n_updates            | 190       |
|    policy_gradient_loss | 0.172     |
|    std                  | 1.71      |
|    value_loss           | 12.7      |
---------------------------------------
---------------------------------------
| rollout/                |           |


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 315      |
|    ep_rew_mean          | -128     |
| time/                   |          |
|    fps                  | 1255     |
|    iterations           | 29       |
|    time_elapsed         | 11       |
|    total_timesteps      | 14848    |
| train/                  |          |
|    approx_kl            | 3541.012 |
|    clip_fraction        | 0.954    |
|    clip_range           | 0.1      |
|    clip_range_vf        | 1e-05    |
|    entropy_loss         | -6.49    |
|    explained_variance   | 0        |
|    learning_rate        | 0.1      |
|    loss                 | 281      |
|    n_updates            | 280      |
|    policy_gradient_loss | 0.145    |
|    std                  | 1.29     |
|    value_loss           | 560      |
--------------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean         

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 355       |
|    ep_rew_mean          | -128      |
| time/                   |           |
|    fps                  | 1243      |
|    iterations           | 39        |
|    time_elapsed         | 16        |
|    total_timesteps      | 19968     |
| train/                  |           |
|    approx_kl            | 4016.751  |
|    clip_fraction        | 0.966     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -7.15     |
|    explained_variance   | -0.000247 |
|    learning_rate        | 0.1       |
|    loss                 | 261       |
|    n_updates            | 380       |
|    policy_gradient_loss | 0.176     |
|    std                  | 2.42      |
|    value_loss           | 490       |
---------------------------------------
---------------------------------------
| rollout/                |           |


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 327      |
|    ep_rew_mean          | -125     |
| time/                   |          |
|    fps                  | 1231     |
|    iterations           | 48       |
|    time_elapsed         | 19       |
|    total_timesteps      | 24576    |
| train/                  |          |
|    approx_kl            | 8816.78  |
|    clip_fraction        | 0.974    |
|    clip_range           | 0.1      |
|    clip_range_vf        | 1e-05    |
|    entropy_loss         | -7.45    |
|    explained_variance   | 0        |
|    learning_rate        | 0.1      |
|    loss                 | 13.2     |
|    n_updates            | 470      |
|    policy_gradient_loss | 0.138    |
|    std                  | 8.36     |
|    value_loss           | 26.1     |
--------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean       

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 322       |
|    ep_rew_mean          | -125      |
| time/                   |           |
|    fps                  | 1222      |
|    iterations           | 57        |
|    time_elapsed         | 23        |
|    total_timesteps      | 29184     |
| train/                  |           |
|    approx_kl            | 3079.8113 |
|    clip_fraction        | 0.972     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -8.93     |
|    explained_variance   | 1.85e-06  |
|    learning_rate        | 0.1       |
|    loss                 | 244       |
|    n_updates            | 560       |
|    policy_gradient_loss | 0.251     |
|    std                  | 10        |
|    value_loss           | 521       |
---------------------------------------
--------------------------------------
| rollout/                |          |
| 

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 141       |
|    ep_rew_mean          | -112      |
| time/                   |           |
|    fps                  | 1216      |
|    iterations           | 66        |
|    time_elapsed         | 27        |
|    total_timesteps      | 33792     |
| train/                  |           |
|    approx_kl            | 3.9338503 |
|    clip_fraction        | 0.947     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -17.5     |
|    explained_variance   | -1.19e-07 |
|    learning_rate        | 0.1       |
|    loss                 | 391       |
|    n_updates            | 650       |
|    policy_gradient_loss | 0.279     |
|    std                  | 23.3      |
|    value_loss           | 845       |
---------------------------------------
---------------------------------------
| rollout/                |           |


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60.3        |
|    ep_rew_mean          | -107        |
| time/                   |             |
|    fps                  | 1216        |
|    iterations           | 75          |
|    time_elapsed         | 31          |
|    total_timesteps      | 38400       |
| train/                  |             |
|    approx_kl            | 0.015221149 |
|    clip_fraction        | 0.531       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -18.2       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 255         |
|    n_updates            | 740         |
|    policy_gradient_loss | 0.0183      |
|    std                  | 28.2        |
|    value_loss           | 514         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 56.6        |
|    ep_rew_mean          | -106        |
| time/                   |             |
|    fps                  | 1222        |
|    iterations           | 84          |
|    time_elapsed         | 35          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010906144 |
|    clip_fraction        | 0.47        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -18.3       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 429         |
|    n_updates            | 830         |
|    policy_gradient_loss | 0.0202      |
|    std                  | 28.9        |
|    value_loss           | 864         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 57.2        |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 1229        |
|    iterations           | 93          |
|    time_elapsed         | 38          |
|    total_timesteps      | 47616       |
| train/                  |             |
|    approx_kl            | 0.010994297 |
|    clip_fraction        | 0.465       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -18.8       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 275         |
|    n_updates            | 920         |
|    policy_gradient_loss | 0.0244      |
|    std                  | 33.4        |
|    value_loss           | 513         |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 55.4       |
|    ep_rew_mean          | -107       |
| time/                   |            |
|    fps                  | 1234       |
|    iterations           | 102        |
|    time_elapsed         | 42         |
|    total_timesteps      | 52224      |
| train/                  |            |
|    approx_kl            | 0.01725234 |
|    clip_fraction        | 0.475      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-05      |
|    entropy_loss         | -18.8      |
|    explained_variance   | -1.19e-07  |
|    learning_rate        | 0.1        |
|    loss                 | 597        |
|    n_updates            | 1010       |
|    policy_gradient_loss | 0.0181     |
|    std                  | 34.4       |
|    value_loss           | 886        |
----------------------------------------
-----------------------------------------
| rollout/     

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 73.7       |
|    ep_rew_mean          | -109       |
| time/                   |            |
|    fps                  | 1240       |
|    iterations           | 111        |
|    time_elapsed         | 45         |
|    total_timesteps      | 56832      |
| train/                  |            |
|    approx_kl            | 0.01483937 |
|    clip_fraction        | 0.359      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-05      |
|    entropy_loss         | -18.8      |
|    explained_variance   | 5.96e-08   |
|    learning_rate        | 0.1        |
|    loss                 | 244        |
|    n_updates            | 1100       |
|    policy_gradient_loss | 0.00674    |
|    std                  | 37.6       |
|    value_loss           | 528        |
----------------------------------------
-----------------------------------------
| rollout/     

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 119        |
|    ep_rew_mean          | -113       |
| time/                   |            |
|    fps                  | 1247       |
|    iterations           | 120        |
|    time_elapsed         | 49         |
|    total_timesteps      | 61440      |
| train/                  |            |
|    approx_kl            | 0.01908756 |
|    clip_fraction        | 0.546      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-05      |
|    entropy_loss         | -18.7      |
|    explained_variance   | 0          |
|    learning_rate        | 0.1        |
|    loss                 | 9.51       |
|    n_updates            | 1190       |
|    policy_gradient_loss | 0.00869    |
|    std                  | 34         |
|    value_loss           | 18.8       |
----------------------------------------
-----------------------------------------
| rollout/     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 149         |
|    ep_rew_mean          | -115        |
| time/                   |             |
|    fps                  | 1254        |
|    iterations           | 129         |
|    time_elapsed         | 52          |
|    total_timesteps      | 66048       |
| train/                  |             |
|    approx_kl            | 0.067198426 |
|    clip_fraction        | 0.559       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -18.9       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 263         |
|    n_updates            | 1280        |
|    policy_gradient_loss | 0.0115      |
|    std                  | 33.4        |
|    value_loss           | 529         |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 196        |
|    ep_rew_mean          | -117       |
| time/                   |            |
|    fps                  | 1260       |
|    iterations           | 138        |
|    time_elapsed         | 56         |
|    total_timesteps      | 70656      |
| train/                  |            |
|    approx_kl            | 0.14784452 |
|    clip_fraction        | 0.666      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-05      |
|    entropy_loss         | -19.1      |
|    explained_variance   | -1.19e-07  |
|    learning_rate        | 0.1        |
|    loss                 | 9.67       |
|    n_updates            | 1370       |
|    policy_gradient_loss | 0.0266     |
|    std                  | 39.5       |
|    value_loss           | 18.9       |
----------------------------------------
----------------------------------------
| rollout/      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 242         |
|    ep_rew_mean          | -119        |
| time/                   |             |
|    fps                  | 1266        |
|    iterations           | 147         |
|    time_elapsed         | 59          |
|    total_timesteps      | 75264       |
| train/                  |             |
|    approx_kl            | 0.015184538 |
|    clip_fraction        | 0.585       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -19.7       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.1         |
|    loss                 | 258         |
|    n_updates            | 1460        |
|    policy_gradient_loss | 0.000943    |
|    std                  | 41.1        |
|    value_loss           | 519         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 289         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 1271        |
|    iterations           | 156         |
|    time_elapsed         | 62          |
|    total_timesteps      | 79872       |
| train/                  |             |
|    approx_kl            | 0.035133753 |
|    clip_fraction        | 0.524       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -20.1       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 9.31        |
|    n_updates            | 1550        |
|    policy_gradient_loss | 0.00747     |
|    std                  | 47.3        |
|    value_loss           | 19.2        |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 335         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 1276        |
|    iterations           | 165         |
|    time_elapsed         | 66          |
|    total_timesteps      | 84480       |
| train/                  |             |
|    approx_kl            | 0.010250118 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -20.5       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 253         |
|    n_updates            | 1640        |
|    policy_gradient_loss | 0.00845     |
|    std                  | 50.9        |
|    value_loss           | 502         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 381         |
|    ep_rew_mean          | -127        |
| time/                   |             |
|    fps                  | 1280        |
|    iterations           | 174         |
|    time_elapsed         | 69          |
|    total_timesteps      | 89088       |
| train/                  |             |
|    approx_kl            | 0.033470962 |
|    clip_fraction        | 0.54        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -20.2       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 9.57        |
|    n_updates            | 1730        |
|    policy_gradient_loss | 0.00875     |
|    std                  | 49          |
|    value_loss           | 18.9        |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 413         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 1284        |
|    iterations           | 183         |
|    time_elapsed         | 72          |
|    total_timesteps      | 93696       |
| train/                  |             |
|    approx_kl            | 0.009885037 |
|    clip_fraction        | 0.495       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -21.1       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 256         |
|    n_updates            | 1820        |
|    policy_gradient_loss | 0.000102    |
|    std                  | 61.3        |
|    value_loss           | 517         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 460         |
|    ep_rew_mean          | -132        |
| time/                   |             |
|    fps                  | 1288        |
|    iterations           | 192         |
|    time_elapsed         | 76          |
|    total_timesteps      | 98304       |
| train/                  |             |
|    approx_kl            | 0.009455705 |
|    clip_fraction        | 0.453       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -21.8       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 9.61        |
|    n_updates            | 1910        |
|    policy_gradient_loss | 0.00501     |
|    std                  | 69.1        |
|    value_loss           | 19.3        |
-----------------------------------------
----------------------------------



Mean reward: -154.92 +/- 32.63
New best hyperparameters found: (0.1, 512, 128, 0.9, 0.95, 0.1, 0.0, 1e-05)
New best score found: -154.9237129758112
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training with hyperparameters: (0.1, 512, 128, 0.9, 0.95, 0.1, 0.0, 1e-06)
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 77.7     |
|    ep_rew_mean     | -115     |
| time/              |          |
|    fps             | 1509     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 512      |
---------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 77.7     |
|    ep_rew_mean          | -115     |
| time/                   |          |
|    fps                  | 1338     |
|    iterations           | 2        |
|    time_elapsed         | 0        |
|    total_timesteps      | 102

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 108         |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 1317        |
|    iterations           | 11          |
|    time_elapsed         | 4           |
|    total_timesteps      | 5632        |
| train/                  |             |
|    approx_kl            | 0.064821474 |
|    clip_fraction        | 0.71        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -16.3       |
|    explained_variance   | -1.33       |
|    learning_rate        | 0.1         |
|    loss                 | 17.5        |
|    n_updates            | 100         |
|    policy_gradient_loss | 0.067       |
|    std                  | 18.7        |
|    value_loss           | 34.7        |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 205        |
|    ep_rew_mean          | -113       |
| time/                   |            |
|    fps                  | 1327       |
|    iterations           | 20         |
|    time_elapsed         | 7          |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.14122988 |
|    clip_fraction        | 0.767      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-06      |
|    entropy_loss         | -16.6      |
|    explained_variance   | -1.37      |
|    learning_rate        | 0.1        |
|    loss                 | 205        |
|    n_updates            | 190        |
|    policy_gradient_loss | 0.0357     |
|    std                  | 19.2       |
|    value_loss           | 413        |
----------------------------------------
----------------------------------------
| rollout/      

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 291       |
|    ep_rew_mean          | -117      |
| time/                   |           |
|    fps                  | 1333      |
|    iterations           | 29        |
|    time_elapsed         | 11        |
|    total_timesteps      | 14848     |
| train/                  |           |
|    approx_kl            | 0.0432226 |
|    clip_fraction        | 0.572     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-06     |
|    entropy_loss         | -16.8     |
|    explained_variance   | -0.499    |
|    learning_rate        | 0.1       |
|    loss                 | 201       |
|    n_updates            | 280       |
|    policy_gradient_loss | 0.0194    |
|    std                  | 20.4      |
|    value_loss           | 398       |
---------------------------------------
----------------------------------------
| rollout/                |            

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 366         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 1317        |
|    iterations           | 38          |
|    time_elapsed         | 14          |
|    total_timesteps      | 19456       |
| train/                  |             |
|    approx_kl            | 0.025856528 |
|    clip_fraction        | 0.587       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -17.1       |
|    explained_variance   | -0.0969     |
|    learning_rate        | 0.1         |
|    loss                 | 18.2        |
|    n_updates            | 370         |
|    policy_gradient_loss | 0.0162      |
|    std                  | 20.1        |
|    value_loss           | 36.1        |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 433         |
|    ep_rew_mean          | -125        |
| time/                   |             |
|    fps                  | 1315        |
|    iterations           | 47          |
|    time_elapsed         | 18          |
|    total_timesteps      | 24064       |
| train/                  |             |
|    approx_kl            | 0.034354657 |
|    clip_fraction        | 0.72        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -17.3       |
|    explained_variance   | -4.77e-07   |
|    learning_rate        | 0.1         |
|    loss                 | 221         |
|    n_updates            | 460         |
|    policy_gradient_loss | 0.0253      |
|    std                  | 21          |
|    value_loss           | 440         |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 494        |
|    ep_rew_mean          | -128       |
| time/                   |            |
|    fps                  | 1314       |
|    iterations           | 56         |
|    time_elapsed         | 21         |
|    total_timesteps      | 28672      |
| train/                  |            |
|    approx_kl            | 0.05573512 |
|    clip_fraction        | 0.647      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-06      |
|    entropy_loss         | -17.5      |
|    explained_variance   | -1.43e-06  |
|    learning_rate        | 0.1        |
|    loss                 | 20.5       |
|    n_updates            | 550        |
|    policy_gradient_loss | 0.0213     |
|    std                  | 21.2       |
|    value_loss           | 40.7       |
----------------------------------------
-----------------------------------------
| rollout/     

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 531        |
|    ep_rew_mean          | -129       |
| time/                   |            |
|    fps                  | 1308       |
|    iterations           | 65         |
|    time_elapsed         | 25         |
|    total_timesteps      | 33280      |
| train/                  |            |
|    approx_kl            | 0.03476309 |
|    clip_fraction        | 0.686      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-06      |
|    entropy_loss         | -17.7      |
|    explained_variance   | 7.08e-05   |
|    learning_rate        | 0.1        |
|    loss                 | 134        |
|    n_updates            | 640        |
|    policy_gradient_loss | 0.00708    |
|    std                  | 21.7       |
|    value_loss           | 270        |
----------------------------------------
-----------------------------------------
| rollout/     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 582         |
|    ep_rew_mean          | -132        |
| time/                   |             |
|    fps                  | 1304        |
|    iterations           | 74          |
|    time_elapsed         | 29          |
|    total_timesteps      | 37888       |
| train/                  |             |
|    approx_kl            | 0.024188058 |
|    clip_fraction        | 0.631       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -17.2       |
|    explained_variance   | -0.00907    |
|    learning_rate        | 0.1         |
|    loss                 | 52.5        |
|    n_updates            | 730         |
|    policy_gradient_loss | 0.0121      |
|    std                  | 18.9        |
|    value_loss           | 105         |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 628        |
|    ep_rew_mean          | -134       |
| time/                   |            |
|    fps                  | 1294       |
|    iterations           | 83         |
|    time_elapsed         | 32         |
|    total_timesteps      | 42496      |
| train/                  |            |
|    approx_kl            | 0.07172676 |
|    clip_fraction        | 0.523      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-06      |
|    entropy_loss         | -18        |
|    explained_variance   | -0.000102  |
|    learning_rate        | 0.1        |
|    loss                 | 151        |
|    n_updates            | 820        |
|    policy_gradient_loss | 0.0145     |
|    std                  | 23.9       |
|    value_loss           | 301        |
----------------------------------------
-----------------------------------------
| rollout/     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 670         |
|    ep_rew_mean          | -135        |
| time/                   |             |
|    fps                  | 1277        |
|    iterations           | 92          |
|    time_elapsed         | 36          |
|    total_timesteps      | 47104       |
| train/                  |             |
|    approx_kl            | 0.006116181 |
|    clip_fraction        | 0.451       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -18.2       |
|    explained_variance   | 0.109       |
|    learning_rate        | 0.1         |
|    loss                 | 52.8        |
|    n_updates            | 910         |
|    policy_gradient_loss | 0.00531     |
|    std                  | 25.1        |
|    value_loss           | 104         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 697         |
|    ep_rew_mean          | -136        |
| time/                   |             |
|    fps                  | 1254        |
|    iterations           | 101         |
|    time_elapsed         | 41          |
|    total_timesteps      | 51712       |
| train/                  |             |
|    approx_kl            | 0.014102756 |
|    clip_fraction        | 0.686       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -18.8       |
|    explained_variance   | 5.25e-06    |
|    learning_rate        | 0.1         |
|    loss                 | 169         |
|    n_updates            | 1000        |
|    policy_gradient_loss | 0.011       |
|    std                  | 30.4        |
|    value_loss           | 299         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 684         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 1237        |
|    iterations           | 110         |
|    time_elapsed         | 45          |
|    total_timesteps      | 56320       |
| train/                  |             |
|    approx_kl            | 0.058721047 |
|    clip_fraction        | 0.563       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -18.5       |
|    explained_variance   | -3.22e-06   |
|    learning_rate        | 0.1         |
|    loss                 | 151         |
|    n_updates            | 1090        |
|    policy_gradient_loss | 0.00675     |
|    std                  | 26.6        |
|    value_loss           | 229         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 669         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 1223        |
|    iterations           | 119         |
|    time_elapsed         | 49          |
|    total_timesteps      | 60928       |
| train/                  |             |
|    approx_kl            | 0.006140849 |
|    clip_fraction        | 0.538       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.3       |
|    explained_variance   | 2.38e-07    |
|    learning_rate        | 0.1         |
|    loss                 | 136         |
|    n_updates            | 1180        |
|    policy_gradient_loss | 0.00698     |
|    std                  | 32.1        |
|    value_loss           | 269         |
-----------------------------------------
----------------------------------

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 686          |
|    ep_rew_mean          | -137         |
| time/                   |              |
|    fps                  | 1211         |
|    iterations           | 128          |
|    time_elapsed         | 54           |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0060452595 |
|    clip_fraction        | 0.501        |
|    clip_range           | 0.1          |
|    clip_range_vf        | 1e-06        |
|    entropy_loss         | -19.6        |
|    explained_variance   | -3.58e-07    |
|    learning_rate        | 0.1          |
|    loss                 | 53.7         |
|    n_updates            | 1270         |
|    policy_gradient_loss | 0.0101       |
|    std                  | 34.9         |
|    value_loss           | 107          |
------------------------------------------
-----------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 676         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 1205        |
|    iterations           | 137         |
|    time_elapsed         | 58          |
|    total_timesteps      | 70144       |
| train/                  |             |
|    approx_kl            | 0.032477103 |
|    clip_fraction        | 0.583       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.3       |
|    explained_variance   | 1.79e-07    |
|    learning_rate        | 0.1         |
|    loss                 | 141         |
|    n_updates            | 1360        |
|    policy_gradient_loss | 0.0101      |
|    std                  | 33.6        |
|    value_loss           | 282         |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 720         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 1200        |
|    iterations           | 146         |
|    time_elapsed         | 62          |
|    total_timesteps      | 74752       |
| train/                  |             |
|    approx_kl            | 0.028884295 |
|    clip_fraction        | 0.607       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.2       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 232         |
|    n_updates            | 1450        |
|    policy_gradient_loss | 0.0233      |
|    std                  | 33.1        |
|    value_loss           | 444         |
-----------------------------------------
----------------------------------

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 378       |
|    ep_rew_mean          | -128      |
| time/                   |           |
|    fps                  | 1192      |
|    iterations           | 155       |
|    time_elapsed         | 66        |
|    total_timesteps      | 79360     |
| train/                  |           |
|    approx_kl            | 0.0947573 |
|    clip_fraction        | 0.602     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-06     |
|    entropy_loss         | -19.4     |
|    explained_variance   | -0.00103  |
|    learning_rate        | 0.1       |
|    loss                 | 152       |
|    n_updates            | 1540      |
|    policy_gradient_loss | 0.0102    |
|    std                  | 36.4      |
|    value_loss           | 303       |
---------------------------------------
-----------------------------------------
| rollout/                |           

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 185         |
|    ep_rew_mean          | -120        |
| time/                   |             |
|    fps                  | 1185        |
|    iterations           | 164         |
|    time_elapsed         | 70          |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.023724763 |
|    clip_fraction        | 0.653       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.3       |
|    explained_variance   | 9.2e-05     |
|    learning_rate        | 0.1         |
|    loss                 | 362         |
|    n_updates            | 1630        |
|    policy_gradient_loss | 0.024       |
|    std                  | 35.4        |
|    value_loss           | 717         |
-----------------------------------------
----------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 203        |
|    ep_rew_mean          | -122       |
| time/                   |            |
|    fps                  | 1178       |
|    iterations           | 173        |
|    time_elapsed         | 75         |
|    total_timesteps      | 88576      |
| train/                  |            |
|    approx_kl            | 0.02246647 |
|    clip_fraction        | 0.512      |
|    clip_range           | 0.1        |
|    clip_range_vf        | 1e-06      |
|    entropy_loss         | -19.4      |
|    explained_variance   | -1.07      |
|    learning_rate        | 0.1        |
|    loss                 | 2.51       |
|    n_updates            | 1720       |
|    policy_gradient_loss | 0.0105     |
|    std                  | 39.3       |
|    value_loss           | 4.94       |
----------------------------------------
----------------------------------------
| rollout/      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 209         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 1171        |
|    iterations           | 182         |
|    time_elapsed         | 79          |
|    total_timesteps      | 93184       |
| train/                  |             |
|    approx_kl            | 0.012857912 |
|    clip_fraction        | 0.46        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.5       |
|    explained_variance   | 0           |
|    learning_rate        | 0.1         |
|    loss                 | 1.55e+03    |
|    n_updates            | 1810        |
|    policy_gradient_loss | 0.00732     |
|    std                  | 38.6        |
|    value_loss           | 3.08e+03    |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 221         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 1164        |
|    iterations           | 191         |
|    time_elapsed         | 83          |
|    total_timesteps      | 97792       |
| train/                  |             |
|    approx_kl            | 0.008881306 |
|    clip_fraction        | 0.51        |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-06       |
|    entropy_loss         | -19.4       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.1         |
|    loss                 | 1.59e+03    |
|    n_updates            | 1900        |
|    policy_gradient_loss | 0.0184      |
|    std                  | 36.6        |
|    value_loss           | 3.21e+03    |
-----------------------------------------
----------------------------------

--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 152      |
|    ep_rew_mean          | -118     |
| time/                   |          |
|    fps                  | 1108     |
|    iterations           | 5        |
|    time_elapsed         | 2        |
|    total_timesteps      | 2560     |
| train/                  |          |
|    approx_kl            | 9.955282 |
|    clip_fraction        | 0.952    |
|    clip_range           | 0.1      |
|    clip_range_vf        | 1e-05    |
|    entropy_loss         | -48.9    |
|    explained_variance   | -0.0562  |
|    learning_rate        | 0.1      |
|    loss                 | 54.7     |
|    n_updates            | 40       |
|    policy_gradient_loss | 0.116    |
|    std                  | 5.01e+05 |
|    value_loss           | 189      |
--------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 232         |
|    ep_rew_mean          | -120        |
| time/                   |             |
|    fps                  | 1078        |
|    iterations           | 14          |
|    time_elapsed         | 6           |
|    total_timesteps      | 7168        |
| train/                  |             |
|    approx_kl            | 0.013386563 |
|    clip_fraction        | 0.564       |
|    clip_range           | 0.1         |
|    clip_range_vf        | 1e-05       |
|    entropy_loss         | -119        |
|    explained_variance   | -8.11e-06   |
|    learning_rate        | 0.1         |
|    loss                 | -5.86       |
|    n_updates            | 130         |
|    policy_gradient_loss | 0.046       |
|    std                  | 3.56e+12    |
|    value_loss           | 12.1        |
-----------------------------------------
----------------------------------

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -121      |
| time/                   |           |
|    fps                  | 1068      |
|    iterations           | 23        |
|    time_elapsed         | 11        |
|    total_timesteps      | 11776     |
| train/                  |           |
|    approx_kl            | 1.0782106 |
|    clip_fraction        | 0.785     |
|    clip_range           | 0.1       |
|    clip_range_vf        | 1e-05     |
|    entropy_loss         | -145      |
|    explained_variance   | -0.0147   |
|    learning_rate        | 0.1       |
|    loss                 | 280       |
|    n_updates            | 220       |
|    policy_gradient_loss | 0.0791    |
|    std                  | 6.73e+15  |
|    value_loss           | 585       |
---------------------------------------
-----------------------------------------
| rollout/                |           

ValueError: Expected parameter loc (Tensor of shape (128, 4)) of distribution Normal(loc: torch.Size([128, 4]), scale: torch.Size([128, 4])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], grad_fn=<AddmmBackward0>)