In [1]:
from ppo_agent import PPOAgent
import gymnasium as gym
import torch
from agent_configs import PPOConfig, PPOActorConfig, PPOCriticConfig
from game_configs import CartPoleConfig

# CartPole-v1 with array rendering (the training output further down shows
# episode videos being written, which presumably relies on this — confirm).
env = gym.make('CartPole-v1', render_mode='rgb_array')

# Agent-level PPO hyperparameters handed to PPOConfig.
config_dict = dict(
    activation='tanh',
    clip_param=0.2,
    kernel_initializer='orthogonal',
    # Open question: normalization?
    discount_factor=0.99,
    gae_lambda=0.95,
    critic_dense_layers=[64],
    actor_dense_layers=[64],
    # Open question: reward clipping
    steps_per_epoch=128,
    train_policy_iterations=4,
    train_value_iterations=4,
    target_kl=0.02,
    entropy_coefficient=0.01,
    num_minibatches=4,
    loss_function=None,
)

# Optimizer settings for the actor network.
actor_config_dict = dict(
    optimizer=torch.optim.Adam,
    learning_rate=2.5e-4,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    loss_function=None,
)

# The critic uses exactly the same optimizer settings; take a copy so the two
# dicts remain independent objects.
critic_config_dict = dict(actor_config_dict)

# Each config constructor is preceded by a header line so that whatever it
# prints is grouped under the right title in the cell output.
print("Actor Config")
actor_config = PPOActorConfig(actor_config_dict)
print("Critic Config")
critic_config = PPOCriticConfig(critic_config_dict)

print("PPO Config")
config = PPOConfig(
    config_dict,
    CartPoleConfig(),
    actor_config=actor_config,
    critic_config=critic_config,
)

agent = PPOAgent(env, config=config, name='PPOAgent')


Actor Config
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.0005
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Critic Config
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.0005
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
PPO Config
Using default save_intermediate_weights     : True
Using default training_steps                : 10000
Using default adam_epsilon                  : 1e-06
Using default learning_rate                 : 0.001
Using default clipnorm                      : 0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using         loss_function                 : None
Using         activation                    : relu
Using         kernel_initializer            : orthogonal
Usin

  logger.warn(


In [2]:
# Dump every parameter tensor of the actor, then of the critic, each followed
# by its current gradient, grad_fn and requires_grad flag.
for part in ('actor', 'critic'):
    for param in getattr(agent.model, part).parameters():
        print(param, param.grad, param.grad_fn, param.requires_grad, sep='\n')

Parameter containing:
tensor([[ 0.3837, -0.2932, -0.3200, -0.0562],
        [ 0.4308, -0.2357, -0.3921,  0.1258],
        [-0.0069,  0.4934, -0.1091, -0.3311],
        [ 0.3150, -0.1726,  0.2962,  0.4472],
        [ 0.3474,  0.1326, -0.1978,  0.2100],
        [-0.4466, -0.1796, -0.0187, -0.0730],
        [ 0.1506, -0.1177, -0.1844,  0.3857],
        [ 0.3006, -0.2397, -0.0209, -0.4200],
        [-0.4919,  0.0220,  0.3795,  0.1700],
        [ 0.1176,  0.4525,  0.4255, -0.4418],
        [-0.1845, -0.0752, -0.1191, -0.1040],
        [-0.1939, -0.3216,  0.2319,  0.3415],
        [-0.4685,  0.1073, -0.2803,  0.1422],
        [-0.0897, -0.4890,  0.3843, -0.2724],
        [ 0.3946,  0.0750, -0.0297, -0.4390],
        [-0.2882, -0.0400, -0.1191,  0.1809],
        [-0.3006, -0.1596,  0.0492, -0.0638],
        [-0.0954, -0.1281, -0.1574,  0.3633],
        [ 0.1390, -0.2213, -0.0681,  0.0497],
        [-0.2553,  0.3330, -0.1969, -0.0131],
        [ 0.3167,  0.3976,  0.4102, -0.0854],
        [ 0.

In [3]:
# Disabled sanity check: reset the environment, show the first observation and
# the first element of what agent.select_action returns for it (presumably the
# chosen action — confirm against PPOAgent).
# state, _ = env.reset()
# print(state)
# print(agent.select_action(state)[0])
# Checkpoint on every training step. Judging from the output below, each
# checkpoint writes an episode video under checkpoints/PPOAgent/step_<n>/ and
# prints a score — confirm the exact behaviour in PPOAgent.
agent.checkpoint_interval = 1
# Start training; progress and KL-divergence diagnostics are printed by the agent.
agent.train()

Training Step:  0


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(6.3526e-07)
37 Implimentation Details KL Divergence tensor(1.4529e-08)
Open AI Spinning Up KL Divergence tensor(0.0001)
37 Implimentation Details KL Divergence tensor(2.9172e-05)
Open AI Spinning Up KL Divergence tensor(0.0007)
37 Implimentation Details KL Divergence tensor(0.0001)
Open AI Spinning Up KL Divergence tensor(0.0005)
37 Implimentation Details KL Divergence tensor(0.0003)
Open AI Spinning Up KL Divergence tensor(0.0007)
37 Implimentation Details KL Divergence tensor(0.0005)
Open AI Spinning Up KL Divergence tensor(0.0012)
37 Implimentation Details KL Divergence tensor(0.0007)
Open AI Spinning Up KL Divergence tensor(0.0027)
37 Implimentation Details KL Divergence tensor(0.0010)
Open AI Spinning Up KL Divergence tensor(0.0020)
37 Implimentation Details KL Divergence tensor(0.0014)
Open AI Spinning Up KL Divergence tensor(0.0022)
37 Implimentation Details KL Divergence tensor(0.0018)
Open AI Spinning Up KL Divergence tensor(0.0030)
37 

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_1/videos/PPOAgent/1/PPOAgent-episode-4.mp4
score:  56.0


  axs[row][col].set_xlim(1, len(values))


Training Step:  2


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(4.0523e-06)
37 Implimentation Details KL Divergence tensor(1.3709e-08)
Open AI Spinning Up KL Divergence tensor(5.1983e-05)
37 Implimentation Details KL Divergence tensor(1.3407e-06)
Open AI Spinning Up KL Divergence tensor(-8.6946e-06)
37 Implimentation Details KL Divergence tensor(5.0712e-06)
Open AI Spinning Up KL Divergence tensor(6.2029e-05)
37 Implimentation Details KL Divergence tensor(9.9475e-06)
Open AI Spinning Up KL Divergence tensor(-5.1316e-05)
37 Implimentation Details KL Divergence tensor(1.5140e-05)
Open AI Spinning Up KL Divergence tensor(0.0002)
37 Implimentation Details KL Divergence tensor(2.0561e-05)
Open AI Spinning Up KL Divergence tensor(-5.2967e-06)
37 Implimentation Details KL Divergence tensor(2.6547e-05)
Open AI Spinning Up KL Divergence tensor(0.0001)
37 Implimentation Details KL Divergence tensor(3.0216e-05)
Open AI Spinning Up KL Divergence tensor(-7.6836e-05)
37 Implimentation Details KL Divergence tensor(3.2186e-

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_2/videos/PPOAgent/2/PPOAgent-episode-9.mp4
score:  20.0




Training Step:  3


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.8671e-06)
37 Implimentation Details KL Divergence tensor(1.5497e-08)
Open AI Spinning Up KL Divergence tensor(-4.5053e-05)
37 Implimentation Details KL Divergence tensor(3.0254e-07)
Open AI Spinning Up KL Divergence tensor(2.5810e-06)
37 Implimentation Details KL Divergence tensor(9.9614e-07)
Open AI Spinning Up KL Divergence tensor(3.1624e-05)
37 Implimentation Details KL Divergence tensor(2.2581e-06)
Open AI Spinning Up KL Divergence tensor(-7.9575e-05)
37 Implimentation Details KL Divergence tensor(3.5463e-06)
Open AI Spinning Up KL Divergence tensor(-0.0002)
37 Implimentation Details KL Divergence tensor(5.2784e-06)
Open AI Spinning Up KL Divergence tensor(2.9682e-06)
37 Implimentation Details KL Divergence tensor(6.0733e-06)
Open AI Spinning Up KL Divergence tensor(7.0484e-05)
37 Implimentation Details KL Divergence tensor(7.9358e-06)
Open AI Spinning Up KL Divergence tensor(-0.0001)
37 Implimentation Details KL Divergence tensor(8.7152e

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_3/videos/PPOAgent/3/PPOAgent-episode-14.mp4
score:  10.0




Training Step:  4


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-3.2134e-06)
37 Implimentation Details KL Divergence tensor(1.6292e-08)
Open AI Spinning Up KL Divergence tensor(4.8947e-06)
37 Implimentation Details KL Divergence tensor(5.0813e-08)
Open AI Spinning Up KL Divergence tensor(1.9455e-05)
37 Implimentation Details KL Divergence tensor(1.5731e-07)
Open AI Spinning Up KL Divergence tensor(1.4661e-05)
37 Implimentation Details KL Divergence tensor(2.8988e-07)
Open AI Spinning Up KL Divergence tensor(-3.6205e-05)
37 Implimentation Details KL Divergence tensor(4.5791e-07)
Open AI Spinning Up KL Divergence tensor(1.2076e-05)
37 Implimentation Details KL Divergence tensor(6.2888e-07)
Open AI Spinning Up KL Divergence tensor(4.1898e-05)
37 Implimentation Details KL Divergence tensor(8.4783e-07)
Open AI Spinning Up KL Divergence tensor(2.5808e-05)
37 Implimentation Details KL Divergence tensor(9.1523e-07)
Open AI Spinning Up KL Divergence tensor(-4.9501e-05)
37 Implimentation Details KL Divergence tensor(1

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_4/videos/PPOAgent/4/PPOAgent-episode-19.mp4
score:  21.0
Training Step:  5


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-5.6495e-07)
37 Implimentation Details KL Divergence tensor(1.5547e-08)
Open AI Spinning Up KL Divergence tensor(1.8938e-06)
37 Implimentation Details KL Divergence tensor(2.1756e-08)
Open AI Spinning Up KL Divergence tensor(-5.7473e-06)
37 Implimentation Details KL Divergence tensor(3.2137e-08)
Open AI Spinning Up KL Divergence tensor(1.7749e-06)
37 Implimentation Details KL Divergence tensor(4.5300e-08)
Open AI Spinning Up KL Divergence tensor(-1.9127e-05)
37 Implimentation Details KL Divergence tensor(6.6012e-08)
Open AI Spinning Up KL Divergence tensor(4.5816e-06)
37 Implimentation Details KL Divergence tensor(8.4142e-08)
Open AI Spinning Up KL Divergence tensor(-7.6929e-06)
37 Implimentation Details KL Divergence tensor(9.4473e-08)
Open AI Spinning Up KL Divergence tensor(-5.9555e-08)
37 Implimentation Details KL Divergence tensor(1.0063e-07)
Open AI Spinning Up KL Divergence tensor(-2.6686e-05)
37 Implimentation Details KL Divergence tenso

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_5/videos/PPOAgent/5/PPOAgent-episode-24.mp4
score:  33.0
Training Step:  6


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.2595e-06)
37 Implimentation Details KL Divergence tensor(1.5696e-08)
Open AI Spinning Up KL Divergence tensor(4.6892e-06)
37 Implimentation Details KL Divergence tensor(1.7633e-08)
Open AI Spinning Up KL Divergence tensor(3.6136e-06)
37 Implimentation Details KL Divergence tensor(1.7236e-08)
Open AI Spinning Up KL Divergence tensor(5.5443e-06)
37 Implimentation Details KL Divergence tensor(1.9918e-08)
Open AI Spinning Up KL Divergence tensor(-5.1142e-06)
37 Implimentation Details KL Divergence tensor(2.4438e-08)
Open AI Spinning Up KL Divergence tensor(8.4641e-06)
37 Implimentation Details KL Divergence tensor(2.8660e-08)
Open AI Spinning Up KL Divergence tensor(4.3740e-06)
37 Implimentation Details KL Divergence tensor(2.7567e-08)
Open AI Spinning Up KL Divergence tensor(5.5081e-06)
37 Implimentation Details KL Divergence tensor(3.0349e-08)
Open AI Spinning Up KL Divergence tensor(-8.8132e-06)
37 Implimentation Details KL Divergence tensor(3

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_6/videos/PPOAgent/6/PPOAgent-episode-29.mp4
score:  20.0
Training Step:  7


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(3.1859e-07)
37 Implimentation Details KL Divergence tensor(1.5597e-08)
Open AI Spinning Up KL Divergence tensor(1.8341e-06)
37 Implimentation Details KL Divergence tensor(1.5547e-08)
Open AI Spinning Up KL Divergence tensor(1.4148e-06)
37 Implimentation Details KL Divergence tensor(1.8328e-08)
Open AI Spinning Up KL Divergence tensor(2.2163e-07)
37 Implimentation Details KL Divergence tensor(1.7037e-08)
Open AI Spinning Up KL Divergence tensor(3.1748e-06)
37 Implimentation Details KL Divergence tensor(1.8328e-08)
Open AI Spinning Up KL Divergence tensor(2.0470e-06)
37 Implimentation Details KL Divergence tensor(1.9372e-08)
Open AI Spinning Up KL Divergence tensor(2.0482e-06)
37 Implimentation Details KL Divergence tensor(2.1110e-08)
Open AI Spinning Up KL Divergence tensor(-1.6261e-06)
37 Implimentation Details KL Divergence tensor(2.0713e-08)
Open AI Spinning Up KL Divergence tensor(5.3653e-06)
37 Implimentation Details KL Divergence tensor(2.3

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_7/videos/PPOAgent/7/PPOAgent-episode-34.mp4
score:  23.0




Training Step:  8


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.3554e-06)
37 Implimentation Details KL Divergence tensor(1.5547e-08)
Open AI Spinning Up KL Divergence tensor(7.3037e-06)
37 Implimentation Details KL Divergence tensor(1.5497e-08)
Open AI Spinning Up KL Divergence tensor(5.2452e-07)
37 Implimentation Details KL Divergence tensor(1.6242e-08)
Open AI Spinning Up KL Divergence tensor(-2.1184e-07)
37 Implimentation Details KL Divergence tensor(1.6590e-08)
Open AI Spinning Up KL Divergence tensor(-3.2820e-06)
37 Implimentation Details KL Divergence tensor(1.7633e-08)
Open AI Spinning Up KL Divergence tensor(8.1930e-06)
37 Implimentation Details KL Divergence tensor(1.9372e-08)
Open AI Spinning Up KL Divergence tensor(2.3430e-06)
37 Implimentation Details KL Divergence tensor(2.0017e-08)
Open AI Spinning Up KL Divergence tensor(5.8065e-07)
37 Implimentation Details KL Divergence tensor(2.1656e-08)
Open AI Spinning Up KL Divergence tensor(-6.3513e-06)
37 Implimentation Details KL Divergence tensor(

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_8/videos/PPOAgent/8/PPOAgent-episode-39.mp4
score:  25.0
Training Step:  9


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(5.2209e-07)
37 Implimentation Details KL Divergence tensor(1.4653e-08)
Open AI Spinning Up KL Divergence tensor(-3.3873e-06)
37 Implimentation Details KL Divergence tensor(1.5696e-08)
Open AI Spinning Up KL Divergence tensor(3.3431e-06)
37 Implimentation Details KL Divergence tensor(1.6888e-08)
Open AI Spinning Up KL Divergence tensor(1.7918e-06)
37 Implimentation Details KL Divergence tensor(1.7832e-08)
Open AI Spinning Up KL Divergence tensor(-5.7255e-07)
37 Implimentation Details KL Divergence tensor(1.8428e-08)
Open AI Spinning Up KL Divergence tensor(-2.3058e-06)
37 Implimentation Details KL Divergence tensor(2.1358e-08)
Open AI Spinning Up KL Divergence tensor(5.0224e-06)
37 Implimentation Details KL Divergence tensor(2.2749e-08)
Open AI Spinning Up KL Divergence tensor(7.8849e-06)
37 Implimentation Details KL Divergence tensor(2.5531e-08)
Open AI Spinning Up KL Divergence tensor(-3.5497e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_9/videos/PPOAgent/9/PPOAgent-episode-44.mp4
score:  16.0
Training Step:  10


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(8.4409e-06)
37 Implimentation Details KL Divergence tensor(1.5100e-08)
Open AI Spinning Up KL Divergence tensor(-8.0943e-07)
37 Implimentation Details KL Divergence tensor(1.6441e-08)
Open AI Spinning Up KL Divergence tensor(-2.5820e-06)
37 Implimentation Details KL Divergence tensor(1.6640e-08)
Open AI Spinning Up KL Divergence tensor(1.0157e-05)
37 Implimentation Details KL Divergence tensor(1.8179e-08)
Open AI Spinning Up KL Divergence tensor(8.5025e-06)
37 Implimentation Details KL Divergence tensor(1.8775e-08)
Open AI Spinning Up KL Divergence tensor(-3.4965e-06)
37 Implimentation Details KL Divergence tensor(2.1805e-08)
Open AI Spinning Up KL Divergence tensor(-3.7822e-06)
37 Implimentation Details KL Divergence tensor(2.2153e-08)
Open AI Spinning Up KL Divergence tensor(9.7933e-06)
37 Implimentation Details KL Divergence tensor(2.2103e-08)
Open AI Spinning Up KL Divergence tensor(8.6461e-06)
37 Implimentation Details KL Divergence tensor(

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_10/videos/PPOAgent/10/PPOAgent-episode-49.mp4
score:  14.0




Training Step:  11


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-7.2079e-06)
37 Implimentation Details KL Divergence tensor(1.5398e-08)
Open AI Spinning Up KL Divergence tensor(-4.0139e-06)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(4.3519e-06)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(1.7722e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(-2.3449e-06)
37 Implimentation Details KL Divergence tensor(2.3345e-08)
Open AI Spinning Up KL Divergence tensor(2.0203e-06)
37 Implimentation Details KL Divergence tensor(2.8064e-08)
Open AI Spinning Up KL Divergence tensor(2.6812e-05)
37 Implimentation Details KL Divergence tensor(3.8346e-08)
Open AI Spinning Up KL Divergence tensor(-5.2411e-06)
37 Implimentation Details KL Divergence tensor(4.6243e-08)
Open AI Spinning Up KL Divergence tensor(6.0603e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_11/videos/PPOAgent/11/PPOAgent-episode-54.mp4
score:  20.0
Training Step:  12


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(4.3144e-06)
37 Implimentation Details KL Divergence tensor(1.3759e-08)
Open AI Spinning Up KL Divergence tensor(-3.7969e-06)
37 Implimentation Details KL Divergence tensor(1.8328e-08)
Open AI Spinning Up KL Divergence tensor(-4.5752e-06)
37 Implimentation Details KL Divergence tensor(2.0365e-08)
Open AI Spinning Up KL Divergence tensor(2.3005e-06)
37 Implimentation Details KL Divergence tensor(2.2004e-08)
Open AI Spinning Up KL Divergence tensor(6.7652e-06)
37 Implimentation Details KL Divergence tensor(2.8610e-08)
Open AI Spinning Up KL Divergence tensor(-2.9783e-06)
37 Implimentation Details KL Divergence tensor(3.9935e-08)
Open AI Spinning Up KL Divergence tensor(-2.6444e-06)
37 Implimentation Details KL Divergence tensor(5.1806e-08)
Open AI Spinning Up KL Divergence tensor(2.5590e-07)
37 Implimentation Details KL Divergence tensor(6.1939e-08)
Open AI Spinning Up KL Divergence tensor(1.0836e-05)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_12/videos/PPOAgent/12/PPOAgent-episode-59.mp4
score:  12.0
Training Step:  13


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.1305e-06)
37 Implimentation Details KL Divergence tensor(1.5150e-08)
Open AI Spinning Up KL Divergence tensor(2.3274e-06)
37 Implimentation Details KL Divergence tensor(1.7335e-08)
Open AI Spinning Up KL Divergence tensor(-5.3848e-06)
37 Implimentation Details KL Divergence tensor(2.5034e-08)
Open AI Spinning Up KL Divergence tensor(2.1854e-05)
37 Implimentation Details KL Divergence tensor(4.9869e-08)
Open AI Spinning Up KL Divergence tensor(7.2806e-06)
37 Implimentation Details KL Divergence tensor(9.2387e-08)
Open AI Spinning Up KL Divergence tensor(4.4172e-06)
37 Implimentation Details KL Divergence tensor(1.4340e-07)
Open AI Spinning Up KL Divergence tensor(-1.2284e-05)
37 Implimentation Details KL Divergence tensor(2.2069e-07)
Open AI Spinning Up KL Divergence tensor(5.2759e-05)
37 Implimentation Details KL Divergence tensor(3.2023e-07)
Open AI Spinning Up KL Divergence tensor(1.8564e-05)
37 Implimentation Details KL Divergence tensor(4

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_13/videos/PPOAgent/13/PPOAgent-episode-64.mp4
score:  15.0




Training Step:  14


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(4.8493e-06)
37 Implimentation Details KL Divergence tensor(1.4106e-08)
Open AI Spinning Up KL Divergence tensor(6.2621e-06)
37 Implimentation Details KL Divergence tensor(1.9222e-08)
Open AI Spinning Up KL Divergence tensor(1.6054e-07)
37 Implimentation Details KL Divergence tensor(2.4835e-08)
Open AI Spinning Up KL Divergence tensor(3.3203e-06)
37 Implimentation Details KL Divergence tensor(3.8693e-08)
Open AI Spinning Up KL Divergence tensor(7.5647e-06)
37 Implimentation Details KL Divergence tensor(5.4538e-08)
Open AI Spinning Up KL Divergence tensor(2.8549e-05)
37 Implimentation Details KL Divergence tensor(7.7387e-08)
Open AI Spinning Up KL Divergence tensor(6.7629e-06)
37 Implimentation Details KL Divergence tensor(1.0267e-07)
Open AI Spinning Up KL Divergence tensor(1.3156e-05)
37 Implimentation Details KL Divergence tensor(1.3202e-07)
Open AI Spinning Up KL Divergence tensor(8.1513e-06)
37 Implimentation Details KL Divergence tensor(1.56

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_14/videos/PPOAgent/14/PPOAgent-episode-69.mp4
score:  11.0




Training Step:  15


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-3.0456e-06)
37 Implimentation Details KL Divergence tensor(1.5299e-08)
Open AI Spinning Up KL Divergence tensor(-6.1890e-06)
37 Implimentation Details KL Divergence tensor(1.6640e-08)
Open AI Spinning Up KL Divergence tensor(3.5127e-07)
37 Implimentation Details KL Divergence tensor(1.9222e-08)
Open AI Spinning Up KL Divergence tensor(-3.0811e-06)
37 Implimentation Details KL Divergence tensor(2.1756e-08)
Open AI Spinning Up KL Divergence tensor(-1.2274e-07)
37 Implimentation Details KL Divergence tensor(2.2948e-08)
Open AI Spinning Up KL Divergence tensor(-1.1905e-05)
37 Implimentation Details KL Divergence tensor(2.4438e-08)
Open AI Spinning Up KL Divergence tensor(6.4140e-07)
37 Implimentation Details KL Divergence tensor(2.6127e-08)
Open AI Spinning Up KL Divergence tensor(-6.7643e-06)
37 Implimentation Details KL Divergence tensor(2.7567e-08)
Open AI Spinning Up KL Divergence tensor(8.3655e-07)
37 Implimentation Details KL Divergence tenso

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_15/videos/PPOAgent/15/PPOAgent-episode-74.mp4
score:  11.0




Training Step:  16


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-8.0566e-08)
37 Implimentation Details KL Divergence tensor(1.5497e-08)
Open AI Spinning Up KL Divergence tensor(-6.1572e-07)
37 Implimentation Details KL Divergence tensor(1.7285e-08)
Open AI Spinning Up KL Divergence tensor(-2.7626e-06)
37 Implimentation Details KL Divergence tensor(1.6938e-08)
Open AI Spinning Up KL Divergence tensor(-2.9807e-06)
37 Implimentation Details KL Divergence tensor(1.8229e-08)
Open AI Spinning Up KL Divergence tensor(9.1920e-07)
37 Implimentation Details KL Divergence tensor(1.9670e-08)
Open AI Spinning Up KL Divergence tensor(1.7524e-07)
37 Implimentation Details KL Divergence tensor(1.9123e-08)
Open AI Spinning Up KL Divergence tensor(-4.0679e-06)
37 Implimentation Details KL Divergence tensor(1.8974e-08)
Open AI Spinning Up KL Divergence tensor(-1.9020e-06)
37 Implimentation Details KL Divergence tensor(2.0961e-08)
Open AI Spinning Up KL Divergence tensor(1.7918e-06)
37 Implimentation Details KL Divergence tenso

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_16/videos/PPOAgent/16/PPOAgent-episode-79.mp4
score:  20.0
Training Step:  17


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.2396e-06)
37 Implimentation Details KL Divergence tensor(1.5249e-08)
Open AI Spinning Up KL Divergence tensor(-4.0279e-06)
37 Implimentation Details KL Divergence tensor(1.4702e-08)
Open AI Spinning Up KL Divergence tensor(-5.0242e-06)
37 Implimentation Details KL Divergence tensor(1.8924e-08)
Open AI Spinning Up KL Divergence tensor(4.2731e-06)
37 Implimentation Details KL Divergence tensor(2.0166e-08)
Open AI Spinning Up KL Divergence tensor(-6.1348e-06)
37 Implimentation Details KL Divergence tensor(2.4239e-08)
Open AI Spinning Up KL Divergence tensor(-3.8315e-06)
37 Implimentation Details KL Divergence tensor(2.9802e-08)
Open AI Spinning Up KL Divergence tensor(-6.1424e-06)
37 Implimentation Details KL Divergence tensor(3.6259e-08)
Open AI Spinning Up KL Divergence tensor(4.8634e-06)
37 Implimentation Details KL Divergence tensor(4.4207e-08)
Open AI Spinning Up KL Divergence tensor(-1.1787e-05)
37 Implimentation Details KL Divergence tens

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_17/videos/PPOAgent/17/PPOAgent-episode-84.mp4
score:  27.0
Training Step:  18


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-3.4271e-06)
37 Implimentation Details KL Divergence tensor(1.4404e-08)
Open AI Spinning Up KL Divergence tensor(3.5939e-06)
37 Implimentation Details KL Divergence tensor(1.7683e-08)
Open AI Spinning Up KL Divergence tensor(-2.6143e-06)
37 Implimentation Details KL Divergence tensor(1.7534e-08)
Open AI Spinning Up KL Divergence tensor(-1.4901e-10)
37 Implimentation Details KL Divergence tensor(2.2550e-08)
Open AI Spinning Up KL Divergence tensor(1.6935e-06)
37 Implimentation Details KL Divergence tensor(2.5928e-08)
Open AI Spinning Up KL Divergence tensor(2.9227e-06)
37 Implimentation Details KL Divergence tensor(3.1839e-08)
Open AI Spinning Up KL Divergence tensor(2.3823e-06)
37 Implimentation Details KL Divergence tensor(3.7551e-08)
Open AI Spinning Up KL Divergence tensor(-4.3186e-06)
37 Implimentation Details KL Divergence tensor(4.2369e-08)
Open AI Spinning Up KL Divergence tensor(5.4147e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_18/videos/PPOAgent/18/PPOAgent-episode-89.mp4
score:  22.0
Training Step:  19


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-6.4514e-06)
37 Implimentation Details KL Divergence tensor(1.5497e-08)
Open AI Spinning Up KL Divergence tensor(-2.1800e-07)
37 Implimentation Details KL Divergence tensor(1.6540e-08)
Open AI Spinning Up KL Divergence tensor(-9.0862e-07)
37 Implimentation Details KL Divergence tensor(1.7037e-08)
Open AI Spinning Up KL Divergence tensor(7.2072e-06)
37 Implimentation Details KL Divergence tensor(1.5348e-08)
Open AI Spinning Up KL Divergence tensor(-5.7822e-06)
37 Implimentation Details KL Divergence tensor(1.8080e-08)
Open AI Spinning Up KL Divergence tensor(-9.0475e-07)
37 Implimentation Details KL Divergence tensor(1.6640e-08)
Open AI Spinning Up KL Divergence tensor(-4.6258e-07)
37 Implimentation Details KL Divergence tensor(1.7534e-08)
Open AI Spinning Up KL Divergence tensor(7.2343e-06)
37 Implimentation Details KL Divergence tensor(1.7434e-08)
Open AI Spinning Up KL Divergence tensor(-5.9097e-06)
37 Implimentation Details KL Divergence tens

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_19/videos/PPOAgent/19/PPOAgent-episode-94.mp4
score:  21.0
Training Step:  20


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(2.6217e-06)
37 Implimentation Details KL Divergence tensor(1.5348e-08)
Open AI Spinning Up KL Divergence tensor(-7.5648e-08)
37 Implimentation Details KL Divergence tensor(1.6491e-08)
Open AI Spinning Up KL Divergence tensor(7.0077e-06)
37 Implimentation Details KL Divergence tensor(1.5001e-08)
Open AI Spinning Up KL Divergence tensor(1.0664e-07)
37 Implimentation Details KL Divergence tensor(1.6789e-08)
Open AI Spinning Up KL Divergence tensor(1.2059e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(4.3678e-06)
37 Implimentation Details KL Divergence tensor(1.9123e-08)
Open AI Spinning Up KL Divergence tensor(7.6900e-06)
37 Implimentation Details KL Divergence tensor(2.5332e-08)
Open AI Spinning Up KL Divergence tensor(4.0788e-06)
37 Implimentation Details KL Divergence tensor(2.9057e-08)
Open AI Spinning Up KL Divergence tensor(-1.6379e-06)
37 Implimentation Details KL Divergence tensor(3.

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_20/videos/PPOAgent/20/PPOAgent-episode-99.mp4
score:  13.0




Training Step:  21


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(4.5567e-06)
37 Implimentation Details KL Divergence tensor(1.5100e-08)
Open AI Spinning Up KL Divergence tensor(2.3784e-06)
37 Implimentation Details KL Divergence tensor(1.5199e-08)
Open AI Spinning Up KL Divergence tensor(8.6948e-07)
37 Implimentation Details KL Divergence tensor(1.7583e-08)
Open AI Spinning Up KL Divergence tensor(4.3512e-06)
37 Implimentation Details KL Divergence tensor(1.9719e-08)
Open AI Spinning Up KL Divergence tensor(2.7766e-06)
37 Implimentation Details KL Divergence tensor(2.2401e-08)
Open AI Spinning Up KL Divergence tensor(-3.0619e-06)
37 Implimentation Details KL Divergence tensor(2.9057e-08)
Open AI Spinning Up KL Divergence tensor(-3.7526e-06)
37 Implimentation Details KL Divergence tensor(3.3230e-08)
Open AI Spinning Up KL Divergence tensor(7.0884e-06)
37 Implimentation Details KL Divergence tensor(4.2071e-08)
Open AI Spinning Up KL Divergence tensor(1.1525e-06)
37 Implimentation Details KL Divergence tensor(5.

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_21/videos/PPOAgent/21/PPOAgent-episode-104.mp4
score:  13.0




Training Step:  22


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-8.4613e-06)
37 Implimentation Details KL Divergence tensor(1.5100e-08)
Open AI Spinning Up KL Divergence tensor(-3.6339e-07)
37 Implimentation Details KL Divergence tensor(1.4901e-08)
Open AI Spinning Up KL Divergence tensor(-3.1367e-06)
37 Implimentation Details KL Divergence tensor(1.5696e-08)
Open AI Spinning Up KL Divergence tensor(-1.9922e-06)
37 Implimentation Details KL Divergence tensor(1.9024e-08)
Open AI Spinning Up KL Divergence tensor(-6.3461e-06)
37 Implimentation Details KL Divergence tensor(1.8974e-08)
Open AI Spinning Up KL Divergence tensor(1.5214e-06)
37 Implimentation Details KL Divergence tensor(2.2302e-08)
Open AI Spinning Up KL Divergence tensor(-5.7240e-06)
37 Implimentation Details KL Divergence tensor(2.4090e-08)
Open AI Spinning Up KL Divergence tensor(-4.7694e-07)
37 Implimentation Details KL Divergence tensor(2.5332e-08)
Open AI Spinning Up KL Divergence tensor(-2.8367e-06)
37 Implimentation Details KL Divergence ten

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_22/videos/PPOAgent/22/PPOAgent-episode-109.mp4
score:  31.0
Training Step:  23


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(7.3110e-07)
37 Implimentation Details KL Divergence tensor(1.4603e-08)
Open AI Spinning Up KL Divergence tensor(-2.9228e-06)
37 Implimentation Details KL Divergence tensor(1.5795e-08)
Open AI Spinning Up KL Divergence tensor(4.3049e-06)
37 Implimentation Details KL Divergence tensor(1.7236e-08)
Open AI Spinning Up KL Divergence tensor(-1.4043e-05)
37 Implimentation Details KL Divergence tensor(1.8130e-08)
Open AI Spinning Up KL Divergence tensor(1.6703e-06)
37 Implimentation Details KL Divergence tensor(1.6838e-08)
Open AI Spinning Up KL Divergence tensor(-3.8469e-06)
37 Implimentation Details KL Divergence tensor(1.9620e-08)
Open AI Spinning Up KL Divergence tensor(3.7030e-06)
37 Implimentation Details KL Divergence tensor(1.8030e-08)
Open AI Spinning Up KL Divergence tensor(-1.3946e-05)
37 Implimentation Details KL Divergence tensor(1.9570e-08)
Open AI Spinning Up KL Divergence tensor(3.1880e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_23/videos/PPOAgent/23/PPOAgent-episode-114.mp4
score:  15.0
Training Step:  24


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(5.2397e-07)
37 Implimentation Details KL Divergence tensor(1.5100e-08)
Open AI Spinning Up KL Divergence tensor(-3.4926e-06)
37 Implimentation Details KL Divergence tensor(1.6143e-08)
Open AI Spinning Up KL Divergence tensor(5.9123e-06)
37 Implimentation Details KL Divergence tensor(1.5497e-08)
Open AI Spinning Up KL Divergence tensor(9.4752e-07)
37 Implimentation Details KL Divergence tensor(1.7385e-08)
Open AI Spinning Up KL Divergence tensor(6.0663e-07)
37 Implimentation Details KL Divergence tensor(1.5646e-08)
Open AI Spinning Up KL Divergence tensor(-2.6067e-06)
37 Implimentation Details KL Divergence tensor(1.6888e-08)
Open AI Spinning Up KL Divergence tensor(5.3368e-06)
37 Implimentation Details KL Divergence tensor(1.8130e-08)
Open AI Spinning Up KL Divergence tensor(6.0722e-07)
37 Implimentation Details KL Divergence tensor(1.7484e-08)
Open AI Spinning Up KL Divergence tensor(-5.0664e-09)
37 Implimentation Details KL Divergence tensor(1

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_24/videos/PPOAgent/24/PPOAgent-episode-119.mp4
score:  44.0
Training Step:  25


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(3.2131e-06)
37 Implimentation Details KL Divergence tensor(1.5646e-08)
Open AI Spinning Up KL Divergence tensor(-4.2713e-06)
37 Implimentation Details KL Divergence tensor(1.6093e-08)
Open AI Spinning Up KL Divergence tensor(1.6712e-06)
37 Implimentation Details KL Divergence tensor(1.8676e-08)
Open AI Spinning Up KL Divergence tensor(3.3093e-06)
37 Implimentation Details KL Divergence tensor(1.8477e-08)
Open AI Spinning Up KL Divergence tensor(9.4975e-07)
37 Implimentation Details KL Divergence tensor(2.7517e-08)
Open AI Spinning Up KL Divergence tensor(-7.5097e-06)
37 Implimentation Details KL Divergence tensor(3.2783e-08)
Open AI Spinning Up KL Divergence tensor(2.9619e-06)
37 Implimentation Details KL Divergence tensor(4.3909e-08)
Open AI Spinning Up KL Divergence tensor(1.2236e-05)
37 Implimentation Details KL Divergence tensor(5.7171e-08)
Open AI Spinning Up KL Divergence tensor(-2.2317e-06)
37 Implimentation Details KL Divergence tensor(7

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_25/videos/PPOAgent/25/PPOAgent-episode-124.mp4
score:  17.0
Training Step:  26


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(9.2551e-07)
37 Implimentation Details KL Divergence tensor(1.4404e-08)
Open AI Spinning Up KL Divergence tensor(-5.7299e-06)
37 Implimentation Details KL Divergence tensor(1.6491e-08)
Open AI Spinning Up KL Divergence tensor(6.1125e-07)
37 Implimentation Details KL Divergence tensor(1.7186e-08)
Open AI Spinning Up KL Divergence tensor(-2.4959e-06)
37 Implimentation Details KL Divergence tensor(1.7484e-08)
Open AI Spinning Up KL Divergence tensor(1.8069e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(-3.3847e-06)
37 Implimentation Details KL Divergence tensor(2.2004e-08)
Open AI Spinning Up KL Divergence tensor(3.6210e-08)
37 Implimentation Details KL Divergence tensor(2.1110e-08)
Open AI Spinning Up KL Divergence tensor(-3.2694e-06)
37 Implimentation Details KL Divergence tensor(2.0862e-08)
Open AI Spinning Up KL Divergence tensor(2.3374e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_26/videos/PPOAgent/26/PPOAgent-episode-129.mp4
score:  18.0
Training Step:  27


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.8487e-06)
37 Implimentation Details KL Divergence tensor(1.5994e-08)
Open AI Spinning Up KL Divergence tensor(7.3073e-06)
37 Implimentation Details KL Divergence tensor(1.5845e-08)
Open AI Spinning Up KL Divergence tensor(1.7360e-06)
37 Implimentation Details KL Divergence tensor(1.8080e-08)
Open AI Spinning Up KL Divergence tensor(-1.8350e-06)
37 Implimentation Details KL Divergence tensor(1.8428e-08)
Open AI Spinning Up KL Divergence tensor(-2.5608e-06)
37 Implimentation Details KL Divergence tensor(2.1110e-08)
Open AI Spinning Up KL Divergence tensor(9.6216e-06)
37 Implimentation Details KL Divergence tensor(2.3345e-08)
Open AI Spinning Up KL Divergence tensor(1.4798e-06)
37 Implimentation Details KL Divergence tensor(2.9902e-08)
Open AI Spinning Up KL Divergence tensor(-1.7709e-06)
37 Implimentation Details KL Divergence tensor(3.4372e-08)
Open AI Spinning Up KL Divergence tensor(-3.2858e-06)
37 Implimentation Details KL Divergence tensor

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_27/videos/PPOAgent/27/PPOAgent-episode-134.mp4
score:  23.0
Training Step:  28


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.7592e-06)
37 Implimentation Details KL Divergence tensor(1.5547e-08)
Open AI Spinning Up KL Divergence tensor(-2.4823e-06)
37 Implimentation Details KL Divergence tensor(1.5746e-08)
Open AI Spinning Up KL Divergence tensor(4.2935e-06)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(2.1503e-06)
37 Implimentation Details KL Divergence tensor(1.6292e-08)
Open AI Spinning Up KL Divergence tensor(-1.9250e-06)
37 Implimentation Details KL Divergence tensor(1.6342e-08)
Open AI Spinning Up KL Divergence tensor(-2.4952e-06)
37 Implimentation Details KL Divergence tensor(1.6292e-08)
Open AI Spinning Up KL Divergence tensor(4.3216e-06)
37 Implimentation Details KL Divergence tensor(1.6888e-08)
Open AI Spinning Up KL Divergence tensor(1.7176e-06)
37 Implimentation Details KL Divergence tensor(1.5895e-08)
Open AI Spinning Up KL Divergence tensor(-3.0611e-06)
37 Implimentation Details KL Divergence tensor

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_28/videos/PPOAgent/28/PPOAgent-episode-139.mp4
score:  29.0
Training Step:  29


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(7.0726e-07)
37 Implimentation Details KL Divergence tensor(1.5001e-08)
Open AI Spinning Up KL Divergence tensor(2.1005e-06)
37 Implimentation Details KL Divergence tensor(1.4901e-08)
Open AI Spinning Up KL Divergence tensor(-3.5173e-06)
37 Implimentation Details KL Divergence tensor(1.7931e-08)
Open AI Spinning Up KL Divergence tensor(-7.5837e-06)
37 Implimentation Details KL Divergence tensor(1.9123e-08)
Open AI Spinning Up KL Divergence tensor(6.7314e-06)
37 Implimentation Details KL Divergence tensor(2.1060e-08)
Open AI Spinning Up KL Divergence tensor(2.6103e-06)
37 Implimentation Details KL Divergence tensor(2.4488e-08)
Open AI Spinning Up KL Divergence tensor(-3.1104e-06)
37 Implimentation Details KL Divergence tensor(2.6772e-08)
Open AI Spinning Up KL Divergence tensor(-7.0947e-06)
37 Implimentation Details KL Divergence tensor(2.9256e-08)
Open AI Spinning Up KL Divergence tensor(1.2880e-05)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_29/videos/PPOAgent/29/PPOAgent-episode-144.mp4
score:  16.0
Training Step:  30


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(6.4432e-06)
37 Implimentation Details KL Divergence tensor(1.3957e-08)
Open AI Spinning Up KL Divergence tensor(-4.4229e-06)
37 Implimentation Details KL Divergence tensor(1.7881e-08)
Open AI Spinning Up KL Divergence tensor(-4.8671e-06)
37 Implimentation Details KL Divergence tensor(1.9421e-08)
Open AI Spinning Up KL Divergence tensor(-2.0820e-06)
37 Implimentation Details KL Divergence tensor(2.4686e-08)
Open AI Spinning Up KL Divergence tensor(1.1566e-05)
37 Implimentation Details KL Divergence tensor(3.1590e-08)
Open AI Spinning Up KL Divergence tensor(-2.6608e-06)
37 Implimentation Details KL Divergence tensor(3.9041e-08)
Open AI Spinning Up KL Divergence tensor(-3.4779e-06)
37 Implimentation Details KL Divergence tensor(5.0068e-08)
Open AI Spinning Up KL Divergence tensor(-2.4417e-06)
37 Implimentation Details KL Divergence tensor(6.7204e-08)
Open AI Spinning Up KL Divergence tensor(1.6401e-05)
37 Implimentation Details KL Divergence tenso

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_30/videos/PPOAgent/30/PPOAgent-episode-149.mp4
score:  12.0




Training Step:  31


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(2.2349e-06)
37 Implimentation Details KL Divergence tensor(1.6292e-08)
Open AI Spinning Up KL Divergence tensor(-2.0914e-06)
37 Implimentation Details KL Divergence tensor(1.6739e-08)
Open AI Spinning Up KL Divergence tensor(5.6735e-06)
37 Implimentation Details KL Divergence tensor(1.9421e-08)
Open AI Spinning Up KL Divergence tensor(-3.5295e-06)
37 Implimentation Details KL Divergence tensor(2.6574e-08)
Open AI Spinning Up KL Divergence tensor(4.0025e-06)
37 Implimentation Details KL Divergence tensor(3.1839e-08)
Open AI Spinning Up KL Divergence tensor(4.1779e-06)
37 Implimentation Details KL Divergence tensor(3.8644e-08)
Open AI Spinning Up KL Divergence tensor(5.8311e-06)
37 Implimentation Details KL Divergence tensor(4.2667e-08)
Open AI Spinning Up KL Divergence tensor(-6.3528e-06)
37 Implimentation Details KL Divergence tensor(5.4638e-08)
Open AI Spinning Up KL Divergence tensor(4.4387e-06)
37 Implimentation Details KL Divergence tensor(6

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_31/videos/PPOAgent/31/PPOAgent-episode-154.mp4
score:  13.0
Training Step:  32


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.9224e-06)
37 Implimentation Details KL Divergence tensor(1.4802e-08)
Open AI Spinning Up KL Divergence tensor(1.6955e-06)
37 Implimentation Details KL Divergence tensor(1.5646e-08)
Open AI Spinning Up KL Divergence tensor(-1.2267e-06)
37 Implimentation Details KL Divergence tensor(1.5696e-08)
Open AI Spinning Up KL Divergence tensor(1.2544e-06)
37 Implimentation Details KL Divergence tensor(2.1259e-08)
Open AI Spinning Up KL Divergence tensor(-6.3384e-06)
37 Implimentation Details KL Divergence tensor(2.3047e-08)
Open AI Spinning Up KL Divergence tensor(-2.5723e-06)
37 Implimentation Details KL Divergence tensor(2.5133e-08)
Open AI Spinning Up KL Divergence tensor(-6.3522e-06)
37 Implimentation Details KL Divergence tensor(2.9951e-08)
Open AI Spinning Up KL Divergence tensor(6.4705e-06)
37 Implimentation Details KL Divergence tensor(3.8594e-08)
Open AI Spinning Up KL Divergence tensor(-1.3049e-05)
37 Implimentation Details KL Divergence tenso

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_32/videos/PPOAgent/32/PPOAgent-episode-159.mp4
score:  42.0
Training Step:  33


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.0540e-06)
37 Implimentation Details KL Divergence tensor(1.5150e-08)
Open AI Spinning Up KL Divergence tensor(1.0351e-07)
37 Implimentation Details KL Divergence tensor(1.6044e-08)
Open AI Spinning Up KL Divergence tensor(-2.8577e-06)
37 Implimentation Details KL Divergence tensor(1.8477e-08)
Open AI Spinning Up KL Divergence tensor(-4.2471e-06)
37 Implimentation Details KL Divergence tensor(2.0564e-08)
Open AI Spinning Up KL Divergence tensor(-9.5344e-06)
37 Implimentation Details KL Divergence tensor(2.2103e-08)
Open AI Spinning Up KL Divergence tensor(2.8533e-06)
37 Implimentation Details KL Divergence tensor(2.6276e-08)
Open AI Spinning Up KL Divergence tensor(-6.9610e-06)
37 Implimentation Details KL Divergence tensor(2.7617e-08)
Open AI Spinning Up KL Divergence tensor(-6.3568e-06)
37 Implimentation Details KL Divergence tensor(2.9653e-08)
Open AI Spinning Up KL Divergence tensor(-1.3571e-05)
37 Implimentation Details KL Divergence tens

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_33/videos/PPOAgent/33/PPOAgent-episode-164.mp4
score:  46.0
Training Step:  34


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(9.5318e-08)
37 Implimentation Details KL Divergence tensor(1.5944e-08)
Open AI Spinning Up KL Divergence tensor(-3.1243e-06)
37 Implimentation Details KL Divergence tensor(1.5895e-08)
Open AI Spinning Up KL Divergence tensor(-8.4067e-07)
37 Implimentation Details KL Divergence tensor(1.4702e-08)
Open AI Spinning Up KL Divergence tensor(3.5999e-06)
37 Implimentation Details KL Divergence tensor(1.7285e-08)
Open AI Spinning Up KL Divergence tensor(9.3132e-08)
37 Implimentation Details KL Divergence tensor(1.4752e-08)
Open AI Spinning Up KL Divergence tensor(-1.7964e-06)
37 Implimentation Details KL Divergence tensor(1.6491e-08)
Open AI Spinning Up KL Divergence tensor(2.5113e-07)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(4.5323e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(-3.5038e-07)
37 Implimentation Details KL Divergence tensor(

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_34/videos/PPOAgent/34/PPOAgent-episode-169.mp4
score:  11.0




Training Step:  35


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.4453e-06)
37 Implimentation Details KL Divergence tensor(1.5001e-08)
Open AI Spinning Up KL Divergence tensor(4.6112e-06)
37 Implimentation Details KL Divergence tensor(1.4951e-08)
Open AI Spinning Up KL Divergence tensor(1.1181e-05)
37 Implimentation Details KL Divergence tensor(1.7981e-08)
Open AI Spinning Up KL Divergence tensor(7.9402e-06)
37 Implimentation Details KL Divergence tensor(1.9024e-08)
Open AI Spinning Up KL Divergence tensor(8.4142e-07)
37 Implimentation Details KL Divergence tensor(2.1408e-08)
Open AI Spinning Up KL Divergence tensor(2.1288e-06)
37 Implimentation Details KL Divergence tensor(2.6723e-08)
Open AI Spinning Up KL Divergence tensor(1.8431e-05)
37 Implimentation Details KL Divergence tensor(3.5067e-08)
Open AI Spinning Up KL Divergence tensor(1.2699e-05)
37 Implimentation Details KL Divergence tensor(3.5912e-08)
Open AI Spinning Up KL Divergence tensor(3.4081e-06)
37 Implimentation Details KL Divergence tensor(3.8

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_35/videos/PPOAgent/35/PPOAgent-episode-174.mp4
score:  11.0




Training Step:  36


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.8516e-06)
37 Implimentation Details KL Divergence tensor(1.6093e-08)
Open AI Spinning Up KL Divergence tensor(-6.4724e-06)
37 Implimentation Details KL Divergence tensor(1.6689e-08)
Open AI Spinning Up KL Divergence tensor(1.2358e-06)
37 Implimentation Details KL Divergence tensor(1.6590e-08)
Open AI Spinning Up KL Divergence tensor(4.0159e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(-7.1575e-08)
37 Implimentation Details KL Divergence tensor(1.8825e-08)
Open AI Spinning Up KL Divergence tensor(-7.9539e-06)
37 Implimentation Details KL Divergence tensor(1.8626e-08)
Open AI Spinning Up KL Divergence tensor(1.1898e-06)
37 Implimentation Details KL Divergence tensor(1.9272e-08)
Open AI Spinning Up KL Divergence tensor(4.0313e-06)
37 Implimentation Details KL Divergence tensor(2.1259e-08)
Open AI Spinning Up KL Divergence tensor(2.8176e-06)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_36/videos/PPOAgent/36/PPOAgent-episode-179.mp4
score:  14.0
Training Step:  37


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-1.1375e-07)
37 Implimentation Details KL Divergence tensor(1.5050e-08)
Open AI Spinning Up KL Divergence tensor(6.8272e-07)
37 Implimentation Details KL Divergence tensor(1.4851e-08)
Open AI Spinning Up KL Divergence tensor(2.6047e-06)
37 Implimentation Details KL Divergence tensor(1.6044e-08)
Open AI Spinning Up KL Divergence tensor(-1.5157e-06)
37 Implimentation Details KL Divergence tensor(1.7534e-08)
Open AI Spinning Up KL Divergence tensor(-4.3064e-08)
37 Implimentation Details KL Divergence tensor(1.8825e-08)
Open AI Spinning Up KL Divergence tensor(-2.3175e-06)
37 Implimentation Details KL Divergence tensor(1.8080e-08)
Open AI Spinning Up KL Divergence tensor(1.9404e-06)
37 Implimentation Details KL Divergence tensor(2.0564e-08)
Open AI Spinning Up KL Divergence tensor(1.5861e-06)
37 Implimentation Details KL Divergence tensor(2.4587e-08)
Open AI Spinning Up KL Divergence tensor(8.7490e-07)
37 Implimentation Details KL Divergence tensor(

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_37/videos/PPOAgent/37/PPOAgent-episode-184.mp4
score:  14.0
Training Step:  38


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-2.6636e-06)
37 Implimentation Details KL Divergence tensor(1.5547e-08)
Open AI Spinning Up KL Divergence tensor(3.7595e-06)
37 Implimentation Details KL Divergence tensor(1.6590e-08)
Open AI Spinning Up KL Divergence tensor(-4.7453e-06)
37 Implimentation Details KL Divergence tensor(1.5994e-08)
Open AI Spinning Up KL Divergence tensor(-1.6040e-06)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(-2.0964e-06)
37 Implimentation Details KL Divergence tensor(1.6590e-08)
Open AI Spinning Up KL Divergence tensor(6.6456e-06)
37 Implimentation Details KL Divergence tensor(1.8428e-08)
Open AI Spinning Up KL Divergence tensor(-4.0762e-06)
37 Implimentation Details KL Divergence tensor(1.9024e-08)
Open AI Spinning Up KL Divergence tensor(-3.4158e-07)
37 Implimentation Details KL Divergence tensor(2.1110e-08)
Open AI Spinning Up KL Divergence tensor(-2.0094e-06)
37 Implimentation Details KL Divergence tens

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_38/videos/PPOAgent/38/PPOAgent-episode-189.mp4
score:  14.0




Training Step:  39


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(1.6855e-06)
37 Implimentation Details KL Divergence tensor(1.5348e-08)
Open AI Spinning Up KL Divergence tensor(3.2336e-06)
37 Implimentation Details KL Divergence tensor(1.6342e-08)
Open AI Spinning Up KL Divergence tensor(-7.6617e-07)
37 Implimentation Details KL Divergence tensor(1.6540e-08)
Open AI Spinning Up KL Divergence tensor(-3.5188e-06)
37 Implimentation Details KL Divergence tensor(1.7782e-08)
Open AI Spinning Up KL Divergence tensor(2.0916e-06)
37 Implimentation Details KL Divergence tensor(1.6689e-08)
Open AI Spinning Up KL Divergence tensor(2.9692e-06)
37 Implimentation Details KL Divergence tensor(1.7732e-08)
Open AI Spinning Up KL Divergence tensor(-2.3051e-06)
37 Implimentation Details KL Divergence tensor(1.7683e-08)
Open AI Spinning Up KL Divergence tensor(-2.3724e-06)
37 Implimentation Details KL Divergence tensor(1.8527e-08)
Open AI Spinning Up KL Divergence tensor(9.1270e-07)
37 Implimentation Details KL Divergence tensor(

                                                             

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_39/videos/PPOAgent/39/PPOAgent-episode-194.mp4
score:  18.0




Training Step:  40


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(7.7300e-06)
37 Implimentation Details KL Divergence tensor(1.5895e-08)
Open AI Spinning Up KL Divergence tensor(-3.2289e-06)
37 Implimentation Details KL Divergence tensor(1.7285e-08)
Open AI Spinning Up KL Divergence tensor(3.3527e-06)
37 Implimentation Details KL Divergence tensor(1.6391e-08)
Open AI Spinning Up KL Divergence tensor(1.0505e-06)
37 Implimentation Details KL Divergence tensor(1.9272e-08)
Open AI Spinning Up KL Divergence tensor(9.0565e-06)
37 Implimentation Details KL Divergence tensor(1.9123e-08)
Open AI Spinning Up KL Divergence tensor(-5.9501e-06)
37 Implimentation Details KL Divergence tensor(1.9521e-08)
Open AI Spinning Up KL Divergence tensor(2.9279e-06)
37 Implimentation Details KL Divergence tensor(2.4140e-08)
Open AI Spinning Up KL Divergence tensor(3.4183e-07)
37 Implimentation Details KL Divergence tensor(2.5034e-08)
Open AI Spinning Up KL Divergence tensor(1.0214e-05)
37 Implimentation Details KL Divergence tensor(2.

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_40/videos/PPOAgent/40/PPOAgent-episode-199.mp4
score:  17.0
Training Step:  41


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Open AI Spinning Up KL Divergence tensor(-9.9343e-06)
37 Implimentation Details KL Divergence tensor(1.6640e-08)
Open AI Spinning Up KL Divergence tensor(-9.2010e-07)
37 Implimentation Details KL Divergence tensor(1.4752e-08)
Open AI Spinning Up KL Divergence tensor(-7.9853e-06)
37 Implimentation Details KL Divergence tensor(1.7732e-08)
Open AI Spinning Up KL Divergence tensor(-1.2964e-06)
37 Implimentation Details KL Divergence tensor(1.8080e-08)
Open AI Spinning Up KL Divergence tensor(-1.1012e-05)
37 Implimentation Details KL Divergence tensor(1.8626e-08)
Open AI Spinning Up KL Divergence tensor(-2.5973e-07)
37 Implimentation Details KL Divergence tensor(1.9868e-08)
Open AI Spinning Up KL Divergence tensor(-1.1286e-05)
37 Implimentation Details KL Divergence tensor(2.0713e-08)
Open AI Spinning Up KL Divergence tensor(-1.2457e-06)
37 Implimentation Details KL Divergence tensor(2.2699e-08)
Open AI Spinning Up KL Divergence tensor(-1.3064e-05)
37 Implimentation Details KL Divergence te

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/PPOAgent/step_41/videos/PPOAgent/41/PPOAgent-episode-204.mp4
score:  15.0


KeyboardInterrupt: 

In [None]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

class Agent(nn.Module):
    """Actor-critic wrapper around a shared feature extractor.

    The methods read three sub-modules from the instance: ``self.network``
    (shared trunk), ``self.actor`` (produces action logits) and
    ``self.critic`` (produces the value estimate). None of them is created
    in this class body, so they must be attached elsewhere before use.
    """

    def get_value(self, x):
        """Return the critic's output for the observations ``x``."""
        features = self.network(x)
        return self.critic(features)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Parameters:
            x - observations fed through ``self.network``
            action - actions to score; when ``None`` an action is sampled
                     from the categorical policy instead

        Return:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` come from the categorical distribution built
            from the actor logits and ``value`` is the critic output.
        """
        features = self.network(x)
        policy = Categorical(logits=self.actor(features))
        # Only draw from the policy when the caller did not supply actions.
        chosen = policy.sample() if action is None else action
        log_prob = policy.log_prob(chosen)
        entropy = policy.entropy()
        return chosen, log_prob, entropy, self.critic(features)


# Script entry point: repeatedly collects a fixed-length rollout from `envs`,
# computes advantages/returns, then runs several epochs of minibatch updates
# with the clipped-surrogate PPO loss.
# NOTE(review): `self`, `envs` and `device` are not defined anywhere in this
# cell; the loop looks like it was lifted out of a class or a larger script.
# Confirm where those names are meant to come from before running it.
if __name__ == "__main__":
    agent = Agent(envs).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=self.learning_rate, eps=1e-5)

    # ALGO Logic: Storage setup
    # One slot per (step, env); obs/actions additionally carry the per-env
    # observation/action shape as trailing dimensions.
    obs = torch.zeros((self.num_steps, self.num_envs) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((self.num_steps, self.num_envs) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((self.num_steps, self.num_envs)).to(device)
    rewards = torch.zeros((self.num_steps, self.num_envs)).to(device)
    dones = torch.zeros((self.num_steps, self.num_envs)).to(device)
    values = torch.zeros((self.num_steps, self.num_envs)).to(device)

    # TRY NOT TO MODIFY: start the game
    # `global_step` and `start_time` are maintained but not read again in
    # this block.
    global_step = 0
    start_time = time.time()
    # assumes envs.reset() returns only the observation batch (no info
    # tuple) — TODO confirm against the installed gym version
    next_obs = torch.Tensor(envs.reset()).to(device)
    next_done = torch.zeros(self.num_envs).to(device)
    num_updates = self.total_timesteps // self.batch_size

    for update in range(1, num_updates + 1):
        # Annealing the rate if instructed to do so.
        # Linear schedule: frac is 1.0 on the first update and
        # 1/num_updates on the last. Only param_groups[0] is updated.
        if self.anneal_lr:
            frac = 1.0 - (update - 1.0) / num_updates
            lrnow = frac * self.learning_rate
            optimizer.param_groups[0]["lr"] = lrnow

        # ---- Rollout collection -------------------------------------
        for step in range(0, self.num_steps):
            # Counts environment steps summed over all parallel envs.
            global_step += 1 * self.num_envs
            # Store the observation we are about to act on together with
            # the done flag that arrived with it.
            obs[step] = next_obs
            dones[step] = next_done

            # ALGO LOGIC: action logic
            # Sampling and value prediction need no gradients here; the
            # stored log-probs/values act as the "old" policy quantities
            # during the update phase below.
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            # The step result is unpacked as a 4-tuple
            # (obs, reward, done, info).
            next_obs, reward, done, info = envs.step(action.cpu().numpy())
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        # bootstrap value if not done
        # ---- Advantage / return estimation (no gradients) -----------
        with torch.no_grad():
            # Value of the observation that follows the last stored step.
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if self.gae:
                # Generalised Advantage Estimation, computed backwards in
                # time so each step can reuse the accumulator of the next.
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(self.num_steps)):
                    if t == self.num_steps - 1:
                        # Last stored step: bootstrap from the post-rollout
                        # observation and its done flag.
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    # One-step TD error; the bootstrap term is zeroed when
                    # the following step is flagged done.
                    delta = rewards[t] + self.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = delta + self.gamma * self.gae_lambda * nextnonterminal * lastgaelam
                returns = advantages + values
            else:
                # Plain discounted returns, bootstrapped from next_value.
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(self.num_steps)):
                    if t == self.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + self.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch
        # Merge the (step, env) axes into a single leading batch axis.
        b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        # NOTE(review): indices range over self.batch_size — presumably
        # equal to num_steps * num_envs; verify against the config.
        b_inds = np.arange(self.batch_size)
        # Per-minibatch fraction of samples whose ratio left the clip
        # range; collected here but not read again in this block.
        clipfracs = []
        for epoch in range(self.update_epochs):
            # Fresh random minibatch partition every epoch.
            np.random.shuffle(b_inds)
            for start in range(0, self.batch_size, self.minibatch_size):
                end = start + self.minibatch_size
                mb_inds = b_inds[start:end]

                # Re-evaluate the stored actions under the current policy.
                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    # Two estimators are computed; only `approx_kl` is
                    # used later (for early stopping), `old_approx_kl` is
                    # not read again in this block.
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > self.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if self.norm_adv:
                    # Normalised per minibatch; 1e-8 guards against a zero
                    # standard deviation.
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Policy loss
                # Both terms are negated objectives, so taking the max
                # selects the more pessimistic (clipped vs. unclipped) one.
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - self.clip_coef, 1 + self.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if self.clip_vloss:
                    # Limit how far the new prediction may move from the
                    # rollout-time value (±clip_coef), then take the larger
                    # of the clipped and unclipped squared errors.
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -self.clip_coef,
                        self.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                # Mean policy entropy; it is subtracted below, so higher
                # entropy lowers the total loss.
                entropy_loss = entropy.mean()
                loss = pg_loss - self.ent_coef * entropy_loss + v_loss * self.vf_coef

                # Single optimizer over all agent parameters, with the
                # gradient norm clipped to max_grad_norm before the step.
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), self.max_grad_norm)
                optimizer.step()

            # Early stopping, checked once per epoch: `approx_kl` here is
            # the value from the last minibatch of the epoch that just
            # finished, and `break` leaves only the epoch loop.
            if self.target_kl is not None:
                if approx_kl > self.target_kl:
                    break

    envs.close()


In [None]:
"""
	This file is the executable for running PPO. It is based on this medium article: 
	https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
"""

import gymnasium as gym
import sys
import torch

from test_ppo_agent import FeedForwardNN, PPO

def train(env, hyperparameters, actor_model, critic_model, total_timesteps=200_000_000):
	"""
		Trains a PPO model, optionally resuming from saved actor/critic weights.

		Parameters:
			env - the environment to train on
			hyperparameters - a dict of hyperparameters, forwarded to PPO as keyword arguments
			actor_model - file given to torch.load for the actor weights, or '' to start fresh
			critic_model - file given to torch.load for the critic weights, or '' to start fresh
			total_timesteps - value passed to model.learn. The default is deliberately huge:
				the run is meant to be interrupted once PPO looks converged.

		Return:
			None

		Raises:
			SystemExit - with status 1 when exactly one of actor_model / critic_model is given
	"""
	print("Training", flush=True)

	resume_actor = actor_model != ''
	resume_critic = critic_model != ''

	# Validate the checkpoint arguments before building anything, so that a forgotten
	# actor/critic file aborts immediately instead of after the model has been created.
	if resume_actor != resume_critic:
		print("Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!", flush=True)
		# This is a failure, so report a non-zero status to whoever launched us.
		sys.exit(1)

	# Create a model for PPO.
	model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

	if resume_actor:
		# Both files were supplied (checked above): continue training from them.
		print(f"Loading in {actor_model} and {critic_model}...", flush=True)
		model.actor.load_state_dict(torch.load(actor_model))
		model.critic.load_state_dict(torch.load(critic_model))
		print("Successfully loaded.", flush=True)
	else:
		print("Training from scratch.", flush=True)

	# Train the PPO model for the requested number of timesteps.
	model.learn(total_timesteps=total_timesteps)

def test(env, actor_model):
	"""
		Tests the model.

		Parameters:
			env - the environment to test the policy on
			actor_model - the actor model to load in

		Return:
			None
	"""
	print(f"Testing {actor_model}", flush=True)

	# If the actor model is not specified, then exit
	if actor_model == '':
		print(f"Didn't specify model file. Exiting.", flush=True)
		sys.exit(0)

	# Extract out dimensions of observation and action spaces
	obs_dim = env.observation_space.shape[0]
	act_dim = env.action_space.shape[0]

	# Build our policy the same way we build our actor model in PPO
	policy = FeedForwardNN(obs_dim, act_dim)

	# Load in the actor model saved by the PPO algorithm
	policy.load_state_dict(torch.load(actor_model))

def main(mode='train', actor_model='', critic_model=''):
	"""
		The main function to run: builds the environment and hyperparameters,
		then dispatches to train() or test().

		Parameters:
			mode - 'train' (default) to train, 'test' to load an actor via test()
			actor_model - actor weights file forwarded to train()/test(); '' for none
			critic_model - critic weights file forwarded to train(); '' for none

		Return:
			None

		Raises:
			ValueError - if mode is neither 'train' nor 'test'
	"""
	# Reject unknown modes up front, before any environment is created.
	if mode not in ('train', 'test'):
		raise ValueError(f"Unknown mode {mode!r}; expected 'train' or 'test'.")

	# NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
	# ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
	# To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
	hyperparameters = {
				'timesteps_per_batch': 2048,
				'max_timesteps_per_episode': 200,
				'gamma': 0.99,
				'n_updates_per_iteration': 10,
				'lr': 3e-4,
				'clip': 0.2,
				'render': True,
				'render_every_i': 10
			  }

	# Creates the environment we'll be running. If you want to replace with your own
	# custom environment, note that it must inherit Gym and have both continuous
	# observation and action spaces.
	# NOTE(review): CartPole-v1 is documented with a discrete action space, which does not
	# match the "continuous action space" requirement above - confirm this env is intended.
	env = gym.make('CartPole-v1')

	try:
		# Train or test, depending on the mode specified
		if mode == 'train':
			train(env=env, hyperparameters=hyperparameters, actor_model=actor_model, critic_model=critic_model)
		else:
			test(env=env, actor_model=actor_model)
	finally:
		# Release the environment even when the run is interrupted (e.g. Ctrl-C during learn).
		env.close()

# Run the entry point as soon as this cell executes.
main()



Training
Training from scratch.
Learning... Running 200 timesteps per episode, 2048 timesteps per batch for a total of 200000000 timesteps


  gym.logger.warn(



-------------------- Iteration #1 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1239.03
Average Loss: -0.00148
Timesteps So Far: 2200
Iteration took: 0.65 secs
------------------------------------------------------



  batch_obs = torch.tensor(batch_obs, dtype=torch.float)



-------------------- Iteration #2 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1301.04
Average Loss: -0.00106
Timesteps So Far: 4400
Iteration took: 1.04 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1196.74
Average Loss: -0.00104
Timesteps So Far: 6600
Iteration took: 0.82 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1222.07
Average Loss: -0.00059
Timesteps So Far: 8800
Iteration took: 0.75 secs
------------------------------------------------------


-------------------- Iteration #5 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1265.79
Average Loss: -0.00092
Timesteps So Far: 11000
Iteration took: 1.31 secs
------------------------------------------------------


--

KeyboardInterrupt: 