In [1]:
import os
import warnings

import gym
import numpy as np
import tensorflow as tf

from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import ActorCriticPolicy, register_policy, mlp_extractor, MlpPolicy
from stable_baselines.common.tf_layers import linear
from stable_baselines.common.vec_env import DummyVecEnv

In [2]:
# Custom actor-critic MLP policy: the output of an MLP feature extractor is followed by
# three ReLU layers of 128 units for the actor and two ReLU layers of 32 units for the critic.
class CustomPolicyNew(ActorCriticPolicy):
    """Actor-critic policy with hand-built policy and value heads.

    The flattened, processed observation goes through ``mlp_extractor`` (called with
    ``net_arch=[dict(vf=[64, 64], pi=[64, 64])]`` and ``tf.tanh``).  The policy output of
    the extractor is then passed through dense layers ``pi_fc0``..``pi_fc2`` (128 units,
    ReLU), the value output through ``vf_fc0``..``vf_fc1`` (32 units, ReLU), and a final
    single-unit dense layer ``vf`` produces the value estimate.
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
        """Build the policy graph inside the ``"model"`` variable scope.

        ``sess``, ``ob_space``, ``ac_space``, ``n_env``, ``n_steps``, ``n_batch`` and
        ``reuse`` are handed to ``ActorCriticPolicy.__init__`` unchanged (with
        ``scale=False``); ``reuse`` is also used for the variable scope.  Any extra
        ``kwargs`` are forwarded to ``mlp_extractor``.
        """
        super(CustomPolicyNew, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                              reuse=reuse, scale=False)

        def relu_stack(tensor, widths, prefix):
            # Chain ReLU-activated dense layers named "<prefix>0", "<prefix>1", ...
            for idx, width in enumerate(widths):
                tensor = tf.nn.relu(tf.layers.dense(tensor, width, name=prefix + str(idx)))
            return tensor

        with tf.variable_scope("model", reuse=reuse):
            flat_obs = tf.layers.flatten(self.processed_obs)
            pi_features, vf_features = mlp_extractor(flat_obs,
                                                     net_arch=[dict(vf=[64, 64], pi=[64, 64])],
                                                     act_fun=tf.tanh, **kwargs)

            # Actor head first, then critic head (same construction order as before).
            pi_latent = relu_stack(pi_features, [128, 128, 128], 'pi_fc')
            vf_latent = relu_stack(vf_features, [32, 32], 'vf_fc')
            value_fn = tf.layers.dense(vf_latent, 1, name='vf')

            distribution_parts = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = distribution_parts

        self._value_fn = value_fn
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        """Run the policy on ``obs``.

        ``state`` and ``mask`` are accepted but not used.  Returns the tuple
        ``(action, value, self.initial_state, neglogp)``; the action tensor evaluated is
        ``self.deterministic_action`` when ``deterministic`` is true, ``self.action``
        otherwise.
        """
        action_op = self.deterministic_action if deterministic else self.action
        fetches = [action_op, self.value_flat, self.neglogp]
        action, value, neglogp = self.sess.run(fetches, {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        """Evaluate ``self.policy_proba`` for ``obs`` (``state`` and ``mask`` are unused)."""
        feed = {self.obs_ph: obs}
        return self.sess.run(self.policy_proba, feed)

    def value(self, obs, state=None, mask=None):
        """Evaluate ``self.value_flat`` for ``obs`` (``state`` and ``mask`` are unused)."""
        feed = {self.obs_ph: obs}
        return self.sess.run(self.value_flat, feed)


In [None]:
class CustomPolicy(ActorCriticPolicy):
    """Actor-critic policy whose policy and value latents come from ``mlp_extractor``.

    The architecture is chosen with ``net_arch``; the deprecated ``layers`` argument is
    still accepted and, when ``net_arch`` is not given, is turned into
    ``[dict(vf=layers, pi=layers)]`` (``[64, 64]`` if neither is given).

    NOTE(review): ``feature_extraction`` only influences ``scale`` and the
    ``_kwargs_check`` call here; the network is always built with ``mlp_extractor``.
    Confirm whether ``"cnn"`` is meant to be supported by this class.
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
                 net_arch=None, act_fun=tf.tanh, feature_extraction="mlp", **kwargs):
        """Build the policy graph inside the ``"model"`` variable scope.

        ``sess``, ``ob_space``, ``ac_space``, ``n_env``, ``n_steps``, ``n_batch`` and
        ``reuse`` are handed to ``ActorCriticPolicy.__init__`` unchanged.

        :param layers: deprecated; used only when ``net_arch`` is None.
        :param net_arch: architecture specification passed to ``mlp_extractor``.
        :param act_fun: activation function passed to ``mlp_extractor``.
        :param feature_extraction: ``scale`` is enabled only when this equals ``"cnn"``.
        :param kwargs: extra arguments, validated by ``self._kwargs_check``.
        """
        super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                           scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        # Deprecation handling for `layers`; relies on the module-level `warnings` import.
        if layers is not None:
            warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                          "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                              DeprecationWarning)

        # Fall back to the legacy `layers` description only when no `net_arch` was supplied.
        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            # Single-unit linear layer named 'vf' on top of the value latent.
            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        """Run the policy on ``obs``.

        ``state`` and ``mask`` are accepted but not used.  Returns the tuple
        ``(action, value, self.initial_state, neglogp)``; the action tensor evaluated is
        ``self.deterministic_action`` when ``deterministic`` is true, ``self.action``
        otherwise.
        """
        action_op = self.deterministic_action if deterministic else self.action
        action, value, neglogp = self.sess.run([action_op, self.value_flat, self.neglogp],
                                               {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        """Evaluate ``self.policy_proba`` for ``obs`` (``state`` and ``mask`` are unused)."""
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        """Evaluate ``self.value_flat`` for ``obs`` (``state`` and ``mask`` are unused)."""
        return self.sess.run(self.value_flat, {self.obs_ph: obs})

In [3]:
# Directory handed to the Monitor wrapper below; created if missing, reused otherwise
log_dir = "tmp/"
os.makedirs(name=log_dir, exist_ok=True)

In [4]:
# Build the CartPole environment and wrap it in a Monitor that logs into `log_dir`.
# allow_early_resets=True is needed because the evaluation cell calls env.reset()
# directly on this Monitor after training, which usually stops in the middle of an
# episode; without the flag Monitor.reset() raises
# RuntimeError("Tried to reset an environment before done. ...").
env = gym.make('CartPole-v1')
env = Monitor(env, log_dir, allow_early_resets=True)

# PPO2 with the custom policy class defined above.
model = PPO2(CustomPolicyNew, env, verbose=1)

Wrapping the env in a DummyVecEnv.




Instructions for updating:
Use keras.layers.flatten instead.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






In [5]:
# Train the agent for a fixed budget of environment steps.
# The call stays the last expression so the cell still displays the returned model.
time_steps = 50000
model.learn(time_steps)

--------------------------------------
| approxkl           | 9.619993e-06  |
| clipfrac           | 0.0           |
| ep_len_mean        | 21.8          |
| ep_reward_mean     | 21.8          |
| explained_variance | 0.0345        |
| fps                | 202           |
| n_updates          | 1             |
| policy_entropy     | 0.6931372     |
| policy_loss        | -0.0006388412 |
| serial_timesteps   | 128           |
| time_elapsed       | 4.43e-05      |
| total_timesteps    | 128           |
| value_loss         | 46.837788     |
--------------------------------------
--------------------------------------
| approxkl           | 1.3510906e-05 |
| clipfrac           | 0.0           |
| ep_len_mean        | 23            |
| ep_reward_mean     | 23            |
| explained_variance | 0.00579       |
| fps                | 818           |
| n_updates          | 2             |
| policy_entropy     | 0.6930676     |
| policy_loss        | -0.0009091749 |
| serial_timesteps   | 25

---------------------------------------
| approxkl           | 0.00017307905  |
| clipfrac           | 0.0            |
| ep_len_mean        | 24.8           |
| ep_reward_mean     | 24.8           |
| explained_variance | 0.0105         |
| fps                | 869            |
| n_updates          | 16             |
| policy_entropy     | 0.6405998      |
| policy_loss        | -0.00078717695 |
| serial_timesteps   | 2048           |
| time_elapsed       | 2.81           |
| total_timesteps    | 2048           |
| value_loss         | 63.797295      |
---------------------------------------
-------------------------------------
| approxkl           | 0.0006060024 |
| clipfrac           | 0.0          |
| ep_len_mean        | 26           |
| ep_reward_mean     | 26           |
| explained_variance | -0.163       |
| fps                | 865          |
| n_updates          | 17           |
| policy_entropy     | 0.6620505    |
| policy_loss        | 0.0034306026 |
| serial_timesteps  

--------------------------------------
| approxkl           | 0.00094983005 |
| clipfrac           | 0.00390625    |
| ep_len_mean        | 38.7          |
| ep_reward_mean     | 38.7          |
| explained_variance | -0.0291       |
| fps                | 848           |
| n_updates          | 31            |
| policy_entropy     | 0.62005264    |
| policy_loss        | 0.007616415   |
| serial_timesteps   | 3968          |
| time_elapsed       | 5.08          |
| total_timesteps    | 3968          |
| value_loss         | 64.04257      |
--------------------------------------
--------------------------------------
| approxkl           | 0.00012561076 |
| clipfrac           | 0.0           |
| ep_len_mean        | 39.8          |
| ep_reward_mean     | 39.8          |
| explained_variance | -0.0316       |
| fps                | 817           |
| n_updates          | 32            |
| policy_entropy     | 0.61019146    |
| policy_loss        | 0.00086507597 |
| serial_timesteps   | 40

-------------------------------------
| approxkl           | 0.0024790657 |
| clipfrac           | 0.029296875  |
| ep_len_mean        | 52           |
| ep_reward_mean     | 52           |
| explained_variance | 0.29         |
| fps                | 811          |
| n_updates          | 46           |
| policy_entropy     | 0.6263214    |
| policy_loss        | -0.002653458 |
| serial_timesteps   | 5888         |
| time_elapsed       | 7.38         |
| total_timesteps    | 5888         |
| value_loss         | 71.89594     |
-------------------------------------
--------------------------------------
| approxkl           | 0.0016910178  |
| clipfrac           | 0.009765625   |
| ep_len_mean        | 54.5          |
| ep_reward_mean     | 54.5          |
| explained_variance | 0.242         |
| fps                | 850           |
| n_updates          | 47            |
| policy_entropy     | 0.55426717    |
| policy_loss        | -0.0029462145 |
| serial_timesteps   | 6016          |
|

-------------------------------------
| approxkl           | 0.0005953453 |
| clipfrac           | 0.0          |
| ep_len_mean        | 69.7         |
| ep_reward_mean     | 69.7         |
| explained_variance | 0.088        |
| fps                | 866          |
| n_updates          | 61           |
| policy_entropy     | 0.5571782    |
| policy_loss        | -0.004538995 |
| serial_timesteps   | 7808         |
| time_elapsed       | 9.7          |
| total_timesteps    | 7808         |
| value_loss         | 47.046402    |
-------------------------------------
--------------------------------------
| approxkl           | 0.0026700404  |
| clipfrac           | 0.015625      |
| ep_len_mean        | 70.9          |
| ep_reward_mean     | 70.9          |
| explained_variance | 0.115         |
| fps                | 829           |
| n_updates          | 62            |
| policy_entropy     | 0.5797389     |
| policy_loss        | -0.0030489974 |
| serial_timesteps   | 7936          |
|

-------------------------------------
| approxkl           | 0.008885331  |
| clipfrac           | 0.14453125   |
| ep_len_mean        | 83.8         |
| ep_reward_mean     | 83.8         |
| explained_variance | 0.131        |
| fps                | 812          |
| n_updates          | 76           |
| policy_entropy     | 0.52925885   |
| policy_loss        | -0.010991391 |
| serial_timesteps   | 9728         |
| time_elapsed       | 12.1         |
| total_timesteps    | 9728         |
| value_loss         | 37.3997      |
-------------------------------------
--------------------------------------
| approxkl           | 0.0015589036  |
| clipfrac           | 0.017578125   |
| ep_len_mean        | 86.3          |
| ep_reward_mean     | 86.3          |
| explained_variance | 0.412         |
| fps                | 831           |
| n_updates          | 77            |
| policy_entropy     | 0.5481117     |
| policy_loss        | -0.0029646053 |
| serial_timesteps   | 9856          |
|

------------------------------------
| approxkl           | 0.004180839 |
| clipfrac           | 0.044921875 |
| ep_len_mean        | 99.9        |
| ep_reward_mean     | 99.9        |
| explained_variance | -0.16       |
| fps                | 746         |
| n_updates          | 91          |
| policy_entropy     | 0.58400226  |
| policy_loss        | 0.012755275 |
| serial_timesteps   | 11648       |
| time_elapsed       | 14.4        |
| total_timesteps    | 11648       |
| value_loss         | 166.78271   |
------------------------------------
--------------------------------------
| approxkl           | 0.0029014463  |
| clipfrac           | 0.0234375     |
| ep_len_mean        | 99.9          |
| ep_reward_mean     | 99.9          |
| explained_variance | -3.18         |
| fps                | 845           |
| n_updates          | 92            |
| policy_entropy     | 0.5734235     |
| policy_loss        | -0.0046307268 |
| serial_timesteps   | 11776         |
| time_elapsed  

---------------------------------------
| approxkl           | 6.5669947e-06  |
| clipfrac           | 0.0            |
| ep_len_mean        | 108            |
| ep_reward_mean     | 108            |
| explained_variance | 0.178          |
| fps                | 817            |
| n_updates          | 106            |
| policy_entropy     | 0.5735937      |
| policy_loss        | -5.8643054e-05 |
| serial_timesteps   | 13568          |
| time_elapsed       | 16.9           |
| total_timesteps    | 13568          |
| value_loss         | 87.24703       |
---------------------------------------
--------------------------------------
| approxkl           | 2.0532176e-05 |
| clipfrac           | 0.0           |
| ep_len_mean        | 109           |
| ep_reward_mean     | 109           |
| explained_variance | 0.159         |
| fps                | 874           |
| n_updates          | 107           |
| policy_entropy     | 0.56682926    |
| policy_loss        | -0.0001269239 |
| serial_t

-------------------------------------
| approxkl           | 0.0001455602 |
| clipfrac           | 0.0          |
| ep_len_mean        | 103          |
| ep_reward_mean     | 103          |
| explained_variance | 0.562        |
| fps                | 852          |
| n_updates          | 120          |
| policy_entropy     | 0.4906356    |
| policy_loss        | 0.0010286702 |
| serial_timesteps   | 15360        |
| time_elapsed       | 19.1         |
| total_timesteps    | 15360        |
| value_loss         | 63.766457    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00021128434 |
| clipfrac           | 0.0           |
| ep_len_mean        | 104           |
| ep_reward_mean     | 104           |
| explained_variance | 0.504         |
| fps                | 838           |
| n_updates          | 121           |
| policy_entropy     | 0.54365975    |
| policy_loss        | -0.00233857   |
| serial_timesteps   | 15488         |
|

--------------------------------------
| approxkl           | 0.001209464   |
| clipfrac           | 0.0078125     |
| ep_len_mean        | 105           |
| ep_reward_mean     | 105           |
| explained_variance | 0.965         |
| fps                | 816           |
| n_updates          | 135           |
| policy_entropy     | 0.45364755    |
| policy_loss        | -0.0015607467 |
| serial_timesteps   | 17280         |
| time_elapsed       | 21.4          |
| total_timesteps    | 17280         |
| value_loss         | 10.224551     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00014691638 |
| clipfrac           | 0.0           |
| ep_len_mean        | 105           |
| ep_reward_mean     | 105           |
| explained_variance | 0.024         |
| fps                | 836           |
| n_updates          | 136           |
| policy_entropy     | 0.42437997    |
| policy_loss        | 0.0007296158  |
| serial_timesteps   | 17

--------------------------------------
| approxkl           | 0.00014913561 |
| clipfrac           | 0.0           |
| ep_len_mean        | 105           |
| ep_reward_mean     | 105           |
| explained_variance | 0.947         |
| fps                | 798           |
| n_updates          | 150           |
| policy_entropy     | 0.48185068    |
| policy_loss        | -0.0008946876 |
| serial_timesteps   | 19200         |
| time_elapsed       | 23.8          |
| total_timesteps    | 19200         |
| value_loss         | 8.17325       |
--------------------------------------
--------------------------------------
| approxkl           | 0.0020714018  |
| clipfrac           | 0.021484375   |
| ep_len_mean        | 105           |
| ep_reward_mean     | 105           |
| explained_variance | 0.998         |
| fps                | 830           |
| n_updates          | 151           |
| policy_entropy     | 0.5051412     |
| policy_loss        | -0.0028426615 |
| serial_timesteps   | 19

--------------------------------------
| approxkl           | 8.826792e-05  |
| clipfrac           | 0.0           |
| ep_len_mean        | 106           |
| ep_reward_mean     | 106           |
| explained_variance | 0.964         |
| fps                | 807           |
| n_updates          | 165           |
| policy_entropy     | 0.5328552     |
| policy_loss        | -0.0012314947 |
| serial_timesteps   | 21120         |
| time_elapsed       | 26.3          |
| total_timesteps    | 21120         |
| value_loss         | 2.673346      |
--------------------------------------
--------------------------------------
| approxkl           | 0.00013516459 |
| clipfrac           | 0.0           |
| ep_len_mean        | 107           |
| ep_reward_mean     | 107           |
| explained_variance | 0.893         |
| fps                | 859           |
| n_updates          | 166           |
| policy_entropy     | 0.45295972    |
| policy_loss        | -0.0014843106 |
| serial_timesteps   | 21

--------------------------------------
| approxkl           | 0.00079407473 |
| clipfrac           | 0.001953125   |
| ep_len_mean        | 115           |
| ep_reward_mean     | 115           |
| explained_variance | 0.993         |
| fps                | 576           |
| n_updates          | 180           |
| policy_entropy     | 0.42308223    |
| policy_loss        | 0.000290073   |
| serial_timesteps   | 23040         |
| time_elapsed       | 29.1          |
| total_timesteps    | 23040         |
| value_loss         | 0.8382883     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0013336829  |
| clipfrac           | 0.015625      |
| ep_len_mean        | 115           |
| ep_reward_mean     | 115           |
| explained_variance | 0.997         |
| fps                | 749           |
| n_updates          | 181           |
| policy_entropy     | 0.42109343    |
| policy_loss        | -0.0010494282 |
| serial_timesteps   | 23

-------------------------------------
| approxkl           | 0.003741383  |
| clipfrac           | 0.048828125  |
| ep_len_mean        | 122          |
| ep_reward_mean     | 122          |
| explained_variance | 0.803        |
| fps                | 850          |
| n_updates          | 194          |
| policy_entropy     | 0.44608033   |
| policy_loss        | 0.0013583034 |
| serial_timesteps   | 24832        |
| time_elapsed       | 31.2         |
| total_timesteps    | 24832        |
| value_loss         | 56.229378    |
-------------------------------------
--------------------------------------
| approxkl           | 0.0060638804  |
| clipfrac           | 0.076171875   |
| ep_len_mean        | 122           |
| ep_reward_mean     | 122           |
| explained_variance | 0.969         |
| fps                | 881           |
| n_updates          | 195           |
| policy_entropy     | 0.38614386    |
| policy_loss        | -0.0043708556 |
| serial_timesteps   | 24960         |
|

--------------------------------------
| approxkl           | 6.118655e-05  |
| clipfrac           | 0.0           |
| ep_len_mean        | 134           |
| ep_reward_mean     | 134           |
| explained_variance | 0.193         |
| fps                | 850           |
| n_updates          | 209           |
| policy_entropy     | 0.5191936     |
| policy_loss        | -0.0008543889 |
| serial_timesteps   | 26752         |
| time_elapsed       | 33.6          |
| total_timesteps    | 26752         |
| value_loss         | 2.465027      |
--------------------------------------
---------------------------------------
| approxkl           | 0.00023822258  |
| clipfrac           | 0.0            |
| ep_len_mean        | 134            |
| ep_reward_mean     | 134            |
| explained_variance | 0.49           |
| fps                | 855            |
| n_updates          | 210            |
| policy_entropy     | 0.44725758     |
| policy_loss        | -0.00056906627 |
| serial_timest

--------------------------------------
| approxkl           | 0.0023641628  |
| clipfrac           | 0.021484375   |
| ep_len_mean        | 148           |
| ep_reward_mean     | 148           |
| explained_variance | 0.762         |
| fps                | 878           |
| n_updates          | 224           |
| policy_entropy     | 0.53165174    |
| policy_loss        | -0.0055591334 |
| serial_timesteps   | 28672         |
| time_elapsed       | 36.2          |
| total_timesteps    | 28672         |
| value_loss         | 111.97292     |
--------------------------------------
-------------------------------------
| approxkl           | 0.0012456912 |
| clipfrac           | 0.017578125  |
| ep_len_mean        | 148          |
| ep_reward_mean     | 148          |
| explained_variance | 0.602        |
| fps                | 876          |
| n_updates          | 225          |
| policy_entropy     | 0.5397909    |
| policy_loss        | -0.006649863 |
| serial_timesteps   | 28800       

-------------------------------------
| approxkl           | 0.0012402006 |
| clipfrac           | 0.005859375  |
| ep_len_mean        | 162          |
| ep_reward_mean     | 162          |
| explained_variance | 0.295        |
| fps                | 874          |
| n_updates          | 239          |
| policy_entropy     | 0.39413396   |
| policy_loss        | 0.0019594417 |
| serial_timesteps   | 30592        |
| time_elapsed       | 38.5         |
| total_timesteps    | 30592        |
| value_loss         | 209.36684    |
-------------------------------------
-------------------------------------
| approxkl           | 0.016243083  |
| clipfrac           | 0.15820312   |
| ep_len_mean        | 162          |
| ep_reward_mean     | 162          |
| explained_variance | 0.325        |
| fps                | 883          |
| n_updates          | 240          |
| policy_entropy     | 0.40317315   |
| policy_loss        | -0.003766568 |
| serial_timesteps   | 30720        |
| time_elaps

-------------------------------------
| approxkl           | 0.0059659043 |
| clipfrac           | 0.0703125    |
| ep_len_mean        | 174          |
| ep_reward_mean     | 174          |
| explained_variance | -0.861       |
| fps                | 876          |
| n_updates          | 254          |
| policy_entropy     | 0.41197395   |
| policy_loss        | 0.004317212  |
| serial_timesteps   | 32512        |
| time_elapsed       | 40.7         |
| total_timesteps    | 32512        |
| value_loss         | 1.063321     |
-------------------------------------
----------------------------------------
| approxkl           | 3.236285e-06    |
| clipfrac           | 0.0             |
| ep_len_mean        | 178             |
| ep_reward_mean     | 178             |
| explained_variance | 0.0402          |
| fps                | 858             |
| n_updates          | 255             |
| policy_entropy     | 0.32468337      |
| policy_loss        | -0.000112704816 |
| serial_timesteps  

---------------------------------------
| approxkl           | 0.00038826274  |
| clipfrac           | 0.0            |
| ep_len_mean        | 193            |
| ep_reward_mean     | 193            |
| explained_variance | 0.0207         |
| fps                | 653            |
| n_updates          | 269            |
| policy_entropy     | 0.36324996     |
| policy_loss        | -0.00030030683 |
| serial_timesteps   | 34432          |
| time_elapsed       | 43             |
| total_timesteps    | 34432          |
| value_loss         | 299.56396      |
---------------------------------------
--------------------------------------
| approxkl           | 0.0011457613  |
| clipfrac           | 0.013671875   |
| ep_len_mean        | 193           |
| ep_reward_mean     | 193           |
| explained_variance | -0.18         |
| fps                | 515           |
| n_updates          | 270           |
| policy_entropy     | 0.39088738    |
| policy_loss        | -0.0030165024 |
| serial_t

-------------------------------------
| approxkl           | 0.0034836992 |
| clipfrac           | 0.0390625    |
| ep_len_mean        | 205          |
| ep_reward_mean     | 205          |
| explained_variance | -0.0287      |
| fps                | 837          |
| n_updates          | 284          |
| policy_entropy     | 0.24890558   |
| policy_loss        | -0.009823869 |
| serial_timesteps   | 36352        |
| time_elapsed       | 45.6         |
| total_timesteps    | 36352        |
| value_loss         | 0.15151115   |
-------------------------------------
--------------------------------------
| approxkl           | 0.0033214702  |
| clipfrac           | 0.04296875    |
| ep_len_mean        | 209           |
| ep_reward_mean     | 209           |
| explained_variance | -0.00456      |
| fps                | 861           |
| n_updates          | 285           |
| policy_entropy     | 0.27147305    |
| policy_loss        | -0.0013991087 |
| serial_timesteps   | 36480         |
|

--------------------------------------
| approxkl           | 0.0009292457  |
| clipfrac           | 0.00390625    |
| ep_len_mean        | 222           |
| ep_reward_mean     | 222           |
| explained_variance | -0.324        |
| fps                | 818           |
| n_updates          | 299           |
| policy_entropy     | 0.35636538    |
| policy_loss        | -0.0009906407 |
| serial_timesteps   | 38272         |
| time_elapsed       | 48            |
| total_timesteps    | 38272         |
| value_loss         | 0.50480384    |
--------------------------------------
--------------------------------------
| approxkl           | 0.00036653568 |
| clipfrac           | 0.0           |
| ep_len_mean        | 226           |
| ep_reward_mean     | 226           |
| explained_variance | 0.0632        |
| fps                | 842           |
| n_updates          | 300           |
| policy_entropy     | 0.33463877    |
| policy_loss        | -0.0012642136 |
| serial_timesteps   | 38

--------------------------------------
| approxkl           | 0.0009344937  |
| clipfrac           | 0.009765625   |
| ep_len_mean        | 240           |
| ep_reward_mean     | 240           |
| explained_variance | 0.152         |
| fps                | 798           |
| n_updates          | 314           |
| policy_entropy     | 0.34135526    |
| policy_loss        | -0.0021916188 |
| serial_timesteps   | 40192         |
| time_elapsed       | 50.4          |
| total_timesteps    | 40192         |
| value_loss         | 0.015796179   |
--------------------------------------
-------------------------------------
| approxkl           | 0.0042438707 |
| clipfrac           | 0.0625       |
| ep_len_mean        | 240          |
| ep_reward_mean     | 240          |
| explained_variance | -0.698       |
| fps                | 780          |
| n_updates          | 315          |
| policy_entropy     | 0.38972092   |
| policy_loss        | -0.004897107 |
| serial_timesteps   | 40320       

---------------------------------------
| approxkl           | 0.00096515566  |
| clipfrac           | 0.001953125    |
| ep_len_mean        | 255            |
| ep_reward_mean     | 255            |
| explained_variance | -1.09          |
| fps                | 808            |
| n_updates          | 329            |
| policy_entropy     | 0.3376761      |
| policy_loss        | -0.00033945846 |
| serial_timesteps   | 42112          |
| time_elapsed       | 52.8           |
| total_timesteps    | 42112          |
| value_loss         | 0.020389628    |
---------------------------------------
-------------------------------------
| approxkl           | 0.005079034  |
| clipfrac           | 0.06640625   |
| ep_len_mean        | 255          |
| ep_reward_mean     | 255          |
| explained_variance | 0.49         |
| fps                | 847          |
| n_updates          | 330          |
| policy_entropy     | 0.3565007    |
| policy_loss        | 0.0015751992 |
| serial_timesteps  

--------------------------------------
| approxkl           | 3.963812e-05  |
| clipfrac           | 0.0           |
| ep_len_mean        | 272           |
| ep_reward_mean     | 272           |
| explained_variance | -1.48         |
| fps                | 820           |
| n_updates          | 344           |
| policy_entropy     | 0.36055157    |
| policy_loss        | 0.00015786214 |
| serial_timesteps   | 44032         |
| time_elapsed       | 55.2          |
| total_timesteps    | 44032         |
| value_loss         | 0.021143485   |
--------------------------------------
-------------------------------------
| approxkl           | 0.011548545  |
| clipfrac           | 0.11328125   |
| ep_len_mean        | 272          |
| ep_reward_mean     | 272          |
| explained_variance | -11.3        |
| fps                | 782          |
| n_updates          | 345          |
| policy_entropy     | 0.32837707   |
| policy_loss        | -0.011552495 |
| serial_timesteps   | 44160       

--------------------------------------
| approxkl           | 0.010553232   |
| clipfrac           | 0.109375      |
| ep_len_mean        | 286           |
| ep_reward_mean     | 286           |
| explained_variance | 0.459         |
| fps                | 817           |
| n_updates          | 359           |
| policy_entropy     | 0.43330765    |
| policy_loss        | -0.0021989846 |
| serial_timesteps   | 45952         |
| time_elapsed       | 57.7          |
| total_timesteps    | 45952         |
| value_loss         | 208.14789     |
--------------------------------------
------------------------------------
| approxkl           | 0.015562485 |
| clipfrac           | 0.20898438  |
| ep_len_mean        | 286         |
| ep_reward_mean     | 286         |
| explained_variance | 0.572       |
| fps                | 828         |
| n_updates          | 360         |
| policy_entropy     | 0.39136028  |
| policy_loss        | -0.01523221 |
| serial_timesteps   | 46080       |
| time_e

--------------------------------------
| approxkl           | 0.0017997982  |
| clipfrac           | 0.017578125   |
| ep_len_mean        | 302           |
| ep_reward_mean     | 302           |
| explained_variance | -0.00325      |
| fps                | 812           |
| n_updates          | 374           |
| policy_entropy     | 0.36667144    |
| policy_loss        | -0.0032008376 |
| serial_timesteps   | 47872         |
| time_elapsed       | 60.1          |
| total_timesteps    | 47872         |
| value_loss         | 328.798       |
--------------------------------------
---------------------------------------
| approxkl           | 0.0009239847   |
| clipfrac           | 0.01171875     |
| ep_len_mean        | 302            |
| ep_reward_mean     | 302            |
| explained_variance | -11.3          |
| fps                | 817            |
| n_updates          | 375            |
| policy_entropy     | 0.38312992     |
| policy_loss        | -0.00053465995 |
| serial_timest

-------------------------------------
| approxkl           | 0.002168411  |
| clipfrac           | 0.021484375  |
| ep_len_mean        | 312          |
| ep_reward_mean     | 312          |
| explained_variance | -0.549       |
| fps                | 832          |
| n_updates          | 389          |
| policy_entropy     | 0.42412362   |
| policy_loss        | 0.0029362175 |
| serial_timesteps   | 49792        |
| time_elapsed       | 62.4         |
| total_timesteps    | 49792        |
| value_loss         | 0.042799614  |
-------------------------------------
-------------------------------------
| approxkl           | 0.015358709  |
| clipfrac           | 0.1875       |
| ep_len_mean        | 315          |
| ep_reward_mean     | 315          |
| explained_variance | 0.00172      |
| fps                | 789          |
| n_updates          | 390          |
| policy_entropy     | 0.39766082   |
| policy_loss        | 0.0023935658 |
| serial_timesteps   | 49920        |
| time_elaps

<stable_baselines.ppo2.ppo2.PPO2 at 0x7f9140cadc50>

In [6]:
# Enjoy trained agent: roll the policy out for 1000 rendered steps.
obs = env.reset()
try:
    for _step in range(1000):
        # deterministic=False is passed through to model.predict unchanged
        action, _states = model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            # Episode finished: start a new one and keep going
            obs = env.reset()
finally:
    # Close the environment even if the loop is interrupted or raises,
    # so the render window does not stay open.
    env.close()