In [4]:
import gym

class GymRunner():

    env = None

    def __init__(env_name = "LunarLander-v2"):
        self.env = gym.make(env_name)
        #self.env._max_episode_steps = 1200

    def get_shapes():
        return {
            "action_space" : self.env.action_space.n,
            "observation_space" : self.env.observation_space.shape,
        }

    def test_agent(agent, n_iters):
        done = False
        observation = self.env.reset()
        reward = 0
        info = {}

        for _ in range(n_iters):
            while not done:
                self.env.render()

                action = agent.act(observation, reward, done)

                observation, reward, done, info = self.env.step(action)

                print(f"REWARD = {reward}")

In [7]:
import argparse
import sys

import gym
from gym import wrappers, logger

class RandomAgent(object):
    """The world's simplest agent!"""
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()

# New comment
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('env_id', nargs='?', default='LunarLander-v2', help='Select the environment to run')
    args = parser.parse_args()

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.INFO)
    
    env = gym.make(args.env_id)

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
            # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
            # Video is not recorded every episode, see capped_cubic_video_schedule for details.

    # Close the env and write monitor result info to disk
    env.close()

usage: ipykernel_launcher.py [-h] [env_id]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

In [3]:
import gym
env = gym.make('LunarLander-v2')
env.render()
env.reset()
print("hello")

hello


In [5]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

ENV_NAME = 'CartPole-v1'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1 : Simple model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# Option 2: deep network
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))


print(model.summary())


# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=1000, window_length=1)

cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the best weights.

#cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
cem.test(env, nb_episodes=5, visualize=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 10        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________
None
Training for 100000 steps ...
    13/100000: episode: 1, duration: 0.039s, episode steps: 13, steps per second: 334, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.107 [-1.319, 0.772], mean_best_reward: --
    24/100000: episode: 2, duration: 0.005s, episode steps: 11, steps per second: 2243, episode reward: 11.000, mean reward: 1.000 

   893/100000: episode: 44, duration: 0.004s, episode steps: 10, steps per second: 2493, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.145 [-1.523, 2.391], mean_best_reward: --
   924/100000: episode: 45, duration: 0.010s, episode steps: 31, steps per second: 3144, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.613 [0.000, 1.000], mean observation: -0.034 [-2.329, 1.418], mean_best_reward: --
   939/100000: episode: 46, duration: 0.009s, episode steps: 15, steps per second: 1640, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.066 [-2.251, 1.410], mean_best_reward: --
   955/100000: episode: 47, duration: 0.006s, episode steps: 16, steps per second: 2595, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.060 [-2.520, 1.606], mean_best_reward: --
   998/100000: ep

  1720/100000: episode: 79, duration: 0.011s, episode steps: 13, steps per second: 1168, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.094 [-1.725, 1.021], mean_best_reward: --
  1729/100000: episode: 80, duration: 0.007s, episode steps: 9, steps per second: 1353, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.136 [-2.446, 1.607], mean_best_reward: --
  1741/100000: episode: 81, duration: 0.006s, episode steps: 12, steps per second: 1886, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.117 [-1.355, 2.126], mean_best_reward: --
  1752/100000: episode: 82, duration: 0.008s, episode steps: 11, steps per second: 1456, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.125 [-2.329, 1.418], mean_best_reward: --
  1771/100000: epis

  2829/100000: episode: 121, duration: 0.027s, episode steps: 88, steps per second: 3234, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.072 [-0.497, 1.342], mean_best_reward: --
  2973/100000: episode: 122, duration: 0.053s, episode steps: 144, steps per second: 2728, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: 0.116 [-1.161, 1.565], mean_best_reward: --
  2990/100000: episode: 123, duration: 0.007s, episode steps: 17, steps per second: 2428, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.588 [0.000, 1.000], mean observation: -0.119 [-1.588, 0.759], mean_best_reward: --
  3019/100000: episode: 124, duration: 0.010s, episode steps: 29, steps per second: 2870, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.126 [-0.499, 1.035], mean_best_reward: --
  3046/100000

  4022/100000: episode: 157, duration: 0.008s, episode steps: 22, steps per second: 2638, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.100 [-0.415, 1.031], mean_best_reward: --
  4064/100000: episode: 158, duration: 0.015s, episode steps: 42, steps per second: 2812, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: 0.054 [-1.553, 1.533], mean_best_reward: --
  4126/100000: episode: 159, duration: 0.022s, episode steps: 62, steps per second: 2834, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: 0.021 [-1.123, 1.374], mean_best_reward: --
  4147/100000: episode: 160, duration: 0.007s, episode steps: 21, steps per second: 2847, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.052 [-1.119, 1.759], mean_best_reward: --
  4199/100000: e

  5368/100000: episode: 192, duration: 0.036s, episode steps: 105, steps per second: 2927, episode reward: 105.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.045 [-1.513, 1.524], mean_best_reward: --
  5404/100000: episode: 193, duration: 0.012s, episode steps: 36, steps per second: 2979, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.059 [-0.604, 0.908], mean_best_reward: --
  5471/100000: episode: 194, duration: 0.021s, episode steps: 67, steps per second: 3195, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.247 [-1.323, 0.777], mean_best_reward: --
  5482/100000: episode: 195, duration: 0.004s, episode steps: 11, steps per second: 2635, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.113 [-1.484, 0.840], mean_best_reward: --
  5515/1000

  7153/100000: episode: 239, duration: 0.023s, episode steps: 74, steps per second: 3184, episode reward: 74.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.104 [-1.298, 0.744], mean_best_reward: --
  7197/100000: episode: 240, duration: 0.015s, episode steps: 44, steps per second: 2891, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.087 [-0.950, 1.095], mean_best_reward: --
  7227/100000: episode: 241, duration: 0.012s, episode steps: 30, steps per second: 2477, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.103 [-1.505, 0.582], mean_best_reward: --
  7327/100000: episode: 242, duration: 0.029s, episode steps: 100, steps per second: 3414, episode reward: 100.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.209 [-1.149, 0.856], mean_best_reward: --
  7366/1000

  9030/100000: episode: 289, duration: 0.021s, episode steps: 70, steps per second: 3275, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.175 [-1.148, 1.171], mean_best_reward: --
  9068/100000: episode: 290, duration: 0.013s, episode steps: 38, steps per second: 2917, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: 0.092 [-1.018, 2.156], mean_best_reward: --
  9111/100000: episode: 291, duration: 0.018s, episode steps: 43, steps per second: 2346, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: 0.104 [-0.850, 1.727], mean_best_reward: --
  9139/100000: episode: 292, duration: 0.011s, episode steps: 28, steps per second: 2531, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.066 [-0.799, 1.235], mean_best_reward: --
  9161/100000: e

 10838/100000: episode: 342, duration: 0.032s, episode steps: 85, steps per second: 2666, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: 0.044 [-0.978, 1.538], mean_best_reward: --
 10861/100000: episode: 343, duration: 0.008s, episode steps: 23, steps per second: 2846, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.072 [-1.186, 0.572], mean_best_reward: --
 10884/100000: episode: 344, duration: 0.009s, episode steps: 23, steps per second: 2570, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.609 [0.000, 1.000], mean observation: -0.047 [-2.064, 1.329], mean_best_reward: --
 10923/100000: episode: 345, duration: 0.013s, episode steps: 39, steps per second: 3060, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: 0.083 [-0.853, 1.393], mean_best_reward: --
 10966/100000:

 12089/100000: episode: 376, duration: 0.025s, episode steps: 73, steps per second: 2890, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.095 [-1.077, 1.256], mean_best_reward: --
 12120/100000: episode: 377, duration: 0.013s, episode steps: 31, steps per second: 2400, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.056 [-1.011, 1.254], mean_best_reward: --
 12156/100000: episode: 378, duration: 0.012s, episode steps: 36, steps per second: 2985, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.197 [-0.599, 1.129], mean_best_reward: --
 12183/100000: episode: 379, duration: 0.009s, episode steps: 27, steps per second: 3012, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.104 [-1.215, 0.786], mean_best_reward: --
 12197/100000: 

 13845/100000: episode: 423, duration: 0.006s, episode steps: 18, steps per second: 2967, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.389 [0.000, 1.000], mean observation: 0.051 [-0.984, 1.544], mean_best_reward: --
 13889/100000: episode: 424, duration: 0.014s, episode steps: 44, steps per second: 3068, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.087 [-0.768, 1.363], mean_best_reward: --
 13927/100000: episode: 425, duration: 0.012s, episode steps: 38, steps per second: 3126, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.063 [-0.620, 1.597], mean_best_reward: --
 13973/100000: episode: 426, duration: 0.016s, episode steps: 46, steps per second: 2854, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.069 [-1.455, 0.594], mean_best_reward: --
 14000/100000: 

 15649/100000: episode: 468, duration: 0.010s, episode steps: 30, steps per second: 3063, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.054 [-0.776, 1.370], mean_best_reward: --
 15690/100000: episode: 469, duration: 0.019s, episode steps: 41, steps per second: 2215, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.072 [-1.033, 0.638], mean_best_reward: --
 15721/100000: episode: 470, duration: 0.012s, episode steps: 31, steps per second: 2640, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.134 [-0.924, 0.551], mean_best_reward: --
 15731/100000: episode: 471, duration: 0.004s, episode steps: 10, steps per second: 2533, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.127 [-0.788, 1.387], mean_best_reward: --
 15758/100000:

 16955/100000: episode: 502, duration: 0.027s, episode steps: 97, steps per second: 3548, episode reward: 97.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.546 [0.000, 1.000], mean observation: 0.272 [-2.001, 2.853], mean_best_reward: --
 17006/100000: episode: 503, duration: 0.019s, episode steps: 51, steps per second: 2676, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.036 [-1.489, 1.182], mean_best_reward: --
 17103/100000: episode: 504, duration: 0.029s, episode steps: 97, steps per second: 3359, episode reward: 97.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.062 [-1.186, 1.283], mean_best_reward: --
 17147/100000: episode: 505, duration: 0.014s, episode steps: 44, steps per second: 3238, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.109 [-0.754, 1.189], mean_best_reward: --
 17249/100000:

 18808/100000: episode: 553, duration: 0.007s, episode steps: 19, steps per second: 2851, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.078 [-0.821, 1.465], mean_best_reward: --
 18865/100000: episode: 554, duration: 0.019s, episode steps: 57, steps per second: 2964, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.061 [-1.096, 1.025], mean_best_reward: --
 18898/100000: episode: 555, duration: 0.012s, episode steps: 33, steps per second: 2845, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.076 [-1.647, 0.810], mean_best_reward: --
 18954/100000: episode: 556, duration: 0.017s, episode steps: 56, steps per second: 3219, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: -0.106 [-1.763, 0.582], mean_best_reward: --
 18999/100000

 20605/100000: episode: 595, duration: 0.010s, episode steps: 22, steps per second: 2211, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: -0.108 [-1.341, 0.744], mean_best_reward: --
 20634/100000: episode: 596, duration: 0.011s, episode steps: 29, steps per second: 2677, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.105 [-0.538, 0.929], mean_best_reward: --
 20676/100000: episode: 597, duration: 0.024s, episode steps: 42, steps per second: 1760, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.130 [-1.320, 0.603], mean_best_reward: --
 20759/100000: episode: 598, duration: 0.036s, episode steps: 83, steps per second: 2290, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.070 [-1.852, 1.182], mean_best_reward: --
 20773/100000:

 22358/100000: episode: 637, duration: 0.030s, episode steps: 81, steps per second: 2673, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.247 [-1.329, 0.943], mean_best_reward: --
 22421/100000: episode: 638, duration: 0.020s, episode steps: 63, steps per second: 3102, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.028 [-0.656, 0.912], mean_best_reward: --
 22468/100000: episode: 639, duration: 0.015s, episode steps: 47, steps per second: 3035, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: -0.054 [-1.525, 0.761], mean_best_reward: --
 22521/100000: episode: 640, duration: 0.017s, episode steps: 53, steps per second: 3151, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.052 [-1.494, 1.370], mean_best_reward: --
 22542/100000:

 24112/100000: episode: 685, duration: 0.018s, episode steps: 56, steps per second: 3160, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.554 [0.000, 1.000], mean observation: -0.017 [-1.832, 1.517], mean_best_reward: --
 24137/100000: episode: 686, duration: 0.014s, episode steps: 25, steps per second: 1776, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.081 [-1.362, 0.747], mean_best_reward: --
 24169/100000: episode: 687, duration: 0.009s, episode steps: 32, steps per second: 3375, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.034 [-1.236, 0.806], mean_best_reward: --
 24216/100000: episode: 688, duration: 0.015s, episode steps: 47, steps per second: 3147, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.045 [-0.746, 1.377], mean_best_reward: --
 24252/100000

 25681/100000: episode: 725, duration: 0.039s, episode steps: 84, steps per second: 2151, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.090 [-1.138, 1.481], mean_best_reward: --
 25722/100000: episode: 726, duration: 0.023s, episode steps: 41, steps per second: 1768, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.070 [-0.546, 0.983], mean_best_reward: --
 25799/100000: episode: 727, duration: 0.030s, episode steps: 77, steps per second: 2593, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.063 [-1.398, 0.834], mean_best_reward: --
 25926/100000: episode: 728, duration: 0.060s, episode steps: 127, steps per second: 2128, episode reward: 127.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.030 [-0.948, 1.286], mean_best_reward: --
 25958/100000

 27384/100000: episode: 767, duration: 0.020s, episode steps: 55, steps per second: 2819, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.172 [-1.385, 0.713], mean_best_reward: --
 27422/100000: episode: 768, duration: 0.015s, episode steps: 38, steps per second: 2598, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.089 [-0.815, 0.458], mean_best_reward: --
 27464/100000: episode: 769, duration: 0.019s, episode steps: 42, steps per second: 2189, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.039 [-0.958, 1.352], mean_best_reward: --
 27547/100000: episode: 770, duration: 0.041s, episode steps: 83, steps per second: 2037, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: -0.283 [-1.602, 0.802], mean_best_reward: --
 27589/100000

 29091/100000: episode: 805, duration: 0.038s, episode steps: 109, steps per second: 2889, episode reward: 109.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.217 [-1.536, 1.083], mean_best_reward: --
 29278/100000: episode: 806, duration: 0.068s, episode steps: 187, steps per second: 2734, episode reward: 187.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: 0.024 [-1.141, 1.668], mean_best_reward: --
 29298/100000: episode: 807, duration: 0.007s, episode steps: 20, steps per second: 2930, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.071 [-1.166, 2.038], mean_best_reward: --
 29328/100000: episode: 808, duration: 0.011s, episode steps: 30, steps per second: 2727, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.096 [-1.025, 0.603], mean_best_reward: --
 29361/100

 30998/100000: episode: 850, duration: 0.035s, episode steps: 99, steps per second: 2847, episode reward: 99.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.141 [-1.152, 1.537], mean_best_reward: --
 31029/100000: episode: 851, duration: 0.013s, episode steps: 31, steps per second: 2336, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.039 [-0.785, 1.432], mean_best_reward: 112.500000
 31044/100000: episode: 852, duration: 0.010s, episode steps: 15, steps per second: 1544, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.087 [-1.212, 0.813], mean_best_reward: --
 31067/100000: episode: 853, duration: 0.009s, episode steps: 23, steps per second: 2648, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.565 [0.000, 1.000], mean observation: -0.078 [-1.397, 0.614], mean_best_reward: --
 31079

 33108/100000: episode: 893, duration: 0.023s, episode steps: 52, steps per second: 2225, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.133 [-0.649, 1.115], mean_best_reward: --
 33180/100000: episode: 894, duration: 0.029s, episode steps: 72, steps per second: 2448, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: -0.294 [-2.068, 0.925], mean_best_reward: --
 33214/100000: episode: 895, duration: 0.011s, episode steps: 34, steps per second: 3043, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.082 [-0.387, 1.060], mean_best_reward: --
 33225/100000: episode: 896, duration: 0.004s, episode steps: 11, steps per second: 2536, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.121 [-0.774, 1.346], mean_best_reward: --
 33278/100000: 

 35071/100000: episode: 935, duration: 0.041s, episode steps: 83, steps per second: 2018, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: 0.044 [-1.431, 1.359], mean_best_reward: --
 35081/100000: episode: 936, duration: 0.006s, episode steps: 10, steps per second: 1707, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.116 [-1.823, 1.181], mean_best_reward: --
 35129/100000: episode: 937, duration: 0.025s, episode steps: 48, steps per second: 1942, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.164 [-0.539, 1.396], mean_best_reward: --
 35175/100000: episode: 938, duration: 0.022s, episode steps: 46, steps per second: 2104, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.069 [-1.543, 0.830], mean_best_reward: --
 35244/100000:

 36615/100000: episode: 973, duration: 0.025s, episode steps: 46, steps per second: 1823, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.103 [-0.507, 1.472], mean_best_reward: --
 36642/100000: episode: 974, duration: 0.012s, episode steps: 27, steps per second: 2273, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.063 [-0.609, 1.408], mean_best_reward: --
 36691/100000: episode: 975, duration: 0.031s, episode steps: 49, steps per second: 1579, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.018 [-1.414, 2.355], mean_best_reward: --
 36715/100000: episode: 976, duration: 0.012s, episode steps: 24, steps per second: 2079, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.043 [-0.946, 1.458], mean_best_reward: --
 36770/100000: e

 38286/100000: episode: 1010, duration: 0.041s, episode steps: 112, steps per second: 2756, episode reward: 112.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.084 [-1.109, 1.441], mean_best_reward: --
 38352/100000: episode: 1011, duration: 0.021s, episode steps: 66, steps per second: 3081, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.171 [-0.707, 1.159], mean_best_reward: --
 38416/100000: episode: 1012, duration: 0.019s, episode steps: 64, steps per second: 3403, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.547 [0.000, 1.000], mean observation: 0.173 [-0.590, 1.164], mean_best_reward: --
 38441/100000: episode: 1013, duration: 0.008s, episode steps: 25, steps per second: 3159, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.099 [-0.612, 1.066], mean_best_reward: --
 38507/10

 40001/100000: episode: 1046, duration: 0.029s, episode steps: 98, steps per second: 3377, episode reward: 98.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.271 [-1.520, 0.803], mean_best_reward: --
 40016/100000: episode: 1047, duration: 0.006s, episode steps: 15, steps per second: 2710, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.088 [-1.194, 1.932], mean_best_reward: --
 40027/100000: episode: 1048, duration: 0.006s, episode steps: 11, steps per second: 1942, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.131 [-1.665, 0.995], mean_best_reward: --
 40099/100000: episode: 1049, duration: 0.025s, episode steps: 72, steps per second: 2908, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.088 [-1.017, 1.096], mean_best_reward: --
 40125/100

 41278/100000: episode: 1081, duration: 0.016s, episode steps: 49, steps per second: 3027, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.112 [-0.773, 1.258], mean_best_reward: --
 41295/100000: episode: 1082, duration: 0.008s, episode steps: 17, steps per second: 2029, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.086 [-1.536, 1.021], mean_best_reward: --
 41315/100000: episode: 1083, duration: 0.009s, episode steps: 20, steps per second: 2284, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.064 [-0.961, 1.378], mean_best_reward: --
 41328/100000: episode: 1084, duration: 0.005s, episode steps: 13, steps per second: 2673, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.100 [-1.394, 0.819], mean_best_reward: --
 41343/100

 42506/100000: episode: 1120, duration: 0.009s, episode steps: 28, steps per second: 3046, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.064 [-0.442, 0.823], mean_best_reward: --
 42533/100000: episode: 1121, duration: 0.009s, episode steps: 27, steps per second: 3153, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.030 [-1.680, 1.013], mean_best_reward: --
 42565/100000: episode: 1122, duration: 0.012s, episode steps: 32, steps per second: 2627, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.055 [-0.438, 1.051], mean_best_reward: --
 42581/100000: episode: 1123, duration: 0.008s, episode steps: 16, steps per second: 1909, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.072 [-0.823, 1.436], mean_best_reward: --
 42662/1000

 43878/100000: episode: 1154, duration: 0.050s, episode steps: 141, steps per second: 2838, episode reward: 141.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.226 [-1.492, 1.341], mean_best_reward: --
 43904/100000: episode: 1155, duration: 0.009s, episode steps: 26, steps per second: 2922, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.615 [0.000, 1.000], mean observation: -0.046 [-2.015, 1.199], mean_best_reward: --
 43963/100000: episode: 1156, duration: 0.018s, episode steps: 59, steps per second: 3256, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.044 [-0.609, 1.072], mean_best_reward: --
 43990/100000: episode: 1157, duration: 0.010s, episode steps: 27, steps per second: 2737, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.080 [-0.555, 1.119], mean_best_reward: --
 44054/1

 45442/100000: episode: 1190, duration: 0.038s, episode steps: 79, steps per second: 2059, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.544 [0.000, 1.000], mean observation: 0.156 [-0.571, 1.415], mean_best_reward: --
 45475/100000: episode: 1191, duration: 0.018s, episode steps: 33, steps per second: 1843, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.081 [-1.480, 0.812], mean_best_reward: --
 45529/100000: episode: 1192, duration: 0.021s, episode steps: 54, steps per second: 2583, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.165 [-0.743, 1.418], mean_best_reward: --
 45569/100000: episode: 1193, duration: 0.014s, episode steps: 40, steps per second: 2838, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.009 [-1.507, 1.185], mean_best_reward: --
 45614/1000

 47086/100000: episode: 1228, duration: 0.022s, episode steps: 75, steps per second: 3334, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.083 [-0.970, 1.343], mean_best_reward: --
 47111/100000: episode: 1229, duration: 0.010s, episode steps: 25, steps per second: 2553, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.094 [-0.610, 1.196], mean_best_reward: --
 47149/100000: episode: 1230, duration: 0.016s, episode steps: 38, steps per second: 2407, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.053 [-0.619, 1.131], mean_best_reward: --
 47167/100000: episode: 1231, duration: 0.006s, episode steps: 18, steps per second: 2882, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.389 [0.000, 1.000], mean observation: 0.079 [-1.018, 1.821], mean_best_reward: --
 47179/10000

 48312/100000: episode: 1264, duration: 0.011s, episode steps: 33, steps per second: 2924, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.012 [-1.375, 1.153], mean_best_reward: --
 48381/100000: episode: 1265, duration: 0.024s, episode steps: 69, steps per second: 2866, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.301 [-1.501, 0.781], mean_best_reward: --
 48503/100000: episode: 1266, duration: 0.035s, episode steps: 122, steps per second: 3470, episode reward: 122.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.066 [-1.537, 1.162], mean_best_reward: --
 48545/100000: episode: 1267, duration: 0.013s, episode steps: 42, steps per second: 3182, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.066 [-0.643, 1.183], mean_best_reward: --
 48580/1

 50390/100000: episode: 1313, duration: 0.046s, episode steps: 133, steps per second: 2920, episode reward: 133.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.300 [-1.847, 0.846], mean_best_reward: --
 50414/100000: episode: 1314, duration: 0.009s, episode steps: 24, steps per second: 2811, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.046 [-1.297, 0.820], mean_best_reward: --
 50471/100000: episode: 1315, duration: 0.017s, episode steps: 57, steps per second: 3303, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.456 [0.000, 1.000], mean observation: 0.034 [-1.243, 1.931], mean_best_reward: --
 50532/100000: episode: 1316, duration: 0.018s, episode steps: 61, steps per second: 3404, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.099 [-1.516, 1.677], mean_best_reward: --
 50544/1

 52253/100000: episode: 1358, duration: 0.020s, episode steps: 64, steps per second: 3191, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.207 [-0.839, 1.264], mean_best_reward: --
 52345/100000: episode: 1359, duration: 0.036s, episode steps: 92, steps per second: 2591, episode reward: 92.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.179 [-0.877, 1.021], mean_best_reward: --
 52357/100000: episode: 1360, duration: 0.006s, episode steps: 12, steps per second: 1985, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.095 [-0.787, 1.196], mean_best_reward: --
 52396/100000: episode: 1361, duration: 0.015s, episode steps: 39, steps per second: 2570, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.070 [-1.101, 1.482], mean_best_reward: --
 52493/10000

 54095/100000: episode: 1399, duration: 0.013s, episode steps: 29, steps per second: 2312, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.081 [-0.536, 1.220], mean_best_reward: --
 54137/100000: episode: 1400, duration: 0.016s, episode steps: 42, steps per second: 2703, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.006 [-0.597, 1.007], mean_best_reward: --
 54174/100000: episode: 1401, duration: 0.016s, episode steps: 37, steps per second: 2303, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.118 [-0.620, 1.591], mean_best_reward: 131.000000
 54190/100000: episode: 1402, duration: 0.007s, episode steps: 16, steps per second: 2440, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.562 [0.000, 1.000], mean observation: -0.090 [-1.415, 0.936], mean_best_reward: --
 54

 56085/100000: episode: 1444, duration: 0.021s, episode steps: 66, steps per second: 3164, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.133 [-1.149, 2.261], mean_best_reward: --
 56101/100000: episode: 1445, duration: 0.009s, episode steps: 16, steps per second: 1851, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: -0.086 [-1.272, 0.779], mean_best_reward: --
 56127/100000: episode: 1446, duration: 0.009s, episode steps: 26, steps per second: 2985, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.039 [-0.954, 1.311], mean_best_reward: --
 56197/100000: episode: 1447, duration: 0.022s, episode steps: 70, steps per second: 3248, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.029 [-1.113, 1.344], mean_best_reward: --
 56240/1000

 57992/100000: episode: 1493, duration: 0.023s, episode steps: 71, steps per second: 3137, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: -0.188 [-2.015, 0.690], mean_best_reward: --
 58054/100000: episode: 1494, duration: 0.027s, episode steps: 62, steps per second: 2311, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-1.128, 0.723], mean_best_reward: --
 58094/100000: episode: 1495, duration: 0.015s, episode steps: 40, steps per second: 2631, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.080 [-0.447, 0.892], mean_best_reward: --
 58170/100000: episode: 1496, duration: 0.026s, episode steps: 76, steps per second: 2931, episode reward: 76.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.026 [-0.950, 1.201], mean_best_reward: --
 58215/100

 59699/100000: episode: 1529, duration: 0.022s, episode steps: 75, steps per second: 3360, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.042 [-2.126, 2.055], mean_best_reward: --
 59716/100000: episode: 1530, duration: 0.006s, episode steps: 17, steps per second: 2662, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.057 [-1.315, 1.877], mean_best_reward: --
 59743/100000: episode: 1531, duration: 0.014s, episode steps: 27, steps per second: 1984, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.079 [-1.280, 0.737], mean_best_reward: --
 59765/100000: episode: 1532, duration: 0.008s, episode steps: 22, steps per second: 2808, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.409 [0.000, 1.000], mean observation: 0.017 [-1.013, 1.550], mean_best_reward: --
 59820/1000

 61042/100000: episode: 1564, duration: 0.036s, episode steps: 106, steps per second: 2980, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.099 [-0.664, 1.172], mean_best_reward: --
 61097/100000: episode: 1565, duration: 0.017s, episode steps: 55, steps per second: 3177, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.083 [-0.870, 0.810], mean_best_reward: --
 61168/100000: episode: 1566, duration: 0.023s, episode steps: 71, steps per second: 3147, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: -0.131 [-1.798, 1.097], mean_best_reward: --
 61183/100000: episode: 1567, duration: 0.005s, episode steps: 15, steps per second: 2801, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.107 [-1.199, 0.641], mean_best_reward: --
 61212/1

 62686/100000: episode: 1600, duration: 0.026s, episode steps: 80, steps per second: 3022, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.053 [-0.984, 1.056], mean_best_reward: --
 62731/100000: episode: 1601, duration: 0.020s, episode steps: 45, steps per second: 2222, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.130 [-0.700, 1.375], mean_best_reward: 162.000000
 62743/100000: episode: 1602, duration: 0.006s, episode steps: 12, steps per second: 2080, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.101 [-1.250, 0.767], mean_best_reward: --
 62823/100000: episode: 1603, duration: 0.026s, episode steps: 80, steps per second: 3042, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.103 [-1.347, 1.432], mean_best_reward: --
 6

 64456/100000: episode: 1646, duration: 0.010s, episode steps: 28, steps per second: 2881, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.102 [-1.207, 0.388], mean_best_reward: --
 64488/100000: episode: 1647, duration: 0.011s, episode steps: 32, steps per second: 2874, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: 0.076 [-0.608, 1.538], mean_best_reward: --
 64602/100000: episode: 1648, duration: 0.046s, episode steps: 114, steps per second: 2464, episode reward: 114.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: -0.151 [-1.380, 1.118], mean_best_reward: --
 64617/100000: episode: 1649, duration: 0.007s, episode steps: 15, steps per second: 2253, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.068 [-1.223, 1.821], mean_best_reward: --
 64629/1

 66047/100000: episode: 1685, duration: 0.010s, episode steps: 14, steps per second: 1367, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.089 [-0.598, 1.134], mean_best_reward: --
 66073/100000: episode: 1686, duration: 0.020s, episode steps: 26, steps per second: 1325, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.065 [-1.241, 0.630], mean_best_reward: --
 66129/100000: episode: 1687, duration: 0.031s, episode steps: 56, steps per second: 1791, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: -0.004 [-1.432, 0.803], mean_best_reward: --
 66174/100000: episode: 1688, duration: 0.025s, episode steps: 45, steps per second: 1815, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.110 [-0.806, 1.055], mean_best_reward: --
 66214/100

 67647/100000: episode: 1724, duration: 0.009s, episode steps: 29, steps per second: 3096, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.045 [-0.607, 1.070], mean_best_reward: --
 67661/100000: episode: 1725, duration: 0.005s, episode steps: 14, steps per second: 2694, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.093 [-0.976, 1.705], mean_best_reward: --
 67687/100000: episode: 1726, duration: 0.011s, episode steps: 26, steps per second: 2319, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.577 [0.000, 1.000], mean observation: -0.105 [-1.938, 0.828], mean_best_reward: --
 67718/100000: episode: 1727, duration: 0.013s, episode steps: 31, steps per second: 2380, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: 0.053 [-0.815, 1.522], mean_best_reward: --
 67760/1000

 68872/100000: episode: 1760, duration: 0.006s, episode steps: 16, steps per second: 2642, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.114 [-1.360, 0.740], mean_best_reward: --
 68904/100000: episode: 1761, duration: 0.015s, episode steps: 32, steps per second: 2188, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: 0.087 [-0.396, 1.388], mean_best_reward: --
 68918/100000: episode: 1762, duration: 0.005s, episode steps: 14, steps per second: 2546, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.079 [-1.014, 1.599], mean_best_reward: --
 68929/100000: episode: 1763, duration: 0.004s, episode steps: 11, steps per second: 2582, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.114 [-0.823, 1.342], mean_best_reward: --
 68991/1000

 70743/100000: episode: 1805, duration: 0.021s, episode steps: 65, steps per second: 3084, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.554 [0.000, 1.000], mean observation: 0.178 [-0.717, 1.238], mean_best_reward: --
 70768/100000: episode: 1806, duration: 0.008s, episode steps: 25, steps per second: 3172, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.052 [-0.832, 1.400], mean_best_reward: --
 70800/100000: episode: 1807, duration: 0.011s, episode steps: 32, steps per second: 2952, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.052 [-1.069, 0.639], mean_best_reward: --
 70821/100000: episode: 1808, duration: 0.011s, episode steps: 21, steps per second: 1829, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.054 [-1.178, 1.867], mean_best_reward: --
 70880/1000

 72553/100000: episode: 1848, duration: 0.019s, episode steps: 62, steps per second: 3351, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.066 [-0.976, 1.251], mean_best_reward: --
 72565/100000: episode: 1849, duration: 0.004s, episode steps: 12, steps per second: 2860, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.101 [-1.175, 1.940], mean_best_reward: --
 72603/100000: episode: 1850, duration: 0.014s, episode steps: 38, steps per second: 2784, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.092 [-0.437, 1.149], mean_best_reward: --
 72629/100000: episode: 1851, duration: 0.012s, episode steps: 26, steps per second: 2118, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.088 [-0.609, 0.990], mean_best_reward: 106.500000
 72

 74126/100000: episode: 1885, duration: 0.008s, episode steps: 24, steps per second: 2924, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.098 [-0.607, 1.251], mean_best_reward: --
 74173/100000: episode: 1886, duration: 0.016s, episode steps: 47, steps per second: 2963, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.093 [-0.486, 1.348], mean_best_reward: --
 74205/100000: episode: 1887, duration: 0.013s, episode steps: 32, steps per second: 2373, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.095 [-0.415, 1.245], mean_best_reward: --
 74220/100000: episode: 1888, duration: 0.006s, episode steps: 15, steps per second: 2643, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.104 [-0.746, 1.303], mean_best_reward: --
 74238/10000

 76064/100000: episode: 1932, duration: 0.028s, episode steps: 91, steps per second: 3239, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.009 [-1.071, 1.073], mean_best_reward: --
 76128/100000: episode: 1933, duration: 0.025s, episode steps: 64, steps per second: 2581, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.071 [-1.631, 0.752], mean_best_reward: --
 76247/100000: episode: 1934, duration: 0.038s, episode steps: 119, steps per second: 3162, episode reward: 119.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.250 [-2.040, 0.813], mean_best_reward: --
 76383/100000: episode: 1935, duration: 0.039s, episode steps: 136, steps per second: 3523, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.196 [-1.683, 1.216], mean_best_reward: --
 7639

 77927/100000: episode: 1969, duration: 0.011s, episode steps: 36, steps per second: 3241, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.046 [-0.848, 1.774], mean_best_reward: --
 77948/100000: episode: 1970, duration: 0.008s, episode steps: 21, steps per second: 2710, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.082 [-0.779, 1.249], mean_best_reward: --
 78003/100000: episode: 1971, duration: 0.022s, episode steps: 55, steps per second: 2470, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.073 [-0.531, 1.443], mean_best_reward: --
 78029/100000: episode: 1972, duration: 0.009s, episode steps: 26, steps per second: 2843, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.066 [-0.599, 1.253], mean_best_reward: --
 78053/10000

 79227/100000: episode: 2005, duration: 0.023s, episode steps: 56, steps per second: 2420, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.112 [-0.617, 1.189], mean_best_reward: --
 79247/100000: episode: 2006, duration: 0.010s, episode steps: 20, steps per second: 2009, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.058 [-1.125, 1.754], mean_best_reward: --
 79274/100000: episode: 2007, duration: 0.009s, episode steps: 27, steps per second: 2979, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.060 [-0.790, 1.161], mean_best_reward: --
 79302/100000: episode: 2008, duration: 0.009s, episode steps: 28, steps per second: 3051, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.070 [-1.535, 0.802], mean_best_reward: --
 79323/1000

 81120/100000: episode: 2052, duration: 0.028s, episode steps: 88, steps per second: 3185, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.135 [-1.690, 1.406], mean_best_reward: --
 81165/100000: episode: 2053, duration: 0.016s, episode steps: 45, steps per second: 2800, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.008 [-1.411, 0.952], mean_best_reward: --
 81178/100000: episode: 2054, duration: 0.005s, episode steps: 13, steps per second: 2627, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.116 [-0.951, 1.675], mean_best_reward: --
 81218/100000: episode: 2055, duration: 0.012s, episode steps: 40, steps per second: 3293, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.059 [-1.097, 0.897], mean_best_reward: --
 81260/100

 82406/100000: episode: 2088, duration: 0.013s, episode steps: 36, steps per second: 2854, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: -0.114 [-1.365, 0.642], mean_best_reward: --
 82485/100000: episode: 2089, duration: 0.029s, episode steps: 79, steps per second: 2764, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.063 [-1.189, 1.361], mean_best_reward: --
 82554/100000: episode: 2090, duration: 0.022s, episode steps: 69, steps per second: 3196, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: -0.021 [-1.336, 0.755], mean_best_reward: --
 82567/100000: episode: 2091, duration: 0.005s, episode steps: 13, steps per second: 2854, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.099 [-1.162, 1.914], mean_best_reward: --
 82595/10

 83764/100000: episode: 2125, duration: 0.005s, episode steps: 15, steps per second: 2811, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.104 [-0.764, 1.333], mean_best_reward: --
 83816/100000: episode: 2126, duration: 0.018s, episode steps: 52, steps per second: 2891, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.018 [-0.913, 1.133], mean_best_reward: --
 83888/100000: episode: 2127, duration: 0.025s, episode steps: 72, steps per second: 2906, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.208 [-0.945, 1.249], mean_best_reward: --
 83907/100000: episode: 2128, duration: 0.006s, episode steps: 19, steps per second: 3016, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.088 [-0.966, 1.558], mean_best_reward: --
 83925/1000

 85015/100000: episode: 2161, duration: 0.009s, episode steps: 29, steps per second: 3266, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.097 [-0.540, 0.985], mean_best_reward: --
 85051/100000: episode: 2162, duration: 0.013s, episode steps: 36, steps per second: 2859, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.089 [-0.633, 1.192], mean_best_reward: --
 85152/100000: episode: 2163, duration: 0.033s, episode steps: 101, steps per second: 3092, episode reward: 101.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.383 [-1.886, 1.121], mean_best_reward: --
 85174/100000: episode: 2164, duration: 0.008s, episode steps: 22, steps per second: 2815, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.409 [0.000, 1.000], mean observation: 0.082 [-1.006, 1.875], mean_best_reward: --
 85245/10

 86940/100000: episode: 2205, duration: 0.015s, episode steps: 45, steps per second: 3049, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.076 [-0.609, 1.258], mean_best_reward: --
 86960/100000: episode: 2206, duration: 0.006s, episode steps: 20, steps per second: 3101, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.083 [-1.008, 1.712], mean_best_reward: --
 86974/100000: episode: 2207, duration: 0.007s, episode steps: 14, steps per second: 2074, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.102 [-1.647, 0.980], mean_best_reward: --
 86996/100000: episode: 2208, duration: 0.011s, episode steps: 22, steps per second: 2084, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.068 [-1.000, 1.484], mean_best_reward: --
 87008/1000

 88337/100000: episode: 2241, duration: 0.021s, episode steps: 64, steps per second: 3037, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.062 [-1.174, 1.197], mean_best_reward: --
 88432/100000: episode: 2242, duration: 0.030s, episode steps: 95, steps per second: 3155, episode reward: 95.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.077 [-1.109, 1.328], mean_best_reward: --
 88531/100000: episode: 2243, duration: 0.029s, episode steps: 99, steps per second: 3380, episode reward: 99.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.092 [-1.170, 1.707], mean_best_reward: --
 88588/100000: episode: 2244, duration: 0.017s, episode steps: 57, steps per second: 3324, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.050 [-1.123, 1.006], mean_best_reward: --
 88622/1000

 90315/100000: episode: 2279, duration: 0.040s, episode steps: 126, steps per second: 3188, episode reward: 126.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.463 [-2.129, 1.299], mean_best_reward: --
 90326/100000: episode: 2280, duration: 0.007s, episode steps: 11, steps per second: 1495, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.123 [-1.329, 2.163], mean_best_reward: --
 90399/100000: episode: 2281, duration: 0.022s, episode steps: 73, steps per second: 3273, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.155 [-1.336, 1.148], mean_best_reward: --
 90428/100000: episode: 2282, duration: 0.009s, episode steps: 29, steps per second: 3118, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.063 [-0.946, 1.430], mean_best_reward: --
 90460/1

 92209/100000: episode: 2324, duration: 0.016s, episode steps: 46, steps per second: 2839, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.100 [-0.587, 1.398], mean_best_reward: --
 92229/100000: episode: 2325, duration: 0.012s, episode steps: 20, steps per second: 1670, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.060 [-1.352, 0.830], mean_best_reward: --
 92282/100000: episode: 2326, duration: 0.023s, episode steps: 53, steps per second: 2330, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.174 [-0.976, 1.638], mean_best_reward: --
 92409/100000: episode: 2327, duration: 0.050s, episode steps: 127, steps per second: 2556, episode reward: 127.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.338 [-0.968, 1.851], mean_best_reward: --
 92429/10

 94000/100000: episode: 2362, duration: 0.014s, episode steps: 46, steps per second: 3195, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: 0.061 [-1.000, 1.732], mean_best_reward: --
 94075/100000: episode: 2363, duration: 0.029s, episode steps: 75, steps per second: 2583, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.167 [-1.280, 1.067], mean_best_reward: --
 94099/100000: episode: 2364, duration: 0.008s, episode steps: 24, steps per second: 3056, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.083 [-0.936, 1.674], mean_best_reward: --
 94137/100000: episode: 2365, duration: 0.012s, episode steps: 38, steps per second: 3276, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: 0.101 [-0.766, 1.814], mean_best_reward: --
 94151/1000

 95744/100000: episode: 2409, duration: 0.021s, episode steps: 70, steps per second: 3337, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.136 [-1.291, 0.943], mean_best_reward: --
 95846/100000: episode: 2410, duration: 0.040s, episode steps: 102, steps per second: 2521, episode reward: 102.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.250 [-1.535, 1.139], mean_best_reward: --
 95904/100000: episode: 2411, duration: 0.018s, episode steps: 58, steps per second: 3250, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.064 [-1.372, 1.337], mean_best_reward: --
 95948/100000: episode: 2412, duration: 0.013s, episode steps: 44, steps per second: 3267, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.003 [-1.025, 1.580], mean_best_reward: --
 95971/

 97695/100000: episode: 2452, duration: 0.010s, episode steps: 25, steps per second: 2492, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.078 [-2.069, 1.144], mean_best_reward: --
 97706/100000: episode: 2453, duration: 0.005s, episode steps: 11, steps per second: 2226, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.094 [-1.588, 1.012], mean_best_reward: --
 97721/100000: episode: 2454, duration: 0.007s, episode steps: 15, steps per second: 2262, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.048 [-1.696, 1.218], mean_best_reward: --
 97746/100000: episode: 2455, duration: 0.009s, episode steps: 25, steps per second: 2696, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.062 [-0.987, 0.606], mean_best_reward: --
 97774/1

 99483/100000: episode: 2497, duration: 0.019s, episode steps: 52, steps per second: 2711, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.106 [-1.339, 0.539], mean_best_reward: --
 99501/100000: episode: 2498, duration: 0.008s, episode steps: 18, steps per second: 2241, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.082 [-1.141, 1.790], mean_best_reward: --
 99585/100000: episode: 2499, duration: 0.026s, episode steps: 84, steps per second: 3206, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.076 [-1.255, 1.031], mean_best_reward: --
 99606/100000: episode: 2500, duration: 0.008s, episode steps: 21, steps per second: 2649, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.619 [0.000, 1.000], mean observation: -0.083 [-2.023, 1.017], mean_best_reward: --
 99703/100

<keras.callbacks.History at 0x7f2ea4029da0>

In [42]:
env.action_space.np_random

<mtrand.RandomState at 0x7f3f904fd900>

In [12]:
import rl

In [13]:
rl.agents

In [4]:
import gym
env = gym.make('CartPole-v1')
mn = []
for i_episode in range(10):
    observation = env.reset()
    for t in range(100):
        env.render()
        
        
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        
        
        if done:
            print("Episode finished after {} timesteps".format(t+1),', Reward:',reward)
            break

Episode finished after 22 timesteps , Reward: 1.0
Episode finished after 14 timesteps , Reward: 1.0
Episode finished after 23 timesteps , Reward: 1.0
Episode finished after 12 timesteps , Reward: 1.0
Episode finished after 51 timesteps , Reward: 1.0
Episode finished after 14 timesteps , Reward: 1.0
Episode finished after 13 timesteps , Reward: 1.0
Episode finished after 10 timesteps , Reward: 1.0
Episode finished after 49 timesteps , Reward: 1.0
Episode finished after 49 timesteps , Reward: 1.0
