In [10]:
import gym
import numpy as np

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Classic Control

## Cart-Pole

In [11]:
# Create the environment and reset it to the initial state
env = gym.make("CartPole-v1")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [12]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


    47/50000: episode: 1, duration: 0.234s, episode steps:  47, steps per second: 201, episode reward: 47.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.404 [0.000, 1.000],  loss: --, mae: --, mean_q: --
    63/50000: episode: 2, duration: 0.012s, episode steps:  16, steps per second: 1335, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.750 [0.000, 1.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


   103/50000: episode: 3, duration: 0.882s, episode steps:  40, steps per second:  45, episode reward: 40.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.475 [0.000, 1.000],  loss: 0.522412, mae: 0.534500, mean_q: -0.013017
   125/50000: episode: 4, duration: 0.093s, episode steps:  22, steps per second: 237, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.591 [0.000, 1.000],  loss: 0.407394, mae: 0.498958, mean_q: 0.108280
   161/50000: episode: 5, duration: 0.153s, episode steps:  36, steps per second: 235, episode reward: 36.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.528 [0.000, 1.000],  loss: 0.185644, mae: 0.529316, mean_q: 0.538482
   180/50000: episode: 6, duration: 0.080s, episode steps:  19, steps per second: 236, episode reward: 19.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 0.058134, mae: 0.624777, mean_q: 1.083646
   198/50000: episode: 7, duration: 0.077s, episode steps:  18, ste

  1044/50000: episode: 39, duration: 0.116s, episode steps:  22, steps per second: 189, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 0.538945, mae: 4.070412, mean_q: 7.919444
  1063/50000: episode: 40, duration: 0.099s, episode steps:  19, steps per second: 191, episode reward: 19.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.421 [0.000, 1.000],  loss: 0.385834, mae: 4.145517, mean_q: 8.117569
  1073/50000: episode: 41, duration: 0.054s, episode steps:  10, steps per second: 185, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.354219, mae: 4.213254, mean_q: 8.284635
  1106/50000: episode: 42, duration: 0.170s, episode steps:  33, steps per second: 194, episode reward: 33.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.576 [0.000, 1.000],  loss: 0.540375, mae: 4.288280, mean_q: 8.331957
  1147/50000: episode: 43, duration: 0.207s, episode steps:  41,

  5603/50000: episode: 74, duration: 0.817s, episode steps: 164, steps per second: 201, episode reward: 164.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.530 [0.000, 1.000],  loss: 2.352743, mae: 22.864025, mean_q: 46.518192
  5848/50000: episode: 75, duration: 1.216s, episode steps: 245, steps per second: 201, episode reward: 245.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.478 [0.000, 1.000],  loss: 2.471309, mae: 23.540297, mean_q: 47.829960
  6007/50000: episode: 76, duration: 0.863s, episode steps: 159, steps per second: 184, episode reward: 159.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.535 [0.000, 1.000],  loss: 2.842201, mae: 24.256903, mean_q: 49.294983
  6199/50000: episode: 77, duration: 1.115s, episode steps: 192, steps per second: 172, episode reward: 192.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.469 [0.000, 1.000],  loss: 2.525446, mae: 24.856138, mean_q: 50.548573
  6357/50000: episode: 78, duration: 1.116s, episode

 12419/50000: episode: 109, duration: 0.889s, episode steps: 168, steps per second: 189, episode reward: 168.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.536 [0.000, 1.000],  loss: 3.300701, mae: 38.254597, mean_q: 77.292595
 12631/50000: episode: 110, duration: 0.961s, episode steps: 212, steps per second: 221, episode reward: 212.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 2.399673, mae: 38.655155, mean_q: 78.234779
 12800/50000: episode: 111, duration: 0.826s, episode steps: 169, steps per second: 205, episode reward: 169.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.456 [0.000, 1.000],  loss: 3.505029, mae: 38.581188, mean_q: 77.977440
 12982/50000: episode: 112, duration: 1.013s, episode steps: 182, steps per second: 180, episode reward: 182.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.522 [0.000, 1.000],  loss: 2.804895, mae: 39.190376, mean_q: 79.293739
 13138/50000: episode: 113, duration: 0.880s, ep

 18997/50000: episode: 144, duration: 1.071s, episode steps: 222, steps per second: 207, episode reward: 222.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.527 [0.000, 1.000],  loss: 2.113759, mae: 40.261192, mean_q: 81.262848
 19192/50000: episode: 145, duration: 0.892s, episode steps: 195, steps per second: 219, episode reward: 195.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.523 [0.000, 1.000],  loss: 2.819534, mae: 40.560352, mean_q: 81.952698
 19356/50000: episode: 146, duration: 0.801s, episode steps: 164, steps per second: 205, episode reward: 164.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.433 [0.000, 1.000],  loss: 2.608602, mae: 40.545334, mean_q: 81.919838
 19531/50000: episode: 147, duration: 0.917s, episode steps: 175, steps per second: 191, episode reward: 175.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.451 [0.000, 1.000],  loss: 1.975867, mae: 40.884697, mean_q: 82.527863
 19713/50000: episode: 148, duration: 0.814s, ep

 27283/50000: episode: 179, duration: 4.427s, episode steps: 500, steps per second: 113, episode reward: 500.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 2.008116, mae: 44.793072, mean_q: 90.405579
 27783/50000: episode: 180, duration: 4.422s, episode steps: 500, steps per second: 113, episode reward: 500.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.502 [0.000, 1.000],  loss: 2.662230, mae: 45.768658, mean_q: 92.222610
 28283/50000: episode: 181, duration: 4.499s, episode steps: 500, steps per second: 111, episode reward: 500.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.502 [0.000, 1.000],  loss: 2.007236, mae: 47.151665, mean_q: 95.022461
 28783/50000: episode: 182, duration: 4.446s, episode steps: 500, steps per second: 112, episode reward: 500.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 4.857229, mae: 47.944393, mean_q: 96.545860
 29283/50000: episode: 183, duration: 4.440s, ep

 38750/50000: episode: 214, duration: 1.243s, episode steps: 137, steps per second: 110, episode reward: 137.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 9.790737, mae: 61.172958, mean_q: 122.864136
 38882/50000: episode: 215, duration: 1.207s, episode steps: 132, steps per second: 109, episode reward: 132.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.523 [0.000, 1.000],  loss: 6.772850, mae: 60.766201, mean_q: 122.355309
 39007/50000: episode: 216, duration: 1.137s, episode steps: 125, steps per second: 110, episode reward: 125.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 3.293504, mae: 60.666222, mean_q: 122.322533
 39148/50000: episode: 217, duration: 1.291s, episode steps: 141, steps per second: 109, episode reward: 141.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.525 [0.000, 1.000],  loss: 10.803409, mae: 60.965466, mean_q: 122.534370
 39283/50000: episode: 218, duration: 1.230

 43104/50000: episode: 249, duration: 1.163s, episode steps: 125, steps per second: 107, episode reward: 125.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.544 [0.000, 1.000],  loss: 4.278934, mae: 61.930954, mean_q: 124.768044
 43218/50000: episode: 250, duration: 1.073s, episode steps: 114, steps per second: 106, episode reward: 114.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.544 [0.000, 1.000],  loss: 12.499614, mae: 61.815792, mean_q: 124.446983
 43327/50000: episode: 251, duration: 1.013s, episode steps: 109, steps per second: 108, episode reward: 109.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.541 [0.000, 1.000],  loss: 8.637574, mae: 62.122372, mean_q: 124.853523
 43453/50000: episode: 252, duration: 1.171s, episode steps: 126, steps per second: 108, episode reward: 126.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.548 [0.000, 1.000],  loss: 1.916406, mae: 61.352242, mean_q: 123.727837
 43566/50000: episode: 253, duration: 1.043

<keras.callbacks.History at 0x7fe72ad2f550>

In [13]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500


<keras.callbacks.History at 0x7fe72ae72c70>

## Mountain Car (Discrete)

In [None]:
# Create the environment and reset it to the initial state
env = gym.make("MountainCar-v0")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [15]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


   200/50000: episode: 1, duration: 0.997s, episode steps: 200, steps per second: 201, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.060 [0.000, 2.000],  loss: 0.232180, mae: 0.502601, mean_q: -0.382186
   400/50000: episode: 2, duration: 0.975s, episode steps: 200, steps per second: 205, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 0.002554, mae: 1.240121, mean_q: -1.828955
   600/50000: episode: 3, duration: 0.943s, episode steps: 200, steps per second: 212, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.990 [0.000, 2.000],  loss: 0.013305, mae: 2.286235, mean_q: -3.385422
   800/50000: episode: 4, duration: 1.059s, episode steps: 200, steps per second: 189, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.015 [0.000, 2.000],  loss: 0.024968, mae: 3.398019, mean_q: -5.041681
  1000/50000: episode: 5, duration: 1.001s, episode step

  7200/50000: episode: 36, duration: 1.034s, episode steps: 200, steps per second: 193, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.115 [0.000, 2.000],  loss: 3.310073, mae: 25.745474, mean_q: -38.204140
  7400/50000: episode: 37, duration: 1.021s, episode steps: 200, steps per second: 196, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.080 [0.000, 2.000],  loss: 3.292670, mae: 26.148848, mean_q: -38.738998
  7600/50000: episode: 38, duration: 1.066s, episode steps: 200, steps per second: 188, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 4.061584, mae: 26.513691, mean_q: -39.252823
  7800/50000: episode: 39, duration: 0.935s, episode steps: 200, steps per second: 214, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.075 [0.000, 2.000],  loss: 4.419078, mae: 26.842304, mean_q: -39.741764
  8000/50000: episode: 40, duration: 1.140s,

 14200/50000: episode: 71, duration: 2.017s, episode steps: 200, steps per second:  99, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.070 [0.000, 2.000],  loss: 4.096649, mae: 33.881943, mean_q: -50.306210
 14400/50000: episode: 72, duration: 1.912s, episode steps: 200, steps per second: 105, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.150 [0.000, 2.000],  loss: 5.003302, mae: 34.160900, mean_q: -50.746593
 14600/50000: episode: 73, duration: 1.936s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.260 [0.000, 2.000],  loss: 6.219534, mae: 34.300228, mean_q: -50.822369
 14800/50000: episode: 74, duration: 1.983s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.100 [0.000, 2.000],  loss: 6.981527, mae: 34.264828, mean_q: -50.762550
 15000/50000: episode: 75, duration: 1.875s,

 21200/50000: episode: 106, duration: 1.882s, episode steps: 200, steps per second: 106, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.150 [0.000, 2.000],  loss: 9.674154, mae: 35.215267, mean_q: -52.163406
 21400/50000: episode: 107, duration: 1.869s, episode steps: 200, steps per second: 107, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.085 [0.000, 2.000],  loss: 8.470133, mae: 35.185429, mean_q: -52.191994
 21600/50000: episode: 108, duration: 1.863s, episode steps: 200, steps per second: 107, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.240 [0.000, 2.000],  loss: 5.426492, mae: 35.206001, mean_q: -52.215843
 21800/50000: episode: 109, duration: 1.916s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.150 [0.000, 2.000],  loss: 5.525233, mae: 35.367081, mean_q: -52.536747
 22000/50000: episode: 110, duration: 1.

 28200/50000: episode: 141, duration: 1.892s, episode steps: 200, steps per second: 106, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.040 [0.000, 2.000],  loss: 5.879042, mae: 37.806087, mean_q: -56.215790
 28400/50000: episode: 142, duration: 1.979s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.125 [0.000, 2.000],  loss: 9.094000, mae: 37.924915, mean_q: -56.188515
 28600/50000: episode: 143, duration: 1.914s, episode steps: 200, steps per second: 105, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.165 [0.000, 2.000],  loss: 7.319055, mae: 37.800850, mean_q: -56.164639
 28800/50000: episode: 144, duration: 1.885s, episode steps: 200, steps per second: 106, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.060 [0.000, 2.000],  loss: 7.656014, mae: 37.848076, mean_q: -56.157051
 29000/50000: episode: 145, duration: 1.

 35200/50000: episode: 176, duration: 1.923s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.130 [0.000, 2.000],  loss: 10.073669, mae: 38.692265, mean_q: -57.439507
 35400/50000: episode: 177, duration: 1.914s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 8.551554, mae: 38.802181, mean_q: -57.562332
 35600/50000: episode: 178, duration: 1.995s, episode steps: 200, steps per second: 100, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.065 [0.000, 2.000],  loss: 7.556763, mae: 38.851189, mean_q: -57.698624
 35800/50000: episode: 179, duration: 1.920s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.960 [0.000, 2.000],  loss: 8.063097, mae: 38.887787, mean_q: -57.771786
 36000/50000: episode: 180, duration: 1

 42200/50000: episode: 211, duration: 1.954s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.970 [0.000, 2.000],  loss: 8.441375, mae: 39.293556, mean_q: -58.286102
 42400/50000: episode: 212, duration: 1.946s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.965 [0.000, 2.000],  loss: 7.939999, mae: 39.237450, mean_q: -58.338303
 42600/50000: episode: 213, duration: 1.943s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.005 [0.000, 2.000],  loss: 9.844736, mae: 39.180714, mean_q: -58.172676
 42800/50000: episode: 214, duration: 1.948s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.050 [0.000, 2.000],  loss: 5.431265, mae: 39.334480, mean_q: -58.584892
 43000/50000: episode: 215, duration: 1.

 49200/50000: episode: 246, duration: 2.079s, episode steps: 200, steps per second:  96, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.925 [0.000, 2.000],  loss: 7.736361, mae: 38.624165, mean_q: -57.365799
 49400/50000: episode: 247, duration: 2.077s, episode steps: 200, steps per second:  96, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.945 [0.000, 2.000],  loss: 6.970424, mae: 38.616264, mean_q: -57.341972
 49600/50000: episode: 248, duration: 2.001s, episode steps: 200, steps per second: 100, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.875 [0.000, 2.000],  loss: 8.675717, mae: 38.467510, mean_q: -57.051014
 49800/50000: episode: 249, duration: 2.010s, episode steps: 200, steps per second: 100, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.830 [0.000, 2.000],  loss: 7.064775, mae: 38.244556, mean_q: -56.784630
 50000/50000: episode: 250, duration: 2.

<keras.callbacks.History at 0x7fe7d4e216d0>

In [16]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -200.000, steps: 200
Episode 6: reward: -200.000, steps: 200
Episode 7: reward: -200.000, steps: 200
Episode 8: reward: -200.000, steps: 200
Episode 9: reward: -200.000, steps: 200
Episode 10: reward: -200.000, steps: 200


<keras.callbacks.History at 0x7fe7d4e20eb0>

## Acrobot

In [17]:
# Create the environment and reset it to the initial state
env = gym.make("Acrobot-v1")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)



In [18]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


   500/50000: episode: 1, duration: 5.216s, episode steps: 500, steps per second:  96, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.988 [0.000, 2.000],  loss: 0.039595, mae: 1.345360, mean_q: -1.863840
  1000/50000: episode: 2, duration: 4.665s, episode steps: 500, steps per second: 107, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.018 [0.000, 2.000],  loss: 0.025499, mae: 3.668244, mean_q: -5.367797
  1447/50000: episode: 3, duration: 4.228s, episode steps: 447, steps per second: 106, episode reward: -446.000, mean reward: -0.998 [-1.000,  0.000], mean action: 1.043 [0.000, 2.000],  loss: 0.077571, mae: 6.075470, mean_q: -8.900464
  1947/50000: episode: 4, duration: 4.649s, episode steps: 500, steps per second: 108, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.048 [0.000, 2.000],  loss: 0.116497, mae: 8.383890, mean_q: -12.331873
  2279/50000: episode: 5, duration: 3.282s, episode ste

 12025/50000: episode: 36, duration: 3.488s, episode steps: 368, steps per second: 106, episode reward: -367.000, mean reward: -0.997 [-1.000,  0.000], mean action: 1.011 [0.000, 2.000],  loss: 1.766647, mae: 29.760349, mean_q: -43.690399
 12238/50000: episode: 37, duration: 2.015s, episode steps: 213, steps per second: 106, episode reward: -212.000, mean reward: -0.995 [-1.000,  0.000], mean action: 0.939 [0.000, 2.000],  loss: 1.520958, mae: 29.821804, mean_q: -43.768421
 12496/50000: episode: 38, duration: 2.446s, episode steps: 258, steps per second: 105, episode reward: -257.000, mean reward: -0.996 [-1.000,  0.000], mean action: 0.965 [0.000, 2.000],  loss: 1.534717, mae: 29.835117, mean_q: -43.746193
 12635/50000: episode: 39, duration: 1.329s, episode steps: 139, steps per second: 105, episode reward: -138.000, mean reward: -0.993 [-1.000,  0.000], mean action: 0.957 [0.000, 2.000],  loss: 1.465238, mae: 29.937311, mean_q: -43.941143
 12910/50000: episode: 40, duration: 2.606s,

 19169/50000: episode: 71, duration: 1.551s, episode steps: 161, steps per second: 104, episode reward: -160.000, mean reward: -0.994 [-1.000,  0.000], mean action: 1.019 [0.000, 2.000],  loss: 1.941131, mae: 29.144310, mean_q: -42.484146
 19368/50000: episode: 72, duration: 1.918s, episode steps: 199, steps per second: 104, episode reward: -198.000, mean reward: -0.995 [-1.000,  0.000], mean action: 1.025 [0.000, 2.000],  loss: 1.800464, mae: 29.227705, mean_q: -42.648849
 19571/50000: episode: 73, duration: 1.996s, episode steps: 203, steps per second: 102, episode reward: -202.000, mean reward: -0.995 [-1.000,  0.000], mean action: 1.148 [0.000, 2.000],  loss: 1.604700, mae: 29.011965, mean_q: -42.347225
 19718/50000: episode: 74, duration: 1.747s, episode steps: 147, steps per second:  84, episode reward: -146.000, mean reward: -0.993 [-1.000,  0.000], mean action: 1.034 [0.000, 2.000],  loss: 1.347512, mae: 28.857824, mean_q: -42.113514
 19895/50000: episode: 75, duration: 2.064s,

 25076/50000: episode: 106, duration: 1.268s, episode steps: 129, steps per second: 102, episode reward: -128.000, mean reward: -0.992 [-1.000,  0.000], mean action: 0.984 [0.000, 2.000],  loss: 1.389147, mae: 27.163595, mean_q: -39.547760
 25230/50000: episode: 107, duration: 1.510s, episode steps: 154, steps per second: 102, episode reward: -153.000, mean reward: -0.994 [-1.000,  0.000], mean action: 1.123 [0.000, 2.000],  loss: 1.258999, mae: 27.207317, mean_q: -39.618813
 25403/50000: episode: 108, duration: 1.689s, episode steps: 173, steps per second: 102, episode reward: -172.000, mean reward: -0.994 [-1.000,  0.000], mean action: 0.942 [0.000, 2.000],  loss: 1.510840, mae: 27.176559, mean_q: -39.543266
 25539/50000: episode: 109, duration: 1.328s, episode steps: 136, steps per second: 102, episode reward: -135.000, mean reward: -0.993 [-1.000,  0.000], mean action: 0.971 [0.000, 2.000],  loss: 1.097504, mae: 26.912144, mean_q: -39.147583
 25651/50000: episode: 110, duration: 1.

 31492/50000: episode: 141, duration: 1.398s, episode steps: 141, steps per second: 101, episode reward: -140.000, mean reward: -0.993 [-1.000,  0.000], mean action: 1.043 [0.000, 2.000],  loss: 1.196766, mae: 26.396847, mean_q: -38.456200
 31673/50000: episode: 142, duration: 1.792s, episode steps: 181, steps per second: 101, episode reward: -180.000, mean reward: -0.994 [-1.000,  0.000], mean action: 1.055 [0.000, 2.000],  loss: 1.181003, mae: 26.056076, mean_q: -37.966812
 31861/50000: episode: 143, duration: 1.856s, episode steps: 188, steps per second: 101, episode reward: -187.000, mean reward: -0.995 [-1.000,  0.000], mean action: 0.872 [0.000, 2.000],  loss: 1.404072, mae: 26.019905, mean_q: -37.872993
 32001/50000: episode: 144, duration: 1.384s, episode steps: 140, steps per second: 101, episode reward: -139.000, mean reward: -0.993 [-1.000,  0.000], mean action: 0.950 [0.000, 2.000],  loss: 1.158157, mae: 26.208202, mean_q: -38.198048
 32188/50000: episode: 145, duration: 1.

 38250/50000: episode: 176, duration: 2.090s, episode steps: 209, steps per second: 100, episode reward: -208.000, mean reward: -0.995 [-1.000,  0.000], mean action: 0.856 [0.000, 2.000],  loss: 1.226825, mae: 26.879259, mean_q: -39.215065
 38516/50000: episode: 177, duration: 2.658s, episode steps: 266, steps per second: 100, episode reward: -265.000, mean reward: -0.996 [-1.000,  0.000], mean action: 1.026 [0.000, 2.000],  loss: 1.296716, mae: 26.839060, mean_q: -39.165936
 38700/50000: episode: 178, duration: 1.847s, episode steps: 184, steps per second: 100, episode reward: -183.000, mean reward: -0.995 [-1.000,  0.000], mean action: 0.957 [0.000, 2.000],  loss: 1.173377, mae: 27.020279, mean_q: -39.467403
 38866/50000: episode: 179, duration: 1.667s, episode steps: 166, steps per second: 100, episode reward: -165.000, mean reward: -0.994 [-1.000,  0.000], mean action: 0.976 [0.000, 2.000],  loss: 1.089788, mae: 27.030245, mean_q: -39.470814
 38991/50000: episode: 180, duration: 1.

 44610/50000: episode: 211, duration: 1.641s, episode steps: 161, steps per second:  98, episode reward: -160.000, mean reward: -0.994 [-1.000,  0.000], mean action: 1.050 [0.000, 2.000],  loss: 1.197696, mae: 27.264977, mean_q: -39.765633
 44800/50000: episode: 212, duration: 1.952s, episode steps: 190, steps per second:  97, episode reward: -189.000, mean reward: -0.995 [-1.000,  0.000], mean action: 1.111 [0.000, 2.000],  loss: 1.181147, mae: 27.177534, mean_q: -39.642944
 45077/50000: episode: 213, duration: 2.808s, episode steps: 277, steps per second:  99, episode reward: -276.000, mean reward: -0.996 [-1.000,  0.000], mean action: 0.874 [0.000, 2.000],  loss: 1.125964, mae: 27.165583, mean_q: -39.627708
 45247/50000: episode: 214, duration: 1.767s, episode steps: 170, steps per second:  96, episode reward: -169.000, mean reward: -0.994 [-1.000,  0.000], mean action: 0.959 [0.000, 2.000],  loss: 1.206549, mae: 27.178841, mean_q: -39.638271
 45514/50000: episode: 215, duration: 2.

<keras.callbacks.History at 0x7fe759bad5b0>

In [19]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: -61.000, steps: 62
Episode 2: reward: -61.000, steps: 62
Episode 3: reward: -151.000, steps: 152
Episode 4: reward: -69.000, steps: 70
Episode 5: reward: -279.000, steps: 280
Episode 6: reward: -82.000, steps: 83
Episode 7: reward: -73.000, steps: 74
Episode 8: reward: -193.000, steps: 194
Episode 9: reward: -99.000, steps: 100
Episode 10: reward: -128.000, steps: 129


<keras.callbacks.History at 0x7fe759badaf0>

# Box2D

## Lunar Lander

In [38]:
# Create the environment and reset it to the initial state
env = gym.make("LunarLander-v2")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)



In [39]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=100000, visualize=False, verbose=2)

Training for 100000 steps ...


  updates=self.state_updates,


    97/100000: episode: 1, duration: 0.797s, episode steps:  97, steps per second: 122, episode reward: -312.989, mean reward: -3.227 [-100.000, 40.537], mean action: 1.629 [0.000, 3.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


   211/100000: episode: 2, duration: 3.646s, episode steps: 114, steps per second:  31, episode reward: -238.897, mean reward: -2.096 [-100.000,  8.177], mean action: 1.614 [0.000, 3.000],  loss: 53.887167, mae: 1.248642, mean_q: -0.260770
   276/100000: episode: 3, duration: 0.720s, episode steps:  65, steps per second:  90, episode reward: -96.340, mean reward: -1.482 [-100.000,  6.120], mean action: 1.708 [0.000, 3.000],  loss: 48.265110, mae: 1.852698, mean_q: -0.831954
   350/100000: episode: 4, duration: 0.823s, episode steps:  74, steps per second:  90, episode reward: -499.421, mean reward: -6.749 [-100.000,  0.699], mean action: 2.027 [0.000, 3.000],  loss: 34.889248, mae: 2.154367, mean_q: -0.889887
   475/100000: episode: 5, duration: 1.375s, episode steps: 125, steps per second:  91, episode reward: -604.896, mean reward: -4.839 [-100.000,  1.876], mean action: 1.280 [0.000, 3.000],  loss: 49.093128, mae: 3.595279, mean_q: -2.387645
   575/100000: episode: 6, duration: 1.09

  5841/100000: episode: 36, duration: 2.547s, episode steps: 226, steps per second:  89, episode reward: -79.463, mean reward: -0.352 [-100.000, 10.865], mean action: 1.664 [0.000, 3.000],  loss: 4.336106, mae: 21.461140, mean_q: -8.992249
  6001/100000: episode: 37, duration: 1.798s, episode steps: 160, steps per second:  89, episode reward: -123.971, mean reward: -0.775 [-100.000,  6.092], mean action: 1.675 [0.000, 3.000],  loss: 3.917167, mae: 22.587315, mean_q: -9.615352
  6298/100000: episode: 38, duration: 3.524s, episode steps: 297, steps per second:  84, episode reward: -41.794, mean reward: -0.141 [-100.000,  8.451], mean action: 1.892 [0.000, 3.000],  loss: 4.397918, mae: 22.369169, mean_q: -8.761622
  6583/100000: episode: 39, duration: 3.257s, episode steps: 285, steps per second:  88, episode reward: -47.846, mean reward: -0.168 [-100.000, 13.311], mean action: 1.775 [0.000, 3.000],  loss: 3.878886, mae: 22.564608, mean_q: -7.465215
  6741/100000: episode: 40, duration: 1

 25710/100000: episode: 71, duration: 13.168s, episode steps: 1000, steps per second:  76, episode reward: 38.315, mean reward:  0.038 [-20.032, 22.813], mean action: 1.435 [0.000, 3.000],  loss: 6.779273, mae: 28.831795, mean_q: 30.547026
 26710/100000: episode: 72, duration: 14.729s, episode steps: 1000, steps per second:  68, episode reward: 41.119, mean reward:  0.041 [-20.465, 13.407], mean action: 1.516 [0.000, 3.000],  loss: 6.363595, mae: 28.446726, mean_q: 30.544008
 27710/100000: episode: 73, duration: 14.957s, episode steps: 1000, steps per second:  67, episode reward: 33.093, mean reward:  0.033 [-21.606, 25.646], mean action: 1.359 [0.000, 3.000],  loss: 6.526580, mae: 28.223207, mean_q: 29.880644
 28710/100000: episode: 74, duration: 13.506s, episode steps: 1000, steps per second:  74, episode reward: -60.863, mean reward: -0.061 [-25.508, 21.668], mean action: 1.343 [0.000, 3.000],  loss: 6.734522, mae: 27.765703, mean_q: 29.347975
 29710/100000: episode: 75, duration: 1

 58200/100000: episode: 106, duration: 13.873s, episode steps: 1000, steps per second:  72, episode reward: 105.845, mean reward:  0.106 [-20.102, 22.789], mean action: 1.246 [0.000, 3.000],  loss: 2.842800, mae: 27.528204, mean_q: 36.940460
 59200/100000: episode: 107, duration: 13.474s, episode steps: 1000, steps per second:  74, episode reward: 136.782, mean reward:  0.137 [-24.686, 23.750], mean action: 1.335 [0.000, 3.000],  loss: 3.586583, mae: 27.406395, mean_q: 36.946720
 60200/100000: episode: 108, duration: 13.526s, episode steps: 1000, steps per second:  74, episode reward: 99.753, mean reward:  0.100 [-22.586, 23.237], mean action: 1.411 [0.000, 3.000],  loss: 2.927845, mae: 27.194309, mean_q: 36.708580
 61200/100000: episode: 109, duration: 14.009s, episode steps: 1000, steps per second:  71, episode reward: 99.755, mean reward:  0.100 [-21.245, 25.013], mean action: 1.374 [0.000, 3.000],  loss: 3.411216, mae: 27.083761, mean_q: 36.571571
 61398/100000: episode: 110, durat

 86724/100000: episode: 140, duration: 13.772s, episode steps: 1000, steps per second:  73, episode reward: 127.718, mean reward:  0.128 [-20.146, 22.439], mean action: 1.457 [0.000, 3.000],  loss: 3.435793, mae: 27.901012, mean_q: 37.620975
 87724/100000: episode: 141, duration: 13.329s, episode steps: 1000, steps per second:  75, episode reward: 84.727, mean reward:  0.085 [-24.078, 23.098], mean action: 1.513 [0.000, 3.000],  loss: 2.939423, mae: 27.819235, mean_q: 37.474495
 88005/100000: episode: 142, duration: 3.553s, episode steps: 281, steps per second:  79, episode reward: -230.141, mean reward: -0.819 [-100.000, 52.874], mean action: 1.794 [0.000, 3.000],  loss: 4.143942, mae: 27.931124, mean_q: 37.460964
 89005/100000: episode: 143, duration: 14.155s, episode steps: 1000, steps per second:  71, episode reward: 160.358, mean reward:  0.160 [-19.582, 13.047], mean action: 1.451 [0.000, 3.000],  loss: 3.731712, mae: 27.904253, mean_q: 37.435280
 90005/100000: episode: 144, dura

<keras.callbacks.History at 0x7fe753e98a90>

In [40]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: -175.041, steps: 342
Episode 2: reward: 23.638, steps: 220
Episode 3: reward: -73.747, steps: 107
Episode 4: reward: 123.541, steps: 1000
Episode 5: reward: -325.697, steps: 345
Episode 6: reward: 177.584, steps: 373
Episode 7: reward: 257.463, steps: 291
Episode 8: reward: 254.353, steps: 277
Episode 9: reward: 164.730, steps: 388
Episode 10: reward: 260.992, steps: 248


<keras.callbacks.History at 0x7fe74f91b700>

# Toy Text

## Frozen Lake

In [41]:
# Create the environment and reset it to the initial state
env = gym.make("FrozenLake-v0")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [42]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


    15/50000: episode: 1, duration: 0.769s, episode steps:  15, steps per second:  19, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: --, mae: --, mean_q: --
    34/50000: episode: 2, duration: 0.031s, episode steps:  19, steps per second: 618, episode reward:  1.000, mean reward:  0.053 [ 0.000,  1.000], mean action: 1.263 [0.000, 3.000],  loss: --, mae: --, mean_q: --
    38/50000: episode: 3, duration: 0.008s, episode steps:   4, steps per second: 501, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: --, mae: --, mean_q: --
    44/50000: episode: 4, duration: 0.011s, episode steps:   6, steps per second: 551, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: --, mae: --, mean_q: --
    50/50000: episode: 5, duration: 0.011s, episode steps:   6, steps per second: 552, episode reward:  0.000, mean reward:  0.000 [ 0.000, 

  updates=self.state_updates,


   102/50000: episode: 11, duration: 2.974s, episode steps:  11, steps per second:   4, episode reward:  1.000, mean reward:  0.091 [ 0.000,  1.000], mean action: 1.364 [0.000, 3.000],  loss: 0.197874, mae: 0.260762, mean_q: 0.594411
   108/50000: episode: 12, duration: 0.074s, episode steps:   6, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.186194, mae: 0.221708, mean_q: 0.426308
   127/50000: episode: 13, duration: 0.224s, episode steps:  19, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.579 [0.000, 3.000],  loss: 0.102199, mae: 0.217833, mean_q: 0.424311
   132/50000: episode: 14, duration: 0.065s, episode steps:   5, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.079698, mae: 0.280471, mean_q: 0.447362
   146/50000: episode: 15, duration: 0.166s, episode steps:  14,

   344/50000: episode: 47, duration: 0.077s, episode steps:   6, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.028374, mae: 0.261066, mean_q: 0.348317
   348/50000: episode: 48, duration: 0.056s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 3.000],  loss: 0.028657, mae: 0.252358, mean_q: 0.330545
   353/50000: episode: 49, duration: 0.065s, episode steps:   5, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.027451, mae: 0.256632, mean_q: 0.323180
   359/50000: episode: 50, duration: 0.074s, episode steps:   6, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.020540, mae: 0.262168, mean_q: 0.334407
   376/50000: episode: 51, duration: 0.202s, episode steps:  17,

   616/50000: episode: 84, duration: 0.033s, episode steps:   2, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.009555, mae: 0.219633, mean_q: 0.291733
   623/50000: episode: 85, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.010200, mae: 0.223712, mean_q: 0.298976
   628/50000: episode: 86, duration: 0.065s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 2.000],  loss: 0.009256, mae: 0.222349, mean_q: 0.297424
   640/50000: episode: 87, duration: 0.145s, episode steps:  12, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.583 [0.000, 3.000],  loss: 0.009927, mae: 0.235701, mean_q: 0.313784
   650/50000: episode: 88, duration: 0.120s, episode steps:  10,

   871/50000: episode: 121, duration: 0.157s, episode steps:  13, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.154 [0.000, 3.000],  loss: 0.006662, mae: 0.211783, mean_q: 0.280448
   878/50000: episode: 122, duration: 0.085s, episode steps:   7, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.143 [0.000, 2.000],  loss: 0.008727, mae: 0.197912, mean_q: 0.263681
   881/50000: episode: 123, duration: 0.043s, episode steps:   3, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: 0.006907, mae: 0.187045, mean_q: 0.254913
   890/50000: episode: 124, duration: 0.114s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.889 [0.000, 2.000],  loss: 0.008309, mae: 0.192280, mean_q: 0.260749
   892/50000: episode: 125, duration: 0.033s, episode steps:

  1161/50000: episode: 157, duration: 0.046s, episode steps:   3, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.667 [2.000, 3.000],  loss: 0.004453, mae: 0.162777, mean_q: 0.218730
  1176/50000: episode: 158, duration: 0.179s, episode steps:  15, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.003737, mae: 0.169656, mean_q: 0.228907
  1185/50000: episode: 159, duration: 0.114s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.004005, mae: 0.164458, mean_q: 0.221646
  1189/50000: episode: 160, duration: 0.053s, episode steps:   4, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.008487, mae: 0.166522, mean_q: 0.224180
  1198/50000: episode: 161, duration: 0.106s, episode steps:

  1443/50000: episode: 193, duration: 0.076s, episode steps:   6, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.003013, mae: 0.142934, mean_q: 0.193870
  1452/50000: episode: 194, duration: 0.112s, episode steps:   9, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.003884, mae: 0.138926, mean_q: 0.187770
  1461/50000: episode: 195, duration: 0.110s, episode steps:   9, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.778 [0.000, 3.000],  loss: 0.002623, mae: 0.139958, mean_q: 0.194250
  1466/50000: episode: 196, duration: 0.066s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002903, mae: 0.145743, mean_q: 0.202419
  1468/50000: episode: 197, duration: 0.032s, episode steps:

  1714/50000: episode: 229, duration: 0.125s, episode steps:  10, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002950, mae: 0.124098, mean_q: 0.178614
  1723/50000: episode: 230, duration: 0.108s, episode steps:   9, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.002328, mae: 0.125253, mean_q: 0.177463
  1726/50000: episode: 231, duration: 0.044s, episode steps:   3, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 2.000],  loss: 0.003911, mae: 0.128608, mean_q: 0.173670
  1732/50000: episode: 232, duration: 0.082s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 2.000],  loss: 0.002633, mae: 0.129555, mean_q: 0.173312
  1736/50000: episode: 233, duration: 0.054s, episode steps:

  1998/50000: episode: 265, duration: 0.097s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002993, mae: 0.113588, mean_q: 0.158683
  2006/50000: episode: 266, duration: 0.102s, episode steps:   8, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002024, mae: 0.109984, mean_q: 0.152421
  2019/50000: episode: 267, duration: 0.155s, episode steps:  13, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003162, mae: 0.107108, mean_q: 0.144177
  2025/50000: episode: 268, duration: 0.077s, episode steps:   6, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002290, mae: 0.112080, mean_q: 0.150279
  2031/50000: episode: 269, duration: 0.075s, episode steps:

  2256/50000: episode: 300, duration: 0.188s, episode steps:  10, steps per second:  53, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.002598, mae: 0.094719, mean_q: 0.125957
  2267/50000: episode: 301, duration: 0.137s, episode steps:  11, steps per second:  80, episode reward:  1.000, mean reward:  0.091 [ 0.000,  1.000], mean action: 1.636 [0.000, 3.000],  loss: 0.001530, mae: 0.094159, mean_q: 0.126412
  2274/50000: episode: 302, duration: 0.093s, episode steps:   7, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002917, mae: 0.100861, mean_q: 0.135238
  2281/50000: episode: 303, duration: 0.088s, episode steps:   7, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002148, mae: 0.103744, mean_q: 0.138372
  2283/50000: episode: 304, duration: 0.034s, episode steps:

  2524/50000: episode: 336, duration: 0.293s, episode steps:  24, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.417 [0.000, 3.000],  loss: 0.001797, mae: 0.091550, mean_q: 0.129075
  2529/50000: episode: 337, duration: 0.080s, episode steps:   5, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002219, mae: 0.095958, mean_q: 0.128618
  2536/50000: episode: 338, duration: 0.092s, episode steps:   7, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 3.000],  loss: 0.002904, mae: 0.097823, mean_q: 0.131287
  2546/50000: episode: 339, duration: 0.123s, episode steps:  10, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.001476, mae: 0.091137, mean_q: 0.124868
  2552/50000: episode: 340, duration: 0.077s, episode steps:

  2780/50000: episode: 372, duration: 0.263s, episode steps:  18, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.003234, mae: 0.099827, mean_q: 0.134060
  2786/50000: episode: 373, duration: 0.077s, episode steps:   6, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [1.000, 3.000],  loss: 0.002471, mae: 0.095056, mean_q: 0.127702
  2791/50000: episode: 374, duration: 0.065s, episode steps:   5, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.800 [0.000, 2.000],  loss: 0.002731, mae: 0.091771, mean_q: 0.124009
  2800/50000: episode: 375, duration: 0.155s, episode steps:   9, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.778 [0.000, 3.000],  loss: 0.002716, mae: 0.093199, mean_q: 0.130710
  2806/50000: episode: 376, duration: 0.088s, episode steps:

  3068/50000: episode: 408, duration: 0.156s, episode steps:  11, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.727 [0.000, 3.000],  loss: 0.001911, mae: 0.103362, mean_q: 0.140353
  3072/50000: episode: 409, duration: 0.055s, episode steps:   4, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [1.000, 3.000],  loss: 0.003305, mae: 0.112062, mean_q: 0.153555
  3080/50000: episode: 410, duration: 0.100s, episode steps:   8, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003274, mae: 0.105987, mean_q: 0.144622
  3099/50000: episode: 411, duration: 0.222s, episode steps:  19, steps per second:  86, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.368 [0.000, 3.000],  loss: 0.003888, mae: 0.106601, mean_q: 0.143292
  3115/50000: episode: 412, duration: 0.210s, episode steps:

  3363/50000: episode: 445, duration: 0.115s, episode steps:   9, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002814, mae: 0.104925, mean_q: 0.142200
  3365/50000: episode: 446, duration: 0.033s, episode steps:   2, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002255, mae: 0.101688, mean_q: 0.139757
  3367/50000: episode: 447, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.002565, mae: 0.110696, mean_q: 0.151111
  3401/50000: episode: 448, duration: 0.385s, episode steps:  34, steps per second:  88, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.618 [0.000, 3.000],  loss: 0.003261, mae: 0.107640, mean_q: 0.147076
  3405/50000: episode: 449, duration: 0.073s, episode steps:

  3649/50000: episode: 481, duration: 0.191s, episode steps:  16, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.938 [0.000, 3.000],  loss: 0.003063, mae: 0.111061, mean_q: 0.150236
  3671/50000: episode: 482, duration: 0.253s, episode steps:  22, steps per second:  87, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.273 [0.000, 3.000],  loss: 0.003204, mae: 0.104395, mean_q: 0.141161
  3683/50000: episode: 483, duration: 0.142s, episode steps:  12, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002675, mae: 0.098191, mean_q: 0.139417
  3688/50000: episode: 484, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.200 [0.000, 3.000],  loss: 0.002226, mae: 0.111206, mean_q: 0.155137
  3703/50000: episode: 485, duration: 0.187s, episode steps:

  4017/50000: episode: 516, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001628, mae: 0.101206, mean_q: 0.135051
  4028/50000: episode: 517, duration: 0.131s, episode steps:  11, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.818 [0.000, 3.000],  loss: 0.002232, mae: 0.098888, mean_q: 0.133596
  4036/50000: episode: 518, duration: 0.103s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003406, mae: 0.106980, mean_q: 0.144004
  4038/50000: episode: 519, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001281, mae: 0.101707, mean_q: 0.138126
  4053/50000: episode: 520, duration: 0.200s, episode steps:

  4333/50000: episode: 551, duration: 0.190s, episode steps:  16, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002258, mae: 0.105536, mean_q: 0.149509
  4339/50000: episode: 552, duration: 0.077s, episode steps:   6, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.002920, mae: 0.112412, mean_q: 0.159245
  4347/50000: episode: 553, duration: 0.097s, episode steps:   8, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003679, mae: 0.117507, mean_q: 0.165955
  4349/50000: episode: 554, duration: 0.032s, episode steps:   2, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001843, mae: 0.119161, mean_q: 0.163840
  4352/50000: episode: 555, duration: 0.047s, episode steps:

  4615/50000: episode: 586, duration: 0.108s, episode steps:   9, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.778 [0.000, 2.000],  loss: 0.001635, mae: 0.112575, mean_q: 0.158625
  4618/50000: episode: 587, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003478, mae: 0.113526, mean_q: 0.151504
  4624/50000: episode: 588, duration: 0.076s, episode steps:   6, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.167 [0.000, 3.000],  loss: 0.002508, mae: 0.105639, mean_q: 0.139821
  4630/50000: episode: 589, duration: 0.075s, episode steps:   6, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.002878, mae: 0.107921, mean_q: 0.143159
  4646/50000: episode: 590, duration: 0.189s, episode steps:

  4857/50000: episode: 622, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.002064, mae: 0.105133, mean_q: 0.147156
  4859/50000: episode: 623, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.005800, mae: 0.111555, mean_q: 0.156893
  4867/50000: episode: 624, duration: 0.098s, episode steps:   8, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 2.000],  loss: 0.004164, mae: 0.117283, mean_q: 0.160172
  4870/50000: episode: 625, duration: 0.043s, episode steps:   3, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001864, mae: 0.127050, mean_q: 0.180187
  4886/50000: episode: 626, duration: 0.189s, episode steps:

  5137/50000: episode: 659, duration: 0.146s, episode steps:  12, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002772, mae: 0.111514, mean_q: 0.150024
  5142/50000: episode: 660, duration: 0.066s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.003151, mae: 0.121630, mean_q: 0.166769
  5146/50000: episode: 661, duration: 0.054s, episode steps:   4, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003975, mae: 0.124257, mean_q: 0.165963
  5171/50000: episode: 662, duration: 0.312s, episode steps:  25, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.840 [0.000, 3.000],  loss: 0.003403, mae: 0.115171, mean_q: 0.157086
  5174/50000: episode: 663, duration: 0.064s, episode steps:

  5374/50000: episode: 694, duration: 0.164s, episode steps:  13, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.308 [0.000, 3.000],  loss: 0.003113, mae: 0.120862, mean_q: 0.162262
  5383/50000: episode: 695, duration: 0.126s, episode steps:   9, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002970, mae: 0.118101, mean_q: 0.158715
  5387/50000: episode: 696, duration: 0.065s, episode steps:   4, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 2.000],  loss: 0.001908, mae: 0.112235, mean_q: 0.159089
  5393/50000: episode: 697, duration: 0.087s, episode steps:   6, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.003370, mae: 0.117569, mean_q: 0.165801
  5397/50000: episode: 698, duration: 0.059s, episode steps:

  5605/50000: episode: 729, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.004319, mae: 0.128076, mean_q: 0.169943
  5607/50000: episode: 730, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001718, mae: 0.115419, mean_q: 0.155714
  5610/50000: episode: 731, duration: 0.059s, episode steps:   3, steps per second:  51, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.001805, mae: 0.119132, mean_q: 0.163972
  5615/50000: episode: 732, duration: 0.076s, episode steps:   5, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003945, mae: 0.121570, mean_q: 0.167699
  5618/50000: episode: 733, duration: 0.050s, episode steps:

  5901/50000: episode: 764, duration: 0.236s, episode steps:  20, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.300 [0.000, 3.000],  loss: 0.002267, mae: 0.112275, mean_q: 0.155553
  5912/50000: episode: 765, duration: 0.132s, episode steps:  11, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.002954, mae: 0.117390, mean_q: 0.165759
  5915/50000: episode: 766, duration: 0.043s, episode steps:   3, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: 0.006078, mae: 0.123538, mean_q: 0.170651
  5919/50000: episode: 767, duration: 0.056s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 3.000],  loss: 0.005961, mae: 0.127605, mean_q: 0.171846
  5924/50000: episode: 768, duration: 0.069s, episode steps:

  6137/50000: episode: 801, duration: 0.104s, episode steps:   7, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 3.000],  loss: 0.002802, mae: 0.116378, mean_q: 0.156074
  6149/50000: episode: 802, duration: 0.166s, episode steps:  12, steps per second:  72, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003161, mae: 0.110368, mean_q: 0.147725
  6158/50000: episode: 803, duration: 0.129s, episode steps:   9, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003297, mae: 0.113943, mean_q: 0.157010
  6171/50000: episode: 804, duration: 0.170s, episode steps:  13, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.692 [0.000, 3.000],  loss: 0.002376, mae: 0.117756, mean_q: 0.160014
  6184/50000: episode: 805, duration: 0.175s, episode steps:

  6439/50000: episode: 838, duration: 0.099s, episode steps:   8, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002030, mae: 0.105183, mean_q: 0.146086
  6451/50000: episode: 839, duration: 0.148s, episode steps:  12, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002591, mae: 0.107250, mean_q: 0.149925
  6461/50000: episode: 840, duration: 0.131s, episode steps:  10, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.300 [0.000, 3.000],  loss: 0.001922, mae: 0.105561, mean_q: 0.147953
  6463/50000: episode: 841, duration: 0.034s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.001193, mae: 0.100654, mean_q: 0.142685
  6472/50000: episode: 842, duration: 0.108s, episode steps:

  6691/50000: episode: 874, duration: 0.112s, episode steps:   7, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002740, mae: 0.100980, mean_q: 0.136455
  6701/50000: episode: 875, duration: 0.167s, episode steps:  10, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.001546, mae: 0.103726, mean_q: 0.147530
  6703/50000: episode: 876, duration: 0.045s, episode steps:   2, steps per second:  44, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.000985, mae: 0.099850, mean_q: 0.145677
  6708/50000: episode: 877, duration: 0.083s, episode steps:   5, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 2.000],  loss: 0.001847, mae: 0.098426, mean_q: 0.139181
  6721/50000: episode: 878, duration: 0.197s, episode steps:

  7039/50000: episode: 910, duration: 0.125s, episode steps:   6, steps per second:  48, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002015, mae: 0.105578, mean_q: 0.141832
  7048/50000: episode: 911, duration: 0.156s, episode steps:   9, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.003025, mae: 0.104012, mean_q: 0.137715
  7061/50000: episode: 912, duration: 0.233s, episode steps:  13, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.769 [0.000, 3.000],  loss: 0.001577, mae: 0.099658, mean_q: 0.136174
  7064/50000: episode: 913, duration: 0.086s, episode steps:   3, steps per second:  35, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003770, mae: 0.113381, mean_q: 0.155575
  7069/50000: episode: 914, duration: 0.080s, episode steps:

  7288/50000: episode: 946, duration: 0.152s, episode steps:  12, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.917 [0.000, 3.000],  loss: 0.002416, mae: 0.107245, mean_q: 0.149048
  7295/50000: episode: 947, duration: 0.088s, episode steps:   7, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.143 [0.000, 3.000],  loss: 0.002495, mae: 0.101263, mean_q: 0.140555
  7300/50000: episode: 948, duration: 0.066s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003078, mae: 0.100715, mean_q: 0.139203
  7314/50000: episode: 949, duration: 0.165s, episode steps:  14, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.786 [0.000, 3.000],  loss: 0.002873, mae: 0.105265, mean_q: 0.148033
  7321/50000: episode: 950, duration: 0.087s, episode steps:

  7558/50000: episode: 981, duration: 0.153s, episode steps:  13, steps per second:  85, episode reward:  1.000, mean reward:  0.077 [ 0.000,  1.000], mean action: 1.308 [0.000, 3.000],  loss: 0.002938, mae: 0.105813, mean_q: 0.142649
  7561/50000: episode: 982, duration: 0.044s, episode steps:   3, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.000940, mae: 0.104914, mean_q: 0.146186
  7565/50000: episode: 983, duration: 0.055s, episode steps:   4, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [1.000, 3.000],  loss: 0.002769, mae: 0.109974, mean_q: 0.146992
  7570/50000: episode: 984, duration: 0.065s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 2.000],  loss: 0.002041, mae: 0.097360, mean_q: 0.131072
  7576/50000: episode: 985, duration: 0.087s, episode steps:

  7799/50000: episode: 1016, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002064, mae: 0.104370, mean_q: 0.142852
  7817/50000: episode: 1017, duration: 0.215s, episode steps:  18, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002220, mae: 0.103304, mean_q: 0.140295
  7823/50000: episode: 1018, duration: 0.076s, episode steps:   6, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.002479, mae: 0.107745, mean_q: 0.150435
  7826/50000: episode: 1019, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.004438, mae: 0.112540, mean_q: 0.153882
  7834/50000: episode: 1020, duration: 0.102s, episode s

  8035/50000: episode: 1051, duration: 0.112s, episode steps:   9, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002639, mae: 0.102898, mean_q: 0.141270
  8041/50000: episode: 1052, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002888, mae: 0.108801, mean_q: 0.147593
  8052/50000: episode: 1053, duration: 0.135s, episode steps:  11, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.091 [0.000, 3.000],  loss: 0.003810, mae: 0.108986, mean_q: 0.146847
  8056/50000: episode: 1054, duration: 0.064s, episode steps:   4, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002364, mae: 0.101304, mean_q: 0.136024
  8068/50000: episode: 1055, duration: 0.142s, episode s

  8297/50000: episode: 1086, duration: 0.099s, episode steps:   8, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 2.000],  loss: 0.002375, mae: 0.110075, mean_q: 0.148841
  8311/50000: episode: 1087, duration: 0.165s, episode steps:  14, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.071 [0.000, 3.000],  loss: 0.002380, mae: 0.110353, mean_q: 0.150039
  8313/50000: episode: 1088, duration: 0.033s, episode steps:   2, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.002638, mae: 0.110771, mean_q: 0.157617
  8318/50000: episode: 1089, duration: 0.067s, episode steps:   5, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.004168, mae: 0.113926, mean_q: 0.157952
  8321/50000: episode: 1090, duration: 0.043s, episode s

  8538/50000: episode: 1122, duration: 0.156s, episode steps:  13, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.846 [0.000, 3.000],  loss: 0.003111, mae: 0.110740, mean_q: 0.149545
  8543/50000: episode: 1123, duration: 0.066s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002354, mae: 0.106164, mean_q: 0.141022
  8548/50000: episode: 1124, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002921, mae: 0.117479, mean_q: 0.155834
  8563/50000: episode: 1125, duration: 0.183s, episode steps:  15, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002861, mae: 0.104311, mean_q: 0.138670
  8567/50000: episode: 1126, duration: 0.055s, episode s

  8795/50000: episode: 1157, duration: 0.188s, episode steps:  16, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.562 [0.000, 3.000],  loss: 0.001654, mae: 0.093876, mean_q: 0.134850
  8803/50000: episode: 1158, duration: 0.097s, episode steps:   8, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.875 [1.000, 3.000],  loss: 0.002890, mae: 0.099402, mean_q: 0.141344
  8807/50000: episode: 1159, duration: 0.055s, episode steps:   4, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001978, mae: 0.103968, mean_q: 0.144340
  8820/50000: episode: 1160, duration: 0.160s, episode steps:  13, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.462 [0.000, 3.000],  loss: 0.002073, mae: 0.098360, mean_q: 0.135255
  8825/50000: episode: 1161, duration: 0.070s, episode s

  9093/50000: episode: 1194, duration: 0.108s, episode steps:   8, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 3.000],  loss: 0.002315, mae: 0.096127, mean_q: 0.136859
  9097/50000: episode: 1195, duration: 0.054s, episode steps:   4, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001525, mae: 0.094443, mean_q: 0.131495
  9099/50000: episode: 1196, duration: 0.032s, episode steps:   2, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001432, mae: 0.097305, mean_q: 0.135138
  9107/50000: episode: 1197, duration: 0.098s, episode steps:   8, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001948, mae: 0.093936, mean_q: 0.129315
  9113/50000: episode: 1198, duration: 0.077s, episode s

  9343/50000: episode: 1229, duration: 0.112s, episode steps:   9, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.004084, mae: 0.100224, mean_q: 0.135496
  9345/50000: episode: 1230, duration: 0.033s, episode steps:   2, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.002118, mae: 0.103461, mean_q: 0.142213
  9370/50000: episode: 1231, duration: 0.294s, episode steps:  25, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.480 [0.000, 3.000],  loss: 0.002585, mae: 0.094777, mean_q: 0.128549
  9382/50000: episode: 1232, duration: 0.148s, episode steps:  12, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.002384, mae: 0.099794, mean_q: 0.137639
  9388/50000: episode: 1233, duration: 0.079s, episode s

  9603/50000: episode: 1266, duration: 0.116s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.001849, mae: 0.100199, mean_q: 0.135492
  9618/50000: episode: 1267, duration: 0.176s, episode steps:  15, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003033, mae: 0.101553, mean_q: 0.138857
  9621/50000: episode: 1268, duration: 0.045s, episode steps:   3, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001168, mae: 0.091590, mean_q: 0.131043
  9626/50000: episode: 1269, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [1.000, 2.000],  loss: 0.002209, mae: 0.106544, mean_q: 0.147983
  9636/50000: episode: 1270, duration: 0.123s, episode s

  9881/50000: episode: 1301, duration: 0.115s, episode steps:   6, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003435, mae: 0.110420, mean_q: 0.156812
  9887/50000: episode: 1302, duration: 0.078s, episode steps:   6, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002999, mae: 0.110581, mean_q: 0.153955
  9898/50000: episode: 1303, duration: 0.132s, episode steps:  11, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.002544, mae: 0.095761, mean_q: 0.136949
  9910/50000: episode: 1304, duration: 0.142s, episode steps:  12, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.002345, mae: 0.098635, mean_q: 0.138491
  9914/50000: episode: 1305, duration: 0.064s, episode s

 10144/50000: episode: 1337, duration: 0.171s, episode steps:  14, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002265, mae: 0.105561, mean_q: 0.145129
 10147/50000: episode: 1338, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.001111, mae: 0.088153, mean_q: 0.124937
 10167/50000: episode: 1339, duration: 0.244s, episode steps:  20, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.900 [0.000, 3.000],  loss: 0.002241, mae: 0.102159, mean_q: 0.142231
 10170/50000: episode: 1340, duration: 0.043s, episode steps:   3, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.003634, mae: 0.095667, mean_q: 0.133562
 10178/50000: episode: 1341, duration: 0.103s, episode s

 10478/50000: episode: 1374, duration: 0.069s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.003116, mae: 0.102137, mean_q: 0.139595
 10484/50000: episode: 1375, duration: 0.077s, episode steps:   6, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.001761, mae: 0.095786, mean_q: 0.135742
 10486/50000: episode: 1376, duration: 0.032s, episode steps:   2, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.003544, mae: 0.119536, mean_q: 0.166812
 10492/50000: episode: 1377, duration: 0.078s, episode steps:   6, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.167 [1.000, 3.000],  loss: 0.002518, mae: 0.104507, mean_q: 0.143833
 10497/50000: episode: 1378, duration: 0.068s, episode s

 10748/50000: episode: 1410, duration: 0.111s, episode steps:   9, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002500, mae: 0.097975, mean_q: 0.133583
 10763/50000: episode: 1411, duration: 0.177s, episode steps:  15, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.467 [0.000, 3.000],  loss: 0.003799, mae: 0.104691, mean_q: 0.141706
 10772/50000: episode: 1412, duration: 0.113s, episode steps:   9, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.002827, mae: 0.105602, mean_q: 0.145754
 10776/50000: episode: 1413, duration: 0.055s, episode steps:   4, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.003292, mae: 0.096469, mean_q: 0.134605
 10794/50000: episode: 1414, duration: 0.218s, episode s

 10976/50000: episode: 1445, duration: 0.066s, episode steps:   5, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.001519, mae: 0.096878, mean_q: 0.135108
 10981/50000: episode: 1446, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002913, mae: 0.096541, mean_q: 0.134754
 10996/50000: episode: 1447, duration: 0.191s, episode steps:  15, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.002382, mae: 0.096047, mean_q: 0.134370
 10999/50000: episode: 1448, duration: 0.048s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.002248, mae: 0.097401, mean_q: 0.133537
 11007/50000: episode: 1449, duration: 0.107s, episode s

 11250/50000: episode: 1481, duration: 0.162s, episode steps:  13, steps per second:  80, episode reward:  1.000, mean reward:  0.077 [ 0.000,  1.000], mean action: 1.385 [0.000, 3.000],  loss: 0.002293, mae: 0.096744, mean_q: 0.136445
 11253/50000: episode: 1482, duration: 0.045s, episode steps:   3, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.001286, mae: 0.088616, mean_q: 0.123053
 11265/50000: episode: 1483, duration: 0.185s, episode steps:  12, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.001865, mae: 0.091908, mean_q: 0.124704
 11282/50000: episode: 1484, duration: 0.230s, episode steps:  17, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.353 [0.000, 3.000],  loss: 0.002036, mae: 0.096542, mean_q: 0.131649
 11285/50000: episode: 1485, duration: 0.053s, episode s

 11567/50000: episode: 1518, duration: 0.205s, episode steps:  17, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.529 [0.000, 3.000],  loss: 0.002009, mae: 0.097394, mean_q: 0.134438
 11575/50000: episode: 1519, duration: 0.105s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 3.000],  loss: 0.003542, mae: 0.096373, mean_q: 0.129683
 11582/50000: episode: 1520, duration: 0.088s, episode steps:   7, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.143 [0.000, 3.000],  loss: 0.003095, mae: 0.096475, mean_q: 0.127851
 11587/50000: episode: 1521, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002490, mae: 0.103140, mean_q: 0.137672
 11594/50000: episode: 1522, duration: 0.091s, episode s

 11793/50000: episode: 1554, duration: 0.050s, episode steps:   3, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002989, mae: 0.093387, mean_q: 0.122337
 11795/50000: episode: 1555, duration: 0.034s, episode steps:   2, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001877, mae: 0.086712, mean_q: 0.117946
 11806/50000: episode: 1556, duration: 0.136s, episode steps:  11, steps per second:  81, episode reward:  1.000, mean reward:  0.091 [ 0.000,  1.000], mean action: 1.364 [0.000, 3.000],  loss: 0.003109, mae: 0.088890, mean_q: 0.119804
 11810/50000: episode: 1557, duration: 0.060s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001199, mae: 0.093233, mean_q: 0.127016
 11820/50000: episode: 1558, duration: 0.121s, episode s

 12028/50000: episode: 1590, duration: 0.101s, episode steps:   8, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.002014, mae: 0.095524, mean_q: 0.130719
 12034/50000: episode: 1591, duration: 0.078s, episode steps:   6, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.167 [1.000, 3.000],  loss: 0.001241, mae: 0.094917, mean_q: 0.130680
 12036/50000: episode: 1592, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001249, mae: 0.091021, mean_q: 0.122852
 12049/50000: episode: 1593, duration: 0.157s, episode steps:  13, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.385 [0.000, 3.000],  loss: 0.003533, mae: 0.097207, mean_q: 0.131731
 12054/50000: episode: 1594, duration: 0.066s, episode s

 12248/50000: episode: 1625, duration: 0.112s, episode steps:   9, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.003087, mae: 0.097964, mean_q: 0.138549
 12256/50000: episode: 1626, duration: 0.104s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.002228, mae: 0.092717, mean_q: 0.135174
 12259/50000: episode: 1627, duration: 0.045s, episode steps:   3, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.004311, mae: 0.103099, mean_q: 0.142466
 12263/50000: episode: 1628, duration: 0.055s, episode steps:   4, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001663, mae: 0.104766, mean_q: 0.143976
 12271/50000: episode: 1629, duration: 0.104s, episode s

 12483/50000: episode: 1662, duration: 0.102s, episode steps:   8, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.005064, mae: 0.114830, mean_q: 0.151680
 12487/50000: episode: 1663, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003792, mae: 0.110501, mean_q: 0.147872
 12490/50000: episode: 1664, duration: 0.044s, episode steps:   3, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.001640, mae: 0.093675, mean_q: 0.127314
 12509/50000: episode: 1665, duration: 0.238s, episode steps:  19, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.263 [0.000, 3.000],  loss: 0.003448, mae: 0.109645, mean_q: 0.146068
 12511/50000: episode: 1666, duration: 0.033s, episode s

 12793/50000: episode: 1697, duration: 0.242s, episode steps:  21, steps per second:  87, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.381 [0.000, 3.000],  loss: 0.003944, mae: 0.113487, mean_q: 0.152705
 12795/50000: episode: 1698, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.002045, mae: 0.114010, mean_q: 0.159554
 12799/50000: episode: 1699, duration: 0.055s, episode steps:   4, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001628, mae: 0.114328, mean_q: 0.155150
 12805/50000: episode: 1700, duration: 0.082s, episode steps:   6, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003752, mae: 0.116738, mean_q: 0.155118
 12811/50000: episode: 1701, duration: 0.086s, episode s

 13042/50000: episode: 1732, duration: 0.068s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.003615, mae: 0.104775, mean_q: 0.143635
 13045/50000: episode: 1733, duration: 0.046s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 2.000],  loss: 0.002711, mae: 0.099673, mean_q: 0.136953
 13062/50000: episode: 1734, duration: 0.208s, episode steps:  17, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.471 [0.000, 3.000],  loss: 0.003659, mae: 0.111823, mean_q: 0.153151
 13070/50000: episode: 1735, duration: 0.101s, episode steps:   8, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.875 [0.000, 3.000],  loss: 0.002419, mae: 0.106430, mean_q: 0.149412
 13077/50000: episode: 1736, duration: 0.089s, episode s

 13390/50000: episode: 1768, duration: 0.131s, episode steps:  11, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.545 [0.000, 3.000],  loss: 0.002298, mae: 0.105259, mean_q: 0.144095
 13395/50000: episode: 1769, duration: 0.066s, episode steps:   5, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003251, mae: 0.109456, mean_q: 0.151342
 13402/50000: episode: 1770, duration: 0.088s, episode steps:   7, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.571 [0.000, 3.000],  loss: 0.002860, mae: 0.104556, mean_q: 0.145960
 13408/50000: episode: 1771, duration: 0.082s, episode steps:   6, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 2.000],  loss: 0.003317, mae: 0.113171, mean_q: 0.158087
 13414/50000: episode: 1772, duration: 0.077s, episode s

 13667/50000: episode: 1803, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 2.000],  loss: 0.003634, mae: 0.118414, mean_q: 0.166809
 13669/50000: episode: 1804, duration: 0.033s, episode steps:   2, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.000963, mae: 0.097152, mean_q: 0.135953
 13684/50000: episode: 1805, duration: 0.184s, episode steps:  15, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.467 [0.000, 3.000],  loss: 0.003422, mae: 0.108574, mean_q: 0.151295
 13700/50000: episode: 1806, duration: 0.190s, episode steps:  16, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003231, mae: 0.113445, mean_q: 0.158973
 13711/50000: episode: 1807, duration: 0.133s, episode s

 13946/50000: episode: 1839, duration: 0.114s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.111 [1.000, 3.000],  loss: 0.002912, mae: 0.103352, mean_q: 0.143142
 13949/50000: episode: 1840, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002444, mae: 0.114702, mean_q: 0.156607
 13953/50000: episode: 1841, duration: 0.056s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001924, mae: 0.118761, mean_q: 0.166706
 13960/50000: episode: 1842, duration: 0.091s, episode steps:   7, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.002045, mae: 0.110036, mean_q: 0.154236
 13967/50000: episode: 1843, duration: 0.092s, episode s

 14222/50000: episode: 1876, duration: 0.098s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.003501, mae: 0.110643, mean_q: 0.148640
 14228/50000: episode: 1877, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.001972, mae: 0.107036, mean_q: 0.150255
 14236/50000: episode: 1878, duration: 0.098s, episode steps:   8, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.003893, mae: 0.110620, mean_q: 0.153230
 14243/50000: episode: 1879, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.002088, mae: 0.107727, mean_q: 0.156726
 14249/50000: episode: 1880, duration: 0.081s, episode s

 14496/50000: episode: 1913, duration: 0.082s, episode steps:   6, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.002074, mae: 0.101093, mean_q: 0.141243
 14503/50000: episode: 1914, duration: 0.087s, episode steps:   7, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [1.000, 3.000],  loss: 0.002985, mae: 0.095885, mean_q: 0.135646
 14506/50000: episode: 1915, duration: 0.044s, episode steps:   3, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.001878, mae: 0.098535, mean_q: 0.137177
 14516/50000: episode: 1916, duration: 0.122s, episode steps:  10, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.700 [0.000, 3.000],  loss: 0.002335, mae: 0.101469, mean_q: 0.139845
 14518/50000: episode: 1917, duration: 0.032s, episode s

 14859/50000: episode: 1949, duration: 0.049s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001944, mae: 0.099895, mean_q: 0.137111
 14866/50000: episode: 1950, duration: 0.090s, episode steps:   7, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.002789, mae: 0.108818, mean_q: 0.151364
 14873/50000: episode: 1951, duration: 0.089s, episode steps:   7, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.001787, mae: 0.101959, mean_q: 0.144666
 14880/50000: episode: 1952, duration: 0.089s, episode steps:   7, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.002109, mae: 0.099803, mean_q: 0.142415
 14885/50000: episode: 1953, duration: 0.066s, episode s

 15135/50000: episode: 1984, duration: 0.104s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.002710, mae: 0.102123, mean_q: 0.140808
 15157/50000: episode: 1985, duration: 0.262s, episode steps:  22, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.273 [0.000, 3.000],  loss: 0.002755, mae: 0.095039, mean_q: 0.132903
 15162/50000: episode: 1986, duration: 0.069s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001379, mae: 0.100170, mean_q: 0.139913
 15171/50000: episode: 1987, duration: 0.117s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003308, mae: 0.109048, mean_q: 0.154515
 15177/50000: episode: 1988, duration: 0.079s, episode s

 15442/50000: episode: 2019, duration: 0.057s, episode steps:   4, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.001664, mae: 0.084742, mean_q: 0.118147
 15454/50000: episode: 2020, duration: 0.145s, episode steps:  12, steps per second:  83, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002526, mae: 0.093063, mean_q: 0.129306
 15457/50000: episode: 2021, duration: 0.044s, episode steps:   3, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.003307, mae: 0.092665, mean_q: 0.126960
 15479/50000: episode: 2022, duration: 0.258s, episode steps:  22, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.591 [0.000, 3.000],  loss: 0.002080, mae: 0.094090, mean_q: 0.130111
 15491/50000: episode: 2023, duration: 0.149s, episode s

 15718/50000: episode: 2054, duration: 0.144s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.002224, mae: 0.093389, mean_q: 0.129413
 15725/50000: episode: 2055, duration: 0.089s, episode steps:   7, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003532, mae: 0.097909, mean_q: 0.134654
 15727/50000: episode: 2056, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.004023, mae: 0.096667, mean_q: 0.131354
 15735/50000: episode: 2057, duration: 0.111s, episode steps:   8, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.002480, mae: 0.096337, mean_q: 0.134335
 15740/50000: episode: 2058, duration: 0.068s, episode s

 16020/50000: episode: 2089, duration: 0.051s, episode steps:   2, steps per second:  40, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.003057, mae: 0.105208, mean_q: 0.146825
 16040/50000: episode: 2090, duration: 0.255s, episode steps:  20, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001975, mae: 0.097776, mean_q: 0.132948
 16045/50000: episode: 2091, duration: 0.100s, episode steps:   5, steps per second:  50, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.001848, mae: 0.088655, mean_q: 0.119768
 16048/50000: episode: 2092, duration: 0.067s, episode steps:   3, steps per second:  45, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003158, mae: 0.093688, mean_q: 0.125230
 16052/50000: episode: 2093, duration: 0.068s, episode s

 16317/50000: episode: 2126, duration: 0.067s, episode steps:   4, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 2.000],  loss: 0.000920, mae: 0.089175, mean_q: 0.124355
 16322/50000: episode: 2127, duration: 0.073s, episode steps:   5, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002342, mae: 0.094043, mean_q: 0.127227
 16325/50000: episode: 2128, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: 0.002760, mae: 0.095841, mean_q: 0.127051
 16328/50000: episode: 2129, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.002929, mae: 0.106981, mean_q: 0.144637
 16331/50000: episode: 2130, duration: 0.050s, episode s

 16582/50000: episode: 2161, duration: 0.064s, episode steps:   4, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.250 [1.000, 3.000],  loss: 0.002914, mae: 0.100817, mean_q: 0.135178
 16591/50000: episode: 2162, duration: 0.116s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.003048, mae: 0.100382, mean_q: 0.137091
 16601/50000: episode: 2163, duration: 0.136s, episode steps:  10, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.002419, mae: 0.092473, mean_q: 0.129549
 16608/50000: episode: 2164, duration: 0.102s, episode steps:   7, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.143 [0.000, 3.000],  loss: 0.003989, mae: 0.108596, mean_q: 0.151099
 16615/50000: episode: 2165, duration: 0.108s, episode s

 16856/50000: episode: 2197, duration: 0.064s, episode steps:   4, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 2.000],  loss: 0.003467, mae: 0.105738, mean_q: 0.140800
 16867/50000: episode: 2198, duration: 0.134s, episode steps:  11, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.364 [0.000, 3.000],  loss: 0.002401, mae: 0.100471, mean_q: 0.134186
 16870/50000: episode: 2199, duration: 0.045s, episode steps:   3, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001620, mae: 0.099657, mean_q: 0.135942
 16882/50000: episode: 2200, duration: 0.145s, episode steps:  12, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002430, mae: 0.097060, mean_q: 0.132384
 16886/50000: episode: 2201, duration: 0.056s, episode s

 17142/50000: episode: 2233, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.001930, mae: 0.098779, mean_q: 0.139093
 17147/50000: episode: 2234, duration: 0.074s, episode steps:   5, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.002400, mae: 0.092578, mean_q: 0.130068
 17150/50000: episode: 2235, duration: 0.046s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001558, mae: 0.102484, mean_q: 0.142753
 17158/50000: episode: 2236, duration: 0.104s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.001320, mae: 0.084447, mean_q: 0.116662
 17161/50000: episode: 2237, duration: 0.045s, episode s

 17393/50000: episode: 2270, duration: 0.146s, episode steps:  12, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.001879, mae: 0.089617, mean_q: 0.120246
 17396/50000: episode: 2271, duration: 0.045s, episode steps:   3, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002733, mae: 0.088004, mean_q: 0.117677
 17401/50000: episode: 2272, duration: 0.066s, episode steps:   5, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [1.000, 3.000],  loss: 0.000950, mae: 0.084904, mean_q: 0.114891
 17403/50000: episode: 2273, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.006084, mae: 0.094393, mean_q: 0.120777
 17417/50000: episode: 2274, duration: 0.167s, episode s

 17707/50000: episode: 2307, duration: 0.223s, episode steps:  17, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.118 [0.000, 3.000],  loss: 0.002594, mae: 0.088284, mean_q: 0.125716
 17719/50000: episode: 2308, duration: 0.154s, episode steps:  12, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.583 [0.000, 3.000],  loss: 0.001906, mae: 0.087953, mean_q: 0.125259
 17723/50000: episode: 2309, duration: 0.065s, episode steps:   4, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.001635, mae: 0.087230, mean_q: 0.125587
 17726/50000: episode: 2310, duration: 0.049s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.001541, mae: 0.087361, mean_q: 0.122089
 17743/50000: episode: 2311, duration: 0.227s, episode s

 18016/50000: episode: 2343, duration: 0.073s, episode steps:   5, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 2.000],  loss: 0.002264, mae: 0.089074, mean_q: 0.123269
 18020/50000: episode: 2344, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001489, mae: 0.092265, mean_q: 0.128969
 18032/50000: episode: 2345, duration: 0.157s, episode steps:  12, steps per second:  76, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002473, mae: 0.088351, mean_q: 0.122207
 18040/50000: episode: 2346, duration: 0.140s, episode steps:   8, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.001329, mae: 0.083244, mean_q: 0.117549
 18043/50000: episode: 2347, duration: 0.052s, episode s

 18297/50000: episode: 2378, duration: 0.057s, episode steps:   4, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002983, mae: 0.102595, mean_q: 0.140615
 18300/50000: episode: 2379, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.002607, mae: 0.094018, mean_q: 0.126454
 18312/50000: episode: 2380, duration: 0.144s, episode steps:  12, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002936, mae: 0.097835, mean_q: 0.135083
 18314/50000: episode: 2381, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002486, mae: 0.086897, mean_q: 0.117417
 18316/50000: episode: 2382, duration: 0.033s, episode s

 18574/50000: episode: 2413, duration: 0.124s, episode steps:  10, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.900 [0.000, 3.000],  loss: 0.003061, mae: 0.098220, mean_q: 0.136263
 18584/50000: episode: 2414, duration: 0.141s, episode steps:  10, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.900 [0.000, 3.000],  loss: 0.002856, mae: 0.087721, mean_q: 0.123614
 18589/50000: episode: 2415, duration: 0.079s, episode steps:   5, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.002048, mae: 0.087141, mean_q: 0.121054
 18591/50000: episode: 2416, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001674, mae: 0.088886, mean_q: 0.127666
 18595/50000: episode: 2417, duration: 0.056s, episode s

 18824/50000: episode: 2449, duration: 0.068s, episode steps:   5, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.001153, mae: 0.089861, mean_q: 0.123771
 18830/50000: episode: 2450, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002833, mae: 0.091349, mean_q: 0.122240
 18832/50000: episode: 2451, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003218, mae: 0.109680, mean_q: 0.153876
 18844/50000: episode: 2452, duration: 0.145s, episode steps:  12, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001972, mae: 0.093297, mean_q: 0.126970
 18863/50000: episode: 2453, duration: 0.234s, episode s

 19109/50000: episode: 2485, duration: 0.050s, episode steps:   3, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.001545, mae: 0.099357, mean_q: 0.136870
 19112/50000: episode: 2486, duration: 0.048s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.002459, mae: 0.095203, mean_q: 0.128763
 19134/50000: episode: 2487, duration: 0.298s, episode steps:  22, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.455 [0.000, 3.000],  loss: 0.002149, mae: 0.089055, mean_q: 0.123562
 19138/50000: episode: 2488, duration: 0.067s, episode steps:   4, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002324, mae: 0.098199, mean_q: 0.134397
 19145/50000: episode: 2489, duration: 0.106s, episode s

 19315/50000: episode: 2520, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002100, mae: 0.091832, mean_q: 0.127637
 19322/50000: episode: 2521, duration: 0.097s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.002153, mae: 0.090884, mean_q: 0.128101
 19329/50000: episode: 2522, duration: 0.099s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.143 [1.000, 3.000],  loss: 0.003120, mae: 0.097398, mean_q: 0.137870
 19341/50000: episode: 2523, duration: 0.156s, episode steps:  12, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.001855, mae: 0.091610, mean_q: 0.129696
 19389/50000: episode: 2524, duration: 0.586s, episode s

 19640/50000: episode: 2556, duration: 0.167s, episode steps:  12, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.001883, mae: 0.093692, mean_q: 0.128315
 19647/50000: episode: 2557, duration: 0.091s, episode steps:   7, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.714 [0.000, 3.000],  loss: 0.003289, mae: 0.097407, mean_q: 0.133006
 19655/50000: episode: 2558, duration: 0.109s, episode steps:   8, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.002857, mae: 0.090761, mean_q: 0.124886
 19658/50000: episode: 2559, duration: 0.049s, episode steps:   3, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003363, mae: 0.094085, mean_q: 0.128018
 19662/50000: episode: 2560, duration: 0.060s, episode s

 19862/50000: episode: 2591, duration: 0.116s, episode steps:   9, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.001730, mae: 0.087549, mean_q: 0.120494
 19865/50000: episode: 2592, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002238, mae: 0.094566, mean_q: 0.132627
 19877/50000: episode: 2593, duration: 0.161s, episode steps:  12, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.833 [0.000, 3.000],  loss: 0.002062, mae: 0.084298, mean_q: 0.115796
 19886/50000: episode: 2594, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.002075, mae: 0.090886, mean_q: 0.123204
 19889/50000: episode: 2595, duration: 0.048s, episode s

 20158/50000: episode: 2626, duration: 0.353s, episode steps:  27, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.778 [0.000, 3.000],  loss: 0.002186, mae: 0.092207, mean_q: 0.127948
 20161/50000: episode: 2627, duration: 0.046s, episode steps:   3, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.002861, mae: 0.103487, mean_q: 0.142582
 20177/50000: episode: 2628, duration: 0.193s, episode steps:  16, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.812 [0.000, 3.000],  loss: 0.003552, mae: 0.092883, mean_q: 0.127924
 20181/50000: episode: 2629, duration: 0.055s, episode steps:   4, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 3.000],  loss: 0.001797, mae: 0.091043, mean_q: 0.125289
 20183/50000: episode: 2630, duration: 0.034s, episode s

 20361/50000: episode: 2662, duration: 0.102s, episode steps:   8, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.002099, mae: 0.086318, mean_q: 0.117585
 20366/50000: episode: 2663, duration: 0.067s, episode steps:   5, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.001631, mae: 0.085054, mean_q: 0.118706
 20371/50000: episode: 2664, duration: 0.080s, episode steps:   5, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.003247, mae: 0.101470, mean_q: 0.139051
 20383/50000: episode: 2665, duration: 0.180s, episode steps:  12, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002834, mae: 0.099840, mean_q: 0.139783
 20397/50000: episode: 2666, duration: 0.167s, episode s

 20648/50000: episode: 2697, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.889 [0.000, 3.000],  loss: 0.001872, mae: 0.097215, mean_q: 0.135451
 20661/50000: episode: 2698, duration: 0.157s, episode steps:  13, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.154 [0.000, 3.000],  loss: 0.002369, mae: 0.097496, mean_q: 0.136297
 20675/50000: episode: 2699, duration: 0.173s, episode steps:  14, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.002671, mae: 0.098692, mean_q: 0.136079
 20689/50000: episode: 2700, duration: 0.171s, episode steps:  14, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.002865, mae: 0.101474, mean_q: 0.139980
 20697/50000: episode: 2701, duration: 0.105s, episode s

 21005/50000: episode: 2734, duration: 0.084s, episode steps:   6, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.001484, mae: 0.090240, mean_q: 0.124438
 21014/50000: episode: 2735, duration: 0.113s, episode steps:   9, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.001561, mae: 0.087910, mean_q: 0.118920
 21022/50000: episode: 2736, duration: 0.101s, episode steps:   8, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.001299, mae: 0.083855, mean_q: 0.113329
 21027/50000: episode: 2737, duration: 0.068s, episode steps:   5, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [1.000, 2.000],  loss: 0.003194, mae: 0.090176, mean_q: 0.123796
 21041/50000: episode: 2738, duration: 0.174s, episode s

 21309/50000: episode: 2769, duration: 0.204s, episode steps:  16, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.002127, mae: 0.088692, mean_q: 0.119561
 21312/50000: episode: 2770, duration: 0.088s, episode steps:   3, steps per second:  34, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [1.000, 2.000],  loss: 0.000911, mae: 0.080846, mean_q: 0.108630
 21316/50000: episode: 2771, duration: 0.084s, episode steps:   4, steps per second:  48, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 2.000],  loss: 0.001598, mae: 0.076661, mean_q: 0.103923
 21327/50000: episode: 2772, duration: 0.180s, episode steps:  11, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.001528, mae: 0.077855, mean_q: 0.107063
 21334/50000: episode: 2773, duration: 0.105s, episode s

 21621/50000: episode: 2804, duration: 0.221s, episode steps:  16, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.002295, mae: 0.084587, mean_q: 0.116212
 21623/50000: episode: 2805, duration: 0.038s, episode steps:   2, steps per second:  53, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.003531, mae: 0.094146, mean_q: 0.128004
 21634/50000: episode: 2806, duration: 0.157s, episode steps:  11, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.001234, mae: 0.081130, mean_q: 0.110574
 21637/50000: episode: 2807, duration: 0.053s, episode steps:   3, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001969, mae: 0.081835, mean_q: 0.111909
 21643/50000: episode: 2808, duration: 0.094s, episode s

 21850/50000: episode: 2839, duration: 0.076s, episode steps:   5, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [1.000, 3.000],  loss: 0.000937, mae: 0.074766, mean_q: 0.104051
 21855/50000: episode: 2840, duration: 0.080s, episode steps:   5, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.001169, mae: 0.080100, mean_q: 0.112315
 21859/50000: episode: 2841, duration: 0.074s, episode steps:   4, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.003096, mae: 0.076234, mean_q: 0.103892
 21864/50000: episode: 2842, duration: 0.079s, episode steps:   5, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [1.000, 3.000],  loss: 0.001001, mae: 0.078592, mean_q: 0.106459
 21873/50000: episode: 2843, duration: 0.131s, episode s

 22114/50000: episode: 2877, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.400 [0.000, 1.000],  loss: 0.001158, mae: 0.075335, mean_q: 0.101797
 22122/50000: episode: 2878, duration: 0.113s, episode steps:   8, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.875 [0.000, 3.000],  loss: 0.001376, mae: 0.077771, mean_q: 0.106361
 22124/50000: episode: 2879, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.000754, mae: 0.073036, mean_q: 0.099900
 22132/50000: episode: 2880, duration: 0.109s, episode steps:   8, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002027, mae: 0.076588, mean_q: 0.105805
 22135/50000: episode: 2881, duration: 0.046s, episode s

 22373/50000: episode: 2912, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003362, mae: 0.087365, mean_q: 0.121146
 22384/50000: episode: 2913, duration: 0.139s, episode steps:  11, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.909 [0.000, 3.000],  loss: 0.001568, mae: 0.083827, mean_q: 0.115782
 22408/50000: episode: 2914, duration: 0.294s, episode steps:  24, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.708 [0.000, 3.000],  loss: 0.001742, mae: 0.078365, mean_q: 0.107840
 22410/50000: episode: 2915, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.001245, mae: 0.078295, mean_q: 0.107977
 22416/50000: episode: 2916, duration: 0.090s, episode s

 22626/50000: episode: 2949, duration: 0.165s, episode steps:  12, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002559, mae: 0.092400, mean_q: 0.126219
 22632/50000: episode: 2950, duration: 0.109s, episode steps:   6, steps per second:  55, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.002631, mae: 0.091635, mean_q: 0.121384
 22638/50000: episode: 2951, duration: 0.143s, episode steps:   6, steps per second:  42, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.003622, mae: 0.080156, mean_q: 0.103370
 22644/50000: episode: 2952, duration: 0.124s, episode steps:   6, steps per second:  49, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002200, mae: 0.077999, mean_q: 0.105100
 22650/50000: episode: 2953, duration: 0.100s, episode s

 22894/50000: episode: 2986, duration: 0.102s, episode steps:   5, steps per second:  49, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002943, mae: 0.091285, mean_q: 0.124268
 22896/50000: episode: 2987, duration: 0.050s, episode steps:   2, steps per second:  40, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001793, mae: 0.088063, mean_q: 0.122705
 22900/50000: episode: 2988, duration: 0.077s, episode steps:   4, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002280, mae: 0.082980, mean_q: 0.115725
 22902/50000: episode: 2989, duration: 0.042s, episode steps:   2, steps per second:  48, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001241, mae: 0.080056, mean_q: 0.112477
 22904/50000: episode: 2990, duration: 0.061s, episode s

 23168/50000: episode: 3023, duration: 0.088s, episode steps:   5, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.001724, mae: 0.087194, mean_q: 0.122859
 23177/50000: episode: 3024, duration: 0.141s, episode steps:   9, steps per second:  64, episode reward:  1.000, mean reward:  0.111 [ 0.000,  1.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001909, mae: 0.082612, mean_q: 0.116166
 23182/50000: episode: 3025, duration: 0.097s, episode steps:   5, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.600 [2.000, 3.000],  loss: 0.001423, mae: 0.090499, mean_q: 0.125821
 23184/50000: episode: 3026, duration: 0.041s, episode steps:   2, steps per second:  48, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.001843, mae: 0.089846, mean_q: 0.124907
 23190/50000: episode: 3027, duration: 0.100s, episode s

 23472/50000: episode: 3060, duration: 0.119s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [0.000, 3.000],  loss: 0.002180, mae: 0.089781, mean_q: 0.125009
 23485/50000: episode: 3061, duration: 0.181s, episode steps:  13, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.538 [0.000, 3.000],  loss: 0.003204, mae: 0.097954, mean_q: 0.136415
 23492/50000: episode: 3062, duration: 0.098s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 2.000],  loss: 0.002137, mae: 0.097584, mean_q: 0.136264
 23496/50000: episode: 3063, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.250 [1.000, 3.000],  loss: 0.001244, mae: 0.092987, mean_q: 0.131053
 23505/50000: episode: 3064, duration: 0.129s, episode s

 23778/50000: episode: 3097, duration: 0.131s, episode steps:   8, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.002176, mae: 0.094792, mean_q: 0.128158
 23786/50000: episode: 3098, duration: 0.112s, episode steps:   8, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.002771, mae: 0.099088, mean_q: 0.136349
 23796/50000: episode: 3099, duration: 0.136s, episode steps:  10, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001789, mae: 0.095304, mean_q: 0.131861
 23800/50000: episode: 3100, duration: 0.070s, episode steps:   4, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.002422, mae: 0.102048, mean_q: 0.138956
 23808/50000: episode: 3101, duration: 0.122s, episode s

 24055/50000: episode: 3133, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 2.000],  loss: 0.001754, mae: 0.100276, mean_q: 0.138881
 24059/50000: episode: 3134, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.002411, mae: 0.101719, mean_q: 0.140236
 24062/50000: episode: 3135, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.001640, mae: 0.098260, mean_q: 0.137826
 24079/50000: episode: 3136, duration: 0.219s, episode steps:  17, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.176 [0.000, 3.000],  loss: 0.002249, mae: 0.098520, mean_q: 0.138132
 24087/50000: episode: 3137, duration: 0.104s, episode s

 24329/50000: episode: 3169, duration: 0.100s, episode steps:   8, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002031, mae: 0.093857, mean_q: 0.132876
 24336/50000: episode: 3170, duration: 0.089s, episode steps:   7, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002468, mae: 0.102324, mean_q: 0.146477
 24347/50000: episode: 3171, duration: 0.137s, episode steps:  11, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.818 [0.000, 2.000],  loss: 0.001603, mae: 0.108412, mean_q: 0.149639
 24363/50000: episode: 3172, duration: 0.193s, episode steps:  16, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.002190, mae: 0.099818, mean_q: 0.138201
 24369/50000: episode: 3173, duration: 0.078s, episode s

 24649/50000: episode: 3205, duration: 0.125s, episode steps:   9, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.222 [1.000, 3.000],  loss: 0.001926, mae: 0.103736, mean_q: 0.146024
 24673/50000: episode: 3206, duration: 0.382s, episode steps:  24, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.458 [0.000, 3.000],  loss: 0.001936, mae: 0.100922, mean_q: 0.138065
 24679/50000: episode: 3207, duration: 0.088s, episode steps:   6, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.002975, mae: 0.103911, mean_q: 0.139623
 24686/50000: episode: 3208, duration: 0.091s, episode steps:   7, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 3.000],  loss: 0.001574, mae: 0.096433, mean_q: 0.132629
 24688/50000: episode: 3209, duration: 0.036s, episode s

 24880/50000: episode: 3243, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.000973, mae: 0.090430, mean_q: 0.129233
 24889/50000: episode: 3244, duration: 0.114s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.001847, mae: 0.095195, mean_q: 0.133023
 24895/50000: episode: 3245, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001729, mae: 0.095278, mean_q: 0.131515
 24899/50000: episode: 3246, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [1.000, 2.000],  loss: 0.001858, mae: 0.108438, mean_q: 0.148166
 24906/50000: episode: 3247, duration: 0.096s, episode s

 25144/50000: episode: 3279, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.001646, mae: 0.101527, mean_q: 0.142969
 25163/50000: episode: 3280, duration: 0.252s, episode steps:  19, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.895 [0.000, 3.000],  loss: 0.001825, mae: 0.099809, mean_q: 0.140918
 25183/50000: episode: 3281, duration: 0.249s, episode steps:  20, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001693, mae: 0.099848, mean_q: 0.140454
 25187/50000: episode: 3282, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 2.000],  loss: 0.002741, mae: 0.102431, mean_q: 0.142617
 25195/50000: episode: 3283, duration: 0.101s, episode s

 25462/50000: episode: 3314, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003675, mae: 0.109454, mean_q: 0.154244
 25483/50000: episode: 3315, duration: 0.293s, episode steps:  21, steps per second:  72, episode reward:  1.000, mean reward:  0.048 [ 0.000,  1.000], mean action: 2.048 [0.000, 3.000],  loss: 0.002790, mae: 0.113054, mean_q: 0.156237
 25488/50000: episode: 3316, duration: 0.069s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.001591, mae: 0.106475, mean_q: 0.152568
 25497/50000: episode: 3317, duration: 0.112s, episode steps:   9, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003219, mae: 0.104442, mean_q: 0.145793
 25505/50000: episode: 3318, duration: 0.102s, episode s

 25679/50000: episode: 3349, duration: 0.141s, episode steps:   7, steps per second:  50, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.714 [0.000, 3.000],  loss: 0.002257, mae: 0.105358, mean_q: 0.145992
 25686/50000: episode: 3350, duration: 0.103s, episode steps:   7, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.001692, mae: 0.101683, mean_q: 0.144738
 25691/50000: episode: 3351, duration: 0.068s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.400 [0.000, 1.000],  loss: 0.002934, mae: 0.105335, mean_q: 0.150631
 25703/50000: episode: 3352, duration: 0.161s, episode steps:  12, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002485, mae: 0.112146, mean_q: 0.157369
 25715/50000: episode: 3353, duration: 0.164s, episode s

 25936/50000: episode: 3384, duration: 0.154s, episode steps:  12, steps per second:  78, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001481, mae: 0.097866, mean_q: 0.137598
 25940/50000: episode: 3385, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.003036, mae: 0.104864, mean_q: 0.144198
 25951/50000: episode: 3386, duration: 0.154s, episode steps:  11, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.273 [0.000, 3.000],  loss: 0.001805, mae: 0.104645, mean_q: 0.145344
 25958/50000: episode: 3387, duration: 0.094s, episode steps:   7, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.001115, mae: 0.099336, mean_q: 0.141184
 25964/50000: episode: 3388, duration: 0.090s, episode s

 26221/50000: episode: 3419, duration: 0.271s, episode steps:  19, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.842 [0.000, 3.000],  loss: 0.001882, mae: 0.103408, mean_q: 0.143891
 26228/50000: episode: 3420, duration: 0.110s, episode steps:   7, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.714 [0.000, 2.000],  loss: 0.002231, mae: 0.110183, mean_q: 0.153065
 26243/50000: episode: 3421, duration: 0.221s, episode steps:  15, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002088, mae: 0.101368, mean_q: 0.140416
 26245/50000: episode: 3422, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.001429, mae: 0.105970, mean_q: 0.148941
 26248/50000: episode: 3423, duration: 0.049s, episode s

 26498/50000: episode: 3455, duration: 0.241s, episode steps:  20, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.950 [0.000, 3.000],  loss: 0.001711, mae: 0.102193, mean_q: 0.142437
 26504/50000: episode: 3456, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.001702, mae: 0.098697, mean_q: 0.139128
 26510/50000: episode: 3457, duration: 0.079s, episode steps:   6, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002419, mae: 0.109931, mean_q: 0.154944
 26513/50000: episode: 3458, duration: 0.050s, episode steps:   3, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.001498, mae: 0.107604, mean_q: 0.151652
 26519/50000: episode: 3459, duration: 0.080s, episode s

 26762/50000: episode: 3491, duration: 0.224s, episode steps:  17, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.529 [0.000, 3.000],  loss: 0.002441, mae: 0.112567, mean_q: 0.155950
 26766/50000: episode: 3492, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003088, mae: 0.111555, mean_q: 0.153602
 26768/50000: episode: 3493, duration: 0.034s, episode steps:   2, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.001226, mae: 0.099550, mean_q: 0.140101
 26772/50000: episode: 3494, duration: 0.057s, episode steps:   4, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002454, mae: 0.099745, mean_q: 0.141254
 26776/50000: episode: 3495, duration: 0.057s, episode s

 26982/50000: episode: 3529, duration: 0.160s, episode steps:  13, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.077 [0.000, 3.000],  loss: 0.001922, mae: 0.105462, mean_q: 0.152418
 26985/50000: episode: 3530, duration: 0.046s, episode steps:   3, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 2.000],  loss: 0.002382, mae: 0.105427, mean_q: 0.148172
 27002/50000: episode: 3531, duration: 0.217s, episode steps:  17, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002100, mae: 0.117925, mean_q: 0.164289
 27004/50000: episode: 3532, duration: 0.043s, episode steps:   2, steps per second:  46, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002747, mae: 0.115493, mean_q: 0.157661
 27015/50000: episode: 3533, duration: 0.178s, episode s

 27255/50000: episode: 3564, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002582, mae: 0.116358, mean_q: 0.161777
 27259/50000: episode: 3565, duration: 0.057s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001445, mae: 0.112679, mean_q: 0.156486
 27276/50000: episode: 3566, duration: 0.225s, episode steps:  17, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.176 [0.000, 3.000],  loss: 0.001955, mae: 0.113012, mean_q: 0.156278
 27283/50000: episode: 3567, duration: 0.091s, episode steps:   7, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.002371, mae: 0.113897, mean_q: 0.158920
 27286/50000: episode: 3568, duration: 0.045s, episode s

 27506/50000: episode: 3599, duration: 0.150s, episode steps:  12, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.002156, mae: 0.109349, mean_q: 0.154409
 27517/50000: episode: 3600, duration: 0.136s, episode steps:  11, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.091 [0.000, 3.000],  loss: 0.002896, mae: 0.118572, mean_q: 0.165330
 27520/50000: episode: 3601, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.003008, mae: 0.116323, mean_q: 0.162128
 27524/50000: episode: 3602, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 3.000],  loss: 0.002118, mae: 0.117472, mean_q: 0.162469
 27538/50000: episode: 3603, duration: 0.181s, episode s

 27759/50000: episode: 3635, duration: 0.117s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002026, mae: 0.122311, mean_q: 0.172482
 27761/50000: episode: 3636, duration: 0.035s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002696, mae: 0.113624, mean_q: 0.162081
 27771/50000: episode: 3637, duration: 0.127s, episode steps:  10, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002237, mae: 0.114135, mean_q: 0.162197
 27780/50000: episode: 3638, duration: 0.115s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.778 [1.000, 3.000],  loss: 0.001898, mae: 0.114429, mean_q: 0.164195
 27790/50000: episode: 3639, duration: 0.124s, episode s

 28046/50000: episode: 3670, duration: 0.070s, episode steps:   4, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002061, mae: 0.116015, mean_q: 0.165268
 28049/50000: episode: 3671, duration: 0.050s, episode steps:   3, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.002371, mae: 0.119030, mean_q: 0.171899
 28054/50000: episode: 3672, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.200 [1.000, 3.000],  loss: 0.003114, mae: 0.126810, mean_q: 0.176671
 28060/50000: episode: 3673, duration: 0.087s, episode steps:   6, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002589, mae: 0.116730, mean_q: 0.167119
 28071/50000: episode: 3674, duration: 0.158s, episode s

 28323/50000: episode: 3706, duration: 0.108s, episode steps:   7, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002072, mae: 0.121326, mean_q: 0.173200
 28334/50000: episode: 3707, duration: 0.162s, episode steps:  11, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.091 [0.000, 3.000],  loss: 0.002285, mae: 0.126996, mean_q: 0.177923
 28344/50000: episode: 3708, duration: 0.129s, episode steps:  10, steps per second:  77, episode reward:  1.000, mean reward:  0.100 [ 0.000,  1.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002496, mae: 0.122691, mean_q: 0.171466
 28346/50000: episode: 3709, duration: 0.070s, episode steps:   2, steps per second:  29, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.001131, mae: 0.120810, mean_q: 0.171316
 28349/50000: episode: 3710, duration: 0.052s, episode s

 28623/50000: episode: 3742, duration: 0.232s, episode steps:  17, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.765 [0.000, 3.000],  loss: 0.002402, mae: 0.120866, mean_q: 0.170341
 28634/50000: episode: 3743, duration: 0.136s, episode steps:  11, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.091 [0.000, 3.000],  loss: 0.002283, mae: 0.125575, mean_q: 0.176902
 28638/50000: episode: 3744, duration: 0.057s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.001796, mae: 0.121821, mean_q: 0.170670
 28644/50000: episode: 3745, duration: 0.086s, episode steps:   6, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 2.000],  loss: 0.002830, mae: 0.119167, mean_q: 0.166913
 28648/50000: episode: 3746, duration: 0.058s, episode s

 28881/50000: episode: 3778, duration: 0.190s, episode steps:  15, steps per second:  79, episode reward:  1.000, mean reward:  0.067 [ 0.000,  1.000], mean action: 1.733 [0.000, 3.000],  loss: 0.003109, mae: 0.125418, mean_q: 0.178159
 28893/50000: episode: 3779, duration: 0.146s, episode steps:  12, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.002689, mae: 0.123247, mean_q: 0.180030
 28904/50000: episode: 3780, duration: 0.142s, episode steps:  11, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.909 [0.000, 3.000],  loss: 0.002830, mae: 0.129894, mean_q: 0.184668
 28909/50000: episode: 3781, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.003013, mae: 0.129188, mean_q: 0.181669
 28917/50000: episode: 3782, duration: 0.103s, episode s

 29122/50000: episode: 3813, duration: 0.121s, episode steps:   9, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [1.000, 3.000],  loss: 0.001849, mae: 0.119761, mean_q: 0.171918
 29126/50000: episode: 3814, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [1.000, 3.000],  loss: 0.001610, mae: 0.121294, mean_q: 0.171547
 29128/50000: episode: 3815, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.005428, mae: 0.125476, mean_q: 0.173505
 29137/50000: episode: 3816, duration: 0.115s, episode steps:   9, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002639, mae: 0.125860, mean_q: 0.176856
 29139/50000: episode: 3817, duration: 0.038s, episode s

 29420/50000: episode: 3848, duration: 0.136s, episode steps:  10, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003351, mae: 0.136836, mean_q: 0.192622
 29447/50000: episode: 3849, duration: 0.409s, episode steps:  27, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.185 [0.000, 3.000],  loss: 0.002810, mae: 0.133370, mean_q: 0.189552
 29453/50000: episode: 3850, duration: 0.084s, episode steps:   6, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002638, mae: 0.137703, mean_q: 0.190856
 29457/50000: episode: 3851, duration: 0.056s, episode steps:   4, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003079, mae: 0.136549, mean_q: 0.187442
 29461/50000: episode: 3852, duration: 0.057s, episode s

 29665/50000: episode: 3883, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002024, mae: 0.137394, mean_q: 0.194412
 29672/50000: episode: 3884, duration: 0.098s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002766, mae: 0.135214, mean_q: 0.188420
 29675/50000: episode: 3885, duration: 0.046s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002468, mae: 0.133352, mean_q: 0.183508
 29685/50000: episode: 3886, duration: 0.135s, episode steps:  10, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002840, mae: 0.127013, mean_q: 0.180076
 29698/50000: episode: 3887, duration: 0.183s, episode s

 29943/50000: episode: 3919, duration: 0.155s, episode steps:  12, steps per second:  77, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 1.167 [0.000, 3.000],  loss: 0.002144, mae: 0.122496, mean_q: 0.175142
 29952/50000: episode: 3920, duration: 0.113s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002764, mae: 0.131712, mean_q: 0.185893
 29963/50000: episode: 3921, duration: 0.140s, episode steps:  11, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.909 [0.000, 3.000],  loss: 0.002061, mae: 0.129063, mean_q: 0.183060
 29969/50000: episode: 3922, duration: 0.080s, episode steps:   6, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003348, mae: 0.132100, mean_q: 0.185713
 29972/50000: episode: 3923, duration: 0.046s, episode s

 30195/50000: episode: 3956, duration: 0.103s, episode steps:   7, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003050, mae: 0.128409, mean_q: 0.179695
 30201/50000: episode: 3957, duration: 0.088s, episode steps:   6, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 3.000],  loss: 0.002157, mae: 0.129442, mean_q: 0.180762
 30214/50000: episode: 3958, duration: 0.175s, episode steps:  13, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.692 [0.000, 3.000],  loss: 0.002602, mae: 0.129531, mean_q: 0.182170
 30220/50000: episode: 3959, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.001746, mae: 0.122014, mean_q: 0.175825
 30229/50000: episode: 3960, duration: 0.114s, episode s

 30468/50000: episode: 3992, duration: 0.052s, episode steps:   3, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.001479, mae: 0.128033, mean_q: 0.183119
 30473/50000: episode: 3993, duration: 0.068s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002450, mae: 0.137902, mean_q: 0.191688
 30480/50000: episode: 3994, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.143 [0.000, 3.000],  loss: 0.001620, mae: 0.136740, mean_q: 0.189684
 30489/50000: episode: 3995, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.111 [0.000, 3.000],  loss: 0.002219, mae: 0.134198, mean_q: 0.185918
 30500/50000: episode: 3996, duration: 0.135s, episode s

 30744/50000: episode: 4028, duration: 0.177s, episode steps:  12, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.417 [0.000, 3.000],  loss: 0.002292, mae: 0.129149, mean_q: 0.181421
 30748/50000: episode: 4029, duration: 0.080s, episode steps:   4, steps per second:  50, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002185, mae: 0.124971, mean_q: 0.176597
 30759/50000: episode: 4030, duration: 0.142s, episode steps:  11, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.727 [0.000, 3.000],  loss: 0.002717, mae: 0.128933, mean_q: 0.182823
 30767/50000: episode: 4031, duration: 0.105s, episode steps:   8, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002925, mae: 0.136344, mean_q: 0.189402
 30771/50000: episode: 4032, duration: 0.068s, episode s

 31006/50000: episode: 4064, duration: 0.059s, episode steps:   3, steps per second:  51, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 2.000],  loss: 0.002811, mae: 0.128854, mean_q: 0.180432
 31009/50000: episode: 4065, duration: 0.053s, episode steps:   3, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002920, mae: 0.124016, mean_q: 0.174777
 31013/50000: episode: 4066, duration: 0.057s, episode steps:   4, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [1.000, 3.000],  loss: 0.002581, mae: 0.123272, mean_q: 0.174999
 31029/50000: episode: 4067, duration: 0.220s, episode steps:  16, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002865, mae: 0.132982, mean_q: 0.186366
 31040/50000: episode: 4068, duration: 0.150s, episode s

 31294/50000: episode: 4100, duration: 0.050s, episode steps:   3, steps per second:  60, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.002839, mae: 0.131129, mean_q: 0.185050
 31306/50000: episode: 4101, duration: 0.179s, episode steps:  12, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.083 [0.000, 3.000],  loss: 0.003567, mae: 0.136222, mean_q: 0.193537
 31319/50000: episode: 4102, duration: 0.185s, episode steps:  13, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.615 [0.000, 3.000],  loss: 0.002788, mae: 0.134450, mean_q: 0.196240
 31321/50000: episode: 4103, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.001744, mae: 0.131235, mean_q: 0.192942
 31327/50000: episode: 4104, duration: 0.080s, episode s

 31570/50000: episode: 4135, duration: 0.168s, episode steps:  13, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.077 [0.000, 3.000],  loss: 0.002802, mae: 0.135780, mean_q: 0.194096
 31599/50000: episode: 4136, duration: 0.360s, episode steps:  29, steps per second:  80, episode reward:  1.000, mean reward:  0.034 [ 0.000,  1.000], mean action: 1.621 [0.000, 3.000],  loss: 0.002513, mae: 0.136077, mean_q: 0.192473
 31602/50000: episode: 4137, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.002607, mae: 0.132440, mean_q: 0.188192
 31618/50000: episode: 4138, duration: 0.198s, episode steps:  16, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002489, mae: 0.140162, mean_q: 0.198008
 31621/50000: episode: 4139, duration: 0.046s, episode s

 31868/50000: episode: 4170, duration: 0.213s, episode steps:  17, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.471 [0.000, 3.000],  loss: 0.003327, mae: 0.140225, mean_q: 0.196122
 31872/50000: episode: 4171, duration: 0.062s, episode steps:   4, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001405, mae: 0.140232, mean_q: 0.200082
 31880/50000: episode: 4172, duration: 0.117s, episode steps:   8, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 2.000],  loss: 0.002288, mae: 0.137789, mean_q: 0.196791
 31887/50000: episode: 4173, duration: 0.097s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002158, mae: 0.138855, mean_q: 0.197303
 31889/50000: episode: 4174, duration: 0.034s, episode s

 32122/50000: episode: 4207, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002114, mae: 0.139520, mean_q: 0.202414
 32129/50000: episode: 4208, duration: 0.095s, episode steps:   7, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002730, mae: 0.135705, mean_q: 0.196316
 32137/50000: episode: 4209, duration: 0.103s, episode steps:   8, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003016, mae: 0.140380, mean_q: 0.201328
 32144/50000: episode: 4210, duration: 0.098s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.003913, mae: 0.147464, mean_q: 0.209865
 32152/50000: episode: 4211, duration: 0.102s, episode s

 32450/50000: episode: 4243, duration: 0.188s, episode steps:  15, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.533 [0.000, 3.000],  loss: 0.002513, mae: 0.142960, mean_q: 0.203147
 32459/50000: episode: 4244, duration: 0.115s, episode steps:   9, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.002447, mae: 0.137515, mean_q: 0.197509
 32465/50000: episode: 4245, duration: 0.080s, episode steps:   6, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002977, mae: 0.144030, mean_q: 0.203208
 32472/50000: episode: 4246, duration: 0.099s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002541, mae: 0.144963, mean_q: 0.204760
 32478/50000: episode: 4247, duration: 0.082s, episode s

 32745/50000: episode: 4279, duration: 0.036s, episode steps:   2, steps per second:  55, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.005338, mae: 0.146857, mean_q: 0.201855
 32755/50000: episode: 4280, duration: 0.133s, episode steps:  10, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002161, mae: 0.143027, mean_q: 0.200413
 32762/50000: episode: 4281, duration: 0.095s, episode steps:   7, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002743, mae: 0.142193, mean_q: 0.197990
 32772/50000: episode: 4282, duration: 0.129s, episode steps:  10, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.200 [1.000, 3.000],  loss: 0.003631, mae: 0.141139, mean_q: 0.199127
 32782/50000: episode: 4283, duration: 0.133s, episode s

 33021/50000: episode: 4316, duration: 0.088s, episode steps:   6, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.004390, mae: 0.140279, mean_q: 0.201159
 33024/50000: episode: 4317, duration: 0.046s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.002462, mae: 0.133193, mean_q: 0.195015
 33029/50000: episode: 4318, duration: 0.068s, episode steps:   5, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001857, mae: 0.133530, mean_q: 0.193657
 33032/50000: episode: 4319, duration: 0.045s, episode steps:   3, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.003000, mae: 0.154256, mean_q: 0.215726
 33043/50000: episode: 4320, duration: 0.146s, episode s

 33303/50000: episode: 4352, duration: 0.121s, episode steps:   9, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.004159, mae: 0.147363, mean_q: 0.213113
 33310/50000: episode: 4353, duration: 0.092s, episode steps:   7, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.004378, mae: 0.151222, mean_q: 0.217880
 33315/50000: episode: 4354, duration: 0.069s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003282, mae: 0.148436, mean_q: 0.212707
 33328/50000: episode: 4355, duration: 0.168s, episode steps:  13, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.538 [0.000, 3.000],  loss: 0.003160, mae: 0.145951, mean_q: 0.207641
 33334/50000: episode: 4356, duration: 0.080s, episode s

 33573/50000: episode: 4388, duration: 0.150s, episode steps:  12, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.002237, mae: 0.138109, mean_q: 0.196617
 33576/50000: episode: 4389, duration: 0.052s, episode steps:   3, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [1.000, 2.000],  loss: 0.001794, mae: 0.136740, mean_q: 0.192553
 33578/50000: episode: 4390, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.004084, mae: 0.146312, mean_q: 0.202787
 33583/50000: episode: 4391, duration: 0.075s, episode steps:   5, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.003211, mae: 0.142150, mean_q: 0.200000
 33587/50000: episode: 4392, duration: 0.065s, episode s

 33829/50000: episode: 4425, duration: 0.201s, episode steps:  16, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.003560, mae: 0.151054, mean_q: 0.211046
 33841/50000: episode: 4426, duration: 0.149s, episode steps:  12, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002533, mae: 0.140671, mean_q: 0.198245
 33845/50000: episode: 4427, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.001618, mae: 0.129525, mean_q: 0.191353
 33847/50000: episode: 4428, duration: 0.039s, episode steps:   2, steps per second:  51, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.002899, mae: 0.132587, mean_q: 0.190200
 33860/50000: episode: 4429, duration: 0.160s, episode s

 34044/50000: episode: 4461, duration: 0.113s, episode steps:   8, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.003395, mae: 0.142036, mean_q: 0.197911
 34051/50000: episode: 4462, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.003084, mae: 0.138182, mean_q: 0.192813
 34056/50000: episode: 4463, duration: 0.069s, episode steps:   5, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.001775, mae: 0.132005, mean_q: 0.189413
 34066/50000: episode: 4464, duration: 0.132s, episode steps:  10, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.100 [0.000, 2.000],  loss: 0.002457, mae: 0.135588, mean_q: 0.190582
 34068/50000: episode: 4465, duration: 0.035s, episode s

 34293/50000: episode: 4496, duration: 0.132s, episode steps:  10, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002358, mae: 0.139193, mean_q: 0.196412
 34308/50000: episode: 4497, duration: 0.183s, episode steps:  15, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002720, mae: 0.138223, mean_q: 0.197920
 34328/50000: episode: 4498, duration: 0.296s, episode steps:  20, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002418, mae: 0.139704, mean_q: 0.200619
 34334/50000: episode: 4499, duration: 0.080s, episode steps:   6, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.002138, mae: 0.133697, mean_q: 0.192787
 34342/50000: episode: 4500, duration: 0.103s, episode s

 34587/50000: episode: 4534, duration: 0.137s, episode steps:   8, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003869, mae: 0.142676, mean_q: 0.202009
 34591/50000: episode: 4535, duration: 0.076s, episode steps:   4, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 3.000],  loss: 0.003482, mae: 0.137434, mean_q: 0.201560
 34610/50000: episode: 4536, duration: 0.238s, episode steps:  19, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.737 [0.000, 3.000],  loss: 0.003086, mae: 0.144952, mean_q: 0.207500
 34612/50000: episode: 4537, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.001955, mae: 0.134739, mean_q: 0.189086
 34614/50000: episode: 4538, duration: 0.035s, episode s

 34919/50000: episode: 4571, duration: 0.101s, episode steps:   7, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.857 [0.000, 3.000],  loss: 0.002912, mae: 0.148025, mean_q: 0.214321
 34926/50000: episode: 4572, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.857 [0.000, 3.000],  loss: 0.003193, mae: 0.147687, mean_q: 0.212678
 34930/50000: episode: 4573, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.004208, mae: 0.146733, mean_q: 0.207073
 34932/50000: episode: 4574, duration: 0.035s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.002576, mae: 0.150205, mean_q: 0.211116
 34938/50000: episode: 4575, duration: 0.085s, episode s

 35175/50000: episode: 4607, duration: 0.120s, episode steps:   9, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.111 [0.000, 3.000],  loss: 0.002723, mae: 0.151360, mean_q: 0.214485
 35182/50000: episode: 4608, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003744, mae: 0.154138, mean_q: 0.215604
 35184/50000: episode: 4609, duration: 0.034s, episode steps:   2, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002344, mae: 0.144059, mean_q: 0.201099
 35186/50000: episode: 4610, duration: 0.035s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002457, mae: 0.136491, mean_q: 0.195386
 35201/50000: episode: 4611, duration: 0.189s, episode s

 35528/50000: episode: 4644, duration: 0.154s, episode steps:  12, steps per second:  78, episode reward:  1.000, mean reward:  0.083 [ 0.000,  1.000], mean action: 0.833 [0.000, 2.000],  loss: 0.002637, mae: 0.152262, mean_q: 0.217654
 35531/50000: episode: 4645, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [1.000, 2.000],  loss: 0.003722, mae: 0.153611, mean_q: 0.212269
 35543/50000: episode: 4646, duration: 0.150s, episode steps:  12, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003181, mae: 0.152114, mean_q: 0.214623
 35550/50000: episode: 4647, duration: 0.101s, episode steps:   7, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.004682, mae: 0.155012, mean_q: 0.215964
 35559/50000: episode: 4648, duration: 0.116s, episode s

 35845/50000: episode: 4680, duration: 0.144s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.636 [0.000, 3.000],  loss: 0.003960, mae: 0.159809, mean_q: 0.224516
 35850/50000: episode: 4681, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [1.000, 3.000],  loss: 0.003881, mae: 0.151382, mean_q: 0.214601
 35854/50000: episode: 4682, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002792, mae: 0.150866, mean_q: 0.214666
 35856/50000: episode: 4683, duration: 0.035s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.003796, mae: 0.144164, mean_q: 0.202037
 35866/50000: episode: 4684, duration: 0.132s, episode s

 36175/50000: episode: 4717, duration: 0.039s, episode steps:   2, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.005939, mae: 0.166967, mean_q: 0.226628
 36183/50000: episode: 4718, duration: 0.105s, episode steps:   8, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.004054, mae: 0.158959, mean_q: 0.224818
 36191/50000: episode: 4719, duration: 0.113s, episode steps:   8, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.002872, mae: 0.152812, mean_q: 0.221088
 36194/50000: episode: 4720, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.003606, mae: 0.159828, mean_q: 0.227451
 36198/50000: episode: 4721, duration: 0.058s, episode s

 36432/50000: episode: 4754, duration: 0.085s, episode steps:   6, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.833 [0.000, 3.000],  loss: 0.003720, mae: 0.152050, mean_q: 0.217590
 36435/50000: episode: 4755, duration: 0.048s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.003890, mae: 0.160153, mean_q: 0.228820
 36447/50000: episode: 4756, duration: 0.152s, episode steps:  12, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.583 [0.000, 3.000],  loss: 0.003082, mae: 0.147879, mean_q: 0.211900
 36451/50000: episode: 4757, duration: 0.061s, episode steps:   4, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.750 [0.000, 2.000],  loss: 0.003945, mae: 0.156675, mean_q: 0.219945
 36454/50000: episode: 4758, duration: 0.047s, episode s

 36667/50000: episode: 4790, duration: 0.242s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.684 [0.000, 3.000],  loss: 0.002563, mae: 0.146315, mean_q: 0.213192
 36671/50000: episode: 4791, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.003256, mae: 0.144381, mean_q: 0.213255
 36688/50000: episode: 4792, duration: 0.214s, episode steps:  17, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.824 [0.000, 3.000],  loss: 0.003507, mae: 0.150602, mean_q: 0.214853
 36705/50000: episode: 4793, duration: 0.227s, episode steps:  17, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.176 [0.000, 3.000],  loss: 0.003273, mae: 0.152398, mean_q: 0.212779
 36727/50000: episode: 4794, duration: 0.276s, episode s

 36966/50000: episode: 4825, duration: 0.234s, episode steps:  19, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.684 [0.000, 3.000],  loss: 0.003810, mae: 0.149616, mean_q: 0.213698
 36972/50000: episode: 4826, duration: 0.081s, episode steps:   6, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.003744, mae: 0.158435, mean_q: 0.222855
 36981/50000: episode: 4827, duration: 0.115s, episode steps:   9, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.111 [0.000, 3.000],  loss: 0.003047, mae: 0.156470, mean_q: 0.222380
 36993/50000: episode: 4828, duration: 0.161s, episode steps:  12, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003440, mae: 0.156819, mean_q: 0.220878
 36996/50000: episode: 4829, duration: 0.047s, episode s

 37238/50000: episode: 4860, duration: 0.072s, episode steps:   5, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001689, mae: 0.149294, mean_q: 0.211595
 37242/50000: episode: 4861, duration: 0.060s, episode steps:   4, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002375, mae: 0.156387, mean_q: 0.220538
 37245/50000: episode: 4862, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002826, mae: 0.153111, mean_q: 0.213291
 37252/50000: episode: 4863, duration: 0.091s, episode steps:   7, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.002461, mae: 0.146049, mean_q: 0.208763
 37256/50000: episode: 4864, duration: 0.064s, episode s

 37567/50000: episode: 4896, duration: 0.044s, episode steps:   2, steps per second:  46, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002941, mae: 0.145601, mean_q: 0.206970
 37577/50000: episode: 4897, duration: 0.145s, episode steps:  10, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003537, mae: 0.147469, mean_q: 0.210925
 37582/50000: episode: 4898, duration: 0.074s, episode steps:   5, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003644, mae: 0.155524, mean_q: 0.225932
 37589/50000: episode: 4899, duration: 0.097s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.002868, mae: 0.151728, mean_q: 0.216346
 37593/50000: episode: 4900, duration: 0.063s, episode s

 37818/50000: episode: 4933, duration: 0.107s, episode steps:   7, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.714 [0.000, 2.000],  loss: 0.002672, mae: 0.147797, mean_q: 0.207135
 37826/50000: episode: 4934, duration: 0.105s, episode steps:   8, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003046, mae: 0.147491, mean_q: 0.211992
 37829/50000: episode: 4935, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: 0.003488, mae: 0.157811, mean_q: 0.221103
 37849/50000: episode: 4936, duration: 0.256s, episode steps:  20, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.700 [0.000, 3.000],  loss: 0.002343, mae: 0.149899, mean_q: 0.214070
 37852/50000: episode: 4937, duration: 0.047s, episode s

 38098/50000: episode: 4969, duration: 0.140s, episode steps:  10, steps per second:  71, episode reward:  1.000, mean reward:  0.100 [ 0.000,  1.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003112, mae: 0.146773, mean_q: 0.207754
 38101/50000: episode: 4970, duration: 0.048s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.002541, mae: 0.142181, mean_q: 0.202370
 38105/50000: episode: 4971, duration: 0.059s, episode steps:   4, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.002123, mae: 0.141571, mean_q: 0.203288
 38110/50000: episode: 4972, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002576, mae: 0.140520, mean_q: 0.199070
 38120/50000: episode: 4973, duration: 0.136s, episode s

 38301/50000: episode: 5006, duration: 0.130s, episode steps:   9, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.222 [0.000, 3.000],  loss: 0.002841, mae: 0.146040, mean_q: 0.202756
 38314/50000: episode: 5007, duration: 0.164s, episode steps:  13, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.692 [0.000, 3.000],  loss: 0.003060, mae: 0.146517, mean_q: 0.207939
 38317/50000: episode: 5008, duration: 0.051s, episode steps:   3, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003647, mae: 0.144615, mean_q: 0.205344
 38319/50000: episode: 5009, duration: 0.043s, episode steps:   2, steps per second:  47, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.003601, mae: 0.144858, mean_q: 0.204947
 38333/50000: episode: 5010, duration: 0.202s, episode s

 38588/50000: episode: 5042, duration: 0.142s, episode steps:  10, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003572, mae: 0.146093, mean_q: 0.207403
 38607/50000: episode: 5043, duration: 0.240s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.105 [0.000, 3.000],  loss: 0.003634, mae: 0.148323, mean_q: 0.208135
 38620/50000: episode: 5044, duration: 0.176s, episode steps:  13, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.769 [0.000, 3.000],  loss: 0.002629, mae: 0.147561, mean_q: 0.210901
 38624/50000: episode: 5045, duration: 0.066s, episode steps:   4, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003016, mae: 0.152255, mean_q: 0.213761
 38634/50000: episode: 5046, duration: 0.130s, episode s

 38901/50000: episode: 5077, duration: 0.249s, episode steps:  19, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.579 [0.000, 3.000],  loss: 0.002997, mae: 0.145648, mean_q: 0.205116
 38903/50000: episode: 5078, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.002979, mae: 0.139640, mean_q: 0.197608
 38905/50000: episode: 5079, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.004065, mae: 0.152316, mean_q: 0.214303
 38916/50000: episode: 5080, duration: 0.144s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.909 [0.000, 3.000],  loss: 0.002688, mae: 0.143424, mean_q: 0.202681
 38922/50000: episode: 5081, duration: 0.089s, episode s

 39181/50000: episode: 5112, duration: 0.274s, episode steps:  21, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.524 [0.000, 3.000],  loss: 0.003264, mae: 0.145083, mean_q: 0.203535
 39189/50000: episode: 5113, duration: 0.106s, episode steps:   8, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.003255, mae: 0.143351, mean_q: 0.201300
 39194/50000: episode: 5114, duration: 0.071s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001784, mae: 0.144329, mean_q: 0.206026
 39202/50000: episode: 5115, duration: 0.115s, episode steps:   8, steps per second:  69, episode reward:  1.000, mean reward:  0.125 [ 0.000,  1.000], mean action: 1.250 [0.000, 2.000],  loss: 0.002579, mae: 0.151226, mean_q: 0.211990
 39207/50000: episode: 5116, duration: 0.070s, episode s

 39467/50000: episode: 5148, duration: 0.185s, episode steps:  13, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.308 [0.000, 3.000],  loss: 0.002991, mae: 0.147888, mean_q: 0.210117
 39473/50000: episode: 5149, duration: 0.089s, episode steps:   6, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.667 [1.000, 3.000],  loss: 0.002342, mae: 0.139097, mean_q: 0.204346
 39480/50000: episode: 5150, duration: 0.097s, episode steps:   7, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003188, mae: 0.143720, mean_q: 0.206034
 39486/50000: episode: 5151, duration: 0.094s, episode steps:   6, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: 0.002361, mae: 0.141120, mean_q: 0.203671
 39496/50000: episode: 5152, duration: 0.130s, episode s

 39760/50000: episode: 5185, duration: 0.204s, episode steps:  16, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.875 [0.000, 3.000],  loss: 0.002438, mae: 0.144990, mean_q: 0.208181
 39765/50000: episode: 5186, duration: 0.069s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002493, mae: 0.143584, mean_q: 0.205718
 39768/50000: episode: 5187, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 2.000],  loss: 0.001357, mae: 0.145010, mean_q: 0.208224
 39772/50000: episode: 5188, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003285, mae: 0.144638, mean_q: 0.207760
 39777/50000: episode: 5189, duration: 0.074s, episode s

 40013/50000: episode: 5220, duration: 0.064s, episode steps:   4, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.250 [2.000, 3.000],  loss: 0.003357, mae: 0.150871, mean_q: 0.213441
 40026/50000: episode: 5221, duration: 0.165s, episode steps:  13, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.462 [0.000, 3.000],  loss: 0.002699, mae: 0.153393, mean_q: 0.218161
 40028/50000: episode: 5222, duration: 0.035s, episode steps:   2, steps per second:  58, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.002368, mae: 0.150303, mean_q: 0.212572
 40042/50000: episode: 5223, duration: 0.180s, episode steps:  14, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003185, mae: 0.154325, mean_q: 0.217681
 40044/50000: episode: 5224, duration: 0.035s, episode s

 40353/50000: episode: 5256, duration: 0.098s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.002016, mae: 0.144981, mean_q: 0.210333
 40361/50000: episode: 5257, duration: 0.107s, episode steps:   8, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.875 [0.000, 2.000],  loss: 0.004137, mae: 0.151253, mean_q: 0.214663
 40376/50000: episode: 5258, duration: 0.190s, episode steps:  15, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.733 [0.000, 3.000],  loss: 0.002648, mae: 0.148279, mean_q: 0.208052
 40401/50000: episode: 5259, duration: 0.307s, episode steps:  25, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.720 [0.000, 3.000],  loss: 0.002909, mae: 0.151027, mean_q: 0.213881
 40407/50000: episode: 5260, duration: 0.082s, episode s

 40649/50000: episode: 5291, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.004525, mae: 0.154659, mean_q: 0.214730
 40656/50000: episode: 5292, duration: 0.099s, episode steps:   7, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.002825, mae: 0.152207, mean_q: 0.214234
 40670/50000: episode: 5293, duration: 0.180s, episode steps:  14, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.071 [0.000, 3.000],  loss: 0.002489, mae: 0.149659, mean_q: 0.209501
 40677/50000: episode: 5294, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.004569, mae: 0.156430, mean_q: 0.215982
 40682/50000: episode: 5295, duration: 0.070s, episode s

 40931/50000: episode: 5326, duration: 0.040s, episode steps:   2, steps per second:  50, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.002116, mae: 0.147788, mean_q: 0.214764
 40934/50000: episode: 5327, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.003061, mae: 0.151680, mean_q: 0.217368
 40943/50000: episode: 5328, duration: 0.118s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.002631, mae: 0.148318, mean_q: 0.217729
 40954/50000: episode: 5329, duration: 0.145s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.002300, mae: 0.144913, mean_q: 0.212306
 40956/50000: episode: 5330, duration: 0.035s, episode s

 41162/50000: episode: 5361, duration: 0.113s, episode steps:   8, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.875 [0.000, 2.000],  loss: 0.003263, mae: 0.150981, mean_q: 0.214308
 41167/50000: episode: 5362, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [1.000, 3.000],  loss: 0.002119, mae: 0.148584, mean_q: 0.210864
 41178/50000: episode: 5363, duration: 0.142s, episode steps:  11, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.003470, mae: 0.151393, mean_q: 0.214290
 41183/50000: episode: 5364, duration: 0.072s, episode steps:   5, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.003760, mae: 0.153328, mean_q: 0.220094
 41193/50000: episode: 5365, duration: 0.129s, episode s

 41421/50000: episode: 5396, duration: 0.133s, episode steps:  10, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003116, mae: 0.157895, mean_q: 0.219800
 41432/50000: episode: 5397, duration: 0.142s, episode steps:  11, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.364 [0.000, 3.000],  loss: 0.003078, mae: 0.155377, mean_q: 0.216076
 41439/50000: episode: 5398, duration: 0.100s, episode steps:   7, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.002821, mae: 0.148444, mean_q: 0.208961
 41441/50000: episode: 5399, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002092, mae: 0.149099, mean_q: 0.210783
 41444/50000: episode: 5400, duration: 0.048s, episode s

 41664/50000: episode: 5431, duration: 0.241s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.105 [0.000, 3.000],  loss: 0.002865, mae: 0.153786, mean_q: 0.219970
 41669/50000: episode: 5432, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002493, mae: 0.152976, mean_q: 0.220195
 41704/50000: episode: 5433, duration: 0.424s, episode steps:  35, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.029 [0.000, 3.000],  loss: 0.002766, mae: 0.154253, mean_q: 0.220909
 41713/50000: episode: 5434, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [0.000, 3.000],  loss: 0.002798, mae: 0.155730, mean_q: 0.214206
 41718/50000: episode: 5435, duration: 0.071s, episode s

 41957/50000: episode: 5466, duration: 0.135s, episode steps:  10, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.100 [0.000, 3.000],  loss: 0.003224, mae: 0.155468, mean_q: 0.215525
 41960/50000: episode: 5467, duration: 0.047s, episode steps:   3, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002662, mae: 0.154210, mean_q: 0.218754
 41965/50000: episode: 5468, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.200 [1.000, 3.000],  loss: 0.002659, mae: 0.156345, mean_q: 0.221261
 41967/50000: episode: 5469, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.003747, mae: 0.157436, mean_q: 0.219801
 41981/50000: episode: 5470, duration: 0.186s, episode s

 42219/50000: episode: 5501, duration: 0.083s, episode steps:   6, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.833 [0.000, 3.000],  loss: 0.003435, mae: 0.154747, mean_q: 0.216614
 42221/50000: episode: 5502, duration: 0.038s, episode steps:   2, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.001150, mae: 0.150368, mean_q: 0.215180
 42225/50000: episode: 5503, duration: 0.061s, episode steps:   4, steps per second:  66, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 2.000],  loss: 0.003686, mae: 0.159684, mean_q: 0.222671
 42230/50000: episode: 5504, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.003279, mae: 0.153851, mean_q: 0.212139
 42249/50000: episode: 5505, duration: 0.242s, episode s

 42501/50000: episode: 5537, duration: 0.078s, episode steps:   5, steps per second:  64, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.800 [0.000, 3.000],  loss: 0.003069, mae: 0.153660, mean_q: 0.216483
 42503/50000: episode: 5538, duration: 0.036s, episode steps:   2, steps per second:  55, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003302, mae: 0.151749, mean_q: 0.213620
 42511/50000: episode: 5539, duration: 0.105s, episode steps:   8, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003146, mae: 0.152994, mean_q: 0.215925
 42516/50000: episode: 5540, duration: 0.072s, episode steps:   5, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.003181, mae: 0.150243, mean_q: 0.214572
 42525/50000: episode: 5541, duration: 0.121s, episode s

 42753/50000: episode: 5575, duration: 0.077s, episode steps:   5, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.200 [0.000, 3.000],  loss: 0.002968, mae: 0.150627, mean_q: 0.213774
 42773/50000: episode: 5576, duration: 0.253s, episode steps:  20, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.002712, mae: 0.151011, mean_q: 0.212496
 42781/50000: episode: 5577, duration: 0.106s, episode steps:   8, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003390, mae: 0.150831, mean_q: 0.211397
 42794/50000: episode: 5578, duration: 0.169s, episode steps:  13, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.462 [0.000, 3.000],  loss: 0.003745, mae: 0.141908, mean_q: 0.206662
 42797/50000: episode: 5579, duration: 0.048s, episode s

 42974/50000: episode: 5610, duration: 0.133s, episode steps:  10, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003074, mae: 0.159361, mean_q: 0.222625
 42980/50000: episode: 5611, duration: 0.083s, episode steps:   6, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 2.000],  loss: 0.002565, mae: 0.152098, mean_q: 0.214649
 43003/50000: episode: 5612, duration: 0.283s, episode steps:  23, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.609 [0.000, 3.000],  loss: 0.002741, mae: 0.150291, mean_q: 0.213760
 43014/50000: episode: 5613, duration: 0.141s, episode steps:  11, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.182 [0.000, 3.000],  loss: 0.002876, mae: 0.150234, mean_q: 0.216160
 43023/50000: episode: 5614, duration: 0.122s, episode s

 43264/50000: episode: 5645, duration: 0.212s, episode steps:  16, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002967, mae: 0.152979, mean_q: 0.217308
 43281/50000: episode: 5646, duration: 0.211s, episode steps:  17, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.471 [0.000, 3.000],  loss: 0.003666, mae: 0.156012, mean_q: 0.219294
 43283/50000: episode: 5647, duration: 0.039s, episode steps:   2, steps per second:  51, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002624, mae: 0.149375, mean_q: 0.214292
 43303/50000: episode: 5648, duration: 0.253s, episode steps:  20, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.300 [0.000, 3.000],  loss: 0.002776, mae: 0.151805, mean_q: 0.218851
 43306/50000: episode: 5649, duration: 0.047s, episode s

 43538/50000: episode: 5681, duration: 0.266s, episode steps:  21, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 3.000],  loss: 0.002759, mae: 0.153775, mean_q: 0.219619
 43540/50000: episode: 5682, duration: 0.036s, episode steps:   2, steps per second:  55, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002642, mae: 0.148775, mean_q: 0.206083
 43558/50000: episode: 5683, duration: 0.227s, episode steps:  18, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.003291, mae: 0.152511, mean_q: 0.214327
 43568/50000: episode: 5684, duration: 0.130s, episode steps:  10, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003044, mae: 0.151261, mean_q: 0.218486
 43579/50000: episode: 5685, duration: 0.143s, episode s

 43807/50000: episode: 5719, duration: 0.052s, episode steps:   2, steps per second:  39, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.003082, mae: 0.154418, mean_q: 0.224570
 43814/50000: episode: 5720, duration: 0.113s, episode steps:   7, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.003017, mae: 0.148380, mean_q: 0.213502
 43820/50000: episode: 5721, duration: 0.087s, episode steps:   6, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002090, mae: 0.148990, mean_q: 0.212908
 43828/50000: episode: 5722, duration: 0.114s, episode steps:   8, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.375 [1.000, 3.000],  loss: 0.003370, mae: 0.155510, mean_q: 0.218049
 43830/50000: episode: 5723, duration: 0.036s, episode s

 44094/50000: episode: 5754, duration: 0.115s, episode steps:   8, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.003204, mae: 0.148793, mean_q: 0.209801
 44099/50000: episode: 5755, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [1.000, 3.000],  loss: 0.002915, mae: 0.152302, mean_q: 0.210900
 44106/50000: episode: 5756, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.002620, mae: 0.151471, mean_q: 0.214126
 44108/50000: episode: 5757, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.005251, mae: 0.152443, mean_q: 0.212760
 44112/50000: episode: 5758, duration: 0.063s, episode s

 44335/50000: episode: 5790, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.003709, mae: 0.152622, mean_q: 0.216451
 44342/50000: episode: 5791, duration: 0.100s, episode steps:   7, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.286 [0.000, 3.000],  loss: 0.003277, mae: 0.147844, mean_q: 0.208305
 44346/50000: episode: 5792, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 2.000],  loss: 0.002274, mae: 0.154045, mean_q: 0.217137
 44352/50000: episode: 5793, duration: 0.088s, episode steps:   6, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.333 [0.000, 2.000],  loss: 0.003780, mae: 0.152788, mean_q: 0.215295
 44366/50000: episode: 5794, duration: 0.179s, episode s

 44607/50000: episode: 5825, duration: 0.287s, episode steps:  23, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.652 [0.000, 3.000],  loss: 0.002849, mae: 0.153380, mean_q: 0.214508
 44612/50000: episode: 5826, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002788, mae: 0.150425, mean_q: 0.212651
 44614/50000: episode: 5827, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.004313, mae: 0.159031, mean_q: 0.221573
 44619/50000: episode: 5828, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.004143, mae: 0.157292, mean_q: 0.222945
 44630/50000: episode: 5829, duration: 0.153s, episode s

 44848/50000: episode: 5860, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.002917, mae: 0.148906, mean_q: 0.210542
 44857/50000: episode: 5861, duration: 0.123s, episode steps:   9, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.111 [0.000, 3.000],  loss: 0.002305, mae: 0.145858, mean_q: 0.204813
 44867/50000: episode: 5862, duration: 0.130s, episode steps:  10, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.700 [0.000, 3.000],  loss: 0.003532, mae: 0.153728, mean_q: 0.215714
 44870/50000: episode: 5863, duration: 0.047s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.003131, mae: 0.152617, mean_q: 0.213603
 44872/50000: episode: 5864, duration: 0.038s, episode s

 45160/50000: episode: 5896, duration: 0.086s, episode steps:   6, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.003327, mae: 0.155041, mean_q: 0.220925
 45178/50000: episode: 5897, duration: 0.238s, episode steps:  18, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002528, mae: 0.155729, mean_q: 0.221077
 45186/50000: episode: 5898, duration: 0.107s, episode steps:   8, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 2.000],  loss: 0.002269, mae: 0.149713, mean_q: 0.218135
 45188/50000: episode: 5899, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002095, mae: 0.147291, mean_q: 0.209827
 45204/50000: episode: 5900, duration: 0.204s, episode s

 45456/50000: episode: 5933, duration: 0.121s, episode steps:   9, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.004008, mae: 0.155060, mean_q: 0.218241
 45473/50000: episode: 5934, duration: 0.214s, episode steps:  17, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.118 [0.000, 3.000],  loss: 0.002485, mae: 0.146556, mean_q: 0.207601
 45475/50000: episode: 5935, duration: 0.037s, episode steps:   2, steps per second:  54, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.003205, mae: 0.145315, mean_q: 0.207057
 45480/50000: episode: 5936, duration: 0.071s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.003050, mae: 0.145338, mean_q: 0.207140
 45485/50000: episode: 5937, duration: 0.071s, episode s

 45733/50000: episode: 5968, duration: 0.049s, episode steps:   3, steps per second:  61, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002668, mae: 0.151927, mean_q: 0.219412
 45744/50000: episode: 5969, duration: 0.144s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.455 [0.000, 3.000],  loss: 0.002294, mae: 0.152876, mean_q: 0.218656
 45749/50000: episode: 5970, duration: 0.073s, episode steps:   5, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.003047, mae: 0.155478, mean_q: 0.219881
 45753/50000: episode: 5971, duration: 0.064s, episode steps:   4, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.005233, mae: 0.156299, mean_q: 0.219716
 45759/50000: episode: 5972, duration: 0.095s, episode s

 45964/50000: episode: 6003, duration: 0.110s, episode steps:   8, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002506, mae: 0.148030, mean_q: 0.210120
 45975/50000: episode: 6004, duration: 0.146s, episode steps:  11, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.545 [0.000, 3.000],  loss: 0.002866, mae: 0.149632, mean_q: 0.212714
 45978/50000: episode: 6005, duration: 0.048s, episode steps:   3, steps per second:  63, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002570, mae: 0.148266, mean_q: 0.216212
 45980/50000: episode: 6006, duration: 0.041s, episode steps:   2, steps per second:  49, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002853, mae: 0.159339, mean_q: 0.225098
 45985/50000: episode: 6007, duration: 0.071s, episode s

 46272/50000: episode: 6040, duration: 0.082s, episode steps:   6, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.004209, mae: 0.155022, mean_q: 0.221545
 46303/50000: episode: 6041, duration: 0.394s, episode steps:  31, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.645 [0.000, 3.000],  loss: 0.002929, mae: 0.149366, mean_q: 0.211389
 46314/50000: episode: 6042, duration: 0.141s, episode steps:  11, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.091 [0.000, 3.000],  loss: 0.002138, mae: 0.143508, mean_q: 0.206240
 46316/50000: episode: 6043, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.500 [2.000, 3.000],  loss: 0.003557, mae: 0.154170, mean_q: 0.222200
 46320/50000: episode: 6044, duration: 0.066s, episode s

 46602/50000: episode: 6076, duration: 0.217s, episode steps:  17, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.647 [0.000, 3.000],  loss: 0.002385, mae: 0.152517, mean_q: 0.217459
 46607/50000: episode: 6077, duration: 0.070s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.003043, mae: 0.148144, mean_q: 0.211912
 46620/50000: episode: 6078, duration: 0.170s, episode steps:  13, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.923 [1.000, 3.000],  loss: 0.003192, mae: 0.152406, mean_q: 0.213540
 46625/50000: episode: 6079, duration: 0.072s, episode steps:   5, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [1.000, 2.000],  loss: 0.002917, mae: 0.152357, mean_q: 0.212758
 46628/50000: episode: 6080, duration: 0.055s, episode s

 46925/50000: episode: 6111, duration: 0.090s, episode steps:   6, steps per second:  67, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.003248, mae: 0.148037, mean_q: 0.209752
 46927/50000: episode: 6112, duration: 0.036s, episode steps:   2, steps per second:  55, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [1.000, 1.000],  loss: 0.005356, mae: 0.155569, mean_q: 0.217093
 46932/50000: episode: 6113, duration: 0.071s, episode steps:   5, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.200 [0.000, 1.000],  loss: 0.002474, mae: 0.150859, mean_q: 0.211125
 46934/50000: episode: 6114, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.002676, mae: 0.140155, mean_q: 0.202424
 46937/50000: episode: 6115, duration: 0.047s, episode s

 47157/50000: episode: 6148, duration: 0.051s, episode steps:   3, steps per second:  59, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.001830, mae: 0.145808, mean_q: 0.211477
 47160/50000: episode: 6149, duration: 0.048s, episode steps:   3, steps per second:  62, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002932, mae: 0.142919, mean_q: 0.203563
 47162/50000: episode: 6150, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [1.000, 2.000],  loss: 0.001241, mae: 0.142732, mean_q: 0.207966
 47171/50000: episode: 6151, duration: 0.118s, episode steps:   9, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.003294, mae: 0.149079, mean_q: 0.208748
 47186/50000: episode: 6152, duration: 0.194s, episode s

 47444/50000: episode: 6183, duration: 0.205s, episode steps:  15, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.267 [0.000, 3.000],  loss: 0.002933, mae: 0.146256, mean_q: 0.207753
 47455/50000: episode: 6184, duration: 0.159s, episode steps:  11, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.545 [0.000, 3.000],  loss: 0.002412, mae: 0.146282, mean_q: 0.206845
 47470/50000: episode: 6185, duration: 0.200s, episode steps:  15, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.267 [0.000, 3.000],  loss: 0.003959, mae: 0.150448, mean_q: 0.208088
 47475/50000: episode: 6186, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.001875, mae: 0.142448, mean_q: 0.206400
 47479/50000: episode: 6187, duration: 0.065s, episode s

 47742/50000: episode: 6218, duration: 0.155s, episode steps:  12, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.003192, mae: 0.145752, mean_q: 0.204435
 47763/50000: episode: 6219, duration: 0.264s, episode steps:  21, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.002352, mae: 0.143023, mean_q: 0.199723
 47774/50000: episode: 6220, duration: 0.142s, episode steps:  11, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.182 [0.000, 3.000],  loss: 0.002649, mae: 0.137406, mean_q: 0.194529
 47793/50000: episode: 6221, duration: 0.241s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.368 [0.000, 3.000],  loss: 0.002445, mae: 0.141560, mean_q: 0.201634
 47798/50000: episode: 6222, duration: 0.070s, episode s

 48032/50000: episode: 6254, duration: 0.218s, episode steps:  17, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.882 [0.000, 3.000],  loss: 0.002839, mae: 0.140868, mean_q: 0.198988
 48040/50000: episode: 6255, duration: 0.105s, episode steps:   8, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.002896, mae: 0.141982, mean_q: 0.201736
 48055/50000: episode: 6256, duration: 0.195s, episode steps:  15, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.267 [0.000, 3.000],  loss: 0.002429, mae: 0.137513, mean_q: 0.196062
 48065/50000: episode: 6257, duration: 0.128s, episode steps:  10, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002769, mae: 0.140325, mean_q: 0.199389
 48096/50000: episode: 6258, duration: 0.388s, episode s

 48358/50000: episode: 6290, duration: 0.054s, episode steps:   3, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.001690, mae: 0.142614, mean_q: 0.204845
 48368/50000: episode: 6291, duration: 0.130s, episode steps:  10, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.300 [0.000, 3.000],  loss: 0.002327, mae: 0.138463, mean_q: 0.196317
 48382/50000: episode: 6292, duration: 0.186s, episode steps:  14, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002922, mae: 0.138430, mean_q: 0.198405
 48399/50000: episode: 6293, duration: 0.212s, episode steps:  17, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.588 [0.000, 3.000],  loss: 0.002545, mae: 0.134597, mean_q: 0.192203
 48404/50000: episode: 6294, duration: 0.074s, episode s

 48640/50000: episode: 6326, duration: 0.108s, episode steps:   7, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.143 [1.000, 3.000],  loss: 0.002875, mae: 0.138907, mean_q: 0.202836
 48646/50000: episode: 6327, duration: 0.085s, episode steps:   6, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.002559, mae: 0.133933, mean_q: 0.197088
 48651/50000: episode: 6328, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.002130, mae: 0.133809, mean_q: 0.191901
 48666/50000: episode: 6329, duration: 0.202s, episode steps:  15, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.002272, mae: 0.139434, mean_q: 0.196502
 48681/50000: episode: 6330, duration: 0.193s, episode s

 48960/50000: episode: 6363, duration: 0.254s, episode steps:  20, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.550 [0.000, 3.000],  loss: 0.002494, mae: 0.139698, mean_q: 0.198626
 48962/50000: episode: 6364, duration: 0.036s, episode steps:   2, steps per second:  56, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [2.000, 2.000],  loss: 0.004773, mae: 0.142489, mean_q: 0.199244
 48964/50000: episode: 6365, duration: 0.035s, episode steps:   2, steps per second:  57, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001967, mae: 0.134447, mean_q: 0.194696
 48969/50000: episode: 6366, duration: 0.069s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.002793, mae: 0.134669, mean_q: 0.196838
 48974/50000: episode: 6367, duration: 0.071s, episode s

 49224/50000: episode: 6399, duration: 0.112s, episode steps:   8, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.003158, mae: 0.140370, mean_q: 0.200008
 49245/50000: episode: 6400, duration: 0.265s, episode steps:  21, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.003340, mae: 0.139456, mean_q: 0.200825
 49254/50000: episode: 6401, duration: 0.117s, episode steps:   9, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.889 [0.000, 3.000],  loss: 0.002488, mae: 0.146651, mean_q: 0.207941
 49271/50000: episode: 6402, duration: 0.216s, episode steps:  17, steps per second:  79, episode reward:  1.000, mean reward:  0.059 [ 0.000,  1.000], mean action: 1.235 [0.000, 3.000],  loss: 0.002638, mae: 0.141221, mean_q: 0.199343
 49273/50000: episode: 6403, duration: 0.035s, episode s

 49537/50000: episode: 6435, duration: 0.170s, episode steps:  13, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.769 [0.000, 3.000],  loss: 0.002975, mae: 0.140963, mean_q: 0.204818
 49560/50000: episode: 6436, duration: 0.287s, episode steps:  23, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.696 [0.000, 3.000],  loss: 0.002614, mae: 0.139885, mean_q: 0.201459
 49567/50000: episode: 6437, duration: 0.094s, episode steps:   7, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.429 [1.000, 3.000],  loss: 0.003237, mae: 0.144988, mean_q: 0.204783
 49571/50000: episode: 6438, duration: 0.059s, episode steps:   4, steps per second:  68, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.750 [2.000, 3.000],  loss: 0.002824, mae: 0.138548, mean_q: 0.194838
 49573/50000: episode: 6439, duration: 0.036s, episode s

 49816/50000: episode: 6470, duration: 0.218s, episode steps:  17, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.471 [0.000, 3.000],  loss: 0.002776, mae: 0.139178, mean_q: 0.202290
 49820/50000: episode: 6471, duration: 0.058s, episode steps:   4, steps per second:  69, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [1.000, 3.000],  loss: 0.002411, mae: 0.137949, mean_q: 0.199777
 49823/50000: episode: 6472, duration: 0.046s, episode steps:   3, steps per second:  65, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [1.000, 3.000],  loss: 0.001932, mae: 0.140442, mean_q: 0.204658
 49843/50000: episode: 6473, duration: 0.258s, episode steps:  20, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.003423, mae: 0.142258, mean_q: 0.202110
 49857/50000: episode: 6474, duration: 0.214s, episode s

<keras.callbacks.History at 0x7fe75394c1f0>

In [43]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
Episode 1: reward: 0.000, steps: 5
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Episode 9: reward: 0.000, steps: 18
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mF

<keras.callbacks.History at 0x7fe6a00effa0>

## Frozen Lake (8 by 8)

In [44]:
# Create the environment and reset it to the initial state
env = gym.make("FrozenLake8x8-v0")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [45]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


    21/50000: episode: 1, duration: 0.477s, episode steps:  21, steps per second:  44, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.143 [0.000, 3.000],  loss: --, mae: --, mean_q: --
    71/50000: episode: 2, duration: 0.037s, episode steps:  50, steps per second: 1347, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 3.000],  loss: --, mae: --, mean_q: --
    83/50000: episode: 3, duration: 0.009s, episode steps:  12, steps per second: 1272, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 2.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


   123/50000: episode: 4, duration: 1.736s, episode steps:  40, steps per second:  23, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.700 [0.000, 3.000],  loss: 0.139666, mae: 1.053857, mean_q: 0.792187
   146/50000: episode: 5, duration: 0.127s, episode steps:  23, steps per second: 181, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.261 [0.000, 3.000],  loss: 0.126538, mae: 0.783607, mean_q: 0.843383
   158/50000: episode: 6, duration: 0.068s, episode steps:  12, steps per second: 176, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.417 [0.000, 3.000],  loss: 0.103001, mae: 0.573195, mean_q: 0.814182
   176/50000: episode: 7, duration: 0.108s, episode steps:  18, steps per second: 167, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.278 [0.000, 3.000],  loss: 0.086655, mae: 0.476064, mean_q: 0.772428
   189/50000: episode: 8, duration: 0.073s, episode steps:  13, step

   951/50000: episode: 40, duration: 0.189s, episode steps:  33, steps per second: 174, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.182 [0.000, 3.000],  loss: 0.013872, mae: 0.434144, mean_q: 0.585709
   976/50000: episode: 41, duration: 0.139s, episode steps:  25, steps per second: 180, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.080 [0.000, 3.000],  loss: 0.008228, mae: 0.426205, mean_q: 0.573209
  1010/50000: episode: 42, duration: 0.192s, episode steps:  34, steps per second: 177, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.294 [0.000, 3.000],  loss: 0.009253, mae: 0.420450, mean_q: 0.573911
  1026/50000: episode: 43, duration: 0.092s, episode steps:  16, steps per second: 173, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.312 [0.000, 3.000],  loss: 0.006718, mae: 0.411687, mean_q: 0.555737
  1088/50000: episode: 44, duration: 0.369s, episode steps:  62,

  2226/50000: episode: 76, duration: 0.596s, episode steps:  92, steps per second: 154, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.380 [0.000, 3.000],  loss: 0.002433, mae: 0.326069, mean_q: 0.445476
  2246/50000: episode: 77, duration: 0.114s, episode steps:  20, steps per second: 176, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.003683, mae: 0.324133, mean_q: 0.440504
  2272/50000: episode: 78, duration: 0.151s, episode steps:  26, steps per second: 173, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.462 [0.000, 3.000],  loss: 0.002414, mae: 0.326873, mean_q: 0.443588
  2302/50000: episode: 79, duration: 0.167s, episode steps:  30, steps per second: 180, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.433 [0.000, 3.000],  loss: 0.002788, mae: 0.327236, mean_q: 0.447221
  2346/50000: episode: 80, duration: 0.245s, episode steps:  44,

  3111/50000: episode: 111, duration: 0.248s, episode steps:  39, steps per second: 157, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.462 [0.000, 3.000],  loss: 0.002685, mae: 0.310244, mean_q: 0.426282
  3153/50000: episode: 112, duration: 0.274s, episode steps:  42, steps per second: 153, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.002688, mae: 0.313888, mean_q: 0.431531
  3228/50000: episode: 113, duration: 0.423s, episode steps:  75, steps per second: 177, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.747 [0.000, 3.000],  loss: 0.002643, mae: 0.311758, mean_q: 0.430436
  3237/50000: episode: 114, duration: 0.054s, episode steps:   9, steps per second: 166, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.002230, mae: 0.301886, mean_q: 0.400677
  3249/50000: episode: 115, duration: 0.069s, episode steps:

  4459/50000: episode: 146, duration: 0.272s, episode steps:  47, steps per second: 173, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.511 [0.000, 3.000],  loss: 0.002242, mae: 0.299869, mean_q: 0.411426
  4516/50000: episode: 147, duration: 0.318s, episode steps:  57, steps per second: 179, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.439 [0.000, 3.000],  loss: 0.002564, mae: 0.300068, mean_q: 0.407652
  4553/50000: episode: 148, duration: 0.208s, episode steps:  37, steps per second: 178, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.514 [0.000, 3.000],  loss: 0.002502, mae: 0.294741, mean_q: 0.405147
  4593/50000: episode: 149, duration: 0.238s, episode steps:  40, steps per second: 168, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.675 [0.000, 3.000],  loss: 0.002422, mae: 0.300879, mean_q: 0.414724
  4611/50000: episode: 150, duration: 0.114s, episode steps:

  5677/50000: episode: 181, duration: 0.329s, episode steps:  49, steps per second: 149, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.653 [0.000, 3.000],  loss: 0.001782, mae: 0.286810, mean_q: 0.390395
  5706/50000: episode: 182, duration: 0.193s, episode steps:  29, steps per second: 150, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.690 [0.000, 3.000],  loss: 0.001828, mae: 0.292993, mean_q: 0.401533
  5721/50000: episode: 183, duration: 0.107s, episode steps:  15, steps per second: 140, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001302, mae: 0.286674, mean_q: 0.385183
  5739/50000: episode: 184, duration: 0.123s, episode steps:  18, steps per second: 146, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.001460, mae: 0.291649, mean_q: 0.400517
  5783/50000: episode: 185, duration: 0.295s, episode steps:

  6689/50000: episode: 216, duration: 0.144s, episode steps:  23, steps per second: 160, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [0.000, 3.000],  loss: 0.001849, mae: 0.278806, mean_q: 0.385038
  6708/50000: episode: 217, duration: 0.135s, episode steps:  19, steps per second: 141, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.421 [0.000, 3.000],  loss: 0.001366, mae: 0.283895, mean_q: 0.388395
  6713/50000: episode: 218, duration: 0.039s, episode steps:   5, steps per second: 127, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 2.000],  loss: 0.002320, mae: 0.278797, mean_q: 0.372404
  6731/50000: episode: 219, duration: 0.127s, episode steps:  18, steps per second: 142, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.001967, mae: 0.276786, mean_q: 0.381597
  6826/50000: episode: 220, duration: 0.584s, episode steps:

  7607/50000: episode: 251, duration: 0.137s, episode steps:  19, steps per second: 139, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.158 [0.000, 3.000],  loss: 0.001765, mae: 0.271435, mean_q: 0.370980
  7643/50000: episode: 252, duration: 0.246s, episode steps:  36, steps per second: 146, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.583 [0.000, 3.000],  loss: 0.001962, mae: 0.267758, mean_q: 0.365271
  7653/50000: episode: 253, duration: 0.061s, episode steps:  10, steps per second: 164, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.001545, mae: 0.264450, mean_q: 0.368131
  7690/50000: episode: 254, duration: 0.210s, episode steps:  37, steps per second: 176, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.703 [0.000, 3.000],  loss: 0.001689, mae: 0.268098, mean_q: 0.364539
  7729/50000: episode: 255, duration: 0.221s, episode steps:

  8502/50000: episode: 286, duration: 0.200s, episode steps:  32, steps per second: 160, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.531 [0.000, 3.000],  loss: 0.001346, mae: 0.251873, mean_q: 0.343612
  8573/50000: episode: 287, duration: 0.452s, episode steps:  71, steps per second: 157, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.366 [0.000, 3.000],  loss: 0.001479, mae: 0.255259, mean_q: 0.344422
  8581/50000: episode: 288, duration: 0.054s, episode steps:   8, steps per second: 148, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.000 [1.000, 3.000],  loss: 0.001391, mae: 0.248120, mean_q: 0.341694
  8619/50000: episode: 289, duration: 0.227s, episode steps:  38, steps per second: 167, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.605 [0.000, 3.000],  loss: 0.001391, mae: 0.254783, mean_q: 0.347460
  8651/50000: episode: 290, duration: 0.179s, episode steps:

  9717/50000: episode: 321, duration: 0.295s, episode steps:  42, steps per second: 143, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.452 [0.000, 3.000],  loss: 0.001125, mae: 0.235301, mean_q: 0.322443
  9754/50000: episode: 322, duration: 0.262s, episode steps:  37, steps per second: 141, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.784 [0.000, 3.000],  loss: 0.001141, mae: 0.233364, mean_q: 0.317643
  9772/50000: episode: 323, duration: 0.128s, episode steps:  18, steps per second: 140, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.611 [0.000, 3.000],  loss: 0.001057, mae: 0.231770, mean_q: 0.312908
  9795/50000: episode: 324, duration: 0.167s, episode steps:  23, steps per second: 138, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.783 [0.000, 3.000],  loss: 0.001099, mae: 0.228288, mean_q: 0.307713
  9823/50000: episode: 325, duration: 0.213s, episode steps:

 10975/50000: episode: 357, duration: 0.108s, episode steps:  16, steps per second: 148, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.312 [0.000, 3.000],  loss: 0.001054, mae: 0.211500, mean_q: 0.285006
 11004/50000: episode: 358, duration: 0.212s, episode steps:  29, steps per second: 137, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.621 [0.000, 3.000],  loss: 0.001029, mae: 0.209926, mean_q: 0.286362
 11053/50000: episode: 359, duration: 0.358s, episode steps:  49, steps per second: 137, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.755 [0.000, 3.000],  loss: 0.001523, mae: 0.208436, mean_q: 0.285610
 11066/50000: episode: 360, duration: 0.076s, episode steps:  13, steps per second: 171, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.308 [0.000, 3.000],  loss: 0.001722, mae: 0.210173, mean_q: 0.288817
 11092/50000: episode: 361, duration: 0.177s, episode steps:

 12050/50000: episode: 393, duration: 0.240s, episode steps:  38, steps per second: 159, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.342 [0.000, 3.000],  loss: 0.000818, mae: 0.201219, mean_q: 0.272956
 12096/50000: episode: 394, duration: 0.312s, episode steps:  46, steps per second: 147, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.630 [0.000, 3.000],  loss: 0.000879, mae: 0.200220, mean_q: 0.268510
 12108/50000: episode: 395, duration: 0.084s, episode steps:  12, steps per second: 142, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.000738, mae: 0.201711, mean_q: 0.274634
 12117/50000: episode: 396, duration: 0.065s, episode steps:   9, steps per second: 139, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.222 [0.000, 2.000],  loss: 0.000849, mae: 0.198352, mean_q: 0.268277
 12155/50000: episode: 397, duration: 0.246s, episode steps:

 12959/50000: episode: 429, duration: 0.152s, episode steps:  26, steps per second: 171, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.423 [0.000, 3.000],  loss: 0.000683, mae: 0.185938, mean_q: 0.251779
 12969/50000: episode: 430, duration: 0.061s, episode steps:  10, steps per second: 164, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.700 [0.000, 3.000],  loss: 0.000712, mae: 0.186565, mean_q: 0.257029
 12999/50000: episode: 431, duration: 0.171s, episode steps:  30, steps per second: 175, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.533 [0.000, 3.000],  loss: 0.000665, mae: 0.183827, mean_q: 0.249237
 13005/50000: episode: 432, duration: 0.039s, episode steps:   6, steps per second: 152, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.333 [2.000, 3.000],  loss: 0.000715, mae: 0.179814, mean_q: 0.242502
 13028/50000: episode: 433, duration: 0.130s, episode steps:

 13936/50000: episode: 464, duration: 0.157s, episode steps:  27, steps per second: 172, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.741 [0.000, 3.000],  loss: 0.000680, mae: 0.171080, mean_q: 0.230002
 13953/50000: episode: 465, duration: 0.099s, episode steps:  17, steps per second: 171, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.765 [0.000, 3.000],  loss: 0.000408, mae: 0.176794, mean_q: 0.238152
 13984/50000: episode: 466, duration: 0.182s, episode steps:  31, steps per second: 171, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.613 [0.000, 3.000],  loss: 0.000633, mae: 0.172727, mean_q: 0.237130
 14039/50000: episode: 467, duration: 0.314s, episode steps:  55, steps per second: 175, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.473 [0.000, 3.000],  loss: 0.000683, mae: 0.171128, mean_q: 0.230968
 14081/50000: episode: 468, duration: 0.240s, episode steps:

 15268/50000: episode: 500, duration: 0.359s, episode steps:  53, steps per second: 147, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.604 [0.000, 3.000],  loss: 0.000562, mae: 0.152880, mean_q: 0.205726
 15316/50000: episode: 501, duration: 0.301s, episode steps:  48, steps per second: 159, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.854 [0.000, 3.000],  loss: 0.000570, mae: 0.153521, mean_q: 0.208591
 15340/50000: episode: 502, duration: 0.140s, episode steps:  24, steps per second: 172, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.208 [0.000, 3.000],  loss: 0.000465, mae: 0.152705, mean_q: 0.205735
 15350/50000: episode: 503, duration: 0.061s, episode steps:  10, steps per second: 165, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.000407, mae: 0.158021, mean_q: 0.212487
 15374/50000: episode: 504, duration: 0.142s, episode steps:

 16469/50000: episode: 536, duration: 0.045s, episode steps:   5, steps per second: 112, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.000332, mae: 0.142391, mean_q: 0.193630
 16558/50000: episode: 537, duration: 0.524s, episode steps:  89, steps per second: 170, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.742 [0.000, 3.000],  loss: 0.000460, mae: 0.140419, mean_q: 0.191478
 16606/50000: episode: 538, duration: 0.307s, episode steps:  48, steps per second: 156, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.000398, mae: 0.139604, mean_q: 0.190401
 16666/50000: episode: 539, duration: 0.383s, episode steps:  60, steps per second: 157, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.400 [0.000, 3.000],  loss: 0.000442, mae: 0.140320, mean_q: 0.192630
 16699/50000: episode: 540, duration: 0.213s, episode steps:

 17708/50000: episode: 571, duration: 0.233s, episode steps:  40, steps per second: 172, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.450 [0.000, 3.000],  loss: 0.000377, mae: 0.133570, mean_q: 0.182135
 17748/50000: episode: 572, duration: 0.235s, episode steps:  40, steps per second: 170, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.150 [0.000, 3.000],  loss: 0.000409, mae: 0.133433, mean_q: 0.180181
 17792/50000: episode: 573, duration: 0.276s, episode steps:  44, steps per second: 159, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.568 [0.000, 3.000],  loss: 0.000401, mae: 0.132066, mean_q: 0.178990
 17854/50000: episode: 574, duration: 0.413s, episode steps:  62, steps per second: 150, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.919 [0.000, 3.000],  loss: 0.000327, mae: 0.130612, mean_q: 0.178564
 17869/50000: episode: 575, duration: 0.117s, episode steps:

 19067/50000: episode: 608, duration: 0.163s, episode steps:  27, steps per second: 166, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.259 [0.000, 3.000],  loss: 0.000398, mae: 0.123248, mean_q: 0.166608
 19162/50000: episode: 609, duration: 0.537s, episode steps:  95, steps per second: 177, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.632 [0.000, 3.000],  loss: 0.000304, mae: 0.123329, mean_q: 0.167800
 19201/50000: episode: 610, duration: 0.222s, episode steps:  39, steps per second: 176, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.590 [0.000, 3.000],  loss: 0.000301, mae: 0.122639, mean_q: 0.167555
 19209/50000: episode: 611, duration: 0.051s, episode steps:   8, steps per second: 156, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.000148, mae: 0.123827, mean_q: 0.168597
 19229/50000: episode: 612, duration: 0.124s, episode steps:

 20327/50000: episode: 643, duration: 0.173s, episode steps:  25, steps per second: 145, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.480 [0.000, 3.000],  loss: 0.000243, mae: 0.116366, mean_q: 0.158026
 20403/50000: episode: 644, duration: 0.462s, episode steps:  76, steps per second: 165, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.605 [0.000, 3.000],  loss: 0.000296, mae: 0.116058, mean_q: 0.157633
 20411/50000: episode: 645, duration: 0.057s, episode steps:   8, steps per second: 141, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.125 [0.000, 3.000],  loss: 0.000266, mae: 0.115705, mean_q: 0.157308
 20434/50000: episode: 646, duration: 0.168s, episode steps:  23, steps per second: 137, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.565 [0.000, 3.000],  loss: 0.000247, mae: 0.114274, mean_q: 0.155374
 20455/50000: episode: 647, duration: 0.153s, episode steps:

 21240/50000: episode: 679, duration: 0.359s, episode steps:  63, steps per second: 176, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.381 [0.000, 3.000],  loss: 0.000223, mae: 0.110314, mean_q: 0.149316
 21272/50000: episode: 680, duration: 0.196s, episode steps:  32, steps per second: 163, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.688 [0.000, 3.000],  loss: 0.000235, mae: 0.110515, mean_q: 0.149425
 21285/50000: episode: 681, duration: 0.087s, episode steps:  13, steps per second: 149, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.615 [0.000, 3.000],  loss: 0.000195, mae: 0.110915, mean_q: 0.149483
 21301/50000: episode: 682, duration: 0.100s, episode steps:  16, steps per second: 160, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.625 [0.000, 3.000],  loss: 0.000236, mae: 0.110742, mean_q: 0.150063
 21319/50000: episode: 683, duration: 0.115s, episode steps:

 22534/50000: episode: 715, duration: 0.232s, episode steps:  40, steps per second: 173, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.575 [0.000, 3.000],  loss: 0.000211, mae: 0.101209, mean_q: 0.138725
 22576/50000: episode: 716, duration: 0.252s, episode steps:  42, steps per second: 167, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.000183, mae: 0.101635, mean_q: 0.138171
 22586/50000: episode: 717, duration: 0.068s, episode steps:  10, steps per second: 146, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.700 [1.000, 3.000],  loss: 0.000228, mae: 0.101161, mean_q: 0.137100
 22606/50000: episode: 718, duration: 0.126s, episode steps:  20, steps per second: 158, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.100 [0.000, 3.000],  loss: 0.000121, mae: 0.099008, mean_q: 0.134531
 22621/50000: episode: 719, duration: 0.101s, episode steps:

 23696/50000: episode: 751, duration: 0.113s, episode steps:  18, steps per second: 159, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.222 [0.000, 3.000],  loss: 0.000194, mae: 0.093531, mean_q: 0.128295
 23724/50000: episode: 752, duration: 0.180s, episode steps:  28, steps per second: 155, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.000160, mae: 0.093723, mean_q: 0.127853
 23772/50000: episode: 753, duration: 0.282s, episode steps:  48, steps per second: 171, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.562 [0.000, 3.000],  loss: 0.000181, mae: 0.093966, mean_q: 0.129276
 23777/50000: episode: 754, duration: 0.036s, episode steps:   5, steps per second: 141, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.600 [2.000, 3.000],  loss: 0.000184, mae: 0.093756, mean_q: 0.128275
 23796/50000: episode: 755, duration: 0.112s, episode steps:

 24770/50000: episode: 786, duration: 0.087s, episode steps:  14, steps per second: 160, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.786 [0.000, 3.000],  loss: 0.000155, mae: 0.086510, mean_q: 0.118503
 24775/50000: episode: 787, duration: 0.034s, episode steps:   5, steps per second: 148, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 3.000],  loss: 0.000105, mae: 0.087582, mean_q: 0.118947
 24790/50000: episode: 788, duration: 0.089s, episode steps:  15, steps per second: 169, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.467 [0.000, 3.000],  loss: 0.000117, mae: 0.086870, mean_q: 0.117816
 24818/50000: episode: 789, duration: 0.179s, episode steps:  28, steps per second: 156, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.321 [0.000, 3.000],  loss: 0.000204, mae: 0.085467, mean_q: 0.116448
 24829/50000: episode: 790, duration: 0.076s, episode steps:

 25880/50000: episode: 823, duration: 0.064s, episode steps:   8, steps per second: 125, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.125 [0.000, 3.000],  loss: 0.000197, mae: 0.081193, mean_q: 0.108392
 25915/50000: episode: 824, duration: 0.250s, episode steps:  35, steps per second: 140, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.257 [0.000, 3.000],  loss: 0.000126, mae: 0.080322, mean_q: 0.108595
 25972/50000: episode: 825, duration: 0.360s, episode steps:  57, steps per second: 158, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.263 [0.000, 3.000],  loss: 0.000411, mae: 0.079580, mean_q: 0.108809
 26012/50000: episode: 826, duration: 0.254s, episode steps:  40, steps per second: 158, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.000123, mae: 0.079560, mean_q: 0.108844
 26037/50000: episode: 827, duration: 0.158s, episode steps:

 27125/50000: episode: 858, duration: 0.214s, episode steps:  30, steps per second: 140, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.433 [0.000, 3.000],  loss: 0.000103, mae: 0.072894, mean_q: 0.099371
 27140/50000: episode: 859, duration: 0.107s, episode steps:  15, steps per second: 140, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.000122, mae: 0.073825, mean_q: 0.100054
 27159/50000: episode: 860, duration: 0.138s, episode steps:  19, steps per second: 137, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.000 [0.000, 3.000],  loss: 0.000123, mae: 0.071667, mean_q: 0.097866
 27184/50000: episode: 861, duration: 0.176s, episode steps:  25, steps per second: 142, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.000160, mae: 0.070966, mean_q: 0.096723
 27206/50000: episode: 862, duration: 0.158s, episode steps:

 28259/50000: episode: 893, duration: 0.394s, episode steps:  60, steps per second: 152, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.383 [0.000, 3.000],  loss: 0.000095, mae: 0.066616, mean_q: 0.090581
 28282/50000: episode: 894, duration: 0.161s, episode steps:  23, steps per second: 143, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.304 [0.000, 3.000],  loss: 0.000069, mae: 0.066052, mean_q: 0.089319
 28341/50000: episode: 895, duration: 0.349s, episode steps:  59, steps per second: 169, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.763 [0.000, 3.000],  loss: 0.000076, mae: 0.066176, mean_q: 0.089759
 28372/50000: episode: 896, duration: 0.193s, episode steps:  31, steps per second: 160, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.581 [0.000, 3.000],  loss: 0.000092, mae: 0.065986, mean_q: 0.090396
 28428/50000: episode: 897, duration: 0.342s, episode steps:

 29358/50000: episode: 929, duration: 0.123s, episode steps:   9, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.000056, mae: 0.062061, mean_q: 0.084699
 29363/50000: episode: 930, duration: 0.070s, episode steps:   5, steps per second:  72, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [1.000, 3.000],  loss: 0.000031, mae: 0.062565, mean_q: 0.085044
 29377/50000: episode: 931, duration: 0.177s, episode steps:  14, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: 0.000109, mae: 0.060735, mean_q: 0.082643
 29440/50000: episode: 932, duration: 0.744s, episode steps:  63, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.476 [0.000, 3.000],  loss: 0.000067, mae: 0.060759, mean_q: 0.082367
 29491/50000: episode: 933, duration: 0.604s, episode steps:

 30464/50000: episode: 964, duration: 1.183s, episode steps: 101, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.574 [0.000, 3.000],  loss: 0.000065, mae: 0.056076, mean_q: 0.075895
 30516/50000: episode: 965, duration: 0.617s, episode steps:  52, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.827 [0.000, 3.000],  loss: 0.000068, mae: 0.055200, mean_q: 0.075091
 30552/50000: episode: 966, duration: 0.440s, episode steps:  36, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.472 [0.000, 3.000],  loss: 0.000069, mae: 0.055434, mean_q: 0.075376
 30575/50000: episode: 967, duration: 0.283s, episode steps:  23, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.522 [0.000, 3.000],  loss: 0.000077, mae: 0.055476, mean_q: 0.075055
 30600/50000: episode: 968, duration: 0.303s, episode steps:

 31726/50000: episode: 999, duration: 0.402s, episode steps:  33, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.394 [0.000, 3.000],  loss: 0.000051, mae: 0.050902, mean_q: 0.068738
 31741/50000: episode: 1000, duration: 0.184s, episode steps:  15, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.800 [0.000, 3.000],  loss: 0.000060, mae: 0.051035, mean_q: 0.069414
 31771/50000: episode: 1001, duration: 0.362s, episode steps:  30, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.267 [0.000, 3.000],  loss: 0.000066, mae: 0.050422, mean_q: 0.067741
 31784/50000: episode: 1002, duration: 0.164s, episode steps:  13, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.615 [0.000, 3.000],  loss: 0.000041, mae: 0.050770, mean_q: 0.068380
 31816/50000: episode: 1003, duration: 0.453s, episode st

 32979/50000: episode: 1035, duration: 0.296s, episode steps:  24, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.000038, mae: 0.045418, mean_q: 0.061479
 33015/50000: episode: 1036, duration: 0.435s, episode steps:  36, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.000035, mae: 0.044346, mean_q: 0.060090
 33029/50000: episode: 1037, duration: 0.175s, episode steps:  14, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.000044, mae: 0.044650, mean_q: 0.060178
 33101/50000: episode: 1038, duration: 0.854s, episode steps:  72, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.306 [0.000, 3.000],  loss: 0.000042, mae: 0.044305, mean_q: 0.060094
 33135/50000: episode: 1039, duration: 0.417s, episode s

 34171/50000: episode: 1070, duration: 0.979s, episode steps:  83, steps per second:  85, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.566 [0.000, 3.000],  loss: 0.000037, mae: 0.041071, mean_q: 0.055703
 34195/50000: episode: 1071, duration: 0.302s, episode steps:  24, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.000035, mae: 0.040870, mean_q: 0.055402
 34236/50000: episode: 1072, duration: 0.497s, episode steps:  41, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.488 [0.000, 3.000],  loss: 0.000034, mae: 0.040916, mean_q: 0.055289
 34311/50000: episode: 1073, duration: 0.893s, episode steps:  75, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.747 [0.000, 3.000],  loss: 0.000036, mae: 0.040523, mean_q: 0.055113
 34352/50000: episode: 1074, duration: 0.497s, episode s

 35161/50000: episode: 1106, duration: 0.241s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.632 [0.000, 3.000],  loss: 0.000037, mae: 0.036567, mean_q: 0.049401
 35183/50000: episode: 1107, duration: 0.273s, episode steps:  22, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.136 [0.000, 3.000],  loss: 0.000031, mae: 0.036624, mean_q: 0.050001
 35215/50000: episode: 1108, duration: 0.389s, episode steps:  32, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.156 [0.000, 3.000],  loss: 0.000026, mae: 0.036722, mean_q: 0.050251
 35224/50000: episode: 1109, duration: 0.115s, episode steps:   9, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.556 [0.000, 3.000],  loss: 0.000037, mae: 0.036643, mean_q: 0.049906
 35302/50000: episode: 1110, duration: 0.922s, episode s

 36370/50000: episode: 1141, duration: 0.550s, episode steps:  45, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.622 [0.000, 3.000],  loss: 0.000023, mae: 0.033486, mean_q: 0.046069
 36406/50000: episode: 1142, duration: 0.446s, episode steps:  36, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.167 [0.000, 3.000],  loss: 0.000029, mae: 0.033029, mean_q: 0.045073
 36429/50000: episode: 1143, duration: 0.284s, episode steps:  23, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.565 [0.000, 3.000],  loss: 0.000030, mae: 0.033282, mean_q: 0.045297
 36458/50000: episode: 1144, duration: 0.356s, episode steps:  29, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.724 [0.000, 3.000],  loss: 0.000035, mae: 0.033491, mean_q: 0.045550
 36497/50000: episode: 1145, duration: 0.470s, episode s

 37216/50000: episode: 1176, duration: 0.109s, episode steps:   8, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.375 [0.000, 3.000],  loss: 0.000027, mae: 0.029727, mean_q: 0.040429
 37229/50000: episode: 1177, duration: 0.169s, episode steps:  13, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.077 [0.000, 3.000],  loss: 0.000019, mae: 0.031167, mean_q: 0.042356
 37250/50000: episode: 1178, duration: 0.264s, episode steps:  21, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.714 [0.000, 3.000],  loss: 0.000026, mae: 0.030616, mean_q: 0.041998
 37257/50000: episode: 1179, duration: 0.093s, episode steps:   7, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.571 [0.000, 3.000],  loss: 0.000018, mae: 0.029862, mean_q: 0.040450
 37297/50000: episode: 1180, duration: 0.485s, episode s

 38313/50000: episode: 1211, duration: 0.213s, episode steps:  17, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.412 [0.000, 3.000],  loss: 0.000021, mae: 0.028351, mean_q: 0.038426
 38323/50000: episode: 1212, duration: 0.134s, episode steps:  10, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.100 [0.000, 3.000],  loss: 0.000020, mae: 0.027706, mean_q: 0.037493
 38328/50000: episode: 1213, duration: 0.072s, episode steps:   5, steps per second:  70, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 2.000],  loss: 0.000013, mae: 0.029051, mean_q: 0.039249
 38360/50000: episode: 1214, duration: 0.399s, episode steps:  32, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.562 [0.000, 3.000],  loss: 0.000015, mae: 0.028107, mean_q: 0.037961
 38374/50000: episode: 1215, duration: 0.176s, episode s

 39475/50000: episode: 1246, duration: 0.603s, episode steps:  50, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.640 [0.000, 3.000],  loss: 0.000024, mae: 0.026200, mean_q: 0.035845
 39486/50000: episode: 1247, duration: 0.139s, episode steps:  11, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.818 [0.000, 3.000],  loss: 0.000018, mae: 0.026384, mean_q: 0.035443
 39501/50000: episode: 1248, duration: 0.194s, episode steps:  15, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.000017, mae: 0.026030, mean_q: 0.035510
 39555/50000: episode: 1249, duration: 0.651s, episode steps:  54, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.444 [0.000, 3.000],  loss: 0.000017, mae: 0.026188, mean_q: 0.035533
 39563/50000: episode: 1250, duration: 0.105s, episode s

 40465/50000: episode: 1281, duration: 0.182s, episode steps:  14, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.643 [0.000, 3.000],  loss: 0.000015, mae: 0.024440, mean_q: 0.033410
 40477/50000: episode: 1282, duration: 0.155s, episode steps:  12, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.833 [0.000, 3.000],  loss: 0.000017, mae: 0.024277, mean_q: 0.032702
 40541/50000: episode: 1283, duration: 0.772s, episode steps:  64, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.422 [0.000, 3.000],  loss: 0.000013, mae: 0.024067, mean_q: 0.032842
 40668/50000: episode: 1284, duration: 1.504s, episode steps: 127, steps per second:  84, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.520 [0.000, 3.000],  loss: 0.000013, mae: 0.024023, mean_q: 0.032651
 40676/50000: episode: 1285, duration: 0.106s, episode s

 41841/50000: episode: 1316, duration: 0.478s, episode steps:  39, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.308 [0.000, 3.000],  loss: 0.000030, mae: 0.023591, mean_q: 0.033622
 41882/50000: episode: 1317, duration: 0.501s, episode steps:  41, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.463 [0.000, 3.000],  loss: 0.000021, mae: 0.022991, mean_q: 0.031012
 41913/50000: episode: 1318, duration: 0.386s, episode steps:  31, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.613 [0.000, 3.000],  loss: 0.000019, mae: 0.022737, mean_q: 0.031065
 41941/50000: episode: 1319, duration: 0.351s, episode steps:  28, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.464 [0.000, 3.000],  loss: 0.000016, mae: 0.022602, mean_q: 0.030589
 41964/50000: episode: 1320, duration: 0.290s, episode s

 42896/50000: episode: 1351, duration: 0.302s, episode steps:  24, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.750 [0.000, 3.000],  loss: 0.000011, mae: 0.020806, mean_q: 0.028385
 42952/50000: episode: 1352, duration: 0.682s, episode steps:  56, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.536 [0.000, 3.000],  loss: 0.000012, mae: 0.020820, mean_q: 0.028240
 42968/50000: episode: 1353, duration: 0.202s, episode steps:  16, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.438 [0.000, 3.000],  loss: 0.000010, mae: 0.020495, mean_q: 0.028005
 42978/50000: episode: 1354, duration: 0.135s, episode steps:  10, steps per second:  74, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.200 [0.000, 2.000],  loss: 0.000010, mae: 0.020438, mean_q: 0.027703
 43023/50000: episode: 1355, duration: 0.567s, episode s

 44147/50000: episode: 1386, duration: 2.025s, episode steps: 169, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.609 [0.000, 3.000],  loss: 0.000009, mae: 0.019595, mean_q: 0.026553
 44160/50000: episode: 1387, duration: 0.174s, episode steps:  13, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.538 [0.000, 3.000],  loss: 0.000009, mae: 0.019284, mean_q: 0.026186
 44246/50000: episode: 1388, duration: 1.033s, episode steps:  86, steps per second:  83, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.488 [0.000, 3.000],  loss: 0.000010, mae: 0.019258, mean_q: 0.026113
 44360/50000: episode: 1389, duration: 1.386s, episode steps: 114, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.684 [0.000, 3.000],  loss: 0.000008, mae: 0.019169, mean_q: 0.025992
 44379/50000: episode: 1390, duration: 0.242s, episode s

 45376/50000: episode: 1421, duration: 0.740s, episode steps:  61, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.623 [0.000, 3.000],  loss: 0.000016, mae: 0.019400, mean_q: 0.026679
 45398/50000: episode: 1422, duration: 0.281s, episode steps:  22, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.591 [0.000, 3.000],  loss: 0.000020, mae: 0.019192, mean_q: 0.026229
 45417/50000: episode: 1423, duration: 0.241s, episode steps:  19, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.789 [0.000, 3.000],  loss: 0.000025, mae: 0.019290, mean_q: 0.026269
 45423/50000: episode: 1424, duration: 0.085s, episode steps:   6, steps per second:  71, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.000021, mae: 0.019512, mean_q: 0.026832
 45453/50000: episode: 1425, duration: 0.377s, episode s

 46614/50000: episode: 1456, duration: 0.338s, episode steps:  27, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.778 [0.000, 3.000],  loss: 0.000020, mae: 0.018772, mean_q: 0.025599
 46625/50000: episode: 1457, duration: 0.143s, episode steps:  11, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.909 [1.000, 3.000],  loss: 0.000028, mae: 0.019083, mean_q: 0.026222
 46647/50000: episode: 1458, duration: 0.278s, episode steps:  22, steps per second:  79, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.409 [0.000, 3.000],  loss: 0.000018, mae: 0.018752, mean_q: 0.025791
 46662/50000: episode: 1459, duration: 0.191s, episode steps:  15, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.533 [0.000, 3.000],  loss: 0.000030, mae: 0.019082, mean_q: 0.025742
 46696/50000: episode: 1460, duration: 0.423s, episode s

 47569/50000: episode: 1491, duration: 0.663s, episode steps:  50, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.000007, mae: 0.017227, mean_q: 0.023289
 47581/50000: episode: 1492, duration: 0.160s, episode steps:  12, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.333 [0.000, 3.000],  loss: 0.000007, mae: 0.017083, mean_q: 0.023352
 47593/50000: episode: 1493, duration: 0.160s, episode steps:  12, steps per second:  75, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.250 [0.000, 3.000],  loss: 0.000008, mae: 0.017443, mean_q: 0.023366
 47613/50000: episode: 1494, duration: 0.260s, episode steps:  20, steps per second:  77, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 2.050 [0.000, 3.000],  loss: 0.000009, mae: 0.016889, mean_q: 0.022510
 47631/50000: episode: 1495, duration: 0.285s, episode s

 48704/50000: episode: 1527, duration: 0.137s, episode steps:  10, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.500 [0.000, 3.000],  loss: 0.000008, mae: 0.016759, mean_q: 0.022548
 48753/50000: episode: 1528, duration: 0.613s, episode steps:  49, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.490 [0.000, 3.000],  loss: 0.000008, mae: 0.016902, mean_q: 0.022954
 48763/50000: episode: 1529, duration: 0.132s, episode steps:  10, steps per second:  76, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.600 [0.000, 3.000],  loss: 0.000008, mae: 0.016754, mean_q: 0.022963
 48774/50000: episode: 1530, duration: 0.150s, episode steps:  11, steps per second:  73, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.364 [0.000, 3.000],  loss: 0.000008, mae: 0.016917, mean_q: 0.023000
 48802/50000: episode: 1531, duration: 0.353s, episode s

 49744/50000: episode: 1562, duration: 0.660s, episode steps:  54, steps per second:  82, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.574 [0.000, 3.000],  loss: 0.000008, mae: 0.016100, mean_q: 0.021910
 49757/50000: episode: 1563, duration: 0.166s, episode steps:  13, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.538 [0.000, 3.000],  loss: 0.000007, mae: 0.016241, mean_q: 0.022127
 49776/50000: episode: 1564, duration: 0.244s, episode steps:  19, steps per second:  78, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.263 [0.000, 3.000],  loss: 0.000008, mae: 0.016091, mean_q: 0.022164
 49806/50000: episode: 1565, duration: 0.373s, episode steps:  30, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.667 [0.000, 3.000],  loss: 0.000009, mae: 0.016103, mean_q: 0.021858
 49817/50000: episode: 1566, duration: 0.143s, episode s

<keras.callbacks.History at 0x7fe69a526d30>

In [46]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF


  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
FFFFFFFF
[41mF[0mFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
S

<keras.callbacks.History at 0x7fe69a526c10>

## Taxi

In [47]:
# Create the environment and reset it to the initial state
env = gym.make("Taxi-v3")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [48]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


   200/50000: episode: 1, duration: 2.847s, episode steps: 200, steps per second:  70, episode reward: -2000.000, mean reward: -10.000 [-10.000, -10.000], mean action: 5.000 [5.000, 5.000],  loss: 2.090592, mae: 58.836896, mean_q: 150.402672
   400/50000: episode: 2, duration: 1.154s, episode steps: 200, steps per second: 173, episode reward: -2000.000, mean reward: -10.000 [-10.000, -10.000], mean action: 5.000 [5.000, 5.000],  loss: 38.393116, mae: 57.845596, mean_q: 139.599899
   600/50000: episode: 3, duration: 1.306s, episode steps: 200, steps per second: 153, episode reward: -2000.000, mean reward: -10.000 [-10.000, -10.000], mean action: 5.000 [5.000, 5.000],  loss: 32.164146, mae: 47.821381, mean_q: 105.263924
   800/50000: episode: 4, duration: 1.324s, episode steps: 200, steps per second: 151, episode reward: -1982.000, mean reward: -9.910 [-10.000, -1.000], mean action: 4.950 [0.000, 5.000],  loss: 26.047260, mae: 31.729910, mean_q: 65.319054
  1000/50000: episode: 5, durati

  7200/50000: episode: 36, duration: 1.166s, episode steps: 200, steps per second: 172, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.415 [0.000, 4.000],  loss: 0.134354, mae: 17.374413, mean_q: -3.940053
  7400/50000: episode: 37, duration: 1.427s, episode steps: 200, steps per second: 140, episode reward: -209.000, mean reward: -1.045 [-10.000, -1.000], mean action: 1.245 [0.000, 4.000],  loss: 0.177593, mae: 19.336262, mean_q: -4.802045
  7600/50000: episode: 38, duration: 1.237s, episode steps: 200, steps per second: 162, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.430 [0.000, 4.000],  loss: 0.150491, mae: 20.740026, mean_q: -5.517731
  7800/50000: episode: 39, duration: 1.276s, episode steps: 200, steps per second: 157, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.365 [0.000, 3.000],  loss: 0.245424, mae: 21.905876, mean_q: -6.209590
  8000/50000: episode: 40, duration: 1.268s, ep

 14200/50000: episode: 71, duration: 1.419s, episode steps: 200, steps per second: 141, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.605 [0.000, 3.000],  loss: 1.494521, mae: 37.538380, mean_q: -24.195137
 14400/50000: episode: 72, duration: 1.193s, episode steps: 200, steps per second: 168, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.550 [0.000, 3.000],  loss: 1.868576, mae: 37.934402, mean_q: -24.564581
 14600/50000: episode: 73, duration: 1.388s, episode steps: 200, steps per second: 144, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.715 [0.000, 3.000],  loss: 2.007037, mae: 37.778702, mean_q: -25.014921
 14800/50000: episode: 74, duration: 1.453s, episode steps: 200, steps per second: 138, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.565 [0.000, 3.000],  loss: 2.185457, mae: 38.331329, mean_q: -25.346403
 15000/50000: episode: 75, duration: 1.266s,

 21200/50000: episode: 106, duration: 2.436s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.430 [0.000, 3.000],  loss: 2.722883, mae: 43.565842, mean_q: -36.955078
 21400/50000: episode: 107, duration: 2.427s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.520 [0.000, 3.000],  loss: 4.169387, mae: 44.063580, mean_q: -36.989063
 21600/50000: episode: 108, duration: 2.486s, episode steps: 200, steps per second:  80, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.485 [0.000, 3.000],  loss: 3.540305, mae: 44.754868, mean_q: -37.579819
 21800/50000: episode: 109, duration: 2.366s, episode steps: 200, steps per second:  85, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.490 [0.000, 3.000],  loss: 3.645775, mae: 44.845181, mean_q: -37.979019
 22000/50000: episode: 110, duration: 2.

 28200/50000: episode: 141, duration: 2.360s, episode steps: 200, steps per second:  85, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.380 [0.000, 3.000],  loss: 6.492307, mae: 46.252743, mean_q: -46.367767
 28400/50000: episode: 142, duration: 2.335s, episode steps: 200, steps per second:  86, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.405 [0.000, 3.000],  loss: 6.118598, mae: 46.457973, mean_q: -46.587170
 28600/50000: episode: 143, duration: 2.341s, episode steps: 200, steps per second:  85, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.580 [0.000, 3.000],  loss: 5.495004, mae: 46.746170, mean_q: -46.682999
 28800/50000: episode: 144, duration: 2.348s, episode steps: 200, steps per second:  85, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.465 [0.000, 3.000],  loss: 5.071979, mae: 46.646606, mean_q: -47.071026
 29000/50000: episode: 145, duration: 2.

 35200/50000: episode: 176, duration: 2.376s, episode steps: 200, steps per second:  84, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.570 [0.000, 3.000],  loss: 6.171040, mae: 46.851852, mean_q: -48.742855
 35400/50000: episode: 177, duration: 2.387s, episode steps: 200, steps per second:  84, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.625 [0.000, 3.000],  loss: 7.781847, mae: 46.755905, mean_q: -48.574364
 35600/50000: episode: 178, duration: 2.379s, episode steps: 200, steps per second:  84, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.540 [0.000, 3.000],  loss: 4.743392, mae: 47.013935, mean_q: -49.109344
 35800/50000: episode: 179, duration: 2.368s, episode steps: 200, steps per second:  84, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.320 [0.000, 3.000],  loss: 8.719033, mae: 46.764530, mean_q: -48.809200
 36000/50000: episode: 180, duration: 2.

 42200/50000: episode: 211, duration: 2.409s, episode steps: 200, steps per second:  83, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.555 [0.000, 3.000],  loss: 9.398576, mae: 47.514175, mean_q: -51.030186
 42400/50000: episode: 212, duration: 2.426s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.570 [0.000, 3.000],  loss: 8.617304, mae: 47.705467, mean_q: -51.030548
 42600/50000: episode: 213, duration: 2.407s, episode steps: 200, steps per second:  83, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.420 [0.000, 3.000],  loss: 5.846580, mae: 47.818451, mean_q: -51.234497
 42800/50000: episode: 214, duration: 2.412s, episode steps: 200, steps per second:  83, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.530 [0.000, 3.000],  loss: 7.953347, mae: 47.832558, mean_q: -50.992355
 43000/50000: episode: 215, duration: 2.

 49200/50000: episode: 246, duration: 2.431s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.690 [0.000, 3.000],  loss: 9.083593, mae: 48.180008, mean_q: -52.119617
 49400/50000: episode: 247, duration: 2.432s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.440 [0.000, 3.000],  loss: 7.180509, mae: 47.985428, mean_q: -52.112637
 49600/50000: episode: 248, duration: 2.457s, episode steps: 200, steps per second:  81, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.505 [0.000, 3.000],  loss: 7.680777, mae: 48.157574, mean_q: -52.225986
 49800/50000: episode: 249, duration: 2.441s, episode steps: 200, steps per second:  82, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.425 [0.000, 3.000],  loss: 5.981421, mae: 48.365150, mean_q: -52.494064
 50000/50000: episode: 250, duration: 2.

<keras.callbacks.History at 0x7fe69a3baa00>

In [49]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
+---------+
|R: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |[34;1mB[0m: |
+---------

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: 

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: 

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: 

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: 

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|[3

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|R: 

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)
+---------+
|R: 

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[3

<keras.callbacks.History at 0x7fe69a3ba790>

## Roulette

In [53]:
# Create the environment and reset it to the initial state
env = gym.make("Roulette-v0")
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Complex Neural Network for DQN, SARSA
CD_model = Sequential()
CD_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(16))
CD_model.add(Activation('relu'))
CD_model.add(Dense(nb_actions))
CD_model.add(Activation('linear'))

# Boltzmann Q Policy
BQ_policy = BoltzmannQPolicy()

# Sequential Memory
S_memory = SequentialMemory(limit=50000, window_length=1)

In [54]:
dqn = DQNAgent(model=CD_model, nb_actions=nb_actions, memory=S_memory, nb_steps_warmup=100, target_model_update=1e-2, policy=BQ_policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


     7/50000: episode: 1, duration: 0.521s, episode steps:   7, steps per second:  13, episode reward: -2.000, mean reward: -0.286 [-1.000,  1.000], mean action: 20.571 [8.000, 37.000],  loss: --, mae: --, mean_q: --
    47/50000: episode: 2, duration: 0.030s, episode steps:  40, steps per second: 1340, episode reward:  7.000, mean reward:  0.175 [-1.000,  1.000], mean action: 17.950 [2.000, 37.000],  loss: --, mae: --, mean_q: --
    90/50000: episode: 3, duration: 0.032s, episode steps:  43, steps per second: 1336, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 19.070 [0.000, 37.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


   117/50000: episode: 4, duration: 1.897s, episode steps:  27, steps per second:  14, episode reward:  2.000, mean reward:  0.074 [-1.000,  1.000], mean action: 17.741 [0.000, 37.000],  loss: 0.485328, mae: 0.028672, mean_q: 0.006086
   122/50000: episode: 5, duration: 0.034s, episode steps:   5, steps per second: 145, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 21.600 [7.000, 37.000],  loss: 0.485574, mae: 0.033507, mean_q: 0.015416
   130/50000: episode: 6, duration: 0.051s, episode steps:   8, steps per second: 156, episode reward: -1.000, mean reward: -0.125 [-1.000,  1.000], mean action: 23.750 [6.000, 37.000],  loss: 0.479059, mae: 0.036241, mean_q: 0.021380
   198/50000: episode: 7, duration: 0.391s, episode steps:  68, steps per second: 174, episode reward: -9.000, mean reward: -0.132 [-1.000,  1.000], mean action: 18.441 [0.000, 37.000],  loss: 0.472372, mae: 0.048472, mean_q: 0.047506
   215/50000: episode: 8, duration: 0.099s, episode steps:  

  1333/50000: episode: 39, duration: 0.526s, episode steps:  90, steps per second: 171, episode reward: 11.000, mean reward:  0.122 [-1.000,  1.000], mean action: 18.567 [0.000, 37.000],  loss: 0.733445, mae: 0.230873, mean_q: 0.497998
  1433/50000: episode: 40, duration: 0.600s, episode steps: 100, steps per second: 167, episode reward: -2.000, mean reward: -0.020 [-1.000,  1.000], mean action: 20.220 [0.000, 36.000],  loss: 1.119097, mae: 0.248843, mean_q: 0.523875
  1438/50000: episode: 41, duration: 0.041s, episode steps:   5, steps per second: 121, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 18.600 [5.000, 37.000],  loss: 4.663131, mae: 0.264188, mean_q: 0.541516
  1517/50000: episode: 42, duration: 0.500s, episode steps:  79, steps per second: 158, episode reward: -12.000, mean reward: -0.152 [-1.000,  1.000], mean action: 17.810 [1.000, 37.000],  loss: 0.757116, mae: 0.265277, mean_q: 0.554143
  1549/50000: episode: 43, duration: 0.183s, episode st

  3329/50000: episode: 74, duration: 0.358s, episode steps:  62, steps per second: 173, episode reward: -15.000, mean reward: -0.242 [-1.000,  1.000], mean action: 20.371 [2.000, 37.000],  loss: 0.844432, mae: 0.607254, mean_q: 0.883779
  3354/50000: episode: 75, duration: 0.147s, episode steps:  25, steps per second: 170, episode reward:  4.000, mean reward:  0.160 [-1.000,  1.000], mean action: 16.760 [0.000, 37.000],  loss: 0.489134, mae: 0.615614, mean_q: 0.888515
  3392/50000: episode: 76, duration: 0.223s, episode steps:  38, steps per second: 170, episode reward:  7.000, mean reward:  0.184 [-1.000,  1.000], mean action: 18.816 [1.000, 37.000],  loss: 0.486714, mae: 0.620472, mean_q: 0.888068
  3492/50000: episode: 77, duration: 0.569s, episode steps: 100, steps per second: 176, episode reward: -2.000, mean reward: -0.020 [-1.000,  1.000], mean action: 17.640 [0.000, 36.000],  loss: 0.698074, mae: 0.631401, mean_q: 0.903439
  3592/50000: episode: 78, duration: 0.570s, episode st

  5740/50000: episode: 109, duration: 0.600s, episode steps: 100, steps per second: 167, episode reward: -10.000, mean reward: -0.100 [-1.000,  1.000], mean action: 18.460 [0.000, 36.000],  loss: 0.695969, mae: 0.909205, mean_q: 1.107297
  5836/50000: episode: 110, duration: 0.571s, episode steps:  96, steps per second: 168, episode reward: -7.000, mean reward: -0.073 [-1.000,  1.000], mean action: 17.510 [0.000, 37.000],  loss: 0.708194, mae: 0.918010, mean_q: 1.109317
  5886/50000: episode: 111, duration: 0.318s, episode steps:  50, steps per second: 157, episode reward: -7.000, mean reward: -0.140 [-1.000,  1.000], mean action: 17.160 [0.000, 37.000],  loss: 0.494519, mae: 0.923960, mean_q: 1.119688
  5986/50000: episode: 112, duration: 0.674s, episode steps: 100, steps per second: 148, episode reward: -12.000, mean reward: -0.120 [-1.000,  1.000], mean action: 17.980 [0.000, 36.000],  loss: 0.696756, mae: 0.928687, mean_q: 1.110535
  6086/50000: episode: 113, duration: 0.575s, epis

  8448/50000: episode: 144, duration: 0.576s, episode steps: 100, steps per second: 174, episode reward: -4.000, mean reward: -0.040 [-1.000,  1.000], mean action: 16.740 [0.000, 36.000],  loss: 0.708754, mae: 1.127749, mean_q: 1.326183
  8512/50000: episode: 145, duration: 0.372s, episode steps:  64, steps per second: 172, episode reward: -5.000, mean reward: -0.078 [-1.000,  1.000], mean action: 17.969 [1.000, 37.000],  loss: 1.481320, mae: 1.133777, mean_q: 1.342596
  8612/50000: episode: 146, duration: 0.576s, episode steps: 100, steps per second: 174, episode reward: -14.000, mean reward: -0.140 [-1.000,  1.000], mean action: 18.110 [1.000, 36.000],  loss: 1.126159, mae: 1.138198, mean_q: 1.349930
  8644/50000: episode: 147, duration: 0.195s, episode steps:  32, steps per second: 164, episode reward: -7.000, mean reward: -0.219 [-1.000,  1.000], mean action: 18.875 [0.000, 37.000],  loss: 0.495174, mae: 1.142061, mean_q: 1.353413
  8724/50000: episode: 148, duration: 0.476s, episo

 11061/50000: episode: 179, duration: 0.606s, episode steps: 100, steps per second: 165, episode reward: 12.000, mean reward:  0.120 [-1.000,  1.000], mean action: 19.720 [1.000, 36.000],  loss: 0.495003, mae: 1.357636, mean_q: 1.588710
 11161/50000: episode: 180, duration: 0.598s, episode steps: 100, steps per second: 167, episode reward: -16.000, mean reward: -0.160 [-1.000,  1.000], mean action: 17.770 [0.000, 36.000],  loss: 0.501512, mae: 1.365465, mean_q: 1.603818
 11227/50000: episode: 181, duration: 0.476s, episode steps:  66, steps per second: 139, episode reward: -17.000, mean reward: -0.258 [-1.000,  1.000], mean action: 19.818 [0.000, 37.000],  loss: 0.819046, mae: 1.373986, mean_q: 1.606990
 11327/50000: episode: 182, duration: 0.666s, episode steps: 100, steps per second: 150, episode reward: -8.000, mean reward: -0.080 [-1.000,  1.000], mean action: 17.560 [0.000, 36.000],  loss: 0.496797, mae: 1.379809, mean_q: 1.607665
 11409/50000: episode: 183, duration: 0.486s, epis

 13775/50000: episode: 214, duration: 0.583s, episode steps: 100, steps per second: 171, episode reward: -6.000, mean reward: -0.060 [-1.000,  1.000], mean action: 20.820 [1.000, 36.000],  loss: 0.496770, mae: 1.560336, mean_q: 1.760649
 13875/50000: episode: 215, duration: 0.584s, episode steps: 100, steps per second: 171, episode reward: -8.000, mean reward: -0.080 [-1.000,  1.000], mean action: 18.710 [1.000, 36.000],  loss: 0.704427, mae: 1.562714, mean_q: 1.760864
 13975/50000: episode: 216, duration: 0.612s, episode steps: 100, steps per second: 163, episode reward:  4.000, mean reward:  0.040 [-1.000,  1.000], mean action: 17.050 [1.000, 36.000],  loss: 0.714172, mae: 1.567259, mean_q: 1.770627
 14075/50000: episode: 217, duration: 0.601s, episode steps: 100, steps per second: 166, episode reward:  4.000, mean reward:  0.040 [-1.000,  1.000], mean action: 19.920 [1.000, 36.000],  loss: 1.135584, mae: 1.572985, mean_q: 1.788957
 14175/50000: episode: 218, duration: 0.609s, episod

 16516/50000: episode: 250, duration: 0.103s, episode steps:   7, steps per second:  68, episode reward:  2.000, mean reward:  0.286 [-1.000,  1.000], mean action: 24.000 [14.000, 37.000],  loss: 0.497095, mae: 1.717655, mean_q: 1.893322
 16616/50000: episode: 251, duration: 1.190s, episode steps: 100, steps per second:  84, episode reward:  8.000, mean reward:  0.080 [-1.000,  1.000], mean action: 20.320 [0.000, 36.000],  loss: 0.499561, mae: 1.717838, mean_q: 1.895601
 16716/50000: episode: 252, duration: 1.196s, episode steps: 100, steps per second:  84, episode reward: -10.000, mean reward: -0.100 [-1.000,  1.000], mean action: 19.050 [0.000, 36.000],  loss: 0.503277, mae: 1.719669, mean_q: 1.893147
 16816/50000: episode: 253, duration: 1.191s, episode steps: 100, steps per second:  84, episode reward:  2.000, mean reward:  0.020 [-1.000,  1.000], mean action: 20.060 [0.000, 36.000],  loss: 0.717643, mae: 1.720493, mean_q: 1.892837
 16868/50000: episode: 254, duration: 0.631s, epis

 19535/50000: episode: 285, duration: 0.485s, episode steps:  39, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 19.179 [2.000, 37.000],  loss: 1.048471, mae: 1.723696, mean_q: 1.864921
 19635/50000: episode: 286, duration: 1.205s, episode steps: 100, steps per second:  83, episode reward: -6.000, mean reward: -0.060 [-1.000,  1.000], mean action: 17.050 [0.000, 36.000],  loss: 0.921980, mae: 1.724815, mean_q: 1.870423
 19735/50000: episode: 287, duration: 1.196s, episode steps: 100, steps per second:  84, episode reward: -2.000, mean reward: -0.020 [-1.000,  1.000], mean action: 17.980 [1.000, 35.000],  loss: 0.507679, mae: 1.723826, mean_q: 1.877504
 19835/50000: episode: 288, duration: 1.194s, episode steps: 100, steps per second:  84, episode reward:  4.000, mean reward:  0.040 [-1.000,  1.000], mean action: 20.250 [0.000, 36.000],  loss: 0.509084, mae: 1.724358, mean_q: 1.883993
 19935/50000: episode: 289, duration: 1.197s, episod

 22551/50000: episode: 320, duration: 1.205s, episode steps: 100, steps per second:  83, episode reward:  4.000, mean reward:  0.040 [-1.000,  1.000], mean action: 18.490 [1.000, 36.000],  loss: 0.513384, mae: 1.821989, mean_q: 2.049634
 22625/50000: episode: 321, duration: 0.897s, episode steps:  74, steps per second:  82, episode reward: -11.000, mean reward: -0.149 [-1.000,  1.000], mean action: 19.932 [0.000, 37.000],  loss: 0.503113, mae: 1.826504, mean_q: 2.041167
 22725/50000: episode: 322, duration: 1.207s, episode steps: 100, steps per second:  83, episode reward: -6.000, mean reward: -0.060 [-1.000,  1.000], mean action: 18.340 [1.000, 36.000],  loss: 0.931173, mae: 1.832850, mean_q: 2.043111
 22748/50000: episode: 323, duration: 0.296s, episode steps:  23, steps per second:  78, episode reward: -2.000, mean reward: -0.087 [-1.000,  1.000], mean action: 22.043 [2.000, 37.000],  loss: 0.501355, mae: 1.834953, mean_q: 2.043576
 22779/50000: episode: 324, duration: 0.385s, episo

 25559/50000: episode: 355, duration: 1.208s, episode steps: 100, steps per second:  83, episode reward: -24.000, mean reward: -0.240 [-1.000,  1.000], mean action: 17.370 [0.000, 36.000],  loss: 0.513654, mae: 1.931684, mean_q: 2.145947
 25659/50000: episode: 356, duration: 1.208s, episode steps: 100, steps per second:  83, episode reward: -4.000, mean reward: -0.040 [-1.000,  1.000], mean action: 18.040 [0.000, 36.000],  loss: 0.714177, mae: 1.937458, mean_q: 2.147753
 25693/50000: episode: 357, duration: 0.425s, episode steps:  34, steps per second:  80, episode reward: -1.000, mean reward: -0.029 [-1.000,  1.000], mean action: 19.294 [1.000, 37.000],  loss: 0.518326, mae: 1.939619, mean_q: 2.149633
 25704/50000: episode: 358, duration: 0.142s, episode steps:  11, steps per second:  77, episode reward:  4.000, mean reward:  0.364 [-1.000,  1.000], mean action: 27.364 [8.000, 37.000],  loss: 0.491442, mae: 1.940172, mean_q: 2.153050
 25804/50000: episode: 359, duration: 1.204s, episo

 28695/50000: episode: 390, duration: 1.310s, episode steps: 100, steps per second:  76, episode reward: 18.000, mean reward:  0.180 [-1.000,  1.000], mean action: 16.050 [0.000, 35.000],  loss: 0.505023, mae: 2.044721, mean_q: 2.231536
 28795/50000: episode: 391, duration: 1.220s, episode steps: 100, steps per second:  82, episode reward: -14.000, mean reward: -0.140 [-1.000,  1.000], mean action: 17.670 [0.000, 35.000],  loss: 0.714604, mae: 2.046653, mean_q: 2.234587
 28863/50000: episode: 392, duration: 0.833s, episode steps:  68, steps per second:  82, episode reward:  1.000, mean reward:  0.015 [-1.000,  1.000], mean action: 20.544 [0.000, 37.000],  loss: 1.433593, mae: 2.049597, mean_q: 2.242629
 28963/50000: episode: 393, duration: 1.215s, episode steps: 100, steps per second:  82, episode reward: -20.000, mean reward: -0.200 [-1.000,  1.000], mean action: 18.360 [0.000, 36.000],  loss: 0.719710, mae: 2.049534, mean_q: 2.243866
 29063/50000: episode: 394, duration: 1.215s, epis

 31598/50000: episode: 425, duration: 1.254s, episode steps: 100, steps per second:  80, episode reward: 21.000, mean reward:  0.210 [-1.000, 36.000], mean action: 16.650 [0.000, 36.000],  loss: 1.133982, mae: 2.090260, mean_q: 2.252120
 31618/50000: episode: 426, duration: 0.260s, episode steps:  20, steps per second:  77, episode reward: 32.000, mean reward:  1.600 [-1.000, 36.000], mean action: 18.700 [0.000, 37.000],  loss: 0.526306, mae: 2.088499, mean_q: 2.248194
 31718/50000: episode: 427, duration: 1.219s, episode steps: 100, steps per second:  82, episode reward:  2.000, mean reward:  0.020 [-1.000,  1.000], mean action: 17.020 [1.000, 36.000],  loss: 0.724563, mae: 2.085856, mean_q: 2.249848
 31761/50000: episode: 428, duration: 0.537s, episode steps:  43, steps per second:  80, episode reward:  2.000, mean reward:  0.047 [-1.000,  1.000], mean action: 20.535 [1.000, 37.000],  loss: 0.524877, mae: 2.084145, mean_q: 2.256690
 31842/50000: episode: 429, duration: 1.030s, episod

 34423/50000: episode: 460, duration: 1.255s, episode steps: 100, steps per second:  80, episode reward:  4.000, mean reward:  0.040 [-1.000,  1.000], mean action: 18.990 [0.000, 36.000],  loss: 0.921960, mae: 2.129062, mean_q: 2.312829
 34523/50000: episode: 461, duration: 1.225s, episode steps: 100, steps per second:  82, episode reward: -18.000, mean reward: -0.180 [-1.000,  1.000], mean action: 18.000 [0.000, 36.000],  loss: 0.718562, mae: 2.130076, mean_q: 2.318004
 34623/50000: episode: 462, duration: 1.225s, episode steps: 100, steps per second:  82, episode reward:  8.000, mean reward:  0.080 [-1.000,  1.000], mean action: 17.040 [0.000, 35.000],  loss: 0.730141, mae: 2.134229, mean_q: 2.328695
 34723/50000: episode: 463, duration: 1.239s, episode steps: 100, steps per second:  81, episode reward: -2.000, mean reward: -0.020 [-1.000,  1.000], mean action: 18.690 [0.000, 36.000],  loss: 1.346847, mae: 2.138036, mean_q: 2.330814
 34823/50000: episode: 464, duration: 1.227s, episo

 37298/50000: episode: 495, duration: 0.799s, episode steps:  64, steps per second:  80, episode reward: -11.000, mean reward: -0.172 [-1.000,  1.000], mean action: 18.875 [0.000, 37.000],  loss: 0.821763, mae: 2.184775, mean_q: 2.372266
 37398/50000: episode: 496, duration: 1.234s, episode steps: 100, steps per second:  81, episode reward: -26.000, mean reward: -0.260 [-1.000,  1.000], mean action: 19.030 [0.000, 36.000],  loss: 0.923124, mae: 2.185327, mean_q: 2.371370
 37498/50000: episode: 497, duration: 1.234s, episode steps: 100, steps per second:  81, episode reward: 19.000, mean reward:  0.190 [-1.000, 36.000], mean action: 19.070 [0.000, 36.000],  loss: 0.513420, mae: 2.186928, mean_q: 2.376266
 37598/50000: episode: 498, duration: 1.240s, episode steps: 100, steps per second:  81, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 19.470 [1.000, 36.000],  loss: 0.716688, mae: 2.188260, mean_q: 2.380855
 37642/50000: episode: 499, duration: 0.556s, epis

 40158/50000: episode: 531, duration: 0.026s, episode steps:   1, steps per second:  39, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 37.000 [37.000, 37.000],  loss: 0.488312, mae: 2.215705, mean_q: 2.378824
 40258/50000: episode: 532, duration: 1.258s, episode steps: 100, steps per second:  79, episode reward: -12.000, mean reward: -0.120 [-1.000,  1.000], mean action: 18.400 [0.000, 36.000],  loss: 0.922294, mae: 2.216534, mean_q: 2.388005
 40358/50000: episode: 533, duration: 1.246s, episode steps: 100, steps per second:  80, episode reward: -18.000, mean reward: -0.180 [-1.000,  1.000], mean action: 18.580 [0.000, 36.000],  loss: 0.519095, mae: 2.212683, mean_q: 2.380848
 40458/50000: episode: 534, duration: 1.243s, episode steps: 100, steps per second:  80, episode reward: -6.000, mean reward: -0.060 [-1.000,  1.000], mean action: 16.710 [0.000, 36.000],  loss: 0.723376, mae: 2.212133, mean_q: 2.381513
 40492/50000: episode: 535, duration: 0.436s, epi

 43343/50000: episode: 566, duration: 1.244s, episode steps: 100, steps per second:  80, episode reward: -4.000, mean reward: -0.040 [-1.000,  1.000], mean action: 18.000 [0.000, 36.000],  loss: 0.518296, mae: 2.210508, mean_q: 2.375360
 43443/50000: episode: 567, duration: 1.247s, episode steps: 100, steps per second:  80, episode reward: -4.000, mean reward: -0.040 [-1.000,  1.000], mean action: 18.040 [0.000, 36.000],  loss: 0.513692, mae: 2.210317, mean_q: 2.368148
 43543/50000: episode: 568, duration: 1.251s, episode steps: 100, steps per second:  80, episode reward: -16.000, mean reward: -0.160 [-1.000,  1.000], mean action: 17.990 [0.000, 36.000],  loss: 0.722480, mae: 2.210497, mean_q: 2.372429
 43643/50000: episode: 569, duration: 1.248s, episode steps: 100, steps per second:  80, episode reward: -2.000, mean reward: -0.020 [-1.000,  1.000], mean action: 19.400 [0.000, 36.000],  loss: 0.919754, mae: 2.210685, mean_q: 2.375262
 43743/50000: episode: 570, duration: 1.299s, episo

 46491/50000: episode: 601, duration: 1.259s, episode steps: 100, steps per second:  79, episode reward: -22.000, mean reward: -0.220 [-1.000,  1.000], mean action: 17.670 [0.000, 36.000],  loss: 0.517170, mae: 2.181828, mean_q: 2.334739
 46591/50000: episode: 602, duration: 1.254s, episode steps: 100, steps per second:  80, episode reward:  0.000, mean reward:  0.000 [-1.000,  1.000], mean action: 17.310 [0.000, 36.000],  loss: 0.922595, mae: 2.184266, mean_q: 2.334244
 46691/50000: episode: 603, duration: 1.262s, episode steps: 100, steps per second:  79, episode reward: -26.000, mean reward: -0.260 [-1.000,  1.000], mean action: 20.050 [0.000, 36.000],  loss: 0.920961, mae: 2.184155, mean_q: 2.332712
 46791/50000: episode: 604, duration: 1.260s, episode steps: 100, steps per second:  79, episode reward:  2.000, mean reward:  0.020 [-1.000,  1.000], mean action: 18.990 [0.000, 36.000],  loss: 0.724636, mae: 2.185110, mean_q: 2.337565
 46891/50000: episode: 605, duration: 1.256s, epis

 49525/50000: episode: 637, duration: 1.270s, episode steps: 100, steps per second:  79, episode reward:  2.000, mean reward:  0.020 [-1.000,  1.000], mean action: 16.660 [0.000, 36.000],  loss: 0.514085, mae: 2.163467, mean_q: 2.359917
 49602/50000: episode: 638, duration: 0.975s, episode steps:  77, steps per second:  79, episode reward:  2.000, mean reward:  0.026 [-1.000,  1.000], mean action: 18.234 [0.000, 37.000],  loss: 0.774597, mae: 2.165223, mean_q: 2.361362
 49702/50000: episode: 639, duration: 1.274s, episode steps: 100, steps per second:  78, episode reward: -8.000, mean reward: -0.080 [-1.000,  1.000], mean action: 17.570 [0.000, 36.000],  loss: 0.925892, mae: 2.166768, mean_q: 2.366888
 49802/50000: episode: 640, duration: 1.275s, episode steps: 100, steps per second:  78, episode reward: -22.000, mean reward: -0.220 [-1.000,  1.000], mean action: 19.370 [0.000, 36.000],  loss: 0.921249, mae: 2.169911, mean_q: 2.357515
 49902/50000: episode: 641, duration: 1.266s, episo

<keras.callbacks.History at 0x7fe699fbfca0>

In [55]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...


NotImplementedError: 