### Instalando e importando as bibliotecas necessárias

In [1]:
!pip install -rq 'requirements.txt'

In [2]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

### Criando o ambiente e extraindo o número de inputs

In [3]:
env = gym.make('CartPole-v0')
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
print('Número de inputs: {}'.format(nb_actions))

Número de inputs: 2


### Testando o ambiente com inputs randômicos

In [4]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        # env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episódio: {} Score: {}'.format(episode, score))
env.close()

Episódio: 1 Score: 26.0
Episódio: 2 Score: 16.0
Episódio: 3 Score: 32.0
Episódio: 4 Score: 28.0
Episódio: 5 Score: 14.0
Episódio: 6 Score: 29.0
Episódio: 7 Score: 45.0
Episódio: 8 Score: 16.0
Episódio: 9 Score: 17.0
Episódio: 10 Score: 23.0


### Construindo da model

In [5]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(units = 128, activation = 'relu'))
model.add(Dense(units = 64, activation = 'relu'))
model.add(Dense(units = 32, activation = 'relu'))
model.add(Dense(units = nb_actions, activation = 'linear'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               640       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 66        
Total params: 11,042
Trainable params: 11,042
Non-trainable params: 0
_________________________________________________________________
None


### Configurando e compilando o Agent

In [6]:
memory = SequentialMemory(limit=50000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.2, nb_steps=1000)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-4), metrics=['mae'])

In [7]:
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2, nb_max_start_steps=750)

Training for 10000 steps ...




   15/10000: episode: 1, duration: 0.596s, episode steps:  15, steps per second:  25, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.550408, mae: 0.570810, mean_q: 0.126588, mean_eps: 0.988750
   29/10000: episode: 2, duration: 0.128s, episode steps:  14, steps per second: 110, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.643 [0.000, 1.000],  loss: 0.509433, mae: 0.564630, mean_q: 0.148011, mean_eps: 0.980650




   44/10000: episode: 3, duration: 0.139s, episode steps:  15, steps per second: 108, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.467 [0.000, 1.000],  loss: 0.455749, mae: 0.564539, mean_q: 0.203650, mean_eps: 0.967600
   57/10000: episode: 4, duration: 0.106s, episode steps:  13, steps per second: 123, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.692 [0.000, 1.000],  loss: 0.426414, mae: 0.570471, mean_q: 0.243204, mean_eps: 0.955000




  116/10000: episode: 5, duration: 0.469s, episode steps:  59, steps per second: 126, episode reward: 59.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.441 [0.000, 1.000],  loss: 0.354764, mae: 0.588402, mean_q: 0.369936, mean_eps: 0.922600
  138/10000: episode: 6, duration: 0.168s, episode steps:  22, steps per second: 131, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.318 [0.000, 1.000],  loss: 0.286414, mae: 0.625357, mean_q: 0.566469, mean_eps: 0.886150




  198/10000: episode: 7, duration: 0.487s, episode steps:  60, steps per second: 123, episode reward: 60.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.417 [0.000, 1.000],  loss: 0.211801, mae: 0.700658, mean_q: 0.867886, mean_eps: 0.849250
  212/10000: episode: 8, duration: 0.121s, episode steps:  14, steps per second: 116, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.429 [0.000, 1.000],  loss: 0.178881, mae: 0.812847, mean_q: 1.204293, mean_eps: 0.815950




  236/10000: episode: 9, duration: 0.199s, episode steps:  24, steps per second: 121, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 0.154352, mae: 0.879125, mean_q: 1.371571, mean_eps: 0.798850
  245/10000: episode: 10, duration: 0.070s, episode steps:   9, steps per second: 129, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.162400, mae: 0.910286, mean_q: 1.453482, mean_eps: 0.784000
  258/10000: episode: 11, duration: 0.105s, episode steps:  13, steps per second: 124, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.308 [0.000, 1.000],  loss: 0.191132, mae: 0.979884, mean_q: 1.561761, mean_eps: 0.774100




  292/10000: episode: 12, duration: 0.303s, episode steps:  34, steps per second: 112, episode reward: 34.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 0.198326, mae: 1.026635, mean_q: 1.655255, mean_eps: 0.752950
  307/10000: episode: 13, duration: 0.120s, episode steps:  15, steps per second: 125, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 0.197241, mae: 1.106158, mean_q: 1.837601, mean_eps: 0.730900




  324/10000: episode: 14, duration: 0.147s, episode steps:  17, steps per second: 116, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.353 [0.000, 1.000],  loss: 0.211757, mae: 1.162770, mean_q: 1.905869, mean_eps: 0.716500
  335/10000: episode: 15, duration: 0.086s, episode steps:  11, steps per second: 128, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.182 [0.000, 1.000],  loss: 0.223867, mae: 1.223361, mean_q: 2.001588, mean_eps: 0.703900




  349/10000: episode: 16, duration: 0.133s, episode steps:  14, steps per second: 105, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.714 [0.000, 1.000],  loss: 0.246197, mae: 1.270165, mean_q: 2.123732, mean_eps: 0.692650




  381/10000: episode: 17, duration: 0.235s, episode steps:  32, steps per second: 136, episode reward: 32.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.625 [0.000, 1.000],  loss: 0.233437, mae: 1.332418, mean_q: 2.282009, mean_eps: 0.671950
  403/10000: episode: 18, duration: 0.173s, episode steps:  22, steps per second: 127, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.409 [0.000, 1.000],  loss: 0.302931, mae: 1.445320, mean_q: 2.476566, mean_eps: 0.647650




  431/10000: episode: 19, duration: 0.232s, episode steps:  28, steps per second: 121, episode reward: 28.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.464 [0.000, 1.000],  loss: 0.305433, mae: 1.512276, mean_q: 2.632460, mean_eps: 0.625150
  449/10000: episode: 20, duration: 0.132s, episode steps:  18, steps per second: 136, episode reward: 18.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.269105, mae: 1.559503, mean_q: 2.770006, mean_eps: 0.604450




  471/10000: episode: 21, duration: 0.183s, episode steps:  22, steps per second: 120, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.364 [0.000, 1.000],  loss: 0.191268, mae: 1.641320, mean_q: 3.009543, mean_eps: 0.586450
  491/10000: episode: 22, duration: 0.163s, episode steps:  20, steps per second: 123, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.350 [0.000, 1.000],  loss: 0.383719, mae: 1.792131, mean_q: 3.271623, mean_eps: 0.567550




  519/10000: episode: 23, duration: 0.222s, episode steps:  28, steps per second: 126, episode reward: 28.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.393 [0.000, 1.000],  loss: 0.300213, mae: 1.810313, mean_q: 3.304564, mean_eps: 0.545950
  533/10000: episode: 24, duration: 0.120s, episode steps:  14, steps per second: 116, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.214 [0.000, 1.000],  loss: 0.232144, mae: 1.859223, mean_q: 3.470540, mean_eps: 0.527050




  546/10000: episode: 25, duration: 0.127s, episode steps:  13, steps per second: 102, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.321832, mae: 1.963196, mean_q: 3.635914, mean_eps: 0.514900
  556/10000: episode: 26, duration: 0.084s, episode steps:  10, steps per second: 119, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.200 [0.000, 1.000],  loss: 0.408840, mae: 2.022514, mean_q: 3.690509, mean_eps: 0.504550




  576/10000: episode: 27, duration: 0.165s, episode steps:  20, steps per second: 121, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.650 [0.000, 1.000],  loss: 0.354544, mae: 2.056123, mean_q: 3.735801, mean_eps: 0.491050
  587/10000: episode: 28, duration: 0.085s, episode steps:  11, steps per second: 129, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.909 [0.000, 1.000],  loss: 0.447966, mae: 2.146597, mean_q: 3.964112, mean_eps: 0.477100




  603/10000: episode: 29, duration: 0.132s, episode steps:  16, steps per second: 121, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.375 [0.000, 1.000],  loss: 0.396503, mae: 2.206468, mean_q: 4.072680, mean_eps: 0.464950
  615/10000: episode: 30, duration: 0.095s, episode steps:  12, steps per second: 126, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 0.501646, mae: 2.285610, mean_q: 4.180488, mean_eps: 0.452350
  626/10000: episode: 31, duration: 0.085s, episode steps:  11, steps per second: 129, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.273 [0.000, 1.000],  loss: 0.531277, mae: 2.310365, mean_q: 4.293892, mean_eps: 0.442000




  638/10000: episode: 32, duration: 0.100s, episode steps:  12, steps per second: 120, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.250 [0.000, 1.000],  loss: 0.391246, mae: 2.329277, mean_q: 4.370004, mean_eps: 0.431650
  648/10000: episode: 33, duration: 0.107s, episode steps:  10, steps per second:  94, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.900 [0.000, 1.000],  loss: 0.372561, mae: 2.330930, mean_q: 4.387678, mean_eps: 0.421750
  657/10000: episode: 34, duration: 0.084s, episode steps:   9, steps per second: 108, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.472146, mae: 2.429656, mean_q: 4.581276, mean_eps: 0.413200




  672/10000: episode: 35, duration: 0.121s, episode steps:  15, steps per second: 124, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.452534, mae: 2.454247, mean_q: 4.573493, mean_eps: 0.402400
  684/10000: episode: 36, duration: 0.100s, episode steps:  12, steps per second: 119, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 0.528455, mae: 2.511452, mean_q: 4.666749, mean_eps: 0.390250




  695/10000: episode: 37, duration: 0.099s, episode steps:  11, steps per second: 112, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.364 [0.000, 1.000],  loss: 0.328613, mae: 2.504740, mean_q: 4.714504, mean_eps: 0.379900
  711/10000: episode: 38, duration: 0.140s, episode steps:  16, steps per second: 114, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.375 [0.000, 1.000],  loss: 0.569825, mae: 2.617785, mean_q: 4.845760, mean_eps: 0.367750




  721/10000: episode: 39, duration: 0.098s, episode steps:  10, steps per second: 102, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.200 [0.000, 1.000],  loss: 0.464471, mae: 2.649320, mean_q: 4.990965, mean_eps: 0.356050
  736/10000: episode: 40, duration: 0.116s, episode steps:  15, steps per second: 130, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.267 [0.000, 1.000],  loss: 0.584704, mae: 2.719198, mean_q: 5.115554, mean_eps: 0.344800
  747/10000: episode: 41, duration: 0.083s, episode steps:  11, steps per second: 132, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.273 [0.000, 1.000],  loss: 0.541751, mae: 2.753336, mean_q: 5.112041, mean_eps: 0.333100




  760/10000: episode: 42, duration: 0.112s, episode steps:  13, steps per second: 116, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.558098, mae: 2.775146, mean_q: 5.166465, mean_eps: 0.322300
  774/10000: episode: 43, duration: 0.108s, episode steps:  14, steps per second: 129, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.214 [0.000, 1.000],  loss: 0.691395, mae: 2.839397, mean_q: 5.363215, mean_eps: 0.310150
  783/10000: episode: 44, duration: 0.088s, episode steps:   9, steps per second: 103, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.523416, mae: 2.820479, mean_q: 5.350177, mean_eps: 0.299800




  796/10000: episode: 45, duration: 0.117s, episode steps:  13, steps per second: 111, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.651602, mae: 2.905546, mean_q: 5.435626, mean_eps: 0.289900
  808/10000: episode: 46, duration: 0.105s, episode steps:  12, steps per second: 115, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.250 [0.000, 1.000],  loss: 0.461908, mae: 2.885926, mean_q: 5.447442, mean_eps: 0.278650
  816/10000: episode: 47, duration: 0.069s, episode steps:   8, steps per second: 117, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.629244, mae: 2.985252, mean_q: 5.678307, mean_eps: 0.269650




  826/10000: episode: 48, duration: 0.103s, episode steps:  10, steps per second:  97, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.486374, mae: 2.953268, mean_q: 5.659924, mean_eps: 0.261550
  837/10000: episode: 49, duration: 0.089s, episode steps:  11, steps per second: 124, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.182 [0.000, 1.000],  loss: 0.899967, mae: 3.086921, mean_q: 5.720178, mean_eps: 0.252100
  850/10000: episode: 50, duration: 0.101s, episode steps:  13, steps per second: 129, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.455909, mae: 3.000664, mean_q: 5.609913, mean_eps: 0.241300




  864/10000: episode: 51, duration: 0.121s, episode steps:  14, steps per second: 116, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.214 [0.000, 1.000],  loss: 0.554901, mae: 3.095549, mean_q: 5.893956, mean_eps: 0.229150
  876/10000: episode: 52, duration: 0.095s, episode steps:  12, steps per second: 127, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.250 [0.000, 1.000],  loss: 0.493648, mae: 3.101826, mean_q: 5.872997, mean_eps: 0.217450
  885/10000: episode: 53, duration: 0.082s, episode steps:   9, steps per second: 110, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.447739, mae: 3.137059, mean_q: 5.984960, mean_eps: 0.208000




  898/10000: episode: 54, duration: 0.115s, episode steps:  13, steps per second: 113, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.685794, mae: 3.218898, mean_q: 6.005444, mean_eps: 0.198100
  909/10000: episode: 55, duration: 0.087s, episode steps:  11, steps per second: 127, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.273 [0.000, 1.000],  loss: 0.471038, mae: 3.203130, mean_q: 5.925240, mean_eps: 0.187300
  918/10000: episode: 56, duration: 0.078s, episode steps:   9, steps per second: 116, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.275816, mae: 3.237236, mean_q: 6.213928, mean_eps: 0.178300




  927/10000: episode: 57, duration: 0.092s, episode steps:   9, steps per second:  98, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.583083, mae: 3.334017, mean_q: 6.376501, mean_eps: 0.170200
  938/10000: episode: 58, duration: 0.093s, episode steps:  11, steps per second: 119, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.182 [0.000, 1.000],  loss: 0.485730, mae: 3.324188, mean_q: 6.284179, mean_eps: 0.161200
  947/10000: episode: 59, duration: 0.076s, episode steps:   9, steps per second: 118, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.600570, mae: 3.357682, mean_q: 6.218055, mean_eps: 0.152200




  962/10000: episode: 60, duration: 0.135s, episode steps:  15, steps per second: 112, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.267 [0.000, 1.000],  loss: 0.759603, mae: 3.408537, mean_q: 6.328702, mean_eps: 0.141400
  971/10000: episode: 61, duration: 0.073s, episode steps:   9, steps per second: 124, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.424333, mae: 3.372785, mean_q: 6.430548, mean_eps: 0.130600
  981/10000: episode: 62, duration: 0.080s, episode steps:  10, steps per second: 125, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.200 [0.000, 1.000],  loss: 0.508356, mae: 3.406368, mean_q: 6.401218, mean_eps: 0.122050




  990/10000: episode: 63, duration: 0.090s, episode steps:   9, steps per second: 100, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.407354, mae: 3.405975, mean_q: 6.383804, mean_eps: 0.113500
 1000/10000: episode: 64, duration: 0.082s, episode steps:  10, steps per second: 122, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.200 [0.000, 1.000],  loss: 0.640068, mae: 3.486988, mean_q: 6.551632, mean_eps: 0.104950
 1012/10000: episode: 65, duration: 0.101s, episode steps:  12, steps per second: 119, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.250 [0.000, 1.000],  loss: 0.564555, mae: 3.568923, mean_q: 6.771956, mean_eps: 0.100000




 1025/10000: episode: 66, duration: 0.116s, episode steps:  13, steps per second: 112, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.308 [0.000, 1.000],  loss: 0.416978, mae: 3.519572, mean_q: 6.693887, mean_eps: 0.100000
 1035/10000: episode: 67, duration: 0.085s, episode steps:  10, steps per second: 118, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.300 [0.000, 1.000],  loss: 0.593143, mae: 3.550358, mean_q: 6.614016, mean_eps: 0.100000
 1048/10000: episode: 68, duration: 0.106s, episode steps:  13, steps per second: 123, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.231 [0.000, 1.000],  loss: 0.305193, mae: 3.551809, mean_q: 6.771240, mean_eps: 0.100000




 1057/10000: episode: 69, duration: 0.083s, episode steps:   9, steps per second: 108, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.841016, mae: 3.688551, mean_q: 6.923104, mean_eps: 0.100000
 1068/10000: episode: 70, duration: 0.093s, episode steps:  11, steps per second: 118, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.182 [0.000, 1.000],  loss: 0.662628, mae: 3.574584, mean_q: 6.675770, mean_eps: 0.100000
 1077/10000: episode: 71, duration: 0.073s, episode steps:   9, steps per second: 123, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.403214, mae: 3.612972, mean_q: 6.855820, mean_eps: 0.100000




 1087/10000: episode: 72, duration: 0.090s, episode steps:  10, steps per second: 111, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.100 [0.000, 1.000],  loss: 0.686134, mae: 3.645021, mean_q: 6.828745, mean_eps: 0.100000
 1096/10000: episode: 73, duration: 0.076s, episode steps:   9, steps per second: 118, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.839857, mae: 3.718918, mean_q: 6.891359, mean_eps: 0.100000
 1104/10000: episode: 74, duration: 0.066s, episode steps:   8, steps per second: 122, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.125 [0.000, 1.000],  loss: 0.647392, mae: 3.617061, mean_q: 6.729765, mean_eps: 0.100000




 1112/10000: episode: 75, duration: 0.075s, episode steps:   8, steps per second: 107, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.988690, mae: 3.721391, mean_q: 6.821270, mean_eps: 0.100000
 1120/10000: episode: 76, duration: 0.064s, episode steps:   8, steps per second: 126, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.405446, mae: 3.616855, mean_q: 6.829693, mean_eps: 0.100000
 1134/10000: episode: 77, duration: 0.123s, episode steps:  14, steps per second: 114, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.214 [0.000, 1.000],  loss: 0.713923, mae: 3.711086, mean_q: 6.904474, mean_eps: 0.100000




 1143/10000: episode: 78, duration: 0.086s, episode steps:   9, steps per second: 105, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.111 [0.000, 1.000],  loss: 0.339465, mae: 3.670602, mean_q: 6.950347, mean_eps: 0.100000
 1153/10000: episode: 79, duration: 0.082s, episode steps:  10, steps per second: 121, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.612679, mae: 3.699564, mean_q: 6.917620, mean_eps: 0.100000
 1162/10000: episode: 80, duration: 0.074s, episode steps:   9, steps per second: 121, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.261124, mae: 3.802982, mean_q: 7.280033, mean_eps: 0.100000




 1174/10000: episode: 81, duration: 0.116s, episode steps:  12, steps per second: 104, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.917 [0.000, 1.000],  loss: 0.721874, mae: 3.959940, mean_q: 7.449677, mean_eps: 0.100000
 1190/10000: episode: 82, duration: 0.129s, episode steps:  16, steps per second: 124, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.938 [0.000, 1.000],  loss: 0.638302, mae: 3.915272, mean_q: 7.401385, mean_eps: 0.100000




 1200/10000: episode: 83, duration: 0.091s, episode steps:  10, steps per second: 109, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.978648, mae: 3.909077, mean_q: 7.283152, mean_eps: 0.100000
 1209/10000: episode: 84, duration: 0.084s, episode steps:   9, steps per second: 107, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.889 [0.000, 1.000],  loss: 0.856576, mae: 3.936450, mean_q: 7.283144, mean_eps: 0.100000
 1230/10000: episode: 85, duration: 0.181s, episode steps:  21, steps per second: 116, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.810 [0.000, 1.000],  loss: 0.699830, mae: 3.924407, mean_q: 7.276858, mean_eps: 0.100000
 1238/10000: episode: 86, duration: 0.074s, episode steps:   8, steps per second: 109, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 1.123535, mae: 3.982626, mean_q: 7.330472, mean_ep



 1256/10000: episode: 88, duration: 0.091s, episode steps:   9, steps per second:  99, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.889 [0.000, 1.000],  loss: 0.647194, mae: 4.075225, mean_q: 7.653717, mean_eps: 0.100000
 1266/10000: episode: 89, duration: 0.088s, episode steps:  10, steps per second: 114, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.796688, mae: 4.070616, mean_q: 7.593504, mean_eps: 0.100000
 1277/10000: episode: 90, duration: 0.086s, episode steps:  11, steps per second: 128, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.909 [0.000, 1.000],  loss: 0.938736, mae: 4.052046, mean_q: 7.591738, mean_eps: 0.100000




 1287/10000: episode: 91, duration: 0.104s, episode steps:  10, steps per second:  96, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.344898, mae: 4.048541, mean_q: 7.760190, mean_eps: 0.100000
 1300/10000: episode: 92, duration: 0.114s, episode steps:  13, steps per second: 114, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.769 [0.000, 1.000],  loss: 0.760844, mae: 4.150451, mean_q: 7.853861, mean_eps: 0.100000
 1309/10000: episode: 93, duration: 0.073s, episode steps:   9, steps per second: 124, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.517208, mae: 4.162268, mean_q: 7.914702, mean_eps: 0.100000




 1320/10000: episode: 94, duration: 0.102s, episode steps:  11, steps per second: 107, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.909 [0.000, 1.000],  loss: 0.723720, mae: 4.285139, mean_q: 8.093194, mean_eps: 0.100000
 1330/10000: episode: 95, duration: 0.085s, episode steps:  10, steps per second: 118, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.640570, mae: 4.198095, mean_q: 7.919419, mean_eps: 0.100000
 1339/10000: episode: 96, duration: 0.069s, episode steps:   9, steps per second: 130, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.709036, mae: 4.172623, mean_q: 7.826280, mean_eps: 0.100000




 1349/10000: episode: 97, duration: 0.091s, episode steps:  10, steps per second: 109, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.663262, mae: 4.254147, mean_q: 8.025628, mean_eps: 0.100000
 1363/10000: episode: 98, duration: 0.123s, episode steps:  14, steps per second: 114, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.857 [0.000, 1.000],  loss: 0.673189, mae: 4.409218, mean_q: 8.373262, mean_eps: 0.100000




 1373/10000: episode: 99, duration: 0.096s, episode steps:  10, steps per second: 104, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.822364, mae: 4.251898, mean_q: 7.996177, mean_eps: 0.100000
 1382/10000: episode: 100, duration: 0.076s, episode steps:   9, steps per second: 119, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.855019, mae: 4.435698, mean_q: 8.350040, mean_eps: 0.100000
 1393/10000: episode: 101, duration: 0.110s, episode steps:  11, steps per second: 100, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.909 [0.000, 1.000],  loss: 0.626325, mae: 4.335421, mean_q: 8.247470, mean_eps: 0.100000




 1403/10000: episode: 102, duration: 0.100s, episode steps:  10, steps per second: 100, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.639657, mae: 4.401718, mean_q: 8.388874, mean_eps: 0.100000
 1416/10000: episode: 103, duration: 0.107s, episode steps:  13, steps per second: 122, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.692 [0.000, 1.000],  loss: 0.895587, mae: 4.333741, mean_q: 8.129143, mean_eps: 0.100000




 1429/10000: episode: 104, duration: 0.117s, episode steps:  13, steps per second: 111, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.615 [0.000, 1.000],  loss: 0.895546, mae: 4.454349, mean_q: 8.289829, mean_eps: 0.100000




 1486/10000: episode: 105, duration: 0.448s, episode steps:  57, steps per second: 127, episode reward: 57.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.404 [0.000, 1.000],  loss: 0.789076, mae: 4.635746, mean_q: 8.714938, mean_eps: 0.100000
 1510/10000: episode: 106, duration: 0.191s, episode steps:  24, steps per second: 125, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 0.672339, mae: 4.665174, mean_q: 8.831748, mean_eps: 0.100000




 1571/10000: episode: 107, duration: 0.462s, episode steps:  61, steps per second: 132, episode reward: 61.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.377 [0.000, 1.000],  loss: 0.667490, mae: 4.830078, mean_q: 9.158312, mean_eps: 0.100000




 1723/10000: episode: 108, duration: 1.163s, episode steps: 152, steps per second: 131, episode reward: 152.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.467 [0.000, 1.000],  loss: 0.643601, mae: 5.167970, mean_q: 9.877136, mean_eps: 0.100000




 1923/10000: episode: 109, duration: 1.496s, episode steps: 200, steps per second: 134, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 0.668804, mae: 5.851282, mean_q: 11.325832, mean_eps: 0.100000




 2035/10000: episode: 110, duration: 0.844s, episode steps: 112, steps per second: 133, episode reward: 112.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.527 [0.000, 1.000],  loss: 0.803683, mae: 6.408784, mean_q: 12.450031, mean_eps: 0.100000




 2100/10000: episode: 111, duration: 0.506s, episode steps:  65, steps per second: 129, episode reward: 65.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.477 [0.000, 1.000],  loss: 1.124055, mae: 6.637097, mean_q: 12.853987, mean_eps: 0.100000




 2184/10000: episode: 112, duration: 0.667s, episode steps:  84, steps per second: 126, episode reward: 84.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.536 [0.000, 1.000],  loss: 1.013494, mae: 7.001010, mean_q: 13.630264, mean_eps: 0.100000




 2303/10000: episode: 113, duration: 0.922s, episode steps: 119, steps per second: 129, episode reward: 119.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.521 [0.000, 1.000],  loss: 1.136442, mae: 7.347481, mean_q: 14.371781, mean_eps: 0.100000
 2381/10000: episode: 114, duration: 0.597s, episode steps:  78, steps per second: 131, episode reward: 78.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.538 [0.000, 1.000],  loss: 1.507516, mae: 7.691809, mean_q: 15.090944, mean_eps: 0.100000




 2532/10000: episode: 115, duration: 1.187s, episode steps: 151, steps per second: 127, episode reward: 151.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.517 [0.000, 1.000],  loss: 1.269525, mae: 8.136377, mean_q: 16.002223, mean_eps: 0.100000




 2609/10000: episode: 116, duration: 0.570s, episode steps:  77, steps per second: 135, episode reward: 77.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.481 [0.000, 1.000],  loss: 1.207164, mae: 8.556207, mean_q: 16.923092, mean_eps: 0.100000




 2675/10000: episode: 117, duration: 0.528s, episode steps:  66, steps per second: 125, episode reward: 66.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.470 [0.000, 1.000],  loss: 1.427045, mae: 8.880727, mean_q: 17.519019, mean_eps: 0.100000




 2770/10000: episode: 118, duration: 0.747s, episode steps:  95, steps per second: 127, episode reward: 95.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 1.487164, mae: 9.151214, mean_q: 18.082785, mean_eps: 0.100000




 2834/10000: episode: 119, duration: 0.528s, episode steps:  64, steps per second: 121, episode reward: 64.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.484 [0.000, 1.000],  loss: 1.501166, mae: 9.449721, mean_q: 18.684688, mean_eps: 0.100000




 2888/10000: episode: 120, duration: 0.417s, episode steps:  54, steps per second: 129, episode reward: 54.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.463 [0.000, 1.000],  loss: 1.986966, mae: 9.708225, mean_q: 19.114668, mean_eps: 0.100000




 2938/10000: episode: 121, duration: 0.418s, episode steps:  50, steps per second: 120, episode reward: 50.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.460 [0.000, 1.000],  loss: 2.089882, mae: 9.876488, mean_q: 19.484758, mean_eps: 0.100000




 3007/10000: episode: 122, duration: 0.535s, episode steps:  69, steps per second: 129, episode reward: 69.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.536 [0.000, 1.000],  loss: 1.754804, mae: 10.111188, mean_q: 19.989660, mean_eps: 0.100000




 3086/10000: episode: 123, duration: 0.631s, episode steps:  79, steps per second: 125, episode reward: 79.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.468 [0.000, 1.000],  loss: 2.009837, mae: 10.294290, mean_q: 20.369549, mean_eps: 0.100000




 3175/10000: episode: 124, duration: 0.694s, episode steps:  89, steps per second: 128, episode reward: 89.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.483 [0.000, 1.000],  loss: 2.923721, mae: 10.647913, mean_q: 20.922505, mean_eps: 0.100000




 3278/10000: episode: 125, duration: 0.806s, episode steps: 103, steps per second: 128, episode reward: 103.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 2.409664, mae: 10.951854, mean_q: 21.617179, mean_eps: 0.100000




 3378/10000: episode: 126, duration: 0.785s, episode steps: 100, steps per second: 127, episode reward: 100.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 2.569232, mae: 11.214351, mean_q: 22.138809, mean_eps: 0.100000




 3483/10000: episode: 127, duration: 0.811s, episode steps: 105, steps per second: 129, episode reward: 105.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.533 [0.000, 1.000],  loss: 2.561565, mae: 11.681874, mean_q: 23.136185, mean_eps: 0.100000




 3610/10000: episode: 128, duration: 0.970s, episode steps: 127, steps per second: 131, episode reward: 127.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 3.620886, mae: 11.989536, mean_q: 23.632770, mean_eps: 0.100000




 3734/10000: episode: 129, duration: 0.940s, episode steps: 124, steps per second: 132, episode reward: 124.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.532 [0.000, 1.000],  loss: 2.891123, mae: 12.323637, mean_q: 24.374055, mean_eps: 0.100000




 3813/10000: episode: 130, duration: 0.616s, episode steps:  79, steps per second: 128, episode reward: 79.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.468 [0.000, 1.000],  loss: 1.601753, mae: 12.502708, mean_q: 24.881812, mean_eps: 0.100000




 3893/10000: episode: 131, duration: 0.623s, episode steps:  80, steps per second: 128, episode reward: 80.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.475 [0.000, 1.000],  loss: 3.579153, mae: 12.809502, mean_q: 25.316949, mean_eps: 0.100000




 3974/10000: episode: 132, duration: 0.635s, episode steps:  81, steps per second: 128, episode reward: 81.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.469 [0.000, 1.000],  loss: 3.887201, mae: 13.018036, mean_q: 25.745029, mean_eps: 0.100000




 4054/10000: episode: 133, duration: 0.640s, episode steps:  80, steps per second: 125, episode reward: 80.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.475 [0.000, 1.000],  loss: 2.914158, mae: 13.218724, mean_q: 26.216060, mean_eps: 0.100000




 4162/10000: episode: 134, duration: 0.812s, episode steps: 108, steps per second: 133, episode reward: 108.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.546 [0.000, 1.000],  loss: 3.282203, mae: 13.484815, mean_q: 26.726444, mean_eps: 0.100000
 4252/10000: episode: 135, duration: 0.752s, episode steps:  90, steps per second: 120, episode reward: 90.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.544 [0.000, 1.000],  loss: 3.614477, mae: 13.797255, mean_q: 27.364191, mean_eps: 0.100000
 4254/10000: episode: 136, duration: 0.023s, episode steps:   2, steps per second:  87, episode reward:  2.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.867822, mae: 13.312839, mean_q: 26.745773, mean_eps: 0.100000




 4375/10000: episode: 137, duration: 0.921s, episode steps: 121, steps per second: 131, episode reward: 121.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.545 [0.000, 1.000],  loss: 4.240568, mae: 14.002627, mean_q: 27.708091, mean_eps: 0.100000




 4509/10000: episode: 138, duration: 1.017s, episode steps: 134, steps per second: 132, episode reward: 134.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.478 [0.000, 1.000],  loss: 4.108635, mae: 14.281104, mean_q: 28.298114, mean_eps: 0.100000




 4696/10000: episode: 139, duration: 1.421s, episode steps: 187, steps per second: 132, episode reward: 187.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 3.522178, mae: 14.676684, mean_q: 29.171609, mean_eps: 0.100000




 4890/10000: episode: 140, duration: 1.480s, episode steps: 194, steps per second: 131, episode reward: 194.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 3.023311, mae: 15.164370, mean_q: 30.222030, mean_eps: 0.100000




 5090/10000: episode: 141, duration: 1.472s, episode steps: 200, steps per second: 136, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 3.761136, mae: 15.681861, mean_q: 31.274448, mean_eps: 0.100000




 5267/10000: episode: 142, duration: 1.327s, episode steps: 177, steps per second: 133, episode reward: 177.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.531 [0.000, 1.000],  loss: 3.589318, mae: 16.189183, mean_q: 32.351771, mean_eps: 0.100000




 5467/10000: episode: 143, duration: 1.491s, episode steps: 200, steps per second: 134, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.525 [0.000, 1.000],  loss: 4.341698, mae: 16.538042, mean_q: 32.954338, mean_eps: 0.100000




 5667/10000: episode: 144, duration: 1.520s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 4.189559, mae: 16.909987, mean_q: 33.731480, mean_eps: 0.100000




 5815/10000: episode: 145, duration: 1.130s, episode steps: 148, steps per second: 131, episode reward: 148.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 4.679013, mae: 17.333123, mean_q: 34.626920, mean_eps: 0.100000




 6015/10000: episode: 146, duration: 1.511s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 3.888035, mae: 17.655676, mean_q: 35.342600, mean_eps: 0.100000




 6215/10000: episode: 147, duration: 1.526s, episode steps: 200, steps per second: 131, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 3.445055, mae: 18.229597, mean_q: 36.607318, mean_eps: 0.100000




 6415/10000: episode: 148, duration: 1.509s, episode steps: 200, steps per second: 133, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 4.279191, mae: 18.662320, mean_q: 37.435431, mean_eps: 0.100000




 6615/10000: episode: 149, duration: 1.518s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 5.669224, mae: 19.052859, mean_q: 38.093122, mean_eps: 0.100000




 6815/10000: episode: 150, duration: 1.542s, episode steps: 200, steps per second: 130, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 3.408344, mae: 19.466315, mean_q: 39.131024, mean_eps: 0.100000




 7015/10000: episode: 151, duration: 1.526s, episode steps: 200, steps per second: 131, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 4.143372, mae: 19.922288, mean_q: 40.092885, mean_eps: 0.100000




 7215/10000: episode: 152, duration: 1.494s, episode steps: 200, steps per second: 134, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 3.404258, mae: 20.510658, mean_q: 41.350531, mean_eps: 0.100000




 7415/10000: episode: 153, duration: 1.508s, episode steps: 200, steps per second: 133, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 5.124694, mae: 21.006519, mean_q: 42.253396, mean_eps: 0.100000




 7615/10000: episode: 154, duration: 1.513s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 4.355347, mae: 21.434579, mean_q: 43.259584, mean_eps: 0.100000




 7815/10000: episode: 155, duration: 1.536s, episode steps: 200, steps per second: 130, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 5.441531, mae: 21.879792, mean_q: 44.032714, mean_eps: 0.100000
 7938/10000: episode: 156, duration: 0.925s, episode steps: 123, steps per second: 133, episode reward: 123.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.463 [0.000, 1.000],  loss: 5.119336, mae: 22.333305, mean_q: 44.961790, mean_eps: 0.100000




 8138/10000: episode: 157, duration: 1.531s, episode steps: 200, steps per second: 131, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 5.701700, mae: 22.517363, mean_q: 45.338902, mean_eps: 0.100000




 8338/10000: episode: 158, duration: 1.500s, episode steps: 200, steps per second: 133, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 5.110662, mae: 22.958474, mean_q: 46.296131, mean_eps: 0.100000




 8538/10000: episode: 159, duration: 1.532s, episode steps: 200, steps per second: 131, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 4.929777, mae: 23.295767, mean_q: 47.034271, mean_eps: 0.100000




 8738/10000: episode: 160, duration: 1.511s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 3.963999, mae: 23.920596, mean_q: 48.399194, mean_eps: 0.100000




 8938/10000: episode: 161, duration: 1.508s, episode steps: 200, steps per second: 133, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 5.884755, mae: 24.282491, mean_q: 49.033926, mean_eps: 0.100000




 9138/10000: episode: 162, duration: 1.539s, episode steps: 200, steps per second: 130, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 5.892304, mae: 24.797978, mean_q: 50.093914, mean_eps: 0.100000




 9338/10000: episode: 163, duration: 1.494s, episode steps: 200, steps per second: 134, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 4.881453, mae: 25.183294, mean_q: 50.915047, mean_eps: 0.100000




 9538/10000: episode: 164, duration: 1.510s, episode steps: 200, steps per second: 132, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 5.493525, mae: 25.608563, mean_q: 51.704377, mean_eps: 0.100000




 9738/10000: episode: 165, duration: 1.535s, episode steps: 200, steps per second: 130, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 6.403872, mae: 25.909695, mean_q: 52.268779, mean_eps: 0.100000




 9938/10000: episode: 166, duration: 1.505s, episode steps: 200, steps per second: 133, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 6.185717, mae: 26.271432, mean_q: 53.102228, mean_eps: 0.100000




done, took 78.316 seconds


<tensorflow.python.keras.callbacks.History at 0x7feee3541910>

### Avaliação do treinamento em 20 episódios

In [8]:
dqn.test(env, nb_episodes=20, visualize=False)

Testing for 20 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200


<tensorflow.python.keras.callbacks.History at 0x7feee331c9d0>

### Salvamento dos pesos após a avaliação dos resultados

In [9]:
dqn.save_weights('saves/dqn_weights.h5f', overwrite=False)