In [1]:
import random

import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [2]:
ENV_NAME = 'CartPole-v0'
# One seed shared by every random source below, so it can be changed in one place.
SEED = 123

# Create the environment named by ENV_NAME.
env = gym.make(ENV_NAME)

# Seed each random source separately: NumPy's global RNG, Python's `random`
# module, and the environment itself.
# NOTE(review): TensorFlow's own RNG (e.g. weight initialisation) is not seeded
# here, so runs are presumably still not bit-for-bit repeatable -- confirm
# whether that matters for this notebook.
np.random.seed(SEED)
random.seed(SEED)
env.seed(SEED)

# Number of discrete actions exposed by the environment's action space.
nb_actions = env.action_space.n

In [3]:
# Next, we build a very simple model: a flattened observation fed through three
# identical Dense(16) + ReLU stages, then a linear layer with one output per action.
model = Sequential()

# The input shape is the observation shape with a leading axis of length 1
# prepended; Flatten collapses that back to a flat vector.
# NOTE(review): the leading 1 presumably corresponds to the replay memory's
# window_length -- keep the two in sync if either changes.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))

# Three hidden stages, each a 16-unit Dense layer followed by a ReLU activation.
for _ in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))

# Output head: one unit per action, left linear (no squashing).
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() -- that would append a stray "None" line to the output.
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                80        
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________

In [4]:
# Finally, we configure and compile our agent. Any built-in tensorflow.keras optimizer
# and metric can be used here.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
# `learning_rate` is the current name of this argument; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
# The returned History object is bound to a name so it can be inspected afterwards
# instead of being echoed as an opaque repr at the end of the cell output.
training_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

Training for 50000 steps ...




    26/50000: episode: 1, duration: 2.593s, episode steps:  26, steps per second:  10, episode reward: 26.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.538 [0.000, 1.000],  loss: 0.482618, mae: 0.509426, mean_q: 0.039436




    64/50000: episode: 2, duration: 0.648s, episode steps:  38, steps per second:  59, episode reward: 38.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.632 [0.000, 1.000],  loss: 0.415804, mae: 0.507038, mean_q: 0.183316
    88/50000: episode: 3, duration: 0.399s, episode steps:  24, steps per second:  60, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 0.309530, mae: 0.537291, mean_q: 0.427105
   117/50000: episode: 4, duration: 0.486s, episode steps:  29, steps per second:  60, episode reward: 29.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.517 [0.000, 1.000],  loss: 0.151831, mae: 0.584011, mean_q: 0.776176
   125/50000: episode: 5, duration: 0.128s, episode steps:   8, steps per second:  63, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.875 [0.000, 1.000],  loss: 0.063526, mae: 0.658265, mean_q: 1.076620
   148/50000: episode: 6, duration: 0.386s, episode steps:  23, step

   759/50000: episode: 38, duration: 0.218s, episode steps:  13, steps per second:  60, episode reward: 13.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.308 [0.000, 1.000],  loss: 0.250690, mae: 3.031759, mean_q: 5.845471
   798/50000: episode: 39, duration: 0.649s, episode steps:  39, steps per second:  60, episode reward: 39.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.487 [0.000, 1.000],  loss: 0.426155, mae: 3.116711, mean_q: 5.918222
   818/50000: episode: 40, duration: 0.332s, episode steps:  20, steps per second:  60, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 0.444654, mae: 3.245605, mean_q: 6.160835
   857/50000: episode: 41, duration: 0.651s, episode steps:  39, steps per second:  60, episode reward: 39.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.487 [0.000, 1.000],  loss: 0.391431, mae: 3.356775, mean_q: 6.416512
   895/50000: episode: 42, duration: 0.635s, episode steps:  38,

  4511/50000: episode: 73, duration: 3.399s, episode steps: 200, steps per second:  59, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 2.677550, mae: 19.297401, mean_q: 39.181236
  4679/50000: episode: 74, duration: 2.816s, episode steps: 168, steps per second:  60, episode reward: 168.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 2.660093, mae: 19.879210, mean_q: 40.419750
  4879/50000: episode: 75, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 2.635105, mae: 20.630190, mean_q: 42.021584
  5050/50000: episode: 76, duration: 2.849s, episode steps: 171, steps per second:  60, episode reward: 171.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 4.239336, mae: 21.435755, mean_q: 43.456444
  5250/50000: episode: 77, duration: 3.332s, episode

 11251/50000: episode: 108, duration: 3.182s, episode steps: 191, steps per second:  60, episode reward: 191.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.545 [0.000, 1.000],  loss: 7.764876, mae: 37.690968, mean_q: 75.961861
 11451/50000: episode: 109, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.545 [0.000, 1.000],  loss: 10.865396, mae: 37.721039, mean_q: 75.831474
 11641/50000: episode: 110, duration: 3.166s, episode steps: 190, steps per second:  60, episode reward: 190.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.479 [0.000, 1.000],  loss: 11.458108, mae: 38.003033, mean_q: 76.368767
 11821/50000: episode: 111, duration: 2.999s, episode steps: 180, steps per second:  60, episode reward: 180.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.472 [0.000, 1.000],  loss: 7.668428, mae: 38.189480, mean_q: 76.857269
 12021/50000: episode: 112, duration: 3.333s, 

 17292/50000: episode: 143, duration: 2.600s, episode steps: 156, steps per second:  60, episode reward: 156.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 6.039194, mae: 42.453934, mean_q: 85.476822
 17492/50000: episode: 144, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 5.132001, mae: 43.129829, mean_q: 86.824997
 17676/50000: episode: 145, duration: 3.066s, episode steps: 184, steps per second:  60, episode reward: 184.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.527 [0.000, 1.000],  loss: 4.064645, mae: 42.774635, mean_q: 86.222878
 17831/50000: episode: 146, duration: 2.583s, episode steps: 155, steps per second:  60, episode reward: 155.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.471 [0.000, 1.000],  loss: 7.732999, mae: 42.601742, mean_q: 85.569489
 18022/50000: episode: 147, duration: 3.183s, ep

 23673/50000: episode: 178, duration: 3.336s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 7.417567, mae: 42.387917, mean_q: 85.177216
 23844/50000: episode: 179, duration: 2.849s, episode steps: 171, steps per second:  60, episode reward: 171.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 7.582265, mae: 42.713970, mean_q: 85.855797
 24017/50000: episode: 180, duration: 2.883s, episode steps: 173, steps per second:  60, episode reward: 173.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.474 [0.000, 1.000],  loss: 4.065862, mae: 42.725735, mean_q: 85.932236
 24217/50000: episode: 181, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 5.623342, mae: 42.367149, mean_q: 85.284218
 24402/50000: episode: 182, duration: 3.082s, ep

 30199/50000: episode: 213, duration: 3.331s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 4.190214, mae: 40.696423, mean_q: 81.793640
 30379/50000: episode: 214, duration: 3.001s, episode steps: 180, steps per second:  60, episode reward: 180.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.544 [0.000, 1.000],  loss: 2.888734, mae: 40.877110, mean_q: 82.106964
 30579/50000: episode: 215, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 7.081144, mae: 40.845608, mean_q: 81.894699
 30779/50000: episode: 216, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 3.477149, mae: 40.795620, mean_q: 81.875877
 30979/50000: episode: 217, duration: 3.332s, ep

 37078/50000: episode: 248, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 4.928547, mae: 39.132572, mean_q: 78.538086
 37231/50000: episode: 249, duration: 2.549s, episode steps: 153, steps per second:  60, episode reward: 153.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.484 [0.000, 1.000],  loss: 2.905269, mae: 38.880070, mean_q: 78.108620
 37431/50000: episode: 250, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 3.352705, mae: 38.826870, mean_q: 78.025879
 37631/50000: episode: 251, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 3.378736, mae: 38.901844, mean_q: 78.161438
 37831/50000: episode: 252, duration: 3.332s, ep

 44031/50000: episode: 283, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 5.338795, mae: 39.904510, mean_q: 79.998192
 44231/50000: episode: 284, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 7.604247, mae: 39.800369, mean_q: 79.645584
 44431/50000: episode: 285, duration: 3.333s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 4.642234, mae: 39.519268, mean_q: 79.210213
 44631/50000: episode: 286, duration: 3.332s, episode steps: 200, steps per second:  60, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 3.695536, mae: 40.000378, mean_q: 80.415985
 44831/50000: episode: 287, duration: 3.333s, ep

<tensorflow.python.keras.callbacks.History at 0x14b1caac8>

In [5]:
# After training is done, we save the final weights to a file named after the environment.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.save_weights(weights_path, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes. The returned History object is kept
# in a variable rather than echoed as a bare repr.
try:
    test_history = dqn.test(env, nb_episodes=5, visualize=True)
finally:
    # Rendering was requested via visualize=True; close the environment so whatever
    # render resources it opened are released even if evaluation is interrupted.
    # NOTE(review): assumes nothing after this cell relies on an already-open
    # render window -- confirm if more cells are added below.
    env.close()

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<tensorflow.python.keras.callbacks.History at 0x14b020cf8>