In [3]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam


In [5]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [7]:
#set a relevent variable
ENV_NAME = 'CartPole-v0'
#Get the environment and extract the no of action available in cart pole problem
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

In [8]:
#build a very simple hidden layer neural network model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot. 
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...




Instructions for updating:
Use tf.cast instead.
   79/5000: episode: 1, duration: 10.835s, episode steps: 79, steps per second: 7, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.428329, mean_absolute_error: 0.496236, mean_q: 0.052405
  113/5000: episode: 2, duration: 0.538s, episode steps: 34, steps per second: 63, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.353307, mean_absolute_error: 0.446486, mean_q: 0.191942
  163/5000: episode: 3, duration: 0.833s, episode steps: 50, steps per second: 60, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.317570, mean_absolute_error: 0.468236, mean_q: 0.317697
  197/5000: episode: 4, duration: 0.566s, episode steps: 34, steps per second: 60, episode reward: 34.000,

  700/5000: episode: 30, duration: 0.195s, episode steps: 12, steps per second: 61, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.114 [-0.748, 1.469], loss: 0.418320, mean_absolute_error: 2.276746, mean_q: 4.351179
  716/5000: episode: 31, duration: 0.275s, episode steps: 16, steps per second: 58, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.080 [-0.825, 1.483], loss: 0.448412, mean_absolute_error: 2.342621, mean_q: 4.463023
  728/5000: episode: 32, duration: 0.191s, episode steps: 12, steps per second: 63, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.095 [-0.829, 1.288], loss: 0.527969, mean_absolute_error: 2.423585, mean_q: 4.571693
  740/5000: episode: 33, duration: 0.197s, episode steps: 12, steps per second: 61, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean act

 1076/5000: episode: 59, duration: 0.202s, episode steps: 12, steps per second: 59, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.103 [-0.633, 1.139], loss: 1.085588, mean_absolute_error: 3.705820, mean_q: 6.959055
 1090/5000: episode: 60, duration: 0.229s, episode steps: 14, steps per second: 61, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.100 [-0.825, 1.234], loss: 0.959688, mean_absolute_error: 3.730137, mean_q: 6.978728
 1103/5000: episode: 61, duration: 0.216s, episode steps: 13, steps per second: 60, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.091 [-0.797, 1.250], loss: 1.188751, mean_absolute_error: 3.787336, mean_q: 7.086816
 1117/5000: episode: 62, duration: 0.232s, episode steps: 14, steps per second: 60, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean act

 1912/5000: episode: 88, duration: 0.862s, episode steps: 52, steps per second: 60, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.130 [-0.751, 0.248], loss: 1.334405, mean_absolute_error: 5.535321, mean_q: 10.569262
 1951/5000: episode: 89, duration: 0.650s, episode steps: 39, steps per second: 60, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.081 [-0.859, 0.283], loss: 1.157321, mean_absolute_error: 5.708306, mean_q: 11.112662
 1979/5000: episode: 90, duration: 0.466s, episode steps: 28, steps per second: 60, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.110 [-0.660, 0.433], loss: 1.602455, mean_absolute_error: 5.820862, mean_q: 11.133946
 2019/5000: episode: 91, duration: 0.667s, episode steps: 40, steps per second: 60, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], me

 2516/5000: episode: 117, duration: 0.805s, episode steps: 48, steps per second: 60, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.107 [-0.818, 0.431], loss: 2.954190, mean_absolute_error: 7.446457, mean_q: 14.125079
 2552/5000: episode: 118, duration: 0.595s, episode steps: 36, steps per second: 61, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.124 [-0.783, 0.302], loss: 3.047535, mean_absolute_error: 7.500880, mean_q: 14.273122
 2586/5000: episode: 119, duration: 0.566s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.114 [-0.970, 0.459], loss: 3.481608, mean_absolute_error: 7.633544, mean_q: 14.420965
 2607/5000: episode: 120, duration: 0.350s, episode steps: 21, steps per second: 60, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000]

 4183/5000: episode: 146, duration: 1.237s, episode steps: 66, steps per second: 53, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.159 [-0.761, 0.490], loss: 4.870924, mean_absolute_error: 10.359591, mean_q: 19.968761
 4261/5000: episode: 147, duration: 1.430s, episode steps: 78, steps per second: 55, episode reward: 78.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.123 [-0.753, 0.262], loss: 4.205443, mean_absolute_error: 10.393514, mean_q: 20.113487
 4461/5000: episode: 148, duration: 3.720s, episode steps: 200, steps per second: 54, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.056 [-0.758, 0.731], loss: 4.566591, mean_absolute_error: 10.656670, mean_q: 20.602295
 4535/5000: episode: 149, duration: 1.233s, episode steps: 74, steps per second: 60, episode reward: 74.000, mean reward: 1.000 [1.000, 1.

<keras.callbacks.History at 0x21bac365748>

In [13]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 197.000, steps: 197
Episode 2: reward: 76.000, steps: 76
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 78.000, steps: 78
Episode 5: reward: 109.000, steps: 109


<keras.callbacks.History at 0x21babfcb630>