# Cart Pole System

In [4]:
import gym
import numpy as np 
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Create the CartPole environment and seed NumPy and the environment to make
# runs more repeatable.
env = gym.make('CartPole-v0')
np.random.seed(1)
env.seed(1)

# Number of discrete actions; the network's final Dense layer has this width.
nb_actions = env.action_space.n

# Q-network: flatten the observation window, apply three 16-unit ReLU layers,
# then a linear output layer with one unit per action.
model = Sequential()
# The leading 1 in the input shape lines up with window_length = 1 used for
# the agent's SequentialMemory.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _layer in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [6]:
# Experience memory capped at limit = 50000 entries; window_length = 1 matches
# the leading 1 in the model's input shape.
memory = SequentialMemory(limit = 50000, window_length = 1)
policy = BoltzmannQPolicy()
# NOTE(review): in keras-rl a target_model_update below 1 is presumably treated
# as a soft-update rate rather than a step interval — confirm against the
# installed version.
dqn = DQNAgent(
    model = model,
    nb_actions = nb_actions,
    memory = memory,
    nb_steps_warmup = 10,
    target_model_update = 1e-2,
    policy = policy)

dqn.compile(Adam(lr = 1e-3), metrics=['mae'])

# Train, persist the weights, then evaluate. The returned History objects are
# kept in named variables so the metrics stay inspectable and no opaque
# "<keras.callbacks.History ...>" repr is echoed as the cell result.
try:
    train_history = dqn.fit(env, nb_steps = 1000, visualize = True, verbose = 2)
    dqn.save_weights('dqn_weights.h5f', overwrite = True)
    test_history = dqn.test(env, nb_episodes = 5, visualize = True)
finally:
    # visualize = True opens a render window; always release it, even when
    # training or testing raises.
    env.close()

Training for 1000 steps ...




  19/1000: episode: 1, duration: 1.740s, episode steps: 19, steps per second: 11, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.368 [0.000, 1.000], mean observation: 0.066 [-1.176, 1.899], loss: 0.492009, mean_absolute_error: 0.539172, mean_q: 0.136899
  36/1000: episode: 2, duration: 0.281s, episode steps: 17, steps per second: 60, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.107 [-0.984, 0.363], loss: 0.372251, mean_absolute_error: 0.541726, mean_q: 0.360728
  45/1000: episode: 3, duration: 0.149s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.148 [-1.182, 1.984], loss: 0.290552, mean_absolute_error: 0.544632, mean_q: 0.563937
  56/1000: episode: 4, duration: 0.180s, episode steps: 11, steps per second: 61, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.27

 583/1000: episode: 32, duration: 0.750s, episode steps: 45, steps per second: 60, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.047 [-1.180, 1.797], loss: 0.283482, mean_absolute_error: 2.654948, mean_q: 5.099975
 604/1000: episode: 33, duration: 0.349s, episode steps: 21, steps per second: 60, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.049 [-0.790, 1.324], loss: 0.267343, mean_absolute_error: 2.788553, mean_q: 5.412519
 639/1000: episode: 34, duration: 0.583s, episode steps: 35, steps per second: 60, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.080 [-1.068, 0.817], loss: 0.314576, mean_absolute_error: 2.887800, mean_q: 5.555448
 665/1000: episode: 35, duration: 0.431s, episode steps: 26, steps per second: 60, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action

<keras.callbacks.History at 0x1349a3e50>