# Cart Pole - Double Deep Q-Network

In [1]:
import gym
import numpy as np
import random
import tensorflow as tf
import os

#### Load the CartPole Environment

In [2]:
# Create the CartPole-v0 environment. `env` is a module-level global: the
# Qnetwork class reads `env.action_space.n` and the training loop steps and
# renders this same object.
env = gym.make('CartPole-v0')

[2016-10-19 21:54:27,002] Making new env: CartPole-v0


#### Implement Q-Network

In [3]:
class Qnetwork():
    """Dueling Q-network built from raw RGB frames using TensorFlow graph ops.

    Instantiating the class adds to the default graph: an input placeholder,
    four convolution layers, separate advantage and value heads, the combined
    Q-value output, and an Adam training op that minimises the mean squared
    difference between fed target Q-values and the Q-values of the fed actions.

    NOTE(review): the number of actions is read from the module-level `env`
    (`env.action_space.n`), so `env` must exist before this class is used.
    """
    def __init__(self, h_size, frameShape, batch_size):
        """Build the network graph.

        Args:
            h_size: not referenced in this constructor; the width of the last
                convolution layer is hard-coded to 512 below.
            frameShape: sequence indexed as [0], [1], [2] to size the input
                placeholder; presumably (height, width, channels) - confirm
                against the array returned by the environment's viewer.
            batch_size: not referenced in this constructor; the batch
                dimension of every placeholder is left as None.
        """
        # The network receives a batch of frames as a 4-D tensor (it is NOT
        # flattened) and processes it through four convolutional layers.
        
        # Product of the frame dimensions. NOTE(review): numInputs is computed
        # but never read afterwards in this constructor.
        numInputs = 1
        for i in frameShape:
            numInputs *= i
        
        # Input placeholder: [batch, frameShape[0], frameShape[1], frameShape[2]], float32.
        self.rgb_array = tf.placeholder(shape=[None, frameShape[0], frameShape[1], frameShape[2]], dtype=tf.float32)
        
        # Reshape to the same 4-D shape the placeholder already has (effectively a no-op).
        self.imageIn = tf.reshape(self.rgb_array,shape=[-1, frameShape[0], frameShape[1], frameShape[2]])
        
        # Four stacked convolutions, all with 'VALID' padding and
        # biases_initializer=None (per the tf.contrib.layers docs this means no
        # bias variables are created - confirm for the installed version).
        self.conv1 = tf.contrib.layers.convolution2d(
            inputs = self.imageIn,
            num_outputs = 32,
            kernel_size = [8, 8],
            stride = [4, 4],
            padding = 'VALID',
            biases_initializer = None)
        
        self.conv2 = tf.contrib.layers.convolution2d(
            inputs = self.conv1,
            num_outputs = 64,
            kernel_size = [4, 4],
            stride = [2, 2],
            padding = 'VALID',
            biases_initializer = None)
        
        self.conv3 = tf.contrib.layers.convolution2d(
            inputs = self.conv2,
            num_outputs = 64,
            kernel_size = [3, 3],
            stride = [1, 1],
            padding = 'VALID',
            biases_initializer = None)
        
        self.conv4 = tf.contrib.layers.convolution2d(
            inputs = self.conv3,
            num_outputs = 512,
            kernel_size = [7, 7],
            stride = [1, 1],
            padding = 'VALID',
            biases_initializer = None)
        
        # We take the output from the final convolutional layer and split it
        # into separate advantage and value streams.
        
        # Split conv4 along axis 3 into 2 equal parts. This is the pre-1.0
        # TensorFlow argument order: tf.split(split_dim, num_split, value).
        # TODO: the spatial size of conv4 depends on frameShape; see the note
        # on the hard-coded weight shapes below.
        self.streamAC, self.streamVC = tf.split(3, 2, self.conv4)
        
        # Flatten each stream to [batch_size, k].
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)
        
        # Weights of the linear advantage and value heads.
        # NOTE(review): 665600 is hard-coded and must equal k above for the
        # matmuls to be valid. 665600 = 40 * 65 * 256, which appears to be what
        # a 400x600 frame yields after the four VALID convolutions and the
        # 2-way channel split - any other frame size will fail; confirm.
        self.AW = tf.Variable(tf.random_normal([665600, env.action_space.n]))
        self.VW = tf.Variable(tf.random_normal([665600, 1]))
        
        # Head outputs: Advantage is [batch, n_actions], Value is [batch, 1].
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)
        
        # Combine the two heads into the final Q-values:
        # Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a))
        self.Qout = self.Value + tf.sub(
            self.Advantage,
            tf.reduce_mean( # per-sample mean advantage over axis 1, kept as a column so it broadcasts
                self.Advantage,
                reduction_indices=1,
                keep_dims=True))
        
        # Index of the largest Q-value along axis 1, i.e. the greedy action per sample.
        self.predict = tf.argmax(self.Qout, 1)
        
        # Below we obtain the loss as the mean squared difference between
        # the target and predicted Q-values.
        
        # Target Q-value per sample, fed by the caller.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        
        # Action index per sample, fed by the caller, and its one-hot encoding.
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.action_space.n, dtype=tf.float32)
        
        # Q-value of the fed action: mask Qout with the one-hot and sum over axis 1.
        self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)
        
        # Per-sample squared error; the loss is its mean over the batch.
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        
        # Adam optimizer with a fixed learning rate of 1e-4.
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        
        # Training op: one optimizer step on self.loss.
        self.updateModel = self.trainer.minimize(self.loss)

#### Experience Replay

In [4]:
class experience_buffer():
    """Fixed-capacity replay memory stored as five parallel lists.

    Index ``k`` of ``states``, ``actions``, ``rewards``, ``states_`` and
    ``dones`` together describe one transition
    (state, action, reward, next state, done flag).

    When the capacity is exceeded the OLDEST transitions are evicted, so the
    buffer always holds the most recent ``buffer_size`` entries.
    """

    # Names of the parallel per-transition lists, in storage order.
    _FIELDS = ("states", "actions", "rewards", "states_", "dones")

    def __init__(self, buffer_size=500):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of transitions retained.
        """
        self.buffer_size = buffer_size
        # state, action, reward, state1, done
        self.states = []
        self.actions = []
        self.rewards = []
        self.states_ = []
        self.dones = []

    def add(self, states, actions, rewards, states_, dones):
        """Append a batch of transitions, evicting the oldest on overflow.

        Args:
            states, actions, rewards, states_, dones: equally long iterables
                (lists or 1-D arrays) holding one entry per transition.
        """
        self.states.extend(states)
        self.actions.extend(actions)
        self.rewards.extend(rewards)
        self.states_.extend(states_)
        self.dones.extend(dones)
        self._evict_oldest()

    def _evict_oldest(self):
        """Trim every list from the front so none exceeds ``buffer_size``."""
        for field in self._FIELDS:
            items = getattr(self, field)
            excess = len(items) - self.buffer_size
            if excess > 0:
                # Deleting from the front drops the oldest entries and also
                # handles a single batch larger than the whole buffer.
                del items[:excess]

    def sample(self, size):
        """Draw ``size`` distinct transitions uniformly at random.

        Args:
            size: number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, states_, dones)`` of lists, each
            of length ``size`` and aligned by position.

        Raises:
            ValueError: if ``size`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        picks = random.sample(range(len(self.actions)), size)
        return (
            [self.states[k] for k in picks],
            [self.actions[k] for k in picks],
            [self.rewards[k] for k in picks],
            [self.states_[k] for k in picks],
            [self.dones[k] for k in picks],
        )

In [6]:
def updateTarget(tfVars, sess):
    """Copy the primary network's parameters into the target network.

    The first half of ``tfVars`` is treated as the primary network's variables
    and the second half as the target network's, paired by position. If the
    list has odd length the last variable is left untouched.

    Args:
        tfVars: list of variables, primary network first, then target network.
        sess: session used both to read the primary values and to run the
            assignments (no default session is required).
    """
    half = len(tfVars) // 2
    for primary_var, target_var in zip(tfVars[:half], tfVars[half:half * 2]):
        # Read through the session we were given rather than the implicit
        # default session, then assign that value to the paired target variable.
        sess.run(target_var.assign(primary_var.eval(session=sess)))

In [7]:
# Discount factor applied per time step by discount_rewards().
gamma = 0.99

def discount_rewards(r):
    """Compute the discounted return at every step of one episode.

    For each index t the result is r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...
    accumulated from the end of the episode backwards (a Monte-Carlo return,
    since it uses every later reward of the episode).

    Args:
        r: 1-D sequence of per-step rewards (list or array, int or float).

    Returns:
        1-D numpy array of the same length holding the discounted returns.
        Floating-point inputs keep their dtype; any other dtype is promoted to
        float64 so the fractional discounted values are not truncated.
    """
    rewards = np.asarray(r)
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        # zeros_like() on an integer array would silently floor every return.
        out_dtype = np.float64
    discounted_r = np.zeros(rewards.shape, dtype=out_dtype)
    running_add = 0.0
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

#### Set up network and training hyperparameters

In [8]:
# --- Optimisation ---------------------------------------------------------
# experiences drawn from the replay buffer for each training step
batch_size = 5
# base frequency of training steps (scaled by the training loop)
update_freq = 1
# discount factor applied to target Q-values
y = 0.99

# --- Exploration schedule -------------------------------------------------
# probability of a random action: initial value, final value
startE, endE = 1, 0.1
# training steps over which startE is reduced to endE
anneling_steps = 1000
# purely random steps taken before any training begins
pre_train_steps = 500

# --- Run length and checkpoints -------------------------------------------
# episodes of the environment used to train the network
num_episodes = 10000
# restore a previously saved model before training?
load_model = False
# directory the model is saved to
path = "./dqn"

# --- Architecture ---------------------------------------------------------
# size of the final convolutional layer before it is split
# into the Advantage and Value streams
h_size = 512

#### Get frame shape

In [9]:
# Render once and read the frame back from the viewer so its shape can size
# the network input (Qnetwork indexes frameShape as [0], [1], [2]).
# NOTE(review): presumably render() is what creates env.viewer - confirm.
env.render()
pixels = env.viewer.get_array()
frameShape = pixels.shape
# Close the viewer and clear the reference; the training cell checks
# `env.viewer is not None` before closing it again.
env.viewer.close()
env.viewer = None

#### Begin Training

In [11]:
# Build both networks in a fresh graph. Creation order matters: updateTarget()
# treats the first half of the trainable variables as the primary network and
# the second half as the target network.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, frameShape, batch_size)
targetQN = Qnetwork(h_size, frameShape, batch_size)

init = tf.initialize_all_variables()

saver = tf.train.Saver()

trainables = tf.trainable_variables()

# replay memory shared across all episodes
myBuffer = experience_buffer()

# exploration rate and the amount it drops per step once training has started
e = startE
stepDrop = (startE - endE) / anneling_steps

# per-episode step counts and total rewards, plus a global step counter
movesList = []
rewardList = []
total_steps = 0

# make sure the checkpoint directory exists
if not os.path.exists(path):
    os.makedirs(path)


with tf.Session() as sess:
    # Initialise FIRST: running the initializer after a restore would
    # overwrite the weights that were just loaded.
    sess.run(init)
    if load_model == True:
        print("Loading Model...")
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt is not None and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No checkpoint found in " + path + "; starting from scratch.")

    # set target network to be equal to primary network
    updateTarget(trainables, sess)

    # Defined up front so the checkpoint name in the handler below is valid
    # even if the loop body never runs.
    i = 0
    try:
        for i in range(num_episodes):
            # transitions of this episode only; merged into myBuffer at the end
            episodeBuffer = experience_buffer()

            # reset environment and use the rendered frame as the state
            env.reset()
            state = env.render(mode='rgb_array')
            done = False
            rewardAll = 0
            numMoves = 0

            # play at most 200 moves per episode
            while numMoves < 200:
                numMoves += 1

                # random action with probability e, and always during pre-training
                if np.random.rand(1) < e or total_steps < pre_train_steps:
                    action = np.random.randint(0, env.action_space.n)
                else:
                    # select [0] position because predict returns [action]
                    action = sess.run(mainQN.predict, feed_dict={mainQN.rgb_array: [state]})[0]

                observation, reward, done, _ = env.step(action)
                state1 = env.render(mode='rgb_array')
                total_steps += 1

                # save experience to episode buffer
                episodeBuffer.add([state], [action], [reward], [state1], [done])

                if total_steps > pre_train_steps:
                    # anneal the exploration rate
                    if e > endE:
                        e -= stepDrop

                    if total_steps % (update_freq * 1000) == 0:
                        print("Target network updated.")
                        updateTarget(trainables, sess)

                    # Train only when enough transitions are stored to fill a batch.
                    if (total_steps % (update_freq * 100) == 0
                            and len(myBuffer.actions) >= batch_size):
                        # random sample of experiences
                        states_t, actions_t, rewards_t, state1_t, dones_t = myBuffer.sample(batch_size)

                        # Double-DQN: the primary network picks the next action...
                        Q1 = sess.run(mainQN.predict, feed_dict={
                                mainQN.rgb_array: state1_t})

                        # ...and the target network supplies its Q-value.
                        Q2 = sess.run(targetQN.Qout, feed_dict={
                                targetQN.rgb_array: state1_t})

                        # 0 where the transition ended the episode, 1 otherwise,
                        # so terminal transitions get Q-target = r
                        end_multiplier = -(np.array(dones_t) - 1)

                        # target-network Q-value of the action chosen by the primary network
                        doubleQ = Q2[np.arange(len(Q1)), Q1]

                        targetQ = np.array(rewards_t) + (y * doubleQ * end_multiplier)

                        # update network with target values
                        _ = sess.run(mainQN.updateModel,
                                     feed_dict={mainQN.rgb_array: np.array(states_t),
                                                mainQN.targetQ: targetQ,
                                                mainQN.actions: actions_t})

                rewardAll += reward
                state = state1

                if done == True:
                    break

            # replace this episode's raw rewards with their discounted returns
            episodeRewards = np.array(episodeBuffer.rewards)
            discountRewards = discount_rewards(episodeRewards)
            episodeBuffer.rewards = discountRewards

            # add discounted experiences to our experience buffer
            # state, action, reward, state1, done
            myBuffer.add(episodeBuffer.states,
                         episodeBuffer.actions,
                         episodeBuffer.rewards,
                         episodeBuffer.states_,
                         episodeBuffer.dones)

            movesList.append(numMoves)
            rewardList.append(rewardAll)

            # periodically save model
            if i % 1000 == 0:
                saver.save(sess, path+'/model-'+str(i)+'.cptk')
                print("Saved Model")
            if i % 10 == 0:
                print(i, total_steps, rewardAll, e)

        saver.save(sess, path+'/model-'+str(i)+'.cptk')
    except (KeyboardInterrupt, Exception) as err:
        # Still best-effort (save what we have and carry on), but report what
        # stopped training instead of swallowing it silently.
        print("Training stopped early: " + repr(err))
        saver.save(sess, path+'/model-'+str(i)+'.cptk')
        print("Saved Model")
    finally:
        # if frames are still rendering, stop it (on success as well as failure)
        if env.viewer is not None:
            env.viewer.close()
            env.viewer = None

if rewardList:
    print("Reward of last episode: " + str(rewardList[-1]))
else:
    print("No episode completed.")

Saved Model
0 9 9.0 1
10 224 17.0 1
Saved Model
Reward of last episode: 15.0
