# Exercise 5: Deep Q-Network (DQN)

In [None]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import os
import random
import scipy.misc
import tensorflow as tf
import tensorflow.contrib.slim as slim

from unityenv import UnityEnvironment

%matplotlib inline

### Hyperparameters

In [None]:
# --- Training schedule ---
num_episodes = 10000  # Number of environment episodes to run.
pre_train_steps = 5000  # Environment steps of purely random actions before training starts.
update_freq = 4  # A training step is performed every `update_freq` environment steps.
batch_size = 32  # Number of experiences sampled from the replay buffer per training step.

# --- Exploration (epsilon-greedy) ---
startE = 1  # Initial probability of picking a random action.
endE = 0.1  # Probability of a random action once annealing has finished.
anneling_steps = 50000  # Training steps over which epsilon decays linearly from startE to endE.

# --- Learning ---
y = 0.99  # Discount factor applied to the bootstrapped target Q-values.
learning_rate = 1e-3  # Learning rate handed to the optimizer.
h_size = 256  # Width of the hidden fully connected layer.

# --- Persistence and run mode ---
model_path = "./models/dqn"  # Directory where model checkpoints are written.
summary_path = './summaries/dqn'  # Directory where summary statistics are written.
load_model = False  # Restore weights from the latest checkpoint in model_path before running.
train_model = True  # Train the network; when False the agent only acts with the current policy.

### Load the Unity environment

In [None]:
# Command-line style options forwarded to the RandomGridWorld build.
env_config = {
    '--grid-size': 5,
    '--num-objects': 1,
    '--num-goals': 1
}

# Launch the Unity executable and connect to it.
env = UnityEnvironment(
    file_name="RandomGridWorld",
    train_model=train_model,
    worker_num=4,
    config=env_config
)

# Rendering options; presumably a greyscale 32x32 frame, which is what the
# rest of the notebook reshapes observations to -- TODO confirm against unityenv.
env.bw_render = True
env.resolution = 32

# Show the environment's own description of itself.
print(env)

### Examine observation space

In [None]:
# Reset the environment and display the first returned observation as a 32x32 image.
observations, _ = env.reset()
first_frame = np.reshape(observations[0], [32, 32])
plt.imshow(first_frame)

Observation (o) corresponds to a greyscale pixel image of the screen.

### Implementing the network itself

In [None]:
class Qnetwork():
    """Dueling Q-network: two conv layers, one hidden layer, value/advantage heads.

    All variables and ops are created inside ``tf.variable_scope(scope)`` so that
    two independent copies (e.g. a primary and a target network) can live in the
    same graph.

    Args:
        h_size: Number of units in the hidden fully connected layer.
        num_actions: Number of discrete actions (width of the advantage head).
        lr: Learning rate passed to the Adam optimizer.
        scope: Name of the variable scope the network is built in.

    Attributes of interest:
        observation_input: float32 placeholder of shape [None, 32, 32, 3].
        q_out: Q-value for every action, shape [None, num_actions].
        predict: Index of the highest-valued action per input row.
        targetQ / actions: Placeholders fed during training.
        loss / update: Scalar loss and the op that minimizes it.
    """
    def __init__(self ,h_size, num_actions, lr, scope):
        with tf.variable_scope(scope):
            # The network receives a batch of 32x32 observations with 3 channels
            # and processes it through two stride-2 convolutional layers.
            self.observation_input =  tf.placeholder(shape=[None, 32, 32, 3],dtype=tf.float32)
            self.conv1 = slim.conv2d(self.observation_input, 32, 
                                     kernel_size=[3,3], stride=[2,2], 
                                     biases_initializer=None,
                                     activation_fn=tf.nn.elu)
            self.conv2 = slim.conv2d(self.conv1, 64, 
                                     kernel_size=[3,3], 
                                     stride=[2,2], 
                                     biases_initializer=None,
                                     activation_fn=tf.nn.elu)

            # The flattened output of the last convolutional layer feeds a hidden
            # fully connected layer, which is then split into separate linear
            # advantage (one output per action) and value (single output) streams.
            self.hidden = slim.fully_connected(slim.flatten(self.conv2), 
                                               h_size, activation_fn=tf.nn.elu)
            self.advantage = slim.fully_connected(self.hidden, num_actions, activation_fn=None,
                                                  biases_initializer=None)
            self.value = slim.fully_connected(self.hidden, 1, activation_fn=None,
                                                  biases_initializer=None)

            # Combine the streams into the final Q-values:
            # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            self.q_out = self.value + tf.subtract(self.advantage, 
                                                  tf.reduce_mean(self.advantage,axis=1, keep_dims=True))
            # Greedy action: index of the largest Q-value in each row.
            self.predict = tf.argmax(self.q_out,1)

            # Below we obtain the loss as the mean of the squared difference
            # between the target Q-values and the predicted Q-values of the
            # actions that were actually taken.
            self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,num_actions,dtype=tf.float32)

            # Select Q(s, a) for the taken action by masking with the one-hot vector.
            self.Q = tf.reduce_sum(tf.multiply(self.q_out, self.actions_onehot), axis=1)

            # NOTE: despite its name, td_error holds the *squared* TD error.
            self.td_error = tf.square(self.targetQ - self.Q)
            self.loss = tf.reduce_mean(self.td_error)
            self.trainer = tf.train.AdamOptimizer(learning_rate=lr)
            self.update = self.trainer.minimize(self.loss)

### Experience Replay

This class allows us to store experiences and sample them randomly to train the network.

In [None]:
class experience_buffer():
    """Bounded first-in-first-out store of experiences with random sampling.

    Experiences are kept in ``self.buffer`` (a plain list, oldest first). The
    list never holds more than ``buffer_size`` entries: when new experiences
    would exceed the capacity, the oldest entries are discarded.
    """

    def __init__(self, buffer_size = 50000):
        """Create an empty buffer that holds at most ``buffer_size`` experiences."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append a batch of experiences, evicting the oldest ones on overflow.

        Args:
            experience: Sized iterable of experiences (e.g. a list of rows or a
                2-D array whose rows are experiences); each item is appended
                individually.
        """
        self.buffer.extend(experience)
        # Trim after extending so the capacity also holds when a single batch
        # is larger than the whole buffer (only its newest entries are kept).
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, size):
        """Return ``size`` distinct stored experiences as an array of shape [size, 5].

        Each stored experience is expected to have exactly 5 fields so that the
        result can be reshaped to ``[size, 5]``.

        Raises:
            ValueError: If ``size`` is larger than the number of stored
                experiences (raised by ``random.sample``).
        """
        return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])

This function builds the operations that update the parameters of our target network with those of the primary network.

In [None]:
def update_target_graph(from_scope, to_scope):
    """Build the ops that copy trainable variables from one scope to another.

    The trainable variables collected under ``from_scope`` and ``to_scope`` are
    paired positionally, so both scopes must define their variables in the
    same order.

    Args:
        from_scope: Scope name whose variables are the source of the copy.
        to_scope: Scope name whose variables are overwritten.

    Returns:
        A list of assign ops, one per variable pair; running them performs the copy.
    """
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [target.assign(source) for source, target in zip(source_vars, target_vars)]

### Training the network

In [None]:
# Build a fresh graph holding the primary ("main") and the target network.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, env.action_space_size, learning_rate, "main")
targetQN = Qnetwork(h_size, env.action_space_size, learning_rate, "target")

# Created after both networks so that it covers all of their variables.
init = tf.global_variables_initializer()

# Make sure the output directories exist before anything is written to them.
for output_dir in (summary_path, model_path):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

saver = tf.train.Saver()
trainables = tf.trainable_variables()

# Ops that copy the weights of the main network into the target network.
update_target_ops = update_target_graph("main", "target")

# Replay memory shared across all episodes.
myBuffer = experience_buffer()

# Epsilon starts at startE and is lowered by stepDrop per training-phase step.
e = startE
stepDrop = (startE - endE)/anneling_steps

# Per-episode statistics, recorded losses and the global step counter.
episode_lengths, episode_rewards, losses = [], [], []
total_steps = 0

# Run the agent for `num_episodes` episodes, training the main network when
# `train_model` is set. The environment is closed in `finally` so the Unity
# process is released even if the loop raises or is interrupted.
try:
    with tf.Session() as sess:
        sess.run(init)
        summary_writer = tf.summary.FileWriter(summary_path)
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            # Fail with a clear message instead of an AttributeError on None.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError("No checkpoint found in " + model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(num_episodes):
            episodeBuffer = experience_buffer()
            observations, _ = env.reset()
            # The network input stacks three frames along the channel axis;
            # at episode start the first frame is simply repeated three times.
            observation = np.concatenate([observations[0], observations[0], observations[0]], axis=2)
            done = False
            episode_reward = 0
            episode_steps = 0
            while not done:
                episode_steps += 1
                # Epsilon-greedy action selection: act randomly with probability e
                # (always during the pre-training phase), otherwise act greedily
                # with respect to the main Q-network.
                if (np.random.rand(1) < e or total_steps < pre_train_steps) and train_model:
                    action = np.random.randint(0, env.action_space_size)
                else:
                    action = sess.run(mainQN.predict,
                                      feed_dict={mainQN.observation_input: [observation]})[0]
                # When only evaluating, still take a random action 10% of the time.
                if not train_model and np.random.rand(1) < 0.1:
                    action = np.random.randint(0, env.action_space_size)
                # NOTE(review): the second argument is a list of 49 zeros; its
                # meaning is defined by the environment -- confirm against unityenv.
                observations_1, _, reward, done = env.step(action, np.zeros([49]).tolist())
                # Slide the frame stack: drop the oldest channel, append the new frame.
                observation_1 = observation[:, :, 1:]
                observation_1 = np.concatenate([observation_1, observations_1[0]], axis=2)
                total_steps += 1

                # Save the experience to our episode buffer.
                episodeBuffer.add(np.reshape(np.array([observation,action,reward,observation_1,done]),[1,5]))

                if total_steps > pre_train_steps and train_model:
                    # Periodically copy the main network's weights into the target network.
                    if total_steps % 1000 == 0:
                        sess.run(update_target_ops)

                    # Anneal epsilon linearly towards endE.
                    if e > endE:
                        e -= stepDrop

                    # The replay buffer is only filled at the end of each episode, so
                    # skip the training step until it holds at least one full batch
                    # (sampling more items than are stored would raise ValueError).
                    if total_steps % update_freq == 0 and len(myBuffer.buffer) >= batch_size:
                        # Get a random batch of experiences.
                        trainBatch = myBuffer.sample(batch_size)
                        next_observations = np.stack(trainBatch[:,3], axis=0)
                        # Double-DQN target: the main network chooses the next action,
                        # the target network supplies that action's value.
                        Q1 = sess.run(mainQN.predict,
                                      feed_dict={mainQN.observation_input: next_observations})
                        Q2 = sess.run(targetQN.q_out,
                                      feed_dict={targetQN.observation_input: next_observations})
                        # 0 for terminal transitions (no bootstrapping), 1 otherwise.
                        end_multiplier = -(trainBatch[:,4] - 1)
                        doubleQ = Q2[range(batch_size),Q1]
                        targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                        # Update the network with our target values.
                        _, q_loss = sess.run([mainQN.update, mainQN.loss],
                            feed_dict={mainQN.observation_input:np.stack(trainBatch[:,0], axis=0),
                                       mainQN.targetQ:targetQ,
                                       mainQN.actions:trainBatch[:,1]})
                        losses.append(q_loss)
                episode_reward += reward
                observation = observation_1

            # Move the finished episode's experiences into the shared replay buffer.
            myBuffer.add(episodeBuffer.buffer)
            episode_lengths.append(episode_steps)
            episode_rewards.append(episode_reward)
            # Periodically save the model and summary statistics
            if i % 1000 == 0 and i != 0:
                saver.save(sess, model_path+'/model-'+str(i)+'.cptk')
                print("Saved Model")
            if i % 50 == 0 and i != 0:
                summary = tf.Summary()
                summary.value.add(tag='Info/Reward', simple_value=float(np.mean(episode_rewards[-50:])))
                # No loss exists before the first training step; skip the tag
                # rather than logging the NaN that np.mean([]) would produce.
                if losses:
                    summary.value.add(tag='Info/Q Loss', simple_value=float(np.mean(losses[-50:])))
                summary.value.add(tag='Info/Epsilon', simple_value=float(e))
                summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(episode_lengths[-50:])))
                summary_writer.add_summary(summary, i)
                summary_writer.flush()
                print ("Mean Reward: {}".format(np.mean(episode_rewards[-50:])))
finally:
    env.close()

In [None]:
# Explicitly close the Unity environment, e.g. if the training cell was never
# run or did not finish.
env.close()