# Simple Reinforcement Learning with Tensorflow Part 4: Deep Q-Networks and Beyond

In this IPython notebook I implement a Deep Q-Network using both Double DQN and Dueling DQN. The agent learns to kill creatures in VizDoom.

To learn more, read here: https://medium.com/p/8438a3e2b8df

For more reinforcement learning tutorials, see:
https://github.com/awjuliani/DeepRL-Agents

In [12]:
#To use the old print function
%autocall 1 


import gym

import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim
from helper import make_gif
import matplotlib.pyplot as plt
import scipy.misc
import os
%matplotlib inline

Automatic calling is: Smart


### Load the game environment

Feel free to change the Vizdoom map!

In [13]:
from vizdoom import *
a_size = 3  # Number of discrete actions: turn left, turn right, or fire.

# ---- Doom environment set-up ----
game = DoomGame()

# Scenario, config file and map that define the task posed to the agent.
game.set_doom_scenario_path("defend_the_center.wad")
game.load_config("defend_the_center.cfg")
game.set_doom_map("map01")

# Rendering options: small grayscale frames, weapon visible, everything else off.
game.set_screen_resolution(ScreenResolution.RES_160X120)
game.set_screen_format(ScreenFormat.GRAY8)
game.set_render_hud(False)
game.set_render_crosshair(False)
game.set_render_weapon(True)
game.set_render_decals(False)
game.set_render_particles(False)

# Register the buttons in the same order as the rows of actions_list below.
for _button in (Button.TURN_LEFT, Button.TURN_RIGHT, Button.ATTACK):
    game.add_available_button(_button)

# One boolean row per action: row k presses only the k-th registered button.
actions_list = np.eye(a_size, dtype=bool).tolist()
print(actions_list)

# Game variables exposed alongside the screen buffer.
for _variable in (GameVariable.AMMO2, GameVariable.POSITION_X, GameVariable.POSITION_Y):
    game.add_available_game_variable(_variable)

# Episode timing, head-less execution and control mode.
game.set_episode_timeout(300)
game.set_episode_start_time(10)
game.set_window_visible(False)
game.set_sound_enabled(False)
# A per-step living reward is left disabled: game.set_living_reward(-1)
game.set_mode(Mode.PLAYER)
game.init()

# ---- End of Doom set-up ----

# The training code refers to the game through the name `env`.
env = game


[[True, False, False], [False, True, False], [False, False, True]]


The cell above configures the VizDoom `defend_the_center` scenario used in this notebook. The agent has three actions — turn left, turn right, and fire — and the printed list shows the one-hot button encoding of each action. Frames are rendered in grayscale at 160x120, and the episode timeout is set to 300.

### Implementing the network itself

In [14]:
class Qnetwork():
    """Dueling Q-network over flattened 84x84 single-channel frames.

    Builds the TensorFlow graph for the forward pass (``Qout``, ``predict``)
    and for the training step (``loss``, ``updateModel``). The number of
    actions is read from the module-level ``a_size``.

    Args:
        h_size: number of output channels of the last convolution. It is
            split evenly between the advantage and value streams, so it
            must be even.
    """
    def __init__(self,h_size):
        # Each input row is one flattened frame (84 * 84 = 7056 values),
        # reshaped back into an image before the convolutions.
        self.scalarInput = tf.placeholder(dtype=tf.float32, shape=[None, 7056])
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 1])

        def conv(inputs, num_outputs, kernel, stride):
            # Bias-free VALID convolution with a square kernel and stride.
            return slim.conv2d(
                inputs=inputs, num_outputs=num_outputs,
                kernel_size=[kernel, kernel], stride=[stride, stride],
                padding='VALID', biases_initializer=None)

        # With VALID padding the spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1.
        self.conv1 = conv(self.imageIn, 32, 8, 4)
        self.conv2 = conv(self.conv1, 64, 4, 2)
        self.conv3 = conv(self.conv2, 64, 3, 1)
        self.conv4 = conv(self.conv3, h_size, 7, 1)

        # Split the channels of the last convolution in two halves: the first
        # feeds the advantage stream, the second the value stream.
        self.streamAC, self.streamVC = tf.split(self.conv4, num_or_size_splits=2, axis=3)
        self.streamA = slim.flatten(self.streamAC)
        self.streamV = slim.flatten(self.streamVC)
        xavier_init = tf.contrib.layers.xavier_initializer()
        self.AW = tf.Variable(xavier_init([h_size//2, a_size]))
        self.VW = tf.Variable(xavier_init([h_size//2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        # Q = V + (A - mean(A)); the mean is taken over the actions.
        advantage_mean = tf.reduce_mean(self.Advantage, axis=1, keep_dims=True)
        self.Qout = self.Value + (self.Advantage - advantage_mean)
        self.predict = tf.argmax(self.Qout, 1)

        # Training inputs: the target Q-value and the action taken, per sample.
        self.targetQ = tf.placeholder(dtype=tf.float32, shape=[None])
        self.actions = tf.placeholder(dtype=tf.int32, shape=[None])
        self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)

        # Keep only the Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(self.Qout * self.actions_onehot, axis=1)

        # Loss is the mean squared difference between target and predicted Q.
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)

### Experience Replay

This class allows us to store experiences and sample them randomly to train the network.

In [15]:
class experience_buffer():
    """Fixed-capacity FIFO store of experiences with uniform random sampling.

    Args:
        buffer_size: maximum number of experiences kept. When the capacity
            is exceeded, the oldest experiences are dropped first.
    """
    def __init__(self, buffer_size = 50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append a sequence of experiences, evicting the oldest on overflow.

        Args:
            experience: iterable of experiences (e.g. a list of rows or a 2-D
                array whose rows are experiences).
        """
        self.buffer.extend(experience)
        # Trim from the front AFTER extending so the capacity also holds when
        # a single batch is longer than the whole buffer.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self,size):
        """Return `size` distinct experiences as an array of shape [size, 5].

        Raises:
            ValueError: if `size` is larger than the number of stored
                experiences (raised by `random.sample`).
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

This is a simple function to resize our game frames.

In [16]:
def processState(states):
    """Flatten `states` into a 1-D array of 21168 values.

    Raises:
        ValueError: if `states` does not contain exactly 21168 elements.
    """
    flat_length = 21168  # 84 * 84 * 3
    return np.reshape(states, (flat_length,))

# Processes Doom screen image to produce cropped and resized image.
def process_frame(frame):
    """Crop a screen frame, resize it to 84x84 and flatten it.

    Args:
        frame: 2-D image array (rows, columns); 10 rows are removed from the
            top and bottom and 30 columns from the left and right.

    Returns:
        1-D float array of 84 * 84 = 7056 values, i.e. the resized pixels
        divided by 255.
    """
    cropped = frame[10:-10, 30:-30]
    # scipy.misc.imresize was removed in SciPy 1.3. Use it when the installed
    # SciPy still has it, so results are unchanged on old installs.
    legacy_resize = getattr(getattr(scipy, "misc", None), "imresize", None)
    if legacy_resize is not None:
        resized = legacy_resize(cropped, [84, 84])
    else:
        # Fallback: bilinear (order=1) resize with scipy.ndimage. This is an
        # approximation of imresize, not a bit-exact replacement.
        # NOTE(review): assumes 8-bit pixel values, as with the GRAY8 format.
        from scipy import ndimage
        rows, cols = cropped.shape[:2]
        factors = (84.0 / rows, 84.0 / cols) + (1,) * (cropped.ndim - 2)
        resized = ndimage.zoom(cropped, factors, order=1)
    return np.reshape(resized, [np.prod(resized.shape)]) / 255.0
    

These functions allow us to update the parameters of our target network with those of the primary network.

In [17]:
def updateTargetGraph(tfVars,tau):
    """Build the ops that move the target network toward the main network.

    Args:
        tfVars: all trainable variables; the first half must belong to the
            main network and the second half, in the same order, to the
            target network.
        tau: update rate. Each target variable becomes
            tau * main + (1 - tau) * old_target.

    Returns:
        A list with one assign op per main/target variable pair.
    """
    half = len(tfVars) // 2
    main_vars = tfVars[:half]
    target_vars = tfVars[half:]
    # zip stops at the shorter sequence, so a trailing unpaired variable
    # (odd-length input) is ignored.
    return [
        target.assign((main.value()*tau) + ((1-tau)*target.value()))
        for main, target in zip(main_vars, target_vars)
    ]

def updateTarget(op_holder,sess):
    """Run every op in `op_holder`, one at a time and in order, on `sess`."""
    for assign_op in op_holder:
        sess.run(assign_op)

### Training the network

Setting all the training parameters

In [18]:
# Optimisation
batch_size = 32            # Experiences sampled for each training step.
update_freq = 4            # A training step is run every `update_freq` environment steps.
y = 0.99                   # Discount factor applied to the target Q-values.
tau = 0.001                # Rate at which the target network moves toward the primary network.
h_size = 512               # Channels of the last conv layer, split between Advantage and Value streams.

# Exploration (epsilon-greedy)
startE = 1                 # Initial probability of taking a random action.
endE = 0.1                 # Final probability of taking a random action.
annealing_steps = 10000.0  # Training steps over which startE is reduced to endE.
pre_train_steps = 10000    # Steps of purely random actions before training begins.

# Run length
num_episodes = 10000       # Episodes of the game environment to train on.
max_epLength = 300         # Maximum number of steps allowed in one episode.

# Checkpoints and output
load_model = False         # Whether to restore a saved model before training.
path = "./dddqn-VDdtc"     # Directory the model checkpoints are written to.
saveframes = True          # Whether to save episode frames as GIFs.


In [19]:

# Make sure the directory for the saved episode GIFs exists.
if saveframes == True and not os.path.exists('./DDDQN_VizDoomDtc_frames'):
    os.makedirs('./DDDQN_VizDoomDtc_frames')

# Build both networks in a fresh graph. mainQN must be created first:
# updateTargetGraph expects the main network's variables in the first half
# of the trainable variables and the target network's in the second half.
tf.reset_default_graph()
mainQN = Qnetwork(h_size)
targetQN = Qnetwork(h_size)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

#trainable variables of the Main Network and the Target Network
trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables,tau)

myBuffer = experience_buffer()

#Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE)/annealing_steps

#Lists holding the number of steps and the total reward of every episode
jList = []
rList = []
total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    sess.run(init)
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    for i in range(num_episodes):
        #Raw frames of this episode, used to create a gif
        episode_frames = []
        episodeBuffer = experience_buffer()
        #Reset environment and get first new observation
        env.new_episode()
        s = process_frame(env.get_state().screen_buffer)
        d = False
        rAll = 0
        j = 0
        #The Q-Network
        while j < max_epLength: #Cut the episode after max_epLength steps even if the game has not finished.
            j+=1
            #Choose an action greedily (with e chance of random action) from the Q-network.
            #Before pre_train_steps steps have been taken, every action is random.
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = np.random.randint(0,a_size)
            else:
                a = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:[s]})[0] #Feed through main network to predict action

            r = env.make_action(actions_list[a])
            d = env.is_episode_finished()
            if d:
                #No new frame is read once the episode is finished. The current
                #state is stored as a placeholder next state; it never influences
                #the target because end_multiplier is 0 for terminal transitions.
                s1 = s
            else:
                frame = env.get_state().screen_buffer
                episode_frames.append(frame)
                s1 = process_frame(frame)
            total_steps += 1
            #Save the experience (terminal ones included) to our episode buffer.
            #dtype=object keeps the mixed array/scalar row as five separate cells.
            episodeBuffer.add(np.reshape(np.array([s,a,r,s1,d],dtype=object),[1,5]))

            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop #epsilon annealing

                if total_steps % (update_freq) == 0:
                    #Get a random batch of experiences from the replay buffer
                    #(filled with whole episodes at the end of each episode)
                    trainBatch = myBuffer.sample(batch_size)

                    #Below we perform the Double-DQN update to the target Q-values

                    #First we calculate the best actions for state s1 in each experience of the batch using our Main Network
                    A = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,3])})

                    #Then we calculate the Q-values of s1 for every experience in the batch using our Target Network.
                    #So Q2 is a 2D-array containing a vector of Q-values for each experience in the batch
                    Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.scalarInput:np.vstack(trainBatch[:,3])})

                    #The end multiplier is 0 if the experience is an "end of game", so the target Q-value = the reward
                    end_multiplier = -(trainBatch[:,4] - 1)

                    #doubleQ holds, for each experience, the Target Network's Q-value at s1
                    #for the action A chosen by the Main Network.
                    doubleQ = Q2[range(batch_size),A]

                    #Target-Q = r + gamma*(doubleQ) for non "end-of-game" experiences. Otherwise Target-Q = reward
                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)

                    #Update the Main network with our target Q-values.
                    _ = sess.run(mainQN.updateModel, \
                        feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ, mainQN.actions:trainBatch[:,1]})

                    #Update the Target network toward the main network but slowly (with a tau rate)
                    updateTarget(targetOps,sess)

            rAll += r
            s = s1

            #Leave the loop only after the terminal transition has been stored
            #and its reward counted.
            if d:
                break

        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        #Periodically save the model and start saving gifs of the next 10 episodes.
        #This branch runs at i == 0, which also initialises the two gif variables.
        if i % 500 == 0:
            saver.save(sess,path+'/model-'+str(i)+'.ckpt')
            print("Saved Model")
            time_per_step = 0.05
            gif_cnt_down = 10

        if saveframes == True and 0 < gif_cnt_down < 11:
            images = np.array(episode_frames)
            make_gif(images,'./DDDQN_VizDoomDtc_frames/image'+str(i)+'.gif',
                     duration=len(images)*time_per_step,true_image=True,salience=False)
            gif_cnt_down -= 1

        #Every 10 episodes report the step count, mean reward of the last 10 episodes and epsilon.
        if len(rList) % 10 == 0:
            print(total_steps,np.mean(rList[-10:]), e)
    saver.save(sess,path+'/model-'+str(i)+'.ckpt')
#NOTE(review): the value printed is the total reward divided by num_episodes,
#i.e. a mean reward per episode rather than a percentage.
print("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")

KeyboardInterrupt: 

### Checking network learning

Mean reward over time

In [None]:
# Group the episode rewards into consecutive windows of 100 episodes (any
# incomplete trailing window is dropped) and plot the mean of each window.
_window = 100
_num_windows = len(rList) // _window
rMat = np.resize(np.array(rList), [_num_windows, _window])
rMean = np.average(rMat, axis=1)
plt.plot(rMean)