# Importing Libraries

In [41]:
import tensorflow as tf
import numpy as np
from vizdoom import *

import random
import time
from skimage import transform
from skimage.color import rgb2gray

from collections import deque
import matplotlib.pyplot as plt

import warnings

# Create a game environment and test

The game has three possible actions

1. Move left
2. Move right
3. Shoot

The agent is rewarded points for each action and state

1. For shooting the monster - +101 points
2. For missing - -5 points
3. For being alive - -1 point

This reward scheme pushes the agent to kill the monster as quickly as possible without wasting ammo.
The reward system is preloaded in the game

In [42]:
def create_environment(config_path="basic.cfg", scenario_path="basic.wad"):
    """Create and initialise a ViZDoom game together with its action set.

    Args:
        config_path: Path of the ViZDoom config file to load. Defaults to
            ``"basic.cfg"``; pass another config for a different scenario.
        scenario_path: Path of the scenario ``.wad`` file. Defaults to
            ``"basic.wad"``; pass another wad for a different scenario.

    Returns:
        tuple: ``(game, possible_actions)``. ``game`` is the ``DoomGame`` on
        which ``init()`` has already been called; ``possible_actions`` is the
        list of one-hot button vectors ``[left, right, shoot]``.
    """
    game = DoomGame()
    game.load_config(config_path)
    game.set_doom_scenario_path(scenario_path)
    game.init()

    # One-hot button vectors.
    # NOTE(review): assumes the loaded config declares exactly three buttons
    # in the order left / right / shoot - confirm when using another config.
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]

    return game, possible_actions

def test_environment(episodes=10):
    """Smoke-test the environment by playing episodes with random actions.

    A game is created through ``create_environment()`` so the setup lives in
    one place. For every step the chosen action and the reward returned by
    ``make_action`` are printed; after each episode the total reward is
    printed.

    Args:
        episodes: Number of episodes to play. Defaults to 10.
    """
    game, actions = create_environment()

    try:
        for _ in range(episodes):
            game.new_episode()

            while not game.is_episode_finished():
                # game.get_state().screen_buffer / .game_variables expose the
                # current frame and game variables; they are not needed for
                # this random-play check.
                action = random.choice(actions)
                print(action)
                reward = game.make_action(action)
                print("reward: ",reward)
                time.sleep(0.02)  # slow the loop down so the game can be watched

            print("Result: ",game.get_total_reward())
            time.sleep(2)  # pause between episodes
    finally:
        # Always shut the game down, even if a step raised.
        game.close()

In [43]:
# Uncomment the line below to make a test environment and check if game works fine

# test_environment()

In [44]:
# Start the game used by the following cells; `possible_actions` holds the
# one-hot button vectors returned by create_environment().
game, possible_actions = create_environment()

In [45]:
# Preprocess the game frame

def preprocess_frame(frame):
    """Turn a raw game frame into an 84x84 grayscale image.

    Steps: move axis 0 to the end (the frame is expected channels-first, so
    this yields Height x Width x Channels), convert to grayscale, crop the
    borders, divide by 255 and resize to 84x84.

    NOTE(review): ``rgb2gray`` may already return floats in [0, 1] for uint8
    input, in which case the division by 255 shrinks the range a second
    time. Confirm before changing it - any saved checkpoint was trained with
    this scaling.
    """
    # Channels last, as expected by the grayscale conversion
    hwc_frame = np.moveaxis(frame, 0, -1)

    # Colour carries no extra information here, so one channel is cheaper
    gray = rgb2gray(hwc_frame)

    # Drop 30 rows at the top, 10 at the bottom and 30 columns on each side
    trimmed = gray[30: -10, 30: -30]

    scaled = trimmed/ 255.

    return transform.resize(scaled, [84, 84])

In [46]:
stack_size = 4   # Stacking 4 frames to give a picture of motion to the model

# Initial (all-zero) frame stack. `np.int` was only an alias of the builtin
# `int` and no longer exists in recent NumPy releases, so `int` is used.
stacked_frames = deque([np.zeros((84, 84), dtype = int) for i in range(stack_size)], maxlen = stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess ``state`` and combine it with the most recent frames.

    Args:
        stacked_frames: deque holding the previous preprocessed frames.
        state: Raw frame, passed to ``preprocess_frame``.
        is_new_episode: When true, a fresh deque is created and filled with
            ``stack_size`` copies of the new frame (the deque passed in is
            left untouched). Otherwise the new frame is appended to the
            deque passed in, which drops its oldest frame.

    Returns:
        tuple: ``(stacked_state, stacked_frames)`` where ``stacked_state`` is
        the frames stacked along axis 2 and ``stacked_frames`` is the deque
        to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # First frame of an episode: repeat it so the stack is full.
        stacked_frames = deque([frame] * stack_size, maxlen = stack_size)
    else:
        # maxlen makes the append push the oldest frame out.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis = 2)

    return stacked_state, stacked_frames

In [47]:
# --- Model input / output sizes ---
state_size = [84, 84, 4]  # Size of input to the model: 84x84 frames, 4 of them stacked

# Number of buttons reported by the game; used as the width of the network output
action_size = game.get_available_buttons_size()

# --- Optimisation ---
learning_rate = 0.0002

total_episodes = 500  # Total number of training games
max_steps = 100  # Maximum steps to be taken in a game
batch_size = 64  # Batch size input to the model

# --- Epsilon-greedy exploration ---
epsilon_start = 1.  # max exploration rate
epsilon_end = 0.01  # min exploration rate
decay_rate = 0.0001  # exponential decay rate; the training loop advances the decay once per step, not per game

gamma = 0.95   # discount factor

# --- Replay memory ---
pretrain_length = batch_size  # Number of random experiences stored before training starts
memory_size = 1000000  # Maximum memory size

training = True  # Boolean value to train or not
render = True  # Boolean value to see the agent train (NOTE(review): not read by any cell visible here)

In [48]:
# stacked_frame [84, 84, 4] ---> conv1 ----> [20, 20, 32] (batch_norm + elu) ----> conv2 ----> [9, 9, 64] (batch_norm + elu)
# -----> conv3 -----> [3, 3, 128] (batch_norm + elu) ----> Flatten -----> [1152,] -----> Dense ----> [512,] (elu)
# ------> Dense ----> [self.action,]

class DQNetwork:
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Architecture: three ``conv2d -> batch_normalization -> elu`` stages
    (32, 64 and 128 filters), a flatten, a 512-unit ELU dense layer and a
    linear output layer with one Q-value per action.

    Attributes created on the instance:
        inputs_:   float32 placeholder ``[None, *state_size]`` for states.
        actions_:  float32 placeholder ``[None, action_size]`` for one-hot
                   actions.
        target_Q:  float32 placeholder ``[None]`` for the training targets.
        output:    Q-values for every action.
        Q:         Q-value of the action selected by ``actions_``.
        loss:      mean squared error between ``target_Q`` and ``Q``.
        optimizer: Adam training op minimising ``loss``.

    Args:
        state_size: Shape of a single state (without the batch dimension).
        action_size: Number of available actions.
        learning_rate: Learning rate passed to the Adam optimizer.
        name: Variable scope under which all variables are created.
    """

    def __init__(self, state_size, action_size, learning_rate, name = 'DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Placeholders for the input states and the one-hot actions.
            # The action width follows `action_size` rather than a literal 3
            # so it always matches the width of the output layer.
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name = 'inputs')
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name = 'actions')

            # Placeholder of target variable
            self.target_Q = tf.placeholder(tf.float32, [None], name = 'targets')

            # --- Convolution stage 1 ---
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                          filters = 32,
                                          kernel_size = [8,8],
                                          strides = [4,4],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv1")

            # NOTE(review): `training = True` is hard-coded on every
            # batch-norm layer, so batch statistics are used on each forward
            # pass, including action selection. Presumably intentional;
            # confirm before changing.
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm1')

            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name = "conv1_out")

            # --- Convolution stage 2 ---
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                          filters = 64,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv2")

            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm2')

            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name = "conv2_out")

            # --- Convolution stage 3 ---
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                          filters = 128,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv3")

            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm3')

            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name = "conv3_out")

            # --- Fully connected head ---
            self.flatten = tf.layers.flatten(self.conv3_out)

            self.fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                      kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                      name = "fc1")

            # One linear output per action. The layer is left unnamed on
            # purpose so its variable names stay the same as before.
            self.output = tf.layers.dense(inputs = self.fc,
                                          kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                          units = action_size,
                                          activation = None)

            # The one-hot mask keeps only the Q-value of the action taken.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis = 1)

            # Mean squared loss function
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            # Adam optimizer
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [49]:
# reset the tensorflow graph
tf.reset_default_graph()

# Initialize network
# NOTE(review): this rebinds the name `DQNetwork` from the class to the
# instance. Re-running this cell without first re-running the class
# definition cell calls the instance and fails; the later cells rely on the
# instance being available under this name.
DQNetwork = DQNetwork(state_size, action_size, learning_rate)





# Experience Replay

The `Memory` class stores experiences as tuples of (stacked frames, action, reward, next state, done flag indicating whether the game has finished).

In [50]:
class Memory():
    """Bounded replay buffer for experiences.

    Experiences are kept in a ``deque`` limited to ``max_size`` entries, so
    adding to a full buffer discards the oldest entry.
    """

    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        """Store one experience at the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn at random.

        Positions are drawn without replacement, so ``np.random.choice``
        raises ``ValueError`` when ``batch_size`` exceeds the number of
        stored experiences.
        """
        picked = np.random.choice(len(self.buffer), size = batch_size, replace = False)

        return [self.buffer[slot] for slot in picked]

Before training, fill the memory with experiences gathered by taking random actions, starting from the first frame of a new episode.

In [51]:
# Replay buffer, pre-filled with `pretrain_length` random-play experiences
memory = Memory(max_size = memory_size)

game.new_episode()

for i in range(pretrain_length):
    # Only the very first iteration has no current state yet: build it from
    # the opening frame of the episode.
    if i == 0:
        state, stacked_frames = stack_frames(stacked_frames,
                                             game.get_state().screen_buffer,
                                             True)

    # Take a random action and observe the outcome
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if not done:
        # Episode continues: push the new frame onto the stack and move on
        next_state, stacked_frames = stack_frames(stacked_frames,
                                                  game.get_state().screen_buffer,
                                                  False)
        memory.add((state, action, reward, next_state, done))
        state = next_state
        continue

    # Episode over: all zeros mark the final state
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state, done))

    # Start a new episode and rebuild the stack from its first frame
    game.new_episode()
    state, stacked_frames = stack_frames(stacked_frames,
                                         game.get_state().screen_buffer,
                                         True)

## Tensorboard Config

In [52]:
# Writer for TensorBoard event files.
# NOTE(review): the log directory is an absolute path at the filesystem
# root; confirm that this location is intended and writable.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

# Track the training loss of the network
tf.summary.scalar('loss', DQNetwork.loss)

# Single op that evaluates every summary registered so far
write_op = tf.summary.merge_all()

In [53]:
def predict_action(epsilon_start, epsilon_end, decay_rate, decay_step, state, actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    The exploration probability is
    ``epsilon_end + (epsilon_start - epsilon_end) * exp(-decay_rate * decay_step)``.
    With that probability a random action is returned; otherwise the action
    with the highest predicted Q-value is returned.

    Args:
        epsilon_start: Exploration probability at ``decay_step == 0``.
        epsilon_end: Value the exploration probability decays towards.
        decay_rate: Exponential decay rate.
        decay_step: Number of decay steps taken so far.
        state: Current stacked state; only used when exploiting.
        actions: List of candidate actions. The chosen action always comes
            from this list (previously the argument was ignored and the
            global ``possible_actions`` was used instead).

    Returns:
        tuple: ``(action, explore_probability)``.

    Note:
        The exploit branch relies on the globals ``sess`` and ``DQNetwork``
        being defined by the calling cell.
    """
    exp_exp_tradeoff = np.random.rand()
    explore_probability = epsilon_end + (epsilon_start - epsilon_end)*np.exp(-decay_rate*decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Explore: random action
        action = random.choice(actions)
    else:
        # Exploit: add a batch dimension, query the network, take the best action
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        action = actions[int(np.argmax(Qs))]

    return action, explore_probability

# Training

In [54]:
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        #Training from scratch: comment the line below and uncomment the line next to it
        saver.restore(sess, "./models/model.ckpt")

        # sess.run(tf.global_variables_initializer())

        decay_step = 0

        # `loss` is printed when an episode ends, which can happen before the
        # first optimisation step of this run; give it a value up front.
        loss = float("nan")

        # NOTE(review): create_environment() has already called init() on
        # this game, so this call is presumably redundant - confirm.
        game.init()

        for episode in range(1, total_episodes+1):
            step = 0

            episode_rewards = []

            game.new_episode()
            state = game.get_state().screen_buffer

            state, stacked_frames = stack_frames(stacked_frames, state,
                                                 True)

            # Episodes that reach max_steps without finishing are cut off
            # here and print no summary line.
            while step < max_steps:
                step += 1

                decay_step += 1  # decaying exploration for each step in the game

                action, explore_probability = predict_action(epsilon_start,
                                                             epsilon_end,
                                                             decay_rate,
                                                             decay_step,
                                                             state,
                                                             possible_actions)

                reward = game.make_action(action)

                done = game.is_episode_finished()
                episode_rewards.append(reward)

                if done:
                    # All zeros mark the final state, as in the memory
                    # pre-fill. No game frame exists any more, so nothing is
                    # pushed through the frame preprocessing.
                    next_state = np.zeros(state.shape)

                    step = max_steps  # ending the current episode

                    total_rewards = np.sum(episode_rewards)

                    print("Episode: {}".format(episode),
                          "Total reward: {}".format(total_rewards),
                          "Training loss: {:.4f}".format(loss),
                          "Explore Prob: {:.4f}".format(explore_probability))

                    memory.add((state, action, reward, next_state, done))

                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                              next_state,
                                                              False)

                    memory.add((state, action, reward, next_state, done))

                    state = next_state

                # sample a batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin = 3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin = 3)
                dones_mb = np.array([each[4] for each in batch])

                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Q-learning targets: the reward alone for terminal
                # transitions, otherwise reward + gamma * max_a' Q(s', a').
                # The maximum is cast to float64 so the arithmetic stays in
                # the same precision as the rewards.
                max_next_Qs = np.max(Qs_next_state, axis = 1).astype(np.float64)
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma*max_next_Qs)

                # One run performs the optimisation step and produces the
                # loss summary, so the logged loss is the one reported above
                # and no second forward pass is needed.
                loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                            feed_dict = {DQNetwork.inputs_: states_mb,
                                                         DQNetwork.actions_: actions_mb,
                                                         DQNetwork.target_Q: targets_mb})

                writer.add_summary(summary, episode)
                writer.flush()

            # Checkpoint every second episode
            if episode%2==0:
                save_path = saver.save(sess, './models/model.ckpt')
                print("Model Saved")
                

                

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Episode: 1 Total reward: 95.0 Training loss: 2.8436 Explore Prob: 0.9994
Episode: 2 Total reward: 93.0 Training loss: 2.5919 Explore Prob: 0.9986
Model Saved
Episode: 3 Total reward: 90.0 Training loss: 8.6603 Explore Prob: 0.9975
Model Saved
Episode: 5 Total reward: 95.0 Training loss: 2.5018 Explore Prob: 0.9871
Model Saved
Episode: 7 Total reward: 92.0 Training loss: 1.0611 Explore Prob: 0.9765
Episode: 8 Total reward: 71.0 Training loss: 11.4534 Explore Prob: 0.9741
Model Saved
Episode: 10 Total reward: 88.0 Training loss: 1.5417 Explore Prob: 0.9633
Model Saved
Episode: 11 Total reward: 3.0 Training loss: 3.4974 Explore Prob: 0.9559
Episode: 12 Total reward: 35.0 Training loss: 3.8786 Explore Prob: 0.9506
Model Saved
Episode: 13 Total reward: 95.0 Training loss: 3.0608 Explore Prob: 0.9500
Model Saved
Episode: 16 Total reward: 92.0 Training loss: 1.0314 Explore Prob: 0.9306
Model Saved
Model Saved
Episode: 20 Total rewa

Episode: 118 Total reward: 94.0 Training loss: 4.8631 Explore Prob: 0.6494
Model Saved
Episode: 119 Total reward: 92.0 Training loss: 9.4872 Explore Prob: 0.6489
Episode: 120 Total reward: 95.0 Training loss: 3.4232 Explore Prob: 0.6485
Model Saved
Episode: 121 Total reward: 94.0 Training loss: 5.7635 Explore Prob: 0.6480
Episode: 122 Total reward: 95.0 Training loss: 4.2862 Explore Prob: 0.6477
Model Saved
Episode: 123 Total reward: 95.0 Training loss: 3.7789 Explore Prob: 0.6473
Episode: 124 Total reward: 95.0 Training loss: 5.2755 Explore Prob: 0.6469
Model Saved
Episode: 125 Total reward: 94.0 Training loss: 7.9627 Explore Prob: 0.6464
Episode: 126 Total reward: 95.0 Training loss: 3.9292 Explore Prob: 0.6461
Model Saved
Episode: 127 Total reward: 94.0 Training loss: 7.9246 Explore Prob: 0.6456
Episode: 128 Total reward: 95.0 Training loss: 2.2060 Explore Prob: 0.6452
Model Saved
Episode: 129 Total reward: 40.0 Training loss: 2.1900 Explore Prob: 0.6420
Episode: 130 Total reward: 9

Episode: 222 Total reward: 71.0 Training loss: 8.9301 Explore Prob: 0.5035
Model Saved
Episode: 223 Total reward: 44.0 Training loss: 5.1608 Explore Prob: 0.5012
Episode: 224 Total reward: 95.0 Training loss: 4.4062 Explore Prob: 0.5009
Model Saved
Episode: 225 Total reward: 70.0 Training loss: 5.2935 Explore Prob: 0.4997
Episode: 226 Total reward: 95.0 Training loss: 7.1209 Explore Prob: 0.4994
Model Saved
Episode: 227 Total reward: 95.0 Training loss: 4.4838 Explore Prob: 0.4991
Episode: 228 Total reward: 28.0 Training loss: 3.1025 Explore Prob: 0.4960
Model Saved
Episode: 229 Total reward: 95.0 Training loss: 5.6088 Explore Prob: 0.4957
Episode: 230 Total reward: 75.0 Training loss: 2.4912 Explore Prob: 0.4947
Model Saved
Episode: 231 Total reward: 92.0 Training loss: 5.5611 Explore Prob: 0.4943
Episode: 232 Total reward: 92.0 Training loss: 3.7123 Explore Prob: 0.4938
Model Saved
Episode: 233 Total reward: 95.0 Training loss: 7.3406 Explore Prob: 0.4935
Episode: 234 Total reward: 1

Episode: 324 Total reward: 65.0 Training loss: 4.1076 Explore Prob: 0.4043
Model Saved
Episode: 325 Total reward: 62.0 Training loss: 7.0483 Explore Prob: 0.4029
Episode: 326 Total reward: 94.0 Training loss: 5.6312 Explore Prob: 0.4026
Model Saved
Episode: 327 Total reward: 95.0 Training loss: 5.5664 Explore Prob: 0.4024
Episode: 328 Total reward: 95.0 Training loss: 3.6795 Explore Prob: 0.4022
Model Saved
Episode: 329 Total reward: 95.0 Training loss: 6.5039 Explore Prob: 0.4019
Episode: 330 Total reward: 95.0 Training loss: 7.9480 Explore Prob: 0.4017
Model Saved
Episode: 331 Total reward: 57.0 Training loss: 4.6162 Explore Prob: 0.4004
Episode: 332 Total reward: 50.0 Training loss: 5.1240 Explore Prob: 0.3988
Model Saved
Episode: 333 Total reward: 95.0 Training loss: 5.2183 Explore Prob: 0.3985
Episode: 334 Total reward: 95.0 Training loss: 4.2883 Explore Prob: 0.3983
Model Saved
Episode: 335 Total reward: 95.0 Training loss: 4.1718 Explore Prob: 0.3981
Episode: 336 Total reward: 7

Episode: 426 Total reward: 60.0 Training loss: 7.9517 Explore Prob: 0.3391
Model Saved
Episode: 427 Total reward: 95.0 Training loss: 2.3418 Explore Prob: 0.3389
Episode: 428 Total reward: 42.0 Training loss: 3.3627 Explore Prob: 0.3373
Model Saved
Episode: 429 Total reward: 95.0 Training loss: 2.9949 Explore Prob: 0.3371
Episode: 430 Total reward: 31.0 Training loss: 6.9399 Explore Prob: 0.3353
Model Saved
Episode: 431 Total reward: 72.0 Training loss: 10.6187 Explore Prob: 0.3345
Episode: 432 Total reward: 36.0 Training loss: 2.2542 Explore Prob: 0.3328
Model Saved
Episode: 433 Total reward: 37.0 Training loss: 3.2032 Explore Prob: 0.3312
Episode: 434 Total reward: 93.0 Training loss: 3.0722 Explore Prob: 0.3309
Model Saved
Episode: 435 Total reward: 57.0 Training loss: 3.9352 Explore Prob: 0.3297
Episode: 436 Total reward: 75.0 Training loss: 4.2225 Explore Prob: 0.3290
Model Saved
Episode: 437 Total reward: 95.0 Training loss: 5.6772 Explore Prob: 0.3288
Episode: 438 Total reward: 

# Testing - Agent plays the game

In [58]:
test_episodes = 100
average_score = 0  # running sum of episode scores; divided by test_episodes when printed

with tf.Session() as sess:
    # NOTE(review): this rebinds `game`; an instance created by an earlier
    # cell is not closed here - confirm whether it should be.
    game, possible_actions = create_environment()

    try:
        saver.restore(sess, "./models/model.ckpt")

        # NOTE(review): create_environment() has already called init(), so
        # this call is presumably redundant - confirm.
        game.init()

        for i in range(test_episodes):
            game.new_episode()

            # Start every episode with a fresh stack so that frames from the
            # previous episode do not leak into the first states.
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames,
                                                 state, True)

            while not game.is_episode_finished():
                # Always act greedily with respect to the learned Q-values
                Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})

                choice = np.argmax(Qs)
                action = possible_actions[choice]

                game.make_action(action)

                if game.is_episode_finished():
                    score = game.get_total_reward()
                    print("Game Score: ", score)
                    average_score += score
                    break

                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames,
                                                          next_state, False)

                state = next_state

        print("Average Score: ", average_score/test_episodes)
    finally:
        # Shut the game down even if restoring the model or a step failed.
        game.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Game Score:  57.0
Game Score:  95.0
Game Score:  95.0
Game Score:  95.0
Game Score:  95.0
Game Score:  76.0
Game Score:  62.0
Game Score:  88.0
Game Score:  95.0
Game Score:  93.0
Game Score:  95.0
Game Score:  93.0
Game Score:  95.0
Game Score:  88.0
Game Score:  92.0
Game Score:  93.0
Game Score:  94.0
Game Score:  93.0
Game Score:  58.0
Game Score:  95.0
Game Score:  88.0
Game Score:  93.0
Game Score:  94.0
Game Score:  86.0
Game Score:  95.0
Game Score:  95.0
Game Score:  95.0
Game Score:  95.0
Game Score:  95.0
Game Score:  86.0
Game Score:  92.0
Game Score:  93.0
Game Score:  95.0
Game Score:  61.0
Game Score:  95.0
Game Score:  59.0
Game Score:  95.0
Game Score:  93.0
Game Score:  95.0
Game Score:  78.0
Game Score:  95.0
Game Score:  69.0
Game Score:  95.0
Game Score:  95.0
Game Score:  80.0
Game Score:  78.0
Game Score:  74.0
Game Score:  95.0
Game Score:  78.0
Game Score:  92.0
Game Score:  54.0
Game Score:  95.0
Ga