# Importing Libraries

In [1]:
import tensorflow as tf
import numpy as np
from vizdoom import *

import random
import time
from skimage import transform
from skimage.color import rgb2gray

from collections import deque
import matplotlib.pyplot as plt
from tqdm import tqdm
import os as os

import warnings
warnings.filterwarnings('ignore')

# WandB

This section is not necessary, comment all the code in this section.
It's only to connect tensorflow model with wandb.ai

Link: app.wandb.ai/dhruv/ddqnn-doom

In [2]:
import wandb
from wandb.tensorflow import WandbHook

wandb.init(project = 'ddqnn-doom', sync_tensorboard = True)


wandb.config.epochs = 2000

flags = tf.app.flags
flags.DEFINE_string('data_dir', '/tmp/data', 'Data Directory')
flags.DEFINE_integer('batch_size', 64, 'Batch size.')
wandb.config.update(flags.FLAGS)




wandb: Wandb version 0.8.31 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


# ------------------------------------------------------------------------------------------------

# Create a game environment and test

The game has three possible actions

1. Move left
2. Move right
3. Shoot
4. Move forward
5. Move backward
6. Turn left
7. Turn right

The agent is rewarded points for each action and state

1. Death penalty = -100
2. Getting closer to vest = +dX
3. Getting farther from vest = -dX

The reward system will force the agent to reaach the vest as soon as possible.
The reward system is preloaded in the game

In [3]:
def create_environment():
    game = DoomGame()
    game.load_config("deadly_corridor.cfg")
    
    game.set_doom_scenario_path("deadly_corridor.wad")
    
    game.init()
    
    possible_actions = np.identity(7, dtype = int).tolist()
    
    return game, possible_actions

def test_environment():
    game = DoomGame()
    game.load_config("deadly_corridor.cfg")
    game.set_doom_scenario_path("deadly_corridor.wad")
    game.init()
    
    actions = np.identity(7, dtype = int).tolist()
    
    episodes = 10
    
    for i in range(episodes):
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state() 
            img = state.screen_buffer # Returns the frame from a game (RGB Frame)
            misc = state.game_variables
            action = random.choice(actions)
            print(action)
            reward = game.make_action(action)
            print("reward: ",reward)
            time.sleep(0.02)
            
        print("Result: ",game.get_total_reward())
        time.sleep(2)
    
    game.close()

In [4]:
# Uncomment the line below to make a test environment and check if game works fine

# test_environment()

In [5]:
# Preprocess the game frame

def preprocess_frame(frame):
    frame = np.moveaxis(frame, 0, -1) # Re-order to have frame in (Height, Width, Channels) order
    frame = rgb2gray(frame)     # Color does not add any additional information so its computationally efficient to
                                  # convert to grayscale
#     plt.imshow(frame, cmap = 'gray')
    cropped_frame = frame[15: -5, 20: -20]  # Cropping unecessary area from the frame
    
    normalized_frame = cropped_frame/ 255.
    
    preprocessed_frame = transform.resize(normalized_frame, [100, 120])
    
    return preprocessed_frame

In [6]:
game, possible_actions = create_environment()

In [7]:
stack_size = 4   # Stacking 4 frames to give a picture of motion to the model

stacked_frames = deque([np.zeros((84, 84), dtype = np.int) for i in range(stack_size)], maxlen = stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess_frame(state)
    
    # for first frame, stack it four times
    if is_new_episode:
        stacked_frames = deque([np.zeros((84, 84), dtype = np.int) for i in range(stack_size)], maxlen = stack_size)

        for i in range(stack_size):
            stacked_frames.append(frame)
            
        stacked_state = np.stack(stacked_frames, axis = 2)
        
    # if not the first frame, enqueue the latest frame and dequeue the oldest frame
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis = 2)
        
    return stacked_state, stacked_frames

In [8]:
state_size = [100, 120, 4]  # Size of input to the model

action_size = game.get_available_buttons_size()

learning_rate = 0.00025

total_episodes = 2000  # Total number of training games
max_steps = 5000  # Maximum steps to be taken in a game
batch_size = 128  # Batch size input to the model

max_tau = 1000 # Step number at which we need to update our target network

epsilon_start = 1.  # max exploration rate
epsilon_end = 0.01  # min exploration rate
decay_rate = 0.0005  # decay rate per game

gamma = 0.95   #discount factor

pretrain_length = 1000  # Initial memory size
memory_size = 100000  # Maximum memory size

training = False  # Boolen value to train or not
render = True  # Boolean value to see the agent train

# Dueling Double Deep-Q-learning Neural Net

In [9]:
class DDDQNNet:
    def __init__(self, state_size, action_size, learning_rate, name):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name
        
        with tf.variable_scope(self.name):
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size],
                                         name = 'inputs')
            self.actions_ = tf.placeholder(tf.float32, [None, action_size],
                                         name = 'actions')
            
            self.target_Q = tf.placeholder(tf.float32, [None],
                                          name = 'target')
            
            
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8,8],
                                         strides = [4,4],
                                         padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = "conv1")
            
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                 filters = 64,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv2")

            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")
            
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                 filters = 128,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv3")

            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            
            self.value_fc = tf.layers.dense(inputs = self.flatten,
                                  units = 512,
                                  activation = tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="value_fc")
            
            self.value = tf.layers.dense(inputs = self.value_fc,
                                        units = 1,
                                        activation = None,
                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="value")
            
            self.advantage_fc = tf.layers.dense(inputs = self.flatten,
                                  units = 512,
                                  activation = tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="advantage_fc")
            
            self.advantage = tf.layers.dense(inputs = self.advantage_fc,
                                        units = self.action_size,
                                        activation = None,
                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="advantages")
            
            self.output = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
            
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
                        
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_Q, self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [10]:
tf.reset_default_graph()

DQNetwork = DDDQNNet(state_size, action_size, learning_rate, name="DQNetwork")

TargetNetwork = DDDQNNet(state_size, action_size, learning_rate, name="TargetNetwork")

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.






# Experience Replay

In [11]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        
        index = np.random.choice(np.arange(buffer_size), size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

In [12]:
memory = Memory(memory_size)

game.new_episode()

for i in tqdm(range(128)):
    if i == 0:
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if done:
        next_state = np.zeros(state.shape)
                
        experience = state, action, reward, next_state, done
        memory.add(experience)
        
        game.new_episode()
        
        state = game.get_state().screen_buffer
        
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        experience = state, action, reward, next_state, done
        memory.add(experience)
        
        state = next_state

100%|███████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 147.01it/s]


# Tensorboard config

In [13]:
writer = tf.summary.FileWriter("/tensorboard/dddqn/1")

tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()

In [14]:
def predict_action(epsilon_start, epsilon_end, decay_rate, decay_step, state, actions):
    exp_exp_tradeoff = np.random.rand()
    explore_probability = epsilon_end + (epsilon_start - epsilon_end)*np.exp(-decay_rate*decay_step)
    
    if explore_probability > exp_exp_tradeoff:
        action = random.choice(possible_actions)
    
    else:
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        choice = np.argmax(Qs)
        action = possible_actions[choice]
        
    return action, explore_probability

In [15]:
def update_target_graph():
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
    
    op_holder = []
    
    for from_var, to_var in zip(from_vars, to_vars):
        op_holder.append(to_var.assign(from_var))
        
    return op_holder

# Training

In [16]:
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        #Training from scratch: comment the line below and uncomment the line next to it
        saver.restore(sess, "./wandb/run-20200328_052741-36ojprom/model.ckpt")

        
#         sess.run(tf.global_variables_initializer())
        
        decay_step = 0
        
        tau = 0
        
        game.init()
        
        update_target = update_target_graph()
        sess.run(update_target)
        
        for episode in range(1, total_episodes+1):
            step = 0
            
            episode_rewards = []
            
            game.new_episode()
            state = game.get_state().screen_buffer
            
            state, stacked_frames = stack_frames(stacked_frames, state,
                                                True)
            
            while step < max_steps:
                step += 1
                
                decay_step += 1  # decaying exploration for each step in the game
                
                action, explore_probability = predict_action(epsilon_start,
                                                            epsilon_end,
                                                            decay_rate,
                                                            decay_step,
                                                            state,
                                                            possible_actions)
                
                reward = game.make_action(action)
                
                done = game.is_episode_finished()
                episode_rewards.append(reward)
                
                if done:
                    next_state = np.zeros((84, 84), dtype = np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                             next_state,
                                                             False)
                    step = max_steps  # ending the current episode
                    
                    total_rewards = np.sum(episode_rewards)
                    
                    print("Episode: {}".format(episode),
                         "Total reward: {}".format(total_rewards),
                         "Training loss: {:.4f}".format(loss),
                         "Explore Prob: {:.4f}".format(explore_probability))
                    
                    wandb.log({'rewards': total_rewards, 'loss':loss, 'episode': episode})

                    
                    memory.add((state, action, reward, next_state, done))
                    
                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                             next_state,
                                                             False)
                    
                    memory.add((state, action, reward, next_state, done))
                    
                    state = next_state

                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin = 3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin = 3)
                dones_mb = np.array([each[4] for each in batch])
    
                target_Qs_batch = []
                
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                Q_target_next_state = sess.run(TargetNetwork.output, feed_dict = {TargetNetwork.inputs_: next_states_mb})
                
                for i in range(batch_size):
                    terminal = dones_mb[i]
                    
                    action = np.argmax(Qs_next_state[i])
                    
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + gamma*Q_target_next_state[i][action]
                        target_Qs_batch.append(target)
                        
                targets_mb = np.array([each for each in target_Qs_batch])
                
                _, loss, absolute_errors = sess.run([DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
                                  feed_dict = {DQNetwork.inputs_: states_mb,
                                              DQNetwork.actions_: actions_mb,
                                              DQNetwork.target_Q: targets_mb})
#                                               DQNetwork.ISWeights_: ISWeights_mb})
                
                
#                 memory.batch_update(tree_idx, absolute_errors)
                
                summary = sess.run(write_op, feed_dict = {DQNetwork.inputs_: states_mb,
                                                         DQNetwork.target_Q: targets_mb,
                                                         DQNetwork.actions_: actions_mb})
#                                                         DQNetwork.ISWeights_: ISWeights_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
                if tau > max_tau:
                    update_target = update_target_graph()
                    sess.run(update_target)
                    tau = 0
                    print("Target Network Update")
                
            if episode%2==0:
                saver.save(sess, os.path.join(wandb.run.dir, "model.ckpt"))
                print("Model Saved")
                

                

# Testing - Agent plays the game

In [19]:
test_episodes = 100
average_score = 0

with tf.Session() as sess:
    game, possible_actions = create_environment()
    totalScore = 0
    
    saver.restore(sess, "./wandb/run-20200328_112139-1ut21yln/model.ckpt")
    game.init()
    
    
    for i in range(test_episodes):
        game.new_episode()
        
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames,
                                            state, False)
        
        while not game.is_episode_finished():
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
            
            choice = np.argmax(Qs)
            action = possible_actions[choice]
            
            game.make_action(action)
            done = game.is_episode_finished()
            score = game.get_total_reward()
                        
            if done:
                print("Game Score: ", score)
                average_score += score
                break
            
            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames,
                                                     next_state, False)
            
            state = next_state
            
            score = game.get_total_reward()
            
    print("Average Score: ", average_score/test_episodes)
    game.close()

INFO:tensorflow:Restoring parameters from ./wandb/run-20200328_112139-1ut21yln/model.ckpt
Game Score:  16.651336669921875
Game Score:  41.46549987792969
Game Score:  100.62275695800781
Game Score:  100.62275695800781
Game Score:  26.246994018554688
Game Score:  100.62275695800781
Game Score:  7.335357666015625
Game Score:  122.01123046875
Game Score:  65.36900329589844
Game Score:  30.322723388671875
Game Score:  100.62275695800781
Game Score:  100.62275695800781
Game Score:  150.4249725341797
Game Score:  100.62275695800781
Game Score:  100.24705505371094
Game Score:  100.62275695800781
Game Score:  46.72770690917969
Game Score:  168.09585571289062
Game Score:  38.225799560546875
Game Score:  -1.5209808349609375
Game Score:  -25.687347412109375
Game Score:  -1.728729248046875
Game Score:  4.4536590576171875
Game Score:  67.384033203125
Game Score:  100.62275695800781
Game Score:  45.04173278808594
Game Score:  100.62275695800781
Game Score:  29.164718627929688
Game Score:  195.0583648