# Imports

In [1]:
from __future__ import division
from __future__ import print_function
from vizdoom import *
import itertools as it
import random
from random import sample, randint, random, choice
from time import time, sleep
import numpy as np
import skimage.color, skimage.transform
import tensorflow as tf
from tqdm import trange
from collections import deque
from tensorflow.contrib.layers import flatten, conv2d, fully_connected

# Constants

In [2]:
# --- Hyperparameters ---
learning_rate = 0.00025   # step size passed to the RMSProp optimizer
discount_factor = 0.99    # gamma used when building the Q-learning target
epochs = 3
learning_steps = 2000     # environment steps per epoch
total_steps = epochs*learning_steps
replay_memory_size = 10000
batch_size = 32

# --- State settings ---
frame_repeat = 8          # tics each chosen action is repeated for in make_action
resolution = (30, 45)     # (rows, cols) of a preprocessed frame
stacks = 2                # number of consecutive frames stacked into one state
steps = 0
state_size = [resolution[0], resolution[1],stacks]
# The builtin `int` is used instead of `np.int`: the alias was removed in
# NumPy 1.24 and always meant the builtin int anyway.
stacked_frames  =  deque([np.zeros(resolution, dtype=int) for i in range(stacks)], maxlen=stacks)

state_dict = {"frame_repeat":frame_repeat, "resolution":resolution, "stacks":stacks,"state_size":state_size}

# --- Exploration schedule ---
eps_start = 1.0
eps_end = 0.0001
observe = 0.1 * total_steps   # steps taken with epsilon held at eps_start
explore = 0.6 * total_steps   # step at which epsilon has reached eps_end
# Steps needed per unit decrease of epsilon during the linear anneal, so that
#   eps_start - (explore - observe) / decay_rate == eps_end.
# (Multiplying by (eps_start - eps_end) instead of dividing overshoots eps_end.)
decay_rate = (explore - observe) / (eps_start - eps_end)

model_savefile = "/tmp/model.ckpt"
save_model = True
load_model = False
skip_learning = False
episodes_to_watch = 10
# Configuration file path
# NOTE(review): absolute, machine-specific path - adjust to the local ViZDoom install.
config_file_path = "C:/vizdoom/vizdoom115pre/scenarios/simpler_basic.cfg"

# Utilities

In [3]:
def preprocess(img):
    """Crop, rescale and downsample one raw screen buffer.

    The frame loses 30 rows at the top, 10 rows at the bottom and 30 columns
    on each side, is divided by 255.0, and is then resized to the
    module-level ``resolution``.
    """
    cropped = img[30:-10, 30:-30]
    scaled = cropped / 255.0
    return skimage.transform.resize(scaled, resolution)

In [4]:
def stack_frames(stacked_frames, state, is_new_episode):
    """Push a frame onto the rolling frame stack and build the stacked state.

    Args:
        stacked_frames: deque holding the most recent frames. It is replaced
            by a fresh deque when ``is_new_episode`` is true, otherwise it is
            appended to in place.
        state: the newest preprocessed frame (2-D array).
        is_new_episode: when true, the stack is reset and filled with
            ``stacks`` copies (references) of ``state``.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along a new last
        axis (axis 2), and the deque to pass to the next call.
    """
    if is_new_episode:
        # A new episode has no history yet, so the first frame is repeated.
        # The deque is built directly from the frame; no zero-filled
        # placeholder (and hence no `np.int`, removed in NumPy 1.24) is needed.
        stacked_frames = deque([state] * stacks, maxlen=stacks)
    else:
        # maxlen makes the deque drop its oldest frame automatically.
        stacked_frames.append(state)

    # Frames become the last dimension: shape is frame.shape + (stacks,).
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

In [5]:
def get_epsilon(step):
    """Return the exploration rate for a given global step.

    The schedule is piecewise, driven by the module-level ``observe``,
    ``explore``, ``eps_start`` and ``eps_end``:

    * ``step < observe``            -> ``eps_start``
    * ``observe <= step < explore`` -> linear interpolation from
      ``eps_start`` down to ``eps_end``
    * ``step >= explore``           -> ``eps_end``
    """
    if step < observe:
        return eps_start
    if step < explore:
        # Interpolate directly between the two end points so the value lands
        # exactly on eps_end at step == explore for any eps_start/eps_end pair
        # and never drops below eps_end.
        progress = (step - observe) / (explore - observe)
        return eps_start - progress * (eps_start - eps_end)
    return eps_end

In [6]:
def init_vizdoom(config_file_path):
    """Create, configure and start a ViZDoom game with a hidden window.

    The scenario config is loaded first; window visibility, mode, screen
    format and screen resolution are then set explicitly before ``init()``.

    Returns:
        The initialized ``DoomGame`` instance.
    """
    print("Initializing doom...")
    doom = DoomGame()
    doom.load_config(config_file_path)

    # Settings applied after the config file, in this order.
    overrides = (
        (doom.set_window_visible, False),
        (doom.set_mode, Mode.PLAYER),
        (doom.set_screen_format, ScreenFormat.GRAY8),
        (doom.set_screen_resolution, ScreenResolution.RES_640X480),
    )
    for apply_setting, value in overrides:
        apply_setting(value)

    doom.init()
    print("Doom initialized.")
    return doom

# Brain - Deep Q Network

In [7]:
class DQN():
    """Small convolutional Q-network built on a TensorFlow 1.x graph.

    The graph is created in the ``"main"`` variable scope at construction
    time. All evaluation and training goes through the session passed to the
    constructor. The optimizer step size is read from the module-level
    ``learning_rate``.
    """

    def __init__(self, session,action_size,state_size):
        """
        Args:
            session: ``tf.Session`` used for every run call.
            action_size: number of discrete actions (width of the Q output).
            state_size: per-sample input shape as a list or tuple,
                without the batch dimension.
        """
        self.sess = session
        self.action_size = action_size
        self.state_size = state_size
        # build() returns the Q-value output tensor, so `model` is usable
        # (it used to be None because build() returned nothing).
        self.model = self.build()

    def build(self):
        """Create placeholders, layers, loss and train op.

        Returns:
            The Q-value output tensor (also available as ``self.output``).
        """
        with tf.variable_scope("main"):
            # list(...) lets state_size be a tuple as well as a list.
            self.input_state = tf.placeholder(tf.float32, [None] + list(self.state_size), name="State")
            self.target_q = tf.placeholder(tf.float32, [None, self.action_size], name="TargetQ")

            # Two conv layers: 8 filters of 6x6 with stride 3, then 8 filters of 3x3 with stride 2.
            self.layer_1 = conv2d(self.input_state, num_outputs=8, kernel_size=(6,6), stride=3,activation_fn=tf.nn.relu, padding='SAME')
            tf.summary.histogram('layer_1',self.layer_1)

            self.layer_2 = conv2d(self.layer_1, num_outputs=8, kernel_size=(3,3), stride=2,activation_fn=tf.nn.relu, padding='SAME')
            tf.summary.histogram('layer_2',self.layer_2)

            self.flat = flatten(self.layer_2)

            # 128-unit fully connected layer (library-default activation).
            self.fc = fully_connected(self.flat, num_outputs=128)
            tf.summary.histogram('fc',self.fc)

            # Linear output: one Q-value per action.
            self.output = fully_connected(self.fc, num_outputs=self.action_size, activation_fn=None)
            tf.summary.histogram('output',self.output)

            # Index of the highest Q-value for each sample in the batch.
            self.max_q = tf.argmax(self.output, 1)

            # Keyword arguments keep labels/predictions in their documented roles.
            self.loss = tf.losses.mean_squared_error(labels=self.target_q, predictions=self.output)
            self.optimizer = tf.train.RMSPropOptimizer(learning_rate)
            self.train_step = self.optimizer.minimize(self.loss)

        return self.output

    def learn(self,s1, tq):
        """Run one optimizer step on a batch and return the loss value.

        Args:
            s1: batch of states fed to the input placeholder.
            tq: batch of target Q-values, one row per state.
        """
        feed_dict = {self.input_state: s1, self.target_q: tq}
        l, _ = self.sess.run([self.loss, self.train_step], feed_dict=feed_dict)
        return l

    def get_q_values(self,state):
        """Return the Q-values for a batch of states."""
        return self.sess.run(self.output, feed_dict={self.input_state: state})

    def get_best_action(self,state):
        """Return the argmax action index for every state in a batch."""
        return self.sess.run(self.max_q, feed_dict={self.input_state: state})

    def simple_get_best_action(self,state):
        """Return the argmax action index for a single (un-batched) state."""
        # Prepend a batch dimension of 1; plain tuple concatenation avoids
        # star-unpacking inside a tuple literal.
        batched = state.reshape((1,) + tuple(state.shape))
        return self.get_best_action(batched)[0]
        
    

# Memory - Experience Replay

In [8]:
class Memory():
    """Fixed-capacity FIFO experience replay buffer.

    Once ``max_size`` experiences are stored, adding another silently drops
    the oldest one (``deque`` with ``maxlen``).
    """

    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
        # Total number of add() calls; unlike length() it is not capped.
        self._added = 0

    def add(self, experience):
        """Append one experience, evicting the oldest when the buffer is full."""
        self._added += 1
        self.buffer.append(experience)

    def length(self):
        """Number of experiences currently stored (at most ``max_size``)."""
        return len(self.buffer)

    @property
    def size(self):
        """Total number of experiences ever added (not capped by ``max_size``).

        Exposed as a read-only attribute. Previously an instance attribute
        named ``size`` shadowed a method of the same name, so ``mem.size``
        was an int and ``mem.size()`` raised ``TypeError``.
        """
        return self._added

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn uniformly at random.

        Raises:
            ValueError: if more experiences are requested than are stored.
        """
        buffer_size = len(self.buffer)
        if batch_size > buffer_size:
            raise ValueError(
                "Cannot sample %d experiences from a buffer holding %d"
                % (batch_size, buffer_size))
        index = np.random.choice(np.arange(buffer_size),
                                 size = batch_size,
                                 replace = False)

        return [self.buffer[i] for i in index]

# Environment

In [9]:
class Environment():
    """Training loop that ties together the game, the agent and the replay memory.

    Besides the objects passed to the constructor, the methods read these
    module-level names: ``actions``, ``frame_repeat``, ``observe``,
    ``batch_size``, ``discount_factor``, ``get_epsilon``, ``preprocess`` and
    ``stack_frames``.
    """

    def __init__(self, game, agent, memory,state_dict):
        """
        Args:
            game: initialized game object (new_episode, get_state,
                make_action, is_episode_finished, get_total_reward, close).
            agent: object exposing simple_get_best_action, get_q_values, learn.
            memory: replay buffer exposing add, sample and length.
            state_dict: dict with at least the "resolution" and "stacks" keys.
        """
        self.game = game
        self.agent = agent
        self.memory = memory
        self.state_dict = state_dict
        self.steps = 0  # global step counter shared across epochs
        # Builtin `int` instead of `np.int` (alias removed in NumPy 1.24).
        self.stacked_frames = deque(
            [np.zeros(state_dict["resolution"], dtype=int) for _ in range(state_dict["stacks"])],
            maxlen=state_dict["stacks"])

    def run(self, training_steps=2000, epochs=4):
        """Train for ``epochs`` epochs of ``training_steps`` steps each.

        Prints score statistics after every epoch and always closes the game
        at the end, including when training is interrupted or fails.
        """
        try:
            for e in range(epochs):
                print("\nEpoch %d\n-------" % (e + 1))
                epoch_scores = np.array(self.run_epoch(training_steps))
                if epoch_scores.size == 0:
                    # min()/max() raise on an empty array; report it instead.
                    print("Results: no episode finished during this epoch.")
                    continue
                print("Results: mean: %.1f±%.1f," % (epoch_scores.mean(), epoch_scores.std()), \
                      "min: %.1f," % epoch_scores.min(), "max: %.1f," % epoch_scores.max())
        finally:
            self.game.close()

    def run_epoch(self, training_steps):
        """Play ``training_steps`` steps, learning along the way.

        Returns:
            List with the total reward of every episode that finished
            during the epoch.
        """
        self.game.new_episode()
        frame = preprocess(self.game.get_state().screen_buffer)
        state, self.stacked_frames = stack_frames(self.stacked_frames, frame, True)
        train_scores = []

        for _ in range(training_steps):

            # Epsilon-greedy action selection.
            eps = get_epsilon(self.steps)
            if random() <= eps:
                a = randint(0, len(actions) - 1)
            else:
                a = self.agent.simple_get_best_action(state)

            reward = self.game.make_action(actions[a], frame_repeat)

            done = self.game.is_episode_finished()

            # A finished episode has no screen buffer, so a blank frame stands in.
            if done:
                next_frame = np.zeros(self.state_dict["resolution"], dtype=int)
            else:
                next_frame = preprocess(self.game.get_state().screen_buffer)

            # Stack the frame of the next state.
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_frame, False)

            # Store the transition in the injected replay memory.
            self.memory.add((state, a, reward, next_state, done))

            # s(t+1) is now the current state.
            state = next_state

            # Learn only after the observation phase and once a full
            # minibatch can be drawn without replacement.
            if self.steps > observe and self.memory.length() >= batch_size:
                self.learn_from_memory()
            self.steps +=1

            if done:
                train_scores.append(self.game.get_total_reward())
                self.game.new_episode()
                frame = preprocess(self.game.get_state().screen_buffer)
                state, self.stacked_frames = stack_frames(self.stacked_frames, frame, True)

        return train_scores

    def learn_from_memory(self):
        """Run one Q-learning update on a random minibatch from replay memory.

        For each sampled transition the target of the action taken is
        ``reward + discount_factor * max_a Q(next_state, a)``; the bootstrap
        term is dropped when the transition ended the episode. Targets of
        the other actions are left at the network's current predictions.
        """
        batch = self.memory.sample(batch_size)
        states_mb = np.array([each[0] for each in batch], ndmin=3)
        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch])
        next_states_mb = np.array([each[3] for each in batch], ndmin=3)
        dones_mb = np.array([each[4] for each in batch])

        max_q_values = np.max(self.agent.get_q_values(next_states_mb), axis=1)

        # Explicit float mask: 1.0 for non-terminal transitions, 0.0 for terminal ones.
        not_done = 1.0 - dones_mb.astype(np.float32)

        target_q = self.agent.get_q_values(states_mb)
        target_q[np.arange(target_q.shape[0]), actions_mb] = rewards_mb + discount_factor * not_done * max_q_values
        self.agent.learn(states_mb, target_q)
    

# Main

In [10]:
# Main cell: builds the game, the network, the replay memory and the
# environment, then trains. The names defined here (game, actions, memory,
# agent, session, ...) are notebook-level globals.

# Clear the default TensorFlow graph before building the network.
tf.reset_default_graph()
game = init_vizdoom(config_file_path)
    
# One action per on/off combination of the n available buttons (2**n actions).
n = game.get_available_buttons_size()
actions = [list(a) for a in it.product([0, 1], repeat=n)]

# Session configured with gpu_options.allow_growth enabled.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
agent = DQN(session, len(actions), state_size)

# Variables must be initialized after the DQN graph has been built.
init = tf.global_variables_initializer()
session.run(init)

memory = Memory(replay_memory_size)
env = Environment(game, agent, memory, state_dict)

# env.run() closes the game when it finishes; the session is left open.
print("Training....")
env.run(learning_steps, epochs)
    


Initializing doom...
Doom initialized.
Training....

Epoch 1
-------


  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Results: mean: -53.0±191.0, min: -395.0, max: 95.0,

Epoch 2
-------


KeyboardInterrupt: 