In [None]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy.signal
%matplotlib inline
#from helper import *
import os
import pdb
from random import choice
from time import sleep
from time import time

In [None]:
def update_target_graph(from_scope,to_scope):
    """Build the ops that copy trainable variables from one scope to another.

    The trainable variables collected under ``from_scope`` and ``to_scope``
    are paired positionally, and each pair yields one op that assigns the
    source variable to the destination variable.

    Returns the list of assign ops; nothing is executed here, the caller has
    to run the ops in a session.
    """
    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(trainable_key, from_scope)
    dest_vars = tf.get_collection(trainable_key, to_scope)

    # zip stops at the shorter collection, so unmatched trailing variables
    # simply get no assign op.
    return [dest.assign(src) for src, dest in zip(source_vars, dest_vars)]

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    """Return the discounted cumulative sums of ``x`` along axis 0.

    Element ``i`` of the result is ``x[i] + gamma*x[i+1] + gamma**2*x[i+2] + ...``.
    The sequence is reversed, passed through the recursive filter
    ``y[n] = x[n] + gamma*y[n-1]`` and reversed back.
    """
    numerator = [1]
    denominator = [1, -gamma]
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards, axis=0)
    return accumulated[::-1]


In [None]:
class AC_Network():
    """Actor-critic network: one shared hidden layer feeding an actor head and a critic head.

    Parameters
    ----------
    layers : object with the attributes ``dim_s`` (input width), ``dim_a``
        (number of actions) and ``shared`` / ``actor`` / ``critic`` (indexable;
        only element 0 of each is used, as the hidden-layer width).
    scope : str
        Variable scope the graph is built in. The scope named ``'global'``
        only gets the forward pass; every other scope additionally gets the
        loss, the gradients and the op that applies them to the ``'global'``
        variables.
    trainer : optimizer exposing ``apply_gradients``; only used when
        ``scope != 'global'`` (the notebook passes ``None`` for the global net).

    Attributes
    ----------
    inputs, policy, value : always created.
    actions, actions_onehot, target_v, advantages, responsible_outputs,
    value_loss, entropy, policy_loss, loss, gradients, var_norms, grad_norms,
    apply_grads : created only for non-global scopes.
    """

    @staticmethod
    def _dense(inputs, n_in, n_out):
        """Create a weight matrix and a bias (in that order) and return ``inputs @ w + b``."""
        # NOTE(review): truncated_normal is used with its default stddev, as in
        # the original code; a smaller stddev may train more stably - confirm.
        weights = tf.Variable(tf.truncated_normal([n_in, n_out]))
        bias = tf.Variable(tf.zeros([n_out]))
        return tf.matmul(inputs, weights) + bias

    def __init__(self, layers, scope, trainer):
        with tf.variable_scope(scope):

            self.inputs = tf.placeholder(shape=[None, layers.dim_s], dtype=tf.float32)

            # Variables are created in the order shared, actor, critic, policy,
            # value. update_target_graph pairs variables by position, so this
            # order has to be identical for every network instance.

            # shared layers
            shared_out_1 = tf.nn.relu(self._dense(self.inputs, layers.dim_s, layers.shared[0]))

            # actor hidden layer
            actor_out_1 = tf.nn.relu(self._dense(shared_out_1, layers.shared[0], layers.actor[0]))

            # critic hidden layer
            critic_out_1 = tf.nn.relu(self._dense(shared_out_1, layers.shared[0], layers.critic[0]))

            # Policy
            policy_inputs = self._dense(actor_out_1, layers.actor[0], layers.dim_a)
            self.policy = tf.nn.softmax(policy_inputs)

            # Value estimation (linear output, shape [batch, 1])
            self.value = self._dense(critic_out_1, layers.critic[0], 1)

            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions,layers.dim_a,dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                # Probability the policy assigned to the action actually taken.
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                # Log-probabilities come straight from the logits. Taking
                # tf.log of the softmax output yields -inf (and NaN after the
                # multiplication by 0) as soon as one probability underflows.
                log_policy = tf.nn.log_softmax(policy_inputs)
                responsible_log_probs = tf.reduce_sum(log_policy * self.actions_onehot, [1])

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * log_policy)
                self.policy_loss = -tf.reduce_sum(responsible_log_probs*self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)

                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [None]:
class Worker():
    """One actor-learner: plays episodes in its own environment with a local
    copy of the network and pushes gradients to the ``'global'`` network.

    Parameters
    ----------
    game : environment object. ``reset()`` must return an observation and
        ``step(action)`` a 4-tuple ``(observation, reward, terminal, info)``.
    name : worker index; the variable scope becomes ``"worker_<name>"``.
    layers, trainer : forwarded to ``AC_Network``.
    model_path : directory checkpoints are written to.
    global_episodes : TensorFlow variable counting episodes; only the worker
        named ``worker_0`` increments it.
    """

    # Number of buffered transitions that triggers a mid-episode update.
    ROLLOUT_LENGTH = 30

    def __init__(self,game,name,layers,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_AC = AC_Network(layers,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)

        # Not read anywhere in this class; kept so the attribute stays available.
        self.sleep_time = 0.028
        # The environment is used as passed in; no further set-up happens here.
        self.env = game

    @staticmethod
    def _sample_action(action_probs):
        """Draw an action index according to ``action_probs`` (1-D, sums to 1).

        Sampling the index directly stays correct when several actions share
        the same probability; drawing a probability *value* and mapping it
        back with argmax would always resolve such ties to the lowest index.
        """
        return int(np.random.choice(len(action_probs), p=action_probs))

    def train(self,global_AC,rollout,sess,gamma,bootstrap_value):
        """Run one gradient update from a rollout.

        ``rollout`` is a non-empty list of
        ``[state, action, reward, next_state, terminal, value]`` rows and
        ``bootstrap_value`` the value assumed after the last row.
        ``global_AC`` is accepted but not used.

        Returns ``(value_loss, policy_loss, entropy)`` each divided by the
        rollout length, followed by the gradient norm and the variable norm.
        """
        # Pull the columns out row by row. Building one ndarray from the
        # rollout would mix arrays and scalars in a single ragged array.
        observations, actions, rewards, _next_obs, _terminals, values = zip(*rollout)
        actions = np.asarray(actions, dtype=np.int32)
        rewards = np.asarray(rewards, dtype=np.float64)
        values = np.asarray(values, dtype=np.float64)

        # Discounted returns are the value targets; the advantage is the
        # one-step TD error r + gamma * V(s') - V(s). It is deliberately not
        # discounted further.
        self.rewards_plus = np.append(rewards, bootstrap_value)
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.append(values, bootstrap_value)
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.actions:actions,
            self.local_AC.advantages:advantages}
        v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n

    def work(self,max_episode_length,gamma,global_AC,sess,coord,saver):
        """Play episodes and train until ``coord`` requests a stop.

        NOTE(review): ``max_episode_length`` does not end an episode here; it
        only suppresses the mid-episode update on step
        ``max_episode_length - 1``. Episodes end when the environment reports
        ``terminal``.
        """
        episode_count = sess.run(self.global_episodes)
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                # Start every episode from the current global parameters.
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_step_count = 0
                terminal = False

                s = self.env.reset()

                while not terminal:
                    #Take an action using probabilities from policy network output.
                    a_dist,v = sess.run([self.local_AC.policy,self.local_AC.value],
                        feed_dict={self.local_AC.inputs:[s]})
                    a = self._sample_action(a_dist[0])

                    s2, r, terminal, _ = self.env.step(a)
                    if terminal:
                        s2 = s

                    episode_buffer.append([s,a,r,s2,terminal,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s2
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout. A terminal step is
                    # left to the end-of-episode update, which bootstraps from 0. ">=" makes
                    # sure a skipped update is retried on the next step.
                    buffer_full = len(episode_buffer) >= self.ROLLOUT_LENGTH
                    if buffer_full and not terminal and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value,
                            feed_dict={self.local_AC.inputs:[s]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(global_AC,episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(global_AC,episode_buffer,sess,gamma,0.0)

                # Periodically save model parameters and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print("Saved Model")
                        print("episode: %d --- reward = %f" %(episode_count, np.mean(self.episode_rewards[-50:])))

                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

In [None]:
import gym
# Environment id handed to gym.make. Another discrete-action task such as
# 'CartPole-v0' can be tried by changing this one constant.
ENV_NAME = 'LunarLander-v2'
game = gym.make(ENV_NAME)

# Width of the observation vector and number of discrete actions; the network
# configuration is derived from these two values.
state_dim = game.observation_space.shape[0]
action_dim = game.action_space.n

In [None]:
class input_struct:
    """Plain attribute container for the network layout."""
    def __init__(self):
        pass

# Instantiate the container: binding the class itself would store the settings
# as class attributes shared by every later use of input_struct.
layers = input_struct()
layers.dim_s = state_dim          # observation width
layers.dim_a = action_dim         # number of discrete actions
layers.shared = np.array([64])    # width of the shared hidden layer
layers.actor = np.array([32])     # width of the actor hidden layer
layers.critic = np.array([32])    # width of the critic hidden layer
LEARNING_RATE = 1e-4

max_episode_length = 300
gamma = .99 # discount rate for advantage estimation and reward discounting
load_model = False
model_path = './model'

In [None]:
tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)

with tf.device("/cpu:0"):

    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)

    master_network = AC_Network(layers,'global',None) # Generate global network
    num_workers = multiprocessing.cpu_count() # Set workers to number of available CPU threads

    # Every worker steps its environment from its own thread, so each one
    # needs a private environment; sharing a single instance would interleave
    # the workers' reset/step calls on the same episode.
    # NOTE(review): assumes `game` was created by gym.make so that
    # `game.spec.id` names the environment - confirm for custom envs.
    worker_envs = [game] + [gym.make(game.spec.id) for _ in range(num_workers - 1)]

    # Create worker classes
    workers = []
    for i, worker_env in enumerate(worker_envs):
        workers.append(Worker(worker_env,i,layers,trainer,model_path,global_episodes))

    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    # Start the "work" process for each worker in a separate thread.
    # The bound method and its arguments are handed to Thread directly: a
    # lambda defined in this loop would look `worker` up only when the thread
    # runs, by which time the loop may already have moved on to another worker.
    worker_threads = []
    for worker in workers:
        t = threading.Thread(
            target=worker.work,
            args=(max_episode_length,gamma,master_network,sess,coord,saver))
        t.start()
        worker_threads.append(t)
    coord.join(worker_threads)

In [None]:
# Signature reference: Worker.work(self, max_episode_length, gamma, global_AC, sess, coord, saver)