# Exercise 6: Vanilla Policy Gradient (VPG)

In [None]:
import numpy as np
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.python.framework import ops

from unityenv import UnityEnvironment

### Hyperparameters

In [None]:
# Discount rate applied to future rewards when computing returns.
y = 0.99
# Total number of episodes to train the agent on.
total_episodes = 20000
# Number of episodes whose gradients are accumulated before each model update.
update_frequency = 10
# Learning rate handed to the agent's optimizer.
learning_rate = 1e-3
# Number of units in the network's hidden layer.
hidden_units = 128
# Directory where summary statistics are written.
summary_path = './summaries/vpg'
# Directory where model checkpoints are written.
model_path = './models/vpg'
# Whether to restore a previously saved model before running.
load_model = False
# Whether to train the model (False runs the policy without updates).
train_model = True

### Load the Unity environment

In [None]:
# Launch the "2DBall" Unity build with an empty configuration and print the
# environment's string representation.
env_config = {}
env = UnityEnvironment(
    file_name="2DBall",
    train_model=train_model,
    worker_num=5,
    config=env_config,
)
print(env)

### Examine state space

In [None]:
# Reset the environment and print the state it returns; the first element of
# the returned pair is discarded here.
_, state = env.reset()
print(state)

State (s) is a vector whose values correspond to:
* Platform rotation
* Ball X position
* Ball Y position
* Ball X velocity
* Ball Y velocity

### The Vanilla Policy Gradient Agent

In [None]:
def discount_rewards(r, gamma):
    """Compute the discounted return at every step of a reward sequence.

    Adapted from karpathy.github.io/2016/05/31/rl/.

    Args:
        r: 1D sequence of per-step rewards (NumPy array or any array-like).
        gamma: Discount factor applied to future rewards.

    Returns:
        1D NumPy float array of the same length as ``r`` where element ``t``
        is ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``. A floating
        input dtype is preserved; any other dtype is converted to float64.
    """
    rewards = np.asarray(r)
    # Integer (or object) inputs must be promoted to float first: otherwise
    # zeros_like would inherit the integer dtype and silently truncate the
    # fractional discounted values on assignment.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step's return reuses the one that follows it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
class VPGAgent():
    """Softmax policy network plus the ops needed to train it by policy gradient.

    Constructing an instance adds to the default TensorFlow graph: a
    one-hidden-layer policy, a loss over chosen actions weighted by fed-in
    rewards, the gradients of that loss, and an Adam step that applies
    gradients supplied through placeholders.
    """

    def __init__(self, lr, s_size, a_size, h_size):
        """Build the policy network and its training ops.

        Args:
            lr: Learning rate for the Adam optimizer.
            s_size: Width of the state vector fed to ``state_in``.
            a_size: Number of discrete actions (width of the policy output).
            h_size: Number of units in the hidden layer.
        """
        # ---- Forward pass: state -> hidden (ELU) -> softmax over actions ----
        self.state_in = tf.placeholder(dtype=tf.float32, shape=[None, s_size])
        # Fed by callers but not consumed by any op built in this class.
        self.batch_size = tf.placeholder(dtype=tf.int32, shape=None)
        hidden = slim.fully_connected(
            self.state_in, h_size,
            activation_fn=tf.nn.elu, biases_initializer=None)
        self.out = slim.fully_connected(
            hidden, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        # Blend the softmax with a uniform distribution (0.9 / 0.1 split) so
        # every action keeps probability of at least 0.1 / a_size.
        self.output = self.out * (0.9) + 0.1/a_size
        self.chosen_action = tf.argmax(self.output, 1)

        # ---- Training inputs: per-step reward signal and the actions taken ----
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])
        self.actions = slim.one_hot_encoding(self.action_holder, a_size)

        # Probability the policy assigned to each action that was actually taken.
        self.responsible_outputs = tf.reduce_sum(self.output * self.actions, axis=1)
        # The fed-in reward is used directly as the advantage (no baseline).
        self.advantage = self.reward_holder

        log_prob = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_prob * self.advantage)

        # ---- Gradient plumbing ----
        # One placeholder per trainable variable, so gradients can be
        # accumulated outside the graph and applied later in a single step.
        trainable = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name='{}_holder'.format(position))
            for position, _ in enumerate(trainable)
        ]

        self.gradients = tf.gradients(self.loss, trainable)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(
            zip(self.gradient_holders, trainable))

### Training the Agent

In [None]:
# Training script: builds the agent, optionally restores a checkpoint, then
# runs episodes against the environment. Per-episode gradients are accumulated
# in gradBuffer and applied every `update_frequency` episodes; summaries are
# written every 50 episodes and checkpoints every 1000.

# Clear the TensorFlow graph so re-running this cell starts from scratch.
tf.reset_default_graph()

# Load the agent.
myAgent = VPGAgent(lr=learning_rate, s_size=env.state_space_size,
                   a_size=env.action_space_size, h_size=hidden_units)

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

if not os.path.exists(model_path):
    os.makedirs(model_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        # get_checkpoint_state returns None when no checkpoint is present;
        # fail with a clear message instead of an AttributeError on None.
        if ckpt is None:
            raise IOError("No checkpoint found in '{}'".format(model_path))
        saver.restore(sess, ckpt.model_checkpoint_path)
    summary_writer = tf.summary.FileWriter(summary_path)
    total_reward = []
    total_length = []
    # Stays None until the first training step, so the summary code below
    # does not hit an undefined name when train_model is False.
    p_loss = None

    # One zeroed accumulator per trainable variable, matching shape and dtype.
    gradBuffer = [np.zeros_like(value)
                  for value in sess.run(tf.trainable_variables())]

    i = 0
    while i < total_episodes:
        _, state = env.reset()
        running_reward = 0
        j = 0
        done = False
        ep_history = []
        while not done:
            j += 1
            # Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output,
                              feed_dict={myAgent.state_in: [state],
                                         myAgent.batch_size: 1})
            # Sample the action index directly. Sampling a probability value
            # and looking it up with argmax would always resolve ties to the
            # first action sharing that probability.
            action = np.random.choice(np.arange(a_dist.shape[1]), p=a_dist[0])
            _, state_1, reward, done = env.step(action, 0)
            ep_history.append([state, action, reward, state_1])
            state = state_1
            running_reward += reward

        # Record statistics once per finished episode (not once per step), so
        # the logged mean reward averages full-episode returns.
        total_reward.append(running_reward)
        total_length.append(j)

        if train_model:
            # Update the network. Columns are pulled out of the history list
            # individually to avoid building a ragged NumPy object array.
            ep_states = np.vstack([step[0] for step in ep_history])
            ep_actions = [step[1] for step in ep_history]
            ep_rewards = np.asarray([step[2] for step in ep_history],
                                    dtype=np.float64)
            train_feed = {myAgent.reward_holder: discount_rewards(ep_rewards, y),
                          myAgent.action_holder: ep_actions,
                          myAgent.state_in: ep_states,
                          myAgent.batch_size: len(ep_history)}
            p_loss, grads = sess.run([myAgent.loss, myAgent.gradients],
                                     feed_dict=train_feed)
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                # Apply the accumulated gradients, then reset the accumulators.
                apply_feed = dict(zip(myAgent.gradient_holders, gradBuffer))
                sess.run(myAgent.update_batch, feed_dict=apply_feed)
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = np.zeros_like(grad)

        # Write summary statistics to tensorboard.
        if i % 50 == 0 and i != 0:
            mean_reward = float(np.mean(total_reward[-50:]))
            summary = tf.Summary()
            summary.value.add(tag='Info/Reward', simple_value=mean_reward)
            if p_loss is not None:
                summary.value.add(tag='Info/Policy Loss', simple_value=float(p_loss))
            summary_writer.add_summary(summary, i)
            summary_writer.flush()
            print ("Mean Reward: {}".format(mean_reward))
        # Save agent's model
        if i % 1000 == 0 and i != 0:
            saver.save(sess, model_path+'/model-'+str(i)+'.cptk')
            print("Saved Model")
        i += 1
    summary_writer.close()
env.close()

In [None]:
# Close the environment. NOTE(review): the training cell above already calls
# env.close() when it finishes; presumably this cell is for manual cleanup
# after an interrupted run — confirm.
env.close()