# Reinforcement Learning: Policy Gradients

The alternative to the value-function-based approach (to which TD, DP and Monte Carlo methods belong) is policy search. The aim is to directly adjust the parameter vector $\theta$ of the parametrized policy $\pi_{\theta}$.

In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

The environment chosen for this exercise is the Cart-Pole environment. It has an observation space of dimension 4, that is $(x, \dot{x}, \theta, \dot{\theta})$. The action space is discrete, meaning that the cart can be pushed left or right.

In [3]:
# Create the Gym 'CartPole-v0' environment. The training and evaluation cells
# below reset, step and render this same `env` object.
env = gym.make('CartPole-v0')

In [4]:
gamma = 0.99  # Discount factor applied to future rewards


# Helper function to discount a full episode's rewards
def discountRewards(r, discount=None):
    """Compute the discounted reward-to-go for every step of one episode.

    Element ``t`` of the result equals
    ``r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...``.

    Args:
        r: 1-D sequence of per-step rewards (list or NumPy array holding numbers).
        discount: Discount factor. When ``None`` the module-level ``gamma`` is used.

    Returns:
        A float64 NumPy array with one discounted value per input reward.
    """
    if discount is None:
        discount = gamma
    # Work in float64: `np.zeros_like(r)` on an integer reward array would
    # silently truncate every discounted value back to an integer.
    rewards = np.asarray(r, dtype=np.float64)
    discountedR = np.zeros_like(rewards)
    runningAdd = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(rewards.size)):
        runningAdd = runningAdd * discount + rewards[t]
        discountedR[t] = runningAdd
    return discountedR

The class `agent` defines a neural network with 1 hidden layer (size h_size). It is a fully connected network with 4-dimensional continuous input state ($x, \dot{x}, \theta, \dot{\theta}$) and 1-dimensional discrete action space (force on the Cart), assuming values 0 or 1 (bang-bang controller). 

The loss function, as described in the slides, is the pseudo-loss function $$J(\theta) = -\frac{1}{|D|}\sum_{\tau\in D}\sum_{t=0}^{T}\log\pi_{\theta}(a_t \mid s_t)\,R(\tau),$$
where $D$ is the set of collected trajectories with the same policy, and $R(\tau)$ is the discounted return for the episode.


In [5]:
class agent():
    """Softmax policy network with one hidden layer, plus its policy-gradient training ops."""

    def __init__(self, lr, s_size, a_size, h_size):
        """Build the TensorFlow graph for the policy and its update.

        Args:
            lr: learning rate handed to the Adam optimizer.
            s_size: number of features in one state row.
            a_size: number of discrete actions (width of the softmax output).
            h_size: number of units in the hidden layer.
        """
        # Forward pass: every row of `state_in` is one state, every row of
        # `output` is the corresponding distribution over actions.
        self.state_in = tf.placeholder(tf.float32, shape=[None, s_size])
        hidden_layer = slim.fully_connected(
            self.state_in, h_size,
            activation_fn=tf.nn.relu, biases_initializer=None)
        self.output = slim.fully_connected(
            hidden_layer, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        # Greedy choice: index of the most probable action in each row.
        self.chosen_action = tf.argmax(self.output, 1)

        # Per-step return and the action actually taken at that step.
        self.reward_holder = tf.placeholder(tf.float32, shape=[None])
        self.action_holder = tf.placeholder(tf.int32, shape=[None])

        # Backprop only flows through the probability of the action that was
        # taken. The output matrix is flattened and addressed with
        # row * n_actions + action. Worked example:
        #   output              = [[0.1, 0.9], [0.7, 0.3]]
        #   action_holder       = [1, 0]
        #   indexes             = [0, 1] * 2 + [1, 0] = [1, 2]
        #   responsible_outputs = [0.9, 0.7]
        batch_size = tf.shape(self.output)[0]
        row_starts = tf.range(0, batch_size) * tf.shape(self.output)[1]
        self.indexes = row_starts + self.action_holder
        flat_probs = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_probs, self.indexes)
        log_probs = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_probs * self.reward_holder)

        # One feedable placeholder per trainable variable so gradients can be
        # accumulated outside the graph and applied later in a single step.
        self.tvars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(idx) + '_holder')
            for idx in range(len(self.tvars))
        ]

        self.gradients = tf.gradients(self.loss, self.tvars)

        adam = tf.train.AdamOptimizer(learning_rate=lr)
        grads_and_vars = zip(self.gradient_holders, self.tvars)
        self.update_batch = adam.apply_gradients(grads_and_vars)

In [6]:
# Train the policy network with episode-wise policy gradients.
tf.reset_default_graph()

myAgent = agent(lr=1e-2, s_size=4, a_size=2, h_size=10)

total_episodes = 2500   # number of training episodes
max_ep = 999            # upper bound on steps within one episode
update_frequency = 5    # apply the accumulated gradients every this many episodes

init = tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    total_reward = []
    total_length = []

    # Gradient accumulator: one zero array shaped like each trainable variable.
    gradBuffer = [np.zeros_like(var) for var in sess.run(tf.trainable_variables())]

    for i in range(total_episodes):
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):

            # Perturb the observation with Gaussian noise (std 0.1 per component)
            # and query the action distribution for the noisy state.
            noise = 0.1 * np.random.randn(len(s))
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.state_in: [s + noise]})
            # Sample the action *index* directly. Sampling a probability value and
            # locating it with argmax always yields the first index whenever two
            # actions share the same probability.
            a = np.random.choice(a_dist.shape[1], p=a_dist[0])

            # Interact with env and store data
            s1, r, done, info = env.step(a)
            if i % 100 == 0:
                env.render()

            ep_history.append((s, a, r, s1))
            s = s1
            running_reward += r

            # Compute gradients once the episode is over, using the whole episode.
            if done:
                # Unpack column-wise instead of building a ragged object array,
                # which newer NumPy versions refuse to create implicitly.
                states, actions, rewards, _ = zip(*ep_history)
                feed_dict = {
                    myAgent.reward_holder: discountRewards(np.array(rewards, dtype=np.float64)),
                    myAgent.action_holder: np.array(actions, dtype=np.int32),
                    myAgent.state_in: np.vstack(states),
                }
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                # Apply the accumulated gradients, then clear the accumulator.
                if i % update_frequency == 0 and i != 0:
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = np.zeros_like(grad)

                total_reward.append(running_reward)
                # `j` is the zero-based index of the last step, so the length is j + 1.
                total_length.append(j + 1)
                break

        # Progress report: mean reward over the most recent (up to) 100 episodes.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))

    save_path = saver.save(sess, "/tmp/model.ckpt")
    print("Model saved in file: %s" % save_path)

19.0
34.07
42.83
57.72
76.55
106.18
144.52
164.64
161.49
147.94
158.51
145.35
131.15
107.99
148.41
166.72
132.01
136.11
139.99
153.6
138.56
112.91
136.35
134.73
136.24
Model saved in file: /tmp/model.ckpt


In [7]:
# Evaluation and rendering
prop_control = False   # True: drive the cart with the hand-written proportional rule instead of the network
total_episodes = 10
max_ep = 999

with tf.Session() as sess:
    sess.run(init)
    # Load the checkpoint written by the training cell into this session.
    saver.restore(sess, "/tmp/model.ckpt")
    total_reward = []
    total_length = []

    for i in range(total_episodes):
        s = env.reset()
        running_reward = 0
        for j in range(max_ep):
            if prop_control:
                # Proportional control: sign of a weighted sum of the state,
                # with -1 and 0 both mapped to action 0.
                a = int(np.sign(np.dot(s, np.array([1, 1, 10, 10]))))
                a = max(0, a)
            else:
                # NN control: greedy action for a noise-perturbed observation.
                noise = 0.1 * np.random.randn(len(s))
                a = sess.run(myAgent.chosen_action, feed_dict={myAgent.state_in: [s + noise]})[0]

            # Interact with env
            s1, r, d, info = env.step(a)
            env.render()

            s = s1
            running_reward += r

            # Record the episode statistics once the episode is over.
            if d:
                total_reward.append(running_reward)
                # `j` is the zero-based index of the last step, so the length is j + 1.
                total_length.append(j + 1)
                break

        print("Episode Reward: " + str(running_reward))
    print("Average Reward: " + str(np.mean(total_reward[-total_episodes:])))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 200.0
Episode Reward: 196.0
Average Reward: 199.6


In [8]:
# Remember to close Gym rendering window!
# The training and evaluation cells call `env.render()`; run this cell last,
# once no further stepping or rendering of `env` is needed.
env.close()