# Cart Pole - Policy Based Agent

In [1]:
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

#### Load the CartPole Environment

In [2]:
import gym
# Build the 'CartPole-v0' environment via gym's factory; `env` is the single
# environment handle reused by the random-action demo and the training loop.
env = gym.make('CartPole-v0')

[2016-11-15 15:43:13,029] Making new env: CartPole-v0


#### Random Actions

In [3]:
# Baseline: play 10 episodes choosing every action uniformly at random and
# report the total reward collected in each episode.
env.reset()
reward_sum = 0
random_episodes = 0
while random_episodes < 10:
    env.render()
    # np.random.randint(0, 2) yields 0 or 1 (upper bound is exclusive).
    random_action = np.random.randint(0, 2)
    observation, reward, done, _ = env.step(random_action)
    reward_sum += reward
    if not done:
        continue
    # Episode over: report its return, then start a fresh episode.
    print("Reward for this episode was: %f" % reward_sum)
    random_episodes += 1
    reward_sum = 0
    env.reset()

# Close the render window ...
env.viewer.close()

# ... and drop the stale handle, otherwise the next render() call throws an error.
env.viewer = None

Reward for this episode was: 50.000000
Reward for this episode was: 20.000000
Reward for this episode was: 22.000000
Reward for this episode was: 26.000000
Reward for this episode was: 16.000000
Reward for this episode was: 15.000000
Reward for this episode was: 17.000000
Reward for this episode was: 20.000000
Reward for this episode was: 15.000000
Reward for this episode was: 28.000000


#### Setting up Neural Network

In [3]:
# --- Hyperparameters -------------------------------------------------------
D = 4                 # input dimensionality (length of one observation vector)
H = 10                # number of neurons in the hidden layer
batch_size = 50       # number of episodes between parameter updates
learning_rate = 0.01  # optimizer step size; tune for faster or more stable training
gamma = 0.99          # discount factor applied to future rewards

In [4]:
tf.reset_default_graph()

# Policy network: maps an observation of the environment to `probability`,
# which the training loop treats as the probability of choosing action 1
# (it samples action 1 when a uniform draw falls below this value).
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))

W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())

score = tf.matmul(layer1, W2)

probability = tf.nn.sigmoid(score)

# From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()
# "Fake label" fed by the training loop: 1 when action 0 was taken, 0 when action 1 was taken.
input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")

# Log-likelihood of the action that was actually taken:
#   input_y == 1 (action 0) -> log(1 - probability)
#   input_y == 0 (action 1) -> log(probability)
# A plain log(input_y - probability) would take the log of a negative number
# whenever input_y == 0 and turn the loss and all gradients into NaN.
loglik = tf.log(input_y * (input_y - probability) +
                (1 - input_y) * (input_y + probability))

# The loss function. This sends the weights in the direction of making actions
# that gave good advantage (reward over time) more likely, and actions that didn't less likely.
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't apply gradients after every single episode in order to account for noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
W1Grad = tf.placeholder(tf.float32, name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

#### Advantage function

In [5]:
def discount_rewards(r, discount=None):
    """Compute the discounted return for every time step of one episode.

    Walks the rewards backwards so that each entry becomes
    ``r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...``.

    Args:
        r: per-step rewards, indexed by time step along the first axis
            (a 1D sequence, or a column such as the ``(T, 1)`` array the
            training loop builds with ``np.vstack``). Lists and integer
            arrays are accepted; non-float input is promoted to float64 so
            the discounted values are not truncated.
        discount: discount factor to use. When ``None`` (the default) the
            module-level ``gamma`` hyperparameter is used, which matches the
            original single-argument behaviour.

    Returns:
        A float array with the same shape as ``r`` holding the discounted
        returns.
    """
    rewards = np.asarray(r)
    if not np.issubdtype(rewards.dtype, np.floating):
        # zeros_like() below copies the dtype; an integer buffer would
        # silently truncate every discounted value.
        rewards = rewards.astype(np.float64)
    if discount is None:
        discount = gamma
    discounted_r = np.zeros_like(rewards)
    running_add = 0
    # Iterate from the last step to the first, carrying the discounted tail sum.
    for t in reversed(range(rewards.size)):
        running_add = running_add * discount + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

#### Run the Agent on the Environment

In [6]:
# Per-episode buffers: observations (xs), "fake labels" (ys) and rewards (drs).
xs, ys, drs = [], [], []
running_reward = None
reward_sum = 0          # total reward collected since the last batch summary
episode_number = 0      # number of episodes completed so far
total_episodes = 50000
# NOTE(review): newer TensorFlow 1.x releases deprecate this in favour of
# tf.global_variables_initializer(); kept as-is for the TF version in use.
init = tf.initialize_all_variables()

rendering = False        # turned on once a batch clears the reward threshold
finished = False         # set on the first "solved" batch; the second one stops training
monitor_started = False  # the gym monitor must only be started once

try:
    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        observation = env.reset() # Obtain an initial observation of the environment

        # gradBuffer accumulates the per-episode gradients (one array per
        # trainable variable) until we are ready to update the policy network.
        gradBuffer = [np.zeros_like(var) for var in sess.run(tvars)]

        while episode_number < total_episodes:

            # Rendering the environment slows things down,
            # so let's only look at it once our agent is doing a good job.
            if rendering:
                env.render()

            # Make sure the observation is in a shape the network can handle.
            x = np.reshape(observation, [1, D])

            # Run the policy network and sample an action from its output.
            tfprob = sess.run(probability, feed_dict={observations: x})
            action = 1 if np.random.uniform() < tfprob else 0

            xs.append(x) # observation
            y = 1 if action == 0 else 0 # a "fake label"
            ys.append(y)

            # step the environment and get new measurements
            observation, reward, done, info = env.step(action)
            reward_sum += reward

            drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

            if not done:
                continue

            # ---- end of episode ----
            episode_number += 1
            # stack together all inputs, labels and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            xs, ys, drs = [], [], [] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # size the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            reward_std = np.std(discounted_epr)
            if reward_std > 0:
                # Skip the division for a constant signal (e.g. a one-step
                # episode) instead of producing NaN advantages.
                discounted_epr /= reward_std

            # Get the gradient for this episode, and save it in the gradBuffer
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, then update the policy network with our gradients.
            # episode_number counts completed episodes, so every batch
            # (including the first) holds exactly batch_size episodes.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                # Fresh zero buffers (zeros_like also clears any NaN/inf, unlike grad * 0).
                gradBuffer = [np.zeros_like(grad) for grad in gradBuffer]

                # Give a summary of how well our network is doing for each batch of episodes.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episode %f.  Total average reward %f.' % (reward_sum/batch_size, running_reward/batch_size))

                if reward_sum/batch_size > 500:
                    print("Task solved in", episode_number, 'episodes!')
                    if finished:
                        break
                    finished = True
                    rendering = True

                reward_sum = 0

            # Start recording once; calling start() after every episode would
            # restart the monitor each time.
            if rendering and not monitor_started:
                env.monitor.start('/tmp/cartpole-experiment-1', force=False)
                monitor_started = True

            observation = env.reset()
finally:
    # Always release the monitor and the render window, even when training is
    # interrupted (e.g. KeyboardInterrupt from the notebook).
    env.monitor.close()
    if env.viewer is not None:
        # The viewer only exists if render() was called at least once.
        env.viewer.close()
        env.viewer = None
print(episode_number, 'Episodes completed.')

Average reward for episode 19.260000.  Total average reward 19.260000.
Average reward for episode 19.120000.  Total average reward 19.258600.
Average reward for episode 28.540000.  Total average reward 19.351414.
Average reward for episode 21.320000.  Total average reward 19.371100.
Average reward for episode 20.460000.  Total average reward 19.381989.
Average reward for episode 20.600000.  Total average reward 19.394169.
Average reward for episode 23.440000.  Total average reward 19.434627.
Average reward for episode 24.000000.  Total average reward 19.480281.
Average reward for episode 23.440000.  Total average reward 19.519878.
Average reward for episode 22.480000.  Total average reward 19.549479.
Average reward for episode 28.560000.  Total average reward 19.639585.
Average reward for episode 22.160000.  Total average reward 19.664789.
Average reward for episode 27.240000.  Total average reward 19.740541.
Average reward for episode 24.740000.  Total average reward 19.790535.
Averag

[2016-11-15 15:48:02,568] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.0.6521.video000000.mp4


Average reward for episode 539.840000.  Total average reward 140.134355.
Task solved in 5900 episodes!


[2016-11-15 15:48:09,961] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.1.6521.video000001.mp4
[2016-11-15 15:48:34,148] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.8.6521.video000008.mp4
[2016-11-15 15:49:43,197] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.27.6521.video000027.mp4


Average reward for episode 191.480000.  Total average reward 140.647812.


[2016-11-15 15:51:51,808] Starting new video recorder writing to /tmp/cartpole-experiment-1/openaigym.video.64.6521.video000064.mp4


KeyboardInterrupt: 