In [1]:
import tensorflow as tf

In [2]:
from tensorflow.contrib.layers import fully_connected

In [3]:
import os

import gym
import numpy as np

In [4]:
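# Optional: uncomment to start from an empty default graph when re-running the
# graph-building cells below in the same kernel.
# tf.reset_default_graph()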

In [5]:
# 1. Specify the neural network architecture
# These module-level names are read by the graph-building cell, the training
# loop and predict().
n_inputs = 4 # == env.observation_space.shape[0]
n_hidden = 4 # it's a simple task, we don't need more hidden neurons 
n_outputs = 1 # only outputs the probability of accelerating left 
# Weight initializer shared by the hidden layer and the output layer.
initializer = tf.contrib.layers.variance_scaling_initializer()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [6]:
# 2. Build the neural network
# Input: a batch of observations, one row of n_inputs features per observation.
X_ = tf.placeholder(tf.float32, shape=[None, n_inputs], name="X_")
hidden = fully_connected(X_, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)
logits = fully_connected(hidden, n_outputs, activation_fn=None, weights_initializer=initializer)
# The explicit names "X_" and "Y_proba" are what predict() uses to look these
# tensors up again after restoring the checkpoint.
outputs = tf.nn.sigmoid(logits, name="Y_proba")

# 3. Select a random action based on the estimated probabilities
# Column 0 is P(action 0), column 1 is P(action 1) = 1 - P(action 0).
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
# tf.random.categorical is the replacement named by TensorFlow's deprecation
# notice for tf.multinomial; it takes the same log-probabilities and returns
# one sampled action index per input row.
action = tf.random.categorical(tf.log(p_left_and_right), num_samples=1)
# NOTE(review): this initializer is created before the optimizer exists and is
# reassigned in a later cell; the later one is the one the training loop runs.
init = tf.global_variables_initializer()

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use `tf.random.categorical` instead.


In [7]:
# Training target: pretend the sampled action was the right one.
# action == 0 -> y == 1.0 and action == 1 -> y == 0.0, i.e. y is the target
# for the network's single output (the probability of action 0).
# tf.cast replaces the deprecated tf.to_float, as TensorFlow's own warning asks.
y = 1. - tf.cast(action, tf.float32)

Instructions for updating:
Use `tf.cast` instead.


In [8]:
# Loss and optimizer for the policy network.
learning_rate = 0.01
# Cross-entropy between the target y (derived from the sampled action) and the
# network's raw logits.
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits( labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate) 
# Only compute the gradients here; they are applied later through separate
# placeholders once they have been weighted by the episode rewards.
# Yields (gradient, variable) pairs.
grads_and_vars = optimizer.compute_gradients(cross_entropy)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
# Keep only the gradient tensor of every (gradient, variable) pair so the
# training loop can fetch all of them in a single sess.run call.
gradients = [pair[0] for pair in grads_and_vars]

In [10]:
# One float32 placeholder per gradient, with the same static shape as that
# gradient; the training loop feeds the reward-weighted mean gradients here.
gradient_placeholders = [
    tf.placeholder(tf.float32, shape=grad.get_shape())
    for grad, _ in grads_and_vars
]
# Pair every placeholder with the variable its gradient belongs to.
grads_and_vars_feed = [
    (placeholder, variable)
    for placeholder, (_, variable) in zip(gradient_placeholders, grads_and_vars)
]
# Applies whatever values are fed into the placeholders to the variables.
training_op = optimizer.apply_gradients(grads_and_vars_feed)


In [11]:
# Re-create the initializer now that the optimizer ops have been added to the
# graph; this is the `init` the training loop runs.
# NOTE(review): presumably needed so variables created by the optimizer are
# initialised as well - confirm.
init = tf.global_variables_initializer()
# Used by the training loop to write the checkpoint that predict() restores.
saver = tf.train.Saver()

In [12]:
def discount_rewards(rewards, discount_rate=0.95):
    """Return the discounted cumulative reward at every step of one episode.

    Element ``i`` of the result is
    ``rewards[i] + discount_rate * rewards[i + 1] + discount_rate**2 * rewards[i + 2] + ...``.

    Args:
        rewards: sequence of raw per-step rewards for a single episode.
        discount_rate: factor applied to the running total for each step
            further into the future.

    Returns:
        A 1-D float numpy array with the same length as ``rewards``.
    """
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    # Walk backwards so each step can reuse the total already accumulated for
    # the steps that follow it.
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards


def discount_and_normalize_rewards(all_rewards, discount_rate=0.95):
    """Discount the rewards of several episodes and standardise them jointly.

    Every episode is discounted with ``discount_rewards``; the mean and the
    standard deviation are then computed over all steps of all episodes
    together and used to normalise each episode.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one 1-D numpy array of normalised scores per episode, in
        the same order as ``all_rewards``.
    """
    # Forward discount_rate explicitly; without it the caller's value would be
    # silently replaced by discount_rewards' own default.
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All discounted rewards are identical: there is nothing to rank, so
        # return the centred values instead of dividing by zero.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [13]:
# Training hyperparameters.
n_iterations = 251  # number of training iterations
n_max_steps = 1000  # max steps per episode
n_games_per_update = 10  # train the policy every 10 episodes
save_iterations = 10  # save the model every 10 training iterations
# This assignment had been swallowed by the comment on the previous line, so
# the name was never defined. The value equals the default discount rate of
# discount_rewards / discount_and_normalize_rewards.
discount_rate = 0.95

In [14]:
# Environment shared by the training loop and by predict().
# NOTE(review): the evaluation output in this notebook never exceeds 500
# steps, so the environment presumably ends episodes on its own before
# n_max_steps (1000) is reached - confirm.
env = gym.make('CartPole-v1')

In [15]:
# Policy-gradient training loop.
#
# Each iteration plays n_games_per_update episodes with the current policy,
# recording the raw reward and the gradients at every step, then applies the
# reward-weighted mean of those gradients in a single optimizer update.

# The checkpoint directory must exist before saver.save() is called; create it
# up front so a fresh checkout does not fail after the first batch of episodes.
os.makedirs("./ckpt", exist_ok=True)

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        all_rewards = []    # all sequences of raw rewards for each episode
        all_gradients = []  # gradients saved at each step of each episode
        for game in range(n_games_per_update):
            current_rewards = []    # all raw rewards from the current episode
            current_gradients = []  # all gradients from the current episode
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample one action for this single observation and fetch the
                # gradients in the same run, so both refer to the same sample.
                action_val, gradients_val = sess.run(
                    [action, gradients],
                    feed_dict={X_: obs.reshape(1, n_inputs)})  # one obs
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        # At this point we have run the policy for n_games_per_update episodes,
        # and we are ready for a policy update: turn the raw rewards into
        # normalised action scores and use them to weight the saved gradients.
        all_rewards = discount_and_normalize_rewards(all_rewards)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            # multiply the gradients by the action scores, and compute the mean
            # over every step of every episode
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                 for game_index, rewards in enumerate(all_rewards)
                 for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            print('{} epoch \n'.format(iteration))
            saver.save(sess, "./ckpt/my_policy_net_pg.ckpt")

0 epoch 

10 epoch 

20 epoch 

30 epoch 

40 epoch 

50 epoch 

60 epoch 

70 epoch 

80 epoch 

90 epoch 

100 epoch 

110 epoch 

120 epoch 

130 epoch 

140 epoch 

150 epoch 

160 epoch 

170 epoch 

180 epoch 

190 epoch 

200 epoch 

210 epoch 

220 epoch 

230 epoch 

240 epoch 

250 epoch 



In [16]:
def predict(n_episodes=20, max_steps=1000):
    """Play the saved policy greedily and return the mean episode length.

    The checkpoint written by the training loop is restored into a private
    graph, so calling this function does not add ops to the notebook's default
    graph and repeated calls do not accumulate duplicate copies of the model.
    At every step the more probable action is taken (no sampling), the
    environment is rendered, and the length of each finished episode is
    printed.

    Args:
        n_episodes: number of evaluation episodes to play (must be positive).
        max_steps: upper bound on the steps played per episode (must be
            positive).

    Returns:
        The average number of steps per episode, as a float.

    Raises:
        ValueError: if ``n_episodes`` or ``max_steps`` is not positive.
    """
    if n_episodes <= 0 or max_steps <= 0:
        raise ValueError("n_episodes and max_steps must be positive")

    episode_lengths = []
    with tf.Graph().as_default() as graph, tf.Session(graph=graph) as sess:
        saver = tf.train.import_meta_graph('./ckpt/my_policy_net_pg.ckpt.meta')
        saver.restore(sess, "./ckpt/my_policy_net_pg.ckpt")

        # Look the tensors up by the names given when the network was built.
        proba_op = graph.get_tensor_by_name("Y_proba:0")
        input_op = graph.get_tensor_by_name("X_:0")

        try:
            for i_episode in range(n_episodes):
                obs = env.reset()
                for t in range(max_steps):
                    env.render()
                    # Probability of action 0 for this single observation.
                    p_first = sess.run(
                        proba_op,
                        feed_dict={input_op: obs.reshape(1, n_inputs)})[0][0]
                    # Greedy choice: action 0 when it is at least as likely as
                    # action 1. Same decision as argmax over the two
                    # log-probabilities, without taking log(0).
                    chosen_action = 0 if p_first >= 0.5 else 1
                    obs, reward, done, info = env.step(chosen_action)
                    if done or t == max_steps - 1:
                        print("Episode finished after {} timesteps".format(t+1))
                        episode_lengths.append(t+1)
                        break
        finally:
            # Always release the render window, even if an episode fails.
            env.close()
    return sum(episode_lengths)/n_episodes

In [17]:
# Evaluate the saved policy; the cell's value is the mean episode length
# returned by predict().
predict()

INFO:tensorflow:Restoring parameters from ./ckpt/my_policy_net_pg.ckpt
Episode finished after 387 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 267 timesteps
Episode finished after 313 timesteps
Episode finished after 280 timesteps
Episode finished after 307 timesteps
Episode finished after 500 timesteps
Episode finished after 270 timesteps
Episode finished after 291 timesteps
Episode finished after 306 timesteps
Episode finished after 334 timesteps
Episode finished after 340 timesteps
Episode finished after 385 timesteps
Episode finished after 358 timesteps
Episode finished after 500 timesteps
Episode finished after 288 timesteps
Episode finished after 264 timesteps
Episode finished after 500 timesteps
Episode finished after 380 timesteps


363.5