In [None]:
import tensorflow as tf
import numpy as np
import gym

# Create and set up the environment

In [None]:
# Build the MountainCar environment and keep the raw (unwrapped) instance.
env = gym.make('MountainCar-v0')
env = env.unwrapped

# Seed both the environment and NumPy's global RNG: actions are sampled with
# np.random.choice during training, so seeding only the environment would still
# leave runs non-reproducible.
env.seed(1)
np.random.seed(1)

# Read the observation width from the environment instead of hard-coding it
# (this evaluates to 2 for MountainCar-v0, the value used previously).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

max_episodes = 1000   # Number of training episodes (one policy update each)
learning_rate = 0.01  # Step size handed to the optimizer
gamma = 0.95 # Discount rate

# Discounted Rewards

In [None]:
def discount_and_normalize_rewards(episode_rewards, discount_rate=None):
    """Compute discounted returns for one episode and standardize them.

    For every step ``i`` the return is ``r[i] + d * r[i+1] + d**2 * r[i+2] + ...``
    where ``d`` is the discount rate. The returns are then shifted to zero mean
    and, when they are not all identical, scaled to unit standard deviation.

    Args:
        episode_rewards: Sequence of per-step rewards (ints or floats), in the
            order they were received.
        discount_rate: Discount factor to use. When ``None`` (the default) the
            notebook-level ``gamma`` is used, which matches the previous
            single-argument behaviour.

    Returns:
        A float64 ``numpy.ndarray`` with the same length as ``episode_rewards``.
        An empty input yields an empty array; an episode whose discounted
        returns are all equal (e.g. a single step) yields zeros instead of NaN.
    """
    if discount_rate is None:
        discount_rate = gamma

    # Always accumulate in floating point: with integer rewards an
    # integer-typed buffer would silently truncate the discounted returns.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounted_episode_rewards

    # Walk backwards so each return reuses the one computed for the next step.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_rate + rewards[i]
        discounted_episode_rewards[i] = cumulative

    discounted_episode_rewards -= np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    # Skip the scaling when the spread is (numerically) zero to avoid 0/0 = NaN.
    if std > np.finfo(np.float64).eps:
        discounted_episode_rewards /= std

    return discounted_episode_rewards

# Neural Network

In [None]:
import time

def _dense(layer_inputs, units, activation):
    """Build one fully connected layer with the settings shared by the whole network.

    Every layer uses Xavier-initialized weights and an L2 weight regularizer
    with scale 0.001; only the input tensor, width and activation differ.
    """
    return tf.contrib.layers.fully_connected(
        inputs=layer_inputs,
        num_outputs=units,
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.001),
        activation_fn=activation,
        weights_initializer=tf.contrib.layers.xavier_initializer())


# Policy network: state -> 30 ReLU units -> 10 ReLU units -> action logits.
g = tf.Graph()
with g.as_default():
    # NOTE: every op below, including the layers, loss and train op, is created
    # inside the "inputs" name scope, so their graph names carry that prefix.
    with tf.name_scope("inputs"):
        # Per-step batch for one episode: states, one-hot actions and returns.
        input_ = tf.placeholder(tf.float32, [None, state_size], name="input_")
        actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
        discounted_episode_rewards_ = tf.placeholder(tf.float32, [None], name="discounted_episode_rewards")

        # These two placeholders exist only so the values can be logged to TensorBoard.
        mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")
        avg_loss = tf.placeholder(tf.float32, name="avg_loss")

        with tf.name_scope("fc1"):
            fc1 = _dense(input_, 30, tf.nn.relu)

        with tf.name_scope("fc2"):
            fc2 = _dense(fc1, 10, tf.nn.relu)

        with tf.name_scope("fc3"):
            # No activation here: fc3 holds the raw logits, one per action.
            fc3 = _dense(fc2, action_size, None)

        with tf.name_scope("softmax"):
            action_distribution = tf.nn.softmax(fc3)

        with tf.name_scope("loss"):
            # The cross entropy between the logits and the one-hot action taken
            # is the negative log-probability of that action. For single-class
            # labels tf.nn.sparse_softmax_cross_entropy_with_logits would avoid
            # the dense one-hot encoding.
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_constant = 0.01  # Choose an appropriate one.
            neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3, labels=actions)
            # Return-weighted negative log-probability, averaged over the
            # episode, plus the scaled sum of the layers' L2 penalties.
            loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_) + reg_constant * sum(reg_losses)

        with tf.name_scope("train"):
            train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Set up TensorBoard

In [None]:
# Create the TensorBoard writer and register the scalar summaries on graph `g`.
with g.as_default():
    writer = tf.summary.FileWriter("./tensorboard/pg/2", g)

    # (tag, tensor) pairs, registered in this order: the loss, the running
    # average loss and the running mean reward (the latter two are fed in
    # through placeholders).
    for tag, tensor in (("Loss", loss),
                        ("Average-Loss", avg_loss),
                        ("Reward_mean", mean_reward_)):
        tf.summary.scalar(tag, tensor)

    # Single op that evaluates every summary registered above.
    write_op = tf.summary.merge_all()

# Train Agent 

In [None]:
# Monte-Carlo policy-gradient training: play one full episode with the current
# policy, then perform a single gradient step on that episode's transitions.
allRewards = []            # Total (undiscounted) reward of every finished episode
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
episode_states, episode_actions, episode_rewards = [],[],[]
losses = []                # Training loss of every update, used for the running average
avgloss = 1
max_steps_per_episode = 2500  # Manual cap: an episode is forced to end once this many actions were taken
with g.as_default():
    saver = tf.train.Saver()

env.reset()

with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())

    # each episode is one trajectory
    for episode in range(max_episodes):
        episode_rewards_sum = 0
        state = env.reset()
        env.render()
        step = 1
        while True:
            step += 1
            # Render the whole episode only every 100th episode.
            if episode % 100 == 0 and episode != 0:
                env.render()

            # The policy is stochastic: the network outputs a probability per
            # action and the action is sampled from that distribution.
            # Reshape with state_size so the batch matches the input_ placeholder.
            action_probability_distribution = sess.run(
                action_distribution,
                feed_dict={input_: state.reshape([1, state_size])})

            action = np.random.choice(range(action_probability_distribution.shape[1]),
                                      p=action_probability_distribution.ravel())

            # Perform a
            new_state, reward, done, info = env.step(action)
            if step > max_steps_per_episode:
                done = True

            # Store s, a, r
            episode_states.append(state)

            # The loss expects one-hot labels of length action_size, not the
            # bare action index, so encode the chosen action accordingly.
            action_ = np.zeros(action_size)
            action_[action] = 1
            episode_actions.append(action_)

            episode_rewards.append(reward)
            if done:
                # Episode statistics
                episode_rewards_sum = np.sum(episode_rewards)
                allRewards.append(episode_rewards_sum)
                total_rewards = np.sum(allRewards)
                mean_reward = np.divide(total_rewards, episode+1)
                maximumRewardRecorded = np.amax(allRewards)

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)

                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)

                # Build the episode batch once; it is reused for the summaries below.
                train_feed = {input_: np.vstack(np.array(episode_states)),
                              actions: np.vstack(np.array(episode_actions)),
                              discounted_episode_rewards_: discounted_episode_rewards}

                # Feedforward, gradient and backpropagation
                loss_, train_, fc3_, actions_ = sess.run(
                    [loss, train_opt, fc3, action_distribution],
                    feed_dict=train_feed)
                losses.append(loss_)
                avgloss = np.mean(losses)
                print(loss_)

                # Write TF Summaries. This is a second run on the same batch,
                # executed after the update above, with the running statistics
                # fed through their placeholders.
                summary_feed = dict(train_feed)
                summary_feed[mean_reward_] = mean_reward
                summary_feed[avg_loss] = avgloss
                summary = sess.run(write_op, feed_dict=summary_feed)

                writer.add_summary(summary, episode)
                writer.flush()

                # Reset the transition stores
                episode_states, episode_actions, episode_rewards = [],[],[]

                break

            state = new_state

        # Save Model every 100 episodes, and also after the very last episode so
        # the final weights are not lost when the session closes.
        if episode % 100 == 0 or episode == max_episodes - 1:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")