In [3]:
"""
This code list implement the Monte Carlo Policy Gradient Alogrithm.
------------------------------------------------------------------
Input:
    differentiable policy function $\pi_{\theta}(a|s)$

Initalize:
    Parameter $\theta$ for policy function

Repeat  experience trajectory:
    Use $\pi_{\theta}(a|s)$ to generate one trajectory $(s_0,a_0,r_1....s_T)$
    Repeat each step in trajectory:
        G <--- cumlated reward at time step t
        Calculate the policy gradient  $\Delta\theta_t = \alpha \Delta_{\theta}log\pi_{\theta}(s_t, a_t)G_t$
------------------------------------------------------------------
"""
import itertools
import os
import sys
import time
from collections import defaultdict, namedtuple

import gym
import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

%matplotlib inline
matplotlib.style.use('ggplot')

In [5]:
# When True, the training loop calls env.render() before every action.
RENDER_ENV = False

In [6]:
# Create the CartPole-v0 environment whose spaces are inspected below.
env = gym.envs.make('CartPole-v0')
# Work with the underlying environment object rather than gym's wrapper.
# NOTE(review): presumably this also removes the per-episode step limit — confirm.
env = env.unwrapped
# Fixed seed so the environment's randomness is repeatable between runs.
env.seed(1)

# Report the number of actions, the observation dimension, and the observation bounds.
print("env.action_sapce:", env.action_space.n)
print("env.observation_sapce:", env.observation_space.shape[0])
print("env.observation_space.high:", env.observation_space.high)
print("env.observation_space.low:", env.observation_space.low)

env.action_sapce: 2
env.observation_sapce: 4
env.observation_space.high: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
env.observation_space.low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]


In [None]:
class MCPG():
    """Monte Carlo policy-gradient (REINFORCE) agent with a softmax policy network.

    The agent samples actions from a small fully connected network
    (two ReLU hidden layers of 10 units, softmax output), buffers one whole
    episode with `store_transistion`, and performs a single gradient step per
    episode in `learn`, weighting each action's negative log-probability by
    the return that followed it.

    Parameters
    ----------
    env : gym environment
        Must expose `action_space.n` and a vector `observation_space.shape`.
    estimator : object
        Kept only for signature compatibility; the policy network built here
        is the only function approximator and `estimator` is never called.
    actions : sequence
        Action labels to sample from. They are fed to the loss as class
        indices, so they are assumed to be `0 .. n_actions - 1`.
    discount : float
        Discount factor used when accumulating returns.
    alpha : float
        Learning rate handed to the RMSProp optimizer.
    epsilon : float
        Kept only for signature compatibility; exploration comes from
        sampling the stochastic policy, not from epsilon-greedy choices.
    """

    def __init__(self, env, estimator, actions, discount=1.0, alpha=0.5, epsilon=0.1):
        self.env = env
        self.estimator = estimator
        self.actions = actions
        self.n_actions = env.action_space.n
        # Continuous (Box) observation spaces have a `shape`, not an `n`.
        self.n_features = env.observation_space.shape[0]
        self.discount = discount
        self.alpha = alpha
        self.epsilon = epsilon
        # Per-episode buffers, filled by store_transistion and emptied by learn.
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        # Build into a private graph so that constructing a second agent
        # (e.g. re-running the cell) does not collide with existing variables.
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.__build_network()
            init_op = tf.global_variables_initializer()
        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init_op)

    def __build_network(self):
        """Create the placeholders, policy network, loss and training op."""
        # Batch of observations, one row per time step.
        self.tf_observations = tf.placeholder(tf.float32, [None, self.n_features], name='X')
        # Index of the action taken at each step (sparse labels must be integers).
        self.tf_acts = tf.placeholder(tf.int32, [None], name='Y')
        # Return G_t that followed each step; it weights the log-probability.
        self.tf_vt = tf.placeholder(tf.float32, [None], name='G')

        with tf.variable_scope('PolicyNetwork'):
            # c_names(collections_names) are the collections to store variables
            c_names = ['net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            n_l1 = 10
            n_l2 = 10
            w_initializer = tf.random_normal_initializer(0., 0.3)
            b_initializer = tf.constant_initializer(0.1)

            # first layer.
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1],
                                     initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1],
                                     initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.tf_observations, w1) + b1)

            # second layer.
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2],
                                     initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2],
                                     initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)

            # third (output) layer: its own scope and names, fed by the second layer.
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, self.n_actions],
                                     initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, self.n_actions],
                                     initializer=b_initializer, collections=c_names)
                all_action = tf.matmul(l2, w3) + b3
                self.all_action_prob = tf.nn.softmax(all_action)

        with tf.variable_scope('loss'):
            # Maximizing sum(log_p * G) equals minimizing -(log_p * G);
            # sparse softmax cross-entropy is exactly -log_p of the chosen action.
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_action,
                                                                          labels=self.tf_acts)
            self.loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.alpha).minimize(self.loss)

    def choose_action(self, observation):
        """Sample one action from the current policy for a single observation."""
        batch = np.asarray(observation, dtype=np.float32).reshape(1, -1)
        probs = self.sess.run(self.all_action_prob,
                              feed_dict={self.tf_observations: batch})
        # float32 softmax output may miss summing to 1 by more than
        # np.random.choice tolerates, so renormalize in float64 first.
        probs = probs.ravel().astype(np.float64)
        probs /= probs.sum()
        return np.random.choice(self.actions, p=probs)

    def store_transistion(self, s, a, r):
        """Append one (state, action, reward) step to the current episode buffer."""
        self.episode_states.append(s)
        self.episode_actions.append(a)
        self.episode_rewards.append(r)

    def _discounted_returns(self, normalize=True):
        """Return G_t for every buffered step.

        With `normalize=True` the returns are shifted to zero mean and, when
        they are not all equal, scaled to unit standard deviation; this keeps
        the gradient scale independent of episode length and reward magnitude.
        """
        rewards = np.asarray(self.episode_rewards, dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        # Accumulate backwards: G_t = r_t + discount * G_{t+1}.
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + self.discount * running
            returns[t] = running
        if normalize and returns.size:
            returns -= returns.mean()
            spread = returns.std()
            if spread > 0:
                returns /= spread
        return returns

    def learn(self, s, a, r, s_, done):
        """Run one policy-gradient step on the buffered episode, then clear it.

        The positional arguments are accepted for interface compatibility
        only; all training data comes from the steps recorded through
        `store_transistion`. Returns the loss value, or None when the buffer
        is empty.
        """
        if not self.episode_rewards:
            return None
        returns = self._discounted_returns()
        _, loss = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={
                self.tf_observations: np.vstack(self.episode_states),
                self.tf_acts: np.asarray(self.episode_actions, dtype=np.int32),
                self.tf_vt: returns,
            })
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        return loss

In [None]:
def update(RL, env, num_episodes):
    """Run `num_episodes` episodes of `RL` acting in `env` and collect statistics.

    Every step is handed to `RL.store_transistion`; `RL.learn` is called once,
    on the step that ends the episode. The environment is closed before
    returning.

    Returns a `Record` namedtuple with two arrays of length `num_episodes`:
    `episode_lengths` (steps taken) and `episode_rewards` (summed reward).
    """
    Record = namedtuple("Record", ["episode_lengths","episode_rewards"])
    rec = Record(np.zeros(num_episodes), np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # Progress line, rewritten in place on every second episode.
        if (i_episode + 1) % 2 == 0:
            print("This the episode {}/{}".format(i_episode, num_episodes), end = "\r")

        observation = env.reset()
        steps_taken = 0
        done = False
        while not done:
            if RENDER_ENV:
                env.render()

            # Act on the current state and observe the outcome.
            action = RL.choose_action(observation)
            observation_next, reward, done, info = env.step(action)

            # Keep the step for end-of-episode training.
            RL.store_transistion(observation, action, reward)

            # Running statistics for this episode.
            steps_taken += 1
            rec.episode_lengths[i_episode] = steps_taken
            rec.episode_rewards[i_episode] += reward

            if done:
                # Episode finished: train on what was collected.
                RL.learn(observation, action, reward, observation_next, done)
            else:
                observation = observation_next

    print("Finished")
    env.close()
    return rec

In [None]:
def plot_episode_stats(rec, xlabel, ylabel, title):
    """Plot `rec` as one line on a new 20x10 white figure and return the figure.

    `xlabel`, `ylabel` and `title` are applied to the single axes.

    NOTE: a later cell defines another function with this same name, which
    replaces this one once that cell has been run.
    """
    figure = plt.figure(figsize=(20, 10), facecolor="white")
    axes = figure.add_subplot(111)
    axes.plot(rec)
    # Apply the three text decorations in one pass.
    for apply_text, text in ((axes.set_xlabel, xlabel),
                             (axes.set_ylabel, ylabel),
                             (axes.set_title, title)):
        apply_text(text)
    return figure

In [None]:
def plot_episode_stats(stats, xlabel=None, ylabel=None, title=None):
    """Plot episode statistics on a new 20x10 figure and return the figure.

    This is the definition in effect after the notebook has run top to
    bottom, so it supports both ways the notebook calls this name:

    * `stats` has an `episode_lengths` attribute (the record returned by the
      training loop): plot the episode index against cumulative time steps.
      Labels default to "Time Steps" / "Episode" / "Episode per time step".
    * anything else (array, list, pandas Series): plot it as a single line on
      a white figure, using the labels supplied by the caller.

    Any of `xlabel`, `ylabel`, `title` passed explicitly overrides the default;
    a label left as None with no default is simply not set.
    """
    if hasattr(stats, "episode_lengths"):
        lengths = np.asarray(stats.episode_lengths)
        fig = plt.figure(figsize=(20, 10))
        ax = fig.add_subplot(111)
        ax.plot(np.cumsum(lengths), np.arange(len(lengths)))
        defaults = ("Time Steps", "Episode", "Episode per time step")
    else:
        fig = plt.figure(figsize=(20, 10), facecolor="white")
        ax = fig.add_subplot(111)
        ax.plot(stats)
        defaults = (None, None, None)

    setters = (ax.set_xlabel, ax.set_ylabel, ax.set_title)
    for setter, given, default in zip(setters, (xlabel, ylabel, title), defaults):
        text = given if given is not None else default
        if text is not None:
            setter(text)
    return fig

In [None]:
def Plot_the_result(rec):
    """Show three figures for a training record.

    1. Episode length per episode.
    2. Episode reward per episode, smoothed with a 10-episode rolling mean
       (the first 9 points are NaN because a full window is required).
    3. Episode index against cumulative time steps.

    `rec` must provide `episode_lengths` and `episode_rewards`.
    """
    # 1) Raw episode lengths.
    length_fig = plot_episode_stats(
        rec.episode_lengths,
        xlabel="Episode",
        ylabel="Episode Length",
        title="Episode length over Time",
    )
    length_fig.show()

    # 2) Rolling-mean reward curve.
    window = 10
    rewards = pd.Series(rec.episode_rewards)
    smoothed_rewards = rewards.rolling(window, min_periods=window).mean()
    reward_fig = plot_episode_stats(
        smoothed_rewards,
        xlabel="Episode",
        ylabel="Episode Reward",
        title="Episode reward over time",
    )
    reward_fig.show()

    # 3) Episodes completed as a function of elapsed time steps.
    timestep_fig = plot_episode_stats(rec)
    timestep_fig.show()

In [None]:
# Entry point: train the agent on MountainCar-v0, plot the statistics, and
# report the wall-clock time taken.
if __name__ == "__main__":
    start_time = time.time()
    num_episodes = 1000
    env = gym.make("MountainCar-v0")
    actions = list(range(env.action_space.n))
    # No estimator object is defined anywhere in this notebook (the original
    # call referenced an undefined name), and MCPG's constructor does not use
    # the argument, so pass None to satisfy the signature.
    # NOTE(review): alpha=0.5 is carried over unchanged — confirm it suits the
    # optimizer MCPG trains with.
    RL = MCPG(env, None, actions, discount=1.0, alpha=0.5, epsilon=0.1)
    rec = update(RL, env, num_episodes=num_episodes)
    #Plot the result
    Plot_the_result(rec)

    end_time = time.time()
    print("This alogrithm cost time is :",end_time-start_time)
