In [1]:
"""
This code implements the Monte Carlo Policy Gradient algorithm.
------------------------------------------------------------------
Input:
    differentiable policy function $\pi_{\theta}(a|s)$

Initialize:
    Parameter $\theta$ for policy function

Repeat for each experience trajectory:
    Use $\pi_{\theta}(a|s)$ to generate one trajectory $(s_0,a_0,r_1,...,s_T)$
    Repeat for each step t in the trajectory:
        G_t <--- cumulative discounted reward from time step t
        Calculate the policy gradient update $\Delta\theta_t = \alpha \nabla_{\theta}\log\pi_{\theta}(a_t|s_t)G_t$
------------------------------------------------------------------
"""
import time
import pandas as pd
import gym
import os 
import sys
import numpy as np
import tensorflow as tf
from collections import defaultdict, namedtuple
from keras.models import Sequential, load_model
from keras.layers import Convolution2D, Flatten, ZeroPadding2D
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.pooling import MaxPooling2D
from keras.optimizers import SGD , Adam
import keras.backend.tensorflow_backend as KTF
import keras.backend as K
from keras import models,layers,optimizers

import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline
matplotlib.style.use('ggplot')

Using TensorFlow backend.


In [2]:
# Run configuration shared by the cells below.

# When true, update() calls env.render() before every step.
RENDER_ENV = False
# Passed to MCPG as `learning_rate`.
LEARNING_RATE = 0.01
# Passed to MCPG as `reward_decay`; used there as the discount factor gamma.
REWARD_DECAY = 0.95
# Passed to MCPG as `ouput_graph`; MCPG.__init__ does not read it.
OUTPUT_GRAPH = False
# Environment id handed to gym.make().
ENVNAME = "CartPole-v0"
# Unit counts of the first and second hidden Dense layers in MCPG.createModel().
N_LAYER1 = 10
N_LAYER2 = 10
# Number of episodes passed to update().
NUM_EPISODES = 200
# NOTE(review): not referenced by any cell visible here (the Dense layers use
# "relu") — confirm whether this is still needed.
ACTIVATION_FUNCTION = tf.nn.tanh

In [7]:
class MCPG():
    """Monte Carlo policy-gradient agent whose policy is a small Keras MLP.

    Transitions of the running episode are buffered with
    ``store_transistion`` and consumed by ``learn``. ``learn`` fits the
    softmax policy against one-hot action targets scaled by the normalized
    discounted return; combined with the categorical cross-entropy loss this
    gives the per-step loss ``-G_t * log pi(a_t | s_t)``.

    Parameters
    ----------
    n_action : int
        Number of discrete actions (width of the softmax output layer).
    n_feature : int
        Length of one observation vector (input width of the network).
    learning_rate : float
        Step size handed to the RMSprop optimizer.
    reward_decay : float
        Discount factor gamma used when accumulating returns.
    ouput_graph : bool
        Accepted for backward compatibility (the spelling is part of the
        existing interface); it is not used.
    """

    def __init__(self,
                 n_action,
                 n_feature,
                 learning_rate=0.01,
                 reward_decay=0.95,
                 ouput_graph=False,
                ):
        self.n_action = n_action
        self.n_features = n_feature
        self.gamma = reward_decay
        self.learning_rate = learning_rate
        # Per-episode buffers; emptied again at the end of learn().
        self.episode_observation = []
        self.episode_actions = []
        self.episode_rewards = []
        self.model = self.createModel()

    def createModel(self):
        """Build, compile and return the policy network.

        The hidden layer sizes come from the module-level constants
        ``N_LAYER1`` and ``N_LAYER2``.
        """
        model = Sequential()
        model.add(Dense(N_LAYER1, activation="relu", input_shape=(self.n_features,)))
        model.add(layers.Dropout(0.5))
        model.add(Dense(N_LAYER2, activation="relu"))
        model.add(layers.Dropout(0.5))
        model.add(Dense(self.n_action, activation="softmax"))
        # Honour the learning rate given to the constructor instead of a
        # hard-coded value that silently ignored it.
        model.compile(optimizer=optimizers.RMSprop(lr=self.learning_rate),
                      loss="categorical_crossentropy", metrics=["accuracy"])
        model.summary()
        return model

    def choose_action(self, observation):
        """Sample an action index from the policy's distribution for one state.

        ``observation`` must be a 1-D array; a batch axis is added before it
        is passed to the model.
        """
        feed_state = observation[np.newaxis, :]
        prob_weights = self.model.predict(feed_state)
        # The softmax output is low precision and may miss np.random.choice's
        # sum-to-one tolerance, so renormalize in float64 before sampling.
        probabilities = np.asarray(prob_weights, dtype=np.float64).ravel()
        probabilities /= probabilities.sum()
        action = np.random.choice(probabilities.shape[0], p=probabilities)
        return action

    def store_transistion(self, s, a, r):
        """Append one (state, action, reward) step to the episode buffers."""
        self.episode_observation.append(s)
        self.episode_actions.append(a)
        self.episode_rewards.append(r)

    def learn(self):
        """Train on the buffered episode, clear the buffers, return the returns.

        Returns
        -------
        numpy.ndarray
            The normalized discounted return of every buffered step (an empty
            array when nothing was buffered, in which case no fit happens).
        """
        episode_length = len(self.episode_observation)
        if episode_length == 0:
            # Nothing to train on; np.vstack would raise on an empty list.
            return np.zeros(0, dtype=np.float64)

        # Discount and normalize the episode reward.
        discounted_episode_reward_normalized = self._discount_and_norm_rewards()

        # One-hot targets scaled by the return: row i is non-zero only in the
        # column of the action actually taken at step i.
        actions = np.asarray(self.episode_actions, dtype=np.int64)
        advantage = np.zeros((episode_length, self.n_action))
        advantage[np.arange(episode_length), actions] = discounted_episode_reward_normalized

        # Train.
        self.model.fit(np.vstack(self.episode_observation), advantage, verbose=0)

        self.episode_observation = []
        self.episode_actions = []
        self.episode_rewards = []
        return discounted_episode_reward_normalized

    def _discount_and_norm_rewards(self):
        """Return the discounted returns of the buffered rewards, standardized.

        The result is a float64 array with zero mean. It is scaled to unit
        standard deviation unless that deviation is zero (for example a
        single-step episode), in which case the centered values are returned
        unscaled rather than dividing by zero.
        """
        # Force float64 so integer rewards are neither truncated nor rejected
        # by the in-place arithmetic below.
        rewards = np.asarray(self.episode_rewards, dtype=np.float64)
        discounted_episode_reward = np.zeros_like(rewards)
        if rewards.size == 0:
            return discounted_episode_reward

        running_add = 0.0
        for t in reversed(range(rewards.shape[0])):
            running_add = running_add * self.gamma + rewards[t]
            discounted_episode_reward[t] = running_add

        # Normalize.
        discounted_episode_reward -= np.mean(discounted_episode_reward)
        spread = np.std(discounted_episode_reward)
        if spread > 0:
            discounted_episode_reward /= spread
        return discounted_episode_reward

In [9]:
def plot_episode_stats1(rec, xlabel, ylabel,title):
    """Draw `rec` as a line plot on a new 20x10 white figure and return the figure.

    `xlabel`, `ylabel` and `title` are applied to the single subplot.
    """
    figure = plt.figure(figsize=(20, 10), facecolor = "white")
    axes = figure.add_subplot(111)
    axes.plot(rec)
    # Annotate the axes; the three calls are independent of each other.
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    return figure

def plot_episode_stats2(stats):
    """Plot episode index against cumulative time steps; return the new figure.

    `stats` must expose an `episode_lengths` sequence.
    """
    figure = plt.figure(figsize=(20, 10))
    lengths = stats.episode_lengths
    elapsed_steps = np.cumsum(lengths)
    episode_index = np.arange(len(lengths))
    plt.plot(elapsed_steps, episode_index)
    plt.xlabel("Time Steps")
    plt.ylabel("Episode")
    plt.title("Episode per time step")
    return figure

def Plot_the_result(rec):
    """Show three figures summarizing a training run.

    `rec` must expose `episode_lengths` and `episode_rewards`. The figures
    are, in order: episode length per episode, the 10-episode rolling mean of
    the episode reward, and episode index against cumulative time steps.
    """
    # Figure 1: raw episode lengths.
    length_fig = plot_episode_stats1(rec.episode_lengths,
                                     xlabel="Episode",
                                     ylabel="Episode Length",
                                     title="Episode length over Time")
    length_fig.show()
#    length_fig.savefig("./log/FA_QLearning_MountainCar_EpisodeLength.jpg")

    # Figure 2: smoothed rewards; the first window-1 entries stay NaN because
    # min_periods equals the window size.
    window = 10
    reward_series = pd.Series(rec.episode_rewards)
    smoothed_rewards = reward_series.rolling(window, min_periods=window).mean()
    reward_fig = plot_episode_stats1(smoothed_rewards,
                                     xlabel="Episode",
                                     ylabel="Episode Reward",
                                     title="Episode reward over time")
    reward_fig.show()
#    reward_fig.savefig("./log/FA_QLearning_Mountain_EpisodeReward.jpg")

    # Figure 3: episodes completed per elapsed time step.
    progress_fig = plot_episode_stats2(rec)
    progress_fig.show()
#    progress_fig.savefig("./log/FA_QLearning_Mountain_EpisodePerTimeStep.jpg")

In [5]:
def update(RL, env, num_episodes):
    """Run `num_episodes` episodes of `RL` in `env` and return per-episode stats.

    Each step asks `RL` for an action, applies it to `env` and stores the
    (state, action, reward) triple on `RL`. When the environment reports the
    episode as done, `RL.learn()` is called once. `env` is closed before
    returning.

    Returns a "Record" namedtuple with two arrays of length `num_episodes`:
    `episode_lengths` (steps taken) and `episode_rewards` (sum of rewards).
    """
    Record = namedtuple("Record", ["episode_lengths","episode_rewards"])
    rec = Record(episode_lengths=np.zeros(num_episodes),
                 episode_rewards=np.zeros(num_episodes))

    for episode_idx in range(num_episodes):
        # Progress line, overwritten in place via the carriage return.
        print("This the episode {}/{}".format(episode_idx, num_episodes), end = "\r")
        state = env.reset()
        steps_taken = 0
        finished = False
        while not finished:
            if RENDER_ENV:
                env.render()
            # Pick an action for the current state and apply it.
            action = RL.choose_action(state)
            next_state, reward, finished, _ = env.step(action)
            # Remember the transition for the end-of-episode training step.
            RL.store_transistion(state, action, reward)
            # Keep the running statistics of this episode up to date.
            steps_taken += 1
            rec.episode_lengths[episode_idx] = steps_taken
            rec.episode_rewards[episode_idx] += reward
            state = next_state
        # Episode over: train on everything collected during it.
        RL.learn()
        print("The reward at episode {} is {}.".format(episode_idx,
                                                      rec.episode_rewards[episode_idx]))
    print("Finished")
    env.close()
    return rec

In [8]:
# Driver: build the environment and the agent, train, then plot the statistics.
if __name__ == "__main__":
    env = gym.make(ENVNAME)
    # NOTE(review): `unwrapped` presumably drops gym's wrappers (including any
    # per-episode step limit), so an episode only ends when the environment
    # itself reports done — confirm this is intended.
    env = env.unwrapped
    # Size the policy network from the environment's action/observation spaces.
    RL = MCPG(n_action=env.action_space.n,
             n_feature=env.observation_space.shape[0],
             learning_rate=LEARNING_RATE,
             reward_decay=REWARD_DECAY,
             ouput_graph=OUTPUT_GRAPH)
    # update() runs the training loop and closes `env` before returning.
    rec = update(RL, env, num_episodes=NUM_EPISODES)
    # Plot episode lengths, smoothed rewards and episodes per time step.
    Plot_the_result(rec)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 10)                50        
_________________________________________________________________
dropout_3 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                110       
_________________________________________________________________
dropout_4 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 22        
Total params: 182
Trainable params: 182
Non-trainable params: 0
_________________________________________________________________
The reward at episode 0 is 25.0.
The reward at episode 1 is 13.0.
The reward at episode 2 is 15.0.
The reward at episode 3

The reward at episode 225 is 17.0.
The reward at episode 226 is 44.0.
The reward at episode 227 is 27.0.
The reward at episode 228 is 27.0.
The reward at episode 229 is 19.0.
The reward at episode 230 is 16.0.
The reward at episode 231 is 30.0.
The reward at episode 232 is 17.0.
The reward at episode 233 is 30.0.
The reward at episode 234 is 28.0.
The reward at episode 235 is 13.0.
The reward at episode 236 is 9.0.
The reward at episode 237 is 27.0.
The reward at episode 238 is 20.0.
The reward at episode 239 is 45.0.
The reward at episode 240 is 22.0.
The reward at episode 241 is 23.0.
The reward at episode 242 is 29.0.
The reward at episode 243 is 14.0.
The reward at episode 244 is 30.0.
The reward at episode 245 is 18.0.
The reward at episode 246 is 14.0.
The reward at episode 247 is 17.0.
The reward at episode 248 is 14.0.
The reward at episode 249 is 18.0.
The reward at episode 250 is 15.0.
The reward at episode 251 is 18.0.
The reward at episode 252 is 16.0.
The reward at episode

The reward at episode 464 is 17.0.
The reward at episode 465 is 42.0.
The reward at episode 466 is 13.0.
The reward at episode 467 is 13.0.
The reward at episode 468 is 14.0.
The reward at episode 469 is 13.0.
The reward at episode 470 is 30.0.
The reward at episode 471 is 10.0.
The reward at episode 472 is 18.0.
The reward at episode 473 is 34.0.
The reward at episode 474 is 41.0.
The reward at episode 475 is 13.0.
The reward at episode 476 is 12.0.
The reward at episode 477 is 11.0.
The reward at episode 478 is 20.0.
The reward at episode 479 is 29.0.
The reward at episode 480 is 27.0.
The reward at episode 481 is 21.0.
The reward at episode 482 is 41.0.
The reward at episode 483 is 29.0.
The reward at episode 484 is 13.0.
The reward at episode 485 is 9.0.
The reward at episode 486 is 17.0.
The reward at episode 487 is 18.0.
The reward at episode 488 is 25.0.
The reward at episode 489 is 21.0.
The reward at episode 490 is 19.0.
The reward at episode 491 is 24.0.
The reward at episode

The reward at episode 698 is 18.0.
The reward at episode 699 is 12.0.
The reward at episode 700 is 19.0.
The reward at episode 701 is 21.0.
The reward at episode 702 is 13.0.
The reward at episode 703 is 18.0.
The reward at episode 704 is 37.0.
The reward at episode 705 is 16.0.
The reward at episode 706 is 12.0.
The reward at episode 707 is 26.0.
The reward at episode 708 is 15.0.
The reward at episode 709 is 18.0.
The reward at episode 710 is 14.0.
The reward at episode 711 is 14.0.
The reward at episode 712 is 20.0.
The reward at episode 713 is 29.0.
The reward at episode 714 is 24.0.
The reward at episode 715 is 47.0.
The reward at episode 716 is 13.0.
The reward at episode 717 is 14.0.
The reward at episode 718 is 60.0.
The reward at episode 719 is 14.0.
The reward at episode 720 is 11.0.
The reward at episode 721 is 23.0.
The reward at episode 722 is 15.0.
The reward at episode 723 is 13.0.
The reward at episode 724 is 15.0.
The reward at episode 725 is 29.0.
The reward at episod

The reward at episode 947 is 22.0.
The reward at episode 948 is 21.0.
The reward at episode 949 is 40.0.
The reward at episode 950 is 16.0.
The reward at episode 951 is 18.0.
The reward at episode 952 is 12.0.
The reward at episode 953 is 9.0.
The reward at episode 954 is 12.0.
The reward at episode 955 is 14.0.
The reward at episode 956 is 38.0.
The reward at episode 957 is 44.0.
The reward at episode 958 is 15.0.
The reward at episode 959 is 13.0.
The reward at episode 960 is 17.0.
The reward at episode 961 is 29.0.
The reward at episode 962 is 22.0.
The reward at episode 963 is 11.0.
The reward at episode 964 is 17.0.
The reward at episode 965 is 23.0.
The reward at episode 966 is 10.0.
The reward at episode 967 is 15.0.
The reward at episode 968 is 13.0.
The reward at episode 969 is 59.0.
The reward at episode 970 is 28.0.
The reward at episode 971 is 28.0.
The reward at episode 972 is 14.0.
The reward at episode 973 is 10.0.
The reward at episode 974 is 16.0.
The reward at episode

NameError: name 'plot_episode_stats' is not defined