In [1]:
# reference from

import numpy as np
import tensorflow as tf
from tqdm import tqdm
import gym
import time
import matplotlib.pyplot as plt

# --- Experiment configuration -------------------------------------------
MAX_EPISODES = 200        # number of training episodes run by the main loop
MAX_EP_STEPS = 200        # environment steps per episode
ENV_NAME = 'Pendulum-v0'  # Gym environment id passed to gym.make
MEMORY_CAPACITY = 10000   # replay size; learning starts once this many steps are stored
RENDER = True             # call env.render() on every step (slows training)

# --- Reproducibility ------------------------------------------------------
# Switch to graph mode BEFORE seeding TensorFlow. While eager execution is
# still enabled, tf.compat.v1.set_random_seed stores the seed on the eager
# context rather than on the default graph, so the graph built later for the
# Session would be initialised with an unseeded RNG.
tf.compat.v1.disable_eager_execution()
np.random.seed(1)
tf.compat.v1.set_random_seed(1)
    

In [2]:

class DDPG:
    """Deep Deterministic Policy Gradient agent built on the TF1 graph API.

    The agent owns four networks (actor/critic, each with an ``eval`` and a
    ``target`` copy), a ring-buffer replay memory and a ``tf.compat.v1.Session``.

    Each replay row is laid out as ``[s, a, r, s_]`` with widths
    ``s_dim, a_dim, 1, s_dim``.

    Args:
        a_dim: number of action dimensions.
        s_dim: number of state (observation) dimensions.
        a_bound: scale applied to the actor's tanh output.
        learning_rate: Adam learning rate shared by actor and critic.
        discount_factor: gamma used in the TD target.
        memory_size: number of transitions the replay buffer holds.
        batch_size: number of transitions sampled per ``learn`` call.
        output_graph: when True, write the graph to ``logs/`` for TensorBoard.
    """

    def __init__(
            self,
            a_dim,
            s_dim,
            a_bound,
            learning_rate=0.02,
            discount_factor=0.9,
            memory_size=10000,
            batch_size=64,
            output_graph=False,
    ):
        self.n_actions = a_dim
        self.n_features = s_dim
        self.lr = learning_rate
        self.gamma = discount_factor
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.pointer = 0   # total number of transitions ever stored
        self.TAU = 0.01    # soft-update mixing rate for the target networks
        self.a_bound = a_bound
        self.memory_trans = np.zeros((self.memory_size, self.n_features * 2 + self.n_actions + 1), dtype=np.float32)

        # total learning step
        self.learn_step_counter = 0

        # Create sess
        self.sess = tf.compat.v1.Session()

        # define inputs
        self.s = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s')  # s
        self.s_ = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s_')  # s'
        self.r = tf.compat.v1.placeholder(tf.float32, [None, 1], name='r')  # Reward

        with tf.compat.v1.variable_scope('Actor'):
            self.a = self._build_actor(self.s, trainable=True, scope='eval')
            self.a_ = self._build_actor(self.s_, trainable=False, scope='target')

        with tf.compat.v1.variable_scope('Critic'):
            # The eval critic consumes the eval actor's output so that the actor
            # loss can back-propagate through it; `learn` overrides self.a with
            # the stored actions when training the critic.
            self.Av = self._build_critic(self.s, self.a, trainable=True, scope='eval')
            self.Av_ = self._build_critic(self.s_, self.a_, trainable=False, scope='target')

        # networks parameters
        self.ae_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement: t <- (1 - TAU) * t + TAU * e
        self.soft_replace = [tf.compat.v1.assign(t, (1 - self.TAU)*t + self.TAU*e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]

        with tf.compat.v1.variable_scope('Critic_loss'):
            self.TD_target = self.r + self.gamma*self.Av_
            self.TD_error = tf.compat.v1.losses.mean_squared_error(labels=self.TD_target, predictions=self.Av)

        with tf.compat.v1.variable_scope('Actor_loss'):
            # Maximise the critic's value of the actor's action.
            self.actor_error = - tf.reduce_mean(self.Av)

        with tf.compat.v1.variable_scope('train'):
            # minimize error
            self.critic_train = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(self.TD_error, var_list=self.ce_params)
            self.actor_train = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(self.actor_error, var_list=self.ae_params)

        # initial parameter
        self.sess.run(tf.compat.v1.global_variables_initializer())

        if output_graph:
            # $ tensorboard --logdir=logs
            # Use the compat.v1 writer like the rest of this class: the TF2
            # `tf.summary` namespace has no FileWriter. Closing flushes the
            # graph event to disk immediately.
            writer = tf.compat.v1.summary.FileWriter("logs/", self.sess.graph)
            writer.close()

    def store_transition(self, s, a, r, s_):
        """Write one ``(s, a, r, s_)`` transition into the ring buffer.

        Once the buffer is full the oldest rows are overwritten.
        """
        transition = np.hstack((s, a, [r], s_))
        # Wrap on this agent's own capacity, not on a notebook-level global.
        index = self.pointer % self.memory_size
        self.memory_trans[index, :] = transition
        self.pointer += 1

    def choose_action(self, s):
        """Return the eval actor's action for a single state vector ``s``."""
        return self.sess.run(self.a, {self.s: s[np.newaxis, :]})[0]

    def _build_actor(self, s, trainable, scope):
        """Build an actor: dense(20, relu) -> dense(a_dim, tanh) scaled by a_bound."""
        with tf.compat.v1.variable_scope(scope):
            f1 = tf.compat.v1.layers.dense(s, 20, activation=tf.nn.relu, name='f1', trainable=trainable)
            Av_raw = tf.compat.v1.layers.dense(f1, self.n_actions, activation=tf.nn.tanh, name='Av', trainable=trainable)
            return tf.multiply(Av_raw, self.a_bound, name='scaled_a')

    def _build_critic(self, s, a, trainable, scope):
        """Build a critic mapping ``(s, a)`` to one value per row.

        State and action are projected separately to 20 units, summed with a
        bias, passed through relu and reduced to a single output.
        """
        with tf.compat.v1.variable_scope(scope):
            n_l1 = 20
            w1_s = tf.compat.v1.get_variable('w1_s', [self.n_features, n_l1], trainable=trainable)
            w1_a = tf.compat.v1.get_variable('w1_a', [self.n_actions, n_l1], trainable=trainable)
            b1 = tf.compat.v1.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            Av = tf.compat.v1.layers.dense(net, 1, trainable=trainable)
            return Av

    def _sample_batch(self):
        """Sample ``batch_size`` stored transitions (with replacement).

        Only rows that have actually been written are eligible, so a partially
        filled buffer never yields its all-zero placeholder rows.

        Returns:
            Tuple ``(bs, ba, br, bs_)`` of 2-D arrays sliced from the sampled rows.
        """
        n_stored = min(self.pointer, self.memory_size)
        indices = np.random.choice(n_stored, size=self.batch_size)

        bt = self.memory_trans[indices, :]
        bs = bt[:, :self.n_features]
        ba = bt[:, self.n_features: self.n_features + self.n_actions]
        br = bt[:, -self.n_features - 1: -self.n_features]
        bs_ = bt[:, -self.n_features:]
        return bs, ba, br, bs_

    def learn(self):
        """Run one training step: soft target update, then actor and critic updates.

        Does nothing when no transition has been stored yet.
        """
        if self.pointer == 0:
            return

        self.sess.run(self.soft_replace)

        bs, ba, br, bs_ = self._sample_batch()

        self.sess.run(self.actor_train, feed_dict={self.s:  bs})
        # Feeding self.a replaces the actor's output with the stored actions,
        # so the critic is trained on the actions that were actually taken.
        self.sess.run(self.critic_train, feed_dict={self.s:  bs,
                                                    self.a:  ba,
                                                    self.r:  br,
                                                    self.s_: bs_})
        self.learn_step_counter += 1


In [3]:


# Build the environment; `.unwrapped` strips Gym's wrappers so the episode
# length is controlled solely by MAX_EP_STEPS below.
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
a_low = env.action_space.low

# Size the replay buffer from the same constant that gates learning below,
# instead of relying on the class default happening to match it.
ddpg = DDPG(a_dim, s_dim, a_bound, memory_size=MEMORY_CAPACITY)

var = 3  # control exploration
t1 = time.time()
try:
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            if RENDER:
                env.render()

            # Add exploration noise, then clip to the environment's own
            # action limits rather than a hard-coded [-2, 2].
            a = ddpg.choose_action(s)
            a = np.clip(np.random.normal(a, var), a_low, a_bound)
            # `done` is not used: every episode runs for exactly MAX_EP_STEPS steps.
            s_, r, done, info = env.step(a)

            # Rewards are scaled down by 10 before being stored for training.
            ddpg.store_transition(s, a, r / 10, s_)

            # Start learning (and decaying exploration) only once the replay
            # buffer has been filled.
            if ddpg.pointer > MEMORY_CAPACITY:
                var *= .9995    # decay the action randomness
                ddpg.learn()

            s = s_
            ep_reward += r

        print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
finally:
    # Release the render window / simulator resources even if training is
    # interrupted.
    env.close()
print('Running time: ', time.time() - t1)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode: 0  Reward: -1631 Explore: 3.00
Episode: 1  Reward: -1389 Explore: 3.00
Episode: 2  Reward: -1450 Explore: 3.00
Episode: 3  Reward: -1604 Explore: 3.00
Episode: 4  Reward: -1368 Explore: 3.00
Episode: 5  Reward: -1718 Explore: 3.00
Episode: 6  Reward: -1457 Explore: 3.00
Episode: 7  Reward: -1583 Explore: 3.00
Episode: 8  Reward: -1163 Explore: 3.00
Episode: 9  Reward: -1450 Explore: 3.00
Episode: 10  Reward: -1508 Explore: 3.00
Episode: 11  Reward: -1587 Explore: 3.00
Episode: 12  Reward: -1824 Explore: 3.00
Episode: 13  Reward: -1377 Explore: 3.00
Episode: 14  Reward: -1550 Explore: 3.00
Episode: 15  Reward: -1592 Explore: 3.00
Episode: 16  Reward: -1519 Explore: 3.00
Episode: 17  Reward: -1320 Explore: 3.00
Episode: 18  Reward: -1488 Explore: 3.00
Episode: 19  Reward:

Episode: 179  Reward: -128 Explore: 0.00
Episode: 180  Reward: -132 Explore: 0.00
Episode: 181  Reward: -279 Explore: 0.00
Episode: 182  Reward: -131 Explore: 0.00
Episode: 183  Reward: -126 Explore: 0.00
Episode: 184  Reward: -240 Explore: 0.00
Episode: 185  Reward: -138 Explore: 0.00
Episode: 186  Reward: -262 Explore: 0.00
Episode: 187  Reward: -504 Explore: 0.00
Episode: 188  Reward: -129 Explore: 0.00
Episode: 189  Reward: -120 Explore: 0.00
Episode: 190  Reward: -260 Explore: 0.00
Episode: 191  Reward: -270 Explore: 0.00
Episode: 192  Reward: -130 Explore: 0.00
Episode: 193  Reward: -3 Explore: 0.00
Episode: 194  Reward: -407 Explore: 0.00
Episode: 195  Reward: -290 Explore: 0.00
Episode: 196  Reward: -763 Explore: 0.00
Episode: 197  Reward: -390 Explore: 0.00
Episode: 198  Reward: -124 Explore: 0.00
Episode: 199  Reward: -128 Explore: 0.00
Running time:  314.37458968162537
