使用和DDQN类似的方法：即经验回放和双网络的方法来改进Actor-Critic难收敛的问题，这个算法就是是深度确定性策略梯度(Deep Deterministic Policy Gradient，以下简称DDPG)。

In [1]:
import tensorflow as tf
import numpy as np
import gym
import time

In [2]:
MAX_EPISODES = 2000
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

In [3]:
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound,):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]

        q_target = self.R + GAMMA * q_
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)

In [None]:
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for episode in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995  # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', episode, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var,)
            # if ep_reward > -300:RENDER = True
            break
    if episode % 100 == 0:
        total_reward = 0
        for i in range(10):
            state = env.reset()
            for j in range(MAX_EP_STEPS):
                render = lambda: plt.imshow(env.render(mode='rgb_array'))
                action = ddpg.choose_action(state)  # direct action for test
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward / 300
        print ('episode: ', episode, 'Evaluation Average Reward:', ave_reward)

print('Running time: ', time.time() - t1)

Episode: 0  Reward: -1663 Explore: 3.00
episode:  0 Evaluation Average Reward: -57.381147774677736
Episode: 1  Reward: -1755 Explore: 3.00
Episode: 2  Reward: -1756 Explore: 3.00
Episode: 3  Reward: -1681 Explore: 3.00
Episode: 4  Reward: -1568 Explore: 3.00
Episode: 5  Reward: -1818 Explore: 3.00
Episode: 6  Reward: -1734 Explore: 3.00
Episode: 7  Reward: -1542 Explore: 3.00
Episode: 8  Reward: -1605 Explore: 3.00
Episode: 9  Reward: -1634 Explore: 3.00
Episode: 10  Reward: -1725 Explore: 3.00
Episode: 11  Reward: -1649 Explore: 3.00
Episode: 12  Reward: -1277 Explore: 3.00
Episode: 13  Reward: -1684 Explore: 3.00
Episode: 14  Reward: -1605 Explore: 3.00
Episode: 15  Reward: -1645 Explore: 3.00
Episode: 16  Reward: -1640 Explore: 3.00
Episode: 17  Reward: -1501 Explore: 3.00
Episode: 18  Reward: -1474 Explore: 3.00
Episode: 19  Reward: -1478 Explore: 3.00
Episode: 20  Reward: -1389 Explore: 3.00
Episode: 21  Reward: -1686 Explore: 3.00
Episode: 22  Reward: -1153 Explore: 3.00
Episode:

Episode: 199  Reward: -331 Explore: 0.00
Episode: 200  Reward: -418 Explore: 0.00
episode:  200 Evaluation Average Reward: -3.2835222344063286
Episode: 201  Reward: -383 Explore: 0.00
Episode: 202  Reward: -254 Explore: 0.00
Episode: 203  Reward: -138 Explore: 0.00
Episode: 204  Reward: -261 Explore: 0.00
Episode: 205  Reward: -135 Explore: 0.00
Episode: 206  Reward: -383 Explore: 0.00
Episode: 207  Reward: -8 Explore: 0.00
Episode: 208  Reward: -406 Explore: 0.00
Episode: 209  Reward: -133 Explore: 0.00
Episode: 210  Reward: -137 Explore: 0.00
Episode: 211  Reward: -524 Explore: 0.00
Episode: 212  Reward: -879 Explore: 0.00
Episode: 213  Reward: -4 Explore: 0.00
Episode: 214  Reward: -9 Explore: 0.00
Episode: 215  Reward: -1518 Explore: 0.00
Episode: 216  Reward: -1436 Explore: 0.00
Episode: 217  Reward: -376 Explore: 0.00
Episode: 218  Reward: -5 Explore: 0.00
Episode: 219  Reward: -525 Explore: 0.00
Episode: 220  Reward: -1031 Explore: 0.00
Episode: 221  Reward: -528 Explore: 0.00
E

Episode: 397  Reward: -138 Explore: 0.00
Episode: 398  Reward: -275 Explore: 0.00
Episode: 399  Reward: -274 Explore: 0.00
Episode: 400  Reward: -1051 Explore: 0.00
episode:  400 Evaluation Average Reward: -10.172370187723603
Episode: 401  Reward: -1110 Explore: 0.00
Episode: 402  Reward: -133 Explore: 0.00
Episode: 403  Reward: -138 Explore: 0.00
Episode: 404  Reward: -407 Explore: 0.00
Episode: 405  Reward: -410 Explore: 0.00
Episode: 406  Reward: -7 Explore: 0.00
Episode: 407  Reward: -6 Explore: 0.00
Episode: 408  Reward: -132 Explore: 0.00
Episode: 409  Reward: -1492 Explore: 0.00
Episode: 410  Reward: -271 Explore: 0.00
Episode: 411  Reward: -284 Explore: 0.00
Episode: 412  Reward: -274 Explore: 0.00
Episode: 413  Reward: -406 Explore: 0.00
Episode: 414  Reward: -936 Explore: 0.00
Episode: 415  Reward: -945 Explore: 0.00
Episode: 416  Reward: -917 Explore: 0.00
Episode: 417  Reward: -971 Explore: 0.00
Episode: 418  Reward: -923 Explore: 0.00
Episode: 419  Reward: -807 Explore: 0.