In [1]:
from collections import deque
import gym
import numpy as np
import os
import pickle
import random
import tensorflow as tf
from tensorflow.keras import Input
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *

tf.compat.v1.disable_eager_execution() # Disable eager execution (run TensorFlow in graph mode)

class DDPGTrainer():
    """DDPG-style actor-critic agent for a one-dimensional continuous action.

    Holds an online actor/critic pair, a target actor/critic pair that tracks
    the online networks through soft updates, a bounded replay buffer and an
    exploration scale (`epsilon`) that decays over time.

    Args:
        n_features: Size of the state vector fed to both networks.
        n_actions: Despite its name this is used as the action *bound*: the
            actor output is `tanh(...) * n_actions` and chosen actions are
            clipped to `[-n_actions, n_actions]`. The action dimension itself
            is fixed to 1 in `build_model`.
        sample_size: Number of transitions drawn from the replay buffer for
            each call to `update_model`.
        tau: Fraction of the *target* weights retained on each soft update
            (`target = tau * target + (1 - tau) * online`).
        gamma: Discount factor used in the TD target.
        epsilon: Initial scale of the exploration noise.
        epsilon_decay: Multiplicative decay applied by `update_epsilon`.
        epsilon_min: Lower bound for `epsilon`.
        a_lr: Learning rate of the actor optimizer.
        c_lr: Learning rate of the critic optimizer.
    """

    def __init__(
        self, 
        n_features, 
        n_actions, 
        sample_size=128, 
        tau=0.99, 
        gamma=0.95, 
        epsilon=1.0, 
        epsilon_decay=0.995, 
        epsilon_min=0.01, 
        a_lr=0.0001, 
        c_lr=0.001
    ):
        self.tau = tau
        self.memory_buffer = deque(maxlen=4000)
        self.sample_size = sample_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.n_features = n_features
        self.n_actions = n_actions

        # Checkpoint plumbing is created lazily by `_get_ckpt_manager`, so that
        # `save()` works even when `load()` has never been called.
        self._ckpt = None
        self._ckpt_dir = None
        self.ckpt_manager = None

        self.actor, self.critic = self.build_model()
        self.target_actor, self.target_critic = self.build_model()
        # Target networks start as exact copies of the online networks.
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def build_model(self):
        """Build and compile one (actor, critic) pair of Keras models.

        The actor maps a state to a single action in
        `[-n_actions, n_actions]`; the critic maps `[state, action]` to a
        scalar Q-value. The actor is trained through an `add_loss` term equal
        to `-critic([state, actor(state)])`, the critic with an MSE loss.

        Returns:
            Tuple `(actor, critic)` of compiled `tf.keras` models.
        """
        s_input = Input([self.n_features])
        a_input = Input([1])

        # actor
        x = Dense(units=40, activation='relu')(s_input)
        x = Dense(units=40, activation='relu')(x)
        x = Dense(units=1, activation='tanh')(x)
        action = Lambda(lambda x: x * self.n_actions)(x)
        actor = tf.keras.models.Model(inputs=s_input, outputs=action)

        # critic
        x = K.concatenate([s_input, a_input], axis=-1)
        x = Dense(40, activation='relu')(x)
        x = Dense(40, activation='relu')(x)
        q_a_value = Dense(1, activation='linear')(x)
        critic = tf.keras.models.Model(inputs=[s_input, a_input], outputs=q_a_value)

        # Maximise Q(s, actor(s)); note the minus sign.
        actor.add_loss(-critic([s_input, action]))

        # The statement order below is deliberate and must not be rearranged:
        # the critic is frozen while the actor is compiled and vice versa.
        critic.trainable = False
        actor.compile(optimizer=tf.keras.optimizers.Adam(self.a_lr))
        critic.trainable = True

        actor.trainable = False
        # The actor's graph uses the critic, so changing actor.trainable also
        # changes critic.trainable; re-enable it before compiling the critic.
        critic.trainable = True
        critic.compile(optimizer=tf.keras.optimizers.Adam(self.c_lr), loss='mse')
        actor.trainable = True
        return actor, critic

    def OU(self, x, mu=0, theta=0.15, sigma=0.2):
        """Return one mean-reverting noise sample for the value `x`.

        Computes `theta * (mu - x) + sigma * N(0, 1)`. It is evaluated on the
        current action on every call; no noise state is kept between calls.

        Returns:
            numpy array of shape [1].
        """
        return theta * (mu - x) + sigma * np.random.randn(1) # shape: [1]

    def choose_action(self, state):
        """Pick an exploratory action for a single state.

        Args:
            state: Batch containing one state (the first prediction row is
                used), e.g. shape [1, n_features].

        Returns:
            numpy array of shape [1], clipped to `[-n_actions, n_actions]`.
        """
        action = self.actor.predict(state)[0][0] # shape: []
        noise = max(self.epsilon, 0) * self.OU(action)
        action = np.clip(action + noise, -self.n_actions, self.n_actions) # shape: [1]
        return action

    def store(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are dropped)."""
        sample = (state, action, reward, next_state, done)
        self.memory_buffer.append(sample)

    def update_epsilon(self):
        """Decay `epsilon` by `epsilon_decay` without going below `epsilon_min`."""
        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_model(self):
        """Run one training step of the critic and the actor on a random batch.

        Raises:
            ValueError: If the replay buffer holds fewer than `sample_size`
                transitions (raised by `random.sample`).
        """
        samples = random.sample(self.memory_buffer, self.sample_size)
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*samples)
        )
        # Flatten so rewards delivered as shape-[1] arrays still give y of shape [sample_size].
        rewards = rewards.reshape(-1)
        # Cast to bool before inverting: `~` on integer 0/1 flags would yield -1/-2.
        not_done = ~dones.astype(bool).reshape(-1)

        next_actions = self.target_actor.predict(next_states)
        q_a_next = self.target_critic.predict([next_states, next_actions]) # shape: [sample_size, 1]
        # TD target; the bootstrap term is dropped for terminal transitions.
        y = rewards + self.gamma * q_a_next[:, 0] * not_done # shape: [sample_size]
        self.critic.fit([states, actions], y[:, None], verbose=0)
        # The actor has no explicit target: it minimises the add_loss term from build_model.
        self.actor.fit(states, verbose=0)

    @staticmethod
    def _soft_update(online, target, tau):
        """Blend `online` weights into `target`: target = tau * target + (1 - tau) * online."""
        blended = [
            target_w * tau + (1 - tau) * online_w
            for target_w, online_w in zip(target.get_weights(), online.get_weights())
        ]
        target.set_weights(blended)

    def update_target_model(self):
        """Soft-update both target networks towards their online counterparts."""
        self._soft_update(self.actor, self.target_actor, self.tau)
        self._soft_update(self.critic, self.target_critic, self.tau)

    def _get_ckpt_manager(self, checkpoint_path):
        """Return a CheckpointManager bound to `checkpoint_path`, creating it on demand."""
        if self.ckpt_manager is None or self._ckpt_dir != checkpoint_path:
            self._ckpt = tf.train.Checkpoint(
                actor=self.actor,
                critic=self.critic,
                target_actor=self.target_actor,
                target_critic=self.target_critic,
                actor_optimizer=self.actor.optimizer,
                critic_optimizer=self.critic.optimizer,
            )
            self.ckpt_manager = tf.train.CheckpointManager(self._ckpt, checkpoint_path, max_to_keep=1)
            self._ckpt_dir = checkpoint_path
        return self.ckpt_manager

    def save(self, checkpoint_path='pendulum'):
        """Save network/optimizer weights and the current epsilon under `checkpoint_path`.

        Can be called without a prior `load()`. With eager execution disabled a
        default TensorFlow session must be active when this is called.
        """
        self._get_ckpt_manager(checkpoint_path).save()
        # Make sure the directory exists before writing the epsilon side-file.
        os.makedirs(checkpoint_path, exist_ok=True)
        with open(os.path.join(checkpoint_path, 'epsilon.pkl'), 'wb') as f:
            pickle.dump(self.epsilon, f)

    def load(self, checkpoint_path='pendulum'):
        """Restore epsilon and the latest checkpoint from `checkpoint_path`, if present.

        Missing files are not an error: the agent simply keeps its current
        (freshly initialised) state.
        """
        manager = self._get_ckpt_manager(checkpoint_path)

        epsilon_file = os.path.join(checkpoint_path, 'epsilon.pkl')
        if os.path.exists(epsilon_file):
            # SECURITY: pickle.load can execute arbitrary code; only load
            # checkpoint directories that you created yourself.
            with open(epsilon_file, 'rb') as f:
                self.epsilon = pickle.load(f)

        if manager.latest_checkpoint:
            status = self._ckpt.restore(manager.latest_checkpoint)
            status.run_restore_ops() # needed to actually run the restore with eager execution disabled

In [2]:
session = tf.compat.v1.InteractiveSession() # With eager execution disabled, ckpt_manager.save() needs a default session

In [5]:
# Training budget: number of episodes and the step cap per episode.
N_EPISODES = 100
MAX_STEPS_PER_EPISODE = 200

env = gym.make('Pendulum-v0')
# The second argument is the action bound (largest valid torque), not an action count.
model = DDPGTrainer(env.observation_space.shape[0], env.action_space.high[0])
model.load()  # resume from the latest checkpoint when one exists
try:
    for episode in range(N_EPISODES):
        next_state = env.reset()
        reward_sum = 0
        for step in range(MAX_STEPS_PER_EPISODE):
            env.render()
            state = next_state
            action = model.choose_action(state[None])  # add a batch axis for predict()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            model.store(state, action, reward, next_state, done)

            # Start learning once the buffer holds more than one batch.
            if len(model.memory_buffer) > model.sample_size:
                model.update_model()
                model.update_target_model()
                model.update_epsilon()

            # Stepping an environment after it reported `done` is undefined
            # in Gym; end the episode and let the outer loop reset it.
            if done:
                break
        print(f'episode{episode} total reward: {reward_sum}')
    model.save()
finally:
    # Always release the render window / environment resources.
    env.close()

episode0 total reward: -1132.2314667781802
episode1 total reward: -1653.4862889361395
episode2 total reward: -1600.8072680930939
episode3 total reward: -1422.7313416379466
episode4 total reward: -1273.8603840801647
episode5 total reward: -1328.8167627819191
episode6 total reward: -1368.0923254901684
episode7 total reward: -1214.2838980258011
episode8 total reward: -1195.7382097476486
episode9 total reward: -1219.164374071276
episode10 total reward: -1119.6288206332135
episode11 total reward: -1165.3573075244553
episode12 total reward: -1265.6987197976694
episode13 total reward: -565.3520756898341
episode14 total reward: -1224.7826103277896
episode15 total reward: -1041.4816089924786
episode16 total reward: -988.6489471709601
episode17 total reward: -595.6588166046691
episode18 total reward: -987.7924239019839
episode19 total reward: -787.6139483971895
episode20 total reward: -765.023713172175
episode21 total reward: -738.7464172030428
episode22 total reward: -760.0190769642367
episode2

In [6]:
# Close the InteractiveSession opened earlier in the notebook.
session.close()