In [None]:
import gym
from gym import wrappers
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import copy
import os
from datetime import date
import logging
from collections import deque

In [4]:
class REPLAY_BUFFER(object):
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Entries are kept in a ``deque(maxlen=capacity)``, so once the buffer is
    full every new entry evicts the oldest one.

    Attributes:
        capacity: maximum number of stored entries.
        memory: the underlying deque of entries.
        batch_size: number of entries returned by ``sample_batch``.
        is_enough: becomes True once ``capacity`` entries have been stored
            and stays True until ``clear_memory`` is called.
    """

    def __init__(self, capacity, batch_size=64):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)
        self.batch_size = batch_size
        self.is_enough = False

    def add_data(self, data):
        """Append one entry and flag the buffer as ready once it is full."""
        self.memory.append(data)
        if len(self.memory) >= self.capacity:
            self.is_enough = True

    def sample_batch(self):
        """Return ``batch_size`` entries drawn uniformly, with replacement.

        Indices are drawn over the entries actually stored, so every stored
        entry (including the newest one) can be selected and a partially
        filled buffer can be sampled safely.

        Returns:
            ``np.ndarray`` built from the selected entries, one per row.

        Raises:
            ValueError: if the buffer is empty.
        """
        stored = len(self.memory)
        if stored == 0:
            raise ValueError('cannot sample from an empty replay buffer')
        # randint's upper bound is exclusive, so this covers indices 0..stored-1.
        batch_index = np.random.randint(0, stored, size=self.batch_size)
        return np.array([self.memory[int(i)] for i in batch_index])

    def clear_memory(self):
        """Drop every stored entry and reset the readiness flag."""
        self.memory.clear()
        self.is_enough = False

# critic net
class CRITIC_NET(tf.keras.Model):
    """Q-value network: concatenates two inputs and maps them to one scalar.

    The two inputs are joined along axis 1, passed through two 256-unit ReLU
    layers and a final linear layer with a single output unit.
    """

    def __init__(self):
        super().__init__()
        relu = tf.keras.activations.relu
        self.concat = tf.keras.layers.Concatenate(axis=1)
        self.dense1 = tf.keras.layers.Dense(256, activation=relu)
        self.dense2 = tf.keras.layers.Dense(256, activation=relu)
        self.dense3 = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        """Evaluate the network.

        Args:
            inputs: indexable pair; ``inputs[0]`` and ``inputs[1]`` are
                concatenated along axis 1 (the training code passes
                ``(states, actions)``).
            training: accepted for Keras compatibility; not used here.

        Returns:
            Output of the final single-unit dense layer.
        """
        features = self.concat([inputs[0], inputs[1]])
        for layer in (self.dense1, self.dense2, self.dense3):
            features = layer(features)
        return features

# actor net
class ACTOR_NET(tf.keras.Model):
    """Stochastic policy network producing tanh-squashed Gaussian actions.

    A two-layer ReLU MLP feeds two linear heads of width ``action_dims``:
    one is used as the mean and the other as the log standard deviation.
    """

    def __init__(self, action_dims, clip_min=-1, clip_max=1, epsilon=1e-16):
        """Create the layers.

        Args:
            action_dims: width of the mean and log-std heads.
            clip_min: stored on the instance; not applied anywhere in this class.
            clip_max: stored on the instance; not applied anywhere in this class.
            epsilon: small constant added inside the log of the tanh correction.
        """
        super().__init__()
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.action_dims = action_dims
        self.epsilon = epsilon

        # shared MLP trunk
        relu = tf.keras.activations.relu
        self.dense1 = tf.keras.layers.Dense(256, activation=relu)
        self.dense2 = tf.keras.layers.Dense(256, activation=relu)

        # output heads: mean and (log) standard deviation
        self.mean_dense = tf.keras.layers.Dense(self.action_dims)
        self.std_dense = tf.keras.layers.Dense(self.action_dims)

    def call(self, inputs, training=False):
        """Return the raw ``(mean_head, std_head)`` outputs for ``inputs``.

        ``training`` is accepted for Keras compatibility and is not used.
        """
        hidden = self.dense2(self.dense1(inputs))
        return self.mean_dense(hidden), self.std_dense(hidden)

    def eval(self, inputs):
        """Sample squashed actions and their log-probabilities.

        Returns:
            ``(actions, log_prob)`` where ``actions = tanh(z)`` for a
            reparameterized Gaussian sample ``z``, and ``log_prob`` is the
            Gaussian log-density of ``z`` minus ``log(1 - tanh(z)^2 + epsilon)``,
            summed over axis 1 with ``keepdims=True``.
        """
        mean, log_std = self.call(inputs)
        # Neither mean nor log_std is clipped here; clipping them (e.g. to
        # [self.clip_min, self.clip_max]) is an option if training is unstable.

        # reparameterization trick: z = mean + std * unit-normal noise
        std = tf.math.exp(log_std)
        gaussian = tfp.distributions.Normal(loc=mean, scale=std)
        noise = tf.random.normal(shape=(std.shape))
        pre_squash = mean + std * noise

        actions = tf.tanh(pre_squash)
        squash_term = tf.math.log(1 - tf.pow(actions, 2) + self.epsilon)
        per_dim_log_prob = gaussian.log_prob(pre_squash) - squash_term
        log_prob = tf.reduce_sum(per_dim_log_prob, 1, keepdims=True)
        return actions, log_prob



In [5]:
# Experiment setup: output directory, logger, hyper parameters, environment,
# networks, optimizer and the temperature variable used by the training cell.
game_name = 'Ant'
ROOT_PATH = os.getcwd()
d = date.today().strftime('%Y%m%d')
path = os.path.join(ROOT_PATH, f'SAC_{game_name}', d)
# exist_ok avoids the check-then-create race of isdir() + makedirs()
os.makedirs(path, exist_ok=True)

# set logger (writes to <path>/misc.log)
logger = logging.Logger(f'SAC_{game_name}')
fh = logging.FileHandler(filename=f'{path}/misc.log')
fh.setFormatter(logging.Formatter(f'[%(name)s] %(asctime)s: %(message)s'))
logger.addHandler(fh)
logger.info(f'logger has been created')

# set hyper parameters
GAMMA = 0.99 # discounting factor
TAU = 0.003 # soft update

# create replay buffer
replay_buffer = REPLAY_BUFFER(10000)

# create env of the game; a video is recorded for every 50th episode
env = gym.make(game_name)
env = gym.wrappers.RecordVideo(env=env, video_folder=path, episode_trigger=lambda x: x%50==0, name_prefix=f'SAC_{game_name}')

# you should change the items below to fit your game
# In this code, the parameters below are designed for Ant
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
upper_boundary = env.action_space.high[0]
lower_boundary = env.action_space.low[0]


def _build_critic_pair(state_dims, action_dims):
    """Return ``(critic, target)`` where the target starts as an exact copy.

    Keras creates layer weights lazily on the first call, so deep-copying a
    critic that has never been called would give the copy its own, different
    random initialization. Both networks are therefore built with a dummy
    batch first and the critic's weights are then copied into the target.
    """
    critic = CRITIC_NET()
    target = CRITIC_NET()
    dummy_inputs = (tf.zeros((1, state_dims)), tf.zeros((1, action_dims)))
    critic(dummy_inputs)
    target(dummy_inputs)
    target.set_weights(critic.get_weights())
    return critic, target


# create networks including critic networks and the actor network
actor_net = ACTOR_NET(action_dims=action_space, clip_min=-20, clip_max=20)
critic1_net, critic1_target_net = _build_critic_pair(state_space, action_space)
critic2_net, critic2_target_net = _build_critic_pair(state_space, action_space)

# create loss and opt
c_loss = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.Adam(learning_rate=3e-4)

# create temperature factor alpha (starts at 0) and the constant h0 = -action_space
alpha = tf.Variable(0.0, dtype=tf.dtypes.float32)
h0 = tf.constant(-action_space, dtype=tf.dtypes.float32)

  logger.warn(
  deprecation(
  deprecation(
  logger.warn(


In [7]:
# SAC training loop: interact with the environment, store transitions, and
# once the replay buffer reports it is full, update the critics, the actor,
# the temperature alpha and the target critics after every environment step.
logger.info('start to train')
replay_buffer.clear_memory()

N_EPISODES = 1000
MAX_STEPS_PER_EPISODE = 1000


def _soft_update(target_net, online_net, tau):
    """Move each target weight a fraction ``tau`` toward its online weight."""
    for t_ws, ws in zip(target_net.weights, online_net.weights):
        t_ws.assign((1 - tau) * t_ws + tau * ws)


# Start below any achievable return so the first logged episode is always
# saved, even when episode returns are negative.
best_reward = float('-inf')

try:
    for episode in range(N_EPISODES):
        # start a new game
        observation, info = env.reset(return_info=True)
        reward_list = []

        for step in range(MAX_STEPS_PER_EPISODE):
            observation = observation.reshape(1, state_space)
            action, _ = actor_net.eval(observation)
            action = tf.clip_by_value(t=action, clip_value_min=lower_boundary, clip_value_max=upper_boundary)
            # hand the environment a plain NumPy array rather than a tensor
            action = tf.squeeze(action).numpy()
            new_observation, reward, done, info = env.step(action)
            reward_list.append(reward)

            # store data into the replay buffer as one flat row:
            # [state | action | reward | done | next state]
            replay_buffer.add_data(np.hstack((np.ravel(observation), action, reward, float(done), np.ravel(new_observation))))

            # start training if there are enough data in replay buffer
            if replay_buffer.is_enough:
                # one explicit float32 cast instead of mixing float64 arrays with float32 tensors
                batch = tf.convert_to_tensor(replay_buffer.sample_batch().astype(np.float32))
                reward_col = state_space + action_space
                states = batch[:, :state_space]
                actions = batch[:, state_space:reward_col]
                rewards = batch[:, reward_col:reward_col + 1]
                dones = batch[:, reward_col + 1:reward_col + 2]
                new_states = batch[:, reward_col + 2:]

                # TD target shared by both critics; computed outside any tape
                # so it is a constant with respect to the critic updates
                next_actions, next_log_prob = actor_net.eval(new_states)
                q1_target_values = critic1_target_net((new_states, next_actions))
                q2_target_values = critic2_target_net((new_states, next_actions))
                v = tf.minimum(q1_target_values, q2_target_values) - alpha * next_log_prob
                y = rewards + GAMMA * (1 - dones) * v

                # calculate the gradients of the critic nets
                with tf.GradientTape() as tape:
                    c1_l = 0.5 * c_loss(y, critic1_net((states, actions)))
                c1_grads = tape.gradient(c1_l, critic1_net.trainable_weights)

                with tf.GradientTape() as tape:
                    c2_l = 0.5 * c_loss(y, critic2_net((states, actions)))
                c2_grads = tape.gradient(c2_l, critic2_net.trainable_weights)

                opt.apply_gradients(zip(c1_grads, critic1_net.trainable_weights))
                opt.apply_gradients(zip(c2_grads, critic2_net.trainable_weights))

                # calculate the gradient of the actor net
                with tf.GradientTape() as tape:
                    current_actions, log_prob = actor_net.eval(states)
                    current_q1_values = critic1_net((states, current_actions))
                    current_q2_values = critic2_net((states, current_actions))

                    min_current_q_values = tf.minimum(current_q1_values, current_q2_values)
                    soft_q_values = alpha * log_prob - min_current_q_values
                    a_l = tf.reduce_mean(soft_q_values)
                a_grads = tape.gradient(a_l, actor_net.trainable_weights)
                opt.apply_gradients(zip(a_grads, actor_net.trainable_weights))

                # calculate the gradient of alpha
                with tf.GradientTape() as tape:
                    _, log_prob = actor_net.eval(states)
                    alpha_l = -tf.reduce_mean(alpha * (log_prob + h0))
                alpha_grads = tape.gradient(alpha_l, [alpha])
                opt.apply_gradients(zip(alpha_grads, [alpha]))

                # soft-update the target critics toward the online critics
                _soft_update(critic1_target_net, critic1_net, TAU)
                _soft_update(critic2_target_net, critic2_net, TAU)

            observation = new_observation.copy()

            if done:
                break

        # report losses after every episode once training has started
        if replay_buffer.is_enough:
            episode_return = tf.reduce_sum(reward_list)
            logger.info(f'episode:{episode}, critic loss = {c1_l}, {c2_l}, actor loss = {a_l}, sum_r={episode_return}')
            if episode_return > best_reward:
                # save the weights which produce the best result
                best_reward = episode_return
                logger.warning(f'The weight of the best reward {best_reward} has been saved')
                named_nets = {
                    'critic1_net': critic1_net,
                    'critic1_target_net': critic1_target_net,
                    'critic2_net': critic2_net,
                    'critic2_target_net': critic2_target_net,
                    'actor_net': actor_net,
                }
                for wsn, ws in named_nets.items():
                    ws.save_weights(filepath=f'{path}/{wsn}')
finally:
    # close the environment exactly once, after training ends or is
    # interrupted, instead of closing and reusing it every episode
    env.close()

  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
