In [None]:
import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque

In [None]:
# Soft-update rate: on every target update each target variable becomes
# (1 - TAU) * target + TAU * online.
TAU = 0.01
# Discount factor multiplying the target critic's next-state value in the TD target.
GAMMA = 0.9
# Number of environment steps taken per episode by the training loop.
MAX_EP_STEPS = 200

In [None]:
# Create the pendulum task and keep its `unwrapped` environment object.
env = gym.make('Pendulum-v0').unwrapped

# Show each space followed by its upper and lower bounds.
for space in (env.action_space, env.observation_space):
    print(space)
    print(space.high)
    print(space.low)

# Sizes and the action limit that the network-building cells rely on.
state_space_dim = env.observation_space.shape[0]
action_space_dim = env.action_space.shape[0]
action_space_bound = env.action_space.high

In [None]:
# Graph inputs for one batch of transitions; the leading (batch) dimension is left open.
state_inputs = tf.placeholder(
    dtype=tf.float32, shape=(None, state_space_dim), name='states')
reward_inputs = tf.placeholder(
    dtype=tf.float32, shape=(None,), name='rewards')
next_state_inputs = tf.placeholder(
    dtype=tf.float32, shape=(None, state_space_dim), name='next_states')

In [None]:
# Online policy: state -> 32 ReLU units -> tanh output, rescaled by the action bound.
with tf.variable_scope('actor'):
    hidden = tf.layers.dense(inputs=state_inputs, units=32, activation=tf.nn.relu)
    unscaled_actions = tf.layers.dense(
        inputs=hidden, units=action_space_dim, activation=tf.nn.tanh)
    actions = action_space_bound * unscaled_actions

# Target policy: same architecture, fed with next states and excluded from training.
with tf.variable_scope('target_actor'):
    hidden = tf.layers.dense(
        inputs=next_state_inputs, units=32, activation=tf.nn.relu, trainable=False)
    unscaled_actions = tf.layers.dense(
        inputs=hidden, units=action_space_dim, activation=tf.nn.tanh, trainable=False)
    target_actions = action_space_bound * unscaled_actions

actor_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor')
actor_target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')

# One assign op per variable pair: target <- (1 - TAU) * target + TAU * online.
update_actor_target_op = []
for target_var, online_var in zip(actor_target_params, actor_params):
    blended = (1 - TAU) * target_var + TAU * online_var
    update_actor_target_op.append(tf.assign(target_var, blended))

In [None]:
# Online critic: [state, actor action] -> 32 ReLU units -> one linear Q-value per sample.
with tf.variable_scope('critic'):
    critic_inputs = tf.concat([state_inputs, actions], axis=-1)
    hidden = tf.layers.dense(inputs=critic_inputs, units=32, activation=tf.nn.relu)
    q_out = tf.layers.dense(inputs=hidden, units=1, activation=None)
    # Drop the trailing size-1 axis so q_values has one entry per batch row.
    q_values = tf.squeeze(q_out, axis=-1)

# Target critic: same architecture on [next state, target action], excluded from training.
with tf.variable_scope('target_critic'):
    target_critic_inputs = tf.concat([next_state_inputs, target_actions], axis=-1)
    hidden = tf.layers.dense(
        inputs=target_critic_inputs, units=32, activation=tf.nn.relu, trainable=False)
    target_q_out = tf.layers.dense(inputs=hidden, units=1, activation=None, trainable=False)
    target_q_values = tf.squeeze(target_q_out, axis=-1)

critic_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
critic_target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')

# One assign op per variable pair: target <- (1 - TAU) * target + TAU * online.
update_critic_target_op = []
for target_var, online_var in zip(critic_target_params, critic_params):
    blended = (1 - TAU) * target_var + TAU * online_var
    update_critic_target_op.append(tf.assign(target_var, blended))

In [None]:
with tf.variable_scope('optimization'):
    # Critic: mean squared TD error between the bootstrapped target
    # (reward + GAMMA * target critic value) and the online critic's q_values.
    td_targets = reward_inputs + GAMMA * target_q_values
    critic_loss = tf.reduce_mean(tf.squared_difference(td_targets, q_values))
    # q_values is computed from the actor's `actions`, so without an explicit
    # var_list the optimizer would also move the actor's weights to reduce the
    # TD error. Only the critic's own variables may be updated by this op.
    critic_train_op = tf.train.AdamOptimizer(0.001).minimize(
        critic_loss, var_list=critic_params)

    # Actor: deterministic policy gradient. First dQ/da, then chain it through
    # the actor (grad_ys) to get dQ/d(actor_params).
    action_grads = tf.gradients(q_values, actions)
    actor_grads = tf.gradients(ys=actions, xs=actor_params, grad_ys=action_grads)
    # The negative learning rate turns Adam's descent step into ascent on Q.
    actor_train_op = tf.train.AdamOptimizer(-0.001).apply_gradients(zip(actor_grads, actor_params))

In [None]:
def choose_action(state, noise_scale):
    """Return an exploratory action for a single state.

    The actor's output for `state` is used as the mean of a normal
    distribution with standard deviation `noise_scale`; the sample is then
    clipped to [-action_space_bound, action_space_bound].

    Args:
        state: one observation (it is wrapped into a batch of size one).
        noise_scale: standard deviation of the exploration noise.

    Returns:
        The clipped, noise-perturbed action for that state.
    """
    policy_action = sess.run(actions, feed_dict={state_inputs: [state]})[0]
    noisy_action = np.random.normal(loc=policy_action, scale=noise_scale)
    return np.clip(noisy_action, -action_space_bound, action_space_bound)

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# The initializer gives the target networks their own random weights. DDPG
# starts each target network as an exact copy of its online network, so do a
# one-time hard copy here; afterwards only the TAU soft updates move them.
sess.run([
    tf.assign(target_var, online_var)
    for target_var, online_var in zip(
        actor_target_params + critic_target_params,
        actor_params + critic_params)
])

In [None]:
BATCH_SIZE = 32
replay_buffer = deque(maxlen=10000)
noise_scale = 3.0

total_rewards = []
for episode in range(201):
    total_rewards.append(0.0)
    state = env.reset()
    # Episodes are cut after MAX_EP_STEPS steps; the `done` flag is not consulted.
    for _ in range(MAX_EP_STEPS):
        act = choose_action(state, noise_scale)
        next_state, r, done, info = env.step(act)
        # Store the action that was actually executed: the critic has to be
        # regressed on Q(s, a) for that action, not on the actor's current output.
        replay_buffer.append({'s': state, 'a': act, 'r': r, 'next_s': next_state})
        state = next_state
        total_rewards[-1] += r

        if len(replay_buffer) > BATCH_SIZE:
            noise_scale *= .9999
            mini_batch = random.sample(replay_buffer, BATCH_SIZE)
            fd = {
                state_inputs: [rb['s'] for rb in mini_batch],
                reward_inputs: [rb['r'] for rb in mini_batch],
                next_state_inputs: [rb['next_s'] for rb in mini_batch],
            }
            # Critic step: override the `actions` tensor with the stored
            # (noisy) actions so the reward is matched with the action that
            # earned it.
            critic_fd = dict(fd)
            critic_fd[actions] = [rb['a'] for rb in mini_batch]
            sess.run(critic_train_op, critic_fd)
            sess.run(update_critic_target_op, fd)
            # Actor step: `actions` must come from the actor itself so the
            # policy gradient flows through its weights, hence the plain feed.
            sess.run(actor_train_op, fd)
            sess.run(update_actor_target_op, fd)

    if episode % 10 == 0:
        print("Episode: {} | Mean Reward of 10 turns is: {:.2f} | noise_scale is: {:.5f}".format(
            episode,
            np.mean(total_rewards[-10:]),
            noise_scale
        ))