In [None]:
import tensorflow as tf
import numpy as np
import gym

In [None]:
# Build CartPole-v1 and keep the object exposed by `.unwrapped` rather than the
# value returned by gym.make itself.
env = gym.make('CartPole-v1').unwrapped

# Echo the action space, the observation space and the observation bounds
# (upper first, then lower), one per line.
for space_info in (env.action_space,
                   env.observation_space,
                   env.observation_space.high,
                   env.observation_space.low):
    print(space_info)

# Sizes used further down to shape the state placeholder and the policy output layer.
action_space_dim = env.action_space.n
state_space_dim = env.observation_space.shape[0]

In [None]:
# Hyper-parameters for the training run.
P_LR = 1e-4        # learning rate handed to the policy optimizer
C_LR = 1e-4        # learning rate handed to the critic optimizer
BATCH_SIZE = 64    # environment steps collected before each round of updates
UPDATE_TIMES = 15  # optimizer steps applied to each collected batch
EPSILON = 0.2      # the probability ratio is clipped to [1 - EPSILON, 1 + EPSILON]

In [None]:
# Graph feed points. Every placeholder has a leading `None` (variable-length) axis,
# so one graph serves both single-state queries and whole-batch updates.
state_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None, state_space_dim], name='state_inputs')
# Regression targets for the critic.
value_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None], name='value_inputs')
# Per-step advantages that weight the policy objective.
advantage_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None], name='advantage_inputs')
# Integer index of the action taken at each step.
action_inputs = tf.placeholder(
    dtype=tf.int32, shape=[None], name='action_inputs')

In [None]:
# --- Critic: maps a state to a scalar value estimate, regressed onto `value_inputs`.
with tf.variable_scope('critic'):
    hidden1 = tf.layers.dense(state_inputs, 32, tf.nn.relu)
    hidden2 = tf.layers.dense(hidden1, 32, tf.nn.relu)
    # Squeeze the trailing size-1 axis so `values` is rank 1, like `value_inputs`.
    values = tf.squeeze(tf.layers.dense(hidden2, 1), axis=-1)
    value_loss = tf.reduce_mean(tf.squared_difference(value_inputs, values))
    # Restrict the optimizer to the critic's own weights explicitly instead of
    # relying on "whatever is trainable at this point in the cell".
    critic_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic/')
    critic_train_op = tf.train.AdamOptimizer(C_LR).minimize(value_loss, var_list=critic_params)

# --- Policy: maps a state to action logits; this is the network being trained.
with tf.variable_scope('policy'):
    hidden1 = tf.layers.dense(state_inputs, 32, tf.nn.relu)
    hidden2 = tf.layers.dense(hidden1, 32, tf.nn.relu)
    action_logits = tf.layers.dense(hidden2, action_space_dim)
    action_probs = tf.nn.softmax(action_logits)
    # Draw one action per input row from the categorical distribution given by the logits.
    sampled_action = tf.squeeze(tf.multinomial(action_logits, 1), axis=-1)

# --- Old policy: same architecture, non-trainable. It only ever changes through
# `update_old_policy_op`, and provides the denominator of the probability ratio.
with tf.variable_scope('old_policy'):
    hidden1 = tf.layers.dense(state_inputs, 32, tf.nn.relu, trainable=False)
    hidden2 = tf.layers.dense(hidden1, 32, tf.nn.relu, trainable=False)
    old_action_logits = tf.layers.dense(hidden2, action_space_dim, trainable=False)
    old_action_probs = tf.nn.softmax(old_action_logits)

# The policy weights are gathered from TRAINABLE_VARIABLES. GLOBAL_VARIABLES under the
# 'policy' prefix would also pick up the Adam slot variables, which are named after the
# weight they belong to (e.g. 'policy/dense/kernel/Adam'); the sync below would then be
# correct only through creation order and zip() truncation.
policy_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='policy/')
# The old-policy layers are created with trainable=False, so they are only listed in
# GLOBAL_VARIABLES.
old_policy_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='old_policy/')
assert len(policy_params) == len(old_policy_params), (
    "policy and old_policy must hold the same number of variables "
    "({} vs {})".format(len(policy_params), len(old_policy_params)))

# Probability of the action actually taken under the current and the old policy;
# the one-hot mask selects the matching column of each probability row.
action_one_hot = tf.one_hot(action_inputs, action_space_dim)
ratio = tf.reduce_sum(action_one_hot * action_probs, axis=-1) \
        / tf.reduce_sum(action_one_hot * old_action_probs, axis=-1)
# Clipped surrogate objective: take the smaller of the unclipped and the clipped
# ratio-weighted advantage, averaged over the batch and negated for minimization.
policy_loss = -tf.reduce_mean(
    tf.minimum(ratio * advantage_inputs,
               tf.clip_by_value(ratio, 1.0-EPSILON, 1.0+EPSILON) * advantage_inputs)
)
policy_train_op = tf.train.AdamOptimizer(P_LR).minimize(policy_loss, var_list=policy_params)

# Copy every current policy weight into its old-policy counterpart. Both lists are in
# creation order and both networks are built layer by layer in the same order.
update_old_policy_op = [old_p.assign(p) for old_p, p in zip(old_policy_params, policy_params)]

def returns_advantages(replay_buffer, next_value, gamma=0.99):
    """Compute discounted returns and advantages for one rollout.

    Args:
        replay_buffer: sequence of transition dicts in time order. Each dict is read
            for 'r' (reward), 'v' (value estimate stored for that step) and 'done'
            (truthy when the step ended an episode).
        next_value: bootstrap value for the state that follows the last transition.
        gamma: discount factor.

    Returns:
        A tuple ``(returns, advantages)`` of float64 arrays with one entry per
        transition, where
        ``returns[t] = r[t] + gamma * returns[t + 1] * (1 - done[t])`` with
        ``returns[len(replay_buffer)] = next_value``, and
        ``advantages = returns - v``.
    """
    n_steps = len(replay_buffer)
    values = np.array([rb['v'] for rb in replay_buffer], dtype=np.float64)

    # Always accumulate in float64. Deriving the dtype from the rewards would make
    # integer rewards plus an integer bootstrap value produce an integer buffer,
    # and every discounted return would be truncated on assignment.
    returns = np.zeros(n_steps + 1, dtype=np.float64)
    returns[-1] = next_value

    for t in reversed(range(n_steps)):
        step = replay_buffer[t]
        # A finished episode contributes nothing beyond its own reward.
        not_done = 0.0 if step['done'] else 1.0
        returns[t] = step['r'] + gamma * returns[t + 1] * not_done

    returns = returns[:-1]  # drop the bootstrap slot
    advantages = returns - values
    return returns, advantages

In [None]:
# Build the initializer op, open a session, and give every global variable
# its initial value.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

In [None]:
# Training loop. Each outer iteration (named `episode`, although it is really one
# fixed-size rollout that may span several environment episodes) collects BATCH_SIZE
# steps with the current policy and then updates both networks on that batch.
state = env.reset()
# One running reward total per environment episode; the LAST entry always belongs to
# the episode that is still in progress.
total_rewards = [0.0]
for episode in range(2001):
    replay_buffer = []
    for _ in range(BATCH_SIZE):
        # Fetch the sampled action and the critic's value estimate in a single
        # session call instead of two separate forward passes over the same state.
        act_batch, v_batch = sess.run([sampled_action, values], {state_inputs: [state]})
        act, v = act_batch[0], v_batch[0]

        # The fourth element returned by env.step is unused; it gets its own name so
        # it does not overwrite the loop variable `_`.
        next_state, r, done, _info = env.step(act)
        replay_buffer.append({'s': state, 'v': v, 'a': act, 'r': r, 'done': done})
        state = next_state
        total_rewards[-1] += r
        if done:
            state = env.reset()
            total_rewards.append(0.0)

    # Bootstrap value for the tail of the rollout. If the final step ended an episode,
    # returns_advantages multiplies this by (1 - done) and it has no effect.
    next_value = sess.run(values, {state_inputs: [next_state]})[0]
    returns, advs = returns_advantages(replay_buffer, next_value)

    fd = {
        state_inputs: [rb['s'] for rb in replay_buffer],
        value_inputs: returns,
        advantage_inputs: advs,
        action_inputs: [rb['a'] for rb in replay_buffer],
    }
    # Freeze the data-collecting policy as the reference before taking gradient steps.
    sess.run(update_old_policy_op)
    # The policy and critic train ops update disjoint sets of variables and are fed
    # fixed targets, so stepping them together is equivalent to two separate loops
    # and halves the number of session calls.
    for _ in range(UPDATE_TIMES):
        sess.run([policy_train_op, critic_train_op], fd)

    if episode % 100 == 0:
        # Average finished episodes only: total_rewards[-1] is still being played and
        # would drag the mean down. Fall back to it when nothing has finished yet.
        finished_rewards = total_rewards[:-1]
        recent_rewards = finished_rewards[-100:] if finished_rewards else total_rewards
        print("Episode: {} | Mean Reward of 100 turns is: {:.2f}".format(
            episode, np.mean(recent_rewards)))