In [None]:
import tensorflow as tf
import numpy as np
import gym

In [None]:
# Create the CartPole environment and work with its `unwrapped` form
# (presumably to bypass the wrappers gym.make adds -- confirm the intent).
env = gym.make('CartPole-v1').unwrapped

# Show the action space, the observation space and the observation bounds.
for space_info in (
    env.action_space,
    env.observation_space,
    env.observation_space.high,
    env.observation_space.low,
):
    print(space_info)

# Sizes that drive the network's input and output layers.
state_space_dim = env.observation_space.shape[0]
action_space_dim = env.action_space.n

# Graph inputs. The leading `None` dimension lets any batch size be fed.
#   state_inputs     -- observations, one row of `state_space_dim` floats each
#   value_inputs     -- regression targets for the value head
#   advantage_inputs -- per-sample weights applied to the policy loss
#   action_inputs    -- integer indices of the actions that were taken
state_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None, state_space_dim], name='state_inputs')
value_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None], name='value_inputs')
advantage_inputs = tf.placeholder(
    dtype=tf.float32, shape=[None], name='advantage_inputs')
action_inputs = tf.placeholder(
    dtype=tf.int32, shape=[None], name='action_inputs')

with tf.variable_scope('actor_critic'):
    # Two ReLU layers shared by the policy head and the value head.
    hidden1 = tf.layers.dense(inputs=state_inputs, units=32, activation=tf.nn.relu)
    hidden2 = tf.layers.dense(inputs=hidden1, units=32, activation=tf.nn.relu)

    # Policy head: one unnormalised score per action, plus its softmax.
    action_logits = tf.layers.dense(inputs=hidden2, units=action_space_dim)
    action_probs = tf.nn.softmax(action_logits)

    # Value head: a single unit per sample, squeezed to a vector of shape [batch].
    value_head = tf.layers.dense(inputs=hidden2, units=1)
    values = tf.squeeze(value_head, axis=-1)

# Loss weights and optimiser step size (values unchanged, now named).
ENTROPY_COEF = 0.0001
VALUE_COEF = 0.5
LEARNING_RATE = 0.001

with tf.variable_scope('loss'):
    # Critic: mean squared error between the fed targets and the predicted values.
    value_loss = tf.reduce_mean(tf.squared_difference(value_inputs, values))

    # Log-probabilities are computed straight from the logits. Taking
    # tf.log(softmax(...)) yields -inf once a probability underflows to 0,
    # and 0 * -inf then turns the cross-entropy / entropy terms (and every
    # gradient) into NaN. log_softmax stays finite in that situation.
    log_probs = tf.nn.log_softmax(action_logits)

    # Actor: negative log-probability of the taken action, weighted by the
    # fed advantage (fed through a placeholder, so no gradient flows into it).
    action_one_hot = tf.one_hot(action_inputs, action_space_dim)
    cross_entropy = -tf.reduce_sum(action_one_hot * log_probs, axis=-1)
    policy_loss = tf.reduce_mean(cross_entropy * advantage_inputs)

    # NOTE: despite its name this tensor is -mean(p * log p) taken over every
    # batch/action entry, i.e. a (scaled) policy entropy. It is subtracted
    # from the total loss below, so minimising the loss pushes it up.
    entropy_loss = -tf.reduce_mean(action_probs * log_probs)

    loss = policy_loss - ENTROPY_COEF * entropy_loss + VALUE_COEF * value_loss
    train_op = tf.train.AdamOptimizer(LEARNING_RATE, beta1=0.0).minimize(loss)

with tf.variable_scope('predict'):
    # Draw one action index per input row from the policy logits, then drop
    # the sample axis so the result has shape [batch].
    sampled_actions = tf.multinomial(logits=action_logits, num_samples=1)
    action_predict = tf.squeeze(sampled_actions, axis=-1)

def returns_advantages(replay_buffer, next_value, gamma=0.99):
    """Compute discounted, bootstrapped returns and advantages for a rollout.

    Args:
        replay_buffer: sequence of per-step dicts; the keys 'r' (reward),
            'v' (value estimate) and 'done' (termination flag) are read.
        next_value: value used to bootstrap the return after the last step.
        gamma: discount factor applied to each following return.

    Returns:
        A tuple ``(returns, advantages)`` with one entry per step, where
        ``advantages`` is ``returns`` minus the stored value estimates.
    """
    n_steps = len(replay_buffer)
    rewards, values, dones = [], [], []
    for step in replay_buffer:
        rewards.append(step['r'])
        values.append(step['v'])
        dones.append(step['done'])

    # Reward-shaped zeros plus one trailing slot that carries the bootstrap value.
    returns = np.concatenate((np.ravel(np.zeros_like(rewards)), np.ravel(next_value)))

    # Walk backwards so each return can build on the one that follows it.
    for t in range(n_steps - 1, -1, -1):
        # A finished step (done truthy) contributes nothing from what follows.
        keep_mask = 1 - dones[t]
        returns[t] = rewards[t] + gamma * returns[t + 1] * keep_mask

    # Drop the bootstrap slot again.
    returns = returns[:-1]
    advantages = returns - values
    return returns, advantages

In [None]:
# Open a session on the default graph and initialise all global variables.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
# Number of environment steps collected before each gradient update.
BATCH_SIZE = 32

# Training loop. Each outer iteration gathers BATCH_SIZE transitions
# (continuing across episode boundaries), then performs one update.
# NOTE: `episode` counts updates, not environment episodes.
state = env.reset()
total_rewards = [0.0]  # reward per environment episode; last entry is the one in progress
for episode in range(5001):
    replay_buffer = []
    for _ in range(BATCH_SIZE):
        # Fetch the sampled action and the value estimate in a single
        # session call so the network runs once per step, not twice.
        act_batch, value_batch = sess.run(
            [action_predict, values], {state_inputs: [state]})
        act, v = act_batch[0], value_batch[0]

        next_state, r, done, info = env.step(act)
        replay_buffer.append({'s':state, 'v':v, 'a':act, 'r':r, 'done':done})
        state = next_state
        total_rewards[-1] += r
        if done:
            # Start a fresh environment episode and a fresh reward tally.
            state = env.reset()
            total_rewards.append(0.0)

    if episode % 100 == 0:
        print("Episode: {} | Mean Reward of 100 turns is: {:.2f}".format(episode, np.mean(total_rewards[-100:])))

    # Bootstrap from the state the next rollout starts in. If the last step
    # ended an episode this value is masked out by its `done` flag inside
    # returns_advantages, so the returns match bootstrapping from `next_state`.
    next_value = sess.run(values, {state_inputs:[state]})[0]
    returns, advs = returns_advantages(replay_buffer, next_value)

    fd = {
        state_inputs: [rb['s'] for rb in replay_buffer],
        value_inputs: returns,
        advantage_inputs: advs,
        action_inputs: [rb['a'] for rb in replay_buffer],
    }
    sess.run(train_op, fd)
    