In [None]:
%matplotlib inline

import tensorflow as tf
import numpy as np
import gym
import matplotlib.pyplot as plt

In [None]:
# Create the CartPole-v1 task and keep the object exposed by `.unwrapped`
# (the environment without gym's wrapper layers).
env = gym.make('CartPole-v1').unwrapped

# Show the action/observation spaces and the observation bounds for reference.
for _space_info in (
    env.action_space,
    env.observation_space,
    env.observation_space.high,
    env.observation_space.low,
):
    print(_space_info)

def discount_rewards(rewards, gamma=0.99):
    """Compute the discounted reward-to-go for every step of a trajectory.

    Entry ``t`` of the result equals ``rewards[t] + gamma * result[t + 1]``,
    where the value following the last step is taken to be 0.

    Args:
        rewards: per-step rewards; must support ``len`` and integer indexing.
        gamma: discount factor applied once per step.

    Returns:
        A list with one discounted return per input reward (``[]`` for empty
        input).
    """
    running = 0          # discounted return of everything after the current step
    backwards = []       # returns collected from the last step to the first
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        backwards.append(running)
    backwards.reverse()
    return backwards

# Problem sizes read from the environment: length of an observation vector and
# number of discrete actions.
state_space_dim = env.observation_space.shape[0]
action_space_dim = env.action_space.n

# Graph inputs. The leading `None` leaves the batch dimension unspecified, so
# the same placeholders accept a single state or a whole episode.
states = tf.placeholder(dtype=tf.float32, shape=[None, state_space_dim])
rewards = tf.placeholder(dtype=tf.float32, shape=[None])
actions = tf.placeholder(dtype=tf.int32, shape=[None])

# Policy network: two 32-unit ReLU layers followed by a linear layer that
# produces one logit per action; softmax turns the logits into probabilities.
with tf.variable_scope('policy'):
    hidden1 = tf.layers.dense(inputs=states, units=32, activation=tf.nn.relu)
    hidden2 = tf.layers.dense(inputs=hidden1, units=32, activation=tf.nn.relu)
    action_logits = tf.layers.dense(inputs=hidden2, units=action_space_dim)
    action_probs = tf.nn.softmax(logits=action_logits)

# REINFORCE loss: mean over the batch of  -log pi(a_t | s_t) * R_t,
# where `rewards` is fed the per-step (discounted) return R_t.
with tf.variable_scope('loss'):
    # Mask that selects the log-probability of the action actually taken.
    action_one_hot = tf.one_hot(actions, action_space_dim)
    # log_softmax on the logits is used instead of log(softmax(...)) so a
    # probability that underflows to 0 cannot yield 0 * -inf = NaN.
    action_log_probs = tf.nn.log_softmax(action_logits)
    # Reduce over the action axis only, keeping one value per time step
    # (shape [batch]). Reducing over every axis would collapse the batch to a
    # single scalar and each step would no longer be weighted by its own return.
    cross_entropy = -tf.reduce_sum(action_one_hot * action_log_probs, axis=1)
    loss = tf.reduce_mean(cross_entropy * rewards)
    train_op = tf.train.AdamOptimizer(0.001).minimize(loss)

# Action sampling: draw one action index per input row from the categorical
# distribution defined by the policy logits, then drop the sample axis.
with tf.variable_scope('predict'):
    sampled_actions = tf.multinomial(action_logits, 1)
    action_predict = tf.squeeze(sampled_actions, axis=-1)


In [None]:
# Open a session on the default graph and initialise every variable created so
# far (network weights and the optimizer's slots).
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
# Upper bound on the number of steps collected in one episode. The environment
# is used via `.unwrapped`, so an episode only ends when `env.step` reports
# `done`; without a cap a policy that never fails would keep the rollout (and
# the replay buffer) growing forever.
MAX_EPISODE_STEPS = 10000

for episode in range(5001):
    # --- Roll out one episode with the current stochastic policy ------------
    replay_buffer = []
    state = env.reset()
    for step in range(MAX_EPISODE_STEPS):
        # Feed a batch of one state and take the single sampled action.
        act = sess.run(action_predict, {states: [state]})[0]
        next_state, r, done, _ = env.step(act)
        replay_buffer.append({'s': state, 'a': act, 'r': r})
        state = next_state
        if done:
            break

    # --- One policy-gradient update on the whole episode ---------------------
    episode_rewards = [rb['r'] for rb in replay_buffer]
    fd = {
        states: [rb['s'] for rb in replay_buffer],
        # Each step is weighted by its discounted reward-to-go. If the episode
        # was cut off by MAX_EPISODE_STEPS the returns are computed as if it
        # had ended there.
        rewards: discount_rewards(episode_rewards),
        actions: [rb['a'] for rb in replay_buffer],
    }
    sess.run(train_op, fd)

    # Periodic progress report: undiscounted total reward of this episode.
    if episode % 100 == 0:
        print("Episode: {} | Reward is: {}".format(episode, np.sum(episode_rewards)))
