In [None]:
import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque

In [None]:
# Build CartPole-v1 and keep the raw environment underneath any wrappers
# that gym.make() applied.
env = gym.make('CartPole-v1').unwrapped

# Show the action space, the observation space, and the observation bounds.
for space_info in (
    env.action_space,
    env.observation_space,
    env.observation_space.high,
    env.observation_space.low,
):
    print(space_info)

# Sizes used to shape the network's input and output layers.
action_space_dim = env.action_space.n
state_space_dim = env.observation_space.shape[0]

# Graph inputs: a batch of observations, the action index chosen for each
# observation, and the TD target each chosen action's Q-value is regressed to.
states = tf.placeholder(dtype=tf.float32, shape=[None, state_space_dim])
actions = tf.placeholder(dtype=tf.int32, shape=[None])
td_q_values = tf.placeholder(dtype=tf.float32, shape=[None])

# Q-network: two 32-unit ReLU layers followed by a linear layer that emits
# one Q-value per action.
with tf.variable_scope('q_network'):
    hidden1 = tf.layers.dense(inputs=states, units=32, activation=tf.nn.relu)
    hidden2 = tf.layers.dense(inputs=hidden1, units=32, activation=tf.nn.relu)
    q_values = tf.layers.dense(inputs=hidden2, units=action_space_dim)

# Mean squared error between the Q-value of the action actually taken and
# the supplied TD target, minimised with Adam.
with tf.variable_scope('loss'):
    one_hot_actions = tf.one_hot(indices=actions, depth=action_space_dim)
    # The one-hot mask zeroes every Q-value except the taken action's, so the
    # row sum picks out exactly that entry.
    selected_q = tf.reduce_sum(q_values * one_hot_actions, axis=1)
    loss = tf.reduce_mean(tf.squared_difference(selected_q, td_q_values))
    train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)

In [None]:
def choose_action(state, epsilon=0.1):
    """Select an action with an epsilon-greedy policy.

    Args:
        state: a single observation; it is wrapped in a one-element batch
            before being fed to the `states` placeholder.
        epsilon: probability of picking a uniformly random action instead of
            the greedy one.

    Returns:
        An action index in ``[0, action_space_dim)``.
    """
    explore = np.random.uniform() < epsilon
    if explore:
        return np.random.randint(0, action_space_dim)

    # Greedy branch: evaluate the network on this state and take the argmax.
    predicted_q = sess.run(q_values, feed_dict={states: [state]})
    return np.argmax(predicted_q)

def create_td_q_values(mini_batch, gamma=0.99):
    """Compute one-step TD targets for a batch of stored transitions.

    Args:
        mini_batch: sequence of transition dicts; the keys read here are
            'next_s' (next observation), 'r' (reward) and 'done' (terminal flag).
        gamma: discount factor applied to the bootstrapped next-state value.

    Returns:
        A list with one target per transition: the reward alone when the
        transition is terminal, otherwise reward + gamma * max Q(next_s, .).
    """
    # Evaluate the Q-network on all next states in a single session call.
    successor_states = [transition['next_s'] for transition in mini_batch]
    next_q_values = sess.run(q_values, {states: successor_states})

    targets = []
    for transition, next_q in zip(mini_batch, next_q_values):
        reward = transition['r']
        if transition['done']:
            # Terminal transition: nothing to bootstrap from.
            targets.append(reward)
        else:
            targets.append(reward + gamma * np.max(next_q))
    return targets

In [None]:
# Open the session used by choose_action, create_td_q_values and the training
# loop, then initialise every variable defined in the graph.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
BATCH_SIZE = 64
# The notebook uses `env.unwrapped`, so the environment has no built-in step
# limit: an episode ends only when the environment itself reports `done`.
# Cap the episode length so a policy that keeps balancing cannot stall the
# training loop forever. 500 is the step limit CartPole-v1 is registered with.
MAX_EPISODE_STEPS = 500
replay_buffer = deque(maxlen=5000)

for episode in range(5001):
    state = env.reset()
    total_reward = 0
    for step in range(MAX_EPISODE_STEPS):
        act = choose_action(state)
        next_state, r, done, _ = env.step(act)
        # Store the environment's own `done` flag: a transition cut off by the
        # step cap is not terminal, so its TD target still bootstraps.
        replay_buffer.append({'s':state, 'a':act, 'r':r, 'next_s':next_state, 'done':done})
        state = next_state
        # Accumulate the reward actually returned by the environment rather
        # than counting steps, so the printed value is the true episode return.
        total_reward += r
        if done:
            # No gradient step is taken on the terminal step; the transition
            # is already in the buffer and can be sampled later.
            break

        # Start learning only once the buffer holds more than one batch.
        if len(replay_buffer) > BATCH_SIZE:
            mini_batch = random.sample(replay_buffer, BATCH_SIZE)
            fd = {
                states: [rb['s'] for rb in mini_batch],
                actions: [rb['a'] for rb in mini_batch],
                td_q_values: create_td_q_values(mini_batch),
            }
            sess.run(train_op, fd)

    if episode % 100 == 0:
        print("Episode: {} | Reward is: {}".format(episode, total_reward))