In [1]:
import tensorflow as tf
import gym
import re
import numpy as np

In [2]:
env = gym.make('CartPole-v0')
n_input = env.observation_space.shape[0]
n_actions = int(re.findall('\d',str(env.action_space))[0])

# Parameters
learning_rate = 0.001
training_epochs = 500
n_steps = 100
display_step = 100
exploration = 0.1
batch_size = 50

# Network Parameters
n_hidden_1 = 4 # 1st layer number of features
n_hidden_2 = 4 # 2nd layer number of features

# tf Graph input
x = tf.placeholder(tf.float64, [None, n_input])
y = tf.placeholder(tf.float64, [None, n_actions])

[2016-07-07 21:55:12,860] Making new env: CartPole-v0


In [3]:
class ExperienceQModel(object):
    def __init__(self, max_memory=500, discount=.9):
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def exp_remember(self, states):
        self.memory.append(states.copy())
        if len(self.memory) > self.max_memory:
          del self.memory[0]

    def exp_get_batch(self,batch_size=10):
        len_memory = len(self.memory)
        n_examples = min(len_memory, batch_size)
        inputs = np.zeros((n_examples, n_input))
        targets = np.zeros((n_examples, n_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory,size=n_examples)):
          #get_memory
          states = self.memory[idx]
          state_t = states['state_t']
          state_tp1 = states['state_tp1']

          inputs[i] = state_t.astype(np.float64) # assign features

          if states['endgame']:
            targets[i] = states['reward'] # assign just reward if endgame
          else:
            feed_dict = {x: states['state_tp1'].reshape(1,-1)}
            qvals = sess.run(pred, feed_dict)
#             Q_sa = np.max(qvals)
            targets[i] = states['reward'] + self.discount * qvals
        return inputs, targets

In [4]:
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    
    # Hidden layer with RELU activation
#     layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
#     layer_2 = tf.nn.relu(layer_2)
    
    # Output layer with linear activation
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer

In [5]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1],dtype=tf.float64)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_actions],dtype=tf.float64))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1],dtype=tf.float64)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_actions],dtype=tf.float64))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*batch_size)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [6]:
# Launch the graph
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

# initialize states and experience replay
states = {}
exp_replay = ExperienceQModel()
env.render()

# Training cycle
for epoch in range(training_epochs):
    avg_cost = 0.
    state_tp1 = env.reset()

    for t in range(n_steps):
        state_t1 = state_tp1
        
        # exploration cycle
        if np.random.rand() <= exploration:
            action = env.action_space.sample()
        else:
            feed_dict = {x: state_t1.reshape(1,-1)}
            qvals = sess.run(pred, feed_dict)
            action = np.argmax(qvals)

        # take a next step
        state_tp1, reward, done, info = env.step(action)

        # store experience
        states['action'] = np.array(action)
        states['reward'] = float(reward)
        states['endgame'] = done
        states['state_t'] = np.array(state_t1)
        states['state_tp1'] = np.array(state_tp1)
        exp_replay.exp_remember(states)

        # get experience replay
        x_batch, y_batch = exp_replay.exp_get_batch(batch_size)
        _, c = sess.run([optimizer, cost], feed_dict={x: x_batch, y: y_batch})
        # Compute average loss
        avg_cost += c / n_steps

        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

    # Display logs per epoch step
    if epoch % display_step == 0:
        print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost)
        
    print "Optimization Finished!"

Episode finished after 11 timesteps
Epoch: 0001 cost= 0.041301270
Optimization Finished!
Episode finished after 9 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 9 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 8 timesteps
Optimization Finished!
Episode finished after 8 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 9 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 9 timesteps
Optimization Finished!
Episode finished after 10 timesteps
Optimization Finished!
Episode finished after 11 timest

In [7]:
states['state_tp1'].reshape(1,-1).shape

(1, 4)

In [8]:
feed_dict = {x: states['state_tp1'].reshape(1,-1)}
qvals = sess.run(pred, feed_dict)
print(qvals)

[[ 4.69812352  4.70092836]]


In [10]:
weights

{'h1': <tensorflow.python.ops.variables.Variable at 0x7f093f431fd0>,
 'h2': <tensorflow.python.ops.variables.Variable at 0x7f093f431a90>,
 'out': <tensorflow.python.ops.variables.Variable at 0x7f093f431c50>}