In [1]:
import tensorflow as tf
import gym
import re
import numpy as np

In [2]:
env = gym.make('CartPole-v0')
n_input = env.observation_space.shape[0]
n_actions = int(re.findall('\d',str(env.action_space))[0])

# Parameters
learning_rate = 1.e-1
training_epochs = 100
n_steps = 100
display_step = 100
exp_a = 0.1
exp_b = 0.0
batch_size = 100
loss_multiplier = 1
max_memory = 20000
discount = 0.9

# Network Parameters
n_hidden_1 = 4 # 1st layer number of features
n_hidden_2 = 4 # 2nd layer number of features

# tf Graph input
x = tf.placeholder(tf.float64, [None, n_input])
y = tf.placeholder(tf.float64, [None, n_actions])

[2016-07-08 13:53:09,499] Making new env: CartPole-v0


In [3]:
class ExperienceQModel(object):
    def __init__(self, max_memory=500, discount=.9):
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def exp_remember(self, states):
        self.memory.append(states.copy())
        if len(self.memory) > self.max_memory:
          del self.memory[0]

    # based on https://gist.github.com/EderSantana/c7222daa328f0e885093
    def exp_get_batch(self,batch_size=10):
        len_memory = len(self.memory)
        n_examples = min(len_memory, batch_size)
        inputs = np.zeros((n_examples, n_input))
        targets = np.zeros((n_examples, n_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory,size=n_examples)):
            #get_memory
            states = self.memory[idx]
            state_t = states['state_t']
            state_tp1 = states['state_tp1']
            action = states['action']

            # input
            inputs[i] = state_t.astype(np.float64)

            # targets - not correcting those which are not taken
            feed_dict = {x: states['state_t'].reshape(1,-1)}
            targets[i] = sess.run(pred, feed_dict)
            
            # acted action
            feed_dict = {x: states['state_tp1'].reshape(1,-1)}
            Qsa = np.max(sess.run(pred, feed_dict))

            # check if endgame and if not apply discount
            if states['endgame']:
                targets[i,action] = states['reward'] # assign just reward if endgame
            else:
                targets[i,action] = states['reward'] + self.discount * Qsa
        return inputs, targets

In [4]:
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    
    # Hidden layer with RELU activation
#     layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
#     layer_2 = tf.nn.relu(layer_2)
    
    # Output layer with linear activation
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer

In [5]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1],dtype=tf.float64)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_actions],dtype=tf.float64))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1],dtype=tf.float64)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_actions],dtype=tf.float64))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*batch_size)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [6]:
# Launch the graph
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

# initialize states and experience replay
states = {}
exp_replay = ExperienceQModel(max_memory=max_memory)

# Training cycle
for epoch in range(training_epochs):
    avg_cost = 0.
    state_tp1 = env.reset()
    done = False

    for t in range(n_steps):
        env.render()
        state_t1 = np.array(state_tp1)
        
        # exploration cycle
        if np.random.rand() <= exp_a-exp_b*epoch/100:
            action = env.action_space.sample()
        else:
            feed_dict = {x: state_t1.reshape(1,-1)}
            qvals = sess.run(pred, feed_dict)
            action = np.argmax(qvals)

        # take a next step
        state_tp1, reward, done, info = env.step(action)

        # print (reward,done)
        # rewards
        if (t == 99) and (done == False):
            print("{}: won!".format(epoch))
        if done:
            reward = 0;

        # store experience
        states['action'] = action
        states['reward'] = float(reward)
        states['endgame'] = done
        states['state_t'] = np.array(state_t1)
        states['state_tp1'] = np.array(state_tp1)
        exp_replay.exp_remember(states)

        # get experience replay
        x_batch, y_batch = exp_replay.exp_get_batch(batch_size)
        _, c = sess.run([optimizer, cost], feed_dict={x: x_batch, y: y_batch})
        # Compute average loss
        avg_cost += c / n_steps

        # Lost
        if done:
            print("{}: lost after {}, cost {}".format(epoch,t+1,avg_cost))
            break

0: lost after 9, cost 0.0132851110676
1: lost after 9, cost 0.0351267394777
2: lost after 11, cost 0.029899303251
3: lost after 9, cost 0.0197067634291
4: lost after 9, cost 0.0743710002683
5: lost after 31, cost 0.823964048087
6: lost after 12, cost 0.39561808126
7: lost after 9, cost 0.295080199084
8: lost after 14, cost 0.512148328226
9: lost after 48, cost 2.61133565385
10: lost after 81, cost 1.91153745107
11: lost after 68, cost 0.519286056907
12: lost after 86, cost 0.401396074184
13: lost after 63, cost 0.418735949306
14: won!
15: won!
16: won!
17: lost after 72, cost 0.215839674744
18: won!
19: won!
20: won!
21: won!
22: won!
23: won!
24: lost after 98, cost 0.227931555383
25: won!
26: lost after 98, cost 0.244660035313
27: lost after 87, cost 0.209337725435
28: won!
29: won!
30: won!
31: won!
32: won!
33: lost after 96, cost 0.195273969966
34: won!
35: won!
36: won!
37: won!
38: lost after 40, cost 0.0869911708162
39: won!
40: won!
41: won!
42: won!
43: won!
44: won!
45: won!

KeyboardInterrupt: 

In [None]:
exp_replay.memory

In [None]:
feed_dict = {x: states['state_tp1'].reshape(1,-1)}
qvals = sess.run(pred, feed_dict)
print(qvals)

In [None]:
weights