In [1]:
# Import MINST data
import tensorflow as tf
import gym
import re
import numpy as np

In [14]:
env = gym.make('CartPole-v0')
n_input = env.observation_space.shape[0]
n_actions = int(re.findall('\d',str(env.action_space))[0])

# Parameters
learning_rate = 0.001
training_epochs = 20
n_steps = 100
display_step = 1
exploration = 0.1
batch_size = 50

# Network Parameters
n_hidden_1 = 4 # 1st layer number of features
n_hidden_2 = 4 # 2nd layer number of features

# tf Graph input
x = tf.placeholder(tf.float64, [None, n_input])
y = tf.placeholder(tf.float64, [None, n_actions])

[2016-07-07 21:40:55,654] Making new env: CartPole-v0


In [21]:
class ExperienceQModel(object):
    def __init__(self, max_memory=500, discount=.9):
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def exp_remember(self, states):
        self.memory.append(states.copy())
        if len(self.memory) > self.max_memory:
          del self.memory[0]

    def exp_get_batch(self,batch_size=10):
        len_memory = len(self.memory)
        n_examples = min(len_memory, batch_size)
        inputs = np.zeros((n_examples, n_input))
        targets = np.zeros((n_examples, n_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory,size=n_examples)):
          #get_memory
          states = self.memory[idx]
          state_t = states['state_t']
          state_tp1 = states['state_tp1']

          inputs[i] = state_t.astype(np.float64) # assign features

          if states['endgame']:
            targets[i] = states['reward'] # assign just reward if endgame
          else:
            feed_dict = {x: states['state_tp1'].reshape(1,-1)}
            qvals = sess.run(pred, feed_dict)
#             Q_sa = np.max(qvals)
            targets[i] = states['reward'] + self.discount * qvals
        return inputs, targets

In [12]:
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    
    # Hidden layer with RELU activation
#     layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
#     layer_2 = tf.nn.relu(layer_2)
    
    # Output layer with linear activation
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer

In [15]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1],dtype=tf.float64)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_actions],dtype=tf.float64))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1],dtype=tf.float64)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2],dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_actions],dtype=tf.float64))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*batch_size)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [22]:
# Launch the graph
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

# initialize states and experience replay
states = {}
exp_replay = ExperienceQModel()
env.render()

# Training cycle
for epoch in range(training_epochs):
    avg_cost = 0.
    state_tp1 = env.reset()

    for t in range(n_steps):
        state_t1 = state_tp1
        print(state_t1)
        
        # exploration cycle
        if np.random.rand() <= exploration:
            action = env.action_space.sample()
        else:
            feed_dict = {x: state_t1.reshape(1,-1)}
            qvals = sess.run(pred, feed_dict)
            action = np.argmax(qvals)

        # take a next step
        state_tp1, reward, done, info = env.step(action)

        # store experience
        states['action'] = np.array(action)
        states['reward'] = float(reward)
        states['endgame'] = done
        states['state_t'] = np.array(state_t1)
        states['state_tp1'] = np.array(state_tp1)
        exp_replay.exp_remember(states)

        # get experience replay
        x_batch, y_batch = exp_replay.exp_get_batch(batch_size)
        _, c = sess.run([optimizer, cost], feed_dict={x: x_batch, y: y_batch})
        # Compute average loss
        avg_cost += c / n_steps

        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

    # Display logs per epoch step
    if epoch % display_step == 0:
        print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost)
        
    print "Optimization Finished!"

[-0.03653408 -0.0017879   0.02365884 -0.04010206]
[-0.03656983 -0.19724099  0.0228568   0.25995057]
[-0.04051465 -0.39268165  0.02805581  0.5597543 ]
[-0.04836829 -0.58818591  0.0392509   0.86114262]
[-0.060132   -0.78381977  0.05647375  1.16590419]
[-0.0758084  -0.97962942  0.07979183  1.47574487]
[-0.09540099 -1.17563038  0.10930673  1.79224481]
[-0.1189136  -1.37179482  0.14515163  2.11680761]
[-0.14634949 -1.56803661  0.18748778  2.45059966]
Episode finished after 9 timesteps
Epoch: 0001 cost= 0.009163758
Optimization Finished!
[-0.00839862  0.00889034 -0.04087267 -0.0292017 ]
[-0.00822081 -0.18562236 -0.0414567   0.25031046]
[-0.01193326 -0.38012853 -0.03645049  0.52963425]
[-0.01953583 -0.18451327 -0.02585781  0.2256922 ]
[-0.0232261  -0.37925631 -0.02134396  0.51010775]
[-0.03081122 -0.57407118 -0.01114181  0.79598884]
[-0.04229265 -0.76903847  0.00477797  1.085146  ]
[-0.05767342 -0.96422312  0.02648089  1.37932436]
[-0.07695788 -1.1596655   0.05406738  1.68016962]
[-0.10015119

In [7]:
states['state_tp1'].reshape(1,-1).shape

(1, 4)

In [26]:
feed_dict = {x: states['state_tp1'].reshape(1,-1)}
qvals = sess.run(pred, feed_dict)
print(qvals)

[[ 0.70402213 -0.98946279]]


In [24]:
y_batch

array([[ 1.8433828 ,  0.17320516],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,  0.17320516],
       [ 1.82289087,  0.16691397],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,  0.17320516],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,  0.17320516],
       [ 1.67419238,  0.12126233],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.8433828 ,  0.17320516],
       [ 1.        ,  1.        ],
       [ 1.8433828 ,