In [1]:
import gym
import re
import tensorflow as tf
import numpy as np

In [2]:
class ExperienceQModel(object):
    def __init__(self, env='CartPole-v0', max_memory=10000, discount=.9, n_episodes=100, 
                 n_steps=100, batch_size=100, learning_rate = 0.01, exploration_a=0.1, exploration_b=0.0):
        
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

        # exploration
        self.exp_a = exploration_a
        self.exp_b = exploration_b # epoch/100 coefficient
        
        # environment parameters
        self.env = gym.make(env)
        self.n_input = self.env.observation_space.shape[0]
        self.n_actions = int(re.findall('\d+',str(self.env.action_space))[0]) # shameless hack to get a dim of actions
        
        # training parameters
        self.learning_rate = learning_rate
        self.n_episodes = n_episodes
        self.n_steps = n_steps
        self.batch_size = batch_size

        # Network Parameters
        self.n_hidden_1 = 4 # 1st layer
        self.n_hidden_2 = 4
        
        # Initialize input and output
        self.x = tf.placeholder(tf.float64, [None, self.n_input])
        self.y = tf.placeholder(tf.float64, [None, self.n_actions])
        
        # Initialize layers weight & bias
        self.weights = {
            'h1': tf.Variable(tf.random_normal([self.n_input, self.n_hidden_1],dtype=tf.float64)),
            'h2': tf.Variable(tf.random_normal([self.n_hidden_1, self.n_hidden_2],dtype=tf.float64)),
            'out': tf.Variable(tf.random_normal([self.n_hidden_2, self.n_actions],dtype=tf.float64))
        }
        self.biases = {
            'b1': tf.Variable(tf.random_normal([self.n_hidden_1],dtype=tf.float64)),
            'b2': tf.Variable(tf.random_normal([self.n_hidden_2],dtype=tf.float64)),
            'out': tf.Variable(tf.random_normal([self.n_actions],dtype=tf.float64))
        }
        
        # define graph
        self.define_model()
        

    def exp_remember(self, states):
        self.memory.append(states.copy())
        if len(self.memory) > self.max_memory:
          del self.memory[0]

    # based on https://gist.github.com/EderSantana/
    def exp_get_batch(self):
        len_memory = len(self.memory)
        n_examples = min(len_memory, self.batch_size)
        inputs = np.zeros((n_examples, self.n_input))
        targets = np.zeros((n_examples, self.n_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory,size=n_examples)):
            #get_memory
            states = self.memory[idx]
            state_t = states['state_t']
            state_tp1 = states['state_tp1']
            action = states['action']

            # input
            inputs[i] = state_t.astype(np.float64)

            # targets - not correcting those which are not taken
            feed_dict = {self.x: states['state_t'].reshape(1,-1)}
            targets[i] = self.session.run(self.predictor, feed_dict)
            
            # acted action
            feed_dict = {self.x: states['state_tp1'].reshape(1,-1)}
            Qsa = np.max(self.session.run(self.predictor, feed_dict))

            # check if endgame and if not apply discount
            if states['endgame']:
                targets[i,action] = states['reward'] # assign just reward if endgame
            else:
                targets[i,action] = states['reward'] + self.discount * Qsa
        return inputs, targets

    # construct network
    def network_forward(self):
        # Hidden layer with RELU activation
        layer_1 = tf.add(tf.matmul(self.x, self.weights['h1']), self.biases['b1'])
        layer_1 = tf.nn.relu(layer_1)
        
        layer_2 = tf.add(tf.matmul(layer_1, self.weights['h2']), self.biases['b2'])
        layer_2 = tf.nn.relu(layer_2)
        
        # Output layer with linear activation
        out_layer = tf.matmul(layer_1, self.weights['out']) + self.biases['out']
    
        return out_layer
    
    # Construct model
    def define_model(self):
        self.predictor = self.network_forward()

        # Define loss and optimizer
        self.cost = tf.reduce_sum(tf.pow(self.predictor-self.y, 2))/(2*self.batch_size)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

        # Initializing the session
        init = tf.initialize_all_variables()
        self.session = tf.Session()
        self.session.run(init)
    
    # Train loop
    def train_model(self):
        # initialize states and experience replay
        states = {}
        # exp_replay = ExperienceQModel(max_memory=max_memory)

        # Training cycle
        for epoch in range(self.n_episodes):
            avg_cost = 0.
            state_tp1 = self.env.reset()
            endgame = False

            for t in range(self.n_steps):
                self.env.render()
                state_t1 = np.array(state_tp1)
        
                # exploration cycle
                eps = self.exp_a-self.exp_b*epoch/100
                if np.random.rand() <= eps:
                    action = self.env.action_space.sample()
                else:
                    feed_dict = {self.x: state_t1.reshape(1,-1)}
                    qvals = self.session.run(self.predictor, feed_dict)
                    action = np.argmax(qvals)

                # take a next step
                state_tp1, reward, endgame, info = self.env.step(action)

                # redefine reward
                if (t == 99) and (endgame == False):
                    print("{:4d}: won!".format(epoch))
                if endgame:
                    reward = 0;

                #store experience
                states['action'] = action
                states['reward'] = float(reward)
                states['endgame'] = endgame
                states['state_t'] = np.array(state_t1)
                states['state_tp1'] = np.array(state_tp1)
                self.exp_remember(states)

                # get experience replay
                x_batch, y_batch = self.exp_get_batch()
                _, c = self.session.run([self.optimizer, self.cost], feed_dict={self.x: x_batch, self.y: y_batch})
                # Compute average loss
                avg_cost += c / self.n_steps

                # Lost
                if endgame:
                    print("{:4d}: lost after {:3d}, cost {:06.4f}".format(epoch,t+1,avg_cost))
                    break

In [3]:
model = ExperienceQModel(env='CartPole-v0',\
                         max_memory=10000,\
                         discount=.9,\
                         n_episodes=400,\
                         n_steps=100,\
                         batch_size=100,\
                         learning_rate = 1.e-2,\
                         exploration_a = 0.1,\
                         exploration_b = 0.0)
model.train_model()

[2016-07-08 18:50:07,730] Making new env: CartPole-v0


   0: lost after  11, cost 0.0043
   1: lost after  10, cost 0.0081
   2: lost after  11, cost 0.0175
   3: lost after  10, cost 0.0272
   4: lost after  10, cost 0.0502
   5: lost after   8, cost 0.0557
   6: lost after   9, cost 0.0808
   7: lost after   9, cost 0.0930
   8: lost after  10, cost 0.0925
   9: lost after   9, cost 0.0804
  10: lost after  10, cost 0.0638
  11: lost after   9, cost 0.0433
  12: lost after   8, cost 0.0325
  13: lost after   9, cost 0.0283
  14: lost after  10, cost 0.0258
  15: lost after  12, cost 0.0383
  16: lost after  10, cost 0.0257
  17: lost after  11, cost 0.0273
  18: lost after  13, cost 0.0368
  19: lost after  11, cost 0.0274
  20: lost after  11, cost 0.0292
  21: lost after  16, cost 0.0464
  22: lost after  13, cost 0.0330
  23: lost after  15, cost 0.0380
  24: lost after  14, cost 0.1072
  25: lost after  19, cost 0.1528
  26: lost after  10, cost 0.1276
  27: lost after  10, cost 0.1888
  28: lost after  25, cost 0.6143
  29: lost aft

[2016-07-08 19:25:20,060] Observation '[-2.42696043 -3.10960661 -0.0755987   0.61427028]' is not contained within observation space 'Box(4,)'.


 390: lost after  97, cost 0.1132
 391: won!
 392: won!
 393: won!
 394: won!
 395: won!
 396: won!
 397: won!
 398: won!
 399: won!
