In [11]:
import gym
import re
import numpy as np
import tensorflow as tf

In [40]:
class ExperienceQModel(object):
    """Q-learning agent with experience replay for an OpenAI Gym environment.

    Transitions are stored in a bounded replay memory; after every
    environment step a random mini-batch is drawn from that memory and the
    estimator is trained on Bellman targets built from it.

    NOTE(review): the estimator is a ``tf.contrib.learn.DNNRegressor`` that is
    fitted on targets of shape ``(batch, n_actions)``. Whether the installed
    TensorFlow version accepts vector-valued targets with this constructor
    call could not be confirmed from this file -- verify before relying on it.
    """

    def __init__(self, env, monitor_file, logdir, max_memory=10000, discount=.9, n_episodes=100,
                 n_steps=100, batch_size=100, learning_rate = 0.01, dropout = 1.0,
                 exploration=lambda x: 0.1, stop_training=10, render=True):
        """Create the environment, the replay memory and the estimator.

        Args:
            env: Gym environment id passed to ``gym.make``.
            monitor_file: directory for the Gym monitor; a falsy value
                disables monitoring.
            logdir: stored on the instance; not used by this class.
            max_memory: maximum number of transitions kept in replay memory.
            discount: discount factor applied to the next-state Q-value.
            n_episodes: number of episodes run by ``tf_train_model``.
            n_steps: maximum steps per episode; an episode that ends exactly
                on the last step counts as a win, so this must equal the
                environment's episode length.
            batch_size: maximum number of transitions per training batch.
            learning_rate: learning rate of the Adam optimizer.
            dropout: accepted for backward compatibility; currently unused.
            exploration: callable mapping the episode index to the
                probability of taking a random action.
            stop_training: number of consecutive wins after which both
                exploration and training are switched off.
            render: whether to render the environment on every step
                (default ``True``, the original behaviour).
        """
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

        # exploration: epsilon-greedy as a function of the episode index
        self.eps = exploration

        # environment parameters
        self.env = gym.make(env)
        self.monitor_file = monitor_file
        self.render = render
        self.n_states = self.env.observation_space.shape[0]
        # Discrete action spaces expose their size directly; no need to parse
        # the string representation of the space.
        self.n_actions = int(self.env.action_space.n)

        # training parameters
        self.learning_rate = learning_rate
        self.n_episodes = n_episodes
        self.n_steps = n_steps  # must be equal to episode length
        self.batch_size = batch_size
        self.stop_training = stop_training  # stop training after this many consecutive wins
        self.consec_wins = 0  # current run of consecutive wins
        self.global_step = 0  # global step

        # Neural Network Parameters
        self.n_hidden = [self.n_states]
        self.logdir = logdir

        # Placeholders kept as public attributes from the original
        # implementation; the estimator below does not use them.
        self.x = tf.placeholder(tf.float32, [None, self.n_states],name='states')
        self.y = tf.placeholder(tf.float32, [None, self.n_actions],name='qvals')

        # define model
        self.estimator = tf.contrib.learn.DNNRegressor(
            hidden_units=self.n_hidden,
            optimizer=tf.train.AdamOptimizer(
              learning_rate=self.learning_rate,
        ))
        # No prediction is requested from the estimator until it has been
        # fitted at least once (see ``_predict_qvals``).
        self._fitted = False

    def exp_process_reward(self,ts,reward,endgame):
        """Hook for reward shaping; currently returns ``reward`` unchanged.

        Args:
            ts: zero-based step index within the episode (unused).
            reward: reward returned by the environment.
            endgame: whether the step ended the episode (unused).
        """
        return reward

    def exp_save_to_memory(self, states):
        """Append a shallow copy of the transition dict to replay memory.

        The oldest transition is dropped once the memory holds more than
        ``max_memory`` entries.
        """
        self.memory.append(states.copy())
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def _predict_qvals(self, states):
        """Return Q-values of shape ``(n, n_actions)`` for ``n`` states.

        ``states`` may be a single state (1-D) or a batch (2-D). Until the
        estimator has been fitted once, zeros are returned instead of
        querying it. Raises ``ValueError`` (from ``reshape``) if the estimator
        does not return ``n * n_actions`` values.
        """
        states = np.asarray(states)
        if states.ndim == 1:
            states = states.reshape(1, -1)
        n_rows = states.shape[0]
        if not self._fitted:
            return np.zeros((n_rows, self.n_actions))
        qvals = np.array(self.estimator.predict(x=states), dtype=np.float64)
        return qvals.reshape(n_rows, self.n_actions)

    # based on https://gist.github.com/EderSantana/c7222daa328f0e885093
    def exp_get_batch(self):
        """Sample a training batch from replay memory.

        Draws ``min(len(memory), batch_size)`` transitions uniformly with
        replacement and builds Bellman targets for them.

        Returns:
            ``(inputs, targets)`` with shapes ``(n, n_states)`` and
            ``(n, n_actions)``. For each row the target equals the current
            Q-value prediction, except in the column of the action taken,
            which is ``reward`` for terminal transitions and
            ``reward + discount * max_a Q(next_state, a)`` otherwise.
            Two empty arrays are returned when nothing can be sampled.
        """
        len_memory = len(self.memory)
        n_examples = min(len_memory, self.batch_size)
        if n_examples <= 0:
            return (np.zeros((0, self.n_states)),
                    np.zeros((0, self.n_actions)))

        picked = np.random.randint(0, len_memory, size=n_examples)
        batch = [self.memory[idx] for idx in picked]

        # As in the original code, state values pass through float32 before
        # being stored in a float64 array.
        inputs = np.zeros((n_examples, self.n_states))
        inputs[:] = np.array([b['state_t'] for b in batch], dtype=np.float32)
        next_states = np.array([np.asarray(b['state_tp1']) for b in batch])

        # Start from the current predictions so that only the column of the
        # action actually taken contributes to the regression error.
        targets = self._predict_qvals(inputs)
        best_next = self._predict_qvals(next_states).max(axis=1)

        actions = np.array([b['action'] for b in batch], dtype=int)
        rewards = np.array([b['reward'] for b in batch], dtype=np.float64)
        terminal = np.array([bool(b['endgame']) for b in batch])

        # Bellman update; terminal transitions have no future value.
        future = np.where(terminal, 0.0, best_next)
        targets[np.arange(n_examples), actions] = rewards + self.discount * future
        return inputs, targets

    def _fit_batch(self, x_batch, y_batch):
        """Run one training step on the batch and return its pre-update MSE."""
        predicted = self._predict_qvals(x_batch)
        loss = float(np.mean((predicted - y_batch) ** 2))
        # steps=1: exactly one optimisation step per call. Without an explicit
        # step count the estimator is documented to train indefinitely.
        self.estimator.fit(x=x_batch, y=y_batch, steps=1)
        self._fitted = True
        return loss

    def _run_episode(self, epoch):
        """Play one episode, training after every step while enabled."""
        state_tp1 = self.env.reset()
        total_loss = 0.
        total_max_qval = 0.

        for t in range(self.n_steps):
            if self.render:
                self.env.render()
            state_t = np.array(state_tp1)

            # Exploration and training both stop after enough consecutive wins.
            learning = self.consec_wins < self.stop_training

            # epsilon-greedy exploration
            if learning and np.random.rand() <= self.eps(epoch):
                action = self.env.action_space.sample()
            else:
                qvals = self._predict_qvals(state_t)
                total_max_qval += np.max(qvals)
                action = int(np.argmax(qvals))

            # take a next step
            state_tp1, reward, endgame, info = self.env.step(action)
            reward = self.exp_process_reward(t, reward, endgame)

            # store experience (reward is scaled down by 10, as before)
            self.exp_save_to_memory({
                'action': action,
                'reward': float(reward)/10,
                'endgame': endgame,
                'state_t': np.array(state_t),
                'state_tp1': np.array(state_tp1),
            })

            if learning:
                x_batch, y_batch = self.exp_get_batch()
                total_loss += self._fit_batch(x_batch, y_batch)

            if endgame:
                if t == self.n_steps-1:
                    # surviving until the very last step counts as a win
                    self.consec_wins += 1
                    print("{:4d}: won!".format(epoch+1))
                else:
                    self.consec_wins = 0
                    n_done = t + 1  # steps actually played; never zero
                    print("{:4d}: lost after {:3d}, cost {:8.4f}, qval {:8.4f}".
                            format(epoch+1,n_done,total_loss/n_done,total_max_qval/n_done))
                break

    def tf_train_model(self):
        """Run ``n_episodes`` episodes of epsilon-greedy Q-learning.

        When ``monitor_file`` is set, the Gym monitor is started before the
        first episode and is closed afterwards even if training raises.
        """
        if self.monitor_file:
            self.env.monitor.start(self.monitor_file,force=True)
        try:
            for epoch in range(self.n_episodes):
                self._run_episode(epoch)
        finally:
            if self.monitor_file:
                self.env.monitor.close()

    def submit_result(self,algo_id,api_key):
        """Upload the monitor recording in ``monitor_file`` via ``gym.upload``."""
        gym.upload(self.monitor_file,
            algorithm_id=algo_id,
            api_key=api_key,
            ignore_open_monitors=False)

In [None]:
# Hyper-parameters for the CartPole-v0 run, collected in one place so they
# are easy to tweak before re-running the cell.
cartpole_params = dict(
    env='CartPole-v0',
    monitor_file=None,  # no Gym monitor recording for this run
    logdir='/tmp/tf/cartpole-256_1.e-3',
    max_memory=40000,
    discount=.90,
    n_episodes=400,
    n_steps=200,
    batch_size=256,
    learning_rate=1.e-3,
    dropout=1.0,
    # explore with probability 0.9 during the first 50 episodes, then act greedily
    exploration=lambda epoch: 0 if epoch >= 50 else 0.9,
    stop_training=10,
)

model = ExperienceQModel(**cartpole_params)

model.tf_train_model()

[2016-07-11 19:21:12,025] Making new env: CartPole-v0
