In [1]:
import numpy as np
import tensorflow as tf
import gym
import gym_coverage

# Superparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 50000
DISPLAY_REWARD_THRESHOLD = -10  # renders environment if total episode reward is greater then this threshold
MAX_EP_STEPS = 500 # maximum time step in one episode
RENDER = False # rendering wastes time
GAMMA = 0.9     # reward discount in TD error
LR = 0.0001    # learning rate
MEMORY_SIZE = 10000
BATCH_SIZE = 1000

env = gym.make('Coverage-v0')
env.seed(1)  # reproducible
np.random.seed(1)
tf.set_random_seed(1)  # reproducible

N_F_pos = env.observation_space.spaces[0].n
N_F_map = env.observation_space.spaces[1].shape[0] * env.observation_space.spaces[1].shape[1]
N_F = N_F_pos + N_F_map
N_A = env.action_space.n

In [2]:
class ExpReplay(object):
    def __init__(self, memory_size, state_dim, act_dim, batch_size):
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        self.actions = np.empty(self.memory_size, dtype=np.float32)
        self.rewards = np.empty(self.memory_size, dtype=np.float32)
        self.next_states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        self.count = 0
        self.current = 0
    def fifo(self, state, action, reward, next_state):
        self.states[self.current] = state
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.next_states[self.current] = next_state
        self.current = (self.current + 1) % self.memory_size
        self.count = self.count + 1
    def add_trajectory(self, states, actions, rewards, next_states):
        num = len(observes)
        for i in range(0, num):
            self.fifo(states[i], actions[i], rewards[i], next_states[i])
    def sampling(self):
        indexes = np.random.randint(min(self.count, self.memory_size), size=self.batch_size)
        states = self.states[indexes]
        actions = self.actions[indexes]
        rewards = self.rewards[indexes]
        next_states = self.next_states[indexes]
        states = states.reshape((-1, self.state_dim))
        actions = actions.reshape((-1, 1))
        rewards = rewards.reshape((-1, 1))
        next_states = next_states.reshape((-1, self.state_dim))
        return states, actions, rewards, next_states

In [3]:
class QNET(object):
    def __init__(self, sess, n_features, n_actions, lr=0.01):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, (None, n_features), "state")
        self.r = tf.placeholder(tf.float32, (None, 1), 'reward')
        self.q_ = tf.placeholder(tf.float32, (None, 1), 'q_next')
        self.td_error = tf.placeholder(tf.float32, (None, 1), 'td_error')

        with tf.variable_scope('Q'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=32,  # number of hidden units
                activation=tf.nn.tanh,  # None
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                # bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )
            
            l2 = tf.layers.dense(
                inputs=l1,
                units=16,  # number of hidden units
                activation=tf.nn.tanh,  # None
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                # bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l2'
            )

            self.q = tf.layers.dense(
                inputs=l2,
                units=n_actions,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                # bias_initializer=tf.constant_initializer(0.1),  # biases
                name='Q'
            )
        
        with tf.variable_scope('squared_TD_error'):
            #self.td_error = self.r + GAMMA * self.q_ - self.q[a]
            self.loss = tf.reduce_sum(tf.square(self.td_error))    # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
    '''
    def learn(self, s, a, r, s_):
        BATCH_SIZE = 200
        NITER = int(s.shape[0]/BATCH_SIZE)
        td_error = self.get_error(s, a, r, s_)
        for i in range(0, NITER):
            _ = self.sess.run(self.train_op, {self.td_error: td_error[i*BATCH_SIZE:(i+1)*BATCH_SIZE]})
        return 0
    '''
    def get_error(self, s, a, r, s_):
        q_ = self.sess.run(self.q, {self.s: s_})
        q = self.sess.run(self.q, {self.s: s})
        td_error = r + gamma * np.amax(q_, axis=1) - q[a]
        BATCH_SIZE = 200
        NITER = int(s.shape[0]/BATCH_SIZE)
        for i in range(0, NITER):
            _ = self.sess.run(self.train_op, {self.td_error: td_error[i*BATCH_SIZE:(i+1)*BATCH_SIZE]})
        return 0
    
    def get_actions(self, s):
        q = self.sess.run(self.q, {self.s: s})
        action = np.argmax(q, axis=1)
        return action

In [5]:
sess = tf.Session()

qnet = QNET(sess, n_features=N_F, n_actions=N_A, lr=LR)     # we need a good teacher, so the teacher should learn faster than the actor
replay = ExpReplay(MEMORY_SIZE, N_F, N_A)
sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    s = np.concatenate((np.eye(100)[s[0]], s[1].flatten()))
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = qnet.get_actions(s)
        s_, r, done, info = env.step(a)
        s_ = np.concatenate((np.eye(100)[s_[0]], s_[1].flatten()))
        track_r.append(r)
       
        replay.fifo(s, a, r, s_)
        s = s_
        t += 1
        if t >= MAX_EP_STEPS: #done or
            ep_rs_sum = sum(track_r)
            running_reward = ep_rs_sum
            '''
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            '''
            print("episode:", i_episode, "  reward:", int(running_reward))
            break
        
    if (i_episode + 1)%10 == 0:
        states, actions, rewards, next_states = replay.sampling()
        qnet.learn(states, actions, next_states, td_errors)

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'Q/l1/kernel:0' shape=(200, 32) dtype=float32_ref>", "<tf.Variable 'Q/l1/bias:0' shape=(32,) dtype=float32_ref>", "<tf.Variable 'Q/l2/kernel:0' shape=(32, 16) dtype=float32_ref>", "<tf.Variable 'Q/l2/bias:0' shape=(16,) dtype=float32_ref>", "<tf.Variable 'Q/Q/kernel:0' shape=(16, 4) dtype=float32_ref>", "<tf.Variable 'Q/Q/bias:0' shape=(4,) dtype=float32_ref>"] and loss Tensor("squared_TD_error/Sum:0", shape=(), dtype=float32).

In [9]:
for i_episode in range(0, 200):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(action_map[a, :])
        s = s_
        t += 1
        if t >= MAX_EP_STEPS: #done or
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

('episode:', 0, '  reward:', 0)
('episode:', 1, '  reward:', 0)
('episode:', 2, '  reward:', 0)
('episode:', 3, '  reward:', 0)
('episode:', 4, '  reward:', 0)
('episode:', 5, '  reward:', 0)
('episode:', 6, '  reward:', 0)
('episode:', 7, '  reward:', 0)
('episode:', 8, '  reward:', 0)
('episode:', 9, '  reward:', 0)
('episode:', 10, '  reward:', 0)
('episode:', 11, '  reward:', 0)
('episode:', 12, '  reward:', 0)
('episode:', 13, '  reward:', 0)
('episode:', 14, '  reward:', 0)
('episode:', 15, '  reward:', 0)
('episode:', 16, '  reward:', 0)
('episode:', 17, '  reward:', 0)
('episode:', 18, '  reward:', 0)
('episode:', 19, '  reward:', 0)
('episode:', 20, '  reward:', 0)
('episode:', 21, '  reward:', 0)
('episode:', 22, '  reward:', 0)
('episode:', 23, '  reward:', 0)
('episode:', 24, '  reward:', 0)
('episode:', 25, '  reward:', 0)
('episode:', 26, '  reward:', 0)
('episode:', 27, '  reward:', 0)
('episode:', 28, '  reward:', 0)
('episode:', 29, '  reward:', 0)
('episode:', 30, '  