In [1]:
import gym
import gym_coverage
import random
import numpy as np
import tensorflow as tf

from collections import deque

from matplotlib import pyplot as plt

seed = 0
np.random.seed(seed)
random.seed(seed)

In [2]:
class ReplayMemory:
    def __init__(self, memory_size=10000, per_alpha=0.2, per_beta0=0.4):       
        self.memory = SumTree(capacity=memory_size)
        self.memory_size = memory_size
        self.per_alpha = per_alpha
        self.per_beta0 = per_beta0
        self.per_beta = per_beta0
        self.per_epsilon = 1E-6
        self.prio_max = 0
    
    def anneal_per_importance_sampling(self, step, max_step):
        self.per_beta = self.per_beta0 + step*(1-self.per_beta0)/max_step

    def error2priority(self, errors):
        return np.power(np.abs(errors) + self.per_epsilon, self.per_alpha)

    def save_experience(self, state, action, reward, state_next, done):
        experience = (state, action, reward, state_next, done)
        self.memory.add(np.max([self.prio_max, self.per_epsilon]), experience)
        
    def retrieve_experience(self, batch_size):
        idx = None
        priorities = None
        w = None

        idx, priorities, experience = self.memory.sample(batch_size)
        sampling_probabilities = priorities / self.memory.total()
        w = np.power(self.memory.n_entries * sampling_probabilities, -self.per_beta)
        w = w / w.max()
        return idx, priorities, w, experience
    
    def update_experience_weight(self, idx, errors):
        priorities = self.error2priority(errors)
        for i in range(len(idx)):
            self.memory.update(idx[i], priorities[i])
        self.prio_max = max(priorities.max(), self.prio_max)
        
class SumTree:
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.tree = np.zeros(2*capacity - 1)
        self.data = np.zeros(capacity, dtype=object)

        self.write = 0
        self.n_entries = 0

        self.tree_len = len(self.tree)

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1

        if left >= self.tree_len:
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            right = left + 1
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1

        return idx, self.tree[idx], self.data[data_idx]

    def sample(self, batch_size):
        batch_idx = [None] * batch_size
        batch_priorities = [None] * batch_size
        batch = [None] * batch_size
        segment = self.total() / batch_size

        a = [segment*i for i in range(batch_size)]
        b = [segment * (i+1) for i in range(batch_size)]
        s = np.random.uniform(a, b)

        for i in range(batch_size):
            (batch_idx[i], batch_priorities[i], batch[i]) = self.get(s[i])

        return batch_idx, batch_priorities, batch

In [3]:
class DQNAgent:
    def __init__(self, obs_dim, n_action, seed=0,
                 discount_factor = 0.995, epsilon_decay = 0.999, epsilon_min = 0.01,
                 learning_rate = 1e-3, # STEP SIZE
                 batch_size = 64, 
                 memory_size = 10000, hidden_unit_size = 64,
                 target_mode = 'DDQN', memory_mode = 'PER'):
        self.seed = seed 
        self.obs_dim = obs_dim
        self.n_action = n_action

        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.epsilon = 1.0
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.train_start = 5000

        self.target_mode = target_mode
        self.memory_mode = memory_mode
        if memory_mode == 'PER':
            self.memory = ReplayMemory(memory_size=memory_size)
        else:
            self.memory = deque(maxlen=memory_size)
            
        self.hidden_unit_size = hidden_unit_size
        
        self.g = tf.Graph()
        with self.g.as_default():
            self.build_placeholders()
            self.build_model()
            self.build_loss()
            self.build_update_operation()
            self.init_session()        
    
    def build_placeholders(self):
        self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')
        self.target_ph = tf.placeholder(tf.float32, (None, self.n_action), 'target')
        self.batch_weights_ph = tf.placeholder(tf.float32,(None, self.n_action), name="batch_weights")
        self.learning_rate_ph = tf.placeholder(tf.float32, (), 'lr')        
    
    def build_model(self):
        hid1_size = self.hidden_unit_size  # 10 empirically determined
        hid2_size = self.hidden_unit_size
        
        with tf.variable_scope('q_func'):
            out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='hidden1')
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='hidden2')
            self.q_predict = tf.layers.dense(out, self.n_action,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='q_predict')
                        
        with tf.variable_scope('q_func_old'):
            out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='hidden1')
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='hidden2')
            self.q_predict_old = tf.layers.dense(out, self.n_action,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed=self.seed), name='q_predict')
        
        self.weights = tf.trainable_variables(scope='q_func')
        self.weights_old = tf.trainable_variables(scope='q_func_old')

    def build_loss(self):
        self.errors = self.target_ph - self.q_predict
        self.loss = 0.5*tf.reduce_mean(tf.square(self.target_ph - self.q_predict))
        self.optim = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph).minimize(self.loss)
            
    def build_update_operation(self):
        update_ops = []
        for var, var_old in zip(self.weights, self.weights_old):
            update_ops.append(var_old.assign(var))
        self.update_ops = update_ops
        
    def init_session(self):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config,graph=self.g)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.update_ops)
        self.saver = tf.train.Saver()
        
    def save_model(self, path):
        self.saver.save(self.sess, path)
        #with self.sess as sess:
        #    tf.train.Saver().save(sess, "model/coverage_6_model.ckpt")
        print("Model saved.")
        
    def restore_model(self, path):
        self.saver.restore(self.sess, path)
        #with self.sess as sess:
        #    tf.train.Saver().restore(sess, "model/coverage_6_model.ckpt")
        print("Model restored.")    
        
    def update_target(self):
        self.sess.run(self.update_ops)
    
    def update_memory(self, step, max_step):
        if self.memory_mode == 'PER':
            self.memory.anneal_per_importance_sampling(step, max_step)
        
    def update_policy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
    def get_prediction_old(self, obs): 
        q_value_old = self.sess.run(self.q_predict_old, feed_dict={self.obs_ph:obs})        
        return q_value_old
        
    def get_prediction(self, obs):
        q_value = self.sess.run(self.q_predict, feed_dict={self.obs_ph:obs})        
        return q_value
        
    def get_action(self, obs):
        if np.random.rand() <= self.epsilon:
            #return random.randint(0, 3)
            return random.randint(0, self.n_action - 1)
        else:
            q_value = self.get_prediction(obs)
            return np.argmax(q_value[0])

    def add_experience(self, obs, action, reward, next_obs, done):
        if self.memory_mode == 'PER':
            self.memory.save_experience(obs, action, reward, next_obs, done)
        else:
            self.memory.append((obs, action, reward, next_obs, done))

    def train_model(self):
        loss = np.nan
        
        if self.memory_mode == 'PER':
            n_entries = self.memory.memory.n_entries
        else:
            n_entries = len(self.memory)
            
        if n_entries > self.train_start:
            
            if self.memory_mode == 'PER':
                # PRIORITIZED EXPERIENCE REPLAY
                idx, priorities, w, mini_batch = self.memory.retrieve_experience(self.batch_size)
                batch_weights = np.transpose(np.tile(w, (self.n_action, 1)))
            else:
                mini_batch = random.sample(self.memory, self.batch_size)
                batch_weights = np.ones((self.batch_size,self.n_action))

            observations = np.zeros((self.batch_size, self.obs_dim))
            next_observations = np.zeros((self.batch_size, self.obs_dim))
            actions, rewards, dones = [], [], []

            for i in range(self.batch_size):
                observations[i] = mini_batch[i][0]
                actions.append(mini_batch[i][1])
                rewards.append(mini_batch[i][2])
                next_observations[i] = mini_batch[i][3]
                dones.append(mini_batch[i][4])

            target = self.get_prediction(observations)
            if self.target_mode == 'DDQN':
                bast_a = np.argmax(self.get_prediction(next_observations),axis=1)
            next_q_value = self.get_prediction_old(next_observations)

            # BELLMAN UPDATE RULE 
            for i in range(self.batch_size):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    if self.target_mode == 'DDQN':
                        target[i][actions[i]] = rewards[i] + self.discount_factor * next_q_value[i][bast_a[i]]
                    else:
                        target[i][actions[i]] = rewards[i] + self.discount_factor * (np.amax(next_q_value[i]))

            loss, errors, _ = self.sess.run([self.loss, self.errors, self.optim], 
                                 feed_dict={self.obs_ph:observations,self.target_ph:target,self.learning_rate_ph:self.learning_rate,self.batch_weights_ph:batch_weights})
            errors = errors[np.arange(len(errors)), actions]
            
            if self.memory_mode == 'PER':
                # PRIORITIZED EXPERIENCE REPLAY
                self.memory.update_experience_weight(idx, errors)
            
        return loss

In [4]:
env = gym.make('CoverageCount-v0')
env.seed(seed)
N_F_pos = env.observation_space.spaces[0].n
N_F_map = env.observation_space.spaces[1].shape[0] * env.observation_space.spaces[1].shape[1]
N_F = N_F_pos + N_F_map
N_A = env.action_space.n
agent = DQNAgent(N_F, N_A, epsilon_decay= 0.999999, memory_mode='PER', target_mode='DQN', batch_size=64)
#agent.restore_model(path="model/coverage_6_model_new.ckpt")

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m


In [7]:
max_t = 80
import time
avg_return_list = deque(maxlen=10)
avg_loss_list = deque(maxlen=10)
for i in range(100000):
    obs = env.reset()
    obs = np.concatenate((np.eye(36)[obs[0]], obs[1].flatten()))
    done = False
    total_reward = 0
    total_loss = 0
    for t in range(max_t):
        action = agent.get_action(np.reshape(obs, (1, -1)))
        next_obs, reward, done, info = env.step(action, count=False)
        next_obs = np.concatenate((np.eye(36)[next_obs[0]], next_obs[1].flatten()))
        agent.add_experience(obs, action, reward, next_obs, done)
        
        loss = agent.train_model()
        agent.update_memory(t, max_t)
        agent.update_policy()
        
        #env.render()
        #time.sleep(0.5)
        #if i % 10000 == 0:
        #    env.render()
        #    time.sleep(0.5)
        obs = next_obs
        total_reward += reward
        total_loss += loss
        if done:
            break
            
    agent.update_target()
    avg_return_list.append(total_reward)
    avg_loss_list.append(total_loss)
    
    if (i%1000)==0:
        print('{} loss : {:.3f}, return : {:.3f}, eps : {:.3f}'.format(i, np.mean(avg_loss_list), np.mean(avg_return_list), agent.epsilon))

0 loss : 60.178, return : -1011.000, eps : 0.290
1000 loss : 185.450, return : -915.200, eps : 0.283
2000 loss : 75.601, return : -1026.500, eps : 0.277
3000 loss : 221.657, return : -927.500, eps : 0.271
4000 loss : 33.357, return : -1014.500, eps : 0.265
5000 loss : 55.089, return : -1014.200, eps : 0.259
6000 loss : 675.927, return : -1026.300, eps : 0.253
7000 loss : 77.757, return : -1021.800, eps : 0.247
8000 loss : 172.773, return : -1033.000, eps : 0.240
9000 loss : 91.850, return : -1020.900, eps : 0.234
10000 loss : 51.412, return : -920.300, eps : 0.228
11000 loss : 21.913, return : -1015.200, eps : 0.223
12000 loss : 84.253, return : -1028.600, eps : 0.216
13000 loss : 71.313, return : -931.700, eps : 0.211
14000 loss : 122.164, return : -1018.800, eps : 0.205
15000 loss : 223.821, return : -1025.700, eps : 0.200
16000 loss : 570.240, return : -932.600, eps : 0.195
17000 loss : 237.144, return : -927.700, eps : 0.189
18000 loss : 199.520, return : -650.500, eps : 0.183
1900

In [6]:
#agent.save_model(path="model/coverage_6_model_new.ckpt")
max_t = 80
for i in range(100):
    obs = env.reset()
    obs = np.concatenate((np.eye(36)[obs[0]], obs[1].flatten()))
    done = False
    total_reward = 0
    env.render()
    time.sleep(1.1)
    for t in range(max_t):
        action = agent.get_action(np.reshape(obs, (1, -1)))
        next_obs, reward, done, info = env.step(action)
        next_obs = np.concatenate((np.eye(36)[next_obs[0]], next_obs[1].flatten()))
        
        obs = next_obs
        total_reward += reward
        env.render()
        time.sleep(1.1)
        if done:
            break
    print(total_reward)        

-1016.0
-1002.0
-1002.0
-1031.0
-80
-1058.0
-1000.0
-1004.0
-1001.0
-1066.0
-1003.0
-1007.0
-1011.0
-1009.0
-1046.0
-1031.0
-1012.0
-1001.0
-1032.0
-1047.0
-1043.0
-1016.0
-1018.0
-1033.0
-1005.0
-1004.0
-1013.0
-1012.0
-1006.0
-1052.0
-1010.0
-1028.0
-1012.0
-1016.0
-1007.0
-1008.0
-1001.0
-1015.0
-1018.0
-1060.0
-1033.0
-1054.0
-1013.0
-1000.0
-1051.0
-1032.0
-1024.0
-1020.0
-1002.0
-1005.0
-1049.0
-1017.0
-1019.0
-1011.0
-1036.0
-1016.0
-1023.0
-1010.0
-1034.0
-1032.0
-1029.0
-1007.0
-1013.0
-1005.0
-1065.0
-1013.0
-1055.0
-1028.0
-1003.0
-80
-1005.0
-1018.0
-1046.0
-1008.0
-1018.0
-1053.0
-1019.0
-1010.0
-1030.0
-1005.0
-1009.0
-80
-1005.0
-1030.0
-80
-1028.0
-1014.0
-1011.0
-1054.0
-1056.0
-1010.0
-1002.0
-1017.0
-1003.0
-1008.0
-1002.0
-1014.0
-1018.0
-1017.0
-1042.0
