In [1]:
import gym
import gym_coverage
import random
import time
import numpy as np
import tensorflow as tf
import torch
from torch.autograd import Variable

from collections import deque

from matplotlib import pyplot as plt

seed = 0
np.random.seed(seed)
random.seed(seed)

In [2]:
class ReplayMemory:
    def __init__(self, memory_size=10000, per_alpha=0.2, per_beta0=0.4):       
        self.memory = SumTree(capacity=memory_size)
        self.memory_size = memory_size
        self.per_alpha = per_alpha
        self.per_beta0 = per_beta0
        self.per_beta = per_beta0
        self.per_epsilon = 1E-6
        self.prio_max = 0
    
    def anneal_per_importance_sampling(self, step, max_step):
        self.per_beta = self.per_beta0 + step*(1-self.per_beta0)/max_step

    def error2priority(self, errors):
        return np.power(np.abs(errors) + self.per_epsilon, self.per_alpha)

    def save_experience(self, state, action, reward, state_next, done):
        experience = (state, action, reward, state_next, done)
        self.memory.add(np.max([self.prio_max, self.per_epsilon]), experience)
        
    def retrieve_experience(self, batch_size):
        idx = None
        priorities = None
        w = None

        idx, priorities, experience = self.memory.sample(batch_size)
        sampling_probabilities = priorities / self.memory.total()
        w = np.power(self.memory.n_entries * sampling_probabilities, -self.per_beta)
        w = w / w.max()
        return idx, priorities, w, experience
    
    def update_experience_weight(self, idx, errors ):
        priorities = self.error2priority(errors)
        for i in range(len(idx)):
            self.memory.update(idx[i], priorities[i])
        self.prio_max = max(priorities.max(), self.prio_max)
        
class SumTree:
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.tree = np.zeros(2*capacity - 1)
        self.data = np.zeros(capacity, dtype=object)

        self.write = 0
        self.n_entries = 0

        self.tree_len = len(self.tree)

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1

        if left >= self.tree_len:
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            right = left + 1
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1

        return idx, self.tree[idx], self.data[data_idx]

    def sample(self, batch_size):
        batch_idx = [None] * batch_size
        batch_priorities = [None] * batch_size
        batch = [None] * batch_size
        segment = self.total() / batch_size

        a = [segment*i for i in range(batch_size)]
        b = [segment * (i+1) for i in range(batch_size)]
        s = np.random.uniform(a, b)

        for i in range(batch_size):
            (batch_idx[i], batch_priorities[i], batch[i]) = self.get(s[i])

        return batch_idx, batch_priorities, batch

In [3]:
class DQNAgent:
    def __init__(self, obs_dim, n_action, seed=0,
                 discount_factor = 0.995, epsilon_decay = 0.999, epsilon_min = 0.01,
                 learning_rate = 1e-3, # STEP SIZE
                 batch_size = 64, 
                 memory_size = 10000, hidden_unit_size = 64,
                 target_mode = 'DDQN', memory_mode = 'PER'):
        self.seed = seed 
        self.obs_dim = obs_dim
        self.n_action = n_action

        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.epsilon = 1.0
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.train_start = 5000

        self.target_mode = target_mode
        self.memory_mode = memory_mode
        if memory_mode == 'PER':
            self.memory = ReplayMemory(memory_size=memory_size)
        else:
            self.memory = deque(maxlen=memory_size)
            
        self.hidden_unit_size = hidden_unit_size
        
        self.build_model()
        #self.build_loss()
        #self.build_update_operation()   
    
        #self.batch_wegihts_ph = 
        #self.learning_rate_ph =
    
    def build_model(self):
        hid1_size = self.hidden_unit_size  # 10 empirically determined
        hid2_size = self.hidden_unit_size
        
        self.fc1 = torch.nn.Linear(self.obs_dim, hid1_size, bias=True)
        self.fc2 = torch.nn.Linear(hid1_size, hid2_size, bias=True)
        self.fc3 = torch.nn.Linear(hid2_size, self.n_action, bias=True)
        tanh = torch.nn.Tanh()

        self.q_predict = torch.nn.Sequential(self.fc1, tanh, self.fc2, tanh, self.fc3)
        
        self.fc1_old = torch.nn.Linear(self.obs_dim, hid1_size, bias=True)
        self.fc2_old = torch.nn.Linear(hid1_size, hid2_size, bias=True)
        self.fc3_old = torch.nn.Linear(hid2_size, self.n_action, bias=True)

        self.q_predict_old = torch.nn.Sequential(self.fc1_old, tanh, self.fc2_old, tanh, self.fc3_old)
    '''
    def build_loss(self):
            
    def build_update_operation(self):
    
    def save_model(self, path):
        
    def restore_model(self, path):
    '''    
    def update_target(self):
        self.fc1_old.weight = self.fc1.weight
        self.fc1_old.bias = self.fc1.bias
        self.fc2_old.weight = self.fc2.weight
        self.fc2_old.bias = self.fc2.bias
        self.fc3_old.weight = self.fc3.weight
        self.fc3_old.bias = self.fc3.bias
        
    def update_memory(self, step, max_step):
        if self.memory_mode == 'PER':
            self.memory.anneal_per_importance_sampling(step, max_step)
        
    def update_policy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def get_prediction_old(self, obs): 
        obs = torch.from_numpy(obs).float()
        q_value_old = self.q_predict_old(Variable(obs))
        return q_value_old.float()
        
    def get_prediction(self, obs):
        obs = torch.from_numpy(obs).float()
        q_value = self.q_predict(Variable(obs))            
        return q_value.float()
    
    def get_action(self, obs):
        if np.random.rand() <= self.epsilon:
            #return random.randint(0, 3)
            return random.randrange(self.n_action)
        else:
            q_value = self.get_prediction(obs).data.numpy()
            return np.argmax(q_value[0])
    
    def add_experience(self, obs, action, reward, next_obs, done):
        if self.memory_mode == 'PER':
            self.memory.save_experience(obs, action, reward, next_obs, done)
        else:
            self.memory.append((obs, action, reward, next_obs, done))

    def train_model(self):
        output = np.nan
        
        if self.memory_mode == 'PER':
            n_entries = self.memory.memory.n_entries
        else:
            n_entries = len(self.memory)
            
        if n_entries > self.train_start:
            
            if self.memory_mode == 'PER':
                # PRIORITIZED EXPERIENCE REPLAY
                idx, priorities, w, mini_batch = self.memory.retrieve_experience(self.batch_size)
                batch_weights = np.transpose(np.tile(w, (self.n_action, 1)))
            else:
                mini_batch = random.sample(self.memory, self.batch_size)
                batch_weights = np.ones((self.batch_size,self.n_action))

            observations = np.zeros((self.batch_size, self.obs_dim))
            next_observations = np.zeros((self.batch_size, self.obs_dim))
            actions, rewards, dones = [], [], []

            for i in range(self.batch_size):
                observations[i] = mini_batch[i][0]
                actions.append(mini_batch[i][1])
                rewards.append(mini_batch[i][2])
                next_observations[i] = mini_batch[i][3]
                dones.append(mini_batch[i][4])
            
            target = self.get_prediction(observations)
            target = target.data.numpy()
            if self.target_mode == 'DDQN':
                bast_a = np.argmax(self.get_prediction(next_observations), axis=1)
            next_q_value = self.get_prediction_old(next_observations)
            next_q_value = next_q_value.data.numpy()

            # BELLMAN UPDATE RULE 
            for i in range(self.batch_size):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    if self.target_mode == 'DDQN':
                        target[i][actions[i]] = rewards[i] + self.discount_factor * next_q_value[i][bast_a[i]]
                    else:
                        target[i][actions[i]] = rewards[i] + self.discount_factor * (np.amax(next_q_value[i]))

            q_predict = self.get_prediction(observations)
            target = torch.from_numpy(target).float()
            errors = target - q_predict.data
            loss = torch.nn.MSELoss()
            output = 0.5 * loss(q_predict, Variable(target))
            optimizer = torch.optim.Adam(self.q_predict.parameters(), lr=self.learning_rate)
            optimizer.zero_grad()
            output.backward()
            optimizer.step()
            
            errors = errors.numpy()
            errors = errors[np.arange(len(errors)), actions]
            
            if self.memory_mode == 'PER':
                # PRIORITIZED EXPERIENCE REPLAY
                self.memory.update_experience_weight(idx, errors)
                
            output = output.data.numpy()    
            
        return output

In [4]:
env = gym.make('Coverage-v0')
env.seed(seed)
N_F_pos = env.observation_space.spaces[0].n
N_F_map = env.observation_space.spaces[1].shape[0] * env.observation_space.spaces[1].shape[1]
N_F = N_F_pos + N_F_map
N_A = env.action_space.n
agent = DQNAgent(N_F, N_A, epsilon_decay= 0.999995, discount_factor=0.999, memory_mode='PER',target_mode='DQN')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
max_t = 100
avg_return_list = deque(maxlen=10)
avg_loss_list = deque(maxlen=10)
for i in range(300000):
    obs = env.reset()
    obs = np.concatenate((np.eye(16)[obs[0]], obs[1].flatten()))
    done = False
    total_reward = 0
    total_loss = 0
    for t in range(max_t):
        action = agent.get_action(np.reshape(obs, (1, -1)))
        next_obs, reward, done, info = env.step(action)
        next_obs = np.concatenate((np.eye(16)[next_obs[0]], next_obs[1].flatten()))
        agent.add_experience(obs, action, reward, next_obs, done)
        
        loss = agent.train_model()
        agent.update_memory(t, max_t)
        agent.update_policy()
        
        #if i % 10000 == 0:
        #    env.render()
        #    time.sleep(0.5)
        obs = next_obs
        total_reward += reward
        total_loss += loss
        if done:
            break
            
    agent.update_target()
    avg_return_list.append(total_reward)
    avg_loss_list.append(total_loss)
    
    if (i%1000)==0:
        print('{} loss : {:.3f}, return : {:.3f}, eps : {:.3f}'.format(i, np.mean(avg_loss_list), np.mean(avg_return_list), agent.epsilon))

0 loss : nan, return : -1001.000, eps : 1.000
1000 loss : nan, return : -1004.400, eps : 0.978
2000 loss : 174995.531, return : -1003.400, eps : 0.957
3000 loss : 37574.988, return : -1001.300, eps : 0.937
4000 loss : 10060.311, return : -1002.700, eps : 0.917
5000 loss : 0.954, return : -1002.700, eps : 0.897
6000 loss : 0.012, return : -1002.500, eps : 0.875
7000 loss : 0.010, return : -1002.100, eps : 0.855
8000 loss : 0.017, return : -1004.900, eps : 0.834
9000 loss : 0.020, return : -1005.100, eps : 0.813
10000 loss : 0.034, return : -1005.300, eps : 0.793
11000 loss : 0.060, return : -1004.300, eps : 0.774
12000 loss : 0.149, return : -1005.300, eps : 0.753
13000 loss : 0.310, return : -1008.800, eps : 0.731
14000 loss : 15.914, return : -1009.500, eps : 0.703
15000 loss : 794.355, return : -1008.900, eps : 0.669
full cover
16000 loss : 135715.078, return : -1014.300, eps : 0.629
full cover
full cover
17000 loss : 171657.250, return : -1011.700, eps : 0.588
full cover
18000 loss 

KeyboardInterrupt: 

In [None]:
for i in range(100):
    obs = env.reset()
    obs = np.concatenate((np.eye(16)[obs[0]], obs[1].flatten()))
    done = False
    total_reward = 0
    total_loss = 0
    env.render()
    time.sleep(1.0)
    for t in range(max_t):
        action = agent.get_action(np.reshape(obs, (1, -1)))
        next_obs, reward, done, info = env.step(action)
        next_obs = np.concatenate((np.eye(16)[next_obs[0]], next_obs[1].flatten()))
        
        if i % 10000 == 0:
            env.render()
            time.sleep(1.0)
        obs = next_obs
        if done:
            break

4
5
6
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
14
10
