In [1]:
from keras.layers import Dense
from keras.models import Sequential, load_model
from keras.optimizers import Adam

import numpy as np

import time

import gym

Using TensorFlow backend.


In [2]:
class ReplayBuffer(object):
    """Bounded first-in-first-out store of transitions with random minibatch sampling."""

    def __init__(self, size, minibatch_size):
        """
        Args:
            size (integer): Maximum number of transitions kept in the buffer.
            minibatch_size (integer): Number of transitions returned by sample().
        """
        self.max_size = size
        self.minibatch_size = minibatch_size
        # Dedicated generator so sampling does not touch the global NumPy RNG.
        self.rand_generator = np.random.RandomState()
        self.buffer = []
    #--------------------------------------------------------------------------------

    def append(self, state, action, reward, next_state, done):
        """
        Stores one transition, discarding the oldest one when the buffer is full.

        Args:
            state (Numpy array): The state.
            action (integer): The action.
            reward (float): The reward.
            next_state (Numpy array): The next state.
            done (boolean): True if the next state is terminal, False otherwise.
                            Stored as an integer (True = 1, False = 0).
        """
        transition = [state, action, reward, next_state, int(done)]
        buffer_is_full = self.size() == self.max_size
        if buffer_is_full:
            # Make room by dropping the oldest entry (front of the list).
            del self.buffer[0]
        self.buffer.append(transition)
    #--------------------------------------------------------------------------------

    def sample(self):
        """
        Returns:
            A list of minibatch_size transitions, each laid out as
            [state, action, reward, next_state, done]. Indices are drawn with
            replacement, so the same transition may appear more than once.
        """
        candidate_positions = np.arange(len(self.buffer))
        chosen = self.rand_generator.choice(candidate_positions, size=self.minibatch_size)
        minibatch = []
        for position in chosen:
            minibatch.append(self.buffer[position])
        return minibatch
    #--------------------------------------------------------------------------------

    def size(self):
        """
        Returns:
            Number of transitions currently stored.
        """
        stored = len(self.buffer)
        return stored
    #--------------------------------------------------------------------------------

    def min_full(self):
        """
        Returns:
            True once at least minibatch_size transitions are stored.
        """
        return not (self.size() < self.minibatch_size)

In [3]:
class DeepQNetwork:
    """Fully connected Keras network that maps a state vector to one value per action."""

    def __init__(self, input_size, hidden_size, output_size, lr):
        """
        Args:
            input_size (integer): The number of elements in the input of the network (Number of elements in the state).
            hidden_size (sequence of integers): The number of units in each hidden layer; needs at least one entry.
            output_size (integer): The number of elements in the output (Number of possible actions).
            lr (float): Learning rate handed to the Adam optimizer.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lr = lr

        # Creates the model with the specified values:
        # ReLU hidden layers followed by a linear output layer.
        self.model = Sequential()
        self.model.add(Dense(hidden_size[0], input_dim=self.input_size, activation='relu'))
        for hidden in self.hidden_size[1:]:
            self.model.add(Dense(hidden, activation='relu'))

        self.model.add(Dense(self.output_size, activation='linear'))

        self._compile()

    #--------------------------------------------------------------------------------

    def _compile(self):
        """Compiles self.model with a new Adam optimizer at self.lr and an MSE loss."""
        optimizer = Adam(learning_rate=self.lr)
        self.model.compile(optimizer=optimizer, loss='mse')

    #--------------------------------------------------------------------------------

    def predict(self, states):
        """
        Args:
            states (Numpy array): The states. (n, len(state))
        Returns:
            The action-values (Numpy array) produced by the network's linear output layer. (n, len(q_values))
        """
        q_values = self.model.predict(states)

        return q_values
    #--------------------------------------------------------------------------------

    def train(self, state, target_q):
        """
        Fits the network for one epoch on the given batch.

        Args:
            state (Numpy array): The states. (n, len(state))
            target_q (Numpy array): The target values of the q values (n, len(q_values))
        """
        self.model.fit(state, target_q, epochs=1, verbose=0)

    #--------------------------------------------------------------------------------
    def update_weights(self, new_weights):
        """Overwrites the model's weights with new_weights (as returned by get_weights())."""
        self.model.set_weights(new_weights)

    #--------------------------------------------------------------------------------
    def get_weights(self):
        """Returns the model's current weights."""
        return self.model.get_weights()

    #--------------------------------------------------------------------------------

    def save(self, file_path):
        """Saves the model to file_path."""
        self.model.save(file_path)
    #--------------------------------------------------------------------------------

    def load(self, file_path):
        """
        Replaces self.model with the model stored at file_path and recompiles it
        with a new Adam optimizer using this instance's learning rate.
        """
        self.model = load_model(file_path)
        # Use the stored learning rate: a bare `lr` is not defined in this scope.
        self._compile()
    #--------------------------------------------------------------------------------

    def summary(self):
        """Prints the Keras summary of the model."""
        self.model.summary()

In [4]:
class DDQNAgent(object):
    """
    Double DQN agent with an epsilon-greedy policy and a replay memory.

    Two networks are kept: q_eval selects actions and is the only one trained;
    q_target supplies the value of the selected next action and has its weights
    overwritten with q_eval's whenever the learn() call counter is a multiple
    of replace_target.
    """

    def __init__(self, input_dims, hidden_dims, n_actions, alpha, gamma,
                 epsilon, epsilon_decay, epsilon_min,
                 mem_size, batch_size,
                 replace_target):
        """
        Args:
            input_dims (integer): Number of elements in a state.
            hidden_dims (sequence of integers): Units in each hidden layer of both networks.
            n_actions (integer): Number of possible actions.
            alpha (float): Learning rate passed to both networks.
            gamma (float): Discount factor used in the bootstrapped target.
            epsilon (float): Initial probability of picking a random action.
            epsilon_decay (float): Factor applied to epsilon on every replay update.
            epsilon_min (float): Floor for epsilon.
            mem_size (integer): Capacity of the replay memory.
            batch_size (integer): Number of transitions sampled per replay update.
            replace_target (integer): The target network is synchronised when the
                learn() call counter is a multiple of this value.
        """
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.replace_target = replace_target
        #Creates the memory
        self.memory = ReplayBuffer(mem_size, batch_size)
        #Creates the network responsible for the Action Values
        self.q_eval = DeepQNetwork(input_dims, hidden_dims, n_actions, alpha)
        #Creates the network responsible for the target values used in the model training
        self.q_target = DeepQNetwork(input_dims, hidden_dims, n_actions, alpha)

        # Number of completed learn() calls.
        self.counter = 0

    #-------------------------------------------------------------------------------------
    def remember(self, state, action, reward, next_state, done):
        """Stores one transition in the replay memory."""
        self.memory.append(state, action, reward, next_state, done)

    #-------------------------------------------------------------------------------------
    def policy(self, state):
        """
        Picks an action epsilon-greedily from q_eval.

        Args:
            state (Numpy array): A single state. (len(state),)
        Returns:
            (action, one_hot_action): the action index and its one-hot encoding
            as a float array of length n_actions.
        """
        state = state[np.newaxis, :]
        if np.random.rand() < self.epsilon:
            action = np.random.choice(np.arange(self.n_actions))

        else:
            q_values = self.q_eval.predict(state)
            action = np.argmax(np.squeeze(q_values))

        #Creates the one-hot encoding for the action
        one_hot_action = np.zeros(self.n_actions)
        one_hot_action[action] = 1.

        return action, one_hot_action

    #-------------------------------------------------------------------------------------
    def learn(self, state, action, reward, next_state, done):
        """
        Runs one replay update (once the memory holds a full minibatch), stores
        the new transition, then trains q_eval on that single transition.

        Args:
            state (Numpy array): The state the action was taken in. (len(state),)
            action (Numpy array): One-hot encoding of the action taken.
            reward (float): The reward received for this step.
            next_state (Numpy array): The resulting state. (len(state),)
            done (integer): 1 if next_state is terminal, 0 otherwise.
        """
        if self.memory.min_full():
            self.replay_memory(self.memory.sample())

        self.remember(state, action, reward, next_state, done)

        next_state = next_state[np.newaxis, :]
        state = state[np.newaxis, :]

        q_next = self.q_target.predict(next_state)
        q_eval = self.q_eval.predict(next_state)

        # Start from the current predictions so only the taken action's value changes.
        q_target = self.q_eval.predict(state)

        # Double DQN: q_eval picks the next action, q_target scores it.
        # Index row 0 first so argmax yields a scalar rather than a 1-element array.
        max_action = int(np.argmax(q_eval[0]))
        action = int(np.argmax(action))

        # Uses this step's reward; (1 - done) removes the bootstrap term on terminal steps.
        q_target[0, action] = reward + self.gamma*q_next[0, max_action]*(1 - done)

        self.q_eval.train(state, q_target)

        self.counter += 1

    #-------------------------------------------------------------------------------------
    def replay_memory(self, experiences):
        """
        Trains q_eval on a minibatch, decays epsilon, and synchronises q_target
        when the learn() call counter is a multiple of replace_target.

        Args:
            experiences (list): Transitions laid out as
                [state, one_hot_action, reward, next_state, done].
        """
        states = np.array([elem[0] for elem in experiences])
        actions = np.array([elem[1] for elem in experiences])
        rewards = np.array([elem[2] for elem in experiences])
        next_states = np.array([elem[3] for elem in experiences])
        done = np.array([elem[4] for elem in experiences])

        rows = np.arange(len(experiences))
        # Recover the action index from each one-hot row.
        action_indices = np.dot(actions, np.arange(self.n_actions)).astype(np.int32)

        q_next = self.q_target.predict(next_states)
        q_eval = self.q_eval.predict(next_states)

        # Start from the current predictions so only the taken actions' values change.
        q_target = self.q_eval.predict(states)

        # Double DQN: q_eval picks the next actions, q_target scores them.
        max_actions = np.argmax(q_eval, axis=1).astype(np.int32)

        q_target[rows, action_indices] = rewards + self.gamma*q_next[rows, max_actions]*(1 - done)

        self.q_eval.train(states, q_target)

        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step can never take epsilon below its floor.
            self.epsilon = max(self.epsilon*self.epsilon_decay, self.epsilon_min)

        else:
            self.epsilon = self.epsilon_min

        if (self.counter % self.replace_target == 0):
            self.update_network_parameters(self.q_eval.get_weights())

    #-------------------------------------------------------------------------------------
    def update_network_parameters(self, weights):
        """Overwrites the target network's weights with the given weights."""
        self.q_target.update_weights(weights)

In [5]:
# Environment the agent is trained on; its spaces size the networks.
env = gym.make('LunarLander-v2')

# Network shape: one input per observation element, one output per action.
input_dims = env.observation_space.shape[0]
n_actions = env.action_space.n
hidden_dims = [800, 400]

# Learning rate and discount factor.
alpha, gamma = 1e-3, 0.99

# Exploration schedule; with a decay factor of 1 epsilon stays at its initial value.
epsilon, epsilon_decay, epsilon_min = 0.1, 1, 0.01

# Replay memory capacity and minibatch size.
mem_size, batch_size = 100000, 64

# The target network is synchronised when the agent's learn() counter is a multiple of this.
replace_target = 100

agent = DDQNAgent(
    input_dims=input_dims,
    hidden_dims=hidden_dims,
    n_actions=n_actions,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    mem_size=mem_size,
    batch_size=batch_size,
    replace_target=replace_target,
)



In [6]:
# Training budget and reporting interval (in episodes).
num_episodes = 1000
verbose = 25

# Per-episode totals, used for the periodic averages below.
total_rewards, total_steps = [], []

print("BEGIN\n")

# Episodes scoring above 200 within the current reporting window.
complete = 0

for episode in range(num_episodes):
    steps, rewards = 0, 0
    last_observation = env.reset()
    action, one_hot_action = agent.policy(last_observation)
    done = False

    while not done:
        # Blank the status line, then redraw it in place.
        print("\r                                                                                         ", end="")
        print("\rEpisode: "+str(episode+1)+"\tStep: "+str(steps)+"\tReward: "+str(rewards), end="")

        observation, reward, done, _ = env.step(action)
        agent.learn(last_observation, one_hot_action, reward, observation, int(done))

        # Choose the next action from the state just reached.
        action, one_hot_action = agent.policy(observation)
        last_observation = observation

        steps += 1
        rewards += reward

    total_rewards.append(rewards)
    total_steps.append(steps)

    if rewards > 200:
        complete += 1

    # Only report (and test the stopping rule) at the end of each window.
    if (episode + 1) % verbose != 0:
        continue

    print("\r                                                                                         ", end="")
    print(
        "\rEpisodes: ", episode+1, "/", num_episodes,
        "\n\tTotal reward: ", np.mean(total_rewards[-verbose:]),
        "\n\tNum. steps: ", np.mean(total_steps[-verbose:]),
        "\n\tCompleted: ", complete, "\n--------------------------\n",
    )

    # Stop once at least 80% of the window's episodes scored above 200.
    if complete >= 0.8*verbose:
        break
    complete = 0

print("\nFINISHED")

BEGIN

Episodes:  25 / 1000                                                                     
	Total reward:  -357.2281663837521 
	Num. steps:  113.32 
	Completed:  0 
--------------------------

Episodes:  50 / 1000                                                                     
	Total reward:  -346.61071722086376 
	Num. steps:  117.08 
	Completed:  0 
--------------------------

Episodes:  75 / 1000                                                                     
	Total reward:  -257.9561017251862 
	Num. steps:  101.24 
	Completed:  0 
--------------------------

Episodes:  100 / 1000                                                                    
	Total reward:  -128.81994512743967 
	Num. steps:  111.08 
	Completed:  0 
--------------------------

Episodes:  125 / 1000                                                                    
	Total reward:  -121.120493206263 
	Num. steps:  107.24 
	Completed:  0 
--------------------------

Episodes:  150 / 1000           

In [8]:
# Plays one rendered episode with the agent's current (still epsilon-greedy) policy.
done = False
observation = env.reset()
try:
    while not done:
        # Slow the loop down so the rendered episode can be watched.
        time.sleep(0.05)
        action, _ = agent.policy(observation)
        observation, reward, done, _ = env.step(action)
        env.render()
finally:
    # Close the environment even if the rollout raises or is interrupted.
    env.close()

#### 