In [1]:
#from tensorflow.keras import backend as K
#from tensorflow.keras.layers import Activation, Dense, Input
#from tensorflow.keras.models import Model
#from tensorflow.keras.optimizers import Adam

#from tensorflow import config
#config.experimental_run_functions_eagerly(True)

from keras import backend as K
from keras.layers import Activation, Dense, Input
from keras.models import Model
from keras.optimizers import Adam

import numpy as np

import gym

Using TensorFlow backend.


In [2]:
class Agent(object):
    def __init__(self, input_dims, layer1_size, layer2_size, n_actions, alpha, beta, gamma=0.99):
        '''
        Inputs:
            input_dims(int): Observation dimension
            layer1_size(int): Number of elements in the first hidden layer
            layer2_size(int): Number of elements in the second hidden layer
            n_actions(int): Number of possible discrete actions
            alpha(float): Actor learning rate
            beta(float): Critic learning rate
            gamma(float): Discount factor
        '''
        self.input_dims = input_dims
        self.fc1_size = layer1_size
        self.fc2_size = layer2_size
        self.n_actions = n_actions
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        
        self.actor, self.critic, self.policy = self.build_actor_critic_network()
        self.action_space = [i for i in range(self.n_actions)]
    
    #----------------------------------------------------------------------------------------
    def build_actor_critic_network(self):
        #The actor and the critic will share the same network, differing only in the output
        input_layer = Input(shape=(self.input_dims,))
        delta = Input(shape=[1])
        dense1 = Dense(self.fc1_size, activation='relu')(input_layer)
        dense2 = Dense(self.fc2_size, activation='relu')(dense1)
        
        #Actor output - Probabilities of chosing each action on a given state
        probs = Dense(self.n_actions, activation='softmax')(dense2)
        
        #Critic output - State value of a given state
        values = Dense(1, activation='linear')(dense2)
    
        #To implement a custom loss function in Keras it takes as input the label and the values and return a function
        #    and must be implemented inside the function it is used
        def custom_loss(y_true, y_pred):
            '''
            Inputs:
                y_true(one hot array)(1, n_actions): The action the agent actually took (the label used in the backprop.)
                y_pred(one hot array)(1, n_actions: The output of the neural network
            '''
            #Since the loss function will be calculating the log of y_pred it can not be zero in any position
            out = K.clip(y_pred, 1e-8, 1-1e-8)
            #Since the y_true is a one hot array, the value will change only in the selected action
            log_lik = y_true * K.log(out)

            #delta - Calculated in the learning function - Related to the output of the Critic network
            return K.sum(-log_lik * delta)
    
        actor = Model(inputs=[input_layer, delta], outputs=[probs])
        actor.compile(optimizer=Adam(lr=self.alpha), loss=custom_loss)
        
        critic = Model(inputs=[input_layer], outputs=[values])
        critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')
        
        policy = Model(inputs=[input_layer], outputs=[probs])
        
        return actor, critic, policy
    
    #----------------------------------------------------------------------------------------
    def choose_action(self, observation, greedy=False):
        '''
        Inputs:
            observation(array)(1, input_dims): Observations returned by the envirorment
        '''
        #Add one dimension to the observation
        state = observation[np.newaxis, :] #(1, 1, input_dims)
        #Calculate the probabilities of an action given a state
        probabilities = self.policy.predict(state)[0]
        #If the policy is greedy the agent choose the max valued action
        if greedy:
            action = np.argmax(probabilities)
            return action
        
        #Select an action based on the probabilities of each action
        action = np.random.choice(self.action_space, p=probabilities)
        
        return action
    
    #----------------------------------------------------------------------------------------
    def learn(self, state, action, reward, next_state, done):
        '''
        Inputs:
            state(array)(1, input_dims): Current state of the agent
            action(int): Action chosed by the agent
            reward(float): Reward recieved after selection the action in the state
            next_state(array)(1, input_dims): State after selectin action in state
            done(bool): If the episode ended or not
        '''
        #Add one dimension to the state
        state = state[np.newaxis, :] #(1, 1, input_dims)
        #Add one dimension to the next_state
        next_state = next_state[np.newaxis, :] #(1, 1, input_dims)
        
        #Feed the state to the critic so it can output its value 
        critic_value = self.critic.predict(state)
        #Feed the next state to the critic so it can output its value 
        critic_next_value = self.critic.predict(next_state)
        
        #Calculate the TD error
        target = reward + self.gamma * critic_next_value * (1 - int(done))
        delta = target - critic_value
        
        #Generate the one hot encoding for the action
        actions = np.zeros([1, self.n_actions])
        actions[np.arange(1), action] = 1.0
        
        #Train the actor model with the state and the delta as inputs and actions as label
        #    the delta is using in the custom loss function
        self.actor.fit([state, delta], actions, verbose=0)
        
        #Train the critic model with the state as the input and the target as label
        self.critic.fit(state, target, verbose=0)

In [None]:
env = gym.make('LunarLander-v2')
scores_history = []
steps_history = []
num_episodes = 3000
verbose = 100

input_dims = env.observation_space.shape[0]
layer1_size = 1024
layer2_size = 512
n_actions = env.action_space.n
alpha = 1e-5
beta = 5e-5

agent = Agent(input_dims, layer1_size, layer2_size, n_actions, alpha, beta)

print("BEGIN\n")
complete = 0

for episode in range(num_episodes):
    done = False
    score = 0
    steps = 0
    observation = env.reset()
    
    while not done:
        print("\r                                                                                         ", end="")
        print("\rEpisode: "+str(episode+1)+"\tStep: "+str(steps)+"\tReward: "+str(score), end="")
        action = agent.choose_action(observation)
        new_observation, reward, done, _ = env.step(action)
        agent.learn(observation, action,reward, new_observation, done)
        observation = new_observation
        score += reward
        steps += 1
    
    if(score >= 200):
        complete += 1
        
    if((episode+1)%verbose == 0):
        print("\r                                                                                         ", end="")
        print("\rEpisodes: ", episode+1, "/", num_episodes
              , "\n\tTotal reward: ", np.mean(scores_history[-verbose:])
              , "\n\tNum. steps: ", np.mean(steps_history[-verbose:])
              , "\n\tCompleted: ", complete, "\n--------------------------\n")
        
        complete = 0
        
    
    scores_history.append(score)
    steps_history.append(steps)

print("\nFINISHED")



BEGIN

Episodes:  100 / 3000                                                                    
	Total reward:  -511.01451026453014 
	Num. steps:  71.41414141414141 
	Completed:  0 
--------------------------

Episodes:  200 / 3000                                                                    
	Total reward:  -556.6097620066683 
	Num. steps:  66.13 
	Completed:  0 
--------------------------

Episodes:  300 / 3000                                                                    
	Total reward:  -579.0561971934947 
	Num. steps:  67.28 
	Completed:  0 
--------------------------

Episodes:  400 / 3000                                                                    
	Total reward:  -585.9226365052222 
	Num. steps:  67.09 
	Completed:  0 
--------------------------

Episodes:  500 / 3000                                                                    
	Total reward:  -580.8779547140488 
	Num. steps:  67.33 
	Completed:  0 
--------------------------

Episodes:  600 / 3000    

In [None]:
observation = env.reset()
done = False
while not done:
    env.render()
    action = agent.choose_action(observation, greedy=True)
    new_observation, reward, done, _ = env.step(action)
    observation = new_observation
env.close()