## Import important libraries
---------

In [None]:
# Importing libraries 
import numpy as np
import random
import math
import random 
from collections import deque
import collections
import pickle

# for building DQN 
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

## Loading Time Matrix 
#### Also, we load the model
--------------

In [None]:
# Load the travel-time matrix from disk. The environment below indexes it as
# Time_matrix[from_location - 1][to_location - 1][hour][day] and converts the
# looked-up value to an integer number of hours.
Time_matrix = np.load("../input/rl-project-files/TM_lower.npy")

# Load the previously trained Keras model. get_action() calls model.predict()
# on a one-hot encoded state and reads one Q-value per action index.
model = keras.models.load_model('../input/trained-model/Converged_1900.h5')

## Define the Environment
#### The environment is the MDP described in the problem statement
----------

In [None]:
# Defining hyperparameters
m = 5   # number of cities, ranges from 1 ..... m
t = 24  # number of hours, ranges from 0 .... t-1
d = 7   # number of days, ranges from 0 ... d-1
C = 5   # Per hour fuel and other costs
R = 9   # per hour revenue from a passenger


class CabDriver():
    """Cab-driver MDP environment.

    A state is a list ``[location, hour, day]`` with ``location`` in 1..m,
    ``hour`` in 0..t-1 and ``day`` in 0..d-1. An action is a list
    ``[pickup, drop]`` of two different locations, or ``[0, 0]`` for the
    "lazy" action in which the driver takes no ride.
    """

    # Mean of the Poisson distribution of ride requests, keyed by location.
    _REQUEST_RATES = {1: 2, 2: 12, 3: 4, 4: 7, 5: 8}

    # Upper bound on the number of requests offered in a single step.
    _MAX_REQUESTS = 15

    def __init__(self):
        """Build the action and state spaces and pick a random initial state."""
        # Index 0 is the lazy action; the rest are all ordered pairs of
        # distinct locations.
        self.action_space = [[0,0]] + [[i,j] for i in range(1, m+1) for j in range(1, m+1) if i != j]

        # Every combination of location, hour and day.
        self.state_space = [[i,j,k] for i in range (1,m+1) for j in range(t) for k in range(d)]

        # The initial state is drawn once here; reset() always returns it.
        self.state_init = self.state_space[random.randrange(0, len(self.state_space))]

        # Start the first round (reset() has no side effects; its return
        # value is not needed here).
        self.reset()

    @staticmethod
    def _is_idle(action):
        """Return True when `action` is the lazy action (0, 0)."""
        # Element-wise comparison so lists, tuples and numpy arrays all work;
        # `action == [0, 0]` is False for a tuple and ambiguous for an array.
        return action[0] == 0 and action[1] == 0

    @staticmethod
    def _advance_clock(hour, day, elapsed):
        """Return (hour, day) after `elapsed` hours, wrapped to t hours and d days."""
        absolute_hour = hour + elapsed
        return absolute_hour % t, (day + absolute_hour // t) % d

    def _trip_times(self, state, action, Time_matrix):
        """Return (time to reach the pickup point, time of the ride itself)."""
        # time_ip: current location -> pickup location, at the current time.
        time_ip = int(Time_matrix[state[0]-1][action[0]-1][state[1]][state[2]])

        # The ride starts once the driver has reached the pickup point, so
        # its duration is looked up at the updated hour and day.
        pickup_hour, pickup_day = self._advance_clock(state[1], state[2], time_ip)

        # time_pq: pickup location -> destination location.
        time_pq = int(Time_matrix[action[0]-1][action[1]-1][pickup_hour][pickup_day])
        return time_ip, time_pq

    def state_encod_arch1(self, state):
        """Convert a state into a one-hot encoded column vector.

        @params: state: State to be one-hot encoded. It is a list [location, time, day]
        @returns: numpy array of shape (m+t+d, 1) with a 1 at the location,
                  time and day positions and 0 elsewhere
        """
        location = state[0]
        time = state[1]
        day = state[2]

        state_encod = np.zeros((m+t+d, 1))

        # Locations are numbered from 1, so shift by one to get the index.
        state_encod[location-1][0] = 1

        # The time slots follow the m location slots.
        state_encod[m+time][0] = 1

        # The day slots follow the location and time slots.
        state_encod[m+t+day][0] = 1

        return state_encod

    def requests(self, state):
        """Sample the ride requests available in the given state.

        The number of requests is Poisson distributed with a mean that
        depends on the current location, and is capped at 15. A location
        without a configured mean receives no requests.

        @params: state: State (not in one-hot encoded form). It is a list [location, time, day]
        @returns: possible_actions_index: List of indices of possible actions.
                  These indices correspond to the index of that action in the action space
        @returns: actions: The actual actions (a list of lists, with each inner list
                  consisting of two locations). The lazy action [0, 0] (index 0) is
                  always the last entry of both lists.
        """
        location = state[0]

        rate = self._REQUEST_RATES.get(location)
        if rate is None:
            num_requests = 0
        else:
            num_requests = min(int(np.random.poisson(rate)), self._MAX_REQUESTS)

        # Sample distinct customer requests. Index 0 is (0,0), which is not a
        # customer request, so the range starts at 1.
        possible_actions_index = random.sample(range(1, (m-1)*m +1), num_requests)
        actions = [self.action_space[i] for i in possible_actions_index]

        # The driver may always refuse every request.
        possible_actions_index.append(0)
        actions.append([0,0])

        return possible_actions_index, actions

    def reward_func(self, state, action, Time_matrix):
        """Return the reward for taking `action` in `state`.

        @params: state: State (not in one-hot encoded form). It is a list [location, time, day]
        @params: action: The action taken from the action space
        @params: Time_matrix: The time matrix that is used for deciding the time taken to go
                 from one location to another, at a given time and a given day
        @returns: A scalar: -C for the lazy action, otherwise
                  R * ride_time - C * (ride_time + time_to_pickup)
        """
        # Not taking any ride still costs one hour of fuel.
        if self._is_idle(action):
            return -1*C

        time_ip, time_pq = self._trip_times(state, action, Time_matrix)

        # Revenue is earned only during the ride; cost is paid for the whole trip.
        return R * time_pq - C * (time_pq + time_ip)

    def next_state_func(self, state, action, Time_matrix):
        """Return the next state and the time elapsed for taking `action`.

        @params: state: State (not in one-hot encoded form). It is a list [location, time, day]
        @params: action: The action taken from the action space
        @params: Time_matrix: The time matrix that is used for deciding the time taken to go
                 from one location to another, at a given time and a given day
        @returns: A list [next_state, hours_elapsed]
        """
        if self._is_idle(action):
            # The driver stays where he is and one hour passes.
            new_time, new_day = self._advance_clock(state[1], state[2], 1)
            return [[state[0], new_time, new_day], 1]

        time_ip, time_pq = self._trip_times(state, action, Time_matrix)
        total_time = time_ip + time_pq

        # The driver ends up at the drop location after the whole trip.
        new_time, new_day = self._advance_clock(state[1], state[2], total_time)
        next_state = [action[1], new_time, new_day]

        return [next_state, total_time]

    def reset(self):
        """Return the action space, the state space and the initial state.

        @returns: The action space, state space and the initial state (not one-hot encoded)
        """
        return self.action_space, self.state_space, self.state_init

## Functions for getting actions
#### These functions allow us to get action from the trained model...
------

In [None]:
def state_encoder(state):
    """Convert a state into a one-hot encoded row vector.

    The same positions are set as in the environment's own state encoder,
    but the result is a horizontal vector of shape (1, m+t+d) instead of a
    vertical one.

    @params: state: State to be one-hot encoded. It is a list [location, time, day]
    @returns: One-hot encoded form of given state
    """
    location, hour, day = state[0], state[1], state[2]

    # Allocate the row vector directly rather than reshaping a column.
    encoded = np.zeros((1, m + t + d))

    # Location slots come first (locations start at 1), then the hour
    # slots, then the day slots.
    for hot_index in (location - 1, m + hour, m + t + day):
        encoded[0, hot_index] = 1

    return encoded

def get_action(state, actions_ind, epsilon = 0.1):
    """Choose an action index with an epsilon-greedy policy over the model.

    With probability `epsilon` a random allowed action is returned;
    otherwise the allowed action with the highest predicted Q-value is
    returned.

    @params: state: State in which the driver is currently at.
    @params: actions_ind: The list of possible action indices.
    @params: epsilon: The epsilon required for the epsilon-greedy policy.
             Higher epsilon means more randomness.
    @returns: the action index that is being taken by the policy
    """
    # np.random.rand() lies in [0, 1), so the strict comparison makes
    # epsilon=0 purely greedy and epsilon=1 purely random.
    if np.random.rand() < epsilon:
        return random.choice(actions_ind)

    # Q-values of all actions for the encoded state; verbose=0 keeps Keras
    # from printing a progress bar on every single step.
    q_values = model.predict(state_encoder(state), verbose=0)[0]

    # Keep only the Q-values of the actions that can actually be taken.
    allowed_q_values = [q_values[j] for j in actions_ind]

    # Map the position of the best Q-value back to its action index.
    return actions_ind[int(np.argmax(allowed_q_values))]

## Code for checking model performance
#### We will see the rewards the model gets
--------

In [None]:
# Create a temporary environment for getting the action space and state space
env_tmp = CabDriver()

# Getting the action and state space, as well as the initial state
# (reset() returns them in exactly this order)
action_space, state_space, initial_state = env_tmp.reset()

# Define the state size (length of the one-hot state encoding) and the
# action size (number of entries in the action space)
state_sz = m + t + d
action_sz = len(action_space)

In [None]:
# Create an environment
env = CabDriver()

# Reset the environment
action_space, state_space, current_state = env.reset()

# An episode ends once this many hours have elapsed (30 days of 24 hours)
EPISODE_HOURS = 24 * 30

# Below variable keeps track of whether the episode has ended or not...
done = False

# Keeps track of total time elapsed
total_time = 0

# Keeps track of total reward obtained
sum_rew = 0

# Now start the episode
while not done:
    # Get the requests (possible actions) from the environment
    actions_possible_ind, possible_action = env.requests(current_state)

    # Get the action as per the epsilon-greedy policy (default epsilon)
    agent_action_index = get_action(current_state, actions_possible_ind)

    # Get the actual agent action
    agent_action = action_space[agent_action_index]

    # Get the reward and add it to the total reward
    reward = env.reward_func(current_state, agent_action, Time_matrix)
    sum_rew += reward

    # Get the new state and the time elapsed by this action
    new_state, time_elapsed = env.next_state_func(current_state, agent_action, Time_matrix)

    # Update total time elapsed
    total_time += time_elapsed

    # The episode ends when the time budget is used up; otherwise continue
    # from the new state
    done = total_time >= EPISODE_HOURS
    if not done:
        current_state = new_state

In [None]:
# See the reward the agent got: sum_rew is the sum of the per-step rewards
# accumulated over the episode run above
print("Total reward agent got: ", sum_rew)

--------------------