### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver
import os
import collections


Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
Q_dict = collections.defaultdict(dict)

States_track = collections.defaultdict(dict)

rewards_tracked = {}
for i in range(0,5):
    for j in range(0,24):
        for k in range(0,7):
            rewards_tracked[(i,j,k)]=[]


In [4]:
total_actions=[(0,0)]
for i in range(0,5):
    for j in range(0,5):
        if i!=j:
            total_actions.append((i,j))

total_state = [(i,j,k) for i in range(0,5) for j in range(0,24) for k in range(0,7)]

for state in total_state:
    Q_dict[state] = {}
    for action in total_actions:
        Q_dict[state][action] = 0.0
        

In [5]:
def find_action_indx(val):
    for i,item in enumerate(total_actions):
        if val == item:
            return i
    return 0

def convert_into_str(arr):
    text=""
    for val in arr:
        text=text+"-"+str(int(val))
    return text



In [6]:
Q_dict[(0,0,0)][0,1]

0.0

In [7]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state-action and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [8]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Specify you hyper parameters for the DQN
        self.discount_factor = 0.85
        self.learning_rate =  0.01    
        self.epsilon_max = 1
        self.epsilon_decay = -0.0005
        self.epsilon_min = 0.01
        self.batch_size = 32
        self.epsilon = 1
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()


    # approximate Q function using Neural Network
    def build_model(self):
        input_shape=self.state_size
        model = Sequential()
        # Write your code here: Add layers to your neural nets
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state, action):
        # Write your code here:
        # get action from model using epsilon-greedy policy
        # Decay in ε after we generate each sample from the environment
        if np.random.rand() <= self.epsilon:
            return action[random.randrange(self.action_size)]
        else:
            state = state.reshape(1, self.state_size)
            q_value = self.model.predict(state)
            return total_actions[np.argmax(q_value[0])]

        
    

    def append_sample(self, state, action, reward, next_state):
        # Write your code here:
        # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state))
    

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self,terminal_state):
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            update_output = np.zeros((self.batch_size, self.state_size))
            update_input = np.zeros((self.batch_size, self.state_size))
            actions, rewards = [], []
            
            for i in range(self.batch_size):
                state, action, reward, next_state = mini_batch[i]
                state_encod = env.state_encod_arch2(state,action)
                
                update_input[i] = state_encod
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch2(next_state,(0,0))
                
            #Predict the target from earlier model 
            target = self.model.predict(update_input)

            #Get the target for the Q-network
            target_qval = self.model.predict(update_output)
            for i in range(self.batch_size):
                if terminal_state:
                    target[i][find_action_indx(actions[i])] = rewards[i]
                else: # non-terminal state
                    target[i][find_action_indx(actions[i])] = rewards[i] + self.discount_factor * np.max(target_qval[i])

            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
                

    def save(self, name):
        self.model.save_weights(name)

In [9]:
Episodes = 2000

### DQN block

In [10]:
#call Environment class
env = CabDriver()

#get state and action size
state_size = 5+7+24+5+5
action_size = len(env.action_space)

agent= DQNAgent(state_size,action_size)

rewards_per_episode, episodes = [], []

episode_time = 24*30 #a month value in hours
LR = agent.learning_rate
GAMMA = agent.discount_factor
threshold = 200
policy_threshold = 30000
if not os.path.exists("saved_model_weights"):
    os.mkdir("saved_model_weights")
    
for episode in range(Episodes):
    terminal_state=False
    score = 0
    time_stamp=0
    total_time = 0
    time_state_to_end = 0
    env = CabDriver()
    action_space,state_space,state = env.reset()
    #agent.epsilon = - 1/ (1 + np.exp((-episode+7500000)/17000000)) + 1
    agent.epsilon = (1 - 0.00001) * np.exp(agent.epsilon_decay * episode)
    
    initial_state = env.state_init
    
    while not terminal_state:
        if time_stamp > episode_time:
            terminal_state=True;
        z = np.random.random()
        #time_stamp+=1
        
        action = env.requests(state)[1]
        agent.action_size=len(action)
        take_action=agent.get_action(env.state_encod_arch2(state,(0,0)),action)
        
        next_state,time_state_to_end = env.next_state_func(state,take_action,Time_matrix)
        reward= env.reward_func(state,take_action,Time_matrix)
#         reward, next_state, step_time = env.step(state, env.action_space[action], Time_matrix)
        time_stamp += time_state_to_end
        
        agent.append_sample(state, take_action, reward, next_state)
        max_next = max(Q_dict[next_state],key=Q_dict[next_state].get)
        Q_dict[state][take_action] += LR * ((reward + (GAMMA*(Q_dict[next_state][max_next]))) - Q_dict[state][take_action] ) 
            
        score += reward
        state = next_state

        agent.train_model(terminal_state)
        
    
        
    rewards_per_episode.append(score)
    episodes.append(episode)
    
#     if agent.epsilon > agent.epsilon_min:
#         agent.epsilon *= agent.epsilon_decay
    
    if (episode % 10 == 0):
        print("episode {0}, reward {1}, memory_length {2}, epsilon {3}, time_taken {4}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon,time_stamp))
    if episode % 1000 == 0:
        # store q-values of some prespecified state-action pairs
        # q_dict = agent.store_q_values()

        # save model weights
        agent.save(name="model_weights.h5")
        print("model saved")

    if initial_state in rewards_tracked:     #storing rewards
        rewards_tracked[initial_state].append(score)
        #save_obj(rewards_tracked,'Rewards')

    if ((episode+1) % threshold) == 0:   #every 2000th episode
        save_obj(rewards_tracked,'Rewards')   
    
#     #TRACKING Q-VALUES
#     if (episode == threshold-1):        #at the 1999th episode
#         initialise_tracking_states()
      
#     if ((episode+1) % threshold) == 0:   #every 2000th episode
#         save_tracking_states()
#         save_obj(States_track,'States_tracked')   
    
    #SAVING POLICY
    if ((episode+1)% policy_threshold ) == 0:  #every 30000th episodes, the Q-dict will be saved
        save_obj(Q_dict,'Policy')    
        
        
save_obj(rewards_tracked,'Rewards')   
save_obj(States_track,'States_tracked')   
save_obj(Q_dict,'Policy')      
print(episode)
    
    

W1125 18:10:20.578776  4528 deprecation_wrapper.py:119] From C:\Users\debasish.b.sahoo\AppData\Roaming\Python\Python37\site-packages\keras\backend\tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1125 18:10:20.612588  4528 deprecation_wrapper.py:119] From C:\Users\debasish.b.sahoo\AppData\Roaming\Python\Python37\site-packages\keras\backend\tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1125 18:10:20.616577  4528 deprecation_wrapper.py:119] From C:\Users\debasish.b.sahoo\AppData\Roaming\Python\Python37\site-packages\keras\backend\tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1125 18:10:20.699354  4528 deprecation_wrapper.py:119] From C:\Users\debasish.b.sahoo\AppData\Roaming\Python\Python37\site-packages\keras\optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.

episode 0, reward -545.0, memory_length 236, epsilon 0.99999, time_taken 732.0
model saved
episode 10, reward -501.0, memory_length 2000, epsilon 0.9950025290678904, time_taken 725.0
episode 20, reward -890.0, memory_length 2000, epsilon 0.9900399332508306, time_taken 724.0
episode 30, reward -177.0, memory_length 2000, epsilon 0.9851020884836666, time_taken 725.0
episode 40, reward -226.0, memory_length 2000, epsilon 0.9801888713200222, time_taken 733.0
episode 50, reward -469.0, memory_length 2000, epsilon 0.9753001589292124, time_taken 725.0
episode 60, reward -104.0, memory_length 2000, epsilon 0.9704358290931727, time_taken 726.0
episode 70, reward -736.0, memory_length 2000, epsilon 0.965595760203404, time_taken 733.0
episode 80, reward -885.0, memory_length 2000, epsilon 0.9607798312579316, time_taken 724.0
episode 90, reward -459.0, memory_length 2000, epsilon 0.9559879218582816, time_taken 725.0
episode 100, reward -423.0, memory_length 2000, epsilon 0.9512199122064691, time_t

episode 890, reward -380.0, memory_length 2000, epsilon 0.6408178677895584, time_taken 722.0
episode 900, reward 48.0, memory_length 2000, epsilon 0.6376217753402571, time_taken 721.0
episode 910, reward 313.0, memory_length 2000, epsilon 0.6344416234685488, time_taken 727.0
episode 920, reward -171.0, memory_length 2000, epsilon 0.6312773326704709, time_taken 730.0
episode 930, reward 105.0, memory_length 2000, epsilon 0.6281288238385889, time_taken 723.0
episode 940, reward -61.0, memory_length 2000, epsilon 0.6249960182600179, time_taken 728.0
episode 950, reward -308.0, memory_length 2000, epsilon 0.6218788376144554, time_taken 731.0
episode 960, reward -123.0, memory_length 2000, epsilon 0.6187772039722228, time_taken 730.0
episode 970, reward 277.0, memory_length 2000, epsilon 0.6156910397923175, time_taken 735.0
episode 980, reward -390.0, memory_length 2000, epsilon 0.6126202679204743, time_taken 731.0
episode 990, reward 65.0, memory_length 2000, epsilon 0.6095648115872363, ti

episode 1770, reward -371.0, memory_length 2000, epsilon 0.41271004613731693, time_taken 728.0
episode 1780, reward -1855.0, memory_length 2000, epsilon 0.410651646194818, time_taken 727.0
episode 1790, reward -485.0, memory_length 2000, epsilon 0.40860351256486205, time_taken 728.0
episode 1800, reward 527.0, memory_length 2000, epsilon 0.4065655940440017, time_taken 731.0
episode 1810, reward 274.0, memory_length 2000, epsilon 0.4045378396841678, time_taken 725.0
episode 1820, reward 562.0, memory_length 2000, epsilon 0.40252019879139567, time_taken 723.0
episode 1830, reward -162.0, memory_length 2000, epsilon 0.4005126209245579, time_taken 729.0
episode 1840, reward -385.0, memory_length 2000, epsilon 0.3985150558941033, time_taken 723.0
episode 1850, reward 632.0, memory_length 2000, epsilon 0.3965274537608021, time_taken 723.0
episode 1860, reward -726.0, memory_length 2000, epsilon 0.3945497648344974, time_taken 723.0
episode 1870, reward -182.0, memory_length 2000, epsilon 0.39

In [11]:
for arr in env.state_space:
    if arr[2]==25:
        print(arr)

In [12]:
env.state_space

[(0, 0, 0),
 (0, 0, 1),
 (0, 0, 2),
 (0, 0, 3),
 (0, 0, 4),
 (0, 0, 5),
 (0, 0, 6),
 (0, 1, 0),
 (0, 1, 1),
 (0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 1, 6),
 (0, 2, 0),
 (0, 2, 1),
 (0, 2, 2),
 (0, 2, 3),
 (0, 2, 4),
 (0, 2, 5),
 (0, 2, 6),
 (0, 3, 0),
 (0, 3, 1),
 (0, 3, 2),
 (0, 3, 3),
 (0, 3, 4),
 (0, 3, 5),
 (0, 3, 6),
 (0, 4, 0),
 (0, 4, 1),
 (0, 4, 2),
 (0, 4, 3),
 (0, 4, 4),
 (0, 4, 5),
 (0, 4, 6),
 (0, 5, 0),
 (0, 5, 1),
 (0, 5, 2),
 (0, 5, 3),
 (0, 5, 4),
 (0, 5, 5),
 (0, 5, 6),
 (0, 6, 0),
 (0, 6, 1),
 (0, 6, 2),
 (0, 6, 3),
 (0, 6, 4),
 (0, 6, 5),
 (0, 6, 6),
 (0, 7, 0),
 (0, 7, 1),
 (0, 7, 2),
 (0, 7, 3),
 (0, 7, 4),
 (0, 7, 5),
 (0, 7, 6),
 (0, 8, 0),
 (0, 8, 1),
 (0, 8, 2),
 (0, 8, 3),
 (0, 8, 4),
 (0, 8, 5),
 (0, 8, 6),
 (0, 9, 0),
 (0, 9, 1),
 (0, 9, 2),
 (0, 9, 3),
 (0, 9, 4),
 (0, 9, 5),
 (0, 9, 6),
 (0, 10, 0),
 (0, 10, 1),
 (0, 10, 2),
 (0, 10, 3),
 (0, 10, 4),
 (0, 10, 5),
 (0, 10, 6),
 (0, 11, 0),
 (0, 11, 1),
 (0, 11, 2),
 (0, 11, 3),
 (0, 11, 4),
 (0,

### Tracking Convergence

In [13]:
state_tracked_sample = [agent.States_track[i] for i in range(len(agent.states_tracked)) if agent.states_tracked[i] < 1000]

AttributeError: 'DQNAgent' object has no attribute 'states_tracked'

In [None]:
States_track

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()

In [None]:
env = CabDriver()
state_size = env.state_space.shape[0]

In [None]:
state_size

In [None]:
np.argmax([10,14,15,45])