### Cab-Driver Agent

In [None]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import os
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from keras.models import load_model

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

#### Defining Time Matrix

In [None]:
# Load the provided time matrix from disk; it is passed to env.step() in the
# training loop.
time_matrix_file = "TM.npy"
Time_matrix = np.load(time_matrix_file)

In [None]:
# Registry of tracked Q-values, laid out as
#   States_track[state][action] -> list of recorded Q-values.
# A defaultdict(dict) lets a new state be registered without creating its
# inner dictionary by hand.
States_track = collections.defaultdict(dict)
tracked_state_count = len(States_track)
print(tracked_state_count)

#### Tracking the state-action pairs for checking convergence


In [None]:
# Make sure the output directory for pickled rewards / tracked states exists.
# makedirs(..., exist_ok=True) replaces the exists()-then-mkdir() pair, which
# could fail if the directory appeared between the check and the creation.
os.makedirs("saved_pickle_files", exist_ok=True)

In [None]:
#This function will initialise the 6 Q-values which need to be tracked for checking convergence

#iters = []                            #the x-axis of convergence graphs

def initialise_tracking_states():
    """Register the (state, action) pairs whose Q-values are tracked.

    For each of the six sample pairs below, an empty history list is stored
    at ``States_track[state][action]``. Calling the function again resets
    those histories to empty lists.
    """
    # Six sample (state, action) pairs chosen for convergence tracking.
    tracked_pairs = [
        ((1, 9, 1), (1, 2)),
        ((1, 11, 3), (1, 3)),
        ((1, 20, 5), (2, 4)),
        ((4, 10, 5), (3, 1)),
        ((3, 13, 0), (3, 1)),
        ((0, 9, 6), (1, 3)),
    ]
    for tracked_state, tracked_action in tracked_pairs:
        # Values are appended later by save_tracking_states().
        States_track[tracked_state][tracked_action] = []

In [None]:
#This function will append latest Q-values of the 6 Q-values which are being tracked for checking convergence

def _as_key(value):
    """Return ``value`` in hashable form (lists / arrays become tuples)."""
    if isinstance(value, (list, np.ndarray)):
        return tuple(value)
    return value


def save_tracking_states(actions_list, qvalues, state):
    """Append the current Q-value of every tracked action of ``state``.

    Does nothing when ``state`` is not registered in ``States_track``.

    Parameters
    ----------
    actions_list : sequence
        The action space; the position of an action in this sequence is its
        column in ``qvalues``.
    qvalues : 2-D indexable
        Predicted Q-values; only row 0 is read.
    state : tuple or list
        The state the Q-values were predicted for. Lists are converted to
        tuples so they can be looked up in ``States_track``.

    Raises
    ------
    ValueError
        If a tracked action is not present in ``actions_list``.
    """
    # .get() avoids creating an entry for untracked states in the defaultdict.
    tracked_actions = States_track.get(_as_key(state))
    if not tracked_actions:
        return

    # Normalise actions so that list-typed and tuple-typed actions compare equal.
    action_keys = [_as_key(action) for action in actions_list]
    for action, history in tracked_actions.items():
        column = action_keys.index(_as_key(action))
        history.append(qvalues[0][column])

In [None]:
#Defining a function to save the Q-dictionary as a pickle file
def save_pickle(obj, name ):
    """Pickle ``obj`` to the file ``name + '.pkl'`` using the highest protocol.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        Target path without the ``.pkl`` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill in the following items to complete the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: $r + \gamma \max_{a'} Q(s', a')$ (just $r$ when $s'$ is terminal)
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and epsilon-greedy actions.

    A single Keras network maps an encoded state (``state_size`` inputs) to
    one Q-value per action (``action_size`` outputs). There is no separate
    target network: the same model produces both the current Q-values and
    the bootstrap targets.

    Note: ``__init__`` and ``train_model`` read the module-level ``env`` to
    encode states (``env.state_encod_arch1``), so ``env`` must be defined
    before an agent is created or trained.
    """

    def __init__(self, state_size, action_size):
        """Set hyperparameters, the replay memory and the Q-network.

        Parameters
        ----------
        state_size : int
            Length of the encoded state vector (network input size).
        action_size : int
            Number of actions (network output size).
        """
        # Network dimensions
        self.input_size = state_size    # length of the encoded state vector
        self.output_size = action_size  # one Q-value per action

        # Hyperparameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.01
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = -0.0005    # exponent rate used by the training loop
        self.batch_size = 32

        # Replay memory: the oldest experiences are dropped beyond 2000 entries.
        self.memory = deque(maxlen=2000)

        # Q-value history of one fixed (state, action) pair, filled by
        # save_tracking_states(): state [0,0,0], action at index 2 of the
        # action space.
        self.special_states_tracked = []
        self.track_action_index = 2
        self.track_state = self._encode([0, 0, 0], env)

        # The (single) Q-network.
        self.model = self.build_model()

    def _encode(self, state, environment):
        """Encode ``state`` with ``environment`` as a (1, input_size) array."""
        encoded = np.asarray(environment.state_encod_arch1(state))
        return encoded.reshape(1, self.input_size)

    # approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 32 units).

        Returns
        -------
        The compiled Keras ``Sequential`` model.
        """
        model = Sequential()

        # hidden layers
        model.add(Dense(32, input_dim=self.input_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # Output layer: one unbounded Q-value per action. A linear activation
        # is used because ReLU would clamp every prediction to be
        # non-negative, so a negative Q-value could never be represented.
        model.add(Dense(self.output_size, activation='linear', kernel_initializer='he_uniform'))

        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; `lr` is kept for the Keras version this notebook
        # was written against - confirm against the installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def get_action(self, state, env):
        """Choose an action index for ``state`` with an epsilon-greedy policy.

        Parameters
        ----------
        state :
            Current environment state.
        env :
            Environment providing ``requests(state)``, ``state_encod_arch1``
            and ``action_space``.

        Returns
        -------
        One element of the candidate action indices returned by
        ``env.requests(state)``.
        """
        # The second value returned by env.requests is not needed here.
        action_index, _ = env.requests(state)

        if np.random.rand() <= self.epsilon:
            # explore: pick uniformly among the currently possible actions
            return random.choice(action_index)

        # exploit: predict Q(s, .) and keep the best currently possible action
        q_value = self.model.predict(self._encode(state, env))

        # Module-level helper (not the method of the same name): records the
        # Q-values of tracked (state, action) pairs for convergence plots.
        save_tracking_states(env.action_space, q_value, state)

        return max(action_index, key=lambda ind: q_value[0][ind])

    def append_sample(self, state, action, reward, next_state, done):
        """Store the experience <s, a, r, s', done> in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Run one training step on a random mini-batch from replay memory.

        Does nothing until the memory holds more than ``batch_size``
        experiences. The target for the action taken is ``reward`` for a
        terminal transition and ``reward + discount_factor * max Q(s', .)``
        otherwise; the targets of all other actions are the network's own
        current predictions.
        """
        if len(self.memory) <= self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        states = np.zeros((self.batch_size, self.input_size))
        next_states = np.zeros((self.batch_size, self.input_size))
        actions, rewards, dones = [], [], []

        for row, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[row] = self._encode(state, env)[0]
            # Encode the *next* state here: the bootstrap term must come from
            # Q(s', .), not from Q(s, .).
            next_states[row] = self._encode(next_state, env)[0]
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

        # 1. Current predictions: the starting point for the training targets.
        targets = np.array(self.model.predict(states))

        # 2. Q-values of the next states, used for the bootstrap term.
        next_q = np.asarray(self.model.predict(next_states))

        # 3. Overwrite only the entry of the action actually taken; terminal
        #    transitions get no bootstrap term.
        rewards = np.asarray(rewards, dtype=float)
        not_done = ~np.asarray(dones, dtype=bool)
        rows = np.arange(self.batch_size)
        targets[rows, actions] = rewards + self.discount_factor * np.max(next_q, axis=1) * not_done

        # 4. Fit the model on the updated targets.
        self.model.fit(states, targets, batch_size=self.batch_size, epochs=1, verbose=0)

    def save_tracking_states(self):
        """Record the current Q-value of the fixed tracked (state, action) pair."""
        # Use the model to predict the Q-values of the state we are tracking.
        q_value = self.model.predict(self.track_state)

        # Keep only the Q-value of the tracked action index.
        self.special_states_tracked.append(q_value[0][self.track_action_index])

    def save(self, name):
        """Pickle the model object to the file ``name``."""
        # NOTE(review): this pickles the whole Keras model object; whether
        # that round-trips depends on the Keras version - consider
        # model.save_weights(...) if loading fails.
        with open(name, 'wb') as file:
            pickle.dump(self.model, file, pickle.HIGHEST_PROTOCOL)
    

In [None]:
# Run configuration
Episodes = 12000
Save_Rewards_After_Ep = 50   # pickle rewards / tracked states every N episodes
Save_Weights_After_Ep = 50   # save the model every N episodes

# Per-episode bookkeeping filled in by the training loop
rewards_per_episode = []
episodes = []

# The environment must exist before the agent: DQNAgent.__init__ reads the
# module-level `env` to encode the state it tracks.
env = CabDriver()
agent = DQNAgent(state_size=36, action_size=21)

# Register the (state, action) pairs tracked for convergence.
initialise_tracking_states()

In [None]:
#agent.load_weights("model_weights")

In [None]:
# Episodes = 5
# agent = DQNAgent(36,21)

In [None]:
# agent.model.load_weights("model_weights")

### DQN block

In [1]:
for episode in range(Episodes):

    # Fresh environment for every episode. Rebinding the module-level `env`
    # also updates the encoder that agent.train_model() reads.
    env = CabDriver()
    all_actions, all_state, curr_state = env.reset()
    terminal_state = False
    revenue = 0

    # An episode ends once 24 * 30 hours have elapsed, counted from the
    # starting value of curr_state[1].
    ep_start_hour = curr_state[1]
    ep_end_hour = curr_state[1] + 24 * 30

    while not terminal_state:
        # 1. Pick epsilon-greedy action from possible actions for the current state
        fin_action_index = agent.get_action(curr_state, env)

        # 2. Evaluate the reward and next state
        next_state, reward, total_hours_lapsed = env.step(curr_state, fin_action_index, Time_matrix)
        ep_start_hour += total_hours_lapsed

        if ep_start_hour >= ep_end_hour:
            terminal_state = True

        # 3. Append the experience to the memory
        agent.append_sample(curr_state, fin_action_index, reward, next_state, terminal_state)

        # 4. Train the model on a mini-batch from memory
        agent.train_model()

        # 5. Keep track of the reward collected in this episode
        revenue += reward
        curr_state = next_state

    # store total reward obtained in this episode
    rewards_per_episode.append(revenue)
    episodes.append(episode)

    # Exponential epsilon decay from 1.0 towards epsilon_min. Adding the
    # floor keeps epsilon from dropping below epsilon_min late in training.
    agent.epsilon = agent.epsilon_min + (1.0 - agent.epsilon_min) * np.exp(agent.epsilon_decay * episode)

    # periodically persist rewards and tracked Q-values
    if episode % Save_Rewards_After_Ep == 0:
        save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")
        save_pickle(States_track,'saved_pickle_files/States_tracked')

    if ((episode + 1) % 5 == 0):
        print("reward after ep {0} = {1}".format(episode, revenue))
        agent.save_tracking_states()

    if(episode % Save_Weights_After_Ep == 0):
        print("Saving Model {}".format(episode))
        agent.save(name="model_weights.pkl")

# Persist the final state as well: the periodic saves above stop at the last
# multiple of the save interval, and later cells reload these pickles.
save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")
save_pickle(States_track,'saved_pickle_files/States_tracked')
            
        

NameError: name 'Episodes' is not defined

### Tracking Convergence

In [None]:
# Report the mean reward over the most recent episodes. The message now
# matches the computation: it previously announced an average over 10
# episodes while printing the maximum over the last 100.
reward_window = 100
print("Average reward of last {0} episodes is {1}".format(reward_window, np.mean(rewards_per_episode[-reward_window:])))

In [None]:
# Reload the rewards written by the training loop.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
with open('saved_pickle_files/rewards_per_episode.pkl', 'rb') as f:
    rewards_per_episode = pickle.load(f)

plt.plot(list(range(len(rewards_per_episode))), rewards_per_episode)
plt.xlabel("episode number")
plt.ylabel("reward per episode")
print(len(rewards_per_episode))

# save the plot to the current working directory
plt.savefig('rewards.png')

# The window in the message now matches the slice that is averaged.
reward_window = 1000
print("Average reward of last {0} episodes is {1}".format(reward_window, np.mean(rewards_per_episode[-reward_window:])))

In [None]:
# Reload the tracked Q-value histories written by the training loop.
states_pickle_path = 'saved_pickle_files/States_tracked.pkl'
with open(states_pickle_path, 'rb') as pickle_file:
    States_track = pickle.load(pickle_file)

In [None]:
# One subplot per tracked (state, action) pair, in a 2 x 4 grid. Each title
# is built from the pair actually plotted; previously every subplot carried
# the same copy-pasted title.
tracked_pairs = [
    ((1, 9, 1), (1, 2)),
    ((1, 20, 5), (2, 4)),
    ((1, 11, 3), (1, 3)),
    ((4, 10, 5), (3, 1)),
    ((3, 13, 0), (3, 1)),
    ((0, 9, 6), (1, 3)),
]

plt.figure(0, figsize=(16,7))
for position, (state, action) in enumerate(tracked_pairs, start=1):
    plt.subplot(2, 4, position)
    plt.title('state={0} action={1}'.format(state, action))
    q_history = np.asarray(States_track[state][action])
    # x-axis: index of the recorded sample
    plt.plot(np.arange(len(q_history)), q_history)
plt.show()



In [None]:
# Keep only the tracked Q-values below 1000.
state_tracked_sample = [q for q in agent.special_states_tracked if q < 1000]

In [None]:
# Q-value history of the agent's fixed tracked pair, on a log-scaled y-axis.
tracked_q_history = np.asarray(agent.special_states_tracked)
sample_index = np.arange(len(agent.special_states_tracked))

plt.figure(0, figsize=(16,7))
plt.title('Q_value for state [0,0,0]  action (0,2)')
plt.semilogy(sample_index, tracked_q_history)
plt.show()

In [None]:
# Print how many Q-values were recorded for each tracked (state, action) pair.
for tracked_actions in States_track.values():
    for q_history in tracked_actions.values():
        print(len(q_history))

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# Sample exponential epsilon-decay curve: epsilon(t) = exp(-0.0009 * t).
# `time` now covers the same 3000 steps as `epsilon`, so the two can be
# plotted against each other (it previously had only 100 points).
time = np.arange(0, 3000)
epsilon = [0 + (1 - 0) * np.exp(-0.0009 * i) for i in time]

# Show a short preview instead of dumping all 3000 values.
print(epsilon[:5], "...", epsilon[-1])

In [None]:
# Draw the sample epsilon curve on the current axes.
current_axes = plt.gca()
current_axes.plot(time, epsilon)
plt.show()