### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time
from sklearn.externals import joblib 

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
Time_matrix[0][1]

array([[2., 3., 3., 5., 7., 0., 6.],
       [2., 3., 3., 5., 7., 0., 6.],
       [2., 3., 3., 5., 7., 0., 6.],
       [2., 3., 3., 5., 7., 0., 6.],
       [2., 3., 3., 5., 7., 0., 6.],
       [2., 3., 3., 5., 7., 0., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [9., 7., 9., 7., 8., 5., 6.],
       [4., 7., 6., 4., 3., 4., 2.],
       [4., 7., 6., 4., 3., 4., 2.],
       [4., 7., 6., 4., 3., 4., 2.],
       [4., 7., 6., 4., 3., 4., 2.],
       [4., 7., 6., 4., 3., 4., 2.],
       [4., 7., 6., 4., 3., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.],
       [2., 3., 6., 2., 7., 4., 2.]])

In [None]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [32]:
class DQNAgent:
    def __init__(self, state_size, action_size, env):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size
        self.env = env

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001       
        self.epsilon_max = 1.0
        self.epsilon = self.epsilon_max
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        
        self.batch_size = 32      

        # create replay memory using deque
        self.memory = deque(maxlen=2000)
        
        # Initialize the value of the states tracked
        self.states_tracked = []
        
        # We are going to track state [1,0,0] and action (0,1) at index 2 in the action space.
        self.track_state = np.array(self.env.state_encod_arch1([1,0,0])).reshape(1, 36)


        # create main model and target model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        # Write your code here: Add layers to your neural nets       
        model.add(Dense(32, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state):
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment       
        
        possible_actions_index, actions = env.requests(state)
        # choose random action if generated random number is less than ε.
        # Action is represented by index, 0-Number of actions, like (0,1,2,3) for 4 actions
        if np.random.rand() <= self.epsilon:
             return random.choice(possible_actions_index)
        # if generated random number is greater than ε, choose the action which has max Q-value
        else:
            state = np.array(self.env.state_encod_arch1(state)).reshape(1, 36)
            q_value = self.model.predict(state)
            q_vals_possible = [q_value[0][i] for i in possible_actions_index]

            return possible_actions_index[np.argmax(q_vals_possible)]


    def append_sample(self, state, action, reward, next_state, done):
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
        # Adding sample to the memory. 
        self.memory.append((state, action, reward, next_state, done))
        
        # Decay in ε after we generate each sample from the environment
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            update_output = np.zeros((self.batch_size, self.state_size))
            update_input = np.zeros((self.batch_size, self.state_size))
            
            action_list, reward_list, done_list = [], [], []
            
            # Set the values of input, action, reward, target and done using memory
            # Note the order of <s,a,r,s',done> 
            for i in range(self.batch_size):
                state, action, reward, next_state, done = mini_batch[i]
                update_input[i] = env.state_encod_arch1(state)
                action_list.append(action)
                reward_list.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done_list.append(done)
                
            # Write your code from here
            # 1. Predict the target from earlier model
            target = self.model.predict(update_input)

            # 2. Get the target for the Q-network
            target_val = self.model.predict(update_output)

            #3. Update your 'update_output' and 'update_input' batch
            for i in range(self.batch_size):
                # Q Learning: get maximum Q value at s' from target model
                if done_list[i]:
                    target[i][action_list[i]] = reward_list[i]
                else:
                    target[i][action_list[i]] = reward_list[i] + self.discount_factor * np.max(target_val[i])
                
                
            # 4. Fit your model and track the loss values
            self.model.fit(update_input, target, batch_size=self.batch_size,
                           epochs=1, verbose=0)

    def save_tracking_states(self):
        # Use the model to predict the q_value of the state we are tacking.
        q_value = self.model.predict(self.track_state)
        
        # Grab the q_value of the action index that we are tracking.
        self.states_tracked.append(q_value[0][1])
        
    def save(self, name):
        self.model.save(name)


In [33]:
Episodes = 10000
battery_life_time = 24*30

env = CabDriver()
action_space, state_space, state = env.reset()

# Set up state and action sizes.
agent = DQNAgent(action_size=len(action_space), state_size=36, env=env)

score_tracked = []
# to store rewards in each episode
rewards_per_episode, episodes = [], []


### DQN block

In [None]:
start_time = time.time()

for episode in range(Episodes):
    
    # Write code here
   
    # Call the environment
    
    # Call all the initialised variables of the environment
    action_space, state_space, state = env.reset()
    terminal_state = False
    score = 0
    battery_consumed_time = 0
    #Call the DQN agent
    
    while (terminal_state == False):
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        action = agent.get_action(state)
        # 2. Evaluate your reward and next state
        reward, next_state, step_time = env.step(state, env.action_space[action], Time_matrix)
        # check the episode ended or not
        battery_consumed_time += step_time
        if (battery_consumed_time >= battery_life_time):
            terminal_state = True
        else:
            # 3. Append the experience to the memory
            agent.append_sample(state, action, reward, next_state, False)
            # 4. Train the model by calling function agent.train_model
            agent.train_model()
            # 5. Keep a track of rewards, Q-values, loss
            state = next_state
            score += reward
     
    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)
    
    # every 20 episodes:
    if ((episode + 1) % 20 == 0):
        print("episode:", episode, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)
    
    if ((episode + 1) % 5 == 0):
        agent.save_tracking_states()
    
    # Total rewards per episode
    score_tracked.append(score)
    
print("Saving Model {}".format(episode))
agent.save(name="model_weights.pkl")

elapsed_time = time.time() - start_time
print(elapsed_time)

episode: 19   score: 165.0   memory length: 2000   epsilon: 0.11157070860138803
episode: 39   score: 548.0   memory length: 2000   epsilon: 0.014874537759091451
episode: 59   score: 476.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 79   score: 268.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 99   score: 833.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 119   score: 749.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 139   score: 797.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 159   score: 1018.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 179   score: 1179.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 199   score: 978.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 219   score: 1054.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 239   score: 1274.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 259   sco

episode: 1999   score: 1204.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2019   score: 1252.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2039   score: 1418.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2059   score: 1261.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2079   score: 1675.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2099   score: 1336.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2119   score: 1505.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2139   score: 1091.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2159   score: 1274.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2179   score: 1594.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2199   score: 1376.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 2219   score: 1221.0   memory length: 2000   epsilon: 0.00999867159

episode: 3959   score: 1301.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 3979   score: 1262.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 3999   score: 1068.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4019   score: 1267.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4039   score: 1203.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4059   score: 1058.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4079   score: 1096.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4099   score: 1625.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4119   score: 1086.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4139   score: 1482.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4159   score: 989.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 4179   score: 1123.0   memory length: 2000   epsilon: 0.009998671593

episode: 5919   score: 1077.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 5939   score: 1442.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 5959   score: 1501.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 5979   score: 1720.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 5999   score: 1446.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6019   score: 1347.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6039   score: 1462.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6059   score: 1469.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6079   score: 1274.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6099   score: 1211.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6119   score: 1451.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 6139   score: 1452.0   memory length: 2000   epsilon: 0.00999867159

episode: 7879   score: 1383.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7899   score: 1100.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7919   score: 1363.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7939   score: 1585.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7959   score: 1291.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7979   score: 1763.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 7999   score: 1648.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 8019   score: 1235.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 8039   score: 1288.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 8059   score: 1400.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 8079   score: 1823.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 8099   score: 1388.0   memory length: 2000   epsilon: 0.00999867159

episode: 9839   score: 1118.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 9859   score: 1775.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 9879   score: 1060.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 9899   score: 1076.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 9919   score: 1314.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 9939   score: 1432.0   memory length: 2000   epsilon: 0.009998671593271896


### Tracking Convergence

In [None]:
state_tracked_sample = [agent.states_tracked[i] for i in range(len(agent.states_tracked)) if agent.states_tracked[i] < 1000]

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Q_value for state [0,0,0]  action (0,2)')
xaxis = np.asarray(range(0, len(agent.states_tracked)))
plt.semilogy(xaxis,np.asarray(agent.states_tracked))
plt.show()

In [None]:
score_tracked_sample = [score_tracked[i] for i in range(len(score_tracked)) if (i % 4 == 0)]

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Rewards per episode')
xaxis = np.asarray(range(0, len(score_tracked_sample)))
plt.plot(xaxis,np.asarray(score_tracked_sample))
plt.show()

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
epsilon = []
epsilonVal = 1.0
for i in range(0,10000):
    epsilon.append(epsilonVal)
    epsilonVal *= 0.999

In [None]:
plt.plot(epsilon)
plt.show()