# Taxi-DQN

### Resolving the Taxi problem using DQN

Simple environment to experiment the corretness of the DQN implementation

Features:
- Sampling strategy:
- Separate network

In [2]:
import gym
import numpy as np
import tensorflow as tf 
from tensorflow import keras
from collections import deque
import random
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../')

from utils.read_last_saved_model import restore_model

env = gym.make('Taxi-v3')

### Initialization

In [3]:
N_EPOCHS = 100
N_EPISODES = 10
N_TRAINING_STEPS = 20

BATCH_SIZE = 64
DISCOUNT_FACTOR = 0.95

# parameter to save the model
save_after_n_episodes = int((n_epochs-1) * 0.1)

# parameters to set up the graph
graph_interval = int((n_epochs-1) * 0.01) # average the success rate every 1% episodes
success = 0
perc_successes = []

last_episode = 0 # where the model was interrupted

model_path = '.\\models\\DQN\\' # where the models are saved
use_saved_model = False

replay_buffer = deque(maxlen=20000)

In [None]:
class ExperienceReplay(object):
    
    def __init__(self):
        self.replay_buffer = deque(maxlen=20000)
        
    def sample_experiences(batch_size):
        #pick a random batch example from replay buffer
        indices = np.random.randint(len(replay_buffer), size=batch_size)
        batch = [replay_buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(5)]
        return states, actions, rewards, next_states, dones
    
    def collect()

# Training

Semi-gradient Q-Learning update formula:
$ w_{t+1} = w_t + \eta[R_{t+1} + \lambda max_a\hat{q}(S_{t+1},a,w_t)]\nabla\hat{q}(S_t,A_t,w_t)$

$\color{blue}{\text{Code to formula legend:}}$

next_Q_values: $ \hat{q} $

max_next_Q_values: $ max_a\hat{q}(S_{t+1},a,w_t) $

target_Q_values: $ R_{t+1} + \lambda max_a\hat{q}(S_{t+1},a,w_t) $

discount_factor: $ \lambda $ ($\color{red}{\text{0}}$ on terminal states because of $\color{green}{\text{(1-dones)}}$)

In [None]:
class DQNetwork(object):
    
    def __init__(self, discount_factor, use_saved_model):
        this.discount_factor = discount_factor
        create_nn(use_saved_model)
        input_shape = [1] #500 states represented with a number (400 reachable)
        n_outputs = env.action_space.n # 6 actions
    
    def create_nn(self, use_saved_model):
        if (use_saved_model):
            model_folder, last_episode, file_perc_successes = restore_model(model_path)
            self.model = keras.models.load_model(model_folder)
            perc_successes = np.load(file_perc_successes).tolist()
        else:
            self.model = keras.Sequential([
                keras.layers.Dense(32, activation="relu", input_shape=input_shape),
                keras.layers.Dense(32, activation="relu"),
                keras.layers.Dense(n_outputs)
            ])
    
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        self.loss_fn = tf.keras.losses.mean_squared_error
    
    def train(self, batch_size):
        for _ in N_TRAINING_STEPS:
            self.training_step(batch_size)
            
    def training_step(self, batch_size):
        #get experiences from the replay buffer
        experiences = sample_experiences(batch_size)
        states, actions, rewards, next_states, dones = experiences

        #compute the values for the update
        next_Q_values = model.predict(next_states)
        max_next_Q_values = np.max(next_Q_values, axis=1)
        target_Q_values = (rewards + (1-dones)*discount_factor*max_next_Q_values)
        mask = tf.one_hot(actions, n_outputs)

        states = tf.expand_dims(tf.convert_to_tensor(states), axis = 1)

        #update the weights
        with tf.GradientTape() as tape:
            all_Q_values = model(states)
            Q_values = tf.reduce_sum(all_Q_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(loss_fn(target_Q_values,Q_values))
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
    def play_one_step(self, env, state, epsilon):
        action = epsilon_greedy_policy(state, epsilon)
        next_state, reward, done, info = env.step(action)
        reward = reward - 1
        replay_buffer.append((state, action, reward, next_state, done))
        return next_state, reward, done, info
    
    def epsilon_greedy_policy(observation, epsilon=0):
        r =  random.randrange(0, 101, 1)/100
        if (r <= epsilon):
            action = env.action_space.sample()
        else:
            action = select_best_action(observation)
        return action
     
    def select_best_action(observation):
        Q_values = model.predict(np.array( [observation,])[np.newaxis, ...])
        return np.argmax(Q_values[0])
    
    

In [19]:
dqn = DQNetwork()

for i_episode in range(last_episode, n_epochs):
    observation = env.reset()
    total_episode_reward = 0
    for t in range(n_steps):
        epsilon = max(1 - i_episode/n_epochs - 0.05, 0.01)
        observation, reward, done, info = play_one_step(env, observation, epsilon)
        
        #env.render()
        if done:
            print("Episode " + str(i_episode) + " finished at state " + str(observation) + " after {} timesteps".format(t+1))
            if (reward == 0):
                success = success + 1
            break
        if i_episode>(last_episode+100):
            for i in range(2):
                training_step(batch_size)
    
    #compute data to build the graph
    if i_episode>0 and i_episode%graph_interval == 0:
        perc_successes.append((success/graph_interval)*100)
        print("episode: " + str(i_episode) + " succ. rate: " + str(perc_successes[len(perc_successes)-1]) )
        success = 0
    
    # checkpoint
    #if i_episode>0 and i_episode%save_after_n_episodes == 0:
    #    model.compile()
    #    model.save(model_path + 'model{}-{}'.format(int(i_episode-save_after_n_episodes),i_episode))
    #    np.save(model_path + 'perc_successes{}-{}'.format(int(i_episode-save_after_n_episodes),i_episode),perc_successes)
    
env.close()

episode: 60 succ. rate: 0.0
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64, 1)
(64,

KeyboardInterrupt: 

## Preparing graph

Prepare, show and save the graph with the percentage success rate during training. 

In [None]:
# x axis values
x = [range(graph_interval,n_epochs-1,graph_interval)]
# corresponding y axis values
perc_successes = np.array(perc_successes).reshape( (1,int((n_epochs-1)/graph_interval)) )
new_perc_successes = np.delete(perc_successes, 0).reshape( (1,int((n_epochs-1)/graph_interval)-1) )
 
print(new_perc_successes)
np.save(model_path + 'perc_successes_ultimate',new_perc_successes)
    
plt.style.use('ggplot')
plt.figure(figsize=(20,5))

# plotting the points
plt.plot(x, new_perc_successes, color='green', marker='o', linestyle='solid')
 
# naming the x axis
plt.xlabel('Number of training episodes')
# naming the y axis
plt.ylabel('Success rate in {} episodes (%)'.format(graph_interval))
 
# giving a title to my graph
plt.title('Percent of successes per training episodes in DQN')
 
# function to show the plot
plt.show()

In [None]:
#np.save(model_path + 'perc_successes{}-{}'.format(i_episode-save_after_n_episodes,i_episode),new_perc_successes)
#perc_successes = np.load(model_path+"perc_successes40-50.npy")
#print(perc_successes.shape)