# DRUN Deep Q-learning driving network (regular observations)

A Deep Q-Learning network that is trained in the Microsoft AirSim simulation, wrapped in an OpenAI gym environment class, and practises navigating from point A on a map to point B without colliding.

### Library imports

#### Custom Open AI gym
Installing our custom "airsim_gym" gym environment package.

In [None]:
# IPython shell escape: editable (-e) pip install of the local airsim_gym directory.
!pip install -e airsim_gym

#### Other libraries
Importing all the libraries used in the project.

In [None]:
from __future__ import absolute_import
from collections import namedtuple, deque
from math import exp

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, Activation, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber

### Connect to environment

In [None]:
# Instantiate the "airsim-regular-v0" environment; the "airsim_gym:" prefix
# names the package that gym should import to find that id.
env = gym.make("airsim_gym:airsim-regular-v0")

### Hyper-parameters

In [None]:
# Model hyperparameters
STATE_SIZE = [256, 256, 4]          # network image input: height, width, stacked frames
ACTION_SIZE = env.action_space.n    # one Q-value per discrete action
# The frame stack is the channel axis of the network input, so its depth has
# to equal the last entry of STATE_SIZE.
STACK_SIZE = STATE_SIZE[-1]
LEARNING_RATE = 0.0002

# Training parameters
TOTAL_EPISODES = 5000
MAX_STEPS = 1000                    # upper bound on steps per episode
BATCH_SIZE = 64
PRETRAIN_LENGTH = BATCH_SIZE        # transitions collected before training starts
# NOTE(review): every stored transition holds two full frame stacks; confirm
# that a buffer of this capacity fits in memory on the training machine.
MEMORY_SIZE = 1000000
UPDATE_AFTER_ACTIONS = 4            # run a learning step every N environment steps

# Epsilon greedy
EXPLORE_START = 1.0
EXPLORE_STOP = 0.01
DECAY_RATE = 0.0001

# Q-learning hyperparameters
GAMMA = 0.95                        # discount factor

# Script execution
TRAINING = True
ENV_PREVIEW = False

### Environment preview

In [None]:
if ENV_PREVIEW:
    # Smoke-test the connection: reset once, then issue ten random actions.
    env.reset()
    preview_steps = 10
    for _step in range(preview_steps):
        random_action = env.action_space.sample()
        env.step(random_action)

### Image processing utilities

#### preprocess_frame
Preprocessing in order to reduce the complexity of our states and consequently to reduce the computation time needed for training.


In [None]:
def preprocess_frame(frame):
    """Collapse the last axis of ``frame`` to its mean and scale by 1/255.

    For an image whose last axis holds the colour channels this yields a
    grayscale frame; dividing by 255.0 then rescales the pixel values.
    """
    channel_average = np.mean(frame, axis=-1)
    return np.divide(channel_average, 255.0)

#### stack_frames
Stacking frames in order to create a sense of motion for our Neural Network.

In [None]:
def stack_frames(stacked_frames, state, is_new_episode: bool, stack_size: int = STACK_SIZE):
    """Push a preprocessed frame onto the frame stack and build the stacked state.

    Args:
        stacked_frames: deque of previously preprocessed frames. It is replaced
            when a new episode starts and may be ``None`` on the first call.
        state: raw frame; it is passed through ``preprocess_frame``.
        is_new_episode: when True the stack is rebuilt from ``stack_size``
            copies of the current frame.
        stack_size: number of frames kept in the stack.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames stacked along
        axis 2 (frames are the last dimension), and the updated deque.
    """
    frame = preprocess_frame(state)

    if is_new_episode or stacked_frames is None:
        # A fresh episode has no history, so the stack starts out as
        # ``stack_size`` copies of the first frame.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The bounded deque drops its oldest frame when a new one is appended.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

In [None]:
# Initial frame buffer used by the training loop: STACK_SIZE blank frames in a
# deque bounded to the same length.
stacked_frames = deque(
    [np.zeros(STATE_SIZE[:2]) for _ in range(STACK_SIZE)],
    maxlen=STACK_SIZE,
)

### Replay memory
Create the Memory object that contains a deque. A deque (double ended queue) is a data type that removes the oldest element each time that you add a new element over the size limit.

#### Experience replay

In [None]:
# One stored transition. ``done`` marks a terminal transition; it defaults to
# False so that six-value construction keeps working.
Experience = namedtuple(
    "Experience",
    (
        "observation",
        "position",
        "action",
        "next_observation",
        "next_position",
        "reward",
        "done",
    ),
    defaults=(False,),
)

#### Define replay memory class

In [None]:
class ReplayMemory:
    """Bounded experience buffer with uniform random sampling."""

    def __init__(self, capacity):
        # Total number of experiences ever pushed (not capped by capacity).
        self.push_count = 0
        # The deque discards its oldest entry once ``capacity`` is exceeded.
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        """Store one experience and count the push."""
        self.push_count += 1
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn at random."""
        candidates = np.arange(len(self.buffer))
        chosen = np.random.choice(candidates, size=batch_size, replace=False)
        picked = []
        for slot in chosen:
            picked.append(self.buffer[slot])
        return picked

    def is_sample_available(self, batch_size):
        """Tell whether the buffer holds at least ``batch_size`` experiences."""
        return not len(self.buffer) < batch_size

#### Agent class

#### Initialize replay memory

In [None]:
# Fill the replay memory with PRETRAIN_LENGTH random-action transitions so that
# a first batch can be sampled as soon as training starts.
replay_memory = ReplayMemory(MEMORY_SIZE)

# Pass the terminal flag only when Experience declares a field for it.
_has_done_field = "done" in Experience._fields

# Start the first episode: the stack is built from the reset frame.
start_observation, position = env.reset()
observation_stack, pretrain_frames = stack_frames(None, start_observation, True)

for _ in range(PRETRAIN_LENGTH):
    # A fresh random action for every transition.
    action = env.action_space.sample()
    next_observation, next_position, reward, done = env.step(action)

    if done:
        print("done")
        # A terminal transition leads to an all-zero stack.
        next_observation_stack = np.zeros_like(observation_stack)
    else:
        next_observation_stack, pretrain_frames = stack_frames(
            pretrain_frames,
            next_observation,
            False,
        )

    # The stacked arrays are new objects on every step, so stored experiences
    # do not alias the mutable frame deque.
    terminal_flag = {"done": done} if _has_done_field else {}
    replay_memory.add(
        Experience(
            observation_stack,
            position,
            action,
            next_observation_stack,
            next_position,
            reward,
            **terminal_flag,
        ),
    )

    if done:
        # Start a new episode and keep the position returned by the reset.
        start_observation, position = env.reset()
        observation_stack, pretrain_frames = stack_frames(
            pretrain_frames,
            start_observation,
            True,
        )
    else:
        # The next state becomes the current state.
        observation_stack = next_observation_stack
        position = next_position

### Epsilon greedy strategy
With probability $\epsilon$ select a random action $a_t$, otherwise select $a_t = \mathrm{argmax}_a Q(s_t,a)$. Over time the exploration probability decays in favour of exploitation.

In [None]:
class EpsilonGreedy():
    """Epsilon-greedy action selection with an exponentially decaying epsilon."""

    def __init__(self, start, stop, decay):
        self.start = start    # exploration rate at step 0
        self.stop = stop      # floor the exploration rate decays towards
        self.decay = decay    # exponential decay constant per step

    def get_exploration_rate(self, current_step):
        """Return epsilon for ``current_step``.

        epsilon = stop + (start - stop) * exp(-decay * current_step)
        """
        decay_factor = exp(-self.decay * current_step)
        return self.stop + (self.start - self.stop) * decay_factor

    def predict_action(self, current_step, observation, position, env, dqn):
        """Choose an action and return it together with the current epsilon.

        With probability epsilon a random action is sampled from
        ``env.action_space``; otherwise the action with the highest value
        predicted by ``dqn`` for ``(observation, position)`` is taken.

        Returns:
            An ``(action, explore_probability)`` tuple.
        """
        explore_probability = self.get_exploration_rate(current_step)

        if np.random.rand() < explore_probability:
            # Exploration: a random action is sampled
            action = env.action_space.sample()
        else:
            # Exploitation: add a leading batch axis of size 1 to both inputs
            # and take the action with the largest predicted Q-value.
            observation = np.expand_dims(np.asarray(observation), axis=0)
            position = np.expand_dims(np.asarray(position), axis=0)
            prediction = dqn.predict([observation, position])
            action = np.argmax(prediction)

        return action, explore_probability

In [None]:
# Epsilon schedule configured with the start rate, the stop rate and the decay constant.
epsilon = EpsilonGreedy(EXPLORE_START, EXPLORE_STOP, DECAY_RATE)

### Deep Q-learning network
This is our Deep Q-learning model:

We take a stack of 4 frames and two normalized coordinates as input:
- The image is passed through 3 CNN layers
- Then it is concatenated with the coordinates
- Finally it passes through 3 FC layers
- Outputs a Q-value for each action

In [None]:
def drun_dqn() -> Model:
    """Build the two-input Q-network.

    The image input (shape ``STATE_SIZE``) goes through three strided
    convolutions and is flattened; the result is concatenated with the
    2-element coordinate input and fed through three 512-unit dense layers.
    The output layer has one unit per action (``ACTION_SIZE``).

    Returns:
        An uncompiled Keras ``Model`` taking ``[image, coordinates]``.
    """
    image_input = Input(shape=tuple(STATE_SIZE))
    # The shape must be a tuple; the batch axis is implicit.
    coords_input = Input(shape=(2,))

    # Convolutional feature extractor for the stacked frames.
    img_net = Conv2D(32, (4, 4), strides=(4, 4), activation="relu", padding="same")(image_input)
    img_net = Conv2D(64, (3, 3), strides=(2, 2), activation="relu", padding="same")(img_net)
    img_net = Conv2D(64, (3, 3), strides=(2, 2), activation="relu", padding="same")(img_net)
    img_net = Flatten()(img_net)

    # Join the image features with the coordinates along the feature axis.
    combined = Concatenate(axis=1)([img_net, coords_input])

    dense_net = Dense(512, activation=tf.nn.relu)(combined)
    dense_net = Dense(512, activation=tf.nn.relu)(dense_net)
    dense_net = Dense(512, activation=tf.nn.relu)(dense_net)
    # NOTE(review): an ELU output cannot go below -1; confirm this suits the
    # reward scale, otherwise a linear output is the usual choice for Q-values.
    output = Dense(ACTION_SIZE, activation=tf.nn.elu)(dense_net)

    return Model(inputs=[image_input, coords_input], outputs=output)

In [None]:
# Huber loss and an Adam optimiser with gradient-norm clipping for the training cell.
loss_function = Huber()
optimizer = Adam(clipnorm=1.0, learning_rate=LEARNING_RATE)

# Build the Q-network and print its layer overview.
model = drun_dqn()
model.summary()

### Network training

Standard Q-learning algorithm:

1. Initialize replay memory capacity.
2. Initialize the policy network with random weights.
3. Clone the policy network, and call it the target network.
4. For each episode:
    1. Initialize the starting state.
    2. For each time step:
        1. Select an action.
            - Via exploration or exploitation
        2. Execute selected action in an emulator.
        3. Observe reward and next state.
        4. Store experience in replay memory.
        5. Sample random batch from replay memory.
        6. Preprocess states from batch.
        7. Pass batch of preprocessed states to policy network.
        8. Calculate loss between output Q-values and target Q-values.
            - Requires a pass to the target network for the next state
        9. Gradient descent updates weights in the policy network to minimize loss.
            - After a fixed number of time steps, weights in the target network are updated to the weights in the policy network.

In [None]:
def _as_state_array(frames):
    """Return a stored observation as a float32 array.

    An ndarray is only cast; any other sequence of frames (for example a
    deque) is stacked along axis 2 first, so a sampled batch is uniform.
    """
    if isinstance(frames, np.ndarray):
        return frames.astype(np.float32)
    return np.stack(frames, axis=2).astype(np.float32)


def _train_on_batch(batch):
    """Run one Q-learning gradient step on ``batch`` and return the loss as a float."""
    observation_mb = np.array([_as_state_array(item.observation) for item in batch])
    position_mb = np.array([item.position for item in batch], dtype=np.float32)
    actions_mb = np.array([item.action for item in batch])
    next_observations_mb = np.array([_as_state_array(item.next_observation) for item in batch])
    next_positions_mb = np.array([item.next_position for item in batch], dtype=np.float32)
    rewards_mb = np.array([item.reward for item in batch], dtype=np.float32)
    dones_mb = np.array([item.done for item in batch], dtype=np.float32)

    # Bootstrap from the NEXT states:
    # Q target = reward + discount factor * max_a Q(next_state, a)
    future_rewards = model.predict([next_observations_mb, next_positions_mb])
    updated_q_values = rewards_mb + GAMMA * tf.reduce_max(future_rewards, axis=1)

    # If final frame set the last value to -1
    updated_q_values = updated_q_values * (1 - dones_mb) - dones_mb

    # Create a mask so we only calculate loss on the Q-value of the action taken
    masks = tf.one_hot(actions_mb, ACTION_SIZE)

    # The forward pass has to run under the tape so gradients can be taken.
    with tf.GradientTape() as tape:
        q_values = model([observation_mb, position_mb])
        q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
        loss = loss_function(updated_q_values, q_action)

    # Backpropagation
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return float(loss)


if TRAINING:
    decay_step = 0

    for episode in range(TOTAL_EPISODES):
        episode_step = 0
        episode_rewards = []

        # Every episode starts from a reset and a freshly filled frame stack.
        frame, position = env.reset()
        observation, stacked_frames = stack_frames(stacked_frames, frame, True)

        while episode_step < MAX_STEPS:
            episode_step += 1
            decay_step += 1

            # Predict the action to take and take it
            action, explore_probability = epsilon.predict_action(decay_step, observation, position, env, model)
            next_frame, next_position, reward, done = env.step(action)
            episode_rewards.append(reward)

            if done:
                # A terminal transition leads to an all-zero stack.
                next_observation = np.zeros_like(observation)
            else:
                next_observation, stacked_frames = stack_frames(stacked_frames, next_frame, False)

            # Store the transition once, using the state and position from
            # BEFORE the action as the current state.
            replay_memory.add(
                Experience(observation, position, action, next_observation, next_position, reward, done)
            )

            # st+1 is now our current state
            observation = next_observation
            position = next_position

            # LEARNING PART
            if episode_step % UPDATE_AFTER_ACTIONS == 0 and replay_memory.is_sample_available(BATCH_SIZE):
                loss = _train_on_batch(replay_memory.sample(BATCH_SIZE))
                print("Training loss: {:.4f}".format(loss))

            if done:
                break

        # Episode summary (printed for finished and for timed-out episodes)
        total_reward = np.sum(episode_rewards)
        print("Episode: {}".format(episode),
              "Total reward: {}".format(total_reward),
              "Explore probability: {:.4f}".format(explore_probability))

        # Save model every 10 episodes
        if episode % 10 == 0:
            model.save("model/")