# Deep Q-Learning 

Deep Q-Learning uses a neural network to approximate $Q$ functions. Hence, we usually refer to this algorithm as DQN (for *deep Q network*).

The parameters of the neural network are denoted by $\theta$. 
*   As input, the network takes a state $s$,
*   As output, the network returns $Q(s, a, \theta)$, the value of each action $a$ in state $s$, according to the parameters $\theta$.


The goal of Deep Q-Learning is to learn parameters $\theta$ such that $Q(s, a, \theta)$ is a good approximation of the optimal $Q$-function $Q^*(s, a)$. 

In addition to the network with parameters $\theta$, the algorithm keeps a second network with the same architecture, whose parameters are denoted $\theta^-$; it is called the **target network**.

The algorithm works as follows:

1.   At each time $t$, the agent is in state $s_t$ and has observed the transitions $(s_i, a_i, r_i, s_i')_{i=1}^{t-1}$, which are stored in a **replay buffer**.

2.  Choose action $a_t = \arg\max_a Q(s_t, a, \theta)$ with probability $1-\varepsilon_t$, and a random action with probability $\varepsilon_t$. 

3. Take action $a_t$, observe reward $r_t$ and next state $s_t'$.

4. Add transition $(s_t, a_t, r_t, s_t')$ to the **replay buffer**.

5.  Sample a minibatch $\mathcal{B}$ containing $B$ transitions from the replay buffer. Using this minibatch, we define the loss:

$$
L(\theta) = \sum_{(s_i, a_i, r_i, s_i') \in \mathcal{B}}
\left[
Q(s_i, a_i, \theta) -  y_i
\right]^2
$$
where the $y_i$ are the **targets** computed with the **target network** $\theta^-$:

$$
y_i = r_i + \gamma \max_{a'} Q(s_i', a', \theta^-).
$$
If $s_i'$ is a terminal state, the bootstrap term is dropped and $y_i = r_i$.

6. Update the parameters $\theta$ to minimize the loss, e.g., with gradient descent (**keeping $\theta^-$ fixed**): 
$$
\theta \gets \theta - \eta \nabla_\theta L(\theta)
$$
where $\eta$ is the optimization learning rate. 

7. Every $N$ transitions ($t \bmod N = 0$), update target parameters: $\theta^- \gets \theta$.

8. $t \gets t+1$. Stop if $t = T$, otherwise go to step 2.

# Colab setup

In [None]:
# Install the pinned notebook dependencies (IPython magic, only valid inside a notebook).
%pip install ribs[all] gym~=0.17.0 Box2D~=2.3.10 tqdm
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Instantiate the LunarLander-v2 environment (it is created again in the parameters cell).
env = gym.make("LunarLander-v2")

In [None]:
# After installing, restart the kernel

# Only run the installs below when the notebook is executed inside Google Colab.
if 'google.colab' in str(get_ipython()):
  print("Installing packages, please wait a few moments. You may need to restart the runtime after the installation.")

  # install rlberry library
  !pip install git+https://github.com/rlberry-py/rlberry.git#egg=rlberry[default] > /dev/null 2>&1

  # install gym
  !pip install gym[all] > /dev/null 2>&1

  # packages required to show video
  !pip install pyvirtualdisplay > /dev/null 2>&1
  !apt-get update > /dev/null 2>&1
  !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1


In [None]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy
from gym.wrappers import Monitor
import gym

In [None]:
# Create directory for saving videos (all output, including errors, is discarded)
!mkdir videos > /dev/null 2>&1

# Initialize display and import function to show videos
# NOTE(review): the bare import is presumably kept for its import-time side effect — confirm.
import rlberry.colab_utils.display_setup
from rlberry.colab_utils.display_setup import show_video

In [None]:
# Random number generator
# `rng` is the generator the replay buffer draws its minibatches from.
# NOTE(review): choose_action() uses np.random.uniform() and env.action_space.sample(),
# which are not driven by this seeded generator.
import rlberry.seeding as seeding 
seeder = seeding.Seeder(456)
rng = seeder.rng

# 1. Define the parameters

In [None]:
# The task: the LunarLander-v2 environment from gym
env = gym.make("LunarLander-v2")

# --- Learning -----------------------------------------------------------
# Discount applied to the bootstrapped next-state value in the TD target
GAMMA = 0.999
# Step size of the Adam optimizer
LEARNING_RATE = 0.001
# Maximum number of training episodes (training may stop earlier)
N_EPISODES = 500

# --- Replay buffer and target network -----------------------------------
# Number of transitions per gradient step
BATCH_SIZE = 64
# Maximum number of transitions kept in the replay buffer
BUFFER_CAPACITY = 10_000
# Copy the online weights into the target network every ... episodes
UPDATE_TARGET_EVERY = 30

# --- Exploration (epsilon-greedy) ---------------------------------------
# Epsilon used during the first episode
EPSILON_START = 0.7
# Floor that epsilon decays towards
EPSILON_MIN = 0.001
# Time constant, in episodes, of the exponential decay of epsilon
DECREASE_EPSILON = 100

# 2. Define the replay buffer

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity circular buffer of transitions.

    Each stored transition is the tuple
    (state, action, reward, next_state, done). Once `capacity` transitions
    have been stored, every new transition overwrites the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` transitions."""
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next transition will be written to.
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Saves a transition, overwriting the oldest one when the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.memory) < self.capacity:
            # Still filling up: position always equals len(memory) here.
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """
        Return `batch_size` stored transitions drawn uniformly, with replacement.

        Indices are sampled with the module-level generator `rng` and the
        transitions are then looked up by index. Passing the list of
        transitions itself to `rng.choice` would force NumPy to turn the whole
        buffer (tuples mixing arrays and scalars) into an array on every call,
        which is slow and fails on recent NumPy versions.
        """
        indices = rng.choice(len(self.memory), size=batch_size)
        return [self.memory[i] for i in indices]

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# create instance of replay buffer
# (module-level buffer that update() pushes to and samples from; capacity is BUFFER_CAPACITY)
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

# 3. Define the neural network architecture, objective and optimizer

In [None]:
class Net(nn.Module):
    """
    Fully connected network with a single ReLU hidden layer.

    Maps an input of size `obs_size` to `n_actions` outputs through
    `hidden_size` hidden units.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in input-to-output order and wrapped in one
        # Sequential stored under the attribute name `net`.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x` and return its output."""
        out = self.net(x)
        return out

In [None]:
# create network and target network
hidden_size = 128
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_net = Net(obs_size, hidden_size, n_actions)
target_net = Net(obs_size, hidden_size, n_actions)
# Start the target network as an exact copy of the online network.
# Otherwise the TD targets are computed from an unrelated random
# network until the first periodic synchronisation in train().
target_net.load_state_dict(q_net.state_dict())

# objective and optimizer
# (only q_net's parameters are optimised; target_net changes only through load_state_dict)
objective = nn.MSELoss()
optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)

# 4. Implement Deep Q-Learning

In [None]:
#
#  Some useful functions
#

def get_q(states):
    """
    Compute Q function for a list of states

    Parameters
    ----------
    states : sequence of array-like
        The states to evaluate, one entry per state.

    Returns
    -------
    numpy.ndarray
        Output of `q_net` for each state, shape (len(states), n_actions).
    """
    # Stack into one float32 ndarray first: building a tensor directly from a
    # Python list of ndarrays goes through a slow element-by-element path.
    states_v = torch.as_tensor(np.asarray(states, dtype=np.float32))
    with torch.no_grad():
        q_values = q_net(states_v)
    return q_values.numpy()

def eval_dqn(n_sim=5):
    """
    Estimate the return of the current DQN policy by simulation.

    Runs `n_sim` full episodes on a deep copy of the global environment,
    picking actions with `choose_action(state, 0.0)` (no exploration), and
    returns an array with the sum of rewards collected in each episode.
    """
    sim_env = deepcopy(env)  # leave the training environment untouched
    returns = np.zeros(n_sim)

    for episode in range(n_sim):
        obs = sim_env.reset()
        finished = False
        while not finished:
            obs, reward, finished, _ = sim_env.step(choose_action(obs, 0.0))
            returns[episode] += reward
    return returns

In [None]:
def choose_action(state, epsilon):
    """
    Epsilon-greedy action selection.

    With probability `epsilon` an action is sampled from the environment's
    action space; otherwise the action with the largest Q-value in `state`
    (as given by `get_q`) is returned.
    """
    explore = np.random.uniform() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: get_q works on a list of states, so wrap and unwrap.
    q_values = get_q([state])[0]
    return q_values.argmax()
    

def update(state, action, reward, next_state, done):
    """
    Store one transition and perform one optimisation step on `q_net`.

    The transition is pushed to `replay_buffer`. While the buffer holds fewer
    than BATCH_SIZE transitions nothing is learned and `np.inf` is returned.
    Otherwise a minibatch is sampled, the loss between Q(s, a) and the
    target-network TD targets is minimised by one optimizer step, and the
    loss value is returned as a 0-d numpy array.
    """
    replay_buffer.push(state, action, reward, next_state, done)

    # Wait until a full minibatch can be drawn.
    if len(replay_buffer) < BATCH_SIZE:
        return np.inf

    batch = replay_buffer.sample(BATCH_SIZE)

    # Split the batch column-wise: one array per transition field.
    s_col, a_col, r_col, ns_col, d_col = (
        np.array([batch[k][field] for k in range(BATCH_SIZE)])
        for field in range(5)
    )

    state_batch = torch.FloatTensor(s_col)
    action_index = torch.LongTensor(a_col).view(-1, 1)
    reward_batch = torch.FloatTensor(r_col).view(-1, 1)
    next_state_batch = torch.FloatTensor(ns_col)
    done_mask = torch.FloatTensor(d_col)

    # Q(s_i, a_i): pick, for every row, the output of the action actually taken.
    q_taken = q_net(state_batch).gather(1, action_index).squeeze(1)

    # max_a Q(s_i', a) from the target network; no gradient flows through it.
    with torch.no_grad():
        next_q_max = target_net(next_state_batch).max(dim=1)[0]
    assert next_q_max.shape == q_taken.shape

    # (1 - done) removes the bootstrap term for terminal transitions.
    td_targets = reward_batch.squeeze() + GAMMA * (1.0 - done_mask) * next_q_max
    loss = objective(q_taken, td_targets)

    # One gradient step on the online network.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.detach().numpy()

In [None]:

#
# Train
# 

# Period, in training episodes, between two evaluations with eval_dqn()
EVAL_EVERY = 5
# Training stops early once the mean evaluation reward reaches this value
REWARD_THRESHOLD = 199

def train():
    """
    Train `q_net` with Deep Q-Learning on the global environment `env`.

    At every environment step an epsilon-greedy action is taken and
    `update` is called with the observed transition. At the end of each
    episode:

    * every EVAL_EVERY completed episodes the greedy policy is evaluated
      with `eval_dqn`; training stops early if the mean evaluation reward
      reaches REWARD_THRESHOLD;
    * every UPDATE_TARGET_EVERY completed episodes the weights of `q_net`
      are copied into `target_net`;
    * epsilon is decayed exponentially from EPSILON_START towards
      EPSILON_MIN.

    Training ends after N_EPISODES episodes at the latest. Returns None.
    """
    state = env.reset()
    epsilon = EPSILON_START
    ep = 0  # number of completed training episodes
    while ep < N_EPISODES:
        action = choose_action(state, epsilon)

        # take action and update replay buffer and networks
        next_state, reward, done, _ = env.step(action)
        update(state, action, reward, next_state, done)

        # update state
        state = next_state

        if not done:
            continue

        # --- end of episode ---
        state = env.reset()
        ep += 1

        # `ep` already counts the episode that just finished, so it is the
        # number to test and to report (no further +1).
        if ep % EVAL_EVERY == 0:
            mean_reward = np.mean(eval_dqn())
            print("episode =", ep, ", reward = ", mean_reward)
            if mean_reward >= REWARD_THRESHOLD:
                break

        # update target network
        if ep % UPDATE_TARGET_EVERY == 0:
            target_net.load_state_dict(q_net.state_dict())

        # decrease epsilon
        epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
            np.exp(-1. * ep / DECREASE_EPSILON)

# Run the training loop (it may stop before N_EPISODES, see REWARD_THRESHOLD)
train()

# Evaluate the final policy over 20 episodes, without exploration
rewards = eval_dqn(20)
print("")
print("mean reward after training = ", np.mean(rewards))

# Visualize the DQN policy

In [None]:
def render_env(env):
    """
    Record one episode of the greedy DQN policy and display the video.

    A deep copy of `env` is wrapped in a gym `Monitor` writing to
    './videos' (existing recordings are overwritten, every episode is
    recorded). Actions are chosen with `choose_action(state, 0.0)`, i.e.
    without exploration. The wrapped environment is always closed, even if
    the rollout raises, and `show_video()` is called afterwards on success.
    """
    env = deepcopy(env)  # do not disturb the caller's environment
    env = Monitor(env, './videos', force=True, video_callable=lambda episode: True)
    try:
        done = False
        state = env.reset()
        env.render()
        while not done:
            action = choose_action(state, 0.0)
            state, _, done, _ = env.step(action)
            env.render()
    finally:
        # Close unconditionally so the Monitor is released even on error.
        env.close()
    show_video()

# Record and display one episode of the trained policy (works on a copy of env)
render_env(env)