<a href="https://colab.research.google.com/github/chambai/Deep_Learning_Course/blob/main/Week%203%20Deep%20RL%202/CartPoleDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q Network (DQN) for CartPole Using Boltzmann Q Policy
This exercise implements a DQN for CartPole using a Boltzmann Q policy for selecting the actions

In [None]:
# install keras rl2 (we need to install keras-rl2 so it works with the tensorflow 2 version that comes pre-installed with colab)
!pip install keras-rl2

In [None]:
!pip install gym

In [None]:
# --- environment and plotting ---
import gym                       # provides the CartPole environment
import matplotlib.pyplot as plt  # used to plot the training history

# --- Keras pieces used to build the Q-network ---
from keras import Sequential
from keras.layers import Input, Flatten, Dense
from keras.optimizers import Adam

# Build the environment once; the cells that follow all share this `env` object.
ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)

Implementation of DQN for CartPole, applying policy BoltzmannQPolicy

In [None]:
import rl
from rl.memory import SequentialMemory  # import the experience replay buffer module
from rl.policy import BoltzmannQPolicy  # import the policy
from rl.agents.dqn import DQNAgent      # import the DQN agent

# EXERCISE TEMPLATE: every ??? below is a placeholder that must be replaced before
# this cell is run -- as written it is not valid Python.

# setup experience replay buffer
# here the sequential memory limit is set to the same value as the nb_steps (number of steps)
# argument passed to dqn.fit below.  This means that all the action-states will fit into the
# memory buffer
# keep window_length as 1. It's used in other RL methods, but keep it to 1 in DQNs
memory = SequentialMemory(limit=10000, window_length=1)

# define the policy (how we select the actions)
policy = BoltzmannQPolicy()

# Q-Network
model = Sequential()
model.add(Input(shape=(1,???)))  # TODO: replace ??? with the number of values in one observation
model.add(Flatten())
# add extra layers here
model.add(Dense(???, activation='relu'))  # TODO: replace ??? with the number of hidden units
model.add(Dense(env.action_space.n, activation='???'))  # TODO: replace ??? with the output activation; one output per action
print(model.summary())

# define the agent
dqn = DQNAgent(model=model,                     # Q-Network model
               nb_actions=???,                  # TODO: replace ??? with the number of actions
               memory=memory,                   # experience replay memory
               nb_steps_warmup=10,              # how many steps are waited before starting experience replay
               target_model_update=1e-2,        # target network update setting -- NOTE(review): keras-rl is understood to treat values < 1 as a soft-update rate rather than a step interval; confirm
               policy=policy)                   # the action selection policy

# compile the agent with the Adam optimizer (learning rate 1e-3), tracking mean absolute error
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# train for 10000 steps without rendering the environment
history = dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

# plot the number of steps taken in each training episode
plt.plot(history.history['nb_episode_steps'])
plt.ylabel('nb_episode_steps')
plt.xlabel('episodes')
plt.show()

# run the trained agent for 20 test episodes without rendering
dqn.test(env, nb_episodes=20, visualize=False)

## Implement DQN with BoltzmannGumbelQPolicy instead of BoltzmannQPolicy

In [None]:
from rl.policy import BoltzmannGumbelQPolicy

# setup experience replay buffer
# the limit equals the nb_steps value passed to dqn.fit below, so every training step fits in the buffer
memory = SequentialMemory(limit=10000, window_length=1)

# Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning
# based on the paper (https://arxiv.org/pdf/1705.10257.pdf).
policy = BoltzmannGumbelQPolicy()

# Q-Network
model = Sequential()
model.add(Input(shape=(1, env.observation_space.shape[0])))  # The input is 1 observation vector, and the number of observations in that vector
model.add(Flatten())
# add extra layers here
model.add(Dense(16, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))   # the output is the number of actions in the action space
# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the cell output
model.summary()

# define the agent
dqn = DQNAgent(model=model,                     # Q-Network model
               nb_actions=env.action_space.n,   # number of actions
               memory=memory,                   # experience replay memory
               nb_steps_warmup=10,              # how many steps are waited before starting experience replay
               target_model_update=1e-2,        # target network update setting -- NOTE(review): keras-rl is understood to treat values < 1 as a soft-update rate rather than a step interval; confirm
               policy=policy)                   # the action selection policy

# compile the agent with the Adam optimizer (learning rate 1e-3), tracking mean absolute error
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# train for 10000 steps without rendering the environment
history = dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

# plot the number of steps taken in each training episode
plt.plot(history.history['nb_episode_steps'])
plt.title('DQN with BoltzmannGumbelQPolicy')   # lets this plot be told apart from the other experiments
plt.ylabel('nb_episode_steps')
plt.xlabel('episodes')
plt.show()

# run the trained agent for 20 test episodes without rendering
dqn.test(env, nb_episodes=20, visualize=False)

## Implement DQN with BoltzmannQPolicy and LinearAnnealedPolicy, changing the tau parameter

In [None]:
# LinearAnnealedPolicy must be imported here: it is used below and is not imported by any earlier cell
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy

# setup experience replay buffer
# the limit equals the nb_steps value passed to dqn.fit below, so every training step fits in the buffer
memory = SequentialMemory(limit=10000, window_length=1)

# setup the Linear annealed policy with the BoltzmannQPolicy as the inner policy
policy = LinearAnnealedPolicy(inner_policy=BoltzmannQPolicy(),   # policy used to select actions
                              attr='tau',                        # attribute in the inner policy to vary
                              value_max=1,                       # value of tau at the start of training
                              value_min=.1,                      # value tau is annealed down to (it is not reduced below this)
                              value_test=.05,                    # value of tau used when the agent is not training (i.e. in dqn.test)
                              nb_steps=10000)                    # the number of steps between value_max and value_min

# Q-Network
model = Sequential()
model.add(Input(shape=(1, env.observation_space.shape[0])))  # The input is 1 observation vector, and the number of observations in that vector
model.add(Flatten())
model.add(Dense(16, activation='relu'))
# add extra layers here
model.add(Dense(env.action_space.n, activation='linear'))   # the output is the number of actions in the action space
# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the cell output
model.summary()

# define the agent
dqn = DQNAgent(model=model,                     # Q-Network model
               nb_actions=env.action_space.n,   # number of actions
               memory=memory,                   # experience replay memory
               nb_steps_warmup=10,              # how many steps are waited before starting experience replay
               target_model_update=1e-2,        # target network update setting -- NOTE(review): keras-rl is understood to treat values < 1 as a soft-update rate rather than a step interval; confirm
               policy=policy)                   # the action selection policy

# compile the agent with the Adam optimizer (learning rate 1e-3), tracking mean absolute error
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# train for 10000 steps without rendering the environment
history = dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

# plot the number of steps taken in each training episode
plt.plot(history.history['nb_episode_steps'])
plt.title('DQN with linearly annealed BoltzmannQPolicy (tau)')   # lets this plot be told apart from the other experiments
plt.ylabel('nb_episode_steps')
plt.xlabel('episodes')
plt.show()

# run the trained agent for 20 test episodes without rendering
dqn.test(env, nb_episodes=20, visualize=False)

## Double DQN Network
Implement a double DQN Network with BoltzmannQPolicy
Add layers to the Q-Network and analyse the results

## Dueling DQN Network
Implement a dueling DQN Network with BoltzmannQPolicy
Add layers to the Q-Network and analyse the results

## Double Dueling DQN
Implement a double dueling DQN Network with BoltzmannQPolicy
Add layers to the Q-Network and analyse the results