# Deep Q-learning

Deep reinforcement learning has achieved superhuman performance in a range of domains, such as chess, Atari 2600 games, StarCraft and Go. The main difference from traditional reinforcement learning is the introduction of deep neural networks.

In deep reinforcement learning, neural networks replace the Q-table which was introduced in the previous notebook. A Q-table quickly becomes infeasible when the state space becomes large, and especially when it is continuous.

However, using neural networks to approximate the Q-values is not a straightforward task. One of the main issues of early attempts using neural networks was instability during training.

One of the first to successfully address the instability problem was DeepMind, which introduced the DQN algorithm (Deep Q-Network algorithm). This is the algorithm which will be introduced in this notebook.



In the previous notebook we divided

Introduction to game we are going to play, e.g. visit website

### Create Q-Networks
The main 

#### Task


In [None]:
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense
from keras.metrics import MSE
from keras.optimizers import Adam

class QNetwork:
    """Builder for the dense network used to approximate Q-values.

    The input width is taken from ``env.observation_space.shape[0]`` and the
    output width from ``env.action_space.n``; layer sizes, loss and optimiser
    settings are read from the ``parameters`` dictionary.
    """

    def __init__(self, env, parameters):
        """Read the network dimensions and training settings.

        Args:
            env: Object exposing ``observation_space.shape`` and
                ``action_space.n``.
            parameters: Mapping with the keys ``learning_rate``,
                ``learning_rate_decay``, ``loss_metric``, ``hidden_layer_1``
                and ``hidden_layer_2``.
        """
        observation_shape = env.observation_space.shape
        self.observations_size = observation_shape[0]
        self.action_size = env.action_space.n

        # Optimiser and loss configuration.
        self.learning_rate = parameters["learning_rate"]
        self.learning_rate_decay = parameters["learning_rate_decay"]
        self.loss_metric = parameters["loss_metric"]

        # Widths of the two hidden layers.
        self.hidden_layer_1 = parameters["hidden_layer_1"]
        self.hidden_layer_2 = parameters["hidden_layer_2"]

    def build_q_network(self):
        """Create and compile a new network.

        Returns:
            A compiled ``Sequential`` model with two ReLU hidden layers and a
            linear output layer holding one unit per action.
        """
        network = Sequential([
            Dense(self.hidden_layer_1, input_dim=self.observations_size, activation='relu'),
            Dense(self.hidden_layer_2, activation='relu'),
            # Linear output: one unbounded Q-value estimate per action.
            Dense(self.action_size, activation='linear'),
        ])
        optimizer = Adam(lr=self.learning_rate, decay=self.learning_rate_decay)
        network.compile(loss=self.loss_metric, optimizer=optimizer)
        return network

## Create Experience Replay

Introduction to Experience replay

In [None]:
import numpy as np
import random
from collections import deque, namedtuple

class ExperienceReplay:
    """Bounded FIFO buffer of transitions used to build training batches.

    Each stored item is an ``Experience`` named tuple with the fields
    ``state``, ``action``, ``reward``, ``done`` and ``new_state``. Once
    ``buffer_size`` items are stored, appending drops the oldest one.
    """

    def __init__(self, parameters):
        """Create an empty buffer.

        Args:
            parameters: Mapping with the keys ``buffer_size`` (maximum number
                of stored transitions) and ``batch_size`` (number of
                transitions returned by ``get_batch``).
        """
        self.buffer_size = parameters["buffer_size"]
        self.batch_size = parameters["batch_size"]
        self.experience_buffer = deque(maxlen=self.buffer_size)

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "done", "new_state"],
        )

    def add_experience(self, state, action, reward, done, new_state):
        """Append one transition to the buffer."""
        self.experience_buffer.append(
            self.experience(state, action, reward, done, new_state)
        )

    def get_batch(self):
        """Return a batch of stored transitions as stacked NumPy arrays.

        While fewer than ``batch_size`` transitions are stored, the whole
        buffer is returned in insertion order; otherwise ``batch_size``
        transitions are drawn at random without replacement.

        Returns:
            Tuple ``(states, actions, rewards, dones, new_states)``; each
            element is the ``np.vstack`` of the corresponding field, so it has
            one row per transition.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.experience_buffer:
            # np.vstack would also raise ValueError here, with a less helpful message.
            raise ValueError("Cannot build a batch from an empty experience buffer")

        if len(self.experience_buffer) < self.batch_size:
            experiences = self.experience_buffer
        else:
            experiences = random.sample(self.experience_buffer, self.batch_size)

        states = np.vstack([e.state for e in experiences])
        actions = np.vstack([e.action for e in experiences])
        rewards = np.vstack([e.reward for e in experiences])
        new_states = np.vstack([e.new_state for e in experiences])
        dones = np.vstack([e.done for e in experiences])

        return (states, actions, rewards, dones, new_states)

    def warm_up(self, env, episodes=10):
        """Pre-fill the buffer by playing episodes with random actions.

        Args:
            env: Environment whose ``reset()`` returns a state and whose
                ``step(action)`` returns ``(new_state, reward, done, info)``.
            episodes: Number of complete episodes to play. Defaults to 10.
        """
        for _ in range(episodes):
            state = env.reset()
            done = False

            while not done:
                action = env.action_space.sample()
                new_state, reward, done, _info = env.step(action)
                self.add_experience(state, action, reward, done, new_state)
                state = new_state

## Create Agent

Explain DQN algorithm
 * Similarities between tabular and neural networks
 * Similar action selection as earlier
 * Soft updates using target network

In [None]:
import os
import datetime
import gym

class Agent:
    """DQN agent with a local network, a target network and a replay buffer.

    The local network is the one that is fitted and used to pick actions; the
    target network supplies the next-state Q-values and follows the local
    network through soft updates weighted by ``tau``.
    """

    def __init__(self, env, parameters):
        """Build both networks and the replay buffer.

        Args:
            env: Environment used to size the networks and to sample random
                actions during exploration.
            parameters: Mapping with the keys ``epsilon_init``,
                ``epsilon_decay``, ``epsilon_minimum``, ``tau``, ``gamma`` and
                ``epochs``, plus whatever ``QNetwork`` and
                ``ExperienceReplay`` read from it.
        """
        self.env = env
        self.local_network = QNetwork(env, parameters).build_q_network()
        self.target_network = QNetwork(env, parameters).build_q_network()
        self.experience_replay = ExperienceReplay(parameters)

        self.epsilon = parameters["epsilon_init"]
        self.epsilon_decay = parameters["epsilon_decay"]
        self.epsilon_minimum = parameters["epsilon_minimum"]
        self.tau = parameters["tau"]
        self.gamma = parameters["gamma"]
        self.epochs = parameters["epochs"]

    def learn(self):
        """Fit the local network on one batch from the replay buffer."""
        states, actions, rewards, dones, next_states = self.experience_replay.get_batch()

        # Q-values of the next states, estimated by the target network.
        next_q = self.target_network.predict(next_states)
        best_next_q = np.amax(next_q, axis=1).reshape(-1, 1)

        # Q-learning target: reward plus discounted best next value; the
        # bootstrap term is zeroed for transitions that ended the episode.
        not_done = 1 - np.asarray(dones, dtype=float)
        td_targets = rewards + self.gamma * best_next_q * not_done

        # Start from the local network's own predictions so that only the
        # entry of the action actually taken contributes to the loss.
        q_values = self.local_network.predict(states)
        taken = np.asarray(actions).reshape(-1).astype(int)
        rows = np.arange(len(taken))
        q_values[rows, taken] = np.asarray(td_targets).reshape(-1)

        self.local_network.fit(states, q_values, epochs=self.epochs, verbose=0)

    def update_target_network(self):
        """Soft-update the target weights towards the local weights.

        Every weight array becomes ``tau * local + (1 - tau) * target``.
        """
        blended = [
            self.tau * local + (1 - self.tau) * target
            for local, target in zip(
                self.local_network.get_weights(),
                self.target_network.get_weights(),
            )
        ]
        self.target_network.set_weights(blended)

    def update_epsilon(self):
        """Decay epsilon by ``epsilon_decay`` without going below the minimum.

        An epsilon that is already at or below ``epsilon_minimum`` is left
        untouched.
        """
        if self.epsilon > self.epsilon_minimum:
            self.epsilon = max(self.epsilon_minimum, self.epsilon * self.epsilon_decay)

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a random action is sampled from the
        agent's own environment; otherwise the action with the highest
        predicted Q-value from the local network is returned.
        """
        if self.epsilon > np.random.uniform():
            return self.env.action_space.sample()
        return np.argmax(self.local_network.predict(np.array([state])))

    def step(self, env, state):
        """Select an action for ``state`` and apply it to ``env``.

        Returns:
            Tuple ``(action, reward, done, new_state)``.
        """
        action = self.select_action(state)
        new_state, reward, done, _info = env.step(action)

        return action, reward, done, new_state

    def save(self):
        """Save both networks into a new timestamped directory.

        The directory is created under the current working directory and is
        named ``<env id>_<YYYY-mm-dd_HH-MM-SS>``.

        Returns:
            The path of the directory holding ``target_network.h5`` and
            ``local_network.h5``; it can be passed to ``load``.
        """
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        save_dir = os.path.join(os.getcwd(), self.env.spec.id + "_" + timestamp)
        os.makedirs(save_dir, exist_ok=True)
        self.target_network.save(os.path.join(save_dir, "target_network.h5"))
        self.local_network.save(os.path.join(save_dir, "local_network.h5"))
        print("Weights saved successfully")
        return save_dir

    def load(self, path):
        """Load weights for both networks from the directory ``path``.

        Expects the files ``target_network.h5`` and ``local_network.h5``
        inside ``path``.
        """
        self.target_network.load_weights(os.path.join(path, "target_network.h5"))
        self.local_network.load_weights(os.path.join(path, "local_network.h5"))
        print("Weights loaded successfully")

### Train

In [None]:
def train(agent, env, iterations, episodes):
    """Train ``agent`` on ``env`` and print a progress line every 10 episodes.

    The replay buffer is warmed up first. Every step then stores the
    transition, fits the local network on a batch and soft-updates the target
    network. Epsilon is decayed once at the start of every episode except the
    first.

    Args:
        agent: Agent providing ``step``, ``learn``, ``update_target_network``,
            ``update_epsilon``, ``epsilon`` and ``experience_replay``.
        env: Environment whose ``reset()`` returns the initial state.
        iterations: Maximum number of steps per episode.
        episodes: Number of episodes to run.
    """
    reward_window, steps_window = [], []
    agent.experience_replay.warm_up(env)

    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        # Counted explicitly so the value is defined even when iterations is 0.
        steps_taken = 0

        if episode != 0:
            agent.update_epsilon()

        for _ in range(iterations):
            action, reward, done, new_state = agent.step(env, state)
            agent.experience_replay.add_experience(state, action, reward, done, new_state)

            state = new_state

            agent.learn()
            agent.update_target_network()
            episode_reward += reward
            steps_taken += 1

            if done:
                break

        reward_window.append(episode_reward)
        steps_window.append(steps_taken)

        if episode % 10 == 0:
            # Averages cover the episodes since the previous report; the
            # windows are cleared after printing.
            print("Episode: {} | Average iterations: {} | Average total reward: {} | Epsilon: {} "
                  .format(episode, np.mean(steps_window), np.mean(reward_window), agent.epsilon))
            reward_window.clear()
            steps_window.clear()

### Play

In [None]:
def play(agent, env):
    """Run one greedy episode and print the total reward.

    Exploration is switched off for the episode by setting ``agent.epsilon``
    to 0; the previous value is restored afterwards, also when the episode
    raises, so that later training is not left without exploration.

    Args:
        agent: Agent providing ``step(env, state)`` and an ``epsilon``
            attribute.
        env: Environment whose ``reset()`` returns the initial state.

    Returns:
        The sum of the rewards collected during the episode.
    """
    previous_epsilon = agent.epsilon
    agent.epsilon = 0
    total_reward = 0

    try:
        state = env.reset()
        done = False
        while not done:
            _action, reward, done, state = agent.step(env, state)
            total_reward += reward
    finally:
        agent.epsilon = previous_epsilon

    print("Total Reward: {}".format(total_reward))
    return total_reward

## Use agent

In [None]:
def get_hyperparameters(env):
    """Return the hyperparameter dictionary for the given environment.

    The environment is identified by ``env.spec.id``. "CartPole-v0",
    "CartPole-v1" and "LunarLander-v2" have dedicated settings; any other id
    gets the standard values. A line naming the chosen set is printed.

    Args:
        env: Environment exposing ``spec.id``.

    Returns:
        A new dict with the keys ``tau``, ``gamma``, ``epsilon_init``,
        ``epsilon_decay``, ``epsilon_minimum``, ``buffer_size``,
        ``batch_size``, ``epochs``, ``loss_metric``, ``learning_rate``,
        ``learning_rate_decay``, ``hidden_layer_1`` and ``hidden_layer_2``.
    """
    # Standard values; also exactly the "CartPole-v0" settings.
    standard = {
        "tau": 0.05,
        "gamma": 0.95,
        "epsilon_init": 1,
        "epsilon_decay": 0.97,
        "epsilon_minimum": 0.01,
        "buffer_size": 10000,
        "batch_size": 32,
        "epochs": 1,
        "loss_metric": "mse",
        "learning_rate": 0.01,
        "learning_rate_decay": 0.01,
        "hidden_layer_1": 24,
        "hidden_layer_2": 24,
    }

    # Per-environment deviations from the standard values.
    tuned = {
        "CartPole-v0": {},
        "CartPole-v1": {
            "gamma": 0.99,
            "epsilon_decay": 0.999,
            "buffer_size": 2000,
            "batch_size": 64,
        },
        "LunarLander-v2": {
            "gamma": 0.99,
            "epsilon_decay": 0.999,
            "buffer_size": 2500,
            "batch_size": 64,
        },
    }

    env_id = env.spec.id
    overrides = tuned.get(env_id)

    if overrides is None:
        print("Standard hyperparameters {} chosen!".format(env_id))
        overrides = {}
    else:
        print("Hyperparameters for {} chosen!".format(env_id))

    # Updating existing keys keeps the original key order.
    parameters = dict(standard)
    parameters.update(overrides)
    return parameters

In [None]:
import gym

# Id of the Gym environment to train on. get_hyperparameters has dedicated
# settings for "CartPole-v0", "CartPole-v1" and "LunarLander-v2" and falls
# back to standard values for any other id.
environment = "CartPole-v1"

env = gym.make(environment)

# Training budget: number of episodes, and the maximum number of steps
# taken within a single episode.
episodes = 3000
iterations = 500
parameters = get_hyperparameters(env)

# Build the agent for this environment and start training; progress is
# printed by train every 10 episodes.
dqn_agent = Agent(env, parameters)
train(dqn_agent, env, iterations, episodes)

## Run - Record - Show : One simulation

In [None]:
# Record the trained agent: wrap a fresh environment in a Monitor that writes
# its output to the "videos" directory (force=True is passed so an existing
# directory is reused).
env = gym.make("CartPole-v1")
monitor = gym.wrappers.Monitor(env, directory="videos", force=True)

try:
    # Why are three simulations needed for saving?!
    # NOTE(review): presumably the Monitor only finalises a video when the
    # next episode starts or on close(), and only records some episodes by
    # default - confirm against the gym.wrappers.Monitor documentation.
    for _ in range(3):
        play(dqn_agent, monitor)
finally:
    # Close the recorder and the environment even if an episode fails, so a
    # partially written recording is still flushed.
    monitor.close()
    env.close()

#HTML("""
#<video width="640" height="480" controls>
#  <source src="{}" type="video/mp4">
#</video>
#""".format("./videos/"+list(filter(lambda s: s.endswith(".mp4"), os.listdir("./videos/")))[-1]))

### If we want to plot
from matplotlib import pyplot as plt <br>
from IPython.display import display, clear_output

fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()
reward_list, episode_list, iteration_list = [], [], []

episode_list.append(episode)
reward_list.append(total_reward)

if episode % 15 == 0:
    ax.clear()
    ax.plot(episode_list, reward_list)
    fig.canvas.draw()