# Deep Q-learning

Introduction to Deep Q-learning (DQN)
Introduction to the game we are going to play, e.g. visit the website below:

https://keon.io/deep-q-learning/


## Create Q-Networks

Introduction to neural networks and Keras

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.metrics import MSE
from keras.optimizers import Adam

class Q_Network:
    """Builder for the fully connected Keras model that approximates Q-values.

    The input width is taken from ``env.observation_space.shape[0]`` and the
    output width (one Q-value per action) from ``env.action_space.n``.
    """

    def __init__(self, env, parameters):
        """Read network dimensions from ``env`` and hyperparameters from ``parameters``.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
            parameters: Mapping with the keys ``"learning_rate"`` and
                ``"loss_metric"``.
        """
        self.observations_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.learning_rate = parameters["learning_rate"]
        self.loss_metric = parameters["loss_metric"]

    def build_Q_network(self, env=None):
        """Create, compile and return the Q-network.

        Args:
            env: Unused; kept (now optional) so existing callers that pass the
                environment keep working.

        Returns:
            A compiled ``Sequential`` model with two hidden ReLU layers of 24
            units and a linear output layer with one unit per action.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.observations_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Linear output: Q-values are unbounded regression targets.
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` replaces the deprecated `lr` keyword, which newer
        # Keras releases no longer accept.
        model.compile(loss=self.loss_metric,
                      optimizer=Adam(learning_rate=self.learning_rate))

        model.summary()

        return model

## Create Experience Replay

Introduction to Experience replay

In [None]:
import numpy as np
import random
from collections import deque, namedtuple

class ExperienceReplay:
    """Fixed-size buffer of transitions with random mini-batch sampling."""

    def __init__(self, parameters):
        """Create an empty buffer.

        Args:
            parameters: Mapping with the keys ``"buffer_size"`` (maximum number
                of stored transitions; the oldest are dropped first) and
                ``"batch_size"`` (number of transitions per sampled batch).
        """
        self.buffer_size = parameters["buffer_size"]
        self.batch_size = parameters["batch_size"]
        self.experience_buffer = deque(maxlen=self.buffer_size)

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "new_state", "done"],
        )

    def add_experience(self, state, action, reward, new_state, done):
        """Append one transition to the buffer."""
        e = self.experience(state, action, reward, new_state, done)
        self.experience_buffer.append(e)

    def get_batch(self):
        """Return a batch of stored transitions as stacked arrays.

        While fewer than ``batch_size`` transitions are stored, the whole
        buffer is returned; otherwise ``batch_size`` transitions are sampled
        without replacement.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)``.
            ``states`` and ``new_states`` have one row per transition;
            the other three are 1-D arrays with one entry per transition.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.experience_buffer:
            raise ValueError("Cannot sample a batch from an empty experience buffer")

        if len(self.experience_buffer) < self.batch_size:
            experiences = list(self.experience_buffer)
        else:
            experiences = random.sample(self.experience_buffer, self.batch_size)

        # vstack keeps one row per transition; hstack would concatenate the
        # observations of different transitions along the feature axis.
        states = np.vstack([e.state for e in experiences])
        actions = np.array([e.action for e in experiences])
        rewards = np.array([e.reward for e in experiences])
        new_states = np.vstack([e.new_state for e in experiences])
        dones = np.array([e.done for e in experiences])

        return (states, actions, rewards, new_states, dones)

## Create Agent

Explain DQN algorithm
 * Similarities between tabular and neural networks
 * Similar action selection as earlier
 * Soft updates using target network

In [None]:
class Agent:
    """DQN agent with a local network, a target network and a replay buffer."""

    def __init__(self, env, parameters):
        """Build both Q-networks and the replay buffer.

        Args:
            env: Environment; used to size the networks and to sample random
                actions during exploration.
            parameters: Mapping with the keys ``"epsilon_init"``,
                ``"epsilon_decay"`` and ``"tau"``, plus the keys needed by
                ``Q_Network`` and ``ExperienceReplay``. The optional key
                ``"gamma"`` (discount factor) defaults to 0.99.
        """
        # Keep a reference so select_action does not rely on a global `env`.
        self.env = env
        self.local_network = Q_Network(env, parameters).build_Q_network(env)
        self.target_network = Q_Network(env, parameters).build_Q_network(env)
        self.experience_replay = ExperienceReplay(parameters)

        self.epsilon = parameters["epsilon_init"]
        self.epsilon_decay = parameters["epsilon_decay"]
        self.tau = parameters["tau"]
        self.gamma = parameters.get("gamma", 0.99)

    def learn(self):
        """Fit the local network on one batch drawn from the replay buffer.

        Does nothing while the replay buffer is empty.
        """
        if not self.experience_replay.experience_buffer:
            return

        states, actions, rewards, next_states, dones = self.experience_replay.get_batch()

        batch_size = len(actions)
        # Force one row per transition, whatever layout the buffer returned.
        states = np.reshape(states, (batch_size, -1))
        next_states = np.reshape(next_states, (batch_size, -1))
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        not_done = 1.0 - np.asarray(dones, dtype=float)

        # Value of the best next action according to the target network.
        next_q = np.asarray(self.target_network.predict(next_states, verbose=0))
        best_next_q = np.max(next_q, axis=1)

        # If done, the target is just the reward (no bootstrapping).
        td_targets = rewards + self.gamma * best_next_q * not_done

        # Start from the current predictions so only the Q-value of the action
        # actually taken contributes to the loss.
        q_targets = np.array(self.local_network.predict(states, verbose=0))
        q_targets[np.arange(batch_size), actions] = td_targets

        self.local_network.fit(states, q_targets, epochs=1, verbose=0)

    def update_target_network(self):
        """Soft-update the target network towards the local network.

        Each target weight array becomes
        ``tau * local + (1 - tau) * target``.
        """
        local_weights = self.local_network.get_weights()
        target_weights = self.target_network.get_weights()

        # get_weights() returns a list of arrays, so blend element by element.
        blended = [
            self.tau * local + (1.0 - self.tau) * target
            for local, target in zip(local_weights, target_weights)
        ]
        self.target_network.set_weights(blended)

    def update_epsilon(self):
        """Decay the exploration rate by the factor ``epsilon_decay``."""
        self.epsilon *= self.epsilon_decay

    def select_action(self, state):
        """Choose an action epsilon-greedily.

        Exploration is switched off once epsilon has dropped to 0.01 or below.

        Args:
            state: A single observation.

        Returns:
            The chosen action: a random sample from the action space when
            exploring, otherwise the integer index of the highest Q-value.
        """
        if (self.epsilon > 0.01) and (self.epsilon > np.random.uniform()):
            return self.env.action_space.sample()

        # The network expects a batch, so add a leading batch dimension.
        q_values = self.local_network.predict(np.reshape(state, (1, -1)), verbose=0)
        return int(np.argmax(q_values))
    

## Train agent

NEED TO WORK ON DIMENSIONS

In [None]:
import gym

env = gym.make("CartPole-v0")

episodes = 100
iterations = 50

parameters = {
    "tau" : 0.1,
    "epsilon_init" : 1,
    "epsilon_decay" : 0.95,
    "buffer_size" : 3000,
    "batch_size" : 30,
    "loss_metric" : "mse",
    "learning_rate" : 0.05}


dqn_agent = Agent(env, parameters)

for episode in range(episodes):

    # Decay the exploration rate once per episode.
    dqn_agent.update_epsilon()

    # Every episode starts from a freshly reset environment.
    # NOTE(review): assumes the classic Gym API, where reset() returns only the
    # observation and step() returns a 4-tuple - confirm against the installed
    # gym version.
    state = env.reset()

    for _ in range(iterations):
        action = dqn_agent.select_action(state)
        new_state, reward, done, _info = env.step(action)

        # Store the transition, then train on a batch from the replay buffer
        # and move the target network a little towards the local network.
        dqn_agent.experience_replay.add_experience(state, action, reward, new_state, done)
        dqn_agent.learn()
        dqn_agent.update_target_network()

        state = new_state
        if done:
            break

## Run - Record - Show : One simulation

In [None]:
import os

# HTML is provided by the notebook runtime (IPython); it was used here without
# ever being imported.
from IPython.display import HTML

env = gym.make("CartPole-v0")
monitor = gym.wrappers.Monitor(env, directory="videos", force=True)

#Why is three simulations needed for saving?!
for _ in range(3):
    # Start each simulation as "not done" so the loop below actually runs.
    done = False
    state = monitor.reset()

    while not done:
        # Select action
        action = dqn_agent.select_action(state)
        # NOTE(review): assumes the classic Gym 4-tuple step() API - confirm.
        state, _reward, done, _info = monitor.step(action)

monitor.close()
env.close()

# os.listdir() returns names in arbitrary order; sort so that [-1] picks the
# same file on every run.
video_files = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/" + video_files[-1]))