# PAISS Practical Deep-RL by Criteo Research

In [None]:
%pylab inline

from utils import RLEnvironment, RLDebugger

import random

from keras.optimizers import Adam, RMSprop, SGD
from keras.layers import Dense, Conv2D, Flatten, Input, Reshape, Lambda, Add, RepeatVector
from keras.models import Sequential, Model
from keras import backend as K

In [None]:
# Build the environment wrapper (RLEnvironment is imported from utils) and
# print its observation and action spaces.
env = RLEnvironment()
print(env.observation_space, env.action_space)

## Random agent

In [None]:
class RandomAgent:
    """Baseline agent that ignores the observation and acts at random."""

    def __init__(self, action_space):
        # The action space is the only thing this agent needs to remember.
        self.action_space = action_space

    def get_action(self, state):
        """Return whatever ``action_space.sample()`` yields; ``state`` is unused."""
        sampled_action = self.action_space.sample()
        return sampled_action

## Play loop
Note that this Gym environment is considered solved as soon as you find a policy which scores 200 on average.

In [None]:
# Run the random baseline for 20 episodes (env.run comes from utils;
# display_policy presumably renders the agent while it plays — confirm).
env.run(RandomAgent(env.action_space), episodes=20, display_policy=True)


## DQN Agent - Online
Here is some Keras code for training a simple DQN. 

It is presented first for the sake of clarity. However, because the network being trained is immediately used to collect the new training data, you won't be able to solve the task this way unless you are lucky. 

Just replace the `???` by some parameters which seem reasonable to you ($\gamma>1$ is not reasonable and big steps are prone to numerical instability) and watch the failure of the policy training. 

In [5]:
class DQNAgent(RLDebugger):
    def __init__(self, observation_space, action_space):
        RLDebugger.__init__(self)
        # get size of state and action
        self.state_size = observation_space.shape[0]
        self.action_size = action_space.n
        # hyper parameters 
        self.learning_rate = ??? # recommended value range: [1e-3, 1e-1]
        self.model = self.build_model()  
        self.target_model = self.model
        
    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        model = Sequential()
        # try adding neurons. Recommended value range [10, 256]  
        # try adding layers. Recommended value range [1, 4]
        model.add(Dense(units=10, input_dim=self.state_size, activation='linear'))
        model.add(Dense(units=self.action_size, activation='linear'))
        # usual activations: 'linear', 'relu', 'tanh', 'sigmoid'
        model.compile(loss=???, optimizer=Adam(lr=self.learning_rate))
        # usual losses: 'mse', 'logcosh', 'mean_absolute_error'
        # usual optimizers: Adam, RMSprop, SGD
        model.summary() # Display summary of the network. 
                        # Check that your network contains a "reasonable" number of parameters (a few hundrers)
        return model

    # get action from model using greedy policy. 
    def get_action(self, state):
        q_value = self.model.predict(state)
        best_action = np.argmax(q_value[0]) #The [0] is because keras outputs a set of predictions of size 1
        return best_action

    # train the target network on the selected action and transition
    def train_model(self, action, state, next_state, reward, done):
        target = self.model.predict(state)
        # We use our internal model in order to estimate the V value of the next state 
        target_val = self.target_model.predict(next_state)
        # Q Learning: target values should respect the Bellman's optimality principle
        if done: #We are on a terminal state
            target[0][action] = reward
        else:
            target[0][action] = reward + self.gamma * (np.amax(target_val))

        # and do the model fit!
        loss = self.model.fit(state, target, verbose=0).history['loss'][0]
        self.record(action, state, target, target_val, loss, reward)        

NameError: name 'RLDebugger' is not defined

In [None]:
# Run the online DQN agent for 500 episodes (env.run is defined in utils;
# presumably it calls agent.get_action and agent.train_model at each step — confirm).
agent = DQNAgent(env.observation_space, env.action_space)
env.run(agent, episodes=500)

In [None]:
# plot_loss is not defined by the agent classes in this notebook; presumably
# inherited from RLDebugger (utils) and based on the values given to record() — confirm.
agent.plot_loss()

### Let's try with a fixed initial position

In [None]:
# DQNAgent.__init__ accepts only the two spaces, so passing seed=0 to it raises
# a TypeError. The seed is a keyword of env.run (presumably it fixes the
# environment's initial state — confirm in utils).
agent = DQNAgent(env.observation_space, env.action_space)
env.run(agent, episodes=500, seed=0)

In [None]:
# Same plot as for the unseeded run, now for the fixed-seed agent
# (plot_loss is presumably provided by RLDebugger in utils — confirm).
agent.plot_loss()

## DQN Agent with Exploration
This is our first agent which is going to solve the task. It will typically require running a few hundred episodes to collect the data. 

The difference with the previous agent is that you are going to add an exploration mechanism in order to take care of the data collection for the training. We advise using an $\varepsilon_n$-greedy policy, meaning that the value of $\varepsilon$ decays over time. Several kinds of decay can be found in the literature; a simple one is a multiplicative update of $\varepsilon$ by a constant smaller than 1, applied as long as $\varepsilon$ is larger than a small minimal rate (typically in the range 1%-5%).

You need to:
* Code your exploration (areas are tagged in the code by some TODOs).
* Tune the hyperparameters (including the ones from the previous section) in order to solve the task. This may not be easy and will likely require more than 500 episodes and a small final value of epsilon. The next sessions will be about techniques to increase sample efficiency (i.e. to require fewer episodes).

In [None]:
class DQNAgentWithExploration(DQNAgent):
    def __init__(self, observation_space, action_space):
        super(DQNAgentWithExploration, self).__init__(observation_space, action_space)
        # exploration schedule parameters 
        self.t = 0
        self.epsilon = ???

    # decay epsilon
    def update_epsilon(self):
        # TODO write the code for your decay  
        self.t += 1
        self.epsilon = ???

    # get action from model using greedy policy
    def get_action(self, state):
        # exploration 
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

In [None]:
# Run the epsilon-greedy agent for 500 episodes with seed 0
# (print_delay presumably sets how often env.run reports progress — confirm in utils).
agent = DQNAgentWithExploration(env.observation_space, env.action_space)
env.run(agent, episodes=500, print_delay=50, seed=0)

In [None]:
# plot_state is not defined by the agent classes in this notebook; presumably
# inherited from RLDebugger (utils) and based on the states given to record() — confirm.
agent.plot_state()

## DQN Agent with Exploration and Experience Replay

We are now going to save some samples in a limited memory in order to build minibatches during the training. The exploration policy remains the same as in the previous section. Storage is already coded; you just need to modify the tagged section, which builds the mini-batch sent to the optimizer.

In [None]:
from collections import deque

class DQNAgentWithExplorationAndReplay(DQNAgentWithExploration):
    def __init__(self, observation_space, action_space):
        super(DQNAgentWithExplorationAndReplay, self).__init__(observation_space, action_space)
        self.batch_size = ???
        self.train_start = ???
        # create replay memory using deque
        self.memory = deque(maxlen=???)

    def create_minibatch(self):
        # pick samples randomly from replay memory (using batch_size)
        
        batch_size = min(self.batch_size, len(self.memory))
        samples = random.sample(self.memory, batch_size)
        
        states = np.array([_[0][0] for _ in samples])
        actions = np.array([_[1] for _ in samples])
        rewards = np.array([_[2] for _ in samples])
        next_states = np.array([_[3][0] for _ in samples])
        dones = np.array([_[4] for _ in samples])
        
        return (states, actions, rewards, next_states, dones)
        
    def train_model(self, action, state, next_state, reward, done):
        
        # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state, done))
        
        if len(self.memory) >= self.train_start:
            states, actions, rewards, next_states, dones = self.create_minibatch()
            
            targets = self.model.predict(states)
            target_values = self.target_model.predict(next_states)

            for i in range(self.batch_size):
                # Approx Q Learning
                if dones[i]:
                    targets[i][actions[i]] = rewards[i]
                else:
                    targets[i][actions[i]] = rewards[i] + self.gamma * (np.amax(target_values[i]))
           
            # and do the model fit!
            loss = self.model.fit(states, targets, verbose=0).history['loss'][0]
            
            for i in range(self.batch_size):
                self.record(actions[i], states[i], targets[i], target_values[i], loss, rewards[i])

In [None]:
# Run the replay-based agent for 300 episodes (env.run is defined in utils).
agent = DQNAgentWithExplorationAndReplay(env.observation_space, env.action_space)
env.run(agent, episodes=300, print_delay=50)

In [None]:
# plot_state is presumably provided by RLDebugger (utils) — confirm.
agent.plot_state()

In [None]:
# plot_bellman_residual is not defined in this notebook; presumably provided by
# RLDebugger (utils) from the targets and target values given to record() — confirm.
agent.plot_bellman_residual()

## Double DQN Agent with Exploration and Experience Replay

Now we want to have two identical networks and keep the one in charge of the evaluation (*i.e.* the one used to compute the targets) frozen for some timesteps.
Note that you can find some variants where the target network is updated at each timestep but with a small fraction of the difference with the policy network.

In [None]:
class DoubleDQNAgentWithExplorationAndReplay(DQNAgentWithExplorationAndReplay):
    def __init__(self, observation_space, action_space):
        super(DoubleDQNAgentWithExplorationAndReplay, self).__init__(observation_space, action_space)
        # TODO: initialize a second model
        self.target_model = ???

    def update_target_model(self):
        # copy weights from the model used for action selection to the model used for computing targets
        self.target_model.set_weights(self.model.get_weights())

In [None]:
# Run the agent with a separate target network for 200 episodes (env.run is defined in utils).
agent = DoubleDQNAgentWithExplorationAndReplay(env.observation_space, env.action_space)
env.run(agent, episodes=200, print_delay=10)

In [None]:
# plot_diagnostics is not defined in this notebook; presumably provided by RLDebugger (utils) — confirm.
agent.plot_diagnostics()

To observe actual performance of the policy we should set $\varepsilon=0$

In [None]:
# With epsilon at 0 the test `random.random() < self.epsilon` in get_action
# never succeeds, so the agent always plays its greedy action.
agent.epsilon = 0
# Shrink the replay to the latest transition only: train_model still runs a fit
# at each call, but on a batch made of the single sample it has just stored.
agent.memory = deque(maxlen=1)
agent.batch_size = 1
agent.train_start = 1
env.run(agent, episodes=200, print_delay=33)

In [None]:
# Diagnostics for the greedy (epsilon = 0) run; plot_diagnostics is presumably
# provided by RLDebugger (utils) — confirm.
agent.plot_diagnostics()

## Dueling DQN 

If time allows, adapt the description from http://torch.ch/blog/2016/04/30/dueling_dqn.html to our setting

In [None]:
class DoubleDuelingDQNAgentWithExplorationAndReplay(DoubleDQNAgentWithExplorationAndReplay):
    def __init__(self, observation_space, action_space):
         super(DoubleDuelingDQNAgentWithExplorationAndReplay, self).__init__(observation_space, action_space)
            
    def build_model(self):
        value_input = Input(shape=(self.state_size,))
        
        # Value stream
        value_stream_hidden = Dense(???, input_dim=self.state_size, activation=???)(value_input)
        value_stream_activation = Dense(1, activation=???)(value_stream_hidden)
        repeat_value_stream = RepeatVector(self.action_size)(value_stream_activation)
        value_stream = Flatten()(repeat_value_stream)
        
        # Advantage stream
        advantage_stream_hidden = Dense(???, input_dim=self.state_size, activation=???)(value_input)
        advantage_stream_activation = Dense(self.action_size, activation=???)(advantage_stream_hidden)
        advantage_stream = Lambda(lambda layer: layer - K.mean(layer))(advantage_stream_activation)
        
        # Merge both streams
        q_values = Add()([value_stream, advantage_stream])
        
        model = Model(inputs=[value_input], outputs=q_values)
        
        model.compile(loss=???, optimizer=???(lr=self.learning_rate))

        model.summary()

        return model

In [None]:
# Run the dueling-architecture agent for 300 episodes (env.run is defined in utils).
agent = DoubleDuelingDQNAgentWithExplorationAndReplay(env.observation_space, env.action_space)
env.run(agent, episodes=300, print_delay=50)

In [None]:
# plot_diagnostics is presumably provided by RLDebugger (utils) — confirm.
agent.plot_diagnostics()

# More ideas
* Use another environment providing pictures, replace the network by a pre-trained ResNet-50 and learn a policy to play your favorite Atari game (can easily take days of computation if not done asynchronously on several V100 as described by https://github.com/dgriff777/rl_a3c_pytorch),
* Visualize important areas of the pictures for taking the decision using saliency maps computed by guided backpropagation as shown by http://arxiv.org/abs/1412.6806.
* Go to policy gradient methods 