# Continuous Control

The environment for this project is [Reacher](https://github.com/udacity/deep-reinforcement-learning/tree/master/p2_continuous_control) from Unity, and it's provided in the `setup` folder. We'll implement the A2C algorithm, the synchronous version of [A3C](https://arxiv.org/abs/1602.01783) (here the input is a vector observation rather than raw pixels). Results will be shown in this notebook and the best solution will be implemented in `main.py`.

![final](imgs/gif.gif "final")

> The model used to generate this gif is `final.pth` (Dueling Double DQN), which was trained for 700 episodes using `main.py`.

## 1. Prepare dependencies and environment

Take a look at README.md before executing this notebook and make sure that the kernel is set to **p2_continuous_control**.

In [1]:
!pip -q install ./setup

import sys
import platform

import numpy as np
from numpy_ringbuffer import RingBuffer
from scipy import signal
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal
from torch import nn

from setup import unityagents
from unityagents import UnityEnvironment

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Unity environments contain **brains**, our interfaces for controlling agents. We'll be controlling the first (default) brain in the environment. It's also useful to keep information such as `state_size` and `action_size`.

In [2]:
# start the Reacher build that matches the current operating system
system = platform.system()
if system == 'Linux':
    env = UnityEnvironment(file_name="setup/Reacher_Linux/Reacher.x86_64")
elif system == 'Darwin':
    env = UnityEnvironment(file_name="setup/Reacher.app")
elif system == 'Windows':
    env = UnityEnvironment(file_name="setup/Reacher_Windows_x86_64/Reacher.exe")
else:
    # fail here with a clear message instead of continuing with env = None
    # and crashing on the first attribute access below
    raise RuntimeError('Cannot find environment for this system.')

# use the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
action_size = brain.vector_action_space_size
# observations have one row per agent, so the second axis is the state size
state_size = env_info.vector_observations.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## 2. Actor



In [6]:
class Actor(nn.Module):
    """Policy network that maps a state to the parameters of a normal distribution.

    The network is a stack of fully connected layers with ReLU activations.
    Its output is reshaped to ``(2, action_size)``: row 0 holds the mean of
    each action and row 1 the (strictly non-negative) spread of each action.

    NOTE: ``torch.distributions.Normal`` interprets its second argument as the
    standard deviation (``scale``), so row 1 is used as sigma, not sigma^2.

    Args:
        state_size: number of features in a state.
        action_size: number of action dimensions.
        hidden_layers: sizes of the hidden layers, in order.
    """

    def __init__(self, state_size, action_size, hidden_layers=(64, 128, 64)):
        super().__init__()
        self.action_size = action_size

        # prepare the first hidden layer
        self.hidden_layers = nn.ModuleList([nn.Linear(state_size, hidden_layers[0])])

        # prepare the rest of the hidden layers (consecutive size pairs)
        self.hidden_layers.extend(
            [nn.Linear(a, b) for a, b in zip(hidden_layers[:-1], hidden_layers[1:])]
        )

        # the actor outputs the parameters of a normal distribution, so each
        # action needs a mean and a spread (thus we double the action size)
        self.output_layer = nn.Linear(hidden_layers[-1], action_size * 2)

    def forward(self, state):
        """Return a ``(2, action_size)`` tensor: row 0 is mu, row 1 is sigma."""
        # connect layers to each other and put relu activations between them
        for layer in self.hidden_layers:
            state = F.relu(layer(state))
        state = self.output_layer(state)

        # reshape output in two rows (mu and sigma)
        raw_params = state.view(2, self.action_size)

        # mu is linear while sigma uses a softplus; a new tensor is built
        # instead of writing into the view in place, because softplus keeps its
        # input for the backward pass and an in-place write would invalidate it
        mu = raw_params[0, :]
        sigma = F.softplus(raw_params[1, :])
        return torch.stack((mu, sigma), dim=0)

    # -- some utility functions -- #

    def distribution(self, distr_params):
        """Build the normal distribution described by the output of ``forward``."""
        return Normal(distr_params[0, :], distr_params[1, :])

    def act(self, distribution):
        """Sample an action from a state's distribution.

        The policy is naturally stochastic, so there is no need to force
        randomness with a strategy like eps-greedy.
        """
        return distribution.sample()

    def log_prob(self, distributions, actions):
        """Return the log-density of each action under its own distribution.

        Args:
            distributions: iterable of distributions, one per step.
            actions: iterable of actions, paired with ``distributions``.

        Returns:
            The per-step results of ``log_prob`` stacked along a new first
            dimension (one row per step), or an empty ``(0, action_size)``
            tensor when there are no steps.
        """
        log_probs = [distr.log_prob(a) for distr, a in zip(distributions, actions)]
        if not log_probs:
            return torch.empty((0, self.action_size))
        return torch.stack(log_probs, dim=0)

    def entropy(self, distributions):
        """Return the entropy of each distribution, one row per step.

        For a normal distribution the entropy depends only on sigma; it is
        meant to be maximized so that some exploration is always encouraged.
        Returns an empty ``(0, action_size)`` tensor when there are no steps.
        """
        entropies = [distr.entropy() for distr in distributions]
        if not entropies:
            return torch.empty((0, self.action_size))
        return torch.stack(entropies, dim=0)
        

## 3. Critic


In [7]:
class Critic(nn.Module):
    """State-value network: maps a state to a single scalar estimate V(s).

    The network is a stack of fully connected layers with ReLU activations
    followed by a linear output layer with one unit.

    Args:
        state_size: number of features in a state.
        hidden_layers: sizes of the hidden layers, in order.
    """

    def __init__(self, state_size, hidden_layers=(64, 128, 64)):
        super().__init__()

        # prepare the first hidden layer
        self.hidden_layers = nn.ModuleList([nn.Linear(state_size, hidden_layers[0])])

        # prepare the rest of the hidden layers (consecutive size pairs)
        self.hidden_layers.extend(
            [nn.Linear(a, b) for a, b in zip(hidden_layers[:-1], hidden_layers[1:])]
        )

        # the critic outputs only a scalar V(s)
        self.output_layer = nn.Linear(hidden_layers[-1], 1)

    def forward(self, state):
        """Return V(s); the last dimension of the result has size 1."""
        # connect layers to each other and put relu activations between them
        for layer in self.hidden_layers:
            state = F.relu(layer(state))
        return self.output_layer(state)

## 4. Advantage Estimation

### 4.1 $n$-step

In [None]:
def n_step(rewards=None, returns=None, state_values=None, gamma=0.99, tau=0.25):
    """Estimate advantages as the n-step return minus the critic's value.

    The signature matches the way the agent invokes its advantage function:
    ``calc_advantage(rewards, returns, state_values, gamma, tau)``.

    Args:
        rewards: unused here; accepted so every estimator shares one signature.
        returns: bootstrapped returns, one entry per step.
        state_values: critic estimates for the visited states plus one extra
            entry for the state that follows the last step (the extra entry
            is dropped here).
        gamma: unused here (the discount is already folded into ``returns``).
        tau: unused here.

    Returns:
        ``returns - state_values[:-1]``, or ``None`` when called without
        ``returns``/``state_values`` (the behaviour of the former stub).
    """
    if returns is None or state_values is None:
        return None
    return returns - state_values[:-1]

### 4.2 GAE

In [None]:
def gae(rewards=None, returns=None, state_values=None, gamma=0.99, tau=0.25):
    """Estimate advantages with generalized advantage estimation (GAE).

    The signature matches the way the agent invokes its advantage function:
    ``calc_advantage(rewards, returns, state_values, gamma, tau)``.

    For each step the TD error is
    ``delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)`` and the advantage is the
    sum of the following TD errors discounted by ``gamma * tau`` (``tau``
    plays the role of lambda in the GAE formulation).

    Args:
        rewards: rewards, one entry per step.
        returns: unused here; accepted so every estimator shares one signature.
        state_values: critic estimates with one more entry than ``rewards``
            (the last entry is the value of the state after the final step).
        gamma: discount factor.
        tau: trade-off between bias (0) and variance (1).

    Returns:
        A tensor with one advantage per step, or ``None`` when called without
        ``rewards``/``state_values`` (the behaviour of the former stub).
    """
    if rewards is None or state_values is None:
        return None

    deltas = rewards + gamma * state_values[1:] - state_values[:-1]
    if deltas.shape[0] == 0:
        return deltas

    # accumulate backwards: A_t = delta_t + gamma * tau * A_{t+1}
    advantages = []
    running = torch.zeros_like(deltas[0])
    for t in reversed(range(deltas.shape[0])):
        running = deltas[t] + gamma * tau * running
        advantages.append(running)
    advantages.reverse()
    return torch.stack(advantages, dim=0)

## 5. Agent

In [None]:
class A2CAgent():
    """Advantage actor-critic agent that learns from windows of n consecutive steps.

    Usage per environment step: call ``act(s)`` to get an action, then
    ``store(s, a, r, ns, d)`` with the resulting transition. Every ``n``
    stored transitions the actor and the critic are updated once.

    Args:
        state_size: number of features in a state.
        action_size: number of action dimensions.
        calc_advantage: callable invoked as
            ``calc_advantage(rewards, returns, state_values, gamma, tau)``
            that returns one advantage per step.
        gamma: discount factor.
        alpha: learning rate of both optimizers.
        beta: weight of the entropy bonus in the actor objective.
        tau: extra parameter forwarded to ``calc_advantage``.
        n: number of consecutive steps per update.
        learning: stored on the instance.
            NOTE(review): this flag is not read anywhere in the class.
    """

    # -- initialization -- #
    def __init__(self, state_size, action_size, calc_advantage,
                 gamma=0.99, alpha=0.001, beta=0.01, tau=0.25,
                 n=4, learning=True):
        self.state_size, self.action_size = state_size, action_size
        self.calc_advantage = calc_advantage
        self.learning = learning
        # beta must be kept as well: store() and learn() both read it
        self.gamma, self.alpha, self.beta, self.tau, self.n = gamma, alpha, beta, tau, n
        self.reset()

    def reset_temporary_buffer(self):
        """Clear the buffers that hold the current window of n steps."""
        # used to store n consecutive steps
        self.tmp_s, self.tmp_a, self.tmp_r, self.tmp_ns, self.tmp_d = \
            ([None] * self.n, [None] * self.n, [None] * self.n, [None] * self.n, [None] * self.n)

        # note that the critic will evaluate n+1 states
        self.tmp_actor_out = [None] * self.n
        self.tmp_critic_out = [None] * (self.n + 1)

    def reset(self):
        """Create fresh networks and optimizers and clear the step buffers."""
        # use the sizes given to this agent rather than module-level globals
        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic = Critic(self.state_size).to(device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.alpha)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.alpha)

        self.reset_temporary_buffer()
        self.i = 0
    # -- initialization -- #

    @staticmethod
    def _to_tensor(value):
        """Convert an array-like or tensor to a float32 tensor on ``device``."""
        return torch.as_tensor(value, dtype=torch.float32, device=device)

    def act(self, s):
        """Sample an action for state ``s`` and remember its distribution.

        The distribution is kept (with its autograd graph) in the slot of the
        current step so that ``learn`` can compute log-probabilities and
        entropies from it.
        """
        distr_params = self.actor(self._to_tensor(s))
        distrib = self.actor.distribution(distr_params)

        self.tmp_actor_out[self.i] = distrib

        return self.actor.act(distrib)

    def store(self, s, a, r, ns, d):
        """Record one transition and trigger an update once n are collected."""
        self.tmp_s[self.i], self.tmp_a[self.i], self.tmp_r[self.i], self.tmp_ns[self.i], self.tmp_d[self.i] = (s, a, r, ns, d)

        if self.i == self.n - 1:
            self.learn()
            # NOTE(review): this squares beta after every update (0.01 ->
            # 1e-4 -> 1e-8 ...), so the entropy bonus vanishes almost at
            # once; presumably a decay factor was intended -- confirm.
            self.beta *= self.beta

        self.i = (self.i + 1) % self.n

    def calc_returns(self, rewards, dones=None):
        """Compute discounted returns from n rewards plus a bootstrap value.

        Args:
            rewards: 1-D tensor with the n rewards followed by one final
                entry, the estimated return after the last step.
            dones: optional 1-D tensor with n entries (1.0 where the step
                ended an episode); the running return is zeroed there so it
                does not leak across episode boundaries.

        Returns:
            1-D tensor with n entries, ``G_t = r_t + gamma * G_{t+1}``.
        """
        steps = rewards.shape[0] - 1
        running = rewards[-1]
        collected = []
        for t in reversed(range(steps)):
            if dones is not None:
                running = running * (1.0 - dones[t])
            running = rewards[t] + self.gamma * running
            collected.append(running)
        if not collected:
            return rewards[:0]
        collected.reverse()
        return torch.stack(collected, dim=0)

    def learn(self):
        """Update the actor and the critic from the buffered window of steps."""
        # each state is flattened so that the batch is (n, features)
        states = torch.stack([self._to_tensor(s).reshape(-1) for s in self.tmp_s])
        last_next_state = self._to_tensor(self.tmp_ns[-1]).reshape(1, -1)
        # rewards and done flags are expected to hold one value per step
        rewards = torch.stack([self._to_tensor(r).reshape(()) for r in self.tmp_r])
        dones = torch.stack([self._to_tensor(d).reshape(()) for d in self.tmp_d])

        # use the critic to evaluate all n+1 states
        state_values = self.critic(torch.cat((states, last_next_state), dim=0)).squeeze(-1)

        # calculate some slightly more accurate returns using n rewards and a
        # prediction of G_n+1; the prediction is detached so the returns act
        # as fixed targets
        bootstrap = state_values[-1:].detach()
        returns = self.calc_returns(torch.cat((rewards, bootstrap), dim=0), dones)

        # calculate the advantages at each state; values are detached so the
        # actor update cannot push gradients into the critic
        advantages = self.calc_advantage(rewards, returns, state_values.detach(),
                                         self.gamma, self.tau)
        advantages = advantages.detach()

        # log-probability and entropy of each step, summed over action dimensions
        log_probs = torch.stack([distr.log_prob(self._to_tensor(a)).sum()
                                 for distr, a in zip(self.tmp_actor_out, self.tmp_a)])
        entropies = torch.stack([distr.entropy().sum() for distr in self.tmp_actor_out])

        # actor update: maximize log_prob * advantage + beta * entropy
        actor_loss = -(log_probs * advantages + self.beta * entropies).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # critic update: regress V(s_t) towards the bootstrapped returns
        critic_loss = (returns - state_values[:-1]).pow(2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.reset_temporary_buffer()
        

## 6. Training an agent

In [None]:
# agents to compare, as (label, agent) pairs: two n-step variants and one using GAE
agents = [
    (label, A2CAgent(state_size, action_size, advantage_fn, n=steps))
    for label, advantage_fn, steps in (
        ('A2C, n=4', n_step, 4),
        ('A2C, n=8', n_step, 8),
        ('A2C+GAE, n=8', gae, 8),
    )
]

In [None]:
def execute_episode(agent, env):
    """Placeholder for running one episode; not implemented yet.

    Currently ignores both arguments and returns None.
    """
    return None

# run one episode with random actions to check that the environment works
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)
num_agents = len(states)                               # observations hold one row per agent
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
def train(agent, env, episodes=1000, repeat=3, consecutive_episodes=100, show_output=True, save_as=None):
    """Placeholder for the training loop; not implemented yet.

    Currently ignores every argument and returns None.
    """
    return None

## 7. Comparing the results

## 8. Possible improvements

## 9. Conclusions