# Continuous Control

The environment for this project is [Reacher](https://github.com/udacity/deep-reinforcement-learning/tree/master/p2_continuous_control) from Unity, and it's provided in the `setup` folder. We'll implement the A2C algorithm as the synchronous version of [A3C](https://arxiv.org/abs/1602.01783) (but the input won't be directly from pixels). Results will be shown in this notebook and the best solution will be implemented in `main.py`.

![final](imgs/gif.gif "final")

> The model used to generate this gif is `final.pth` (Dueling Double DQN), which was trained for 700 episodes using `main.py`.

## 1. Prepare dependencies and environment

Take a look at README.md before executing this notebook and make sure that the kernel is set to **p2_continuous_control**.

In [1]:
!pip -q install ./setup

import sys
import platform

import numpy as np
from numpy_ringbuffer import RingBuffer
from scipy import signal
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn

from setup import unityagents
from unityagents import UnityEnvironment

# Select the first CUDA GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Unity environments contain **brains**, our interfaces for controlling agents. We'll be controlling the first (default) brain in the environment. It's also useful to keep information such as `state_size`, `action_size` and `num_agents`.

In [2]:
# Reacher build shipped in the `setup` folder for each supported value of
# platform.system().
env_files = {
    'Linux': "setup/Reacher_Linux/Reacher.x86_64",
    'Darwin': "setup/Reacher.app",
    'Windows': "setup/Reacher_Windows_x86_64/Reacher.exe",
}

system = platform.system()
if system not in env_files:
    # Fail here with a clear message instead of continuing with env = None and
    # crashing on the first attribute access below.
    raise RuntimeError('Cannot find environment for this system.')
env = UnityEnvironment(file_name=env_files[system])

# use the default (first) brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment in training mode and record the problem dimensions
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
# second dimension of the observation array is the per-agent observation length
state_size = states.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## 2. Actor



In [4]:
class Actor(nn.Module):
    """Fully connected network mapping a state to ``action_size`` outputs.

    The layers are ``state_size -> hidden_layers[0] -> ... ->
    hidden_layers[-1] -> action_size``.  A ReLU follows every hidden layer;
    the output layer has no activation.

    Args:
        state_size: Number of input features.
        action_size: Number of output units.
        hidden_layers: Widths of the hidden layers, in order.  An empty
            sequence yields a single linear layer from input to output.
    """

    def __init__(self, state_size, action_size, hidden_layers=(64, 128, 64)):
        # Zero-argument super(): naming another class here (the code used to
        # say ``DQN``, which is not defined in the visible notebook) raises
        # NameError as soon as the network is constructed.
        super().__init__()

        # Widths from the input up to the last hidden layer; each pair of
        # neighbours is the (in, out) size of one hidden linear layer.
        sizes = [state_size, *hidden_layers]
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]
        )

        # prepare the output layer
        self.output_layer = nn.Linear(sizes[-1], action_size)

    def forward(self, state):
        """Run ``state`` through the hidden layers (with ReLU) and the output layer."""
        for layer in self.hidden_layers:
            state = F.relu(layer(state))
        return self.output_layer(state)

    def act(self, state):
        """Placeholder: not implemented yet, returns ``None``."""
        # TODO: distrib = forward(state)
        # TODO: sample(distrib)
        return None

    def log_likelihood(self, state, action):
        """Placeholder: not implemented yet, returns ``None``."""
        # TODO: distrib = forward(state)
        # TODO: l = likelihood(distrib, action)
        # TODO: return log(l)
        return None

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


## 3. Critic


In [5]:
class Critic(nn.Module):
    """Fully connected network mapping a state to ``action_size`` outputs.

    The layers are ``state_size -> hidden_layers[0] -> ... ->
    hidden_layers[-1] -> action_size``.  A ReLU follows every hidden layer;
    the output layer has no activation.

    Args:
        state_size: Number of input features.
        action_size: Number of output units.
        hidden_layers: Widths of the hidden layers, in order.  An empty
            sequence yields a single linear layer from input to output.
    """

    def __init__(self, state_size, action_size, hidden_layers=(64, 128, 64)):
        # Zero-argument super(): naming another class here (the code used to
        # say ``DQN``, which is not defined in the visible notebook) raises
        # NameError as soon as the network is constructed.
        super().__init__()

        # Widths from the input up to the last hidden layer; each pair of
        # neighbours is the (in, out) size of one hidden linear layer.
        sizes = [state_size, *hidden_layers]
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]
        )

        # prepare the output layer
        # NOTE(review): the output width is action_size; a state-value critic
        # would presumably emit a single value -- confirm the intended size.
        self.output_layer = nn.Linear(sizes[-1], action_size)

    def forward(self, state):
        """Run ``state`` through the hidden layers (with ReLU) and the output layer."""
        for layer in self.hidden_layers:
            state = F.relu(layer(state))
        return self.output_layer(state)

Total score (averaged over agents) this episode: 0.11999999731779099


## 4. Advantage Estimation

### 4.1 $n$-step

In [None]:
def n_step():
    """Placeholder for the n-step advantage estimate; not implemented, returns ``None``."""
    return None

### 4.2 GAE

In [None]:
def gae():
    """Placeholder for the GAE advantage estimate; not implemented, returns ``None``."""
    return None

## 5. Agent

In [None]:
class A2CAgent():
    """Agent scaffold that owns two networks, an optimizer and a replay buffer.

    The constructor stores its arguments as attributes and calls ``reset()``.
    ``act`` and ``learn`` are placeholders that currently return ``None``.
    """

    # -- initialization -- #
    def __init__(self, QNetwork, state_size, action_size, 
                 replay_buffer, Delta, 
                 eps=1, eps_decay=0.9995, min_eps=0.0001,
                 gamma=0.99, alpha=0.001, tau=0.01,
                 update_every=15, batch_size=64, learning=True):
        """Store the configuration and build fresh networks via ``reset()``.

        ``QNetwork`` is called as ``QNetwork(state_size, action_size)`` and
        ``replay_buffer`` must provide a ``reset()`` method; ``alpha`` is
        used as the Adam learning rate.  The remaining arguments are only
        stored on the instance by the code in this class.
        """
        # problem dimensions
        self.state_size = state_size
        self.action_size = action_size

        # collaborators
        self.QNetwork = QNetwork
        self.replay_buffer = replay_buffer
        self.Delta = Delta
        self.learning = learning

        # eps schedule; the initial value is kept so reset() can restore it
        self.original_eps = eps
        self.eps = eps
        self.eps_decay = eps_decay
        self.min_eps = min_eps

        # remaining hyper-parameters
        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.update_every = update_every
        self.batch_size = batch_size

        self.reset()

    def reset(self):
        """Clear the buffer, restore ``eps`` and rebuild networks and optimizer."""
        self.replay_buffer.reset()
        self.eps = self.original_eps

        # two independently initialised instances of the same network class
        dims = (self.state_size, self.action_size)
        self.q_local = self.QNetwork(*dims).to(device)
        self.q_target = self.QNetwork(*dims).to(device)

        # only q_local's parameters are handed to the optimizer
        self.optimizer = optim.Adam(self.q_local.parameters(), lr=self.alpha)
        self.update_i = 0
    # -- initialization -- #

    def act(self, s):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def learn(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

## 6. Training an agent

In [None]:
# (label, agent) pairs to be trained and compared.
# NOTE(review): QNetworkAgent, DQN, DuelingDQN, UniformReplayBuffer, dt_dqn and
# dt_double_dqn are not defined in the visible part of this notebook --
# presumably leftovers from an earlier DQN project; confirm and replace with
# the A2C classes before running this cell.
agents = [('A2C', QNetworkAgent(DQN, state_size, action_size, UniformReplayBuffer(100_000), dt_dqn)),
          ('A2C+GAE', QNetworkAgent(DuelingDQN, state_size, action_size, UniformReplayBuffer(100_000), dt_double_dqn))]

In [None]:
def execute_episode(agent, env):
    """Placeholder for running one episode; not implemented, returns ``None``."""
    return None

# Play one episode with random actions and report the mean score over agents.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)  # running score, one entry per agent
while True:
    # draw standard-normal actions for every agent and clamp them to [-1, 1]
    actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
    # send all actions to the environment and read back the step result
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    # add this step's rewards to the per-agent scores
    scores += rewards
    # roll over states to the next time step
    states = next_states
    # stop as soon as any agent reports that its episode finished
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
def train(agent, env, episodes=1000, repeat=3, consecutive_episodes=100, show_output=True, save_as=None):
    """Placeholder for the training loop; not implemented, returns ``None``."""
    return None

## 7. Comparing the results

## 8. Possible improvements

## 9. Conclusions