# Continuous Control

---
### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from tensorboardX import SummaryWriter
from unityagents import UnityEnvironment
from udacity.ddpg_agent5 import Agent


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
def get_env():
    """Launch the Crawler Unity environment (no graphics) for the current OS.

    Returns:
        The ``UnityEnvironment`` created from the Linux or macOS Crawler build.

    Raises:
        RuntimeError: if the current platform is neither Linux nor macOS, since
            no Crawler build path is configured for it.
    """
    import sys

    # ``startswith`` covers both "linux" and the legacy "linux2" value.
    if sys.platform.startswith("linux"):
        file_name = "./Crawler_Linux/Crawler.x86_64"
    elif sys.platform == "darwin":
        file_name = "Crawler.app"
    else:
        # Previously this path fell through to ``return env`` with ``env``
        # unbound; fail with an explicit, descriptive error instead.
        raise RuntimeError(
            "Unsupported platform {!r}: only Linux and macOS Crawler builds "
            "are configured".format(sys.platform)
        )
    return UnityEnvironment(file_name=file_name, no_graphics=True)

def welcome():
    """Start the environment, report its dimensions, and hand them back.

    Returns:
        tuple: ``(env, state_size, action_size, num_agents)`` where the sizes
        are read from the first brain and from the observations returned by
        the initial reset in training mode.
    """
    env = get_env()

    # The first listed brain is the one controlled from Python.
    default_brain_name = env.brain_names[0]
    default_brain = env.brains[default_brain_name]

    # Reset once so the initial observations are available for inspection.
    initial_info = env.reset(train_mode=True)[default_brain_name]

    num_agents = len(initial_info.agents)
    action_size = default_brain.vector_action_space_size

    # One observation row per agent; the row length is the state size.
    observations = initial_info.vector_observations
    state_size = observations.shape[1]

    print('Number of agents:', num_agents)
    print('Size of each action:', action_size)
    summary = 'There are {} agents. Each observes a state with length: {}'
    print(summary.format(observations.shape[0], state_size))

    return env, state_size, action_size, num_agents

### 2. Examine the State and Action Spaces

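The code cell below defines the frame-buffer helpers and the DDPG training loop. Information about the environment's state and action spaces is printed later, when `welcome()` is called.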

In [3]:
import sys
import time

def process_buffered_input(s):
    """Join the buffered frames in ``s`` into a single array with ``np.hstack``."""
    stacked = np.hstack(s)
    return stacked

def init_buff():
    """Build the frame buffer, pre-filled with all-zero observations.

    The buffer holds ``SKIP_FRAME + 1`` arrays of shape
    ``(num_agents, state_size)``; all three names are module-level globals.
    """
    capacity = SKIP_FRAME + 1
    blank_frames = (np.zeros((num_agents, state_size)) for _ in range(capacity))
    return deque(blank_frames, maxlen=capacity)

def _stack_frames(buffer, start):
    """Concatenate ``SKIP_FRAME`` consecutive frames of ``buffer`` from index ``start``."""
    return process_buffered_input([buffer[i] for i in range(start, start + SKIP_FRAME)])


def ddpg(env, agent, n_episodes=2000, max_t=int(10000), prefix=''):
    """Train ``agent`` on ``env`` using stacked frames and return per-episode scores.

    Relies on names defined elsewhere in this notebook: ``SKIP_FRAME``,
    ``action_size``, ``writer``, ``init_buff`` and ``process_buffered_input``.

    Args:
        env: Unity environment; the first entry of ``env.brain_names`` is used.
        agent: object providing ``reset()``, ``act(state)``,
            ``step(state, action, reward, next_state, done)``, ``total_steps``,
            ``actor_local`` and ``critic_local``.
        n_episodes: number of training episodes to run.
        max_t: maximum number of steps per episode; on the final step every
            agent is flagged as done.
        prefix: string prepended to the checkpoint file names under ``./models/``.

    Returns:
        list: one score array of shape ``(n_agents, 1)`` per episode, holding
        the unscaled summed rewards.
    """
    import os  # local import: only used to create the checkpoint directory

    scores_deque = deque(maxlen=100)
    scores = []
    episode_horizons = deque(maxlen=100)
    brain_name = env.brain_names[0]
    solved = False
    # Rewards are multiplied by this factor before being passed to the agent;
    # the reported scores use the raw rewards.
    reward_coeff = 0.01
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        n_agents = state.shape[0]
        agent.reset()
        score = np.zeros((n_agents, 1), dtype=np.float32)

        state_buffer = init_buff()
        last_action = None
        for t in range(max_t):
            state_buffer.append(state)
            if t < SKIP_FRAME:
                # Random actions until the buffer holds real observations.
                # Sized from this episode's observations rather than a global.
                action = np.random.uniform(-1, 1, size=(n_agents, action_size))
            elif t % SKIP_FRAME == 0:
                # Query the agent every SKIP_FRAME steps; repeat the action otherwise.
                action = agent.act(_stack_frames(state_buffer, 0))
                last_action = action
            else:
                action = last_action

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            reward = np.array(env_info.rewards)[..., None]
            done = np.array(env_info.local_done)[..., None]
            if t+1 == max_t:
                # Builtin ``bool`` instead of the ``np.bool`` alias, which was
                # deprecated in NumPy 1.20 and removed in 1.24.
                done = np.ones_like(done, dtype=bool)

            # NOTE(review): the newest buffer entry is the current ``state``
            # (``next_state`` is appended on the following iteration), so both
            # stacks lag the environment by one frame — confirm this is intended.
            agent.step(_stack_frames(state_buffer, 0),
                       action, reward_coeff*reward,
                       _stack_frames(state_buffer, 1),
                       done)

            state = next_state
            score += reward

            if np.all(done):
                episode_horizons.append(t)
                break

        scores_deque.append(score)
        scores.append(score)
        # Logs the mean over every score recorded so far, not just this episode.
        writer.add_scalar('score_G', np.mean(scores), i_episode)

        _mu_score_moving = np.mean(np.mean(scores_deque, axis = 1))
        print('\rEpisode {}\t100-episode avg score: {:.2f}\tScore: {:.2f}\tTime Step: {}'.format(i_episode, _mu_score_moving, float(np.mean(score)), agent.total_steps), end="")

        if i_episode % 50 == 0:
            print('\rEpisode {}\t100-episode avg score: {:.2f}\tAvg. Horizon: {:.2f}'.format(i_episode, _mu_score_moving, np.mean(episode_horizons)))

        if (np.mean(scores_deque) >= 30.) and (i_episode > 99) and (not solved):
            print('The environment was solved in {} episodes'.format(i_episode))
            solved = True

    actor_path = './models/{}checkpoint_actor_multi.pth'.format(prefix)
    critic_path = './models/{}checkpoint_critic_multi.pth'.format(prefix)
    # Create the target directory first so a missing folder cannot discard
    # the result of a full training run.
    os.makedirs(os.path.dirname(actor_path), exist_ok=True)
    torch.save(agent.actor_local.state_dict(), actor_path)
    torch.save(agent.critic_local.state_dict(), critic_path)
    return scores


### 2. Instantiate the Environment

In [4]:
# Number of buffered frames stacked into one agent input; also the interval
# (in steps) at which ddpg() asks the agent for a new action.
SKIP_FRAME = 3
# Start the environment and record its dimensions as module-level globals,
# which init_buff() and ddpg() read.
env, state_size, action_size, num_agents = welcome()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


Number of agents: 12
Size of each action: 20
There are 12 agents. Each observes a state with length: 129


### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [5]:
# Maximum number of steps per episode passed to ddpg(); also used in the log directory name.
max_t = 1000
# Number of training episodes passed to ddpg().
n_episodes = 1000

In [6]:
# TensorBoard writer; ddpg() reads this module-level name to log 'score_G'.
writer = SummaryWriter(log_dir='./logs/crawler_reflective_critic_multi/horizon_{}'.format(max_t))
# The agent's input is SKIP_FRAME stacked observations, hence state_size*SKIP_FRAME.
agent = Agent(state_size=state_size*SKIP_FRAME,
              action_size=action_size,
              random_seed = 0,
              writer=writer,
              explore_assumptions=True)

# Run training; checkpoints are saved under ./models/ with the given prefix.
scores = ddpg(env, agent, n_episodes=n_episodes, max_t = max_t, prefix='crawler_reflective_critic_')

Episode 50	100-episode avg score: 66.71	Avg. Horizon: 999.00tep: 500000
Episode 100	100-episode avg score: 192.06	Avg. Horizon: 999.00tep: 100000
The environment was solved in 100 episodes
Episode 150	100-episode avg score: 237.92	Avg. Horizon: 999.00tep: 150000
Episode 200	100-episode avg score: 125.90	Avg. Horizon: 999.00tep: 200000
Episode 250	100-episode avg score: 200.19	Avg. Horizon: 999.00tep: 250000
Episode 300	100-episode avg score: 340.93	Avg. Horizon: 999.00tep: 300000
Episode 349	100-episode avg score: nan	Score: nan	Time Step: 349000348000

TypeError: 'NoneType' object is not callable

When finished, you can close the environment.

In [None]:
# Close the Unity environment once training is finished.
env.close()

In [None]:
# Plot the per-episode score, averaging each episode's score array to one value.
fig = plt.figure()
ax = fig.add_subplot(111)
# x-axis is the 1-based episode number.
plt.plot(np.arange(1, len(scores)+1), [np.mean(s) for s in scores])
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()