# Deep Deterministic Policy Gradient (DDPG)
---
This notebook implements a DDPG agent with Unity's Udacity-modified Reacher environment.

### 1. Import the Necessary Packages

In [1]:
#import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import time

### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below. 

Change the __*visible_environment*__ variable to enable training with (True) or without (False) a visible environment.

The output below indicates the utilization of a cpu or a gpu.

In [2]:
from unityagents import UnityEnvironment

# set these parameters
visible_environment = False 
one_agent = False # if false, then 20 agent environment

if visible_environment == True and one_agent == True:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/Vis_one_agent/Reacher.x86_64')
elif visible_environment == False and one_agent == True:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/NoVis_one_agent/Reacher.x86_64')
elif visible_environment == True and one_agent == False:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/Vis_20_agents/Reacher.x86_64')
elif visible_environment == False and one_agent == False:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/NoVis_20_agents/Reacher.x86_64')    
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# number of actions and states
action_size = brain.vector_action_space_size

states = env_info.vector_observations
state_size = states.shape[1]

from ddpg_agent import Agent

agent = Agent(state_size=state_size, action_size=action_size, random_seed=0)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


device =  cuda:0


### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.

Alternatively, **skip** to the next step below (**4. Watch a Smart Agent!**), to load the saved model weights from a pre-trained agent.

In [None]:
def plot_scores(scores, fig_indicator="normal"):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.ylim((-4, 24))
    plt.show()
    
    fig_name = "Results/Figure_" + fig_indicator + ".png"
    fig.savefig(fig_name)
    
    np.save("Results/scores_" + fig_indicator + ".npy", scores)
    

def ddpg(n_episodes=2000, max_t=700):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
        states = env_info.vector_observations                  # get the current state (for each agent)
        agent.reset()
        score = np.zeros(num_agents)                          # initialize the score (for each agent)
        for t in range(max_t):
            
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]        # send the action to the environment

            next_states = env_info.vector_observations   # get the next state
            rewards = env_info.rewards                   # get the reward
            dones = env_info.local_done                  # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards
            
            if np.any(dones):
                break 
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
        if i_episode % 100 == 0 or np.mean(scores_deque) >= 30.0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
        if np.mean(scores_deque)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break            
    return scores    

t = time.time()
scores = ddpg()
elapsed = time.time() - t # https://stackoverflow.com/questions/5849800/what-is-the-python-equivalent-of-matlabs-tic-and-toc-functions
print('\t Time to train network: {:.2f}'.format(elapsed), 'seconds')

plot_scores(scores, fig_indicator="ddpg_normal_soln_final")

Episode 1	Average Score: 0.27
Episode 2	Average Score: 0.43
Episode 3	Average Score: 0.53
Episode 4	Average Score: 0.57
Episode 5	Average Score: 0.63
Episode 6	Average Score: 0.64
Episode 7	Average Score: 0.64
Episode 8	Average Score: 0.64
Episode 9	Average Score: 0.62
Episode 10	Average Score: 0.62
Episode 11	Average Score: 0.61
Episode 12	Average Score: 0.63
Episode 13	Average Score: 0.64
Episode 14	Average Score: 0.66
Episode 15	Average Score: 0.63
Episode 16	Average Score: 0.60
Episode 17	Average Score: 0.58
Episode 18	Average Score: 0.56
Episode 19	Average Score: 0.54
Episode 20	Average Score: 0.53
Episode 21	Average Score: 0.52
Episode 22	Average Score: 0.53
Episode 23	Average Score: 0.52
Episode 24	Average Score: 0.54
Episode 25	Average Score: 0.56
Episode 26	Average Score: 0.57
Episode 27	Average Score: 0.59
Episode 28	Average Score: 0.59
Episode 29	Average Score: 0.61
Episode 30	Average Score: 0.62
Episode 31	Average Score: 0.63
Episode 32	Average Score: 0.64
Episode 33	Averag

Episode 259	Average Score: 1.69
Episode 260	Average Score: 1.69
Episode 261	Average Score: 1.68
Episode 262	Average Score: 1.68
Episode 263	Average Score: 1.68
Episode 264	Average Score: 1.68
Episode 265	Average Score: 1.67
Episode 266	Average Score: 1.66
Episode 267	Average Score: 1.66
Episode 268	Average Score: 1.65
Episode 269	Average Score: 1.65
Episode 270	Average Score: 1.65
Episode 271	Average Score: 1.65
Episode 272	Average Score: 1.64
Episode 273	Average Score: 1.64
Episode 274	Average Score: 1.64
Episode 275	Average Score: 1.63
Episode 276	Average Score: 1.63
Episode 277	Average Score: 1.62
Episode 278	Average Score: 1.60
Episode 279	Average Score: 1.59
Episode 280	Average Score: 1.59
Episode 281	Average Score: 1.58
Episode 282	Average Score: 1.57
Episode 283	Average Score: 1.57
Episode 284	Average Score: 1.56
Episode 285	Average Score: 1.55
Episode 286	Average Score: 1.54
Episode 287	Average Score: 1.52
Episode 288	Average Score: 1.52
Episode 289	Average Score: 1.51
Episode 

Episode 513	Average Score: 0.76
Episode 514	Average Score: 0.76
Episode 515	Average Score: 0.76
Episode 516	Average Score: 0.76
Episode 517	Average Score: 0.75
Episode 518	Average Score: 0.75
Episode 519	Average Score: 0.74
Episode 520	Average Score: 0.74
Episode 521	Average Score: 0.73
Episode 522	Average Score: 0.73
Episode 523	Average Score: 0.73
Episode 524	Average Score: 0.72
Episode 525	Average Score: 0.72
Episode 526	Average Score: 0.71
Episode 527	Average Score: 0.72
Episode 528	Average Score: 0.72
Episode 529	Average Score: 0.71
Episode 530	Average Score: 0.71
Episode 531	Average Score: 0.71
Episode 532	Average Score: 0.71
Episode 533	Average Score: 0.71
Episode 534	Average Score: 0.71
Episode 535	Average Score: 0.71
Episode 536	Average Score: 0.71
Episode 537	Average Score: 0.71
Episode 538	Average Score: 0.72
Episode 539	Average Score: 0.72
Episode 540	Average Score: 0.72
Episode 541	Average Score: 0.72
Episode 542	Average Score: 0.72
Episode 543	Average Score: 0.72
Episode 

Episode 768	Average Score: 0.63
Episode 769	Average Score: 0.63
Episode 770	Average Score: 0.63
Episode 771	Average Score: 0.63
Episode 772	Average Score: 0.63
Episode 773	Average Score: 0.63
Episode 774	Average Score: 0.63
Episode 775	Average Score: 0.62
Episode 776	Average Score: 0.63
Episode 777	Average Score: 0.63
Episode 778	Average Score: 0.62
Episode 779	Average Score: 0.62
Episode 780	Average Score: 0.62
Episode 781	Average Score: 0.62
Episode 782	Average Score: 0.62
Episode 783	Average Score: 0.62
Episode 784	Average Score: 0.62
Episode 785	Average Score: 0.62
Episode 786	Average Score: 0.63
Episode 787	Average Score: 0.63
Episode 788	Average Score: 0.63
Episode 789	Average Score: 0.63
Episode 790	Average Score: 0.64
Episode 791	Average Score: 0.64
Episode 792	Average Score: 0.64
Episode 793	Average Score: 0.64
Episode 794	Average Score: 0.65
Episode 795	Average Score: 0.65
Episode 796	Average Score: 0.65
Episode 797	Average Score: 0.66
Episode 798	Average Score: 0.66
Episode 

### 4. Watch a Smart Agent!

The code below loads the trained weights from file to watch a smart agent. 

To visualize the trained environment,
 - change the __*visible_environment*__ variable to **True** in (**2. Instantiate the Environment and Agent**)
 - restart the kernel, and 
 - **skip** the previous section (**3. Train the Agent with DQN**).

In [None]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# load the weights from file
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

for i in range(1):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations                  # get the current state (for each agent)
    for j in range(200):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]        # send the action to the environment
        states = env_info.vector_observations
        dones = env_info.local_done                  # see if episode has finished
        
        if np.any(dones):
            break     

In [None]:
env.close()