# Deep Deterministic Policy Gradient (DDPG)
---
This notebook implements a DDPG agent with Unity's Udacity-modified Reacher environment.

### 1. Import the Necessary Packages

In [1]:
#import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import time   

### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below. 

Change the __*visible_environment*__ variable to enable training with (True) or without (False) a visible environment.

The output below indicates the utilization of a cpu or a gpu.

In [2]:
from unityagents import UnityEnvironment

# set these parameters
visible_environment = False 
one_agent = False # if false, then 20 agent environment

if visible_environment == True and one_agent == True:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/Vis_one_agent/Reacher.x86_64')
elif visible_environment == False and one_agent == True:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/NoVis_one_agent/Reacher.x86_64')
elif visible_environment == True and one_agent == False:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/Vis_20_agents/Reacher.x86_64')
elif visible_environment == False and one_agent == False:
    env = UnityEnvironment(file_name='../unity_environments/Reacher/NoVis_20_agents/Reacher.x86_64')    
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# number of actions and states
action_size = brain.vector_action_space_size

states = env_info.vector_observations
state_size = states.shape[1]


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### 2a. Hyperparameter Search


In [3]:
random.seed(32)

BUFFER_SIZE = [10 ** random.uniform(a=6, b=6) for x in range(0, 100)]       # replay buffer size
BATCH_SIZE = [2** random.randint(a=7, b=7) for x in range(0, 100)]         # minibatch size
GAMMA = [random.uniform(a=0.99, b=0.99) for x in range(0, 100)]                 # discount factor
TAU = [10 ** random.uniform(a=-2, b=-2) for x in range(0, 100)]               # for soft update of target parameters
LR_ACTOR = [10 ** random.uniform(a=-3.1, b=-2.9) for x in range(0, 100)]             # learning rate of the actor 
LR_CRITIC = [3*(10 ** random.uniform(a=-4.1, b=-3.9)) for x in range(0, 100)]            # learning rate of the critic
WEIGHT_DECAY = np.zeros(100) # 0.0001   # L2 weight decay


from ddpg_agent import Agent

device =  cuda:0


### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.

Alternatively, **skip** to the next step below (**4. Watch a Smart Agent!**), to load the saved model weights from a pre-trained agent.

In [4]:
def plot_scores(scores, fig_indicator="normal"):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.ylim((-4, 24))
    plt.show()
    
    fig_name = "Results/Figure_" + fig_indicator + ".png"
    fig.savefig(fig_name)
    
    np.save("Results/scores_" + fig_indicator + ".npy", scores)
    

def ddpg(n_episodes=2000, max_t=700):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    
    tt = time.time()
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
        states = env_info.vector_observations                  # get the current state (for each agent)
        agent.reset()
        score = np.zeros(num_agents)                          # initialize the score (for each agent)
        for t in range(max_t):
            
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]        # send the action to the environment

            next_states = env_info.vector_observations   # get the next state
            rewards = env_info.rewards                   # get the reward
            dones = env_info.local_done                  # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards
            
            if np.any(dones):
                break 
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        
        """
        if i_episode == 300:
            for param_group in agent.actor_optimizer.param_groups:
                param_group['lr'] = 1e-4 # 1e-3

            for param_group in agent.critic_optimizer.param_groups:
                param_group['lr'] = 1e-4 #3e-4    
        
            agent.TAU = 1e-2
        """
        
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
        if i_episode % 100 == 0 or np.mean(scores_deque) >= 30.0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            elapsed = time.time() - tt # https://stackoverflow.com/questions/5849800/what-is-the-python-equivalent-of-matlabs-tic-and-toc-functions
            print('\t Elapsed Time: {:.2f}'.format(elapsed), 'seconds')
        if np.mean(scores_deque)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break 
        if (i_episode == 10 and np.mean(scores_deque) < 0.3):
            break
            
    return scores    

for ii in range(1, 2):
    print('ii = ', ii)

    
    BUFFER_SIZE = int(1e6)
    GAMMA = 0.99
    TAU = 1e-3
    WEIGHT_DECAY = 0
    for BATCH_SIZE in [256]: #[32, 64, 128, 256, 512]:
        for LR_ACTOR in [3e-4]:#[1e-3, 3e-4, 1e-4]:
            for LR_CRITIC in [1e-4]:#[1e-3, 3e-4, 1e-4]:
                for WEIGHT_DECAY in [0, 0.0001, 0.0002]:
                    print(['buffer_size = ', int(BUFFER_SIZE), 'batch_size = ', int(BATCH_SIZE), 'gamma=', GAMMA, 
                       'tau=', TAU, 'lr_actor= ', LR_ACTOR, 'lr_critic=',LR_CRITIC, 'weight_decay=', WEIGHT_DECAY])
                    t = time.time()
                    agent = Agent(state_size=state_size, action_size=action_size, random_seed=0,
                                  buffer_size=int(BUFFER_SIZE), batch_size=int(BATCH_SIZE), gamma=GAMMA, tau=TAU,
                                   lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY)
                    scores = ddpg(n_episodes=1500)
                    elapsed = time.time() - t # https://stackoverflow.com/questions/5849800/what-is-the-python-equivalent-of-matlabs-tic-and-toc-functions
                    print('\t Time to train network: {:.2f}'.format(elapsed), 'seconds')

#plot_scores(scores, fig_indicator="ddpg_normal_soln_final")

ii =  1
['buffer_size = ', 1000000, 'batch_size = ', 256, 'gamma=', 0.99, 'tau=', 0.001, 'lr_actor= ', 0.0003, 'lr_critic=', 0.0001, 'weight_decay=', 0]
Episode 1	Average Score: 0.36
Episode 2	Average Score: 0.45
Episode 3	Average Score: 0.47
Episode 4	Average Score: 0.50
Episode 5	Average Score: 0.49
Episode 6	Average Score: 0.51
Episode 7	Average Score: 0.50
Episode 8	Average Score: 0.51
Episode 9	Average Score: 0.52
Episode 10	Average Score: 0.51
Episode 11	Average Score: 0.53
Episode 12	Average Score: 0.55
Episode 13	Average Score: 0.58
Episode 14	Average Score: 0.64
Episode 15	Average Score: 0.71
Episode 16	Average Score: 0.79
Episode 17	Average Score: 0.84
Episode 18	Average Score: 0.93
Episode 19	Average Score: 1.00
Episode 20	Average Score: 1.09
Episode 21	Average Score: 1.18
Episode 22	Average Score: 1.27
Episode 23	Average Score: 1.34
Episode 24	Average Score: 1.42
Episode 25	Average Score: 1.50
Episode 26	Average Score: 1.59
Episode 27	Average Score: 1.70
Episode 28	Average 

Episode 252	Average Score: 9.88
Episode 253	Average Score: 9.87
Episode 254	Average Score: 9.86
Episode 255	Average Score: 9.85
Episode 256	Average Score: 9.84
Episode 257	Average Score: 9.83
Episode 258	Average Score: 9.83
Episode 259	Average Score: 9.83
Episode 260	Average Score: 9.82
Episode 261	Average Score: 9.81
Episode 262	Average Score: 9.81
Episode 263	Average Score: 9.80
Episode 264	Average Score: 9.78
Episode 265	Average Score: 9.78
Episode 266	Average Score: 9.78
Episode 267	Average Score: 9.77
Episode 268	Average Score: 9.76
Episode 269	Average Score: 9.76
Episode 270	Average Score: 9.77
Episode 271	Average Score: 9.78
Episode 272	Average Score: 9.78
Episode 273	Average Score: 9.78
Episode 274	Average Score: 9.78
Episode 275	Average Score: 9.78
Episode 276	Average Score: 9.77
Episode 277	Average Score: 9.75
Episode 278	Average Score: 9.74
Episode 279	Average Score: 9.75
Episode 280	Average Score: 9.76
Episode 281	Average Score: 9.76
Episode 282	Average Score: 9.75
Episode 

Episode 503	Average Score: 9.17
Episode 504	Average Score: 9.16
Episode 505	Average Score: 9.16
Episode 506	Average Score: 9.14
Episode 507	Average Score: 9.13
Episode 508	Average Score: 9.15
Episode 509	Average Score: 9.17
Episode 510	Average Score: 9.17
Episode 511	Average Score: 9.18
Episode 512	Average Score: 9.16
Episode 513	Average Score: 9.17
Episode 514	Average Score: 9.16
Episode 515	Average Score: 9.16
Episode 516	Average Score: 9.15
Episode 517	Average Score: 9.16
Episode 518	Average Score: 9.17
Episode 519	Average Score: 9.17
Episode 520	Average Score: 9.19
Episode 521	Average Score: 9.20
Episode 522	Average Score: 9.19
Episode 523	Average Score: 9.17
Episode 524	Average Score: 9.18
Episode 525	Average Score: 9.16
Episode 526	Average Score: 9.17
Episode 527	Average Score: 9.17
Episode 528	Average Score: 9.18
Episode 529	Average Score: 9.18
Episode 530	Average Score: 9.18
Episode 531	Average Score: 9.18
Episode 532	Average Score: 9.16
Episode 533	Average Score: 9.16
Episode 

Episode 756	Average Score: 9.42
Episode 757	Average Score: 9.41
Episode 758	Average Score: 9.42
Episode 759	Average Score: 9.42
Episode 760	Average Score: 9.43
Episode 761	Average Score: 9.44
Episode 762	Average Score: 9.45
Episode 763	Average Score: 9.46
Episode 764	Average Score: 9.48
Episode 765	Average Score: 9.48
Episode 766	Average Score: 9.48
Episode 767	Average Score: 9.49
Episode 768	Average Score: 9.49
Episode 769	Average Score: 9.49
Episode 770	Average Score: 9.50
Episode 771	Average Score: 9.50
Episode 772	Average Score: 9.50
Episode 773	Average Score: 9.52
Episode 774	Average Score: 9.51
Episode 775	Average Score: 9.50
Episode 776	Average Score: 9.51
Episode 777	Average Score: 9.51
Episode 778	Average Score: 9.51
Episode 779	Average Score: 9.51
Episode 780	Average Score: 9.52
Episode 781	Average Score: 9.52
Episode 782	Average Score: 9.53
Episode 783	Average Score: 9.53
Episode 784	Average Score: 9.54
Episode 785	Average Score: 9.54
Episode 786	Average Score: 9.54
Episode 

Episode 1006	Average Score: 8.83
Episode 1007	Average Score: 8.83
Episode 1008	Average Score: 8.83
Episode 1009	Average Score: 8.83
Episode 1010	Average Score: 8.82
Episode 1011	Average Score: 8.83
Episode 1012	Average Score: 8.82
Episode 1013	Average Score: 8.81
Episode 1014	Average Score: 8.81
Episode 1015	Average Score: 8.81
Episode 1016	Average Score: 8.80
Episode 1017	Average Score: 8.78
Episode 1018	Average Score: 8.78
Episode 1019	Average Score: 8.78
Episode 1020	Average Score: 8.78
Episode 1021	Average Score: 8.77
Episode 1022	Average Score: 8.76
Episode 1023	Average Score: 8.75
Episode 1024	Average Score: 8.74
Episode 1025	Average Score: 8.74
Episode 1026	Average Score: 8.73
Episode 1027	Average Score: 8.74
Episode 1028	Average Score: 8.73
Episode 1029	Average Score: 8.72
Episode 1030	Average Score: 8.72
Episode 1031	Average Score: 8.72
Episode 1032	Average Score: 8.70
Episode 1033	Average Score: 8.69
Episode 1034	Average Score: 8.70
Episode 1035	Average Score: 8.69
Episode 10

Episode 1251	Average Score: 9.83
Episode 1252	Average Score: 9.84
Episode 1253	Average Score: 9.84
Episode 1254	Average Score: 9.87
Episode 1255	Average Score: 9.88
Episode 1256	Average Score: 9.87
Episode 1257	Average Score: 9.87
Episode 1258	Average Score: 9.90
Episode 1259	Average Score: 9.91
Episode 1260	Average Score: 9.94
Episode 1261	Average Score: 9.96
Episode 1262	Average Score: 9.97
Episode 1263	Average Score: 9.97
Episode 1264	Average Score: 9.98
Episode 1265	Average Score: 9.97
Episode 1266	Average Score: 9.97
Episode 1267	Average Score: 9.98
Episode 1268	Average Score: 9.98
Episode 1269	Average Score: 10.00
Episode 1270	Average Score: 10.01
Episode 1271	Average Score: 10.01
Episode 1272	Average Score: 10.02
Episode 1273	Average Score: 10.03
Episode 1274	Average Score: 10.03
Episode 1275	Average Score: 10.03
Episode 1276	Average Score: 10.03
Episode 1277	Average Score: 10.04
Episode 1278	Average Score: 10.04
Episode 1279	Average Score: 10.06
Episode 1280	Average Score: 10.0

Episode 1489	Average Score: 10.49
Episode 1490	Average Score: 10.51
Episode 1491	Average Score: 10.51
Episode 1492	Average Score: 10.52
Episode 1493	Average Score: 10.52
Episode 1494	Average Score: 10.52
Episode 1495	Average Score: 10.53
Episode 1496	Average Score: 10.55
Episode 1497	Average Score: 10.54
Episode 1498	Average Score: 10.54
Episode 1499	Average Score: 10.54
Episode 1500	Average Score: 10.54
Episode 1500	Average Score: 10.54
	 Elapsed Time: 20248.20 seconds
	 Time to train network: 20249.87 seconds
['buffer_size = ', 1000000, 'batch_size = ', 256, 'gamma=', 0.99, 'tau=', 0.001, 'lr_actor= ', 0.0003, 'lr_critic=', 0.0001, 'weight_decay=', 0.0001]
Episode 1	Average Score: 0.52
Episode 2	Average Score: 0.55
Episode 3	Average Score: 0.59
Episode 4	Average Score: 0.52
Episode 5	Average Score: 0.43
Episode 6	Average Score: 0.38
Episode 7	Average Score: 0.33
Episode 8	Average Score: 0.29
Episode 9	Average Score: 0.26
Episode 10	Average Score: 0.24
	 Time to train network: 68.05 s

Episode 221	Average Score: 0.04
Episode 222	Average Score: 0.04
Episode 223	Average Score: 0.04
Episode 224	Average Score: 0.04
Episode 225	Average Score: 0.04
Episode 226	Average Score: 0.04
Episode 227	Average Score: 0.04
Episode 228	Average Score: 0.04
Episode 229	Average Score: 0.04
Episode 230	Average Score: 0.04
Episode 231	Average Score: 0.04
Episode 232	Average Score: 0.04
Episode 233	Average Score: 0.04
Episode 234	Average Score: 0.04
Episode 235	Average Score: 0.04
Episode 236	Average Score: 0.04
Episode 237	Average Score: 0.04
Episode 238	Average Score: 0.04
Episode 239	Average Score: 0.04
Episode 240	Average Score: 0.04
Episode 241	Average Score: 0.04
Episode 242	Average Score: 0.04
Episode 243	Average Score: 0.04
Episode 244	Average Score: 0.04
Episode 245	Average Score: 0.04
Episode 246	Average Score: 0.04
Episode 247	Average Score: 0.04
Episode 248	Average Score: 0.04
Episode 249	Average Score: 0.04
Episode 250	Average Score: 0.04
Episode 251	Average Score: 0.04
Episode 

KeyboardInterrupt: 

### 4. Watch a Smart Agent!

The code below loads the trained weights from file to watch a smart agent. 

To visualize the trained environment,
 - change the __*visible_environment*__ variable to **True** in (**2. Instantiate the Environment and Agent**)
 - restart the kernel, and 
 - **skip** the previous section (**3. Train the Agent with DQN**).

In [None]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# load the weights from file
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

for i in range(1):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations                  # get the current state (for each agent)
    for j in range(200):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]        # send the action to the environment
        states = env_info.vector_observations
        dones = env_info.local_done                  # see if episode has finished
        
        if np.any(dones):
            break     

In [None]:
env.close()