# Deep Deterministic Policy Gradient (DDPG)
---
This notebook trains a DDPG agent on the Udacity-modified version of Unity's Reacher environment.

### 1. Import the Necessary Packages

In [1]:
#import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import time   

### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below. 

Change the __*visible_environment*__ variable to enable training with (True) or without (False) a visible environment.

The `device = ...` line printed further below (after the cell in section 2a) indicates whether a CPU or a GPU is being used.

In [2]:
from unityagents import UnityEnvironment

# ---- user-tunable switches --------------------------------------------------
visible_environment = False   # True selects a "Vis_*" build, False a "NoVis_*" build
one_agent = False             # True selects the one-agent build, False the 20-agent build

# Reacher executable for every (visible_environment, one_agent) combination.
_reacher_builds = {
    (True, True): '../unity_environments/Reacher/Vis_one_agent/Reacher.x86_64',
    (False, True): '../unity_environments/Reacher/NoVis_one_agent/Reacher.x86_64',
    (True, False): '../unity_environments/Reacher/Vis_20_agents/Reacher.x86_64',
    (False, False): '../unity_environments/Reacher/NoVis_20_agents/Reacher.x86_64',
}
_reacher_path = _reacher_builds.get((visible_environment, one_agent))
if _reacher_path is not None:
    env = UnityEnvironment(file_name=_reacher_path)

# The first brain listed by the environment is the one driven from Python.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode so the problem dimensions can be read off.
env_info = env.reset(train_mode=True)[brain_name]

# How many agents the build exposes.
num_agents = len(env_info.agents)

# Size of one action vector, as reported by the brain.
action_size = brain.vector_action_space_size

# Size of one observation vector: second axis of the observation array.
states = env_info.vector_observations
state_size = states.shape[1]


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### 2a. Hyperparameter Search


In [3]:
random.seed(32)  # fixed seed: the candidate lists are identical on every run


def _draw(sampler, count=100):
    """Return a list of `count` values obtained by calling `sampler()` repeatedly."""
    return [sampler() for _ in range(count)]


# 100 candidates per hyperparameter. Ranges whose two bounds coincide
# yield the same value for every candidate.
BUFFER_SIZE = _draw(lambda: 10 ** random.uniform(6, 6))               # replay buffer size
BATCH_SIZE = _draw(lambda: 2 ** random.randint(7, 7))                 # minibatch size
GAMMA = _draw(lambda: random.uniform(0.99, 0.99))                     # discount factor
TAU = _draw(lambda: 10 ** random.uniform(-2, -2))                     # soft-update rate of the target parameters
LR_ACTOR = _draw(lambda: 10 ** random.uniform(-3.1, -2.9))            # actor learning rate
LR_CRITIC = _draw(lambda: 3 * (10 ** random.uniform(-4.1, -3.9)))     # critic learning rate
WEIGHT_DECAY = np.zeros(100)                                          # L2 weight decay (all zero)


from ddpg_agent import Agent

device =  cuda:0


### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.

Alternatively, **skip** to the next step below (**4. Watch a Smart Agent!**), to load the saved model weights from a pre-trained agent.

In [4]:
def plot_scores(scores, fig_indicator="normal", ylim=(-4, 24), out_dir="Results"):
    """Plot per-episode scores, then persist both the figure and the raw scores.

    Writes ``<out_dir>/Figure_<fig_indicator>.png`` and
    ``<out_dir>/scores_<fig_indicator>.npy``; the output directory is created
    when it does not exist yet.

    Args:
        scores: sequence of per-episode scores (x axis is the index).
        fig_indicator: tag inserted into both output file names.
        ylim: (bottom, top) limits of the y axis; pass None to let matplotlib
            choose limits that fit the data.
        out_dir: directory receiving the PNG and the .npy file.
    """
    import os  # local import so the function does not depend on the import cell

    # savefig / np.save do not create missing directories themselves.
    os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()
    ax.plot(np.arange(len(scores)), scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    if ylim is not None:
        ax.set_ylim(ylim)

    # Save before showing: depending on the backend, show() may release the
    # figure, so writing it first is the reliable order.
    fig.savefig(os.path.join(out_dir, "Figure_" + fig_indicator + ".png"))
    np.save(os.path.join(out_dir, "scores_" + fig_indicator + ".npy"), scores)

    plt.show()
    

def _set_optimizer_option(optimizer, key, value):
    """Store `value` under `key` in every parameter group of `optimizer`."""
    for param_group in optimizer.param_groups:
        param_group[key] = value


def ddpg(n_episodes=2000, max_t=700):
    """Run the DDPG training loop and return the per-episode scores.

    Uses the notebook globals `env`, `brain_name`, `num_agents` and `agent`.

    Each episode resets the environment in training mode, steps it for at most
    `max_t` steps (stopping early as soon as any agent reports done) and feeds
    every transition to `agent.step`. The episode score is the mean of the
    accumulated rewards over all agents.

    Schedule applied inside the loop:
      * from episode 200 on, every 5th episode: actor and critic weights are
        saved to 'checkpoint_actor_<episode>_.pth' /
        'checkpoint_critic_<episode>_.pth';
      * at episode 200: both optimizers' learning rates are set to 1e-4;
      * at episode 240: the actor optimizer's betas are set to (0, 0) and both
        learning rates are set to 0.

    Training stops early once the mean of the last (up to) 100 episode scores
    reaches 30.0.

    Args:
        n_episodes: maximum number of episodes to run.
        max_t: maximum number of environment steps per episode.

    Returns:
        List with one score (mean over agents) per completed episode.
    """
    scores_deque = deque(maxlen=100)   # window behind the running average
    scores = []

    tt = time.time()
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment
        states = env_info.vector_observations                  # current state of every agent
        agent.reset()
        score = np.zeros(num_agents)                           # accumulated reward per agent
        for _ in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]           # send the actions to the environment

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards

            if np.any(dones):
                break

        episode_score = np.mean(score)
        scores_deque.append(episode_score)
        scores.append(episode_score)
        average_score = np.mean(scores_deque)

        agent.keep_learning = 1

        # Periodic snapshots of the local networks late in training.
        if i_episode >= 200 and i_episode % 5 == 0:
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_' + str(i_episode) + '_.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_' + str(i_episode) + '_.pth')

        # Hand-tuned learning-rate schedule.
        if i_episode == 200:
            _set_optimizer_option(agent.actor_optimizer, 'lr', 1e-4)
            _set_optimizer_option(agent.critic_optimizer, 'lr', 1e-4)

        if i_episode == 240:
            # NOTE(review): a learning rate of 0 presumably freezes both
            # networks for the remaining episodes - confirm against the
            # optimizers created in ddpg_agent.
            _set_optimizer_option(agent.actor_optimizer, 'betas', (0, 0))
            _set_optimizer_option(agent.actor_optimizer, 'lr', 0)
            _set_optimizer_option(agent.critic_optimizer, 'lr', 0)

        print('\rEpisode {}\tAverage Score: {:.2f} \tScore: {:.2f}'.format(i_episode, average_score, episode_score))

        solved = average_score >= 30.0
        if i_episode % 100 == 0 or solved:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
            elapsed = time.time() - tt
            print('\t Elapsed Time: {:.2f}'.format(elapsed), 'seconds')
        if solved:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average_score))
            break

    return scores

for ii in range(1, 2):
    print('ii = ', ii)

    # Settings held constant for every combination tried below.
    BUFFER_SIZE, GAMMA, TAU = int(1e6), 0.99, 1e-3

    # Each grid currently holds a single value; add entries to a list to sweep
    # that hyperparameter (e.g. batch sizes 32..512, learning rates 1e-3/3e-4/1e-4).
    for BATCH_SIZE in [64]:
        for LR_ACTOR in [1e-3]:
            for LR_CRITIC in [1e-3]:
                for WEIGHT_DECAY in [0]:
                    print(['buffer_size = ', int(BUFFER_SIZE),
                           'batch_size = ', int(BATCH_SIZE),
                           'gamma=', GAMMA,
                           'tau=', TAU,
                           'lr_actor= ', LR_ACTOR,
                           'lr_critic=', LR_CRITIC,
                           'weight_decay=', WEIGHT_DECAY])

                    t = time.time()   # wall-clock start of this run
                    agent_kwargs = dict(
                        buffer_size=int(BUFFER_SIZE),
                        batch_size=int(BATCH_SIZE),
                        gamma=GAMMA,
                        tau=TAU,
                        lr_actor=LR_ACTOR,
                        lr_critic=LR_CRITIC,
                        weight_decay=WEIGHT_DECAY,
                    )
                    # A fresh agent per combination; ddpg() trains the global `agent`.
                    agent = Agent(state_size=state_size, action_size=action_size,
                                  random_seed=7, **agent_kwargs)
                    # To resume from saved weights instead of starting from scratch:
                    #agent.actor_local.load_state_dict(torch.load('checkpoint_actor_215_.pth'))
                    #agent.critic_local.load_state_dict(torch.load('checkpoint_critic_215_.pth'))
                    scores = ddpg(n_episodes=350, max_t=1400)
                    elapsed = time.time() - t
                    print('\t Time to train network: {:.2f}'.format(elapsed), 'seconds')

#plot_scores(scores, fig_indicator="ddpg_normal_soln_final")

ii =  1
['buffer_size = ', 1000000, 'batch_size = ', 64, 'gamma=', 0.99, 'tau=', 0.001, 'lr_actor= ', 0.001, 'lr_critic=', 0.001, 'weight_decay=', 0]
Episode 1	Average Score: 0.76 	Score: 0.76
Episode 2	Average Score: 0.91 	Score: 1.06
Episode 3	Average Score: 0.85 	Score: 0.75
Episode 4	Average Score: 0.87 	Score: 0.93
Episode 5	Average Score: 0.90 	Score: 0.99
Episode 6	Average Score: 0.88 	Score: 0.83
Episode 7	Average Score: 0.85 	Score: 0.64
Episode 8	Average Score: 0.88 	Score: 1.05
Episode 9	Average Score: 0.95 	Score: 1.55
Episode 10	Average Score: 1.03 	Score: 1.74
Episode 11	Average Score: 1.07 	Score: 1.48
Episode 12	Average Score: 1.12 	Score: 1.65
Episode 13	Average Score: 1.16 	Score: 1.65
Episode 14	Average Score: 1.21 	Score: 1.91
Episode 15	Average Score: 1.27 	Score: 2.03
Episode 16	Average Score: 1.28 	Score: 1.54
Episode 17	Average Score: 1.31 	Score: 1.67
Episode 18	Average Score: 1.32 	Score: 1.56
Episode 19	Average Score: 1.36 	Score: 2.03
Episode 20	Average Scor

KeyboardInterrupt: 

### 4. Watch a Smart Agent!

The code below loads the trained weights from file to watch a smart agent. 

To visualize the trained environment,
 - change the __*visible_environment*__ variable to **True** in (**2. Instantiate the Environment and Agent**)
 - restart the kernel, and 
 - **skip** the previous section (**3. Train the Agent with DDPG**).

In [None]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# `agent` is only created by the training cell in section 3. When that section
# was skipped (as the instructions above suggest), build an agent here with the
# same settings the training cell uses, so the weights have networks to load into.
try:
    agent
except NameError:
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=7,
                  buffer_size=int(1e6), batch_size=64, gamma=0.99, tau=1e-3,
                  lr_actor=1e-3, lr_critic=1e-3, weight_decay=0)

# Load the weights from file.
# NOTE: the training loop only writes episode-numbered checkpoints
# ('checkpoint_actor_<episode>_.pth'); 'checkpoint_actor.pth' and
# 'checkpoint_critic.pth' must already exist, e.g. as copies of the
# checkpoint you want to watch.
# map_location='cpu' lets checkpoints saved on a GPU load on a CPU-only
# machine; load_state_dict then copies the values into the existing networks.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location='cpu'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location='cpu'))

for _ in range(1):
    env_info = env.reset(train_mode=False)[brain_name]     # train_mode=False for watching
    states = env_info.vector_observations                  # get the current state (for each agent)
    for _ in range(200):                                   # watch at most 200 steps
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]           # send the action to the environment
        states = env_info.vector_observations
        dones = env_info.local_done                        # see if episode has finished

        if np.any(dones):
            break

In [None]:
# Close the Unity environment created in section 2.
env.close()