# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train a DDPG agent on the Unity ML-Agents Reacher environment, first with the single-agent build and then with the 20-agent build.

### 1. Import the Necessary Packages

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

### 2. Instantiate the Environment and Agent

In [2]:
# Install the local Python package located in ../python/ (-q keeps pip quiet).
# NOTE(review): presumably this is what provides `unityagents`, which the
# Reacher wrapper imports — confirm against the contents of ../python/.
!pip install -q ../python/

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 3.0.8 which is incompatible.[0m


In [3]:
class Reacher:
    """Small gym-style wrapper around a Unity ML-Agents environment.

    Only the first brain reported by the environment is driven. The wrapper
    owns the underlying environment and closes it in ``close()`` (also invoked
    from ``__del__``).

    Attributes:
        env: The wrapped ``UnityEnvironment``, or ``None`` once closed or if
            start-up failed.
        brain_name: Name of the brain used to index reset/step results.
        num_agents: Number of agents reported by the first reset.
        action_size: ``vector_action_space_size`` of the brain.
        state_size: Length of one agent's observation vector.
    """

    def __init__(self, path):
        """Launch the Unity executable and print a short summary of it.

        Args:
            path: File name of the Unity environment build, forwarded to
                ``UnityEnvironment(file_name=path)``.
        """
        # Define the attributes up front so close()/__del__ stay safe even if
        # anything below raises (missing package, bad path, ...).
        self.env = None
        self.brain_name = None

        # Imported lazily so the class can be defined without unityagents.
        from unityagents import UnityEnvironment
        # Hand ownership to the instance immediately: if a later line fails,
        # the already-started environment is still closed by close()/__del__.
        self.env = UnityEnvironment(file_name=path)
        env = self.env

        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        self.brain_name = brain_name

        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)
        action_size = brain.vector_action_space_size
        print('Size of each action:', action_size)
        states = env_info.vector_observations
        state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
        print('The state for the first agent looks like:', states[0])

        # Expose the discovered dimensions so callers do not have to hard-code them.
        self.num_agents = num_agents
        self.action_size = action_size
        self.state_size = state_size

    def close(self):
        """Close the wrapped environment; safe to call more than once."""
        # getattr: the attribute is absent when __init__ never ran.
        env = getattr(self, 'env', None)
        if env is not None:
            # Drop the reference first so a second call is a no-op.
            self.env = None
            env.close()

    def __del__(self):
        self.close()

    def reset(self):
        """Reset the environment in training mode.

        Returns:
            The ``vector_observations`` of the wrapped brain after the reset.
        """
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        return env_info.vector_observations

    def step(self, actions):
        """Advance the environment by one step.

        Args:
            actions: Actions forwarded unchanged to ``env.step``.

        Returns:
            Tuple ``(next_states, rewards, dones, env_info)`` taken from the
            brain's ``vector_observations``, ``rewards`` and ``local_done``
            fields, plus the raw info object itself.
        """
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        return next_states, rewards, dones, env_info

In [6]:
def ddpg(agent, env, n_episodes=1000, max_t=1000, window_size=100, is_20=False, ckpt_prefix='checkpoint',
         reward_accum_steps=35, gamma=0.99):
    """Train ``agent`` on ``env`` using discounted multi-step reward accumulation.

    Each episode is split into windows of at most ``reward_accum_steps``
    environment steps. Inside a window, the reward stored for step ``i`` is
    replaced by the discounted sum of the rewards from ``i`` to the end of the
    window; every transition is then handed to ``agent.step`` one agent at a
    time. Actor and critic weights are saved after every episode.

    NOTE(review): ``agent.step`` still receives the *immediate* next state
    together with the accumulated reward — confirm that this is what
    ``ddpg_agent.Agent`` expects.

    Args:
        agent: Object providing ``reset()``, ``act(state)``,
            ``step(state, action, reward, next_state, done)`` and the
            ``actor_local`` / ``critic_local`` modules.
        env: Environment providing ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done, info)`` with one entry per agent.
        n_episodes: Number of training episodes.
        max_t: Maximum number of environment steps per episode.
        window_size: Number of recent episodes in the moving average.
        is_20: ``True`` for the 20-agent environment, ``False`` for one agent.
        ckpt_prefix: Prefix of the ``*_actor.pth`` / ``*_critic.pth`` files.
        reward_accum_steps: Maximum length of a reward accumulation window.
        gamma: Discount factor used inside the accumulation window.

    Returns:
        List with one per-agent score array per episode.

    Raises:
        ValueError: If ``reward_accum_steps`` is smaller than 1.
    """
    if reward_accum_steps < 1:
        # A zero-length window would never advance t and loop forever.
        raise ValueError('reward_accum_steps must be >= 1, got {}'.format(reward_accum_steps))

    # Column vector [gamma^0, gamma^1, ...] that broadcasts over the agents.
    discounts = np.expand_dims(gamma ** np.arange(reward_accum_steps), 1)
    num_parallel = 20 if is_20 else 1

    scores_deque = deque(maxlen=window_size)
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = np.zeros([num_parallel])
        t = 0
        episode_over = False
        while t < max_t and not episode_over:
            # collect data
            states = []
            actions = []
            rewards = []
            next_states = []
            dones = []
            # Never step past max_t: the last window of an episode may be shorter.
            for _ in range(min(reward_accum_steps, max_t - t)):
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                score += reward

                states.append(state)  # [accum_steps, num_parallel, state_size]
                actions.append(action)  # [accum_steps, num_parallel, action_size]
                rewards.append(reward)  # [accum_steps, num_parallel]
                next_states.append(next_state) # [accum_steps, num_parallel, state_size]
                dones.append(done) # [accum_steps, num_parallel]

                state = next_state
                t += 1

                # Stop at the episode boundary so neither the score nor the
                # accumulated returns mix in rewards from after termination.
                if np.any(done):
                    episode_over = True
                    break

            # calculate rewards
            rewards = np.array(rewards, dtype=np.float32)
            window_len = rewards.shape[0]
            # Forward order is required: row i only reads rows >= i, which
            # still hold raw rewards when row i is overwritten.
            for accum_step_i in range(window_len):
                remaining = window_len - accum_step_i
                rewards[accum_step_i,:] = np.sum(rewards[accum_step_i:,:] * discounts[:remaining,:], 0)

            # agent step
            for accum_step_i in range(window_len):
                for parallel_i in range(num_parallel):
                    agent.step(states[accum_step_i][parallel_i],
                               actions[accum_step_i][parallel_i],
                               rewards[accum_step_i][parallel_i],
                               next_states[accum_step_i][parallel_i],
                               dones[accum_step_i][parallel_i])

        scores_deque.append(score)
        scores.append(score)
        cur_mean = np.mean(score)
        moving_mean = np.mean(scores_deque)
        print('\rEpisode {}\tAverage Score: {:.2f}\tCur Score: {:.2f}'.format(i_episode, moving_mean, cur_mean))
        # Overwritten every episode, so the files always hold the latest weights.
        torch.save(agent.actor_local.state_dict(), ckpt_prefix + '_actor.pth')
        torch.save(agent.critic_local.state_dict(), ckpt_prefix + '_critic.pth')

        if len(scores_deque) == window_size and moving_mean >= 30.:
            print("Solved at episode {}!".format(i_episode - window_size + 1))

    return scores

In [5]:
def evaluate(env, agent):
    """Run one noise-free rollout of at most 1000 steps and return the scores.

    The agent acts with ``add_noise=False``. Rewards are summed per agent, and
    the rollout ends early as soon as every agent reports ``done``.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``.
        agent: Agent providing ``act(state, add_noise=False)``.

    Returns:
        Array holding the accumulated reward of each agent.
    """
    state = env.reset()
    total = None
    steps_left = 1000
    while steps_left > 0:
        steps_left -= 1
        action = agent.act(state, add_noise=False)
        state, reward, done, _ = env.step(action)
        # The accumulator is sized lazily from the first reward vector.
        if total is None:
            total = np.zeros(len(reward))
        total += reward
        if np.all(done):
            break
    return total

### 3. Experiment in Reacher_One

In [None]:
# Start the Unity environment through the Reacher wrapper defined above.
# NOTE(review): judging by the file name this is the single-agent build — confirm.
env = Reacher('/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

In [None]:
import ddpg_agent
from importlib import reload
# Reload so that edits made to ddpg_agent.py take effect without restarting the kernel.
reload(ddpg_agent)
# NOTE(review): state_size / action_size are hard-coded; they presumably match the
# sizes printed by Reacher.__init__ for this environment — confirm.
agent = ddpg_agent.Agent(state_size=33, action_size=4, random_seed=2)

In [None]:
from workspace_utils import active_session

# Train the single-environment agent inside active_session().
# NOTE(review): active_session presumably keeps the hosted workspace from
# timing out during the long run — confirm against workspace_utils.
with active_session():
    scores = ddpg(agent, env, n_episodes=1000)

# Learning curve: episode score against 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Restore the weights written by ddpg() with its default ckpt_prefix ('checkpoint'),
# then run one noise-free evaluation rollout and show the per-agent score.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
score = evaluate(env, agent)
print(score)

### 4. Experiment in Reacher_20

In [7]:
# Start the multi-agent Reacher build; the recorded output below reports 20 agents.
env_20 = Reacher('/data/Reacher_Linux_NoVis/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
   1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   5.75471878e+00  -1.00000000e+00
   5.55726624e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
  -1.68164849e-01]


In [8]:
import ddpg_agent
from importlib import reload
# Reload so that edits made to ddpg_agent.py take effect without restarting the kernel.
reload(ddpg_agent)
# 33 and 4 match the per-agent observation and action sizes reported in the
# environment start-up output above.
agent_20 = ddpg_agent.Agent(state_size=33, action_size=4, random_seed=2)

In [None]:
from workspace_utils import active_session

# Train on the 20-agent environment; checkpoints go to checkpoint_20_*.pth.
# NOTE(review): active_session presumably keeps the hosted workspace from
# timing out during the long run — confirm against workspace_utils.
with active_session():
    scores = ddpg(
        agent_20,
        env_20,
        n_episodes=1000,
        is_20=True,
        ckpt_prefix='checkpoint_20',
        reward_accum_steps=35,
    )

# Learning curve: one line per agent, episode score against 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 1	Average Score: 0.12	Cur Score: 0.12: 0.00011599168647080669
actor_loss: 0.007831086404621601, critic_loss: 9.249900176655501e-059

In [None]:
# Restore the weights saved under the 'checkpoint_20' prefix, run one noise-free
# evaluation rollout, and show both the per-agent scores and their mean.
agent_20.actor_local.load_state_dict(torch.load('checkpoint_20_actor.pth'))
agent_20.critic_local.load_state_dict(torch.load('checkpoint_20_critic.pth'))
score = evaluate(env_20, agent_20)
print(score)
print(np.mean(score))

In [6]:
# Keep a copy of the current 20-agent checkpoints: ddpg() overwrites
# checkpoint_20_*.pth after every episode when run with that prefix.
# NOTE(review): the meaning of the "_23" suffix is not evident from the notebook — confirm.
%cp checkpoint_20_actor.pth checkpoint_20_actor_23.pth
%cp checkpoint_20_critic.pth checkpoint_20_critic_23.pth