In [1]:
from unityagents import UnityEnvironment
import numpy as np
import torch
from IPython.display import clear_output, display

In [2]:
# Start the Unity environment from the 'Reacher_multi.app' build; the path is
# relative, so the build must sit in the notebook's working directory.
env = UnityEnvironment(file_name='Reacher_multi.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
import sys
sys.path.insert(0, './code/')
from ddptagent import Agent

In [4]:
# Use the first brain exposed by the environment for every later call.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (train_mode=True) to obtain an initial batch of observations.
env_info = env.reset(train_mode=True)[brain_name]

# Derive the problem dimensions from the brain and the first observation batch.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

# Report the setup, one labelled line per quantity.
for label, value in (
    ('Number of agents  :', num_agents),
    ('Action space      :', action_size),
    ("State size        :", state_size),
    ('State description :', states[0]),
):
    print(label, value)

Number of agents  : 20
Action space      : 4
State size        : 33
State description : [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
# Checkpoint files for the actor and critic networks; both are read with
# torch.load and are resolved relative to the working directory.
actor_path='actor_chk.pt'
critic_path='critic_chk.pt'

In [6]:
# Build the agent, then restore the checkpointed weights into its local networks.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=1)

# load_state_dict() copies the checkpoint tensors into the network parameters.
# Assigning the loaded object to `.state_dict` (as was done before) only shadows
# the method on the instance and leaves the randomly initialised weights in place.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine;
# load_state_dict() then copies the values onto whatever device the network uses.
# NOTE(review): assumes each checkpoint file holds a state_dict (not a pickled
# module) -- confirm against the code that wrote the files.
agent.actor_local.load_state_dict(torch.load(actor_path, map_location='cpu'))
agent.critic_local.load_state_dict(torch.load(critic_path, map_location='cpu'))

In [7]:
def ddpg(n_times=500, actor_path='actor_chk.pt', critic_path='critic_chk.pt'):
    """Run the DDPG agent in the environment and accumulate each agent's reward.

    The environment is reset once with ``train_mode=False`` and the agent is
    reset once; the agent is then driven without exploration noise for
    ``n_times`` passes of ``n_times`` steps each (``n_times ** 2`` environment
    steps in total). The environment is not reset between passes, so the
    returned totals span the whole run. The mean reward of the latest step is
    printed after every step, replacing the previous cell output.

    Relies on the notebook globals ``env``, ``brain_name``, ``agent`` and
    ``num_agents``.

    Params
    ======
        n_times (int)      : number of passes, and also the number of steps per pass
        actor_path (str)   : not used by this function; kept for caller compatibility
        critic_path (str)  : not used by this function; kept for caller compatibility

    Returns
    =======
        numpy.ndarray of length ``num_agents`` holding the summed reward of each agent
    """
    env_info = env.reset(train_mode=False)[brain_name]      # reset environment
    states = env_info.vector_observations                   # current state for each agent
    agents_score = np.zeros(num_agents)                     # running reward total per agent
    agent.reset()

    for _ in range(n_times):
        for t in range(n_times):
            actions = agent.act(states, add_noise=False)    # select an action for each agent
            env_info = env.step(actions)[brain_name]        # send actions to environment
            next_states = env_info.vector_observations      # next state for each agent
            rewards = env_info.rewards                      # reward for each agent
            dones = env_info.local_done                     # per-agent episode-finished flags

            # Every agent's transition is still handed to agent.step().
            # NOTE(review): presumably this stores the transition and may trigger
            # learning -- confirm that is wanted in a run that uses add_noise=False.
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done, t)
            states = next_states
            agents_score += rewards

            clear_output(wait=True)
            print(np.average(rewards))

    # The accumulated totals are the result; the previous `return scores`
    # referenced a name that was never defined and raised NameError.
    return agents_score

In [None]:
# Run the agent with n_times=1000 and print the average of the returned scores.
scores= ddpg(actor_path=actor_path, critic_path=critic_path, n_times=1000)
print("Final Score", np.average(scores))

0.003999999910593033


In [None]:
# Close the Unity environment once the run is finished.
env.close()