In [1]:
from collections import deque
from unityagents import UnityEnvironment
import numpy as np
from ddpg_agent import Agent
import torch
import time
# Launch the Unity environment from the local 'Tennis' build.
env = UnityEnvironment(file_name='Tennis')
brain_name = env.brain_names[0]                    # get the default brain
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name] # reset the environment
# One entry in `env_info.agents` per agent controlled through this brain.
num_agents = len(env_info.agents)
# Size of the action vector, as reported by the brain.
action_size = brain.vector_action_space_size
states = env_info.vector_observations              # get the current state for each agent
# Second dimension of the observation array is used as the state size.
state_size = states.shape[1]
# Single Agent object (from ddpg_agent) that is told how many agents it drives.
agent = Agent(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=0)

def ddpg(n_episodes=1600, max_t=10000, print_every=100, log_every=10):
    """Train the module-level ``agent`` on the module-level Unity ``env``.

    Relies on the globals ``env``, ``brain_name``, ``num_agents`` and ``agent``
    created by the setup statements above.

    Args:
        n_episodes: Number of training episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Length of the rolling window of episode scores used for
            the running average and for the "solved" check (despite its name,
            it does not control how often progress is printed).
        log_every: Every ``log_every`` episodes a progress line is printed and
            the actor/critic weights and the score history are written to disk.

    Returns:
        A list with one score per episode, where an episode's score is the
        mean of the per-agent reward totals for that episode.
    """
    scores_100 = deque(maxlen=print_every)   # rolling window of episode scores
    scores_all = []
    solved_reported = False
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations              # current state for each agent
        scores = np.zeros(num_agents)                      # per-agent reward totals
        agent.reset()
        for _ in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]       # send all actions to the environment
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            agent.step(states, actions, rewards, next_states, dones)
            if np.any(dones):                              # stop as soon as any agent is done
                break
            states = next_states

        # NOTE(review): the episode score is the mean over agents; confirm this
        # is the metric the 0.5 threshold below is meant to be compared against.
        score = np.mean(scores)
        scores_100.append(score)
        scores_all.append(score)
        window_mean = np.mean(scores_100)

        # Only declare the environment solved once the window is full; otherwise
        # a single good early episode would trigger the message.
        window_full = len(scores_100) == scores_100.maxlen
        if not solved_reported and window_full and window_mean > 0.5:
            print('\nEnvironment solved in {:d} episodes'.format(i_episode))
            solved_reported = True

        if i_episode % log_every == 0:
            # `end` belongs to print(); passing it to str.format() had no effect.
            print('\rEpisode {}, Score last {} episodes: {:.5f}, Score current episode: {:.5f}'
                  .format(i_episode, scores_100.maxlen, window_mean, score), end="\n")
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            np.save('scores_all_tennis.npy', scores_all)
    return scores_all

# Run training with the default arguments; `scores` receives the list of
# per-episode scores returned by ddpg().
scores = ddpg()



INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Episode 10, Score last 100 episodes: 0.00000, Score current episode: -0.00500
Episode 20, Score last 100 episodes: 0.00000, Score current episode: -0.00500
Episode 30, Score last 100 episodes: 0.00333, Score current episode: -0.00500
Episode 40, Score last 100 episodes: 0.00250, Score current episode: -0.00500
Episode 50, Score last 100 episodes: 0.00100, Score current episode: -0.00500
Episode 60, Score last 100 episodes: 0.00000, Score current episode: -0.00500
Episode 70, Score last 100 episodes: -0.00071, Score current episode: -0.00500
Episode 80, Score last 100 episodes: -0.00125, Score current episode: -0.00500
Episode 90, Score last 100 episodes: -0.00167, Score current episode: -0.00500
Episode 100, Score last 100 episodes: -0.00200, Score current episode: -0.00500
Episode 110, Score last 100 episodes: -0.00250, Score current episode: -0.00500
Episode 120, Score last 100 episodes: -0.00300, Score current episode: -0.00500
Episode 130, Score last 100 episodes: -0.00450, Score c

Episode 1040, Score last 100 episodes: 0.54635, Score current episode: 0.09500
Episode 1050, Score last 100 episodes: 0.55035, Score current episode: 0.09500
Episode 1060, Score last 100 episodes: 0.53430, Score current episode: 0.09500
Episode 1070, Score last 100 episodes: 0.47620, Score current episode: 0.29500
Episode 1080, Score last 100 episodes: 0.42015, Score current episode: 0.19500
Episode 1090, Score last 100 episodes: 0.34615, Score current episode: -0.00500
Episode 1100, Score last 100 episodes: 0.30055, Score current episode: 0.19500
Episode 1110, Score last 100 episodes: 0.26505, Score current episode: 0.19500
Episode 1120, Score last 100 episodes: 0.20350, Score current episode: 0.14500
Episode 1130, Score last 100 episodes: 0.19495, Score current episode: 0.44500
Episode 1140, Score last 100 episodes: 0.20895, Score current episode: 0.29500
Episode 1150, Score last 100 episodes: 0.19895, Score current episode: 0.09500
Episode 1160, Score last 100 episodes: 0.21795, Sco

In [None]:
from unityagents import UnityEnvironment
import numpy as np
from ddpg_agent import Agent
import torch

# Evaluation cell: load the saved actor/critic weights and play a few episodes
# without exploration noise, printing the score of each episode.
# NOTE(review): the training cell never closes its own environment; presumably
# it has to be closed (or the kernel restarted) before this one is launched -
# confirm against the unityagents behaviour.
env = UnityEnvironment(file_name='Tennis')
try:
    brain_name = env.brain_names[0]                     # get the default brain
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]  # reset once to read the sizes
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    agent = Agent(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=0)

    # Load weights; the map_location callable keeps tensors on the storage they
    # are deserialized to instead of the device they were saved from.
    agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location=lambda storage, loc: storage))
    agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location=lambda storage, loc: storage))

    n_epi = 5        # number of evaluation episodes
    t_steps = 10000  # step cap per episode
    episode_scores = []

    # range(1, n_epi + 1) so that exactly n_epi episodes are played.
    for i in range(1, n_epi + 1):
        # Start every episode from a fresh reset and a zeroed per-agent total.
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        for t in range(1, t_steps + 1):
            actions = agent.act(states, False)            # second argument: add noise = False
            env_info = env.step(actions)[brain_name]      # send the actions to the environment
            states = env_info.vector_observations         # get the next states
            dones = env_info.local_done
            # ndarray += list adds element-wise; the original list += list
            # appended the rewards instead of summing them per agent.
            scores += env_info.rewards
            if np.any(dones):                             # exit loop if episode finished
                break
        episode_scores.append(np.mean(scores))
        print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
finally:
    # Always shut the environment down, even if loading or stepping fails.
    env.close()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the per-episode score history written by the training loop.
scores = np.load('scores_all_tennis.npy')

# Use the explicit figure/axes objects rather than the pyplot state machine.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')

# Finalize the layout and write the file before showing, so the displayed
# figure and the saved image are laid out the same way.
fig.tight_layout()
fig.savefig('output.png', dpi=300)
plt.show()