In [1]:
from ff_environment import ForceField
from agent import Agent
from collections import deque
import numpy as np
import torch

In [2]:
# Build the force-field environment and reset it; the info object returned by
# reset() exposes the initial observation through `.state` (read in the next cell).
env = ForceField()
env_info = env.reset()

In [3]:
# Size of each action, as reported by the environment.
action_size = env.action_size
print('Size of each action:', action_size)

# Examine the state space: take the initial observation from the reset above
# and measure its length; state_size is later handed to the Agent constructor.
state = env_info.state
state_size = len(state)
print('The agent observes a state with length: {}'.format(state_size))
print('The starting state looks like:', state)

Size of each action: 2
The agent observes a state with length: 4
The starting state looks like: [0.5 1.  0.  0. ]


In [4]:
# Instantiate the agent from the state/action sizes measured above.
# NOTE(review): random_seed=2 is passed straight through to Agent — presumably
# for reproducible initialisation; confirm in agent.py.
agent = Agent(state_size, action_size, random_seed=2)

In [5]:
# train the agent with ddpg
def ddpg(n_episodes=50000, max_t=1000, print_every=2000,
         states_tracker=None, solved_score=0.07):
    """Train the module-level ``agent`` on the module-level ``env`` with DDPG.

    Each episode resets the environment and the agent, then alternates
    ``agent.act`` / ``env.step`` / ``agent.step`` until the environment
    reports ``done`` or ``max_t`` steps have elapsed.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of steps per episode.
        print_every: Checkpoint/log interval in episodes; also the length of
            the rolling window used for the average reward.
        states_tracker: Optional list. When given, ``env_info.pos`` is
            appended to it after every step. Left as ``None`` nothing is
            recorded, so the function no longer depends on a global list
            defined in some other cell (and does not accumulate up to
            ``n_episodes * max_t`` entries unless asked to).
        solved_score: Rolling-average reward at which training stops early.

    Returns:
        List with one score per completed episode.

    Side effects:
        Writes ``actor_model.pth`` / ``critic_model.pth`` every
        ``print_every`` episodes and ``actor_solved.pth`` /
        ``critic_solved.pth`` when the solved threshold is reached.
    """
    scores = []
    scores_deque = deque(maxlen=print_every)

    for i_episode in range(n_episodes):
        env_info = env.reset()
        state = env_info.state         # current state
        score = 0                      # accumulated reward for this episode
        agent.reset()                  # reset noise process for action exploration

        for _ in range(max_t):
            action = agent.act(state)

            env_info = env.step(action)    # send action to environment
            next_state = env_info.state    # get next state
            reward = env_info.reward       # get reward
            done = env_info.done           # see if trial is finished

            agent.step(state, action, reward, next_state, done)

            score += reward
            state = next_state
            if states_tracker is not None:
                states_tracker.append(env_info.pos)

            if done:
                break

        # np.mean is kept so a vector-valued reward would still collapse to a scalar.
        episode_score = np.mean(score)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Compute the rolling average once per episode instead of per use.
        average = np.mean(scores_deque)

        print('\rEpisode {} \tAverage Reward: {:.2f}'.format(i_episode, average), end="")

        if i_episode % print_every == 0:
            torch.save(agent.actor_local.state_dict(), 'actor_model.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_model.pth')
            print('\rEpisode {} \tAverage Reward: {:.2f}'.format(i_episode, average))

        # NOTE(review): the average covers however many episodes are in the
        # window so far, so a single early high-scoring episode can trigger this.
        if average >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average))
            torch.save(agent.actor_local.state_dict(), 'actor_solved.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_solved.pth')
            break

    return scores

# Run training with the default hyperparameters and keep the per-episode
# scores that ddpg() returns.
scores = ddpg()



Episode 0 	Average Reward: -42.43
Episode 2000 	Average Reward: -41.66
Episode 4000 	Average Reward: -41.32
Episode 6000 	Average Reward: -41.30
Episode 8000 	Average Reward: -41.29
Episode 10000 	Average Reward: -41.30
Episode 12000 	Average Reward: -41.28
Episode 14000 	Average Reward: -41.32
Episode 16000 	Average Reward: -41.33
Episode 18000 	Average Reward: -41.30
Episode 20000 	Average Reward: -41.31
Episode 22000 	Average Reward: -41.28
Episode 24000 	Average Reward: -41.28
Episode 26000 	Average Reward: -41.31
Episode 28000 	Average Reward: -41.27
Episode 30000 	Average Reward: -41.28
Episode 32000 	Average Reward: -41.32
Episode 34000 	Average Reward: -41.28
Episode 36000 	Average Reward: -41.33
Episode 38000 	Average Reward: -41.25
Episode 40000 	Average Reward: -41.27
Episode 41722 	Average Reward: -41.31

KeyboardInterrupt: 

In [8]:
# Scratch check: a list holding two coordinate-like tuples, built in one literal.
state1 = [(0, 0), (1, 1)]

In [9]:
# Echo the list so the notebook displays its contents.
state1

[(0, 0), (1, 1)]

In [None]:
# Sample two-component action vector used to try out the norm computation below.
action = [4, 1]

In [None]:
# Euclidean (L2) norm of the sample action vector.
np.linalg.norm(action, 2)