In [1]:
import time
from collections import namedtuple, deque
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment

from agent import Agent

# Env and Agent

In [2]:
# Initialise the environment from the "Banana.app" build and select the
# first brain it exposes.
env = UnityEnvironment(file_name="Banana.app")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# The action count comes from the brain itself; the state size is read off
# the first observation vector returned by a training-mode reset.
action_size = brain.vector_action_space_size
env_info = env.reset(train_mode=True)[brain_name]
initial_observation = env_info.vector_observations[0]
state_size = len(initial_observation)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Create the agent (class imported from the local `agent` module), sized by
# the state and action dimensions read from the environment above.
agent = Agent(state_size, action_size)

# Training

In [None]:
# --- training configuration -------------------------------------------------
n_episodes = 5000         # number of episodes to run
n_rolling_average = 50    # window length of the rolling mean of the scores
update_every = 4          # agent.learn() is called once per this many steps

# --- bookkeeping ------------------------------------------------------------
t0 = time.time()
scores = []                                               # raw score of every episode
rolling_average_scores = deque(maxlen=n_rolling_average)  # most recent scores only
smoothed_scores = []                                      # rolling mean (NaN until the window is full)

print(' Time Evolved | Episode | Rolling Average Score ')
print('--------------+---------+-----------------------')

for i in range(n_episodes):

    # start a fresh episode in training mode
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    score = 0
    t = 0
    done = False

    while not done:
        # pick an action for the current state
        # (epsilon is presumably the exploration rate -- confirm in agent.py)
        action = agent.act(state, epsilon=0.01, verbose=0)

        # advance the environment by one step and read back the transition
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        # hand the transition to the agent
        agent.store_experience(state, action, reward, next_state, done)

        # learn on every `update_every`-th step, including step 0
        if t % update_every == 0:
            agent.learn()

        # accumulate the reward and move on to the next state
        score += reward
        state = next_state
        t += 1

    # --- per-episode statistics ---------------------------------------------
    dt = time.time() - t0
    scores.append(score)
    rolling_average_scores.append(score)
    if len(rolling_average_scores) == n_rolling_average:
        sm = np.mean(rolling_average_scores)
    else:
        # not enough episodes yet to fill the window
        sm = np.nan
    smoothed_scores.append(sm)

    # Overwrite the current status line in place; every n_rolling_average
    # episodes finish the line so that it stays visible in the log.
    line_end = "\n" if (i + 1) % n_rolling_average == 0 else ""
    print(f"\r {dt/60:>8.1f} min | {i+1:>7d} | {sm:>21.3f}", end=line_end)

 Time Evolved | Episode | Rolling Average Score 
--------------+---------+-----------------------
      1.0 min |      50 |                 0.500
      1.7 min |      88 |                 0.660

In [None]:
# Plot the raw per-episode scores together with their rolling average.
fig, ax = plt.subplots()
ax.plot(scores, alpha=0.5, label="episode score")
# smoothed_scores is NaN until the rolling window is full, so this curve
# only begins once the first full window of episodes is available.
ax.plot(smoothed_scores, label="rolling average")
ax.set(xlabel="Episode", ylabel="Score", title="Training progress")
ax.legend();  # trailing ';' suppresses the object repr in the notebook output