# Init

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from unityagents import UnityEnvironment
import numpy as np
import pdb

from replay_buffer import ReplayBuffer
from maddpg_agent import MaddpgAgent

In [None]:
# Start the Unity environment from the Tennis build next to this notebook.
# NOTE(review): "./Tennis.app" looks like a macOS application bundle — point
# file_name at the matching build when running on another platform.
env = UnityEnvironment(file_name="./Tennis.app")

In [None]:
# Pick the first brain exposed by the environment and reset in training mode
# to obtain the initial environment info for that brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# One entry in `env_info.agents` per agent in the scene.
num_agents = len(env_info.agents)
print(f"Number of agents: {num_agents}")

# Global sizes: total number of elements in the observation / action arrays
# taken over all agents together.
actions = env_info.previous_vector_actions
states = env_info.vector_observations
global_state_space_size = states.size
global_action_space_size = actions.size
print(f"Global states: {global_state_space_size}")
print(f"Global actions: {global_action_space_size}")

# Local sizes: what a single agent observes / emits. The local observation is
# the per-frame observation size multiplied by the number of stacked frames.
action_space_size = brain.vector_action_space_size
state_space_size = (
    brain.num_stacked_vector_observations
    * brain.vector_observation_space_size
)
print(f"Local states: {state_space_size}")
print(f"Local actions: {action_space_size}")

# Show a sample observation for the first agent.
print('The state for the first agent looks like:', states[0])

# Training

In [None]:
# Config: hyperparameters read by the agent-setup and training cells below.

# Number of training episodes. The training loop reads `num_episodes`, but no
# cell defined it, so running the notebook top to bottom raised NameError.
# NOTE(review): 10000 is a placeholder budget — tune for your run.
num_episodes = 10000

batch_size = 64  # samples drawn per training update (alternative: 1024)
max_episode_length = 250  # cap on environment steps per episode
expected_episode_length = 20  # used for sizing the replay-buffer warm-up
replay_buffer_size_max = int(1e6)  # maximum size passed to the replay buffer
train_every_steps = 5  # environment steps between training updates
epsilon = 1.0  # initial exploration factor passed to agent.act()
epsilon_decay = 0.9999  # multiplicative decay applied to epsilon

In [None]:
# Create one MADDPG agent per environment agent. Each agent is given its own
# index, the agent count, and both the local and the global state/action sizes.
agents = []
for i in range(num_agents):
    # Adjacent f-string literals are used instead of a backslash continuation
    # inside the string, which embedded the next line's indentation in the
    # printed message.
    print(f"Agent {i}: state space: {state_space_size}; "
          f"action space {action_space_size}.")
    agents.append(MaddpgAgent(
        i, num_agents, state_space_size, action_space_size,
        global_state_space_size, global_action_space_size))

# Don't start learning until the buffer holds roughly `batch_size` episodes'
# worth of samples (batch_size * expected_episode_length), so that a batch
# draws on average only 1-2 samples from each recorded episode.
min_samples_required = batch_size * expected_episode_length

# Create the replay buffer shared by all agents.
replay_buffer = ReplayBuffer(
    max_size=replay_buffer_size_max, min_samples_required=min_samples_required)

# Per-episode reward totals (one array per episode), filled in during training.
episode_rewards = []

In [None]:

# Main MADDPG training loop.
#
# For each episode: reset the environment, let every agent act on its own
# observation, store the joint transition in the replay buffer and, every
# `train_every_steps` environment steps (once the buffer has enough samples),
# update all agents on a sampled batch.
#
# Both ranges are inclusive of their upper limit so that exactly
# `num_episodes` episodes of at most `max_episode_length` steps are run
# (`range(1, n)` ran one iteration short).
train_step = 0
for episode in range(1, num_episodes + 1):

    # Receive initial state vector s = (s_1, ..., s_N)
    env_info = env.reset(train_mode=True)[brain_name]
    s = env_info.vector_observations

    # Running reward total for this episode, one entry per agent.
    episode_rewards.append(np.zeros(num_agents))
    for t in range(1, max_episode_length + 1):

        # For each agent i, select actions a = (a_1, ..., a_N) using the
        # current policy and exploration noise scaled by epsilon.
        a = [agent.act(state, epsilon=epsilon)
             for agent, state in zip(agents, s)]
        # Epsilon is decayed once per environment step, but only after the
        # replay buffer reports enough samples for learning to start.
        if replay_buffer.has_enough_samples():
            epsilon *= epsilon_decay

        # Execute actions a = (a_1, ..., a_N) and observe:
        #   reward r = (r_1, ..., r_N)
        #   next-state vector s' = (s'_1, ..., s'_N)
        env_info = env.step(a)[brain_name]
        s_prime = env_info.vector_observations
        r = env_info.rewards
        dones = env_info.local_done

        # Store (s, a, r, s') in replay buffer D.
        # NOTE(review): the terminal flags in `dones` are not stored with the
        # transition — confirm MaddpgAgent.update does not need them.
        replay_buffer.append((s, a, r, s_prime))

        # Record progress
        episode_rewards[-1] = episode_rewards[-1] + r

        # Advance
        s = s_prime
        train_step += 1

        # Periodically (after a certain number of steps) run update/training,
        # provided the buffer has enough samples.
        if (train_step % train_every_steps == 0
                and replay_buffer.has_enough_samples()):

            # Sample replay buffer
            sample = replay_buffer.sample(batch_size=batch_size)

            # For every sampled tuple, each agent needs to know which action
            # the other agents would choose (via act(..., target_actor=True))
            # in the next state s', in order to calculate q-values.
            next_actions = [
                [agent.act(next_state, target_actor=True)
                 for agent, next_state in zip(agents, next_states)]
                for (_, _, _, next_states) in sample]

            # Update/train all the agents
            for agent in agents:
                agent.update(sample, next_actions)

        # Terminate episode early if any agent reports done
        if any(dones):
            break

    # Report progress every 100 episodes. Adjacent f-string literals avoid
    # embedding source indentation in the printed message.
    if episode % 100 == 0:
        print(f"Epsilon: {epsilon}. Average episode return over last 100 episodes: "
              f"{np.array(episode_rewards[-100:]).mean(axis=0)}")

In [None]:
# Watch the agents play: run 9 evaluation episodes (i = 1..9) with the
# environment in non-training mode and report the best per-agent score.
for i in range(1, 10):
    # Reset the environment and read the initial observation of each agent.
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)  # running score, one entry per agent
    t = 0  # number of environment steps taken in this episode
    while True:
        t += 1

        # Each agent acts on its own observation using act()'s default
        # arguments; the joint action is clipped to [-1, 1] before stepping.
        actions = np.clip(
            [agent.act(state) for agent, state in zip(agents, states)],
            -1, 1)

        # Send all actions to the environment and read back the results.
        env_info = env.step(actions)[brain_name]
        rewards = env_info.rewards        # reward for each agent
        dones = env_info.local_done       # per-agent episode-finished flags
        scores += rewards                 # accumulate the score per agent
        states = env_info.vector_observations  # roll over to the next step

        # Leave the step loop as soon as any agent reports done.
        if np.any(dones):
            break
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))
    print(t)