# Init

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the local replay_buffer / maddpg_agent modules imported below) are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
from unityagents import UnityEnvironment

import pdb, pickle

import numpy as np
import holoviews as hv
import pandas as pd
import xarray as xr

from holoviews import opts
from holoviews.streams import Pipe, Buffer
from holoviews.operation.timeseries import rolling

from replay_buffer import ReplayBuffer
from maddpg_agent import MaddpgAgent

hv.extension('bokeh')

In [None]:
# Create the Unity environment from the Tennis build located next to this
# notebook. NOTE(review): the ".app" path looks macOS-specific; adjust the
# file name for other platforms.
env = UnityEnvironment(file_name="./Tennis.app")

In [None]:
# Look up the default brain, then reset the environment in training mode
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# One entry per agent in the reset info
num_agents = len(env_info.agents)

# Global sizes: observations/actions of all agents flattened together
actions = env_info.previous_vector_actions
states = env_info.vector_observations
global_state_space_size = len(states.ravel())
global_action_space_size = len(actions.ravel())

# Local sizes: what a single agent observes/emits (observations are stacked)
action_space_size = brain.vector_action_space_size
state_space_size = (brain.num_stacked_vector_observations
                    * brain.vector_observation_space_size)

# Report everything that was found
print(f"Number of agents: {num_agents}")
print(f"Global states: {global_state_space_size}")
print(f"Global actions: {global_action_space_size}")
print(f"Local states: {state_space_size}")
print(f"Local actions: {action_space_size}")

# Show a sample observation
print('The state for the first agent looks like:', states[0])

# Training

In [None]:
def train(
    num_episodes = 1500,
    batch_size = 1024,
    max_episode_length = 250,
    train_after_batches = 20,
    replay_buffer_size_max = int(1e6),
    train_every_steps = 100,
    noise_level = 2.0,
    noise_decay = 0.9999,
    print_episodes = 100
):
    """Run MADDPG training episodes against the Unity environment.

    Training state is kept in notebook-level globals (``agents``,
    ``replay_buffer``, ``train_step``, ``is_learning``, ``episode_returns``
    and ``loss``) so that calling ``train`` again continues the previous run.
    Reset those globals to start from scratch. Any of them that does not exist
    yet is created here with an empty/initial value.

    Also relies on the notebook globals ``env``, ``brain_name``,
    ``num_agents`` and the state/action space sizes.

    Args:
        num_episodes: Number of episodes to play in this call.
        batch_size: Number of transitions sampled per update.
        max_episode_length: Maximum number of environment steps per episode.
        train_after_batches: Learning starts once the replay buffer holds
            ``batch_size * train_after_batches`` samples. Only used when the
            replay buffer is created by this call.
        replay_buffer_size_max: Capacity of a newly created replay buffer.
        train_every_steps: Run an update every this many environment steps.
        noise_level: Initial exploration noise passed to ``agent.act``.
        noise_decay: Factor applied to ``noise_level`` on every step once
            learning has started.
        print_episodes: Print a progress line every this many episodes.

    Returns:
        Tuple ``(returns, average_loss, agents)``:
        ``returns`` is the per-episode return summed over agents for the whole
        recorded history, ``average_loss`` is the recorded loss averaged over
        agents (``None`` if no update has happened yet), and ``agents`` is the
        list of agents.
    """
    # These names are rebound below. Without the declaration Python treats
    # them as locals and the very first read raises UnboundLocalError.
    global agents, replay_buffer, train_step, is_learning

    # Make sure all shared state exists, even on a fresh kernel.
    shared = globals()
    shared.setdefault("agents", None)
    shared.setdefault("replay_buffer", None)
    shared.setdefault("train_step", 0)
    shared.setdefault("is_learning", False)
    episode_returns = shared.setdefault("episode_returns", [])
    loss = shared.setdefault("loss", [])

    print(f"------------------------------------------------")
    print(f"New training run.")
    print(f"    num_episodes: {num_episodes}")
    print(f"    batch_size: {batch_size}")
    print(f"    max_episode_length: {max_episode_length}")
    print(f"    train_after_batches: {train_after_batches}")
    print(f"    replay_buffer_size_max: {replay_buffer_size_max}")
    print(f"    train_every_steps: {train_every_steps}")
    print(f"    noise_level: {noise_level}")
    print(f"    noise_decay: {noise_decay}")

    # Create the agents (covers both "not created yet" spellings: None and [])
    if not agents:
        agents = []
        for i in range(num_agents):
            print(f"Agent {i}: state space: {state_space_size}; "
                  f"action space {action_space_size}.")
            agents.append(MaddpgAgent(
                i, num_agents, state_space_size, action_space_size,
                global_state_space_size, global_action_space_size))

    # Don't start learning until we have a certain number of batches (obviously
    # we need at least 1).
    min_samples_required = batch_size * train_after_batches

    # Create the replay buffer
    if replay_buffer is None:
        replay_buffer = ReplayBuffer(
            max_size=replay_buffer_size_max, min_samples_required=min_samples_required)

    # Iterate over episodes
    for episode in range(num_episodes):

        # Receive initial state vector s
        #   s = (s_1, . . . , s_N)
        env_info = env.reset(train_mode=True)[brain_name]
        s = env_info.vector_observations

        # Running return of this episode, one entry per agent
        episode_returns.append(np.zeros(num_agents))

        # Play at most max_episode_length steps
        for _ in range(max_episode_length):

            # For each agent i, select actions:
            #   a = (a_1, . . . , a_N)
            # using the current policy and exploration noise, which we decay
            a = [agent.act(state, noise_level=noise_level)
                 for agent, state in zip(agents, s)]
            if is_learning:
                noise_level *= noise_decay

            # Execute actions a = (a_1, . . . , a_N)
            # Observe:
            #   Reward r = (r_1, . . . , r_N)
            #   Next-state vector s' = (s'_1, . . . , s'_N)
            env_info = env.step(a)[brain_name]
            r = env_info.rewards
            s_prime = env_info.vector_observations
            dones = env_info.local_done

            # Store (s, a, r, s', done) in replay buffer
            replay_buffer.append((s, a, r, s_prime, dones))

            # Record progress
            episode_returns[-1] = episode_returns[-1] + np.asarray(r)

            # Advance
            s = s_prime
            train_step += 1

            # Periodically (after a certain number of steps) run update/training
            if (train_step % train_every_steps == 0
                    and replay_buffer.has_enough_samples()):

                if not is_learning:
                    print(f"Started learning at time {train_step}")
                    is_learning = True

                # Sample replay buffer
                sample = replay_buffer.sample(batch_size=batch_size)

                # For every sample tuple, each agent needs to know which action
                # would be chosen under the policy of the other agents in the
                # next state s', in order to calculate q-values.
                next_actions = [
                    [agent.act(next_state, target_actor=True)
                     for agent, next_state in zip(agents, sampled_next_states)]
                    for (_, _, _, sampled_next_states, _) in sample]

                # Update/train all the agents
                per_agent_loss = []
                for agent in agents:
                    actor_loss, critic_loss = agent.update(sample, next_actions)
                    per_agent_loss.append((actor_loss, critic_loss))
                loss.append(per_agent_loss)

            # Terminate episode early if done
            if any(dones):
                break

        if episode % print_episodes == 0:
            print(f"t: {train_step}, e: {episode}, noise: {noise_level:.2f}. " + \
                  f"Average last {print_episodes} episode return: " + \
                  f"{np.array(episode_returns[-print_episodes:]).mean(axis=0)}")

    print(f"Final average reward over entire run: {np.mean(episode_returns)}")

    # Average the (actor, critic) losses over the agent axis
    if len(loss) > 0:
        average_loss = np.array(loss).mean(axis=1)
    else:
        average_loss = None

    # reshape keeps the agent axis even when no episode has been recorded
    total_returns = np.array(episode_returns).reshape(-1, num_agents).sum(axis=1)
    return total_returns, average_loss, agents

In [None]:
# Training state shared with train() across calls. Re-run this cell to start
# a fresh training run.
episode_returns = []   # one per-agent return array per episode (train() appends)
loss = []              # one list of per-agent (actor, critic) losses per update
agents = None          # train() creates the agents when none exist yet
replay_buffer = None   # train() creates the buffer when it is None
train_step = 0         # total number of environment steps taken so far
is_learning = False    # set by train() once updates have started

In [None]:
# Main training run. Use the values returned by train() instead of rebuilding
# them from the raw globals: train() already handles the case where no update
# has happened yet (average_loss is then None).
average_return, average_loss, agents = train(
    num_episodes=30000,
    max_episode_length = 2000,
    batch_size=512,
    train_every_steps=4,
    train_after_batches=40,
    noise_level = 1.0,
    noise_decay = 0.9999,
    print_episodes = 50
)

In [None]:
# Per-episode return (raw and 100-episode rolling mean) from the main run
results_curves = hv.Curve(average_return) * rolling(hv.Curve(average_return), rolling_window=100)
layout = results_curves

# Loss curves only exist once at least one update has been performed;
# column 0 is the actor loss, column 1 the critic loss.
if average_loss is not None and len(average_loss) > 0:
    loss_curves = (hv.Curve(average_loss[:, 0]).relabel('actor_loss')
                   * hv.Curve(average_loss[:, 1]).relabel('critic_loss'))
    layout = results_curves + loss_curves

layout.opts(opts.Curve(axiswise=True))

In [None]:
# Watch the agents play 14 episodes (i = 1..14) with rendering enabled
for i in range(1, 15):
    # Fresh episode: reset the environment and the per-agent bookkeeping
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    t = 0
    dones = [False]

    # Step until any agent reports that the episode has finished
    while not np.any(dones):
        t += 1

        # Each agent picks its action from its own observation; the result
        # is clipped to [-1, 1] before being sent to the environment
        actions = np.clip(
            [agent.act(state, noise_level=1) for agent, state in zip(agents, states)],
            -1, 1)
        env_info = env.step(actions)[brain_name]

        # Collect the outcome of the step and accumulate the scores
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards

        # Roll the observations over to the next time step
        states = next_states

    print(f'Episode: {i}; length: {t}, max score: {np.max(scores)}')

In [None]:
# Perform a grid search in hyperparam space.
#
# NOTE: every configuration is trained from scratch, so this cell overwrites
# the notebook-level training state (agents, replay_buffer, loss, ...).

num_episodes = 1500
batch_sizes = [32, 64, 256, 1024]
train_everys = [1, 10, 50, 100]

results_1 = []
for batch_size in batch_sizes:

    results_2 = []
    for train_every in train_everys:

        # Reset the state shared with train(); otherwise each run would keep
        # training the previous run's agents and the recorded returns would
        # accumulate across configurations.
        episode_returns = []
        loss = []
        agents = None
        replay_buffer = None
        train_step = 0
        is_learning = False

        label = f"Batch size: {batch_size}; Train every: {train_every}"
        print(label)
        results = train(
            num_episodes=num_episodes,
            batch_size=batch_size,
            train_every_steps=train_every)

        # train() returns (returns, average_loss, agents); only the
        # per-episode returns are needed for the grid.
        results_2.append(results[0])

    results_1.append(results_2)

    # Checkpoint after every batch size; `with` closes the file even if
    # pickling fails.
    with open("results_1.pickle", "wb") as pickle_out:
        pickle.dump(results_1, pickle_out)

In [None]:
# Display the grid search output graphically.
# The episode axis needs exactly num_episodes labels (1..num_episodes); a
# coordinate of any other length does not match the data and is rejected.
xr_results = xr.DataArray(results_1,
             coords={'batch_size':batch_sizes, 'train_every':train_everys, 'episode_index':range(1, num_episodes + 1)},
             dims=['batch_size', 'train_every', 'episode_index'])
hv_results = hv.Dataset(xr_results, ['batch_size', 'train_every', 'episode_index'], 'reward')

# One curve per (batch_size, train_every) cell: raw returns overlaid with a
# 100-episode rolling mean
raw_grid = hv_results.to(hv.Curve, ['episode_index']).grid()
smooth_grid = rolling(hv_results.to(hv.Curve, ['episode_index']), rolling_window=100).grid()
raw_grid * smooth_grid

In [None]:
# Close the Unity environment now that training and evaluation are finished.
env.close()