# Imports

In [None]:
#%load_ext autoreload
#%autoreload 2

In [None]:
import pdb, pickle, torch, random

import numpy as np
import holoviews as hv
import pandas as pd
import xarray as xr

from holoviews import opts
from holoviews.streams import Pipe, Buffer
from holoviews.operation.timeseries import rolling

from replay_buffer import ReplayBuffer
from maddpg_agent import MaddpgAgent

hv.extension('bokeh')

In [None]:
# Launch the Unity environment from the local "./Tennis.app" build.
# `env` is the handle the cells below use to reset and step the simulation.
from unityagents import UnityEnvironment
env = UnityEnvironment(file_name="./Tennis.app")

# Load environment

In [None]:
# Pick the environment's first brain and start a fresh episode in training mode
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# One entry in `env_info.agents` per agent
num_agents = len(env_info.agents)
print(f"Number of agents: {num_agents}")

# Global sizes: every agent's observations / actions flattened into one vector
actions = env_info.previous_vector_actions
states = env_info.vector_observations
global_state_space_size = len(states.flatten())
global_action_space_size = len(actions.flatten())
print(f"Global states: {global_state_space_size}")
print(f"Global actions: {global_action_space_size}")

# Local sizes: what a single agent observes (stacked observations) and emits
state_space_size = brain.vector_observation_space_size * brain.num_stacked_vector_observations
action_space_size = brain.vector_action_space_size
print(f"Local states: {state_space_size}")
print(f"Local actions: {action_space_size}")

# Show a sample observation vector
print('The state for the first agent looks like:', states[0])

# Define trainer class

In [None]:
class Trainer():
    """Drive multi-agent training against the Unity environment.

    The class relies on notebook-level globals defined in earlier cells:
    ``env``, ``brain_name``, ``num_agents``, ``state_space_size``,
    ``action_space_size``, ``global_state_space_size`` and
    ``global_action_space_size``.

    Progress (episode counter, step counter, returns, losses) is kept on the
    instance, so ``train`` can be called repeatedly to resume a run.
    """

    def __init__(self,
                 train_after_samples=20,
                 replay_buffer_size_max=int(1e6)):
        """Create one agent per environment agent and a shared replay buffer.

        Args:
            train_after_samples: passed to the replay buffer as
                ``min_samples_required``.
            replay_buffer_size_max: passed to the replay buffer as
                ``max_size``.
        """
        self.train_after_samples = train_after_samples
        self.replay_buffer_size_max = replay_buffer_size_max

        # Create the agents
        self.agents = []
        for i in range(num_agents):
            # Single-line f-string: a backslash continuation inside the
            # literal would embed the source indentation in the output.
            print(f"Agent {i}: state space: {state_space_size}; "
                  f"action space {action_space_size}.")
            self.agents.append(MaddpgAgent(
                i, num_agents, state_space_size, action_space_size,
                global_state_space_size, global_action_space_size))

        # Create the replay buffer
        self.replay_buffer = ReplayBuffer(
            max_size=replay_buffer_size_max,
            min_samples_required=train_after_samples)

        # Track progress
        self.episode_returns = []   # one per-agent return array per episode
        self.loss = []              # one [(actor_loss, critic_loss), ...] list per update

        # Training vars
        self.train_step = 0
        self.episode = 0
        self.is_learning = False

    def train(self,
              num_episodes=1500,
              batch_size=1024,
              max_episode_length=250,
              train_every_steps=100,
              noise_level=2.0,
              noise_decay=0.9999,
              print_episodes=100):
        """Run ``num_episodes`` further episodes, updating the agents as we go.

        Args:
            num_episodes: number of episodes to add to the current episode count.
            batch_size: number of transitions requested from the replay buffer
                for each update.
            max_episode_length: maximum number of environment steps per episode.
            train_every_steps: an update is attempted whenever the global step
                counter is a multiple of this value.
            noise_level: exploration noise passed to ``agent.act``.
            noise_decay: factor applied to ``noise_level`` after every step
                once learning has started.
            print_episodes: progress is printed every this many episodes,
                averaged over the same number of most recent episodes.

        A ``KeyboardInterrupt`` stops training and prints "Interrupted"
        instead of propagating.
        """
        try:

            print("------------------------------------------------")
            print("New training run.")
            print(f"    num_episodes: {num_episodes}")
            print(f"    batch_size: {batch_size}")
            print(f"    max_episode_length: {max_episode_length}")
            print(f"    train_after_samples: {self.train_after_samples}")
            print(f"    replay_buffer_size_max: {self.replay_buffer_size_max}")
            print(f"    train_every_steps: {train_every_steps}")
            print(f"    noise_level: {noise_level}")
            print(f"    noise_decay: {noise_decay}")

            # Iterate over episodes
            episode_max = self.episode + num_episodes
            while self.episode < episode_max:

                # Receive initial state vector s = (s_1, ..., s_N)
                env_info = env.reset(train_mode=True)[brain_name]
                s = env_info.vector_observations

                self.episode_returns.append(np.zeros(num_agents))
                # Exactly max_episode_length steps at most (the previous
                # range(1, max_episode_length) ran one step short).
                for _ in range(max_episode_length):

                    # For each agent i, select actions a = (a_1, ..., a_N)
                    # using the current policy and exploration noise
                    a = [agent.act(state, noise_level=noise_level)
                         for agent, state in zip(self.agents, s)]
                    # Noise only decays once updates have begun
                    if self.is_learning:
                        noise_level *= noise_decay

                    # Execute actions and observe rewards r = (r_1, ..., r_N)
                    # and next-state vector s' = (s'_1, ..., s'_N)
                    env_info = env.step(a)[brain_name]
                    r = env_info.rewards
                    s_prime = env_info.vector_observations
                    dones = env_info.local_done

                    # Store (s, a, r, s', done) in replay buffer
                    self.replay_buffer.append((s, a, r, s_prime, dones))

                    # Record progress
                    self.episode_returns[-1] = self.episode_returns[-1] + r

                    # Advance
                    s = s_prime
                    self.train_step += 1

                    # Periodically (after a certain number of steps) run update/training
                    if (self.train_step % train_every_steps == 0
                            and self.replay_buffer.has_enough_samples()):
                        self._update_agents(batch_size)

                    # Terminate episode early if done
                    if any(dones):
                        break

                self.episode += 1
                if self.episode % print_episodes == 0:
                    recent = np.array(self.episode_returns[-print_episodes:])
                    print(f"t: {self.train_step}, e: {self.episode}, noise: {noise_level:.2f}. "
                          f"Average last {print_episodes} episode return: "
                          f"{recent.mean(axis=0)}")

            print("Finished")

        except KeyboardInterrupt:
            print("Interrupted")

    def _update_agents(self, batch_size):
        """Sample the replay buffer once and run one update on every agent."""
        if not self.is_learning:
            print(f"Started learning at time {self.train_step}")
            self.is_learning = True

        # Sample replay buffer
        sample = self.replay_buffer.sample(batch_size=batch_size)

        # For every sampled tuple, each agent needs to know which action the
        # other agents' target policies would choose in the next state s',
        # in order to calculate q-values.
        next_actions = [
            [agent.act(next_state, target_actor=True)
             for agent, next_state in zip(self.agents, next_states)]
            for (_, _, _, next_states, _) in sample]

        # Update/train all the agents
        per_agent_loss = []
        for agent in self.agents:
            actor_loss, critic_loss = agent.update(sample, next_actions)
            per_agent_loss.append((actor_loss, critic_loss))
        self.loss.append(per_agent_loss)

    def get_average_loss(self):
        """Return the per-update (actor, critic) loss averaged over agents.

        Returns:
            An array with one ``[actor_loss, critic_loss]`` row per recorded
            update. When nothing has been recorded yet, a single all-zero row
            is returned so callers can still index columns with ``[:, 0]``.
        """
        if len(self.loss) > 0:
            return np.array(self.loss).mean(axis=1)
        return np.zeros((1, 2))

    def get_max_returns(self):
        """Return, for every episode, the largest return among the agents.

        Returns:
            A 1-D array with one value per recorded episode (empty when no
            episode has been recorded).
        """
        if len(self.episode_returns) > 0:
            return np.array(self.episode_returns).max(axis=1)
        return np.array([])

# Train

### Create a new trainer

In [None]:
# Seed every random number generator in use (Python, PyTorch, NumPy)
# before the agents and replay buffer are built.
seed = 0
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(seed)

# Fresh trainer: this value is forwarded to the replay buffer as its
# minimum sample count.
trainer = Trainer(train_after_samples=40000)

### Start/resume training session

In [None]:
# Run (or continue) training; arguments are listed in the same order as
# the parameters of Trainer.train.
trainer.train(
    num_episodes=15000,
    batch_size=512,
    max_episode_length=250,
    train_every_steps=4,
    noise_level=1.0,
    noise_decay=0.9999,
    print_episodes=50,
)

# Analyse results

In [None]:
# Display returns: raw per-episode maximum plus a 100-episode rolling average
max_returns = trainer.get_max_returns()
raw_returns = hv.Curve(max_returns, 'Episode', 'Return').relabel('Single episode')
smooth_returns = rolling(hv.Curve(
    max_returns, 'Episode', 'Return'), rolling_window=100).relabel('100 episode average')
max_returns_curve = (raw_returns * smooth_returns).relabel('Max episode return')

# Display loss
# Coerce to an ndarray first: before any update has run, get_average_loss()
# can hand back a plain nested list, which does not support [:, k] indexing.
average_loss = np.asarray(trainer.get_average_loss())
actor_loss = hv.Curve(average_loss[:, 0], 'Training iteration', 'Loss').relabel('Actor')
critic_loss = hv.Curve(average_loss[:, 1], 'Training iteration', 'Loss').relabel('Critic')
loss_curves = (actor_loss * critic_loss).relabel('Actor/critic loss')

# Lay the two overlays out side by side, each with its own axes
(max_returns_curve + loss_curves).opts(opts.Curve(axiswise=True))

# Save/restore training state

### Pausing/resuming training progress

This is especially useful because the Unity environment handle becomes corrupted if you interrupt the kernel while training. Simply save the trainer, restart the kernel and the Unity environment, then load your saved progress to resume.

In [None]:
import os

# Make sure the output directory exists so the writes below cannot fail
# on a fresh checkout.
os.makedirs("saved_models", exist_ok=True)

# Save trainer to disk (context manager guarantees the file is flushed and closed)
with open("saved_models/trainer.pickle", "wb") as trainer_file:
    pickle.dump(trainer, trainer_file)

# Save torch objects to file, one file per optimiser/network per agent.
# Whole objects are stored (not state dicts), matching what the load cell expects.
for i, agent in enumerate(trainer.agents):
    torch.save(agent.actor_optimiser,   f"saved_models/agent_{i}_actor_optimiser.pt")
    torch.save(agent.critic_optimiser,  f"saved_models/agent_{i}_critic_optimiser.pt")
    torch.save(agent.actor,         f"saved_models/agent_{i}_actor_model.pt")
    torch.save(agent.actor_target,  f"saved_models/agent_{i}_actor_target_model.pt")
    torch.save(agent.critic,        f"saved_models/agent_{i}_critic_model.pt")
    torch.save(agent.critic_target, f"saved_models/agent_{i}_critic_target_model.pt")

In [None]:
# Load trainer from disk (context manager closes the file handle afterwards).
# NOTE: unpickling executes arbitrary code - only load files you created yourself.
with open("saved_models/trainer.pickle", "rb") as trainer_file:
    trainer = pickle.load(trainer_file)

# Load torch objects from file (NOT safe across refactors: whole pickled
# objects are restored, so the defining classes must still be importable
# under the same names).
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects whole pickled objects; on such versions
# pass weights_only=False here - confirm against the installed version.
for i, agent in enumerate(trainer.agents):
    agent.actor_optimiser  = torch.load(f"saved_models/agent_{i}_actor_optimiser.pt")
    agent.critic_optimiser = torch.load(f"saved_models/agent_{i}_critic_optimiser.pt")
    agent.actor         = torch.load(f"saved_models/agent_{i}_actor_model.pt")
    agent.actor_target  = torch.load(f"saved_models/agent_{i}_actor_target_model.pt")
    agent.critic        = torch.load(f"saved_models/agent_{i}_critic_model.pt")
    agent.critic_target = torch.load(f"saved_models/agent_{i}_critic_target_model.pt")

# Watch agent play

To view random play according to the OU noise process, set the noise level to 1. This is what we use to generate exploratory behaviour initially.

In [None]:
for i in range(1, 6):                                      # play game for 5 episodes (numbered 1..5)
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    t = 0                                                  # number of steps taken this episode
    while True:
        t += 1

        # Each agent acts on its own observation, with some exploration noise
        actions = [agent.act(state, noise_level=0.5) for agent, state in zip(trainer.agents, states)]

        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += rewards                                  # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print(f'Episode: {i}; length: {t}, max score: {np.max(scores)}')