Simple Tag
https://www.pettingzoo.ml/mpe/simple_tag

> This is a predator-prey environment. Good agents (green) are faster and receive a negative reward for being hit by adversaries (red) (-10 for each collision). Adversaries are slower and are rewarded for hitting good agents (+10 for each collision). Obstacles (large black circles) block the way. By default, there is 1 good agent, 3 adversaries and 2 obstacles.

Baseline agent algorithm with experience replay buffer

In [1]:
import os
import sys
import time
import enum
import math
import random
import collections
import statistics
import json

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F

import imageio

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

class TimeDelta(object):
    def __init__(self, delta_time):
        """Convert time difference in seconds to days, hours, minutes, seconds.
        
        Parameters
        ==========
        delta_time : float
            Time difference in seconds.
        """
        self.fractional, seconds = math.modf(delta_time)
        seconds = int(seconds)
        minutes, self.seconds = divmod(seconds, 60)
        hours, self.minutes = divmod(minutes, 60)
        self.days, self.hours = divmod(hours, 24)
    
    def __repr__(self):
        return f"{self.days}-{self.hours:02}:{self.minutes:02}:{self.seconds + self.fractional:02}"

from pettingzoo.mpe import simple_tag_v2
from pettingzoo.utils import random_demo

# torch.autograd.set_detect_anomaly(True)

Arguments in instantiate environment.

- num_good: number of good agents
- num_adversaries: number of adversaries
- num_obstacles: number of obstacles
- max_cycles: number of frames (a step for each agent) until game terminates
- continuous_actions: Whether agent action spaces are discrete(default) or continuous

In [20]:
# env.close()

In [12]:
num_good = 1
num_adversaries = 3
num_obstacles = 0

env = simple_tag_v2.env(
    num_good=num_good,
    num_adversaries=num_adversaries,
    num_obstacles=num_obstacles,
    max_cycles=30,
    continuous_actions=False
).unwrapped

### What are the environment parameters?

Adversaries (red) capture non-adversary (green). The map is a 2D grid and everything is initialized in the region [-1, +1]. There doesn't seem to be position clipping for out of bounds, but non-adversary agent are penalized for out of bounds.
Agent's observation is a ndarray vector of concatenated data in the following order:

1. current velocity (2,)
2. current position (2,)
3. relative position (2,) of each landmark
4. relative position (2,) of each other agent
5. velocity (2,) of each other non-adversary agent

When there are 3 adverseries and 3 non-adversaries, then advarsary observation space is 24 dimensional and non-advarsary observation space is 22 dimensional.

The environment is sequential. Agents move one at a time. Agents are either `adversary_*` for adversary or `agent_*` for non-adversary.

Actions:

- 0 is NOP
- 1 is go left
- 2 is go right
- 3 is go down
- 4 is go up

### How to train the agents?

When loss is increasing for Double DQN, then increase the interval for updating target network.
<https://stackoverflow.com/questions/56964657/cartpole-v0-loss-increasing-using-dqn>

In [13]:
env.reset()

def get_agent_counts():
    all_agents = 0
    adversaries = 0
    for agent in env.world.agents:
        all_agents += 1
        adversaries += 1 if agent.adversary else 0
    good_agents = all_agents - adversaries
    return (adversaries, good_agents)

def get_landmark_count():
    return len(env.world.landmarks)

def process_config(config):
    for k, v in config.common.items():
        config.adversary[k] = v
        config.agent[k] = v


n_adversaries, n_good_agents = get_agent_counts()
n_landmarks = get_landmark_count()
config = AttrDict(
    n_adversaries=n_adversaries,
    n_good_agents=n_good_agents,
    n_landmarks=n_landmarks,
    discount = 0.99,
    eps_start = 0.9,
    eps_end = 0.07,
    eps_decay = 0.9996,
    n_episodes=5_000,
    batch_size=1,
    update_target_interval=32,
    report_interval=32,
    evaluation_interval=500,
    clip_grad_norm=5.0,
    lr=0.05,
    reward_scale=1,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    common=AttrDict(
        hidden_size=32,
        n_actions=env.action_space(env.agent_selection).n,
    ),
    adversary=AttrDict(
        n_agents=n_adversaries,
        observation_shape=env.observation_space("adversary_0").shape

    ),
    agent=AttrDict(
        n_agents=n_good_agents,
        observation_shape=env.observation_space("agent_0").shape
    )
)
config.exp_name = (f"baseline_nadversaries{config.n_adversaries}"
                   f"_ngoodagents{config.n_good_agents}"
                   f"_landmarks{config.n_landmarks}")
process_config(config)
config

{'n_adversaries': 3,
 'n_good_agents': 1,
 'n_landmarks': 0,
 'discount': 0.99,
 'eps_start': 0.9,
 'eps_end': 0.07,
 'eps_decay': 0.9996,
 'n_episodes': 5000,
 'batch_size': 1,
 'update_target_interval': 32,
 'report_interval': 32,
 'evaluation_interval': 500,
 'clip_grad_norm': 5.0,
 'lr': 0.05,
 'reward_scale': 1,
 'device': device(type='cuda'),
 'common': {'hidden_size': 32, 'n_actions': 5},
 'adversary': {'n_agents': 3,
  'observation_shape': (12,),
  'hidden_size': 32,
  'n_actions': 5},
 'agent': {'n_agents': 1,
  'observation_shape': (10,),
  'hidden_size': 32,
  'n_actions': 5},
 'exp_name': 'baseline_nadversaries3_ngoodagents1_landmarks0'}

In [14]:
class Normalizer(object):
    def __init__(self, env):
        self.n_landmarks = len(env.world.landmarks)
        self.n_allagents = len(env.world.agents)
        self.n_good = sum(map(lambda a: not a.adversary, env.world.agents))
    
    @staticmethod
    def normalize_abs_pos(s):
        """Clip absolute position and scale to [-1, 1]
        s is a scalar or an ndarray of one dimension."""
        return np.clip(s, -1.5, 1.5) / 1.5

    @staticmethod
    def normalize_rel_pos(s):
        """Clip relative position and scale to [-1, 1]
        s is a scalar or an ndarray of one dimension."""
        return np.clip(s, -3, 3) / 3

    def __call__(self, obs):
        # normalize and clip positions
        norm_obs = obs.copy()
        # normalize velocity of current entity
        norm_obs[:2] = norm_obs[:2] / 1.3
        # clip/scale abs. position of current entity
        norm_obs[2:4] = self.normalize_abs_pos(norm_obs[2:4])
        # clip/scale rel. position of other entities
        n_range = self.n_landmarks + self.n_allagents - 1
        for i in range(n_range):
            norm_obs[4 + (2*i):4 + (2*(i + 1))] = self.normalize_rel_pos(
                norm_obs[4 + (2*i):4 + (2*(i + 1))]
            )
        # normalize velocity of other entities
        norm_obs[4 + (2*n_range):] = norm_obs[4 + (2*n_range):] / 1.3
        return norm_obs
    
class RewardsShaper(object):
    def __init__(self, env):
        self.n_landmarks = len(env.world.landmarks)
        # self.n_allagents = len(env.world.agents)
        self.name_to_idx = {agent.name: i for i, agent in enumerate(env.world.agents)}
        self.idx_to_name = {i: agent.name for i, agent in enumerate(env.world.agents)}
        self.goodagent_indices = [
            i for i, agent in enumerate(env.world.agents) if agent.name.startswith("agent")
        ]
        self.adversary_indices = [
            i for i, agent in enumerate(env.world.agents) if agent.name.startswith("adversary")
        ]
        # rdist - distance between adversary-good agent to start computing rewards.
        self.rdist = 1
        # collision_dist - distance between adversary-good agent to count collision.
        #    Based on PettingZoo numbers. 
        self.collision_dist = 0.075 + 0.05

    @staticmethod
    def bound(x):
        if x < 0.9:
            return 0
        if x < 1.0:
            return (x - 0.9) * 10
        return min(np.exp(2 * x - 2), 10)
        
    def __call__(self, agent_name, obs):
        """Compute reshaped rewards from observation for agent given agent name.
        Adversary: start gaining small rewards as it nears good agents.
        
        Good agent: starts gaining small penality as it nears bad agents.
        """
        _obs = obs[4 + (2*self.n_landmarks):]
        agent_idx = self.name_to_idx[agent_name]
        cum_r = 0.
        if agent_name.startswith("agent"):
            # penalty across all adversaries
            for adversary_idx in self.adversary_indices:
                # penalty from distance of adversary; penalty of collision
                other_idx = adversary_idx - 1 if agent_idx < adversary_idx else adversary_idx
                x, y = _obs[2*other_idx:(2*other_idx) + 2]
                d    = math.sqrt(x**2 + y**2)
                cum_r -= 1 - (1/self.rdist)*d
                
            # penalty from boudary based on PettingZoo
            # pos = obs[2:4]
            # cum_r -= self.bound(abs(pos[0]))
            # cum_r -= self.bound(abs(pos[1]))
        
        elif agent_name.startswith("adversary"):
            # reward across all agents
            for goodagent_idx in self.goodagent_indices:
                # reward from distance to agent; reward of collision
                other_idx = goodagent_idx - 1 if agent_idx < goodagent_idx else goodagent_idx
                x, y = _obs[2*other_idx:(2*other_idx) + 2]
                d    = math.sqrt(x**2 + y**2)
                cum_r += 1 - (1/self.rdist)*d
        
        return cum_r

normalize = Normalizer(env) # norm_obs = normalize(obs)
shapereward = RewardsShaper(env) # reward = shapereward(agent_name, obs)
criterion = torch.nn.MSELoss()
# criterion = torch.nn.SmoothL1Loss()

In [15]:
class SimpleTagNet(torch.nn.Module):
    """NN Model for the agents. Both good agents and adversaries use this model."""
        
    def __init__(self, config, agent_type):
        super().__init__()
        # self.config = config
        self.device      = config.device
        self.observation_size = math.prod(config[agent_type].observation_shape)
        self.n_actions   = config[agent_type].n_actions
        self.hidden_size = config[agent_type].hidden_size
        self.output_mlp = torch.nn.Sequential(
            torch.nn.Linear(self.observation_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.n_actions)
        )
    
    def forward(self, observation):
        """Apply DQN to episode step.
        
        Parameters
        ==========
        observation : ndarray
            The observation vector obtained from the environment.
        
        Returns
        =======
        torch.Tensor
            Vector of Q-value associated with each action.
        """
        observation = normalize(observation)
        observation = torch.tensor(observation, dtype=torch.float, device=self.device)
        Q = self.output_mlp(observation)
        return Q

def choose_action(config, agent_type, Q, epsilon=0.05, is_val=False):
    if not is_val and random.random() < epsilon:
        return random.randrange(config[agent_type].n_actions)
    else:
        return torch.argmax(Q).item()

In [16]:
def pad_amt(w,  macro_block_size=16):
    amt = w % macro_block_size
    if amt > 0:
        return macro_block_size - amt
    else:
        return 0
    amt =  macro_block_size - (w % macro_block_size)

def pad_image(img, macro_block_size=16):
    """Pad a image of shape (W, H, C)"""
    _pad_amt = lambda w: pad_amt(w)
    return np.pad(img, [(0, _pad_amt(img.shape[0])), (0, _pad_amt(img.shape[1])), (0, 0)])

def run_episode(
    config, adversary_net, agent_net, epsilon=0.05,
    should_render=False, is_val=False, save_video=False, save_video_path=None
):
    """Run one episodes.
    
    inputs consist of observation, message (backprop), hidden (backprop) indexed by agent
    outputs consist of action, q-value of action (backprop), reward, done indexed by (step, agent)
    
    Returns
    =======
    AttrDict
        Contains episode metrics:
        - steps : number of steps. All agents take an action at each step.
        - reward : episodic rewards indexed by ('adversary', 'agent').
        - step_records : list of quantities produced indiced by step, ('adversary', 'agent'), agent index.
          Each step record has:
            + observation
            + reward
            + done
        - loss : contains episodic losses indexed by ('adversary', 'agent'). To be updated by train_agents()
    """
    rendered_video = []
    episode = AttrDict(
        steps=0,
        reward=AttrDict(adversary=0, agent=0),
        step_records=[],
        loss=AttrDict(adversary=0, agent=0)
    )
    n_agents = config.adversary.n_agents + config.agent.n_agents
    step_record = None
    
    env.reset()
    for agent_step_idx, agent_name in enumerate(env.agent_iter()):
        if agent_step_idx % n_agents == 0:
            episode.steps += 1
            step_record = AttrDict(adversary={}, agent={})
            episode.step_records.append(step_record)
            
        obs_curr, reward, done, _ = env.last()
        if not is_val:
            reward += shapereward(agent_name, obs_curr)
        if should_render:
            env.render()
            time.sleep(0.05)
        if save_video:
            rendered_image = env.render(mode='rgb_array')
            rendered_video.append(pad_image(rendered_image))
        if agent_name == "adversary_0":
            # print("rew, shaped rew", round(_reward, 2), round(reward, 2))
            # print("obs, normed obs", np.round(obs_curr, 2), np.round(normalize(obs_curr), 2))
            # print("obs, normed obs", np.round(obs_curr[4:6], 2), np.round(normalize(obs_curr[4:6]), 2))
            # print("obs, rew", np.round(normalize(obs_curr[4:6]), 2), reward)
            pass
        agent_type, agent_idx = agent_name.split("_")
        agent_idx = int(agent_idx)
        if done:
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=None,
                Q=None,
                reward=reward,
                done=done,
            )
            env.step(None)
            continue
        if agent_type == "agent":
            Q_curr = agent_net(obs_curr)
            action = choose_action(config, agent_type, Q_curr, epsilon, is_val=is_val)
            env.step(action)
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=action,
                reward=reward,
                done=done,
            )
        else:
            # agent_type == "adversary"
            Q_curr = adversary_net(obs_curr)
            action = choose_action(config, agent_type, Q_curr, epsilon, is_val=is_val)
            env.step(action)
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=action,
                reward=reward,
                done=done,
            )
        episode.reward[agent_type] += reward
    
    if should_render:
        env.close()
    if save_video:
        imageio.mimwrite(save_video_path, rendered_video, fps=30)
    return episode

def train_agents(config, batch, adversary_net, agent_net,
                 adversary_target_net, agent_target_net,
                 adversary_optimizer, agent_optimizer):
    """Compute loss of episode and update agent weights."""
    adversary_optimizer.zero_grad()
    agent_optimizer.zero_grad()
    device = config.device
    discount = torch.tensor(config.discount, dtype=torch.float, device=device)
    adversary_losses = []
    agent_losses = []
    for episode in batch:
        for step_idx in range(episode.steps):
            # Optimize adversary network
            for agent_idx in episode.step_records[step_idx].adversary.keys():
                curr_record = episode.step_records[step_idx].adversary[agent_idx]
                if curr_record.done:
                    # agent is done at this step
                    continue
                next_record = episode.step_records[step_idx + 1].adversary[agent_idx]
                r = torch.tensor(next_record.reward, dtype=torch.float, device=device)
                y = None
                if next_record.done:
                    # agent terminates at next step
                    y = r
                else:
                    next_o = next_record.observation
                    target_Q = adversary_target_net(next_o)
                    max_target_Q = torch.max(target_Q)
                    y = r + discount*max_target_Q
                curr_o = curr_record.observation
                u = curr_record.action
                Q = adversary_net(curr_o)
                Q_u = Q[u]
                adversary_losses.append(criterion(y, Q_u))
        
            # Optimize agent network
            for agent_idx in episode.step_records[step_idx].agent.keys():
                curr_record = episode.step_records[step_idx].agent[agent_idx]
                if curr_record.done:
                    # agent is done at this step
                    continue
                
                next_record = episode.step_records[step_idx + 1].agent[agent_idx]
                r = torch.tensor(next_record.reward, dtype=torch.float, device=device)
                y = None
                if next_record.done:
                    # agent terminates at next step
                    y = r
                else:
                    next_o = next_record.observation
                    target_Q = agent_target_net(next_o)
                    max_target_Q = torch.max(target_Q)
                    y = r + discount*max_target_Q
                curr_o = curr_record.observation
                u = curr_record.action
                Q = agent_net(curr_o)
                Q_u = Q[u]
                agent_losses.append(criterion(y, Q_u))
    
    show_norms = False
    adversary_loss = torch.mean(torch.stack(adversary_losses))
    adversary_loss.backward()
    
    if show_norms:
        norms = [p.grad.detach().data.norm().item() for p in adversary_net.parameters()]
        print("norm of gradiants", *np.round(norms, 2))

    torch.nn.utils.clip_grad_norm_(adversary_net.parameters(), config.clip_grad_norm)
    adversary_optimizer.step()
    episode.loss.adversary += adversary_loss.item()
    
    agent_loss = torch.mean(torch.stack(agent_losses))
    agent_loss.backward()
    if show_norms:
        norms = [p.grad.detach().data.norm().item() for p in agent_net.parameters()]
        print("norm of gradiants", *np.round(norms, 2))

    torch.nn.utils.clip_grad_norm_(agent_net.parameters(), config.clip_grad_norm)
    agent_optimizer.step()
    episode.loss.agent += agent_loss.item()


def evaluate_agents(config, savedir, episode_idx, best_evaluation_score, adversary_net):
    videodir = os.path.join(savedir, "videos")
    torch.save(
        adversary_net.state_dict(),
        os.path.join(savedir, f"adversary-net-{episode_idx}.pth")
    )
    adversary_net.eval()
    with torch.no_grad():
        episodic_rewards = []
        for e in range(100):
            save_video = e % 10 == 0
            validation_save_path = None
            if save_video:
                validation_save_dir = os.path.join(videodir, f"epoch{episode_idx}")
                os.makedirs(validation_save_dir, exist_ok=True)
                validation_save_path = os.path.join(validation_save_dir, f"eval{e}.mp4")
            episode = run_episode(
                config, adversary_net,
                should_render=save_video, save_video=save_video,
                save_video_path=validation_save_path, is_val=True
            )
            episodic_rewards.append(episode.reward.adversary)
    avg_episodic_rewards = statistics.fmean(episodic_rewards)
    print(f"Average evaluation reward at episode {episode_idx} is: {avg_episodic_rewards}")
    if avg_episodic_rewards > best_evaluation_score:
        best_reward_evaluation = avg_episodic_rewards
        torch.save(
            adversary_net.state_dict(),
            os.path.join(savedir, f"adversary-net-best.pth")
        )
    adversary_net.train()
    return best_evaluation_score

def train(config):
    """
    - Use parameter sharing between agents of the same class.
    - Good agents use one RL model, adversaries use another RL model.
      Train the agents side by side.
    - Separate, disjoint communication channels for two classes of agents,
      maintained by a container to store the messages.
    """
    eps_start = 0.9
    eps_end = 0.05
    eps_decay = 0.9996
    epsilon = config.eps_start
    best_reward_evaluation = 0
    
    datestamp = time.strftime('%d_%b_%Y_%H_%M_%S', time.localtime())
    savedir = os.path.join("models", config.exp_name, datestamp)
    os.makedirs(savedir, exist_ok=True)
    
    print("Training the agents...")
    t0 = time.time()
    device = config.device
    adversary_net = SimpleTagNet(config, "adversary").to(device)
    agent_net = SimpleTagNet(config, "agent").to(device)
    adversary_target_net = SimpleTagNet(config, "adversary").to(device)
    agent_target_net = SimpleTagNet(config, "agent").to(device)
    adversary_target_net.eval()
    agent_target_net.eval()
    print("Created the agent nets.")
    adversary_optimizer = torch.optim.SGD(adversary_net.parameters(), lr=config.lr)
    agent_optimizer = torch.optim.RMSprop(agent_net.parameters())
    logger = AttrDict(
        episodic_losses=AttrDict(adversary=[], agent=[]),
        episodic_rewards=AttrDict(adversary=[], agent=[]),
        best_evaluation_score=float("-inf")
    )
    def update_targets():
        adversary_target_net.load_state_dict(adversary_net.state_dict())
    print("Initial update of target nets")
    update_targets()
    
    batch = []
    print("Beginning the episodes...")
    for episode_idx in range(config.n_episodes):
        # Run an episode
        episode = run_episode(config, adversary_net, agent_net, epsilon=epsilon,
                              should_render=episode_idx % config.report_interval == 0 and episode_idx > 0)
        batch.append(episode)
        
        # update epsilon at the end of each episode
        epsilon = max(epsilon*config.eps_decay, config.eps_end)
        
        # Train on the episode
        if episode_idx % config.batch_size == 0 and episode_idx > 0:
            train_agents(
                config, batch, adversary_net, agent_net,
                adversary_target_net, agent_target_net, adversary_optimizer, agent_optimizer
            )
            batch = []
        
        # Logging the reward and los
        logger.episodic_losses.adversary.append(episode.loss.adversary)
        logger.episodic_losses.agent.append(episode.loss.agent)
        logger.episodic_rewards.adversary.append(episode.reward.adversary)
        logger.episodic_rewards.agent.append(episode.reward.agent)

        if episode_idx % config.update_target_interval == 0 and episode_idx > 0:
            # Update double network
            update_targets()
        
        if episode_idx % config.report_interval == 0 and episode_idx > 0:
            # Logging
            t1 = time.time()
            tdelta = TimeDelta(round(t1 - t0, 0))
            print(f"on episode {episode_idx}, curr epsilon {epsilon} (time taken so far: {tdelta})")
            mean_loss_adversary = statistics.fmean(logger.episodic_losses.adversary[-config.report_interval:])
            mean_loss_agent = statistics.fmean(logger.episodic_losses.agent[-config.report_interval:])
            mean_reward_adversary = statistics.fmean(logger.episodic_rewards.adversary[-config.report_interval:])
            mean_reward_agent = statistics.fmean(logger.episodic_rewards.agent[-config.report_interval:])
            print(f"     mean loss: adversary {mean_loss_adversary:.5f}, agent {mean_loss_agent:.5f}")
            print(f"     mean reward: adversary {mean_reward_adversary:.2f}, agent {mean_reward_agent:.2f}")
        
        continue # skip evaluation
        if episode_idx % config.evaluation_interval == 0 and episode_idx > 0:
            logger.best_evaluation_score = evaluate_agents(
                config, savedir, episode_idx, logger.best_evaluation_score, adversary_net
            )
    
    return adversary_net, agent_net, logger


In [17]:
# train model
adversary_net, agent_net, logger = train(config)

Training the agents...
Created the agent nets.
Initial update of target nets
Beginning the episodes...
on episode 32, curr epsilon 0.8881957186750149 (time taken so far: 0-00:00:12.0)
     mean loss: adversary 1.26927, agent 2.55406
     mean reward: adversary -2.46, agent -6.41
on episode 64, curr epsilon 0.8768970195551845 (time taken so far: 0-00:00:23.0)
     mean loss: adversary 1.34070, agent 1.71131
     mean reward: adversary -8.67, agent -1.95
on episode 96, curr epsilon 0.8657420506956067 (time taken so far: 0-00:00:35.0)
     mean loss: adversary 0.99912, agent 1.99653
     mean reward: adversary -11.24, agent -3.47
on episode 128, curr epsilon 0.8547289837098901 (time taken so far: 0-00:00:47.0)
     mean loss: adversary 1.06412, agent 1.80128
     mean reward: adversary -12.96, agent 0.43
on episode 160, curr epsilon 0.843856013470467 (time taken so far: 0-00:00:58.0)
     mean loss: adversary 1.19161, agent 2.56022
     mean reward: adversary -6.40, agent -11.03
on episod

on episode 1504, curr epsilon 0.4928842360762302 (time taken so far: 0-00:09:7.0)
     mean loss: adversary 4.26395, agent 8.00573
     mean reward: adversary 12.62, agent -48.25
on episode 1536, curr epsilon 0.4866142771389811 (time taken so far: 0-00:09:19.0)
     mean loss: adversary 3.73618, agent 2.38236
     mean reward: adversary 28.94, agent -30.04
on episode 1568, curr epsilon 0.4804240780767642 (time taken so far: 0-00:09:30.0)
     mean loss: adversary 4.51855, agent 3.23369
     mean reward: adversary 16.10, agent -27.78
on episode 1600, curr epsilon 0.47431262426767706 (time taken so far: 0-00:09:42.0)
     mean loss: adversary 5.10387, agent 3.26105
     mean reward: adversary 39.27, agent -40.07
on episode 1632, curr epsilon 0.46827891399677846 (time taken so far: 0-00:09:54.0)
     mean loss: adversary 3.46562, agent 1.53686
     mean reward: adversary 11.71, agent -21.35
on episode 1664, curr epsilon 0.46232195829189904 (time taken so far: 0-00:10:5.0)
     mean loss: 

on episode 2976, curr epsilon 0.273515020467395 (time taken so far: 0-00:18:2.0)
     mean loss: adversary 15.65625, agent 5.16030
     mean reward: adversary 55.61, agent -39.81
on episode 3008, curr epsilon 0.2700356477840573 (time taken so far: 0-00:18:13.0)
     mean loss: adversary 14.97888, agent 2.47822
     mean reward: adversary 30.78, agent -29.81
on episode 3040, curr epsilon 0.2666005360493463 (time taken so far: 0-00:18:25.0)
     mean loss: adversary 9.99915, agent 2.02708
     mean reward: adversary 17.03, agent -22.81
on episode 3072, curr epsilon 0.2632091222216591 (time taken so far: 0-00:18:37.0)
     mean loss: adversary 17.99963, agent 4.71009
     mean reward: adversary 35.91, agent -44.81
on episode 3104, curr epsilon 0.2598608504218202 (time taken so far: 0-00:18:49.0)
     mean loss: adversary 12.17611, agent 1.74338
     mean reward: adversary 25.31, agent -28.62
on episode 3136, curr epsilon 0.256555171841969 (time taken so far: 0-00:19:1.0)
     mean loss: a

on episode 4448, curr epsilon 0.1517810084916351 (time taken so far: 0-00:27:1.0)
     mean loss: adversary 18.35329, agent 2.35183
     mean reward: adversary 35.93, agent -32.00
on episode 4480, curr epsilon 0.14985020888182646 (time taken so far: 0-00:27:13.0)
     mean loss: adversary 21.95914, agent 3.04451
     mean reward: adversary 27.20, agent -33.05
on episode 4512, curr epsilon 0.1479439708899059 (time taken so far: 0-00:27:25.0)
     mean loss: adversary 22.37290, agent 3.91796
     mean reward: adversary 34.75, agent -45.51
on episode 4544, curr epsilon 0.1460619820686001 (time taken so far: 0-00:27:37.0)
     mean loss: adversary 19.68555, agent 1.58104
     mean reward: adversary 33.08, agent -32.16
on episode 4576, curr epsilon 0.14420393394526412 (time taken so far: 0-00:27:49.0)
     mean loss: adversary 17.98357, agent 2.61771
     mean reward: adversary 28.19, agent -43.57
on episode 4608, curr epsilon 0.1423695219713199 (time taken so far: 0-00:28:0.0)
     mean lo

ValueError: too many values to unpack (expected 2)

In [None]:
eps_start = 0.9
eps_end = 0.05
eps_decay = 0.9996
epsilon = eps_start

eps = [epsilon]
for i in range(10_000):
    epsilon = max((epsilon*eps_decay), eps_end)
    eps.append(epsilon)

plt.plot(eps)

In [None]:
# load adversary net
adversary_net = SimpleTagNet(config, "adversary").to(config.device)
adversary_net.load_state_dict(torch.load('./models/batched-baseline-test/adversary-net-5632.pth'))

In [None]:
# save logs
logger = AttrDict(
        episodic_losses=AttrDict(adversary=[], agent=[]),
        episodic_rewards=AttrDict(adversary=[], agent=[])
    )
with open("models/batched-baseline-test/log.json", "w") as f:
    json.dump(logger, f)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes = axes.ravel()

axes[0].plot(logger.episodic_losses.adversary[50:], label="adversary")
axes[0].plot(logger.episodic_losses.agent[50:], label="good agent")
axes[0].set_title("loss")
axes[0].legend()

axes[1].plot(logger.episodic_rewards.adversary[50:], label="adversary")
axes[1].plot(logger.episodic_rewards.agent[50:], label="good agent")
axes[1].set_title("reward")
axes[1].legend()

In [None]:
fig.savefig("models/batched-baseline-test/vis.png")

In [None]:
# load logs
with open("models/batched-baseline-test/log.json", "r") as f:
    logger = json.load(f)

In [None]:
def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

def f(a, n):
    """Zero out beginning of last axis"""
    pad_width = [(0, 0) for _ in range(a.ndim - 1)] + [(n, 0)]
    return np.pad(a, pad_width, mode='constant', constant_values=0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes = axes.ravel()

axes[0].plot(logger["episodic_losses"]["adversary"][50:], label="adversary")
axes[0].plot(logger["episodic_losses"]["agent"][50:], label="good agent")
axes[0].set_title("loss")
axes[0].legend()

adversary_episodic_rewards = np.array(logger["episodic_rewards"]["adversary"])*10
mean_adversary_episodic_rewards = f(moving_average(adversary_episodic_rewards, n=512), 0)
# axes[1].plot(adversary_episodic_rewards, label="adversary")
axes[1].plot(mean_adversary_episodic_rewards, label="adversary mean")
# axes[1].plot(logger["episodic_rewards"]["agent"][50:], label="good agent")
axes[1].set_title("reward")
axes[1].legend()

In [None]:
env = simple_tag_v2.env(
    num_good=1,
    num_adversaries=3,
    num_obstacles=0,
    max_cycles=30,
    continuous_actions=False
).unwrapped
def visualize(config, adversary_net):
    adversary_net.eval()
    with torch.no_grad():
        return run_episode(config, adversary_net, should_render=True, is_val=True, evaluation=True)

episode = visualize(config, adversary_net)
print("episode steps", episode.steps)
print("episode rewards", *episode.reward.items())