Simple Tag
https://www.pettingzoo.ml/mpe/simple_tag

> This is a predator-prey environment. Good agents (green) are faster and receive a negative reward for being hit by adversaries (red) (-10 for each collision). Adversaries are slower and are rewarded for hitting good agents (+10 for each collision). Obstacles (large black circles) block the way. By default, there is 1 good agent, 3 adversaries and 2 obstacles.

Baseline agent algorithm with experience replay buffer

In [1]:
import os
import time
import enum
import math
import random
import collections
import statistics
import json

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F

import imageio

class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes.

    ``d.key`` and ``d["key"]`` read and write the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the instance namespace so attribute
        # access and item access share one storage.
        self.__dict__ = self

class TimeDelta(object):
    """Break a duration given in seconds into days, hours, minutes, seconds."""

    def __init__(self, delta_time):
        """Convert time difference in seconds to days, hours, minutes, seconds.

        Parameters
        ==========
        delta_time : float
            Time difference in seconds.
        """
        # modf splits off the sub-second part; the integral part is then
        # successively divided into minutes, hours and days.
        self.fractional, seconds = math.modf(delta_time)
        seconds = int(seconds)
        minutes, self.seconds = divmod(seconds, 60)
        hours, self.minutes = divmod(minutes, 60)
        self.days, self.hours = divmod(hours, 24)

    def __repr__(self):
        """Return the duration as ``D-HH:MM:SS.s``."""
        # A plain ``:02`` spec does not zero-pad a float ("4.0" stays "4.0"),
        # so use a fixed-width float spec: width 4, one decimal -> "04.0".
        seconds = self.seconds + self.fractional
        return f"{self.days}-{self.hours:02}:{self.minutes:02}:{seconds:04.1f}"

from pettingzoo.mpe import simple_tag_v2
from pettingzoo.utils import random_demo

# torch.autograd.set_detect_anomaly(True)

Arguments for instantiating the environment:

- num_good: number of good agents
- num_adversaries: number of adversaries
- num_obstacles: number of obstacles
- max_cycles: number of frames (a step for each agent) until game terminates
- continuous_actions: Whether agent action spaces are discrete(default) or continuous

In [2]:
# env.close()

In [3]:
# Scenario size used for the rest of the notebook.
num_good = 1
num_adversaries = 3
num_obstacles = 0

# Build the simple_tag environment with discrete actions and 30 cycles per episode.
# The unwrapped object is kept because later cells read `env.world` directly.
env = simple_tag_v2.env(
    num_good=num_good,
    num_adversaries=num_adversaries,
    num_obstacles=num_obstacles,
    max_cycles=30,
    continuous_actions=False
).unwrapped

### What are the environment parameters?

Adversaries (red) try to capture the non-adversaries (green). The map is a 2D grid and everything is initialized in the region [-1, +1]. There doesn't seem to be any position clipping at the boundary, but non-adversary agents are penalized for going out of bounds.
Agent's observation is a ndarray vector of concatenated data in the following order:

1. current velocity (2,)
2. current position (2,)
3. relative position (2,) of each landmark
4. relative position (2,) of each other agent
5. velocity (2,) of each other non-adversary agent

When there are 3 adversaries, 3 non-adversaries and 2 obstacles, the adversary observation space is 24-dimensional and the non-adversary observation space is 22-dimensional.

The environment is sequential. Agents move one at a time. Agents are either `adversary_*` for adversary or `agent_*` for non-adversary.

Actions:

- 0 is NOP
- 1 is go left
- 2 is go right
- 3 is go down
- 4 is go up

### How to train the agents?

If the loss keeps increasing with Double DQN, increase the interval between target-network updates.
<https://stackoverflow.com/questions/56964657/cartpole-v0-loss-increasing-using-dqn>

In [4]:
# Reset once up front; the config cell below queries `env.world` and
# `env.agent_selection` (presumably only populated after a reset — TODO confirm).
env.reset()

In [5]:
def get_agent_counts():
    """Count the adversaries and good agents in the global ``env``.

    Returns
    =======
    tuple of int
        ``(n_adversaries, n_good_agents)`` -- adversaries come first.
    """
    is_adversary = [bool(agent.adversary) for agent in env.world.agents]
    n_adversaries = sum(is_adversary)
    return (n_adversaries, len(is_adversary) - n_adversaries)

def process_config(config):
    """Copy every entry of ``config.common`` into both per-class sub-configs.

    ``config.adversary`` and ``config.agent`` are modified in place; an
    existing key with the same name is overwritten. Returns nothing.
    """
    for section in (config.adversary, config.agent):
        for key, value in config.common.items():
            section[key] = value

n_adversaries, n_good_agents = get_agent_counts()
# Experiment-wide hyperparameters plus one sub-config per agent class.
config = AttrDict(
    # Also used as the directory name under models/ by train().
    exp_name = "agents={}-adversary={}-obs={}-network={}".format(num_good, num_adversaries, num_obstacles, 'MLP'),
    # Discount factor applied to the bootstrapped target in train_agents().
    discount = 0.99,
    # NOTE(review): not read in the visible code; train() uses its own decaying epsilon schedule.
    epsilon = 0.1,
    n_episodes=20_000,
    # Number of episodes collected between calls to train_agents().
    batch_size=1,
    # Episodes between copies of the online weights into the target network.
    update_target_interval=32,
    # Episodes between progress printouts (train() also renders on these episodes).
    report_interval=32,
    clip_grad_norm=5.0,
    lr=0.002,
    # NOTE(review): not read in the visible code.
    reward_scale=1,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Entries here are copied into both `adversary` and `agent` by process_config().
    common=AttrDict(
        hidden_size=32,
        n_actions=env.action_space(env.agent_selection).n,
    ),
    adversary=AttrDict(
        n_agents=n_adversaries,
        observation_shape=env.observation_space("adversary_0").shape

    ),
    agent=AttrDict(
        n_agents=n_good_agents,
        observation_shape=env.observation_space("agent_0").shape
    )
)
process_config(config)
# Bare expression: displays the resulting config as the notebook cell output.
config

{'exp_name': 'agents=1-adversary=3-obs=0-network=MLP',
 'discount': 0.99,
 'epsilon': 0.1,
 'n_episodes': 20000,
 'batch_size': 1,
 'update_target_interval': 32,
 'report_interval': 32,
 'clip_grad_norm': 5.0,
 'lr': 0.002,
 'reward_scale': 1,
 'device': device(type='cuda'),
 'common': {'hidden_size': 32, 'n_actions': 5},
 'adversary': {'n_agents': 3,
  'observation_shape': (12,),
  'hidden_size': 32,
  'n_actions': 5},
 'agent': {'n_agents': 1,
  'observation_shape': (10,),
  'hidden_size': 32,
  'n_actions': 5}}

In [6]:
class Normalizer(object):
    """Rescale a raw observation vector so its entries are roughly in [-1, 1].

    The observation is read as: own velocity (2), own absolute position (2),
    relative positions of every landmark and every other agent (2 each),
    then any remaining entries, which are treated as velocities.
    """

    def __init__(self, env):
        world = env.world
        self.n_landmarks = len(world.landmarks)
        self.n_allagents = len(world.agents)
        self.n_good = sum(1 for member in world.agents if not member.adversary)

    @staticmethod
    def normalize_abs_pos(s):
        """Clip absolute position to [-1.5, 1.5] and scale to [-1, 1].
        s is a scalar or an ndarray of one dimension."""
        limit = 1.5
        return np.clip(s, -limit, limit) / limit

    @staticmethod
    def normalize_rel_pos(s):
        """Clip relative position to [-3, 3] and scale to [-1, 1].
        s is a scalar or an ndarray of one dimension."""
        limit = 3
        return np.clip(s, -limit, limit) / limit

    def __call__(self, obs):
        """Return a normalized copy of ``obs``; the input is left untouched."""
        scaled = obs.copy()
        # own velocity
        scaled[:2] = scaled[:2] / 1.3
        # own absolute position
        scaled[2:4] = self.normalize_abs_pos(scaled[2:4])
        # relative positions of landmarks and of the other agents occupy one
        # contiguous run of (x, y) pairs, so they are clipped/scaled in one go
        rel_end = 4 + 2 * (self.n_landmarks + self.n_allagents - 1)
        scaled[4:rel_end] = self.normalize_rel_pos(scaled[4:rel_end])
        # whatever follows is treated as velocities of other entities
        scaled[rel_end:] = scaled[rel_end:] / 1.3
        return scaled
    
class RewardsShaper(object):
    """Compute a dense, distance-based shaping reward from an observation."""

    def __init__(self, env):
        self.n_landmarks = len(env.world.landmarks)
        # Build both lookup tables and both index lists in a single pass.
        self.name_to_idx = {}
        self.idx_to_name = {}
        self.goodagent_indices = []
        self.adversary_indices = []
        for idx, member in enumerate(env.world.agents):
            self.name_to_idx[member.name] = idx
            self.idx_to_name[idx] = member.name
            if member.name.startswith("agent"):
                self.goodagent_indices.append(idx)
            if member.name.startswith("adversary"):
                self.adversary_indices.append(idx)
        # rdist - adversary/good-agent distance at which shaping starts.
        self.rdist = 1
        # collision_dist - adversary/good-agent distance that counts as a
        #    collision. Based on PettingZoo numbers.
        self.collision_dist = 0.075 + 0.05

    @staticmethod
    def bound(x):
        """Boundary penalty: 0 below 0.9, linear up to 1.0, then exponential capped at 10."""
        if x < 0.9:
            return 0
        if x < 1.0:
            return (x - 0.9) * 10
        exponential = np.exp(2 * x - 2)
        return min(exponential, 10)

    def _distance(self, rel_positions, own_idx, target_idx):
        """Euclidean distance to agent ``target_idx`` as seen by agent ``own_idx``.

        An agent's observation omits the agent itself, so every agent listed
        after it sits one slot earlier than its world index.
        """
        slot = target_idx - 1 if own_idx < target_idx else target_idx
        dx, dy = rel_positions[2 * slot:(2 * slot) + 2]
        return math.sqrt(dx**2 + dy**2)

    def __call__(self, agent_name, obs):
        """Return the shaping reward for ``agent_name`` given its observation.

        Good agent: pays a penalty in [0, 1] per adversary that grows as the
        adversary gets closer than ``rdist``, plus a boundary penalty on each
        coordinate of its own position.

        Adversary: receives ``1 - d / rdist`` per good agent, where ``d`` is
        the distance to that agent (this term is not clamped).
        """
        rel_positions = obs[4 + (2 * self.n_landmarks):]
        own_idx = self.name_to_idx[agent_name]
        total = 0.
        if agent_name.startswith("agent"):
            for adversary_idx in self.adversary_indices:
                gap = self._distance(rel_positions, own_idx, adversary_idx)
                closeness = 1 - (1 / self.rdist) * gap
                total -= min(max(closeness, 0), 1)

            # boundary penalty, based on PettingZoo
            own_pos = obs[2:4]
            for axis in (0, 1):
                total -= self.bound(abs(own_pos[axis]))

        elif agent_name.startswith("adversary"):
            for goodagent_idx in self.goodagent_indices:
                gap = self._distance(rel_positions, own_idx, goodagent_idx)
                total += 1 - (1 / self.rdist) * gap

        return total

# Module-level helpers used by the network and the episode runner below.
normalize = Normalizer(env) # norm_obs = normalize(obs)
shapereward = RewardsShaper(env) # reward = shapereward(agent_name, obs)
# Loss applied between the TD target and the predicted Q-value in train_agents().
criterion = torch.nn.MSELoss()
# Alternative loss: criterion = torch.nn.SmoothL1Loss()

In [7]:
class SimpleTagNet(torch.nn.Module):
    """MLP that maps one agent's observation to a Q-value per action.

    The same architecture serves either agent class; ``agent_type`` selects
    which sub-config ("adversary" or "agent") supplies the layer sizes.
    """

    def __init__(self, config, agent_type):
        super().__init__()
        self.device = config.device
        sizes = config[agent_type]
        self.observation_size = math.prod(sizes.observation_shape)
        self.n_actions = sizes.n_actions
        self.hidden_size = sizes.hidden_size
        # Two hidden layers with ReLU, then a linear read-out per action.
        stack = [
            torch.nn.Linear(self.observation_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.n_actions),
        ]
        self.output_mlp = torch.nn.Sequential(*stack)

    def forward(self, observation):
        """Compute Q-values for a single raw observation.

        Parameters
        ==========
        observation : ndarray
            The observation vector obtained from the environment. It is
            passed through the module-level ``normalize`` before the MLP.

        Returns
        =======
        torch.Tensor
            Vector of Q-values, one per action.
        """
        scaled = normalize(observation)
        inputs = torch.tensor(scaled, dtype=torch.float, device=self.device)
        return self.output_mlp(inputs)

def choose_action(config, agent_type, Q, epsilon=0.05, is_val=False):
    """Pick an action epsilon-greedily from the Q-values ``Q``.

    With probability ``epsilon`` a uniformly random action index is
    returned; otherwise the index of the largest Q-value. When ``is_val``
    is true no exploration takes place and the greedy action is always
    returned.
    """
    explore = (not is_val) and random.random() < epsilon
    if explore:
        return random.randrange(config[agent_type].n_actions)
    return torch.argmax(Q).item()

In [8]:
def run_episode(config, adversary_net, should_render=False, epsilon=0.05, is_val=False,
                evaluation=False, save_video=False, save_video_dict=None):
    """Run one episode in the global ``env`` and record every agent turn.

    Adversaries act epsilon-greedily on the Q-values of ``adversary_net``;
    good agents always take action 0.

    Parameters
    ==========
    config : AttrDict
        Reads ``config.adversary.n_agents`` and ``config.agent.n_agents``;
        also forwarded to ``choose_action``.
    adversary_net : callable
        Called with a raw observation, returns the Q-values for it.
    should_render : bool
        Render the environment on every agent turn and close it afterwards.
    epsilon : float
        Exploration probability forwarded to ``choose_action``.
    is_val : bool
        Forwarded to ``choose_action``; when true no random actions are taken.
    evaluation : bool
        When true the shaping reward from ``shapereward`` is NOT added, so
        only the environment reward is recorded.
    save_video : bool
        Only honoured when ``should_render`` is true: collect rgb frames and
        write them out as an mp4 at the end of the episode.
    save_video_dict : dict or None
        Must provide 'validation_save_dir' and 'image_counter' when a video
        is saved.

    Returns
    =======
    AttrDict
        Contains episode metrics:
        - steps : number of steps. All agents take an action at each step.
        - reward : episodic rewards indexed by ('adversary', 'agent').
        - step_records : list of quantities produced indiced by step, ('adversary', 'agent'), agent index.
          Each step record has:
            + observation
            + action
            + Q (always None here)
            + reward
            + done
        - loss : contains episodic losses indexed by ('adversary', 'agent'). To be updated by train_agents()
    """
    rendered_video = []
    episode = AttrDict(
        steps=0,
        reward=AttrDict(adversary=0, agent=0),
        step_records=[],
        loss=AttrDict(adversary=0, agent=0)
    )
    n_agents = config.adversary.n_agents + config.agent.n_agents
    step_record = None
    
    env.reset()
    for agent_step_idx, agent_name in enumerate(env.agent_iter()):
        # A new step starts each time the turn counter wraps around all agents.
        if agent_step_idx % n_agents == 0:
            episode.steps += 1
            step_record = AttrDict(adversary={}, agent={})
            episode.step_records.append(step_record)
            
        obs_curr, reward, done, _ = env.last()
        if not evaluation:
            # Training only: add the dense shaping term to the environment reward.
            reward += shapereward(agent_name, obs_curr)
        if should_render:
            if save_video:
                rendered_image = env.render(mode='rgb_array')
#                 print(rendered_image.shape)
                rendered_video.append(rendered_image)
            env.render()
            if agent_name == "adversary_0":
                # print("rew, shaped rew", round(_reward, 2), round(reward, 2))
                # print("obs, normed obs", np.round(obs_curr, 2), np.round(normalize(obs_curr), 2))
                # print("obs, normed obs", np.round(obs_curr[4:6], 2), np.round(normalize(obs_curr[4:6]), 2))
                # print("obs, rew", np.round(normalize(obs_curr[4:6]), 2), reward)
                pass
        # Agent names look like "adversary_0" / "agent_0": type, then index.
        agent_type, agent_idx = agent_name.split("_")
        agent_idx = int(agent_idx)
        if done:
            # Finished agents are stepped with None. The `continue` also skips
            # the reward accumulation at the bottom of the loop, so this final
            # reward is recorded in the step record but not in episode.reward.
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=None,
                Q=None,
                reward=reward,
                done=done,
            )
            env.step(None)
            continue
        if agent_type == "agent":
            # Good agents are not learned here: they always take action 0.
            env.step(0)
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=0,
                Q=None,
                reward=reward,
                done=done,
            )
        else:
            # agent_type == "adversary"
            Q_curr = adversary_net(obs_curr)
            action = choose_action(config, agent_type, Q_curr, epsilon, is_val=is_val)
            env.step(action)
            step_record[agent_type][agent_idx] = AttrDict(
                # inputs to network
                observation=obs_curr,
                # outputs of network / inputs to environment
                action=action,
                # Q is not stored; train_agents() recomputes it from the observation.
#                 Q=Q_curr,
                Q=None,
                # output of environment
                reward=reward,
                done=done,
            )
        episode.reward[agent_type] += reward
    
    if should_render:
        env.close()
        if save_video:
            print(len(rendered_video))
            imageio.mimwrite(os.path.join(save_video_dict['validation_save_dir'],'result-{}.mp4'.format(save_video_dict['image_counter'])), rendered_video, fps=30)
    return episode

def train_agents(config, batch, adversary_net, adversary_target_net, adversary_optimizer):
    """Run one-step Q-learning updates on every adversary transition in ``batch``.

    For each adversary turn that is not terminal, the target is the reward
    stored in the following step record, plus (if that next record is not
    terminal) the discounted maximum Q-value of the target network on the next
    observation. One optimizer step is taken per transition.

    Parameters
    ==========
    config : AttrDict
        Reads ``device``, ``discount`` and ``clip_grad_norm``.
    batch : list
        Episodes as returned by ``run_episode``. Each episode's
        ``loss.adversary`` is incremented in place by the losses computed here.
    adversary_net : torch.nn.Module
        Online network being optimized.
    adversary_target_net : torch.nn.Module
        Target network, used only to evaluate the bootstrap term.
    adversary_optimizer : torch.optim.Optimizer
        Optimizer over the parameters of ``adversary_net``.
    """
    device = config.device
    discount = torch.tensor(config.discount, dtype=torch.float, device=device)
    # Debug switch: set to True to print per-parameter gradient norms.
    show_norms = False
    for episode in batch:
        for step_idx in range(episode.steps):
            adversary_records = episode.step_records[step_idx].adversary
            for agent_idx, curr_record in adversary_records.items():
                if curr_record.done:
                    # agent is done at this step
                    continue
                next_record = episode.step_records[step_idx + 1].adversary[agent_idx]
                r = torch.tensor(next_record.reward, dtype=torch.float, device=device)
                if next_record.done:
                    # agent terminates at next step
                    y = r
                else:
                    # The target is a constant with respect to the online
                    # network: evaluate it without building a graph so that
                    # backward() neither traverses the target network nor
                    # accumulates gradients on its parameters.
                    with torch.no_grad():
                        target_Q = adversary_target_net(next_record.observation)
                        max_target_Q = torch.max(target_Q)
                    y = r + discount*max_target_Q

                adversary_optimizer.zero_grad()
                # Q is recomputed here because run_episode() does not store it.
                Q = adversary_net(curr_record.observation)
                Q_u = Q[curr_record.action]
                # (input, target) order; the value is the same for symmetric losses.
                adversary_loss = criterion(Q_u, y)
                adversary_loss.backward()

                if show_norms:
                    norms = [p.grad.detach().data.norm().item() for p in adversary_net.parameters()]
                    print("norm of gradiants", *np.round(norms, 2))

                torch.nn.utils.clip_grad_norm_(adversary_net.parameters(), config.clip_grad_norm)
                adversary_optimizer.step()
                episode.loss.adversary += adversary_loss.item()
                
def train(config):
    """Train the shared adversary Q-network and return it with its training log.

    - All adversaries share one network (parameter sharing); a second copy
      serves as the target network and is refreshed every
      ``config.update_target_interval`` episodes.
    - Exploration follows a multiplicatively decaying epsilon schedule.
    - Every 500 episodes the network is checkpointed and evaluated greedily
      for 100 episodes; the checkpoint with the best mean evaluation reward
      is additionally saved as ``adversary-net-best.pth``.

    Parameters
    ==========
    config : AttrDict
        Reads ``exp_name``, ``device``, ``lr``, ``n_episodes``, ``batch_size``,
        ``update_target_interval`` and ``report_interval``; also forwarded to
        ``run_episode`` and ``train_agents``.

    Returns
    =======
    tuple
        ``(adversary_net, logger)`` where ``logger`` holds the per-episode
        losses and rewards for 'adversary' and 'agent'.
    """
    eps_start = 0.9
    eps_end = 0.05
    eps_decay = 0.9996
    epsilon = eps_start
    validation_interval = 500
    n_validation_episodes = 100

    # Start below any achievable score so that the first validation always
    # produces a "best" checkpoint, even if its mean reward is 0 or negative.
    best_reward_evaluation = float("-inf")

    print("Training the agents...")
    os.makedirs("models/{}".format(config.exp_name), exist_ok=True)
    t0 = time.time()
    device = config.device
    adversary_net = SimpleTagNet(config, "adversary").to(device)
    adversary_target_net = SimpleTagNet(config, "adversary").to(device)
    adversary_target_net.eval()
    print("Created the agent nets.")
    adversary_optimizer = torch.optim.SGD(adversary_net.parameters(), lr=config.lr)
    logger = AttrDict(
        episodic_losses=AttrDict(adversary=[], agent=[]),
        episodic_rewards=AttrDict(adversary=[], agent=[])
    )
    def update_targets():
        adversary_target_net.load_state_dict(adversary_net.state_dict())
    print("Initial update of target nets")
    update_targets()

    batch = []
    print("Beginning the episodes...")
    for episode_idx in range(config.n_episodes):
        # Run an episode
        episode = run_episode(config, adversary_net,
                              epsilon=epsilon,
                              should_render=episode_idx % config.report_interval == 0 and episode_idx > 0)
        batch.append(episode)
        epsilon = max(epsilon*eps_decay, eps_end)

        # Train on the collected episodes
        if episode_idx % config.batch_size == 0 and episode_idx > 0:
            train_agents(config, batch, adversary_net,
                         adversary_target_net,
                         adversary_optimizer)
            batch = []

        # Logging the reward and loss
        logger.episodic_losses.adversary.append(episode.loss.adversary)
        logger.episodic_losses.agent.append(episode.loss.agent)
        logger.episodic_rewards.adversary.append(episode.reward.adversary)
        logger.episodic_rewards.agent.append(episode.reward.agent)

        if episode_idx % config.update_target_interval == 0 and episode_idx > 0:
            # Copy the online weights into the target network
            update_targets()

        if episode_idx % config.report_interval == 0 and episode_idx > 0:
            # Progress report averaged over the last report_interval episodes
            t1 = time.time()
            tdelta = TimeDelta(round(t1 - t0, 0))
            print(f"on episode {episode_idx}, curr epsilon {epsilon} (time taken so far: {tdelta})")
            mean_loss_adversary = statistics.fmean(logger.episodic_losses.adversary[-config.report_interval:])
            mean_reward_adversary = statistics.fmean(logger.episodic_rewards.adversary[-config.report_interval:])
            mean_reward_agent = statistics.fmean(logger.episodic_rewards.agent[-config.report_interval:])
            print(f"     mean loss: adversary {mean_loss_adversary}")
            print(f"     mean reward: adversary {mean_reward_adversary}, agent {mean_reward_agent}")

        if episode_idx % validation_interval == 0 and episode_idx > 0:
            # The network stays in eval mode for the WHOLE validation run and
            # is switched back to train mode only once, after the run.
            adversary_net.eval()
            with torch.no_grad():
                validation_save_dir = "models/{}/validation_results/epoch-{}".format(config.exp_name, episode_idx)
                os.makedirs(validation_save_dir, exist_ok=True)
                validation_total_reward = 0
                torch.save(
                    adversary_net.state_dict(),
                    f"models/{config.exp_name}/adversary-net-{episode_idx}.pth"
                )
                for e in range(n_validation_episodes):
                    d = {
                        'validation_save_dir':validation_save_dir,
                        'image_counter': e
                    }
                    # Every 10th validation episode is rendered and saved as a video.
                    val_episode = run_episode(config, adversary_net,
                                              epsilon=epsilon,
                                              should_render=e % 10 == 0, save_video=e%10==0, save_video_dict=d, is_val=True, evaluation=True)
                    print(val_episode.reward.adversary)
                    validation_total_reward += val_episode.reward.adversary

                validation_total_reward /= n_validation_episodes
                print('Evaluation Reward for episode {} is: {}'.format(episode_idx, validation_total_reward))
                if validation_total_reward > best_reward_evaluation:
                    best_reward_evaluation = validation_total_reward
                    torch.save(
                        adversary_net.state_dict(),
                        f"models/{config.exp_name}/adversary-net-best.pth"
                    )
            adversary_net.train()

    return adversary_net, logger


In [None]:
# Train the adversary network; returns the trained net and the per-episode
# loss/reward log.
adversary_net, logger = train(config)

Training the agents...
Created the agent nets.
Initial update of target nets
Beginning the episodes...
on episode 32, curr epsilon 0.8881957186750149 (time taken so far: 0-00:00:10.0)
     mean loss: adversary 167.28427043306448
     mean reward: adversary 4.652160788899588, agent -29.487828606109
on episode 64, curr epsilon 0.8768970195551845 (time taken so far: 0-00:00:17.0)
     mean loss: adversary 81.3071790664473
     mean reward: adversary -2.953299370662403, agent -26.875282971576247
on episode 96, curr epsilon 0.8657420506956067 (time taken so far: 0-00:00:24.0)
     mean loss: adversary 158.7613341749446
     mean reward: adversary 0.5240628673364043, agent -27.419515437632782
on episode 128, curr epsilon 0.8547289837098901 (time taken so far: 0-00:00:32.0)
     mean loss: adversary 131.18770670212078
     mean reward: adversary 2.67667599869457, agent -19.87916808601533
on episode 160, curr epsilon 0.843856013470467 (time taken so far: 0-00:00:39.0)
     mean loss: adversary



124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0




124




60.0
0.0
120.0
0.0
0.0
0.0
120.0
60.0
60.0
240.0




124




90.0
0.0
240.0
0.0
90.0
0.0
30.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0




124




60.0
30.0
0.0
210.0
0.0
0.0
60.0
90.0
90.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
150.0
0.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
120.0
0.0
0.0
30.0
0.0
150.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
60.0
0.0
0.0
Evaluation Reward for episode 500 is: 24.0
on episode 512, curr epsilon 0.7330058717129788 (time taken so far: 0-00:02:49.0)
     mean loss: adversary 103.19473250116954
     mean reward: adversary 17.962014973784516, agent -29.72743458444974
on episode 544, curr epsilon 0.7236813358889278 (time taken so far: 0-00:02:57.0)
     mean loss: adversary 133.32059324762724
     mean reward: adversary 16.59519693431919, agent -30.914906044395053
on episode 576, curr epsilon 0.7144754170796777 (time taken so far: 0-00:03:4.0)
     mean loss: adversary 211.75623231024392
     mean reward: adversary 35.39409226226173, agent -37.15248381039302
on episode 608, curr epsilon 0.7053866063633399 (time taken so far: 0-00:03:12.0)
     mean loss: adversary 295.7254336764547
     mean reward: adversary 32.08137831735509, agent -39.199856039188745
on episode 640, curr epsilon 0.6964134140129569 (time taken so far: 0-00:03:19.0)
     mean loss: adversary 506.411819

In [None]:
# Plot the exploration schedule: same constants and update rule as in train().
eps_start = 0.9
eps_end = 0.05
eps_decay = 0.9996
epsilon = eps_start

# Epsilon decays multiplicatively each episode and is floored at eps_end.
eps = [epsilon]
for i in range(10_000):
    epsilon = max((epsilon*eps_decay), eps_end)
    eps.append(epsilon)

plt.plot(eps)

In [None]:
# Load adversary net from a saved checkpoint.
adversary_net = SimpleTagNet(config, "adversary").to(config.device)
# map_location remaps the stored tensors onto the configured device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
adversary_net.load_state_dict(
    torch.load(
        './models/batched-baseline-test/adversary-net-5632.pth',
        map_location=config.device,
    )
)

In [None]:
# save logs
# NOTE(review): `logger` is rebound to an empty container right before it is
# dumped, so the file written here contains only empty lists and any logger
# returned by train() is discarded. Confirm whether this is intended.
logger = AttrDict(
        episodic_losses=AttrDict(adversary=[], agent=[]),
        episodic_rewards=AttrDict(adversary=[], agent=[])
    )
with open("models/batched-baseline-test/log.json", "w") as f:
    json.dump(logger, f)

In [None]:
# Plot per-episode loss (left) and reward (right), skipping the first 50 episodes.
# Uses attribute access, so `logger` must be an AttrDict here (a logger loaded
# with json.load is a plain dict and needs the ["..."] form instead).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes = axes.ravel()

axes[0].plot(logger.episodic_losses.adversary[50:], label="adversary")
axes[0].plot(logger.episodic_losses.agent[50:], label="good agent")
axes[0].set_title("loss")
axes[0].legend()

axes[1].plot(logger.episodic_rewards.adversary[50:], label="adversary")
axes[1].plot(logger.episodic_rewards.agent[50:], label="good agent")
axes[1].set_title("reward")
axes[1].legend()

In [None]:
# Write the most recently created figure to disk.
fig.savefig("models/batched-baseline-test/vis.png")

In [None]:
# load logs
# json.load returns plain dicts, so the result is indexed with ["..."] below
# rather than with attribute access.
with open("models/batched-baseline-test/log.json", "r") as f:
    logger = json.load(f)

In [None]:
def moving_average(a, n=3):
    """Return the mean of every length-``n`` sliding window over ``a``.

    Computed from the running sum: each window total is the cumulative sum
    at its last element minus the cumulative sum just before its first.
    The result has ``len(a) - n + 1`` entries.
    """
    running = np.cumsum(a, dtype=float)
    window_totals = running.copy()
    window_totals[n:] -= running[:-n]
    return window_totals[n - 1:] / n

def f(a, n):
    """Prepend ``n`` zeros along the last axis of ``a``; other axes are unchanged."""
    untouched_axes = [(0, 0)] * (a.ndim - 1)
    return np.pad(a, untouched_axes + [(n, 0)], mode='constant', constant_values=0)

# Plot the loaded log: raw loss on the left, smoothed adversary reward on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes = axes.ravel()

axes[0].plot(logger["episodic_losses"]["adversary"][50:], label="adversary")
axes[0].plot(logger["episodic_losses"]["agent"][50:], label="good agent")
axes[0].set_title("loss")
axes[0].legend()

# NOTE(review): rewards are multiplied by 10 before smoothing; the reason is
# not evident from this cell.
adversary_episodic_rewards = np.array(logger["episodic_rewards"]["adversary"])*10
# 512-episode moving average; f(..., 0) adds zero padding entries, i.e. it is a no-op here.
mean_adversary_episodic_rewards = f(moving_average(adversary_episodic_rewards, n=512), 0)
# axes[1].plot(adversary_episodic_rewards, label="adversary")
axes[1].plot(mean_adversary_episodic_rewards, label="adversary mean")
# axes[1].plot(logger["episodic_rewards"]["agent"][50:], label="good agent")
axes[1].set_title("reward")
axes[1].legend()

In [None]:
# Re-create the global environment used by run_episode() for visualization.
env = simple_tag_v2.env(
    num_good=1,
    num_adversaries=3,
    num_obstacles=0,
    max_cycles=30,
    continuous_actions=False
).unwrapped
def visualize(config, adversary_net):
    """Render one greedy episode of ``adversary_net`` and return its record.

    The network is switched to eval mode, gradients are disabled, and the
    episode is run with rendering on, no exploration and no reward shaping.
    """
    adversary_net.eval()
    with torch.no_grad():
        rollout = run_episode(
            config,
            adversary_net,
            should_render=True,
            is_val=True,
            evaluation=True,
        )
    return rollout

# Render one evaluation episode and print its length and per-class rewards.
episode = visualize(config, adversary_net)
print("episode steps", episode.steps)
print("episode rewards", *episode.reward.items())