In [1]:
import time
import enum
import math
import random
import collections

import matplotlib.pyplot as pd
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

from pettingzoo.mpe import simple_tag_v2
from pettingzoo.utils import random_demo

In [2]:
env = simple_tag_v2.env(
    num_good=3,
    num_adversaries=3,
    num_obstacles=2,
    max_cycles=300,
    continuous_actions=False
).unwrapped
print("Peek into unwrapped environment:", *dir(env))

Peek into unwrapped environment: __class__ __delattr__ __dict__ __dir__ __doc__ __eq__ __format__ __ge__ __getattribute__ __gt__ __hash__ __init__ __init_subclass__ __le__ __lt__ __module__ __ne__ __new__ __reduce__ __reduce_ex__ __repr__ __setattr__ __sizeof__ __str__ __subclasshook__ __weakref__ _accumulate_rewards _agent_selector _clear_rewards _dones_step_first _execute_world_step _index_map _reset_render _set_action _was_done_step action_space action_spaces agent_iter agents close continuous_actions current_actions last local_ratio max_cycles max_num_agents metadata np_random num_agents observation_space observation_spaces observe possible_agents render reset scenario seed state state_space step steps unwrapped viewer world


In [3]:
# Print variables of the environment
# Documentation:   https://www.pettingzoo.ml/api
env.reset()
print("State size", env.state_space.shape)
print("Name of current agent", env.agent_selection)
print("Observation space of current agent", env.observation_space(env.agent_selection).shape)
print("Action space of current agent", env.action_space(env.agent_selection))
print("Sample random action from current agent", env.action_space(env.agent_selection).sample())
print("The agent names:", *env.agents)
print()

# select an agent in the environment world, after using env.unwrapped
agent = env.world.agents[0]
print("agent's name is", agent.name)
print("agent's position and velocity coordinates", agent.state.p_vel, agent.state.p_pos)
print("is agent an adversary?", agent.adversary)

landmark = env.world.landmarks[0]
print("landmark's name is", landmark.name)
print("landmark's position coordinates (doesn't move)", landmark.state.p_pos)


State size (138,)
Name of current agent adversary_0
Observation space of current agent (24,)
Action space of current agent Discrete(5)
Sample random action from current agent 2
The agent names: adversary_0 adversary_1 adversary_2 agent_0 agent_1 agent_2

agent's name is adversary_0
agent's position and velocity coordinates [0. 0.] [ 0.70805589 -0.5818902 ]
is agent an adversary? True
landmark's name is landmark 0
landmark's position coordinates (doesn't move) [ 0.27165083 -0.45720992]


In [4]:

# Print variables of the environment
# Documentation:   https://www.pettingzoo.ml/api
env.reset()
print("State size", env.state_space.shape)
print("Name of current agent", env.agent_selection)
print("Observation space of current agent", env.observation_space(env.agent_selection).shape)
print("Action space of current agent", env.action_space(env.agent_selection))
print("Sample random action from current agent", env.action_space(env.agent_selection).sample())
print("The agent names:", *env.agents)
print()

# select an agent in the environment world, after using env.unwrapped
agent = env.world.agents[0]
print("agent's name is", agent.name)
print("agent's position and velocity coordinates", agent.state.p_vel, agent.state.p_pos)
print("is agent an adversary?", agent.adversary)

landmark = env.world.landmarks[0]
print("landmark's name is", landmark.name)
print("landmark's position coordinates (doesn't move)", landmark.state.p_pos)

State size (138,)
Name of current agent adversary_0
Observation space of current agent (24,)
Action space of current agent Discrete(5)
Sample random action from current agent 2
The agent names: adversary_0 adversary_1 adversary_2 agent_0 agent_1 agent_2

agent's name is adversary_0
agent's position and velocity coordinates [0. 0.] [-0.45257476  0.98974294]
is agent an adversary? True
landmark's name is landmark 0
landmark's position coordinates (doesn't move) [ 0.46874369 -0.89777674]


In [5]:
# Demo environment with random policy
env.reset()
random_demo(env, render=False, episodes=5)

Average total reward -3285.9782898487856


-16429.89144924393

In [6]:
def hardcode_policy(observation, agent):
    """
    Parameters
    ==========
    agent : str
    """
#     print(observation.shape)agent_step_idx
#     print(agent)
    if "adversary" in agent:
        # adversary
        if agent == "adversary_0":
            return np.random.binomial(2, 0.3) + 3
        
    if "agent" in agent:
        # non-adversary
        pass
    return 0

env.reset()
for agent_step_idx, agent in enumerate(env.agent_iter()):
#     env.render()
    observation, reward, done, info = env.last()
    if done:
        env.step(None)
    else:
        action = hardcode_policy(observation, agent)
        env.step(action)
    # time.sleep(0.1)

print(f"episode ran for {agent_step_idx} steps")

episode ran for 1805 steps


In [7]:
env.reset()

In [8]:
def get_agent_counts():
    all_agents = 0
    adversaries = 0
    for agent in env.world.agents:
        all_agents += 1
        adversaries += 1 if agent.adversary else 0
    good_agents = all_agents - adversaries
    return (adversaries, good_agents)

def process_config(config):
    for k, v in config.all.items():
        config.adversary[k] = v
        config.agent[k] = v

n_adversaries, n_good_agents = get_agent_counts()
config = AttrDict(
    discount = 0.99,
    epsilon = 0.05,
    n_episodes=200,
    update_target_interval=10,
    report_interval=20,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    all=AttrDict(
        message_size=4,
        hidden_size=128,
        n_actions=env.action_space(env.agent_selection).n,
        n_rnn_layers=2,
        apply_bn=False,
    ),
    adversary=AttrDict(
        n_agents=n_adversaries,
        observation_shape=env.observation_space("adversary_0").shape

    ),
    agent=AttrDict(
        n_agents=n_good_agents,
        observation_shape=env.observation_space("agent_0").shape
    )
)
process_config(config)

class Container(object):
    """Container of messages and hidden states of agents in environment."""
    
    def reset(self):
#         keys = [*self.__message_d.keys()]
#         for k in keys:
#             del self.__message_d[k]
#         keys = [*self.__hidden_d.keys()]
#         for k in keys:
#             del self.__hidden_d[k]
        
        for idx in range(config.adversary.n_agents):
            self.__message_d[f"adversary_{idx}"] = torch.zeros(
                self.config.adversary.message_size*(config.adversary.n_agents - 1),
                dtype=torch.float
            )
            self.__hidden_d[f"adversary_{idx}"]  = torch.zeros(
                (config.adversary.n_rnn_layers, 1, self.config.adversary.hidden_size,),
                dtype=torch.float
            )
        for idx in range(config.agent.n_agents):
            self.__message_d[f"agent_{idx}"] = torch.zeros(
                self.config.agent.message_size*(config.agent.n_agents - 1),
                dtype=torch.float
            )
            self.__hidden_d[f"agent_{idx}"]  = torch.zeros(
                (config.agent.n_rnn_layers, 1, self.config.agent.hidden_size,),
                dtype=torch.float
            )
        
    def __init__(self, config):
        self.config = config
        self.__message_d = {}
        self.__hidden_d = {}
        self.reset()
    
    def get_message(self, agent_name):
        return self.__message_d[agent_name]

    def get_hidden(self, agent_name):
        return self.__hidden_d[agent_name]

    def update_message(self, agent_name, message):
        """Update message cache.
        
        Messages of multiple agents are concatenated together.
        For example, if agent 2 receives messages from agents 0, 1, and 3 then
        the message is a vector of the form: [ 0's message, 1's message, 3's message ]
        """
        agent_type, agent_idx = agent_name.split("_")
        agent_idx = int(agent_idx)
        for jdx in range(config[agent_type].n_agents):
            if jdx < agent_idx:
                start_idx = agent_idx - 1
            elif jdx == agent_idx: # down update message to oneself
                continue
            else: # agent_idx < jdx
                start_idx = agent_idx
            end_idx   = start_idx + self.config[agent_type].message_size
            # print(jdx, agent_idx, self.__message_d[f"{agent_type}_{jdx}"].shape, start_idx, end_idx)
            messages = self.__message_d[f"{agent_type}_{jdx}"]
            self.__message_d[f"{agent_type}_{jdx}"] = \
                    torch.hstack((messages[:start_idx], message, messages[end_idx:]))

    def update_hidden(self, agent_name, hidden):
        self.__hidden_d[agent_name] = hidden

In [9]:
class SimpleTagNet(torch.nn.Module):
    """NN Model for the agents. Both good agents and adversaries use this model."""
        
    def __init__(self, config, agent_type):
        super().__init__()
        # self.config = config
        self.observation_size = math.prod(config[agent_type].observation_shape)
        self.send_message_size = config[agent_type].message_size
        self.recv_message_size = config[agent_type].message_size*(config[agent_type].n_agents - 1)
        self.n_agents = config[agent_type].n_agents
        self.n_actions = config[agent_type].n_actions
        self.apply_bn = config[agent_type].apply_bn
        self.hidden_size = config[agent_type].hidden_size
        self.n_rnn_layers = config[agent_type].n_rnn_layers
        self.n_output = self.n_actions + self.send_message_size
        
        self.agent_lookup    = torch.nn.Embedding(self.n_agents, self.hidden_size)
        self.action_lookup   = torch.nn.Embedding(self.n_actions, self.hidden_size)
        self.observation_mlp = torch.nn.Sequential(collections.OrderedDict([
            ("linear", torch.nn.Linear(self.observation_size, self.hidden_size)),
            ("relu", torch.nn.ReLU(inplace=True)),
        ]))
        self.message_mlp = torch.nn.Sequential()
        # if self.apply_bn:
        #     # input must have shape (N, C), output has the same shape
        #     self.message_mlp.add_module("bn", torch.nn.BatchNorm1d(self.recv_message_size))
        self.message_mlp.add_module("linear", torch.nn.Linear(self.recv_message_size, self.hidden_size))
        self.message_mlp.add_module("relu", torch.nn.ReLU(inplace=True))
        
        # input must have shape (N, L, H_in)
        # output has shape  (N, L, H_out)
        self.rnn = torch.nn.GRU(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.n_rnn_layers,
            batch_first=True
        )
        self.output_mlp = torch.nn.Sequential()
        self.output_mlp.add_module("linear", torch.nn.Linear(self.hidden_size, self.hidden_size))
        self.output_mlp.add_module("relu", torch.nn.ReLU(inplace=True))
        self.output_mlp.add_module("linear", torch.nn.Linear(self.hidden_size, self.n_output))
    
    def forward(self, agent_idx, observation, message, hidden):
        """Apply DQN to episode step.
        
        Parameters
        ==========
        agent_idx : int
            Index of agent
        observation : ndarray
            The observation vector obtained from the environment.
        message : torch.Tensor
            Messages from the other agents. By default has shape (message_size*(n_agents - 1))
            where message_size=4 and n_agents=3
        hidden : torch.Tensor
            Hidden state of GRU. By default has shape (n_layers=2, N=1, H_out=128).
        
        Returns
        =======
        torch.Tensor
            Vector of Q-value associated with each action.
        torch.Tensor
            The message to pass to other agents.
        torch.Tensor
            The hidden state used by GRU.
        """
        agent_idx   = torch.tensor(agent_idx, dtype=torch.int)
        observation = torch.tensor(observation, dtype=torch.float)
        z_a = self.agent_lookup(agent_idx)
        z_o = self.observation_mlp(observation)
        z_m = self.message_mlp(message)
        z = z_a + z_o + z_m
        # z has shape (N=1, L=1, H_in=128)
        z = z.unsqueeze(0).unsqueeze(0)
        # hidden has shape (n_layers=2, N=1, H_out=128) before and after
        # out has shape (N=1, L=1, H_out=128)
        out, hidden = self.rnn(z, hidden)
        out = out.squeeze(0).squeeze(0)
        out = self.output_mlp(out)
        Q = out[0:self.n_actions]
        m = out[self.n_actions:self.n_actions + self.send_message_size]
        return Q, m, hidden

In [37]:
def choose_action(config, agent_type, Q):
    if random.random() < config.epsilon:
        return random.randrange(config[agent_type].n_actions)
    else:
        return torch.argmax(Q).item()

def run_episode(config, container, adversary_net, agent_net, should_render=False, device=torch.device('cpu')):
    """
    inputs consist of observation, message (backprop), hidden (backprop) indexed by agent
    outputs consist of action, q-value of action (backprop), reward, done indexed by (step, agent)
    """
    episode = AttrDict(steps=0, reward=AttrDict(adversary=0, agent=0), step_records=[])
    n_agents = config.adversary.n_agents + config.agent.n_agents
    step_record = None
    cpu_device = torch.device('cpu')
    
    env.reset()
    for agent_step_idx, agent_name in enumerate(env.agent_iter()):
        if should_render:
            env.render()
        if agent_step_idx % n_agents == 0:
            episode.steps += 1
            step_record = AttrDict(adversary={}, agent={})
            episode.step_records.append(step_record)
            
        obs_curr, reward, done, _ = env.last()
        agent_type, agent_idx = agent_name.split("_")
        agent_idx = int(agent_idx)
        if done:
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                message=None,
                hidden=None,
                action=None,
                Q=None,
                reward=reward,
                done=done,
            )
            env.step(None)
            continue
            
        m_prev = container.get_message(agent_name)
        h_prev = container.get_hidden(agent_name)
        if agent_type == "adversary":
            Q_curr, m_curr, h_curr = adversary_net(
                torch.as_tensor(agent_idx, dtype=torch.int).to(device), 
                torch.as_tensor(obs_curr, dtype=torch.float32).to(device), 
                torch.as_tensor(m_prev, dtype=torch.float32).to(device), 
                torch.as_tensor(h_prev, dtype=torch.float32).to(device)
            )
        else: # good agent
            Q_curr, m_curr, h_curr = agent_net(
                torch.as_tensor(agent_idx, dtype=torch.int).to(device), 
                torch.as_tensor(obs_curr, dtype=torch.float32).to(device), 
                torch.as_tensor(m_prev, dtype=torch.float32).to(device), 
                torch.as_tensor(h_prev, dtype=torch.float32).to(device)
            )

        action = choose_action(config, agent_type, Q_curr)
        env.step(action)
        container.update_message(agent_name, m_curr.to(cpu_device))
        container.update_hidden(agent_name, h_curr.to(cpu_device))
        step_record[agent_type][agent_idx] = AttrDict(
            # inputs to network
            observation=obs_curr,
            message=m_prev,
            hidden=h_prev,
            # outputs of network / inputs to environment
            action=action,
            Q=Q_curr,
            # output of environment
            reward=reward,
            done=done,
        )
        episode.reward[agent_type] += reward
    
    return episode

def train_agents(config, episode, adversary_net, agent_net,
                 adversary_target_net, agent_target_net,
                 adversary_optimizer, agent_optimizer, device=torch.device('cpu')):
    """
    """
    discount = torch.tensor(config.discount, dtype=torch.float)
    
    adversary_loss = torch.tensor(0.).to(device)
    agent_loss = torch.tensor(0.).to(device)
    for step_idx in range(episode.steps):
        
        for agent_idx in episode.step_records[step_idx].adversary.keys():
            curr_record = episode.step_records[step_idx].adversary[agent_idx]
            if curr_record.done:
                # agent is done at this step
                continue
            next_record = episode.step_records[step_idx + 1].adversary[agent_idx]
            r = torch.tensor(next_record.reward, dtype=torch.float)
            y = None
            if next_record.done:
                # agent terminates at next step
                y = r
            else:
                next_o = next_record.observation
                next_m = next_record.message
                next_h = next_record.hidden
                target_Q, _, _ = adversary_target_net(
                    torch.as_tensor(agent_idx, dtype=torch.int).to(device), 
                    torch.as_tensor(next_o).to(device), 
                    torch.as_tensor(next_m).to(device), 
                    torch.as_tensor(next_h).to(device)
                )
                max_target_Q = torch.max(target_Q.detach())
                y = r + discount*max_target_Q
            u = curr_record.action
            Q_u = curr_record.Q[u]
            adversary_loss += torch.pow(y - Q_u, 2.)
            
        for agent_idx in episode.step_records[step_idx].agent.keys():
            curr_record = episode.step_records[step_idx].agent[agent_idx]
            if curr_record.done:
                # agent is done at this step
                continue
            next_record = episode.step_records[step_idx + 1].agent[agent_idx]
            r = torch.tensor(next_record.reward, dtype=torch.float)
            y = None
            if next_record.done:
                # agent terminates at next step
                y = r
            else:
                next_o = next_record.observation
                next_m = next_record.message
                next_h = next_record.hidden
                target_Q, _, _ = agent_target_net(
                    torch.as_tensor(agent_idx, dtype=torch.int).to(device), 
                    torch.as_tensor(next_o).to(device), 
                    torch.as_tensor(next_m).to(device), 
                    torch.as_tensor(next_h).to(device)
                )
                max_target_Q = torch.max(target_Q.detach())
                y = r + discount*max_target_Q
            u = curr_record.action
            Q_u = curr_record.Q[u]
            agent_loss += torch.pow(y - Q_u, 2.)
    
    adversary_optimizer.zero_grad()
    agent_optimizer.zero_grad()
    adversary_loss.backward()
    agent_loss.backward()
    adversary_optimizer.step()
    agent_optimizer.step()
    episode.loss = AttrDict(adversary=adversary_loss.item(), agent=agent_loss.item())
    

def train(config):
    """
    - Use parameter sharing between agents of the same class.
    - Good agents use one RL model, adversaries use another RL model.
      Train the agents side by side.
    - Separate, disjoint communication channels for two classes of agents,
      maintained by a container to store the messages.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     device = torch.device('cpu')

    adversary_net = SimpleTagNet(config, "adversary")
    agent_net = SimpleTagNet(config, "agent")
    adversary_target_net = SimpleTagNet(config, "adversary")
    agent_target_net = SimpleTagNet(config, "agent")
    
    print('hello?')
    
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            adversary_net = torch.nn.DataParallel(adversary_net)
            agent_net = torch.nn.DataParallel(agent_net)
            adversary_target_net = torch.nn.DataParallel(adversary_target_net)
            agent_target_net = torch.nn.DataParallel(agent_target_net)
        adversary_net.to(device)
        agent_net.to(device)
        adversary_target_net.to(device)
        agent_target_net.to(device)

    print('Is cuda?', next(agent_target_net.parameters()).is_cuda)
    
    adversary_target_net.eval()
    agent_target_net.eval()
    adversary_optimizer = torch.optim.RMSprop(adversary_net.parameters())
    agent_optimizer = torch.optim.RMSprop(agent_net.parameters())
    container = Container(config)
    def update_targets():
        adversary_target_net.load_state_dict(adversary_net.state_dict())
        agent_target_net.load_state_dict(agent_net.state_dict())
    
    for episode_idx in range(config.n_episodes):
#         episode = run_episode(config, container, adversary_net, agent_net,
#                               should_render=episode_idx % config.report_interval == 0 and episode_idx > 0)
        episode = run_episode(config, container, adversary_net, agent_net, should_render=False, device=device)
        train_agents(config, episode, adversary_net, agent_net, adversary_target_net, agent_target_net,
                     adversary_optimizer, agent_optimizer, device=device)

        if episode_idx % config.update_target_interval == 0 and episode_idx > 0:
            update_targets()
        if episode_idx % config.report_interval == 0 and episode_idx > 0:
            print(f"on episode {episode_idx}")
            print(f"     loss: adversary {episode.loss.adversary}, agent {episode.loss.agent}")
            print(f"     reward: adversary {episode.reward.adversary}, agent {episode.reward.agent}")
        container.reset()
    
    return adversary_net, agent_net

adversary_net, agent_net = train(config)

hello?
Is cuda? True


  agent_idx   = torch.tensor(agent_idx, dtype=torch.int)
  observation = torch.tensor(observation, dtype=torch.float)


on episode 20
     loss: adversary 300.0, agent 5264.982421875
     reward: adversary 30.0, agent -1430.9673374470208
on episode 40
     loss: adversary 300.0, agent 393.92047119140625
     reward: adversary 30.0, agent -266.0359323966399
on episode 60
     loss: adversary 1500.0, agent 728.2037353515625
     reward: adversary 150.0, agent -264.1979001515051
on episode 80
     loss: adversary 0.0, agent 950.7318725585938
     reward: adversary 0.0, agent -512.0540965474062
on episode 100
     loss: adversary 300.0, agent 906.8951416015625
     reward: adversary 30.0, agent -570.8632940518224
on episode 120
     loss: adversary 0.0, agent 0.0
     reward: adversary 0.0, agent 0.0
on episode 140
     loss: adversary 600.0, agent 1315.36279296875
     reward: adversary 60.0, agent -532.1831796500637
on episode 160
     loss: adversary 0.0, agent 448.57940673828125
     reward: adversary 0.0, agent -317.907411321856
on episode 180
     loss: adversary 0.0, agent 113.54822540283203
     rew

In [38]:
print(f"on episode {episode_idx}")
print(f"     loss: adversary {episode.loss.adversary}, agent {episode.loss.agent}")
print(f"     reward: adversary {episode.reward.adversary}, agent {episode.reward.agent}")


NameError: name 'episode_idx' is not defined