In [12]:

import glob
import os
import time
import numpy as np
import time

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy, MlpPolicy
# from sb3_contrib import RecurrentPPO
# from stable_baselines3.common.evaluation import evaluate_policy
# from stable_baselines3 import DQN
# from stable_baselines3.dqn import CnnPolicy, MlpPolicy


from pettingzoo.mpe import simple_tag_v3


def train(env_fn, steps: int = 100, seed: int | None = 0, **env_kwargs):
    # Train a single model to play as each agent in an AEC environment(https://pettingzoo.farama.org/api/aec/#about-aec:~:text=In%20an%20AEC%20environment%2C)
    # initializes the environment in a parallel setting where all agents can act simultaneously.
    env = env_fn.parallel_env(**env_kwargs)

    env.reset(seed=seed)

    print(f"Starting training on {str(env.metadata['name'])}.")
    #
    env = ss.multiagent_wrappers.pad_observations_v0(env)
    # Converts the PettingZoo multi-agent environment into a vectorized environment, which makes it compatible with algorithms from Stable-Baselines3.
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    # pad_observations_v0 ensures that all agents have observations of the same size, 
    # making the data structure uniform and compatible with standard deep learning
    # models, which typically require fixed input sizes.
    env = ss.concat_vec_envs_v1(env, 8, num_cpus=1, base_class="stable_baselines3")

    # Model
    model = PPO("MlpPolicy", env, verbose=1, batch_size=256,)
#   model = RecurrentPPO("MlpLstmPolicy", env, verbose=1, batch_size=256,)
#   model = DQN("MlpPolicy", env, verbose=1, batch_size=256,)

    # Train
    model.learn(total_timesteps=steps)

    model.save(f"{env.unwrapped.metadata.get('name')}_{time.strftime('%Y%m%d-%H%M%S')}")

    print("Model has been saved.")

    print(f"Finished training on {str(env.unwrapped.metadata['name'])}.")

    env.close()

In [8]:
# Set render_mode to None to reduce training time
#https://pettingzoo.farama.org/environments/mpe/simple_tag/#simple-tag
env_kwargs = dict(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=25, continuous_actions=False)
env_fn = simple_tag_v3
train(env_fn, steps=100, seed=0, render_mode=None, **env_kwargs)

Starting training on simple_tag_v3.
Using cpu device
------------------------------
| time/              |       |
|    fps             | 10080 |
|    iterations      | 1     |
|    time_elapsed    | 6     |
|    total_timesteps | 65536 |
------------------------------
Model has been saved.
Finished training on simple_tag_v3.


In [22]:

def eval(env_fn, num_games: int = 100, render_mode: str | None = None, **env_kwargs):
    # Evaluate a trained agent vs a random agent
    env = env_fn.env(render_mode=render_mode, **env_kwargs)
    print(F"env: {env}")
    print(
        f"\nStarting evaluation on {str(env.metadata['name'])} (num_games={num_games}, render_mode={render_mode})"
    )

    try:
        latest_policy = max(
            #Searches for the most recently saved model file with a name that matches the environment.
            glob.glob(f"{env.metadata['name']}*.zip"), key=os.path.getctime
        )
    except ValueError:
        print("Policy not found.")
        exit(0)
        
    model = PPO.load(latest_policy)
#     model = RecurrentPPO.load(latest_policy)
#     model = DQN.load(latest_policy)

    rewards = {agent: 0 for agent in env.possible_agents}

    # Note: we evaluate here using an AEC environments, to allow for easy A/B testing against random policies
    # For example, we can see here that using a random agent for archer_0 results in less points than the trained agent
    for i in range(num_games):
        env.reset(seed=i)
        env.action_space(env.possible_agents[0]).seed(i)
        
        for agent in env.agent_iter():
            print(env.last())
            # it obtains the last observation and reward using env.last()
            obs, reward, termination, truncation, info = env.last()
            print(F"obs:{obs}")
            print(F"reward:{reward}")
            print(F"termination:{termination}")
            print(F"truncation:{truncation}")
            print(F"info:{info}")
            if agent == 'agent_0':
                obs=np.append(obs, [0,0])
            #print(obs)
            if render_mode== 'human':
                time.sleep(0.01)
            for agent in env.agents:
                rewards[agent] += env.rewards[agent]

            if termination or truncation:
                break
            else:
                if agent == env.possible_agents[0]:
                    # selects an action randomly from its action space using the sample() method
                    act = env.action_space(agent).sample()
                else:
                    # predict the action based on the current observation (obs)
                    # deterministic=True parameter ensures that the model chooses the action that it believes is best
                    act = model.predict(obs, deterministic=True)[0]
            env.step(act)
    env.close()

    avg_reward = sum(rewards.values()) / len(rewards.values())
    avg_reward_per_agent = {
        agent: rewards[agent] / num_games for agent in env.possible_agents
    }
    print(f"Avg reward: {avg_reward}")
    print("Avg reward per agent, per game: ", avg_reward_per_agent)
    print("Full rewards: ", rewards)
    return avg_reward


In [23]:
eval(env_fn, num_games=10, render_mode=None, **env_kwargs)

env: simple_tag_v3

Starting evaluation on simple_tag_v3 (num_games=10, render_mode=None)
(array([ 0.        ,  0.        ,  0.27392337, -0.46042657, -0.19539839,
        1.243557  ,  0.29461303, -0.43464413, -1.1919763 , -0.5065182 ,
        0.3526171 ,  1.2859378 , -0.06065182,  0.9194197 ,  0.        ,
        0.        ], dtype=float32), 0.0, False, False, {})
obs:[ 0.          0.          0.27392337 -0.46042657 -0.19539839  1.243557
  0.29461303 -0.43464413 -1.1919763  -0.5065182   0.3526171   1.2859378
 -0.06065182  0.9194197   0.          0.        ]
reward:0.0
termination:False
truncation:False
info:{}
(array([ 0.        ,  0.        , -0.918053  , -0.96694475,  0.9965779 ,
        1.7500751 ,  1.4865893 ,  0.07187403,  1.1919763 ,  0.5065182 ,
        1.5445935 ,  1.7924559 ,  1.1313245 ,  1.4259379 ,  0.        ,
        0.        ], dtype=float32), 0.0, False, False, {})
obs:[ 0.          0.         -0.918053   -0.96694475  0.9965779   1.7500751
  1.4865893   0.07187403  1.1

214.62902901516796

import glob
import os
import time
import numpy as np
import time

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy, MlpPolicy
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy

from pettingzoo.mpe import simple_tag_v3
env_fn = simple_tag_v3
print(env_fn)

In [24]:
env_kwargs = dict(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=100, continuous_actions=False )
eval(env_fn, num_games=10,  render_mode='human',**env_kwargs)

env: simple_tag_v3

Starting evaluation on simple_tag_v3 (num_games=10, render_mode=human)
(array([ 0.        ,  0.        ,  0.27392337, -0.46042657, -0.19539839,
        1.243557  ,  0.29461303, -0.43464413, -1.1919763 , -0.5065182 ,
        0.3526171 ,  1.2859378 , -0.06065182,  0.9194197 ,  0.        ,
        0.        ], dtype=float32), 0.0, False, False, {})
obs:[ 0.          0.          0.27392337 -0.46042657 -0.19539839  1.243557
  0.29461303 -0.43464413 -1.1919763  -0.5065182   0.3526171   1.2859378
 -0.06065182  0.9194197   0.          0.        ]
reward:0.0
termination:False
truncation:False
info:{}
(array([ 0.        ,  0.        , -0.918053  , -0.96694475,  0.9965779 ,
        1.7500751 ,  1.4865893 ,  0.07187403,  1.1919763 ,  0.5065182 ,
        1.5445935 ,  1.7924559 ,  1.1313245 ,  1.4259379 ,  0.        ,
        0.        ], dtype=float32), 0.0, False, False, {})
obs:[ 0.          0.         -0.918053   -0.96694475  0.9965779   1.7500751
  1.4865893   0.07187403  1.

: 