An alternative and simplified PredPreyGrass environment


In [1]:
! pip install "ray[rllib]"==2.9.3

Collecting ray==2.9.3 (from ray[rllib]==2.9.3)
  Using cached ray-2.9.3-cp311-cp311-manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gymnasium==0.28.1 (from ray[rllib]==2.9.3)
  Using cached gymnasium-0.28.1-py3-none-any.whl.metadata (9.2 kB)
Using cached ray-2.9.3-cp311-cp311-manylinux2014_x86_64.whl (65.4 MB)
Using cached gymnasium-0.28.1-py3-none-any.whl (925 kB)
Installing collected packages: gymnasium, ray
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 0.29.1
    Uninstalling gymnasium-0.29.1:
      Successfully uninstalled gymnasium-0.29.1
  Attempting uninstall: ray
    Found existing installation: ray 2.3.0
    Uninstalling ray-2.3.0:
      Successfully uninstalled ray-2.3.0
Successfully installed gymnasium-0.28.1 ray-2.9.3


In [2]:
from multi_agent_env_ray_2_9_3 import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
import time

# PPO setup with two separate policies; the environment class is passed directly.
config = PPOConfig()
config.framework("torch")
config.rollouts(create_env_on_local_worker=True)
config.debugging(seed=0, log_level="ERROR")
config.training(model={"fcnet_hiddens": [64, 64]})
config.environment(env=MultiAgentArena)
config.multi_agent(
    policies=["policy1", "policy2"],
    # "agent1" is driven by "policy1"; every other agent id maps to "policy2".
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs:
            "policy1" if agent_id == "agent1" else "policy2"
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies as built above
# (no training step is run in this cell).
env = MultiAgentArena(config={"render": True})
obs, _ = env.reset()
# env.step() returns separate termination and truncation dicts; track both so
# the loop also ends when the episode is cut off rather than terminated.
dones = {"__all__" : False}
truncateds = {"__all__" : False}

try:
    while not (dones["__all__"] or truncateds.get("__all__", False)):
        # Each agent's action comes from its own policy.
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, dones, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        time.sleep(0.5)  # pause so each rendered frame stays visible
finally:
    # Stop the algorithm even if the rollout raises or is interrupted.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [3]:
# Register the environment under a string ID and refer to it by that ID in the config
from multi_agent_env_ray_2_9_3 import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
import time

from ray.tune.registry import register_env

def env_creator(env_config):
    """Create a new ``MultiAgentArena`` configured with ``env_config``.

    Args:
        env_config: Configuration forwarded unchanged as the arena's ``config``.

    Returns:
        The newly constructed ``MultiAgentArena`` instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena

# Expose the creator under the string ID used by the config below.
register_env("multi_agent_arena", env_creator)

# PPO setup with two separate policies; the environment is referenced by its ID.
config = PPOConfig()
config.framework("torch")
config.rollouts(create_env_on_local_worker=True)
config.debugging(seed=0, log_level="ERROR")
config.training(model={"fcnet_hiddens": [64, 64]})
config.environment(env="multi_agent_arena")
config.multi_agent(
    policies=["policy1", "policy2"],
    # "agent1" is driven by "policy1"; every other agent id maps to "policy2".
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs:
            "policy1" if agent_id == "agent1" else "policy2"
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies as built above
# (no training step is run in this cell).
env = MultiAgentArena(config={"render": True})
obs, _ = env.reset()
# env.step() returns separate termination and truncation dicts; track both so
# the loop also ends when the episode is cut off rather than terminated.
dones = {"__all__" : False}
truncateds = {"__all__" : False}

try:
    while not (dones["__all__"] or truncateds.get("__all__", False)):
        # Each agent's action comes from its own policy.
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, dones, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        time.sleep(0.5)  # pause so each rendered frame stays visible
finally:
    # Stop the algorithm even if the rollout raises or is interrupted.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [4]:
# Define the policies explicitly as a dict mapping policy IDs to PolicySpec objects
from multi_agent_env_ray_2_9_3 import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env

import time


def env_creator(env_config):
    """Create a new ``MultiAgentArena`` configured with ``env_config``.

    Args:
        env_config: Configuration forwarded unchanged as the arena's ``config``.

    Returns:
        The newly constructed ``MultiAgentArena`` instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena

# Expose the creator under the string ID used by the config below.
register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy ID.
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}


# PPO setup: environment referenced by its ID, policies taken from the dict above.
config = PPOConfig()
config.environment(env="multi_agent_arena")
config.framework("torch")
config.rollouts(create_env_on_local_worker=True)
config.debugging(seed=0, log_level="ERROR")
config.training(model={"fcnet_hiddens": [64, 64]})
config.multi_agent(
    policies=policies,
    # "agent1" is driven by "policy1"; every other agent id maps to "policy2".
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs:
            "policy1" if agent_id == "agent1" else "policy2"
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies as built above
# (no training step is run in this cell).
env = MultiAgentArena(config={"render": True})
obs, _ = env.reset()
# env.step() returns separate termination and truncation dicts; track both so
# the loop also ends when the episode is cut off rather than terminated.
dones = {"__all__" : False}
truncateds = {"__all__" : False}

try:
    while not (dones["__all__"] or truncateds.get("__all__", False)):
        # Each agent's action comes from its own policy.
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, dones, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        time.sleep(0.5)  # pause so each rendered frame stays visible
finally:
    # Stop the algorithm even if the rollout raises or is interrupted.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [5]:
# Define the policy mapping as a named function instead of an inline lambda
from multi_agent_env_ray_2_9_3 import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env

import time


def env_creator(env_config):
    """Create a new ``MultiAgentArena`` configured with ``env_config``.

    Args:
        env_config: Configuration forwarded unchanged as the arena's ``config``.

    Returns:
        The newly constructed ``MultiAgentArena`` instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena

# Expose the creator under the string ID used by the config below.
register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy ID.
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Return the ID of the policy that acts for ``agent_id``.

    Named-function form of the inline mapping
    ``lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent1" else "policy2"``.

    Args:
        agent_id: ID of the agent to map; the only argument that is inspected.
        episode: Accepted but not used.
        worker: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        ``"policy1"`` when ``agent_id`` is ``"agent1"``, otherwise ``"policy2"``.
    """
    return "policy1" if agent_id == "agent1" else "policy2"


# PPO setup: environment referenced by its ID, with the policy dict and the
# named mapping function defined earlier in this cell.
config = PPOConfig()
config.environment(env="multi_agent_arena")
config.framework("torch")
config.rollouts(create_env_on_local_worker=True)
config.debugging(seed=0, log_level="ERROR")
config.training(model={"fcnet_hiddens": [64, 64]})
config.multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)

algo = config.build()
# Roll out one rendered episode with the policies as built above
# (no training step is run in this cell).
env = MultiAgentArena(config={"render": True})
obs, _ = env.reset()
# env.step() returns separate termination and truncation dicts; track both so
# the loop also ends when the episode is cut off rather than terminated.
dones = {"__all__" : False}
truncateds = {"__all__" : False}

try:
    while not (dones["__all__"] or truncateds.get("__all__", False)):
        # Each agent's action comes from its own policy.
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, dones, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        time.sleep(0.5)  # pause so each rendered frame stays visible
finally:
    # Stop the algorithm even if the rollout raises or is interrupted.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50
