An alternative and simplified PredPreyGrass environment


In [1]:
from environments.predpreygrass_simple_env import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.pre_checks.env import check_env
from ray.tune.registry import register_env
from ray.rllib.policy.policy import PolicySpec
from ray.tune.logger import pretty_print
import ray
from ray import train, tune

import time
# check_env(MultiAgentArena)  # disabled: currently raises an error for this env

# Variant 1: hand the environment *class* straight to RLlib and declare the
# two policies by ID only.
config = (
    PPOConfig()
    .framework("torch")
    .rollouts(create_env_on_local_worker=True)
    .debugging(seed=0, log_level="ERROR")
    .training(model={"fcnet_hiddens": [64, 64]})
    .environment(env=MultiAgentArena)
    .multi_agent(
        policies=["policy1", "policy2"],
        # "agent1" is driven by "policy1"; every other agent ID falls back to "policy2".
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent1" else "policy2"
    )
)

algo = config.build()

# No training step is run in this cell: the rollout below simply queries the
# freshly built policies for actions and renders one episode.
try:
    env = MultiAgentArena(config={"render": True})
    obs, _ = env.reset()
    terminateds = {"__all__": False}
    truncateds = {"__all__": False}

    # An episode is over once it is terminated OR truncated. The original loop
    # only looked at truncation and would have kept stepping a terminated env.
    # `.get` is used because this notebook never shows whether the env fills
    # in terminateds["__all__"] -- TODO confirm against the env implementation.
    while not (terminateds.get("__all__", False) or truncateds["__all__"]):
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        # Pause between frames so the rendered output can be followed by eye.
        time.sleep(0.5)
finally:
    # Always release the algorithm's resources, even if the rollout raises.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [2]:
def env_creator(env_config):
    """Build a new ``MultiAgentArena`` for the env registry.

    Args:
        env_config: Passed through unchanged as the arena's ``config`` argument.

    Returns:
        The newly constructed ``MultiAgentArena`` instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena

# Variant 2: register the environment under a string name and refer to it by
# that name in the config instead of passing the class.
register_env("multi_agent_arena", env_creator)

config = (
    PPOConfig()
    .framework("torch")
    .rollouts(create_env_on_local_worker=True)
    .debugging(seed=0, log_level="ERROR")
    .training(model={"fcnet_hiddens": [64, 64]})
    .environment(env="multi_agent_arena")
    .multi_agent(
        policies=["policy1", "policy2"],
        # "agent1" is driven by "policy1"; every other agent ID falls back to "policy2".
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent1" else "policy2"
    )
)

algo = config.build()

# No training step is run in this cell: the rollout below simply queries the
# freshly built policies for actions and renders one episode.
try:
    env = MultiAgentArena(config={"render": True})
    obs, _ = env.reset()
    terminateds = {"__all__": False}
    truncateds = {"__all__": False}

    # An episode is over once it is terminated OR truncated. The original loop
    # only looked at truncation and would have kept stepping a terminated env.
    # `.get` is used because this notebook never shows whether the env fills
    # in terminateds["__all__"] -- TODO confirm against the env implementation.
    while not (terminateds.get("__all__", False) or truncateds["__all__"]):
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        # Pause between frames so the rendered output can be followed by eye.
        time.sleep(0.5)
finally:
    # Always release the algorithm's resources, even if the rollout raises.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [3]:
# Define the policies as a dict of PolicySpec objects instead of a list of policy IDs.
def env_creator(env_config):
    """Build a new ``MultiAgentArena`` for the env registry.

    Args:
        env_config: Passed through unchanged as the arena's ``config`` argument.

    Returns:
        The newly constructed ``MultiAgentArena`` instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena

# Variant 3: same registered environment, but the policies are declared as a
# dict mapping each policy ID to a default-constructed PolicySpec.
register_env("multi_agent_arena", env_creator)

policies = {"policy1": PolicySpec(), "policy2": PolicySpec()}

config = (
    PPOConfig()
    .environment(env="multi_agent_arena")
    .framework("torch")
    .rollouts(create_env_on_local_worker=True)
    .debugging(seed=0, log_level="ERROR")
    .training(model={"fcnet_hiddens": [64, 64]})
    .multi_agent(
        policies=policies,
        # "agent1" is driven by "policy1"; every other agent ID falls back to "policy2".
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent1" else "policy2"
    )
)

algo = config.build()

# No training step is run in this cell: the rollout below simply queries the
# freshly built policies for actions and renders one episode.
try:
    env = MultiAgentArena(config={"render": True})
    obs, _ = env.reset()
    terminateds = {"__all__": False}
    truncateds = {"__all__": False}

    # An episode is over once it is terminated OR truncated. The original loop
    # only looked at truncation and would have kept stepping a terminated env.
    # `.get` is used because this notebook never shows whether the env fills
    # in terminateds["__all__"] -- TODO confirm against the env implementation.
    while not (terminateds.get("__all__", False) or truncateds["__all__"]):
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step({"agent1": action1, "agent2": action2})

        env.render()
        # Pause between frames so the rendered output can be followed by eye.
        time.sleep(0.5)
finally:
    # Always release the algorithm's resources, even if the rollout raises.
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50
