An alternative and simplified PredPreyGrass environment


In [1]:
from environments.predpreygrass_simple_env import MultiAgentArena
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.pre_checks.env import  check_env
from ray.tune.registry import register_env
from ray.rllib.policy.policy import PolicySpec
from ray.tune.logger import pretty_print
import ray
from ray import train, tune

import time

In [2]:
# Run RLlib's environment pre-check on a default-constructed MultiAgentArena
# before any algorithm is built on top of it.
check_env(MultiAgentArena())

self.observation_space Dict('agent1': MultiDiscrete([36 36]), 'agent2': MultiDiscrete([36 36]))
self.action_space Dict('agent1': Discrete(4), 'agent2': Discrete(4))


In [3]:
# Two-policy PPO setup on the MultiAgentArena class itself:
# "agent1" is mapped to "policy1", every other agent id to "policy2".
config = PPOConfig()
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(model={"fcnet_hiddens": [64, 64]})
config = config.environment(env=MultiAgentArena)
config = config.multi_agent(
    policies=["policy1", "policy2"],
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs: (
            "policy1" if agent_id == "agent1" else "policy2"
        )
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies built above (this cell
# never calls algo.train(), so the actions come from untrained policies).
env = MultiAgentArena(config={"render": True})
try:
    obs, _ = env.reset()
    episode_over = False

    while not episode_over:
        # Each agent is served by the policy its id is mapped to
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step(
            {"agent1": action1, "agent2": action2}
        )

        # The episode ends on termination as well as on truncation; stepping
        # an already terminated environment would be an error.
        episode_over = terminateds.get("__all__", False) or truncateds["__all__"]

        env.render()
        time.sleep(0.5)  # presumably paces the loop so each frame can be read
finally:
    # Release the algorithm's resources even if the rollout raised
    algo.stop()

________
|1.... |
|....  |
|.2..  |
|...   |
|      |
|      |
‾‾‾‾‾‾‾‾

R1=-3.0
R2=-3.9 (1 collisions)
Env timesteps=50/50


In [None]:
def env_creator(env_config):
    """Build a MultiAgentArena from the given environment config.

    Args:
        env_config: Passed through unchanged as the arena's ``config`` argument.

    Returns:
        A new MultiAgentArena instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena


# Expose the creator under a string id that PPOConfig.environment() can refer to
register_env("multi_agent_arena", env_creator)

# Same two-policy PPO setup, but the environment is referenced by the
# string id it was registered under instead of by class.
config = PPOConfig()
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(model={"fcnet_hiddens": [64, 64]})
config = config.environment(env="multi_agent_arena")
config = config.multi_agent(
    policies=["policy1", "policy2"],
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs: (
            "policy1" if agent_id == "agent1" else "policy2"
        )
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies built above (this cell
# never calls algo.train(), so the actions come from untrained policies).
env = MultiAgentArena(config={"render": True})
try:
    obs, _ = env.reset()
    episode_over = False

    while not episode_over:
        # Each agent is served by the policy its id is mapped to
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step(
            {"agent1": action1, "agent2": action2}
        )

        # The episode ends on termination as well as on truncation; stepping
        # an already terminated environment would be an error.
        episode_over = terminateds.get("__all__", False) or truncateds["__all__"]

        env.render()
        time.sleep(0.5)  # presumably paces the loop so each frame can be read
finally:
    # Release the algorithm's resources even if the rollout raised
    algo.stop()

In [None]:
# Define the policies in a dict of PolicySpec objects
def env_creator(env_config):
    """Build a MultiAgentArena from the given environment config.

    Args:
        env_config: Passed through unchanged as the arena's ``config`` argument.

    Returns:
        A new MultiAgentArena instance.
    """
    arena = MultiAgentArena(config=env_config)
    return arena


# Expose the creator under a string id that PPOConfig.environment() can refer to
register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy id
policies = dict(policy1=PolicySpec(), policy2=PolicySpec())

# PPO setup that takes its policies from the ``policies`` dict;
# "agent1" is mapped to "policy1", every other agent id to "policy2".
config = PPOConfig()
config = config.environment(env="multi_agent_arena")
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(model={"fcnet_hiddens": [64, 64]})
config = config.multi_agent(
    policies=policies,
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kwargs: (
            "policy1" if agent_id == "agent1" else "policy2"
        )
    ),
)

algo = config.build()

# Roll out one rendered episode with the policies built above (this cell
# never calls algo.train(), so the actions come from untrained policies).
env = MultiAgentArena(config={"render": True})
try:
    obs, _ = env.reset()
    episode_over = False

    while not episode_over:
        # Each agent is served by the policy its id is mapped to
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step(
            {"agent1": action1, "agent2": action2}
        )

        # The episode ends on termination as well as on truncation; stepping
        # an already terminated environment would be an error.
        episode_over = terminateds.get("__all__", False) or truncateds["__all__"]

        env.render()
        time.sleep(0.5)  # presumably paces the loop so each frame can be read
finally:
    # Release the algorithm's resources even if the rollout raised
    algo.stop()

In [None]:
# Define the policies in a dict of PolicySpec objects
# Pass width and height parameters to the environment

# Width and height handed to every arena created through the registry
MAX_WIDTH, MAX_HEIGHT = 100, 100


def env_creator(env_config):
    """Build a MAX_WIDTH x MAX_HEIGHT MultiAgentArena.

    The received config is printed first, then forwarded unchanged as the
    arena's ``config`` argument.

    Args:
        env_config: Environment config to print and forward.

    Returns:
        A new MultiAgentArena instance.
    """
    print(env_config)
    return MultiAgentArena(
        config=env_config,
        width=MAX_WIDTH,
        height=MAX_HEIGHT,
    )


register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy id
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Return the policy id for an agent.

    "agent1" is mapped to "policy1"; any other agent id is mapped to
    "policy2". ``episode``, ``worker`` and extra keyword arguments are
    accepted but not used.
    """
    return "policy1" if agent_id == "agent1" else "policy2"



# PPO setup on the registered arena, using the ``policies`` dict and the
# named ``policy_mapping_fn`` defined above.
config = PPOConfig()
config = config.environment(env="multi_agent_arena")
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(model={"fcnet_hiddens": [64, 64]})
config = config.multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)

algo = config.build()

# Roll out one rendered episode with the policies built above (this cell
# never calls algo.train(), so the actions come from untrained policies).
# The arena uses the same width/height as the arenas produced by env_creator,
# so the rollout runs on the environment the algorithm was configured for.
env = MultiAgentArena(config={"render": True}, width=MAX_WIDTH, height=MAX_HEIGHT)
try:
    obs, _ = env.reset()
    episode_over = False

    while not episode_over:
        # Each agent is served by the policy its id is mapped to
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step(
            {"agent1": action1, "agent2": action2}
        )

        # The episode ends on termination as well as on truncation; stepping
        # an already terminated environment would be an error.
        episode_over = terminateds.get("__all__", False) or truncateds["__all__"]

        env.render()
        time.sleep(0.5)  # presumably paces the loop so each frame can be read
finally:
    # Release the algorithm's resources even if the rollout raised
    algo.stop()

In [None]:
# Define the policies in a dict of PolicySpec objects
# Pass width and height parameters to the environment
# Train the policies on the environment with the algorithm

# Width and height handed to every arena created through the registry
MAX_WIDTH, MAX_HEIGHT = 10, 10


def env_creator(env_config):
    """Build a MAX_WIDTH x MAX_HEIGHT MultiAgentArena.

    The received config is printed first, then forwarded unchanged as the
    arena's ``config`` argument.

    Args:
        env_config: Environment config to print and forward.

    Returns:
        A new MultiAgentArena instance.
    """
    print(env_config)
    return MultiAgentArena(
        config=env_config,
        width=MAX_WIDTH,
        height=MAX_HEIGHT,
    )


register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy id
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Return the policy id for an agent.

    "agent1" is mapped to "policy1"; any other agent id is mapped to
    "policy2". ``episode``, ``worker`` and extra keyword arguments are
    accepted but not used.
    """
    return "policy1" if agent_id == "agent1" else "policy2"


# Configure PPO on the registered arena and build the algorithm in one go
algo = (
    PPOConfig()
    .environment(env="multi_agent_arena")
    .framework("torch")
    .rollouts(create_env_on_local_worker=True)
    .debugging(seed=0, log_level="ERROR")
    .training(model={"fcnet_hiddens": [64, 64]})
    .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)
).build()

# Ten training iterations; a checkpoint is written whenever the zero-based
# iteration index is a multiple of 5 (i.e. for i == 0 and i == 5).
for i in range(10):
    result = algo.train()
    print(pretty_print(result))

    if i % 5 != 0:
        continue

    checkpoint_dir = algo.save().checkpoint.path
    print(f"Checkpoint saved in directory {checkpoint_dir}")


config_env = {"render": True}

# Roll out one rendered episode with the policies trained above. The arena is
# built from config_env and the same MAX_WIDTH/MAX_HEIGHT that env_creator
# uses, so the rollout environment cannot drift from the training one.
env = MultiAgentArena(config=config_env, width=MAX_WIDTH, height=MAX_HEIGHT)
try:
    obs, _ = env.reset()
    episode_over = False

    while not episode_over:
        # Each agent is served by the policy its id is mapped to
        action1 = algo.compute_single_action(obs["agent1"], policy_id="policy1")
        action2 = algo.compute_single_action(obs["agent2"], policy_id="policy2")

        obs, rewards, terminateds, truncateds, infos = env.step(
            {"agent1": action1, "agent2": action2}
        )

        # The episode ends on termination as well as on truncation; stepping
        # an already terminated environment would be an error.
        episode_over = terminateds.get("__all__", False) or truncateds["__all__"]

        env.render()
        time.sleep(0.5)  # presumably paces the loop so each frame can be read
finally:
    # Release the algorithm's resources even if the rollout raised
    algo.stop()




In [None]:
# Define the policies in a dict of PolicySpec objects
# Pass width and height parameters to the environment
# Train the policies on the environment with the algorithm
# Use Tune for a grid search over the learning rate

# Width and height handed to every arena created through the registry
MAX_WIDTH, MAX_HEIGHT = 10, 10

# ignore_reinit_error lets this cell be re-run while Ray is already running
ray.init(ignore_reinit_error=True)


def env_creator(env_config):
    """Build a MAX_WIDTH x MAX_HEIGHT MultiAgentArena.

    The received config is printed first, then forwarded unchanged as the
    arena's ``config`` argument.

    Args:
        env_config: Environment config to print and forward.

    Returns:
        A new MultiAgentArena instance.
    """
    print(env_config)
    return MultiAgentArena(
        config=env_config,
        width=MAX_WIDTH,
        height=MAX_HEIGHT,
    )


register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy id
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Return the policy id for an agent.

    "agent1" is mapped to "policy1"; any other agent id is mapped to
    "policy2". ``episode``, ``worker`` and extra keyword arguments are
    accepted but not used.
    """
    return "policy1" if agent_id == "agent1" else "policy2"


# PPO config whose learning rate is a Tune grid search over three values
config = PPOConfig()
config = config.environment(env="multi_agent_arena")
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(
    model={"fcnet_hiddens": [64, 64]},
    lr=tune.grid_search([0.01, 0.001, 0.0001]),
)
config = config.multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)


# Run PPO over the search space; "episode_reward_mean" is the stopping
# criterion handed to Tune.
tuner = tune.Tuner(
    "PPO",
    param_space=config,
    run_config=train.RunConfig(stop={"episode_reward_mean": 15}),
)

tuner.fit()



In [None]:
# Define the policies in a dict of PolicySpec objects
# Pass width and height parameters to the environment
# Train the policies on the environment with the algorithm
# Use Tune for a grid search over the learning rate
# Retrieve the checkpoint of the best trial

# Width and height handed to every arena created through the registry
MAX_WIDTH, MAX_HEIGHT = 10, 10

# ignore_reinit_error lets this cell be re-run while Ray is already running
ray.init(ignore_reinit_error=True)


def env_creator(env_config):
    """Build a MAX_WIDTH x MAX_HEIGHT MultiAgentArena.

    The received config is printed first, then forwarded unchanged as the
    arena's ``config`` argument.

    Args:
        env_config: Environment config to print and forward.

    Returns:
        A new MultiAgentArena instance.
    """
    print(env_config)
    return MultiAgentArena(
        config=env_config,
        width=MAX_WIDTH,
        height=MAX_HEIGHT,
    )


register_env("multi_agent_arena", env_creator)

# One default PolicySpec per policy id
policies = {policy_id: PolicySpec() for policy_id in ("policy1", "policy2")}

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Return the policy id for an agent.

    "agent1" is mapped to "policy1"; any other agent id is mapped to
    "policy2". ``episode``, ``worker`` and extra keyword arguments are
    accepted but not used.
    """
    return "policy1" if agent_id == "agent1" else "policy2"


# PPO config whose learning rate is a Tune grid search over seven values
config = PPOConfig()
config = config.environment(env="multi_agent_arena")
config = config.framework("torch")
config = config.rollouts(create_env_on_local_worker=True)
config = config.debugging(seed=0, log_level="ERROR")
config = config.training(
    model={"fcnet_hiddens": [64, 64]},
    lr=tune.grid_search([0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]),
)
config = config.multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)


# Run PPO over the search space with "episode_reward_mean" as the stopping
# criterion, and request a checkpoint at the end of every trial.
# NOTE(review): no custom results directory is configured on RunConfig here,
# so Ray's default location applies — confirm the path for your Ray version.
tuner = tune.Tuner(
    "PPO",
    param_space=config,
    run_config=train.RunConfig(
        stop={"episode_reward_mean": 10},
        checkpoint_config=train.CheckpointConfig(checkpoint_at_end=True),
    ),
)

results = tuner.fit()

# Pick the trial with the highest episode_reward_mean ...
best_result = results.get_best_result(metric="episode_reward_mean", mode="max")

# ... and take the checkpoint attached to that trial's result.
best_checkpoint = best_result.checkpoint
