In [11]:
from ray import tune, train, init, shutdown
from ray.tune.registry import register_env
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.stopper import MaximumIterationStopper

from environment.skyjo_env import env as skyjo_env
from ray.rllib.algorithms.callbacks import DefaultCallbacks
import logging
import numpy as np
import os
import json

from models.action_mask_model import TorchActionMaskModel

In [16]:
# Notebook-level logger. The level is set to INFO so that messages sent through
# logger.info() are not filtered out by this logger itself.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class RewardDecayCallback(DefaultCallbacks):
    """Anneals the ``action_reward_decay`` entry of the env config during training.

    The factor is ``1.0 - 0.005 * training_iteration`` and never drops below 0.05.
    """

    def on_train_result(self, *, algorithm, result, **kwargs):
        """Recompute the decay factor from the latest result and store it.

        Args:
            algorithm: Object whose ``config.env_config`` dict is updated in place.
            result: Training result; only ``result["training_iteration"]`` is read.
            **kwargs: Ignored.

        NOTE(review): only ``algorithm.config.env_config`` is written here.
        Whether environments that already exist pick up the new value is not
        visible from this notebook -- confirm before relying on the decay.
        """
        iteration = result["training_iteration"]
        linear_decay = 1.0 - iteration * 0.005
        # Clamp from below so the factor never reaches zero.
        action_reward_decay = max(0.05, linear_decay)

        env_config = algorithm.config.env_config
        env_config["action_reward_decay"] = action_reward_decay
        logger.info(action_reward_decay)


class SkyjoLoggingCallbacks(DefaultCallbacks):
    """Copies per-agent end-of-episode statistics from `info` into custom metrics."""

    def on_episode_end(self, *, worker, base_env, policies, episode, **kwargs):
        """
        Invoked once an episode has finished. For every agent of the
        episode the last `info` dict is inspected, and each tracked
        statistic present in it is stored in `episode.custom_metrics`
        under the name `<statistic>_<agent_id>`.
        """
        tracked_keys = ("final_sum_of_revealed_cards", "n_hidden_cards")

        for agent_id in episode.get_agents():
            last_info = episode.last_info_for(agent_id)
            if last_info is None:
                # Nothing was reported for this agent.
                continue
            for key in tracked_keys:
                if key in last_info:
                    episode.custom_metrics[f"{key}_{agent_id}"] = last_info[key]

# Keyword arguments for the Skyjo environment; env_creator unpacks this dict
# into the environment constructor.
skyjo_config = dict(
    num_players=3,
    score_penalty=2.0,
    observe_other_player_indirect=False,
    mean_reward=1.0,
    reward_refunded=10,
    final_reward=100,
    score_per_unknown=5.0,
    action_reward_decay=1.0,
    old_reward=False,
    render_mode="human",
)

# Base model settings built around the action-masking custom model.
# Optional extra key:
#   "fcnet_hiddens": [1024, 1024, 1024, 512, 512]
model_config = dict(
    custom_model=TorchActionMaskModel,
    fcnet_activation="tanh",
)

# Hyperparameter grid: three learning rates crossed with two activation
# functions for the custom model.
param_space = {
    "lr": tune.grid_search([0.0001, 0.001, 0.01]),
    "model": tune.grid_search(
        [{**model_config, "fcnet_activation": activation} for activation in ("relu", "tanh")]
    ),
}

def env_creator(config):
    """Create a Skyjo environment from `config` and wrap it in `PettingZooEnv`.

    Args:
        config: Mapping that is unpacked as keyword arguments into `skyjo_env`.

    Returns:
        The `PettingZooEnv` wrapper around the freshly created environment.
    """
    raw_env = skyjo_env(**config)
    return PettingZooEnv(raw_env)

# Make the environment constructible by name ("skyjo").
register_env("skyjo", env_creator)

# Build one throwaway instance purely to read the observation/action spaces.
test_env = env_creator(skyjo_config)
obs_space, act_space = test_env.observation_space, test_env.action_space

def policy_mapping_fn(agent_id, _, **kwargs):
    """Return the policy id for an agent: "policy_" followed by `str(agent_id)`.

    The second positional argument and any keyword arguments are ignored.
    """
    # Alternative considered earlier: int(agent_id.split("_")[-1])
    return f"policy_{agent_id!s}"

# PPO configuration: one separately-parameterised policy per Skyjo player.
#
# Changes relative to the previous version of this cell:
# * `.env_runners(num_env_runners=5)` was followed by the deprecated
#   `.rollouts(num_rollout_workers=20, num_envs_per_worker=1)`, which overrode it
#   (the recorded run reports 20 healthy workers). Both calls are merged into a
#   single `.env_runners(...)` call that keeps the effective value of 20.
# * The per-player policy specs are generated from `skyjo_config["num_players"]`
#   instead of being written out three times.
#   Assumes agent ids are the integers 0..num_players-1, as the previous
#   `obs_space[0]`, `obs_space[1]`, `obs_space[2]` indexing did -- TODO confirm.
config = (
    PPOConfig()
    # NOTE(review): the runs recorded in this notebook report vf_loss == 10.0 with
    # vf_explained_var close to 0. 10.0 is PPO's default `vf_clip_param`, so the
    # value loss looks fully clipped -- consider passing a larger `vf_clip_param`.
    .training()  # model=model_config is not passed here; "model" comes from param_space
    .environment("skyjo", env_config=skyjo_config)
    .framework("torch")
    .callbacks(SkyjoLoggingCallbacks)
    # .callbacks(RewardDecayCallback)
    .env_runners(num_env_runners=20, num_envs_per_env_runner=1)
    .resources(num_gpus=1)
    .multi_agent(
        policies={
            # (policy class, observation space, action space, per-policy config overrides)
            f"policy_{agent_idx}": (
                None,
                obs_space[agent_idx],
                act_space[agent_idx],
                {"entropy_coeff": 0.03},
            )
            for agent_idx in range(skyjo_config["num_players"])
        },
        policy_mapping_fn=policy_mapping_fn,
    )
    .evaluation(evaluation_num_env_runners=0)
    .debugging(log_level="INFO")
    .api_stack(
        enable_rl_module_and_learner=False,
        # enable_env_runner_and_connector_v2=True,
    )
)

# Experiment results are written to ./results relative to the current working directory.
storage_path = os.path.join(os.getcwd(), "results")

# Every trial stops after 100 training iterations.
run_config = train.RunConfig(
    stop=MaximumIterationStopper(100),
    storage_path=storage_path,
)

# Start from the full PPO config; entries of the search grid replace same-named keys.
tuner_param_space = dict(config.to_dict())
tuner_param_space.update(param_space)

tuner = tune.Tuner(
    trainable="PPO",
    param_space=tuner_param_space,
    run_config=run_config,
)



In [None]:
# Run label: 5, 20, 1, num_cpus_per_worker=2
# Presumably num_env_runners=5, num_rollout_workers=20, num_envs_per_worker=1 -- TODO confirm.
# NOTE(review): num_cpus_per_worker is not set anywhere in the visible config cell.
# Launches the sweep defined by `tuner`; the returned value is shown as the cell output.
tuner.fit()

0,1
Current time:,2025-01-05 19:08:12
Running for:,00:09:05.52
Memory:,24.2/31.3 GiB

Trial name,status,loc,lr,model,iter,total time (s),ts,num_healthy_workers,num_in_flight_async_ sample_reqs,num_remote_worker_re starts
PPO_skyjo_c1c38_00000,RUNNING,192.168.0.236:25895,0.0001,{'custom_model'_5080,93.0,518.482,372000.0,20.0,0.0,0.0
PPO_skyjo_c1c38_00001,PENDING,,0.001,{'custom_model'_a4c0,,,,,,
PPO_skyjo_c1c38_00002,PENDING,,0.01,{'custom_model'_9940,,,,,,
PPO_skyjo_c1c38_00003,PENDING,,0.0001,{'custom_model'_4200,,,,,,
PPO_skyjo_c1c38_00004,PENDING,,0.001,{'custom_model'_3280,,,,,,
PPO_skyjo_c1c38_00005,PENDING,,0.01,{'custom_model'_bd40,,,,,,


In [26]:
# Run label: 10, 10, 1
# Presumably num_env_runners=10, num_rollout_workers=10, num_envs_per_worker=1 -- TODO confirm.
# Launches the sweep defined by `tuner`; the returned value is shown as the cell output.
tuner.fit()

0,1
Current time:,2025-01-05 16:25:44
Running for:,00:28:51.35
Memory:,23.7/31.3 GiB

Trial name,status,loc,lr,iter,total time (s),ts,num_healthy_workers,num_in_flight_async_ sample_reqs,num_remote_worker_re starts
PPO_skyjo_4d094_00000,TERMINATED,192.168.0.236:365195,0.0001,100,560.817,400000,10,0,0
PPO_skyjo_4d094_00001,TERMINATED,192.168.0.236:367970,0.001,100,555.672,400000,10,0,0
PPO_skyjo_4d094_00002,TERMINATED,192.168.0.236:371106,0.01,100,564.558,400000,10,0,0


NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
2025-01-05 16:25:44,944	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/henry/Documents/SharedDocuments/Uni/TU/3.Semester/AdvRL/SkyjoAI/results/PPO_2025-01-05_15-56-35' in 0.0291s.
2025-01-05 16:25:45,136	INFO tune.py:1041 -- Total run time: 1731.56 seconds (1731.32 seconds for the tuning loop).


ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'policy_1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(2.4883952), 'cur_kl_coeff': np.float64(1.0125), 'cur_lr': np.float64(0.0001), 'total_loss': np.float64(9.908790166450268), 'policy_loss': np.float64(-0.08722403675712871), 'vf_loss': np.float64(10.0), 'vf_explained_var': np.float64(4.9208511005748404e-05), 'kl': np.float64(0.019722942662603403), 'entropy': np.float64(0.7985077326947992), 'entropy_coeff': np.float64(0.030000000000000006)}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': np.float64(120.63636363636364), 'num_grad_updates_lifetime': np.float64(32835.5), 'diff_num_grad_updates_vs_sampler_policy': np.float64(164.5)}, 'policy_0': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(1.8336459), 'cur_kl_coeff': np.float64(1.0125), 'cur_lr': np.float64(0.0001), 'total_loss': np.float64(9.84418970

In [15]:
# Run label: 5, 20, 1
# Presumably num_env_runners=5, num_rollout_workers=20, num_envs_per_worker=1 -- TODO confirm.
# Launches the sweep defined by `tuner`; the returned value is shown as the cell output.
tuner.fit()

0,1
Current time:,2025-01-05 15:23:37
Running for:,00:27:52.97
Memory:,26.9/31.3 GiB

Trial name,status,loc,lr,iter,total time (s),ts,num_healthy_workers,num_in_flight_async_ sample_reqs,num_remote_worker_re starts
PPO_skyjo_c267c_00000,TERMINATED,192.168.0.236:352221,0.0001,100,546.921,400000,20,0,0
PPO_skyjo_c267c_00001,TERMINATED,192.168.0.236:355087,0.001,100,536.238,400000,20,0,0
PPO_skyjo_c267c_00002,TERMINATED,192.168.0.236:357834,0.01,100,527.571,400000,20,0,0


2025-01-05 15:23:37,998	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/henry/ray_results/PPO_2025-01-05_14-55-44' in 0.0561s.
2025-01-05 15:23:38,467	INFO tune.py:1041 -- Total run time: 1673.44 seconds (1672.91 seconds for the tuning loop).


ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'policy_1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(2.1971877), 'cur_kl_coeff': np.float64(1.0125), 'cur_lr': np.float64(0.0001), 'total_loss': np.float64(9.89376881050341), 'policy_loss': np.float64(-0.0994886607032227), 'vf_loss': np.float64(10.0), 'vf_explained_var': np.float64(8.851127191023393e-05), 'kl': np.float64(0.0167132967919315), 'entropy': np.float64(0.7888238800294471), 'entropy_coeff': np.float64(0.030000000000000006)}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': np.float64(120.9090909090909), 'num_grad_updates_lifetime': np.float64(32835.5), 'diff_num_grad_updates_vs_sampler_policy': np.float64(164.5)}, 'policy_0': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(2.0731082), 'cur_kl_coeff': np.float64(1.0125), 'cur_lr': np.float64(0.0001), 'total_loss': np.float64(9.86859229116728