*** TO DO FOR CATAN: ***
RAINBOW: 
    1. vs Random
    2. vs Weighted Random
    3. vs MCTS
    4. vs Victory Point
    5. vs AlphaBeta
Masked PPO the same 
NFSP 
MuZero

MUZERO

In [1]:
# New SMALLEST SEARCH SPACE, IMPROVED
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_onehot as action_function
from torch.optim import Adam, SGD

# Hyperopt search space for the Catan MuZero sweep.
#
# Most entries are deliberately pinned to one value: either a single-option
# `hp.choice`, or a `quniform` whose upper bound is `low + 1e-8` so that the
# quantised sample always lands on `low`. Only the remaining entries are
# actually searched.
#
# NOTE(review): the size estimate below appears to predate the current space
# (for example `num_simulations` alone spans six values, and no factor of 6
# appears in the product) - recount before relying on it.
# size = 5 * 1 * 1 * 4.0 * 3 * 2.0 * 5 * 1 * 1 = 600

search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Nested choice: each option carries its own optimizer-specific settings,
    # which `prep_params` later flattens into top-level keys.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 8, 8 + 1e-8, 2)),
                "adam_epsilon": hp.choice("adam_epsilon", [1e-8]),
                # Exponent pinned to 3, i.e. a learning rate of 1e-3.
                "adam_learning_rate": 10
                ** (-hp.quniform("adam_learning_rate", 3, 3 + 1e-8, 1)),
            },
            # {
            #     "optimizer": "sgd",
            #     "momentum": hp.choice("momentum", [0.0, 0.9]),
            #     "sgd_learning_rate": 10 ** (-hp.quniform("sgd_learning_rate", 1, 3, 1)),
            # },
        ],
    ),
    "conv_layers": hp.choice("conv_layers", [[]]),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # "residual_filters": scope.int(
    #     hp.qloguniform("residual_filters", np.log(24), np.log(24) + 1e-8, 8)
    # ),
    # "residual_stacks": scope.int(
    #     hp.qloguniform("residual_stacks", np.log(1), np.log(4), 1)
    # ),
    "residual_layers": hp.choice("residual_layers", [[]]),
    "actor_conv_layers": hp.choice("actor_conv_layers", [[]]),
    "critic_conv_layers": hp.choice("critic_conv_layers", [[]]),
    "reward_conv_layers": hp.choice("reward_conv_layers", [[]]),
    # Either 0 or 16; `prep_params` turns 0 into "no hidden layer" for the
    # actor/critic/reward heads.
    "output_layer_widths": scope.int(hp.quniform("output_layer_widths", 0, 16, 16)),
    # Width pinned to 128; `prep_params` repeats it `dense_layers` times.
    "dense_layer_width": scope.int(
        hp.quniform("dense_layer_width", 128, 128 + 1e-8, 128)
    ),
    "dense_layers": scope.int(hp.quniform("dense_layers", 1, 3, 1)),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    # Exponent pinned to -1, i.e. alpha = 0.5.
    "root_dirichlet_alpha": 2
    ** (hp.quniform("root_dirichlet_alpha", -1, -1 + 1e-8, 1.0)),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # 800 * 2**k for integer k in [-5, 0], i.e. 25 up to 800 simulations.
    "num_simulations": scope.int(800 * 2 ** hp.quniform("num_simulations", -5, 0, 1)),
    # A one-element list wrapping the sampled value (16, 24 or 32).
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 16, 32, 8))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(35000), np.log(45000), 10000)
    ),
    # Powers of two: 8, 16 or 32.
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 5, 1))),
    # Exponent pinned to 2, i.e. 100 transitions.
    "min_replay_buffer_size": scope.int(
        10 ** hp.quniform("min_replay_buffer_size", 2, 2 + 1e-8, 1)
    ),
    # Exponent 5 or 6, i.e. 1e5 or 1e6 transitions.
    "replay_buffer_size": scope.int(
        10 ** (hp.quniform("replay_buffer_size", 5, 6 + 1e-8, 1))
    ),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [1000]),
    "clipnorm": hp.choice(
        # "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clip_val", 0, 2, 1)))]
        "clipnorm",
        [0.0],
    ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Nested choice, flattened by `prep_params`; only the multi-process option
    # is currently enabled.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 2, 4 + 1e-8, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No warm-start configurations are supplied to the search.
initial_best_config = []

# Both names are rebound to whatever `save_search_space` returns.
# NOTE(review): presumably it also writes them to disk - confirm in
# hyperparameter_optimization.hyperopt.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)


def prep_params(params):
    """Flatten one sampled search-space point into flat agent settings.

    The dictionary is modified in place and also returned. The rewrites are:

    * ``dense_layer_width`` / ``dense_layers`` become ``dense_layer_widths``
      (the width repeated once per layer).
    * ``output_layer_widths`` becomes ``actor_``/``critic_``/``reward_``
      ``dense_layer_widths``: a one-element list, or empty when the width is 0.
    * The nested ``multi_process`` choice becomes a plain boolean plus either
      ``num_workers`` or ``games_per_generation``.
    * The nested ``optimizer`` choice becomes the optimizer class,
      ``learning_rate`` and the optimizer-specific extra setting.
    * A dict-valued ``clipnorm`` is replaced by its ``"clipval"`` entry.
    * ``support_range`` is always set to ``None``.
    """
    # Shared dense trunk.
    trunk_width, trunk_depth = params["dense_layer_width"], params["dense_layers"]
    params["dense_layer_widths"] = [trunk_width] * trunk_depth
    for consumed_key in ("dense_layer_width", "dense_layers"):
        del params[consumed_key]

    # Output heads all share the same (optional) hidden layer width.
    head_width = params["output_layer_widths"]
    for head in ("actor", "critic", "reward"):
        params[f"{head}_dense_layer_widths"] = [head_width] if head_width != 0 else []
    del params["output_layer_widths"]

    # Self-play mode: lift the nested setting to the top level.
    mp_choice = params["multi_process"]
    use_workers = bool(mp_choice["multi_process"] == True)
    if use_workers:
        params["num_workers"] = mp_choice["num_workers"]
    else:
        params["games_per_generation"] = mp_choice["games_per_generation"]
    params["multi_process"] = use_workers

    # Optimizer: (name, class, extra setting copied verbatim, learning-rate key).
    opt_choice = params["optimizer"]
    optimizer_specs = (
        ("adam", Adam, "adam_epsilon", "adam_learning_rate"),
        ("sgd", SGD, "momentum", "sgd_learning_rate"),
    )
    for opt_name, opt_class, extra_key, lr_key in optimizer_specs:
        if opt_choice["optimizer"] == opt_name:
            params[extra_key] = opt_choice[extra_key]
            params["learning_rate"] = opt_choice[lr_key]
            params["optimizer"] = opt_class
            break

    clip_setting = params["clipnorm"]
    if isinstance(clip_setting, dict):
        params["clipnorm"] = clip_setting["clipval"]

    params["support_range"] = None
    return params

In [2]:
from game_configs.catan_config import CatanConfig
import torch
from custom_gym_envs.envs.catan import (
    env as catan_env,
    CatanAECEnv,
)


def play_game(player1, player2, max_steps=1000, temperature=0.05):
    """Play one Catan game between two agents and return player_0's reward.

    A fresh environment is built with ``CatanConfig().make_env()``. On every
    turn the acting seat is looked up as the index of ``env.agent_selection``
    in ``env.agents``: seat 0 is driven by ``player1`` and any other seat by
    ``player2``.

    Args:
        player1: Agent exposing ``predict(state, info, env=..., temperature=...)``
            and ``select_actions(prediction, info)``; controls seat 0.
        player2: Agent with the same interface; controls every other seat.
        max_steps: Safety limit on the number of environment steps
            (default 1000, the previously hard-coded value).
        temperature: Temperature forwarded to ``predict`` (default 0.05, the
            previously hard-coded value).

    Returns:
        ``env.rewards["player_0"]`` as reported by the environment when the
        loop stops (game over or step limit reached).
    """
    env = CatanConfig().make_env()
    with torch.no_grad():  # No gradient computation during testing
        env.reset()
        state, reward, termination, truncation, info = env.last()
        done = termination or truncation

        episode_length = 0
        while not done and episode_length < max_steps:
            # Resolve the acting seat only while the game is still running, so
            # no lookup is attempted after the final step.
            current_player = env.agents.index(env.agent_selection)
            actor = player1 if current_player == 0 else player2

            prediction = actor.predict(
                state, info, env=env, temperature=temperature
            )
            action = actor.select_actions(prediction, info).item()

            env.step(action)
            state, reward, termination, truncation, info = env.last()
            done = termination or truncation
            episode_length += 1

        print(env.rewards)
        return env.rewards["player_0"]

In [3]:
from agents.catan_player_wrapper import CatanPlayerWrapper
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig

# File names for a pickled search space / starting config. In the visible part
# of this notebook they are only referenced by the commented-out reload below.
search_space_path, initial_best_config_path = (
    "search_space.pkl",
    "best_config.pkl",
)
# search_space = pickle.load(open(search_space_path, "rb"))
# initial_best_config = pickle.load(open(initial_best_config_path, "rb"))
# Prefix for this sweep's artefacts (used for the saved trials file).
file_name = "catan_muzero"
# Trial budget for a fresh run; overwritten further down when saved trials exist.
max_trials = 64
trials_step = 24  # how many additional trials to do after loading the last ones

from catanatron import Game, RandomPlayer, Color
from catanatron.players.mcts import MCTSPlayer
from catanatron.players.minimax import AlphaBetaPlayer
from catanatron.players.playouts import GreedyPlayoutsPlayer
from catanatron.players.search import VictoryPointPlayer
from catanatron.players.weighted_random import WeightedRandomPlayer
from catanatron.players.value import ValueFunctionPlayer

import pandas as pd
import random
from tqdm import tqdm
import sys
import dill as pickle

sys.path.append("../../")
from elo.elo import StandingsTable

games_per_pair = 10

# Reuse a previously pickled player list and standings table when available,
# otherwise start from an empty table.
# NOTE(review): the file names say "tictactoe" although this is the Catan
# notebook - confirm these are the intended files.
try:
    # Context managers close the handles even when unpickling fails.
    with open("./tictactoe_players.pkl", "rb") as players_file:
        players = pickle.load(players_file)
    with open("./tictactoe_table.pkl", "rb") as table_file:
        table = pickle.load(table_file)
    print(table.bayes_elo())
    print(table.get_win_table())
    print(table.get_draw_table())
except Exception:
    # Best-effort load: any failure (missing file, stale pickle, ...) falls
    # back to a clean slate. `Exception` rather than a bare `except` so that
    # Ctrl-C / SystemExit are not swallowed.
    players = []
    table = StandingsTable([], start_elo=1000)

# Register the sweep-wide evaluation settings before the search starts.
# NOTE(review): presumably `marl_objective` reads this config for every trial -
# confirm in hyperparameter_optimization.hyperopt.
set_marl_config(
    MarlHyperoptConfig(
        file_name=file_name,
        eval_method="test_agents_elo",
        best_agent=CatanPlayerWrapper(AlphaBetaPlayer, Color.WHITE),
        # Bound method of a throwaway CatanConfig instance, used as env factory.
        make_env=CatanConfig().make_env,
        prep_params=prep_params,
        agent_class=MuZeroAgent,
        agent_config=MuZeroConfig,
        game_config=CatanConfig,
        # Literal value; the notebook-level `games_per_pair` (10) is not used here.
        games_per_pair=500,
        num_opps=1,  # not used
        table=table,  # not used
        play_game=play_game,
        checkpoint_interval=100,
        test_interval=1000,
        test_trials=10,
        # Fixed opponents; `test_agent_weights` lists one weight per opponent,
        # in the same order.
        test_agents=[
            CatanPlayerWrapper(RandomPlayer, Color.WHITE),
            CatanPlayerWrapper(AlphaBetaPlayer, Color.WHITE),
        ],
        test_agent_weights=[1.0, 2.0],
        device="cpu",
    )
)

# Resume from a saved hyperopt Trials object when one exists, extending the
# budget by `trials_step`; otherwise start a fresh search.
try:
    # Context manager so the handle is closed even when unpickling fails.
    with open(f"./{file_name}_trials.p", "rb") as trials_file:
        trials = pickle.load(trials_file)
    print("Found saved Trials! Loading...")
    max_trials = len(trials.trials) + trials_step
    print(
        "Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step
        )
    )
except Exception:
    # Best-effort load: any failure means "no usable history". `Exception`
    # rather than a bare `except` so Ctrl-C / SystemExit still propagate.
    print("No saved Trials! Starting from scratch.")
    trials = None

# Run the hyperparameter search. Trials are written to `trials_save_file`,
# which is the same path the resume logic above tries to load.
best = fmin(
    fn=marl_objective,  # Objective Function to optimize
    space=search_space,  # Hyperparameter's Search Space
    algo=atpe.suggest,  # Optimization algorithm (adaptive TPE)
    max_evals=max_trials,  # Number of optimization attempts
    trials=trials,  # Record the results
    # early_stop_fn=no_progress_loss(5, 1),
    trials_save_file=f"./{file_name}_trials.p",
    points_to_evaluate=initial_best_config,
    show_progressbar=False,
)
print(best)
# `best` holds raw per-label assignments; space_eval maps them back onto the
# structure of `search_space`.
best_trial = space_eval(search_space, best)
# gc.collect()

No saved Trials! Starting from scratch.
Params:  {'action_function': <function action_as_onehot at 0x314d30280>, 'actor_conv_layers': (), 'clip_low_prob': 0.0, 'clipnorm': 0.0, 'conv_layers': (), 'critic_conv_layers': (), 'dense_layer_width': 128, 'dense_layers': 1, 'kernel_initializer': 'glorot_normal', 'known_bounds': (-1, 1), 'lr_ratio': inf, 'min_replay_buffer_size': 100, 'minibatch_size': 16, 'multi_process': {'multi_process': True, 'num_workers': 2}, 'n_step': 1000, 'noisy_sigma': 0.0, 'num_simulations': 200, 'optimizer': {'adam_epsilon': 1e-08, 'adam_learning_rate': 0.001, 'optimizer': 'adam'}, 'output_layer_widths': 16, 'pb_c_base': 19652, 'pb_c_init': 1.25, 'per_alpha': 0.0, 'per_beta': 0.0, 'per_beta_final': 0.0, 'per_epsilon': 0.0001, 'policy_loss_function': <utils.utils.CategoricalCrossentropyLoss object at 0x314d236d0>, 'replay_buffer_size': 100000, 'residual_layers': (), 'reward_conv_layers': (), 'reward_loss_function': <utils.utils.MSELoss object at 0x314d23580>, 'root_d

  f"Shape: {param.shape}, std: {param.std():.4f}, mean: {param.mean():.4f}\n"


Initializing stat 'score' with subkeys None
Initializing stat 'policy_loss' with subkeys None
Initializing stat 'value_loss' with subkeys None
Initializing stat 'reward_loss' with subkeys None
Initializing stat 'loss' with subkeys None
Initializing stat 'test_score' with subkeys ['score', 'max_score', 'min_score']
Initializing stat 'test_score_vs_RandomPlayer' with subkeys ['score', 'player_0_score', 'player_1_score', 'player_0_win%', 'player_1_win%']
Initializing stat 'test_score_vs_AlphaBetaPlayer' with subkeys ['score', 'player_0_score', 'player_1_score', 'player_0_win%', 'player_1_win%']
[Worker 0] Starting self-play...
[Worker 1] Starting self-play...
Move 1Move
 1
Move 2
Move 2
Move 3
Move 3
Move 4
Move 4
Move 5
Move 5
Move 6
Move 6
Move 7
Move 7
Move 8
Move 8
Move 9
Move 9
Move 10
Move 10
Move 11
Move 11
Move 12
Move 12
Move 13
Move 14
Move 13
Move 15
Move 14
Move 16


Process Process-2:
Process Process-1:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/jonathanlamontange-kratz/Documents/GitHub/rl-stuff/experiments/catan/../../muzero/muzero_agent_torch.py", line 217, in worker_fn
    score, num_steps = self.play_game(env=worker_env)
  File "/Users/jonathanlamontange-kratz/Documents/GitHub/rl-stuff/experiments/catan/../../muzero/muzero_agent_torch.py", line 702, in play_game
    prediction = self.predict(
  File "/Users/jonathanlamontange-kratz/Documents/GitHub/rl-stuff/experiments/catan/../../muzero/muzero_agent_torch.py", line 637, in predict
    value, visit_counts = self.monte_car

KeyboardInterrupt: 

RAINBOW

In [None]:
import sys

# sys.path.append("/content/rl-research")
sys.path.append("../..")
import gymnasium as gym
import torch

from wrappers import CatanatronWrapper
from utils import CategoricalCrossentropyLoss, KLDivergenceLoss


from dqn.rainbow.rainbow_agent import RainbowAgent
from agent_configs import RainbowConfig
from game_configs.catan_config import SinglePlayerCatanConfig
from catanatron import Game, RandomPlayer, Color
from catanatron.players.mcts import MCTSPlayer
from catanatron.players.minimax import AlphaBetaPlayer
from catanatron.players.playouts import GreedyPlayoutsPlayer
from catanatron.players.search import VictoryPointPlayer
from catanatron.players.weighted_random import WeightedRandomPlayer
from catanatron.players.value import ValueFunctionPlayer

# Rainbow DQN settings for single-player Catan against a random opponent.
config_dict = {
    "dense_layer_widths": [256, 256, 256, 256],
    # NOTE(review): the two keys below are spelled "..._hidden_layers_widths"
    # and "advatage..."; the NFSP config in this notebook uses
    # "value_hidden_layer_widths" / "advantage_hidden_layer_widths". Confirm
    # which spelling RainbowConfig actually reads.
    "value_hidden_layers_widths": [256],  #
    "advatage_hidden_layers_widths": [256],  #
    "adam_epsilon": 1e-8,
    "learning_rate": 0.001,
    "training_steps": 30000,
    "per_epsilon": 1e-6,
    "per_alpha": 0.5,
    "per_beta": 0.5,
    "minibatch_size": 16,
    "replay_buffer_size": 500000,
    "min_replay_buffer_size": 5000,
    "transfer_interval": 100,
    "n_step": 9,
    "kernel_initializer": "orthogonal",
    "loss_function": KLDivergenceLoss(),
    "clipnorm": 0.0,
    "discount_factor": 1.0,  # or 0.999 or even 0.9999 not 0.99 < this makes the start of the game possibly 0.05 after bootstrapping
    "atom_size": 51,
    "replay_interval": 512,
}
game_config = SinglePlayerCatanConfig()
config = RainbowConfig(config_dict, game_config)
# Use the first CUDA device when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): presumably importing catanatron.gym is what registers the
# "catanatron/Catanatron-v0" environment id used below - confirm.
import catanatron.gym
import gymnasium as gym

# Single learning agent versus one RandomPlayer, wrapped in the project's
# CatanatronWrapper.
env = CatanatronWrapper(
    gym.make(
        "catanatron/Catanatron-v0",
        config={
            "enemies": [RandomPlayer(Color.RED)],
            "invalid_action_reward": -10,
            "map_type": "BASE",
            "vps_to_win": 10,
            "representation": "vector",
        },
    )
)
agent = RainbowAgent(env, config, "rainbow-catan-10vps", device)
# Checkpoint / evaluation cadence is set on the agent after construction.
agent.checkpoint_interval = 10
agent.test_interval = 100
agent.test_trials = 25
agent.train()

In [None]:
# shared network but not shared buffer?
# 1 vs 2 minibatches

from dqn.NFSP.nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import CatanConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# NFSP settings for two-player Catan. Unprefixed keys sit alongside the
# "sl_"-prefixed ones.
# NOTE(review): presumably unprefixed keys configure the RL (best-response)
# network and "sl_" keys the supervised average-policy network - confirm
# against NFSPDQNConfig.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,  #
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # NOTE(review): float literal here, while "sl_replay_buffer_size" is the
    # int 100000 - confirm the config accepts a float size.
    "replay_buffer_size": 1e5,
    "transfer_interval": 300,
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 100000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=CatanConfig(),
)
# Set on the config object after construction rather than through config_dict.
config.save_intermediate_weights = False
from custom_gym_envs.envs.catan import (
    env as catan_env,
    CatanAECEnv,
)

import torch

# Two-player Catan environment for NFSP self-play.
env = catan_env(
    num_players=2,
    map_type="BASE",
    vps_to_win=10,
    representation="vector",
    invalid_action_reward=-10,
)

# NOTE(review): ActionMaskInInfoWrapper has no import in the visible part of
# this notebook; it must already be in the kernel namespace or this line
# raises NameError - confirm where it comes from.
env = ActionMaskInInfoWrapper(env)
# env = FrameStackWrapper(env, 4, channel_first=False)

# Same device selection as the Rainbow cell: use CUDA when present instead of
# hard-coding "cuda:0", which fails on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

agent = NFSPDQN(env, config, name="nfsp-catan", device=device)
# Checkpoint / evaluation cadence is set on the agent after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 10
agent.train()

In [None]:
import sys
import catanatron.gym

sys.path.append("../../")
import gymnasium as gym
import random

# Smoke test: drive the Catanatron gym environment with uniformly random valid
# actions for 1000 steps, resetting whenever an episode finishes.
env = gym.make("catanatron/Catanatron-v0")
observation, info = env.reset()
for _ in range(1000):
    # your agent here (this takes random actions)
    # The environment lists the currently legal actions in info["valid_actions"].
    action = random.choice(info["valid_actions"])

    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    if done:
        # Start a new episode and keep stepping until the budget is used up.
        observation, info = env.reset()
env.close()

In [None]:
from catanatron import Game, RandomPlayer, Color
from catanatron.players.mcts import MCTSPlayer
from catanatron.players.minimax import AlphaBetaPlayer
from catanatron.players.playouts import GreedyPlayoutsPlayer
from catanatron.players.search import VictoryPointPlayer
from catanatron.players.weighted_random import WeightedRandomPlayer
from catanatron.players.value import ValueFunctionPlayer

# Instantiate two random players
player1 = RandomPlayer(Color.RED)
player2 = RandomPlayer(Color.BLUE)

# Create a 2-player game (you can fill remaining slots with random agents if needed)
players = [player1, player2]
game = Game(players)

# Play the game to completion and report whatever `Game.play()` returns as the
# winner (the helper below compares this value against Color members).
winner = game.play()
print(f"Winner: {winner}")


def play_game(player1, player2):
    """Play one two-player catanatron game and score it for the first player.

    ``player1`` and ``player2`` are player *classes* (or factories) that accept
    a colour; the first is seated as RED and the second as BLUE.

    Returns:
        1 if RED (``player1``) wins, -1 if BLUE (``player2``) wins, and 0 for
        any other outcome reported by ``Game.play()``.
    """
    red_seat = player1(Color.RED)
    blue_seat = player2(Color.BLUE)
    winner = Game([red_seat, blue_seat]).play()

    if winner == Color.RED:
        return 1
    return -1 if winner == Color.BLUE else 0

In [None]:
import pandas as pd
import random
from tqdm import tqdm
import sys

sys.path.append("../../")
from elo.elo import StandingsTable

# Bot classes entered in the round-robin; the commented-out entries are
# currently excluded.
players = [
    RandomPlayer,
    MCTSPlayer,
    AlphaBetaPlayer,
    # GreedyPlayoutsPlayer,
    VictoryPointPlayer,
    WeightedRandomPlayer,
    # ValueFunctionPlayer,
]
games_per_pair = 10

# Names are derived once, here; this list is not refreshed if `players` is
# extended later.
player_names = [p.__name__ for p in players]
# Every listed player starts at the same rating (1000).
table = StandingsTable(player_names, start_elo=1000)


def play_1v1_tournament(players, games_per_pair, play_game):
    """Run ``play_matches`` for every player and collect all results.

    Args:
        players: Player classes taking part in the tournament.
        games_per_pair: Forwarded unchanged to ``play_matches``.
        play_game: Callable ``play_game(first, second)`` returning one result.

    Returns:
        A ``pandas.DataFrame`` with columns ``player1``, ``player2`` and
        ``result``, one row per game played.
    """
    rows = []
    for challenger in players:
        rows += play_matches(challenger, players, games_per_pair, play_game)
    return pd.DataFrame(rows, columns=["player1", "player2", "result"])


def play_matches(player1, players, games_per_pair, play_game, standings=None):
    """Play ``player1`` against every other entry of ``players`` and record it.

    For each opponent, ``games_per_pair // 2`` games are played with
    ``player1`` passed first to ``play_game``; afterwards the same number is
    played with the opponent passed first. All results are then added to the
    standings table and the updated ratings are printed.

    Player names are always taken from ``__name__``. The previous version
    looked the second batch up through the module-level ``player_names`` list,
    which raised ``IndexError`` for any player appended to ``players`` after
    that list had been built.

    Args:
        player1: The player class whose matches are being played.
        players: All player classes; entries equal to ``player1`` are skipped.
        games_per_pair: Total games per opponent, split evenly between the two
            orderings (an odd value loses one game to integer division).
        play_game: Callable ``play_game(first, second)`` returning one result.
        standings: Table exposing ``add_results_from_array`` and ``bayes_elo``.
            Defaults to the module-level ``table``.

    Returns:
        List of ``(first_name, second_name, result)`` tuples in playing order.
    """
    opponents = [candidate for candidate in players if candidate != player1]
    games_per_side = games_per_pair // 2
    results = []

    # First every pairing with player1 listed first, then every pairing with
    # the opponent listed first (same order as before).
    for player1_first in (True, False):
        for opponent in opponents:
            first, second = (
                (player1, opponent) if player1_first else (opponent, player1)
            )
            for game_number in range(1, games_per_side + 1):
                print(
                    f"Playing {first.__name__} vs {second.__name__} game {game_number}"
                )
                result = play_game(first, second)
                results.append((first.__name__, second.__name__, result))

    # Resolve the global lazily so callers that pass `standings` do not need it.
    ledger = table if standings is None else standings
    ledger.add_results_from_array(results)
    print(ledger.bayes_elo())
    return results

In [None]:
import pickle

# Show the current standings, then persist the table next to the notebook.
print(table.bayes_elo())
print(table.get_win_table())
print(table.get_draw_table())
file = "catan_1v1_tournament_results.pkl"
# Context manager so the file is flushed and closed as soon as the dump ends.
with open(file, "wb") as results_file:
    pickle.dump(table, results_file)

In [None]:
from catanatron import Game, RandomPlayer, Color
from catanatron.players.mcts import MCTSPlayer
from catanatron.players.minimax import AlphaBetaPlayer
from catanatron.players.playouts import GreedyPlayoutsPlayer
from catanatron.players.search import VictoryPointPlayer
from catanatron.players.weighted_random import WeightedRandomPlayer
from catanatron.players.value import ValueFunctionPlayer

# Enter GreedyPlayoutsPlayer into the existing standings and play its matches
# against everyone already in `players`.
table.add_player(GreedyPlayoutsPlayer.__name__)
# Plain statement instead of a conditional expression used for its side effect.
if GreedyPlayoutsPlayer not in players:
    players.append(GreedyPlayoutsPlayer)

play_matches(GreedyPlayoutsPlayer, players, games_per_pair * 2, play_game)

In [None]:
# Run the full round-robin. `play_matches` already adds each batch of results
# to `table`, so the DataFrame is not added again below.
results = play_1v1_tournament(players, games_per_pair, play_game)


# table.add_results_from_dataframe(results)  # Adding multiple results
print(table.bayes_elo())

In [None]:
# Test a petting zoo environment to see if it has all the functions and attributes needed
from custom_gym_envs.envs.catan import (
    env as catan_env,
    CatanAECEnv,
)

# Build the custom two-player Catan environment with the settings under test.
env = catan_env(
    num_players=2,
    map_type="BASE",
    vps_to_win=10,
    representation="vector",
    invalid_action_reward=-1,
)


# test reset
env.reset()
# After a reset, show which agent the environment has selected to act.
print(env.agent_selection)

# # Get initial state for first agent
# state, reward, termination, truncation, info = env.last()
# print(state, info)


# ab_player = CatanPlayerWrapper(AlphaBetaPlayer, Color.RED)
# prediction = ab_player.predict(state, info, env)
# action = ab_player.select_actions(prediction, info)
# print(action)