In [None]:
import sys

from utils import CategoricalCrossentropyLoss, KLDivergenceLoss


sys.path.append("../../")
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from muzero.action_functions import action_as_plane

# Experiment "reanalyze_test": non-Gumbel MuZero on Tic-Tac-Toe with MCTS
# reanalysis, trained on CPU and evaluated against two fixed opponents.
env = TicTacToeConfig().make_env()

# Overrides handed to MuZeroConfig; any key not listed here is left to the
# config class.
params = dict(
    num_simulations=25,
    # All three prioritized-replay ("per_*") settings are zero in this run.
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    action_function=action_as_plane,
    n_step=9,
    root_dirichlet_alpha=0.25,
    # Network layout: one residual entry, then one conv entry and no dense
    # layers for each of the reward / actor / critic / to_play heads.
    residual_layers=[(24, 3, 1)],
    reward_dense_layer_widths=[],
    reward_conv_layers=[(16, 1, 1)],
    actor_dense_layer_widths=[],
    actor_conv_layers=[(16, 1, 1)],
    critic_dense_layer_widths=[],
    critic_conv_layers=[(16, 1, 1)],
    to_play_dense_layer_widths=[],
    to_play_conv_layers=[(16, 1, 1)],
    known_bounds=[-1, 1],
    support_range=None,
    minibatch_size=8,
    replay_buffer_size=100000,
    gumbel=False,
    gumbel_m=16,
    policy_loss_function=CategoricalCrossentropyLoss(),
    training_steps=40000,
    transfer_interval=1,
    # num_workers=1,
    reanalyze_ratio=0.8,
    value_loss_factor=0.25,
    reanalyze_method="mcts",
)
game_config = TicTacToeConfig()
config = MuZeroConfig(config_dict=params, game_config=game_config)


agent = MuZeroAgent(
    name="reanalyze_test",
    device="cpu",
    env=env,
    config=config,
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
# Checkpoint / evaluation cadence is set on the instance after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 100

agent.train()

In [None]:
import sys

from utils import KLDivergenceLoss


sys.path.append("../../")
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from muzero.action_functions import action_as_plane

# Experiment "gumbel_reanalyze_test": Gumbel MuZero on Tic-Tac-Toe with a
# KL-divergence policy loss and noisy MCTS reanalysis, trained on CPU.
env = TicTacToeConfig().make_env()

# Overrides handed to MuZeroConfig; any key not listed here is left to the
# config class.
params = dict(
    num_simulations=25,
    # All three prioritized-replay ("per_*") settings are zero in this run.
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    action_function=action_as_plane,
    n_step=9,
    root_dirichlet_alpha=0.25,
    # Network layout: one residual entry, then one conv entry and no dense
    # layers for each of the reward / actor / critic / to_play heads.
    residual_layers=[(24, 3, 1)],
    reward_dense_layer_widths=[],
    reward_conv_layers=[(16, 1, 1)],
    actor_dense_layer_widths=[],
    actor_conv_layers=[(16, 1, 1)],
    critic_dense_layer_widths=[],
    critic_conv_layers=[(16, 1, 1)],
    to_play_dense_layer_widths=[],
    to_play_conv_layers=[(16, 1, 1)],
    known_bounds=[-1, 1],
    support_range=None,
    minibatch_size=8,
    replay_buffer_size=100000,
    gumbel=True,
    gumbel_m=16,
    policy_loss_function=KLDivergenceLoss(),
    training_steps=40000,
    transfer_interval=1,
    # num_workers=1,
    reanalyze_ratio=0.8,
    reanalyze_noise=True,
    value_loss_factor=0.25,
    reanalyze_method="mcts",
)
game_config = TicTacToeConfig()
config = MuZeroConfig(config_dict=params, game_config=game_config)


agent = MuZeroAgent(
    name="gumbel_reanalyze_test",
    device="cpu",
    env=env,
    config=config,
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
# Checkpoint / evaluation cadence is set on the instance after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 100

agent.train()

In [None]:
import sys

# CategoricalCrossentropyLoss is used for "policy_loss_function" below, so it
# must be imported in this cell; relying on an earlier cell having imported it
# makes the cell fail with NameError on a fresh kernel.
# NOTE(review): `utils` is imported before "../../" is appended to sys.path;
# confirm it resolves from the notebook's own location.
from utils import CategoricalCrossentropyLoss, KLDivergenceLoss


sys.path.append("../../")
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from muzero.action_functions import action_as_plane

# Experiment "unplugged_test": non-Gumbel MuZero on Tic-Tac-Toe with MCTS
# reanalysis plus an "injection_frac" setting, trained on CPU.
env = TicTacToeConfig().make_env()

# Overrides handed to MuZeroConfig; any key not listed here is left to the
# config class.
params = {
    "num_simulations": 25,
    # All three prioritized-replay ("per_*") settings are zero in this run.
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "action_function": action_as_plane,
    "n_step": 9,
    "root_dirichlet_alpha": 0.25,
    # Network layout: one residual entry, then one conv entry and no dense
    # layers for each of the reward / actor / critic / to_play heads.
    "residual_layers": [(24, 3, 1)],
    "reward_dense_layer_widths": [],
    "reward_conv_layers": [(16, 1, 1)],
    "actor_dense_layer_widths": [],
    "actor_conv_layers": [(16, 1, 1)],
    "critic_dense_layer_widths": [],
    "critic_conv_layers": [(16, 1, 1)],
    "to_play_dense_layer_widths": [],
    "to_play_conv_layers": [(16, 1, 1)],
    "known_bounds": [-1, 1],
    "support_range": None,
    "minibatch_size": 8,
    "replay_buffer_size": 100000,
    "gumbel": False,
    "gumbel_m": 16,
    "policy_loss_function": CategoricalCrossentropyLoss(),
    "training_steps": 40000,
    "transfer_interval": 1,
    # "num_workers": 1,
    "reanalyze_ratio": 0.8,
    "value_loss_factor": 0.25,
    "reanalyze_method": "mcts",
    "injection_frac": 0.25,
}
game_config = TicTacToeConfig()
config = MuZeroConfig(config_dict=params, game_config=game_config)


agent = MuZeroAgent(
    env=env,
    config=config,
    name="unplugged_test",
    device="cpu",
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
# Checkpoint / evaluation cadence is set on the instance after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 100

agent.train()

In [None]:
import sys

from utils import KLDivergenceLoss


sys.path.append("../../")
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from muzero.action_functions import action_as_plane

# Experiment "gumbel_test_3": Gumbel MuZero on Tic-Tac-Toe with only 10
# simulations per search and a KL-divergence policy loss, trained on CPU.
env = TicTacToeConfig().make_env()

# Overrides handed to MuZeroConfig; any key not listed here is left to the
# config class.
params = dict(
    num_simulations=10,
    # All three prioritized-replay ("per_*") settings are zero in this run.
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    action_function=action_as_plane,
    n_step=9,
    root_dirichlet_alpha=0.25,
    # Network layout: one residual entry, then one conv entry and no dense
    # layers for each of the reward / actor / critic / to_play heads.
    residual_layers=[(24, 3, 1)],
    reward_dense_layer_widths=[],
    reward_conv_layers=[(16, 1, 1)],
    actor_dense_layer_widths=[],
    actor_conv_layers=[(16, 1, 1)],
    critic_dense_layer_widths=[],
    critic_conv_layers=[(16, 1, 1)],
    to_play_dense_layer_widths=[],
    to_play_conv_layers=[(16, 1, 1)],
    known_bounds=[-1, 1],
    support_range=None,
    minibatch_size=8,
    replay_buffer_size=100000,
    gumbel=True,
    gumbel_m=16,
    policy_loss_function=KLDivergenceLoss(),
    training_steps=40000,
    transfer_interval=1,
    # num_workers=1,
)
game_config = TicTacToeConfig()
config = MuZeroConfig(config_dict=params, game_config=game_config)


agent = MuZeroAgent(
    name="gumbel_test_3",
    device="cpu",
    env=env,
    config=config,
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
# Checkpoint / evaluation cadence is set on the instance after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 100

agent.train()

In [None]:
import sys

from utils import CategoricalCrossentropyLoss, KLDivergenceLoss


sys.path.append("../../")
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from muzero.action_functions import action_as_plane

# Experiment "to_play_test": non-Gumbel MuZero on Tic-Tac-Toe with a
# categorical cross-entropy policy loss and no reanalysis keys, trained on CPU.
env = TicTacToeConfig().make_env()

# Overrides handed to MuZeroConfig; any key not listed here is left to the
# config class.
params = dict(
    num_simulations=25,
    # All three prioritized-replay ("per_*") settings are zero in this run.
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    action_function=action_as_plane,
    n_step=9,
    root_dirichlet_alpha=0.25,
    # Network layout: one residual entry, then one conv entry and no dense
    # layers for each of the reward / actor / critic / to_play heads.
    residual_layers=[(24, 3, 1)],
    reward_dense_layer_widths=[],
    reward_conv_layers=[(16, 1, 1)],
    actor_dense_layer_widths=[],
    actor_conv_layers=[(16, 1, 1)],
    critic_dense_layer_widths=[],
    critic_conv_layers=[(16, 1, 1)],
    to_play_dense_layer_widths=[],
    to_play_conv_layers=[(16, 1, 1)],
    known_bounds=[-1, 1],
    support_range=None,
    minibatch_size=8,
    replay_buffer_size=100000,
    gumbel=False,
    gumbel_m=16,
    policy_loss_function=CategoricalCrossentropyLoss(),
    training_steps=40000,
    transfer_interval=1,
    # num_workers=1,
)
game_config = TicTacToeConfig()
config = MuZeroConfig(config_dict=params, game_config=game_config)


agent = MuZeroAgent(
    name="to_play_test",
    device="cpu",
    env=env,
    config=config,
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
# Checkpoint / evaluation cadence is set on the instance after construction.
agent.checkpoint_interval = 100
agent.test_interval = 500
agent.test_trials = 100

agent.train()

In [None]:
# import sys

# import numpy as np

# sys.path.append("../../")

# from hyperparameter_optimization.hyperopt import save_search_space


# import dill as pickle
# from hyperopt import hp
# from hyperopt.pyll import scope
# from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
# import gymnasium as gym
# import torch
# from muzero.action_functions import action_as_plane as action_function
# from torch.optim import Adam, SGD

# search_space = {
#     "kernel_initializer": hp.choice(
#         "kernel_initializer",
#         [
#             "he_uniform",
#             "he_normal",
#             "glorot_uniform",
#             "glorot_normal",
#             "orthogonal",
#         ],
#     ),
#     "optimizer": hp.choice(
#         "optimizer",
#         [
#             {
#                 "optimizer": "adam",
#                 # "adam_epsilon": hp.qloguniform(
#                 #     "adam_epsilon", np.log(1e-8), np.log(0.5), 1e-8
#                 # ),
#                 "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 1, 8, 1)),
#             },
#             {
#                 "optimizer": "sgd",
#                 "momentum": hp.quniform("momentum", 0, 1, 0.1),
#             },
#         ],
#     ),
#     "conv_layers": hp.choice("conv_layers", [[]]),
#     # "learning_rate": hp.qloguniform(
#     #     "learning_rate", np.log(0.0001), np.log(0.01), 0.0001
#     # ),
#     "learning_rate": 10 ** (-hp.quniform("learning_rate", 1, 4, 1)),
#     "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
#     "residual_filters": scope.int(
#         hp.qloguniform("residual_filters", np.log(8), np.log(32), 8)
#     ),
#     "residual_stacks": scope.int(
#         hp.qloguniform("residual_stacks", np.log(1), np.log(3), 1)
#     ),
#     "conv_layers": hp.choice("conv_layers", [[]]),
#     "actor_and_critic_conv_filters": scope.int(
#         hp.qloguniform(
#             "actor_and_critic_conv_filters", np.log(0 + 8), np.log(32 + 8), 8
#         )
#         - 8  # to make 0 an option
#     ),
#     "reward_conv_layers": hp.choice("reward_conv_layers", [[]]),
#     "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
#     "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
#     "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
#     "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
#     "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
#     "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
#     "root_dirichlet_alpha": hp.quniform(
#         "root_dirichlet_alpha", 0.1, 2.0, 0.1
#     ),  # hp.choice("root_dirichlet_alpha", [0.3, 1.0, 2.0]),
#     "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
#     "num_simulations": scope.int(
#         hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-10, 25)
#     ),
# "temperature_updates": [scope.int(hp.quniform("temperature_updates", 0, 4, 1))],
# "temperatures": hp.choice("temperatures", [1.0, 0.1]),
# "temperature_with_training_steps": hp.choice(
#     "temperature_with_training_steps", False
# ),
#     "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
#     "pb_c_base": hp.choice("pb_c_base", [19652]),
#     "pb_c_init": hp.choice("pb_c_init", [1.25]),
#     "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
#     "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
#     "policy_loss_function": hp.choice(
#         "policy_loss_function", [CategoricalCrossentropyLoss()]
#     ),
#     "training_steps": scope.int(
#         hp.qloguniform("training_steps", np.log(10000), np.log(30000), 10000)
#     ),
#     # "minibatch_size": scope.int(
#     #     hp.qloguniform("minibatch_size", np.log(8), np.log(64), 8)
#     # ),
#     # "min_replay_buffer_size": scope.int(
#     #     hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
#     # ),
#     # "replay_buffer_size": scope.int(
#     #     hp.qloguniform("replay_buffer_size", np.log(10000), np.log(200000), 10000)
#     # ),
#     "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 6, 1))),
#     "min_replay_buffer_size": scope.int(
#         hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
#     ),
#     "replay_buffer_size": scope.int(10 ** (hp.quniform("replay_buffer_size", 4, 6, 1))),
#     "unroll_steps": hp.choice("unroll_steps", [5]),
#     "n_step": hp.choice("n_step", [9]),
#     "clipnorm": scope.int(hp.quniform("clipnorm", 0, 10.0, 1)),
#     "weight_decay": hp.choice("weight_decay", [1e-4]),
#     "per_alpha": hp.choice("per_alpha", [0.0]),
#     "per_beta": hp.choice("per_beta", [0.0]),
#     "per_beta_final": hp.choice("per_beta_final", [0.0]),
#     "per_epsilon": hp.choice("per_epsilon", [1e-4]),
#     "action_function": hp.choice("action_function", [action_function]),
#     "multi_process": hp.choice(
#         "multi_process",
#         [
#             {
#                 "multi_process": True,
#                 "num_workers": scope.int(hp.quniform("num_workers", 1, 3, 1)),
#             },
#             # {
#             #     "multi_process": False,
#             #     "games_per_generation": scope.int(
#             #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
#             #     ),
#             # },
#         ],
#     ),
#     "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
# }

# initial_best_config = []

# search_space, initial_best_config = save_search_space(search_space, initial_best_config)

In [None]:
# New SMALLEST SEARCH SPACE, IMPROVED
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_plane as action_function
from torch.optim import Adam, SGD

# size = 5 * 1 * 1 * 4.0 * 3 * 2.0 * 5 * 1 * 1 = 600

# Hyperopt search space for the narrowest ("smallest") sweep.
#
# Reading guide:
#   * hp.choice(label, [x]) with a single option is a constant.
#   * hp.quniform(label, lo, hi, q) yields round(uniform(lo, hi) / q) * q and
#     hp.qloguniform(label, lo, hi, q) yields round(exp(uniform(lo, hi)) / q) * q,
#     so a range of the form (v, v + 1e-8) pins the parameter to a single value
#     while keeping it expressed as a hyperopt node.
#   * Keys such as "residual_filters", "residual_stacks", "output_filters",
#     "optimizer" and "multi_process" are intermediate; prep_params() below
#     rewrites them into the final config keys.
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Only the Adam option is active here; the SGD option is commented out.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 8, 8 + 1e-8, 2)),
                "adam_epsilon": hp.choice("adam_epsilon", [1e-8]),
                # Exponent pinned to 3, i.e. learning rate 10 ** -3.
                "adam_learning_rate": 10
                ** (-hp.quniform("adam_learning_rate", 3, 3 + 1e-8, 1)),
            },
            # {
            #     "optimizer": "sgd",
            #     "momentum": hp.choice("momentum", [0.0, 0.9]),
            #     "sgd_learning_rate": 10 ** (-hp.quniform("sgd_learning_rate", 1, 3, 1)),
            # },
        ],
    ),
    "conv_layers": hp.choice("conv_layers", [[]]),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # Pinned to 24 (degenerate range, quantised to multiples of 8).
    "residual_filters": scope.int(
        hp.qloguniform("residual_filters", np.log(24), np.log(24) + 1e-8, 8)
    ),
    # Integer in 1..4.
    "residual_stacks": scope.int(
        hp.qloguniform("residual_stacks", np.log(1), np.log(4), 1)
    ),
    # Sampled with a +8 offset and shifted back by 8; with this degenerate
    # range the result is pinned to 16.
    "output_filters": scope.int(
        hp.qloguniform("output_filters", np.log(16 + 8), np.log(16 + 8) + 1e-8, 8)
        - 8  # to make 0 an option
    ),
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    # Powers of two with integer exponent in -3..-1: 0.125, 0.25 or 0.5.
    "root_dirichlet_alpha": 2 ** (hp.quniform("root_dirichlet_alpha", -3, -1, 1.0)),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # Pinned to 25.
    "num_simulations": scope.int(
        hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-8, 25)
    ),
    # Single-element list holding an integer in 4..8.
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 4, 8, 1))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # Drawn from [35000, 45000] and quantised to multiples of 10000, which
    # rounds to 40000 across the interior of that range.
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(35000), np.log(45000), 10000)
    ),
    # Exponent pinned to 3, i.e. minibatch size 2 ** 3 = 8.
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 3 + 1e-8, 1))),
    # Pinned to 5000.
    "min_replay_buffer_size": scope.int(
        hp.qloguniform(
            "min_replay_buffer_size", np.log(5000), np.log(5000) + 1e-8, 1000
        )
    ),
    # Exponent pinned to 5, i.e. 10 ** 5 entries.
    "replay_buffer_size": scope.int(
        10 ** (hp.quniform("replay_buffer_size", 5, 5 + 1e-8, 1))
    ),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    # Constant 0.0 in this space; the sampled alternative is commented out.
    "clipnorm": hp.choice(
        # "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clip_val", 0, 2, 1)))]
        "clipnorm",
        [0.0],
    ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Only the multi-process option is active; num_workers is pinned to 2.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 2, 2 + 1e-8, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No seed configurations are supplied for this sweep.
initial_best_config = []

# NOTE(review): save_search_space is project code not shown here; presumably it
# persists the space and returns the (possibly transformed) pair — confirm.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)


def prep_params(params):
    """Flatten one sampled search-space point into the final config keys.

    The dict is modified in place and also returned. The intermediate keys
    ``residual_filters``, ``residual_stacks`` and ``output_filters`` are
    removed; the nested ``multi_process`` and ``optimizer`` option dicts are
    replaced by flat keys.

    Args:
        params: Sampled point containing at least ``output_filters``,
            ``residual_filters``, ``residual_stacks``, ``multi_process``
            (a dict with a ``multi_process`` flag plus ``num_workers`` or
            ``games_per_generation``), ``optimizer`` (a dict whose
            ``optimizer`` entry is ``"adam"`` or ``"sgd"``) and ``clipnorm``.

    Returns:
        The same ``params`` object, with ``residual_layers``,
        ``actor_conv_layers``, ``critic_conv_layers``, ``reward_conv_layers``,
        ``learning_rate`` and ``support_range`` set, and ``optimizer``
        replaced by the ``Adam`` or ``SGD`` class.

    Raises:
        AssertionError: If ``output_filters`` exceeds ``residual_filters``.
        ValueError: If the optimizer name is neither ``"adam"`` nor ``"sgd"``.
        KeyError: If a required key is missing.
    """
    # Validate before mutating so a rejected sample is left untouched.
    assert (
        params["output_filters"] <= params["residual_filters"]
    ), "output_filters must not exceed residual_filters"

    residual_filters = params.pop("residual_filters")
    residual_stacks = params.pop("residual_stacks")
    params["residual_layers"] = [(residual_filters, 3, 1)] * residual_stacks

    # All three heads share the same 1x1 conv spec; zero filters means "no conv
    # layer" for every head (previously the reward head was left unset in the
    # zero case, unlike the actor and critic heads).
    output_filters = params.pop("output_filters")
    head_layers = [(output_filters, 1, 1)] if output_filters != 0 else []
    for head in ("actor", "critic", "reward"):
        # Each head gets its own list object so later edits do not alias.
        params[f"{head}_conv_layers"] = list(head_layers)

    # Replace the nested multi_process option dict with flat keys.
    worker_options = params["multi_process"]
    if worker_options["multi_process"]:
        params["num_workers"] = worker_options["num_workers"]
        params["multi_process"] = True
    else:
        params["games_per_generation"] = worker_options["games_per_generation"]
        params["multi_process"] = False

    # Replace the nested optimizer option dict with the optimizer class and
    # its flat hyperparameters.
    optimizer_options = params["optimizer"]
    optimizer_name = optimizer_options["optimizer"]
    if optimizer_name == "adam":
        params["adam_epsilon"] = optimizer_options["adam_epsilon"]
        params["learning_rate"] = optimizer_options["adam_learning_rate"]
        params["optimizer"] = Adam
    elif optimizer_name == "sgd":
        params["momentum"] = optimizer_options["momentum"]
        params["learning_rate"] = optimizer_options["sgd_learning_rate"]
        params["optimizer"] = SGD
    else:
        # Fail here rather than passing the raw option dict on as "optimizer".
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

    # Accept a dict-shaped clipnorm option as well as a plain number.
    if isinstance(params["clipnorm"], dict):
        params["clipnorm"] = params["clipnorm"]["clipval"]
    params["support_range"] = None

    return params

In [None]:
# SMALLEST SEARCH SPACE, IMPROVED
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_plane as action_function
from torch.optim import Adam, SGD

# Hyperopt search space for the "smallest, improved" sweep.
#
# Reading guide:
#   * hp.choice(label, [x]) with a single option is a constant.
#   * hp.quniform(label, lo, hi, q) yields round(uniform(lo, hi) / q) * q and
#     hp.qloguniform(label, lo, hi, q) yields round(exp(uniform(lo, hi)) / q) * q,
#     so a range of the form (v, v + 1e-8) pins the parameter to a single value
#     while keeping it expressed as a hyperopt node.
#   * Keys such as "residual_filters", "residual_stacks", "output_filters",
#     "optimizer" and "multi_process" are intermediate; prep_params() below
#     rewrites them into the final config keys.
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Two optimizer options, each with its own uniquely labelled
    # learning-rate node.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 8, 8 + 1e-8, 2)),
                "adam_epsilon": hp.choice("adam_epsilon", [1e-8]),
                # Learning rate 10 ** -2 or 10 ** -3.
                "adam_learning_rate": 10
                ** (-hp.quniform("adam_learning_rate", 2, 3, 1)),
            },
            {
                "optimizer": "sgd",
                "momentum": hp.choice("momentum", [0.0, 0.9]),
                # Learning rate 10 ** -1, 10 ** -2 or 10 ** -3.
                "sgd_learning_rate": 10 ** (-hp.quniform("sgd_learning_rate", 1, 3, 1)),
            },
        ],
    ),
    "conv_layers": hp.choice("conv_layers", [[]]),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # Pinned to 24 (degenerate range, quantised to multiples of 8).
    "residual_filters": scope.int(
        hp.qloguniform("residual_filters", np.log(24), np.log(24) + 1e-8, 8)
    ),
    # Pinned to 1.
    "residual_stacks": scope.int(
        hp.qloguniform("residual_stacks", np.log(1), np.log(1) + 1e-8, 1)
    ),
    # Sampled with a +8 offset and shifted back by 8; with this degenerate
    # range the result is pinned to 16.
    "output_filters": scope.int(
        hp.qloguniform("output_filters", np.log(16 + 8), np.log(16 + 8) + 1e-8, 8)
        - 8  # to make 0 an option
    ),
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    # Powers of two with integer exponent in -2..1: 0.25, 0.5, 1 or 2.
    "root_dirichlet_alpha": 2 ** (hp.quniform("root_dirichlet_alpha", -2, 1, 1.0)),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # Pinned to 25.
    "num_simulations": scope.int(
        hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-10, 25)
    ),
    # Single-element list holding an integer in 0..8.
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 0, 8, 1))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # Drawn from [35000, 45000] and quantised to multiples of 10000, which
    # rounds to 40000 across the interior of that range.
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(35000), np.log(45000), 10000)
    ),
    # 2 ** {3, 4, 5}: 8, 16 or 32.
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 5, 1))),
    # Log-uniform over [1000, 10000], quantised to multiples of 1000.
    "min_replay_buffer_size": scope.int(
        hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
    ),
    # 10 ** {4, 5, 6, 7}.
    "replay_buffer_size": scope.int(10 ** (hp.quniform("replay_buffer_size", 4, 7, 1))),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    # Either 0.0 or 10 ** {0, 1, 2}; the inner node uses its own label
    # ("clip_val") so it does not collide with the outer "clipnorm" choice.
    "clipnorm": hp.choice(
        "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clip_val", 0, 2, 1)))]
    ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Only the multi-process option is active; num_workers is 1, 2 or 3.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 1, 3, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No seed configurations are supplied for this sweep.
initial_best_config = []

# NOTE(review): save_search_space is project code not shown here; presumably it
# persists the space and returns the (possibly transformed) pair — confirm.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)


def prep_params(params):
    """Flatten one sampled search-space point into the final config keys.

    The dict is modified in place and also returned. The intermediate keys
    ``residual_filters``, ``residual_stacks`` and ``output_filters`` are
    removed; the nested ``multi_process`` and ``optimizer`` option dicts are
    replaced by flat keys.

    Args:
        params: Sampled point containing at least ``output_filters``,
            ``residual_filters``, ``residual_stacks``, ``multi_process``
            (a dict with a ``multi_process`` flag plus ``num_workers`` or
            ``games_per_generation``), ``optimizer`` (a dict whose
            ``optimizer`` entry is ``"adam"`` or ``"sgd"``) and ``clipnorm``.

    Returns:
        The same ``params`` object, with ``residual_layers``,
        ``actor_conv_layers``, ``critic_conv_layers``, ``reward_conv_layers``,
        ``learning_rate`` and ``support_range`` set, and ``optimizer``
        replaced by the ``Adam`` or ``SGD`` class.

    Raises:
        AssertionError: If ``output_filters`` exceeds ``residual_filters``.
        ValueError: If the optimizer name is neither ``"adam"`` nor ``"sgd"``.
        KeyError: If a required key is missing.
    """
    # Validate before mutating so a rejected sample is left untouched.
    assert (
        params["output_filters"] <= params["residual_filters"]
    ), "output_filters must not exceed residual_filters"

    residual_filters = params.pop("residual_filters")
    residual_stacks = params.pop("residual_stacks")
    params["residual_layers"] = [(residual_filters, 3, 1)] * residual_stacks

    # All three heads share the same 1x1 conv spec; zero filters means "no conv
    # layer" for every head (previously the reward head was left unset in the
    # zero case, unlike the actor and critic heads).
    output_filters = params.pop("output_filters")
    head_layers = [(output_filters, 1, 1)] if output_filters != 0 else []
    for head in ("actor", "critic", "reward"):
        # Each head gets its own list object so later edits do not alias.
        params[f"{head}_conv_layers"] = list(head_layers)

    # Replace the nested multi_process option dict with flat keys.
    worker_options = params["multi_process"]
    if worker_options["multi_process"]:
        params["num_workers"] = worker_options["num_workers"]
        params["multi_process"] = True
    else:
        params["games_per_generation"] = worker_options["games_per_generation"]
        params["multi_process"] = False

    # Replace the nested optimizer option dict with the optimizer class and
    # its flat hyperparameters.
    optimizer_options = params["optimizer"]
    optimizer_name = optimizer_options["optimizer"]
    if optimizer_name == "adam":
        params["adam_epsilon"] = optimizer_options["adam_epsilon"]
        params["learning_rate"] = optimizer_options["adam_learning_rate"]
        params["optimizer"] = Adam
    elif optimizer_name == "sgd":
        params["momentum"] = optimizer_options["momentum"]
        params["learning_rate"] = optimizer_options["sgd_learning_rate"]
        params["optimizer"] = SGD
    else:
        # Fail here rather than passing the raw option dict on as "optimizer".
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

    # Accept a dict-shaped clipnorm option as well as a plain number.
    if isinstance(params["clipnorm"], dict):
        params["clipnorm"] = params["clipnorm"]["clipval"]
    params["support_range"] = None

    return params

In [None]:
# SLIGHTLY WIDER IMPROVED SPACE
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_plane as action_function
from torch.optim import Adam, SGD

# Hyperopt search space for the "slightly wider" sweep.
#
# Reading guide:
#   * hp.choice(label, [x]) with a single option is a constant.
#   * hp.quniform(label, lo, hi, q) yields round(uniform(lo, hi) / q) * q and
#     hp.qloguniform(label, lo, hi, q) yields round(exp(uniform(lo, hi)) / q) * q,
#     so a range of the form (v, v + tiny) pins the parameter to a single value.
#   * Every hyperopt label must be unique within the space: hyperopt raises
#     DuplicateLabel when two different nodes share a label. The per-optimizer
#     learning-rate nodes and the inner clipnorm node therefore carry their own
#     labels ("adam_learning_rate", "sgd_learning_rate", "clip_val"); the dict
#     keys of the sampled configuration are unchanged.
# NOTE(review): the optimizer option dicts here expose the key "learning_rate",
# whereas the prep_params() helpers defined elsewhere in this notebook read
# "adam_learning_rate" / "sgd_learning_rate" — confirm which preprocessing is
# meant to consume this space.
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # Exponent pinned to 8, i.e. epsilon 10 ** -8.
                "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 8, 8 + 1e-10, 2)),
                # Learning rate 10 ** -2 .. 10 ** -5.
                "learning_rate": 10 ** (-hp.quniform("adam_learning_rate", 2, 5, 1)),
            },
            {
                "optimizer": "sgd",
                "momentum": hp.choice("momentum", [0.0, 0.9]),
                # Learning rate 10 ** -1 .. 10 ** -3.
                "learning_rate": 10 ** (-hp.quniform("sgd_learning_rate", 1, 3, 1)),
            },
        ],
    ),
    "conv_layers": hp.choice("conv_layers", [[]]),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # Log-uniform over [8, 32], quantised to multiples of 8.
    "residual_filters": scope.int(
        hp.qloguniform("residual_filters", np.log(8), np.log(32), 8)
    ),
    # Integer in 1..3.
    "residual_stacks": scope.int(
        hp.qloguniform("residual_stacks", np.log(1), np.log(3), 1)
    ),
    # Sampled over [8, 40] in multiples of 8, then shifted down by 8 so that
    # the result covers 0..32.
    "output_filters": scope.int(
        hp.qloguniform("output_filters", np.log(0 + 8), np.log(32 + 8), 8)
        - 8  # to make 0 an option
    ),
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    # Powers of two with integer exponent in -2..2: 0.25 .. 4.
    "root_dirichlet_alpha": 2 ** (hp.quniform("root_dirichlet_alpha", -2, 2, 1.0)),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # Pinned to 25.
    "num_simulations": scope.int(
        hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-10, 25)
    ),
    # Single-element list holding an integer in 0..8.
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 0, 8, 1))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # Log-uniform over [11000, 33000], quantised to multiples of 11000.
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(11000), np.log(33000), 11000)
    ),
    # 2 ** {3, 4, 5, 6}: 8 .. 64.
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 6, 1))),
    # Log-uniform over [1000, 10000], quantised to multiples of 1000.
    "min_replay_buffer_size": scope.int(
        hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
    ),
    # 10 ** {4, 5, 6}.
    "replay_buffer_size": scope.int(10 ** (hp.quniform("replay_buffer_size", 4, 6, 1))),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    # Either 0.0 or 10 ** {0, 1, 2}. The inner node is labelled "clip_val"
    # because reusing the outer "clipnorm" label triggers DuplicateLabel.
    "clipnorm": hp.choice(
        "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clip_val", 0, 2, 1)))]
    ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Only the multi-process option is active; num_workers is 1, 2 or 3.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 1, 3, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No seed configurations are supplied for this sweep.
initial_best_config = []

# NOTE(review): save_search_space is project code not shown here; presumably it
# persists the space and returns the (possibly transformed) pair — confirm.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)

In [None]:
# INITIAL SPACE
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_plane as action_function
from torch.optim import Adam, SGD

# Hyperopt search space for MuZero on Tic-Tac-Toe ("initial" space: network size
# is searched as well). Single-option `hp.choice` entries pin a value while
# keeping the key in every sampled point.
# `residual_filters`, `residual_stacks`, `output_filters`, `multi_process` and
# `optimizer` are intermediate keys that `prep_params` rewrites before use.
# Reminder: hp.quniform(l, lo, hi, q)    -> round(uniform(lo, hi) / q) * q
#           hp.qloguniform(l, lo, hi, q) -> round(exp(uniform(lo, hi)) / q) * q
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Nested choice: each option carries the parameters that only make sense for
    # that optimizer.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # "adam_epsilon": hp.qloguniform(
                #     "adam_epsilon", np.log(1e-8), np.log(0.5), 1e-8
                # ),
                # exponent is one of 2, 4, 6, 8 -> epsilon in {1e-2, ..., 1e-8}
                "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 2, 8, 2)),
            },
            {
                "optimizer": "sgd",
                "momentum": hp.quniform("momentum", 0, 0.9, 0.1),
                # "momentum": hp.choice(
                #     "momentum", [0.0, 0.9]
                # ),
            },
        ],
    ),
    # Defined once only: the original literal repeated this key further down, and
    # the second copy silently replaced the first.
    "conv_layers": hp.choice("conv_layers", [[]]),
    # exponent is one of 1..4 -> learning rate in {1e-1, ..., 1e-4}
    "learning_rate": 10 ** (-hp.quniform("learning_rate", 1, 4, 1)),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # multiples of 8 between 8 and 32
    "residual_filters": scope.int(
        hp.qloguniform("residual_filters", np.log(8), np.log(32), 8)
    ),
    # 1, 2 or 3
    "residual_stacks": scope.int(
        hp.qloguniform("residual_stacks", np.log(1), np.log(3), 1)
    ),
    # Sampled on 8..40 and shifted down by 8, giving 0, 8, ..., 32.
    "output_filters": scope.int(
        hp.qloguniform("output_filters", np.log(0 + 8), np.log(32 + 8), 8)
        - 8  # to make 0 an option
    ),
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    "root_dirichlet_alpha": hp.quniform("root_dirichlet_alpha", 0.1, 2.0, 0.1),
    # "root_dirichlet_alpha": 2
    # ** (
    #     hp.quniform("root_dirichlet_alpha", -2, 2, 1.0)
    # ),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # Degenerate range: always evaluates to 25.
    "num_simulations": scope.int(
        hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-10, 25)
    ),
    # One-element list wrapping an integer in 0..8.
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 0, 8, 1))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # 11000, 22000 or 33000
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(11000), np.log(33000), 11000)
    ),
    # 8, 16, 32 or 64
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 6, 1))),
    # multiples of 1000 between 1000 and 10000
    "min_replay_buffer_size": scope.int(
        hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
    ),
    # 1e4, 1e5 or 1e6
    "replay_buffer_size": scope.int(10 ** (hp.quniform("replay_buffer_size", 4, 6, 1))),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    # integer in 0..10
    "clipnorm": scope.int(hp.quniform("clipnorm", 0, 10.0, 1)),
    # "clipnorm": hp.choice(
    #     "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clipnorm", 0, 2, 1)))]
    # ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Nested choice; only the multi-process option is currently enabled.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 1, 3, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No warm-start points are supplied for this space.
initial_best_config = []

# Rebind both names to whatever the project helper returns.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)

In [None]:
# SMALL STANDARD SPACE (no picking num filters etc), should be compatible with initial
import sys

import numpy as np

sys.path.append("../../")

from hyperparameter_optimization.hyperopt import save_search_space


import dill as pickle
from hyperopt import hp
from hyperopt.pyll import scope
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch
from muzero.action_functions import action_as_plane as action_function
from torch.optim import Adam, SGD

# Reduced hyperopt search space for MuZero on Tic-Tac-Toe: the network size is
# pinned, but every label of the larger space is kept so that points stay
# compatible with it. Values are pinned by giving quniform/qloguniform a range
# so narrow that rounding always lands on the same number.
# Reminder: hp.quniform(l, lo, hi, q)    -> round(uniform(lo, hi) / q) * q
#           hp.qloguniform(l, lo, hi, q) -> round(exp(uniform(lo, hi)) / q) * q
# `residual_filters`, `residual_stacks`, `output_filters`, `multi_process` and
# `optimizer` are intermediate keys that `prep_params` rewrites before use.
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Nested choice: each option carries the parameters that only make sense for
    # that optimizer.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "optimizer": "adam",
                # "adam_epsilon": hp.qloguniform(
                #     "adam_epsilon", np.log(1e-8), np.log(0.5), 1e-8
                # ),
                # pinned: the exponent always rounds to 8 -> epsilon = 1e-8
                "adam_epsilon": 10 ** (-hp.quniform("adam_epsilon", 8.01, 8.02, 2)),
            },
            {
                "optimizer": "sgd",
                # pinned: always rounds to (approximately) 0.9
                "momentum": hp.quniform("momentum", 0.91, 0.92, 0.1),
                # "momentum": hp.choice(
                #     "momentum", [0.0, 0.9]
                # ),
            },
        ],
    ),
    # Defined once only: the original literal repeated this key further down, and
    # the second copy silently replaced the first.
    "conv_layers": hp.choice("conv_layers", [[]]),
    # exponent is one of 1..4 -> learning rate in {1e-1, ..., 1e-4}
    "learning_rate": 10 ** (-hp.quniform("learning_rate", 1, 4, 1)),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # pinned to 24
    "residual_filters": scope.int(
        hp.qloguniform("residual_filters", np.log(24), np.log(24) + 1e-8, 8)
    ),
    # pinned to 1
    "residual_stacks": scope.int(
        hp.qloguniform("residual_stacks", np.log(1), np.log(1) + 1e-8, 1)
    ),
    # pinned to 24 - 8 = 16 (same shift-by-8 encoding as the larger space)
    "output_filters": scope.int(
        hp.qloguniform("output_filters", np.log(16 + 8), np.log(16 + 8) + 1e-8, 8)
        - 8  # to make 0 an option
    ),
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "value_loss_factor": hp.choice("value_loss_factor", [1.0]),
    "root_dirichlet_alpha": hp.quniform("root_dirichlet_alpha", 0.1, 2.0, 0.1),
    # "root_dirichlet_alpha": 2
    # ** (
    #     hp.quniform("root_dirichlet_alpha", -2, 2, 1.0)
    # ),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    # pinned to 25
    "num_simulations": scope.int(
        hp.qloguniform("num_simulations", np.log(25), np.log(25) + 1e-10, 25)
    ),
    # One-element list wrapping an integer in 0..8.
    "temperature_updates": [scope.int(hp.quniform("temperature_updates", 0, 8, 1))],
    "temperatures": hp.choice("temperatures", [[1.0, 0.1]]),
    "temperature_with_training_steps": hp.choice(
        "temperature_with_training_steps", [False]
    ),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # 11000, 22000 or 33000
    "training_steps": scope.int(
        hp.qloguniform("training_steps", np.log(11000), np.log(33000), 11000)
    ),
    # 8, 16, 32 or 64
    "minibatch_size": scope.int(2 ** (hp.quniform("minibatch_size", 3, 6, 1))),
    # multiples of 1000 between 1000 and 10000
    "min_replay_buffer_size": scope.int(
        hp.qloguniform("min_replay_buffer_size", np.log(1000), np.log(10000), 1000)
    ),
    # 1e4, 1e5 or 1e6
    "replay_buffer_size": scope.int(10 ** (hp.quniform("replay_buffer_size", 4, 6, 1))),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    # integer in 0..10
    "clipnorm": scope.int(hp.quniform("clipnorm", 0, 10.0, 1)),
    # "clipnorm": hp.choice(
    #     "clipnorm", [0.0, scope.int(10 ** (hp.quniform("clipnorm", 0, 2, 1)))]
    # ),
    "weight_decay": hp.choice("weight_decay", [1e-4]),
    "per_alpha": hp.choice("per_alpha", [0.0]),
    "per_beta": hp.choice("per_beta", [0.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    "action_function": hp.choice("action_function", [action_function]),
    # Nested choice; only the multi-process option is currently enabled.
    "multi_process": hp.choice(
        "multi_process",
        [
            {
                "multi_process": True,
                "num_workers": scope.int(hp.quniform("num_workers", 1, 3, 1)),
            },
            # {
            #     "multi_process": False,
            #     "games_per_generation": scope.int(
            #         hp.qloguniform("games_per_generation", np.log(8), np.log(32), 8)
            #     ),
            # },
        ],
    ),
    "lr_ratio": hp.choice("lr_ratio", [float("inf")]),
}

# No warm-start points are supplied for this space.
initial_best_config = []

# Rebind both names to whatever the project helper returns.
search_space, initial_best_config = save_search_space(search_space, initial_best_config)

In [None]:
def prep_params(params):
    """Turn one sampled search-space point into agent-config keys.

    The search space uses a few intermediate keys that are expanded here:

    * ``residual_filters`` / ``residual_stacks`` -> ``residual_layers``
    * ``output_filters`` -> ``actor_conv_layers``, ``critic_conv_layers`` and
      ``reward_conv_layers`` (all empty when ``output_filters`` is 0)
    * nested ``multi_process`` choice -> flat ``multi_process`` bool plus
      ``num_workers`` or ``games_per_generation``
    * nested ``optimizer`` choice -> optimizer class plus ``adam_epsilon`` or
      ``momentum``

    ``support_range`` is always set to ``None``.

    Args:
        params: dict sampled from the search space. It is modified in place.

    Returns:
        The same ``params`` dict.

    Raises:
        AssertionError: if ``output_filters`` exceeds ``residual_filters``. The
            check runs before anything is modified.
        KeyError: if an expected key is missing.
    """
    output_filters = params["output_filters"]
    residual_filters = params["residual_filters"]
    assert output_filters <= residual_filters, (
        f"output_filters ({output_filters}) must not exceed "
        f"residual_filters ({residual_filters})"
    )

    # Layer tuples are presumably (filters, kernel_size, stride) — confirm
    # against the network builder.
    params["residual_layers"] = [(residual_filters, 3, 1)] * params["residual_stacks"]
    del params["residual_filters"]
    del params["residual_stacks"]

    # All three heads share one spec. Previously the reward head was left unset
    # when output_filters was 0, unlike the actor and critic heads.
    head_layers = [(output_filters, 1, 1)] if output_filters != 0 else []
    for head in ("actor", "critic", "reward"):
        params[f"{head}_conv_layers"] = list(head_layers)  # one list per head
    del params["output_filters"]

    # Flatten the nested multi-process choice.
    mp_choice = params["multi_process"]
    if mp_choice["multi_process"]:
        params["num_workers"] = mp_choice["num_workers"]
        params["multi_process"] = True
    else:
        params["games_per_generation"] = mp_choice["games_per_generation"]
        params["multi_process"] = False

    # Flatten the nested optimizer choice; an unrecognised name is left as is.
    opt_choice = params["optimizer"]
    if opt_choice["optimizer"] == "adam":
        params["adam_epsilon"] = opt_choice["adam_epsilon"]
        params["optimizer"] = Adam
    elif opt_choice["optimizer"] == "sgd":
        params["momentum"] = opt_choice["momentum"]
        params["optimizer"] = SGD

    params["support_range"] = None

    return params

In [None]:
import pandas as pd
import random
from tqdm import tqdm
import sys
import dill as pickle

sys.path.append("../../")
from elo.elo import StandingsTable

games_per_pair = 10

# Resume from a previous run when both pickles load AND the table can be
# summarised; on any failure start over with no players and an empty table.
# NOTE(review): a failure while printing the summary also discards the loaded
# data (unchanged from before) — confirm that is intended.
# Unpickling executes arbitrary code: only load files produced by this project.
try:
    with open("./tictactoe_players.pkl", "rb") as players_file:
        players = pickle.load(players_file)
    with open("./tictactoe_table.pkl", "rb") as table_file:
        table = pickle.load(table_file)
    print(table.bayes_elo())
    print(table.get_win_table())
    print(table.get_draw_table())
except Exception as load_error:  # was a bare `except:`, which also swallowed Ctrl-C
    print(f"Starting with an empty standings table ({load_error!r})")
    players = []
    table = StandingsTable([], start_elo=1000)

In [None]:
from game_configs.tictactoe_config import TicTacToeConfig
import torch

from pettingzoo.classic import tictactoe_v3


def play_game(player1, player2):
    """Play one Tic-Tac-Toe game between two agents and return player_0's reward.

    A fresh environment is created for every call. ``player1`` moves whenever
    the environment's selected agent is at index 0 of ``env.agents``;
    ``player2`` moves otherwise. Gradients are disabled for the whole game.

    Args:
        player1: agent exposing ``predict(state, info, env=...)`` and
            ``select_actions(prediction, info)``; the latter must return an
            object with ``.item()``.
        player2: agent with the same interface.

    Returns:
        ``env.rewards["player_0"]`` once the game has terminated, been
        truncated, or hit the move limit.
    """
    max_moves = 1000  # safety limit against games that never finish

    env = TicTacToeConfig().make_env()
    with torch.no_grad():  # no gradient computation while playing
        env.reset()
        state, _, termination, truncation, info = env.last()
        done = termination or truncation
        current_player = env.agents.index(env.agent_selection)
        # state, info = process_petting_zoo_obs(state, info, current_player)

        episode_length = 0
        while not done and episode_length < max_moves:
            episode_length += 1

            # Both players expose the same interface, so pick the mover once
            # instead of duplicating the predict/select calls per branch.
            mover = player1 if current_player == 0 else player2
            prediction = mover.predict(state, info, env=env)
            action = mover.select_actions(prediction, info).item()

            # Advance the game and read the view of whoever moves next.
            env.step(action)
            state, _, termination, truncation, info = env.last()
            current_player = env.agents.index(env.agent_selection)
            # state, info = process_petting_zoo_obs(state, info, current_player)
            done = termination or truncation
        print(env.rewards)
        return env.rewards["player_0"]

In [None]:
from agents.random import RandomAgent
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import atpe, tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent

search_space_path, initial_best_config_path = (
    "search_space.pkl",
    "best_config.pkl",
)
# search_space = pickle.load(open(search_space_path, "rb"))
# initial_best_config = pickle.load(open(initial_best_config_path, "rb"))
file_name = "tictactoe_muzero"
max_trials = 64
trials_step = 24  # how many additional trials to do after loading the last ones
trials_file = f"./{file_name}_trials.p"  # same file is read here and written by fmin

# Register the evaluation setup used by `marl_objective`.
set_marl_config(
    MarlHyperoptConfig(
        file_name=file_name,
        eval_method="test_agents_elo",
        best_agent=TicTacToeBestAgent(),
        make_env=TicTacToeConfig().make_env,
        prep_params=prep_params,
        agent_class=MuZeroAgent,
        agent_config=MuZeroConfig,
        game_config=TicTacToeConfig,
        games_per_pair=500,
        num_opps=1,  # not used
        table=table,  # not used
        play_game=play_game,
        checkpoint_interval=100,
        test_interval=1000,
        test_trials=200,
        test_agents=[RandomAgent(), TicTacToeBestAgent()],
        test_agent_weights=[1.0, 2.0],
        device="cpu",
    )
)

try:  # try to load an already saved trials object, and increase the max
    # Unpickling executes arbitrary code: only load files produced by this project.
    with open(trials_file, "rb") as saved_trials:
        trials = pickle.load(saved_trials)
    print("Found saved Trials! Loading...")
    max_trials = len(trials.trials) + trials_step
    print(
        "Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step
        )
    )
except Exception as load_error:  # was a bare `except:`, which also swallowed Ctrl-C
    # create a new trials object and start searching
    print("No saved Trials! Starting from scratch.")
    print(f"  reason: {load_error!r}")
    trials = None

best = fmin(
    fn=marl_objective,  # Objective Function to optimize
    space=search_space,  # Hyperparameter's Search Space
    algo=atpe.suggest,  # Optimization algorithm (adaptive TPE)
    max_evals=max_trials,  # Number of optimization attempts
    trials=trials,  # Record the results
    # early_stop_fn=no_progress_loss(5, 1),
    trials_save_file=trials_file,
    points_to_evaluate=initial_best_config,
    show_progressbar=False,
)
print(best)
# Map the raw label->value dict back onto the search space.
best_trial = space_eval(search_space, best)
# gc.collect()
# gc.collect()

In [None]:
from hyperparameter_optimization.hyperopt import (
    marl_objective,
    set_marl_config,
    MarlHyperoptConfig,
)
from hyperopt import tpe, fmin, space_eval
from hyperopt.exceptions import AllTrialsFailed

from muzero.muzero_agent_torch import MuZeroAgent
from agent_configs import MuZeroConfig
from game_configs import TicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent

search_space_path, initial_best_config_path = (
    "search_space.pkl",
    "best_config.pkl",
)
# search_space = pickle.load(open(search_space_path, "rb"))
# initial_best_config = pickle.load(open(initial_best_config_path, "rb"))
file_name = "tictactoe_muzero"
max_trials = 1
trials_step = 64  # how many additional trials to do after loading the last ones
trials_file = f"./{file_name}_trials.p"  # same file is read here and written by fmin

# Register the evaluation setup used by `marl_objective`.
# NOTE(review): this cell passes the raw `tictactoe_v3.env` factory (imported in
# the `play_game` cell) while the ATPE cell passes `TicTacToeConfig().make_env`
# — confirm the difference is intended.
set_marl_config(
    MarlHyperoptConfig(
        file_name=file_name,
        eval_method="elo",
        best_agent=TicTacToeBestAgent(),
        make_env=tictactoe_v3.env,
        prep_params=prep_params,
        agent_class=MuZeroAgent,
        agent_config=MuZeroConfig,
        game_config=TicTacToeConfig,
        games_per_pair=100,
        num_opps=1,  # not used
        table=table,  # not used
        play_game=play_game,
        checkpoint_interval=50,
        test_interval=250,
        test_trials=25,
        test_agents=[RandomAgent(), TicTacToeBestAgent()],
        device="cpu",
    )
)

try:  # try to load an already saved trials object, and increase the max
    # Unpickling executes arbitrary code: only load files produced by this project.
    with open(trials_file, "rb") as saved_trials:
        trials = pickle.load(saved_trials)
    print("Found saved Trials! Loading...")
    max_trials = len(trials.trials) + 1
    print(
        "Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step
        )
    )
except Exception as load_error:  # was a bare `except:`, which also swallowed Ctrl-C
    # create a new trials object and start searching
    print(f"No saved Trials! Starting from scratch. ({load_error!r})")
    trials = None

# One new trial per fmin call, so that every trial's loss can be rewritten from
# the latest Elo table before the next suggestion is made.
best = None  # stays None until an fmin call succeeds (avoids a NameError below)
for _ in range(trials_step):
    try:
        best = fmin(
            fn=marl_objective,  # Objective Function to optimize
            space=search_space,  # Hyperparameter's Search Space
            algo=tpe.suggest,  # Optimization algorithm (representative TPE)
            max_evals=max_trials,  # Number of optimization attempts
            trials=trials,  # Record the results
            # early_stop_fn=no_progress_loss(5, 1),
            trials_save_file=trials_file,
            points_to_evaluate=initial_best_config,
            show_progressbar=False,
        )
    except AllTrialsFailed:
        print("trial failed")

    with open(trials_file, "rb") as saved_trials:
        trials = pickle.load(saved_trials)
    print("Found saved Trials! Loading and Updating...")
    try:
        elo_table = table.bayes_elo()["Elo table"]
        # Assumes row i of the Elo table belongs to trial i — verify against
        # how players are added to `table`.
        for position, trial in enumerate(trials.trials):
            trial_elo = elo_table.iloc[position]["Elo"]
            print(f"Trial {trial['tid']} ELO: {trial_elo}")
            trial["result"]["loss"] = -trial_elo  # hyperopt minimises the loss
        # Write once after all losses are updated (previously rewritten for
        # every trial, through file handles that were never closed).
        with open(trials_file, "wb") as saved_trials:
            pickle.dump(trials, saved_trials)
    except ZeroDivisionError:
        print("Not enough players to calculate elo.")
    max_trials = len(trials.trials) + 1
    if best is not None:
        print(best)
        best_trial = space_eval(search_space, best)
# gc.collect()

In [None]:
# shared network but not shared buffer?
# 1 vs 2 minibatches
import sys

sys.path.append("../..")

from dqn.NFSP.nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import TicTacToeConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# NFSP configuration, "Standard" variant: plain DQN-style settings (no
# prioritised replay, 1-step returns, single atom, no dueling, no noisy layers).
# Keys prefixed with `sl_` presumably configure the supervised average-policy
# network and the rest the RL network — confirm against NFSPDQNConfig.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 10000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,  #
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 128,
    "minibatch_size": 128,
    # Integer capacity (was the float literal 2e5), matching the other size
    # settings such as "sl_replay_buffer_size".
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    "residual_layers": [(128, 3, 1)] * 3,
    "conv_layers": [(32, 3, 1)],
    "dense_layer_widths": [],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 128,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [(128, 3, 1)] * 3,
    "sl_conv_layers": [(32, 3, 1)],
    "sl_dense_layer_widths": [],
    "sl_clip_low_prob": 0.0,
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=TicTacToeConfig(),
)
# Set after construction rather than through config_dict.
config.save_intermediate_weights = True

In [None]:
from pettingzoo.classic import tictactoe_v3

# Raw PettingZoo Tic-Tac-Toe environment, created with render_mode="rgb_array".
env = tictactoe_v3.env(render_mode="rgb_array")

# Show the observation space as seen by "player_0".
print(env.observation_space("player_0"))

# Build the NFSP agent from the `config` created in the previous cell.
agent = NFSPDQN(env, config, name="NFSP-TicTacToe-Standard")

In [None]:
# Override checkpoint settings on the agent before training.
# presumably: checkpoint every 100 training steps and run 100 evaluation trials
# per checkpoint — TODO confirm against NFSPDQN.
agent.checkpoint_interval = 100
agent.checkpoint_trials = 100
# Start training with the settings above.
agent.train()

In [None]:
# shared network but not shared buffer?
# 1 vs 2 minibatches
import sys

sys.path.append("../..")

from dqn.NFSP.nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import TicTacToeConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# NFSP configuration, "Rainbow" variant: compared with the "Standard" cell it
# switches to a KL-divergence loss, noisy layers instead of epsilon-greedy,
# prioritised replay (per_alpha / per_beta > 0), 3-step returns, 51 atoms and a
# dueling head, and it waits for 1000 transitions before learning.
# Keys prefixed with `sl_` presumably configure the supervised average-policy
# network and the rest the RL network — confirm against NFSPDQNConfig.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 10000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,  #
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": KLDivergenceLoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Integer capacity (was the float literal 2e5), matching the other size
    # settings such as "sl_replay_buffer_size".
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    "residual_layers": [(128, 3, 1)] * 3,
    "conv_layers": [(32, 3, 1)],
    "dense_layer_widths": [],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.06,
    "eg_epsilon": 0.0,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [(128, 3, 1)] * 3,
    "sl_conv_layers": [(32, 3, 1)],
    "sl_dense_layer_widths": [],
    "sl_clip_low_prob": 0.0,
    "per_alpha": 0.5,
    "per_beta": 0.5,
    "per_beta_final": 1.0,
    "per_epsilon": 0.00001,
    "n_step": 3,
    "atom_size": 51,
    "dueling": True,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=TicTacToeConfig(),
)
# Set after construction rather than through config_dict.
config.save_intermediate_weights = True

In [None]:
from pettingzoo.classic import tictactoe_v3

# Raw PettingZoo Tic-Tac-Toe environment, created with render_mode="rgb_array".
env = tictactoe_v3.env(render_mode="rgb_array")

# Show the observation space as seen by "player_0".
print(env.observation_space("player_0"))

# Build the NFSP agent from the `config` created in the previous cell.
agent = NFSPDQN(env, config, name="NFSP-TicTacToe-Rainbow")

In [None]:
# Override checkpoint settings on the agent before training.
# presumably: checkpoint every 100 training steps and run 100 evaluation trials
# per checkpoint — TODO confirm against NFSPDQN.
agent.checkpoint_interval = 100
agent.checkpoint_trials = 100
# Start training with the settings above.
agent.train()