In [None]:
import sys

sys.path.append("../../")

import pickle
from hyperopt import hp
from utils import CategoricalCrossentropyLoss, MSELoss, generate_layer_widths
import gymnasium as gym
import torch


def action_function(x):
    """Encode board index ``x`` as a one-hot tensor of shape ``(1, 3, 3)``.

    ``x`` indexes into the flattened 9-cell board; the selected entry is set
    to 1 and every other entry stays 0.
    """
    flat_board = torch.zeros(9)
    flat_board[x] = 1
    return flat_board.reshape(1, 3, 3)


# Hyperopt search space. Every entry is an `hp.choice`, so each hyperparameter
# is sampled from a discrete list of candidates. Entries whose candidate list
# has a single element are effectively pinned to that value while still
# appearing in every sampled configuration.
search_space = {
    "kernel_initializer": hp.choice(
        "kernel_initializer",
        [
            "he_uniform",
            "he_normal",
            "glorot_uniform",
            "glorot_normal",
            "orthogonal",
        ],
    ),
    # Optimizer settings: candidates are spaced one decade apart.
    "learning_rate": hp.choice("learning_rate", [0.01, 0.001, 0.0001, 0.00001]),
    "adam_epsilon": hp.choice("adam_epsilon", [0.3125, 0.03125, 0.003125, 0.0003125]),
    "known_bounds": hp.choice("known_bounds", [[-1, 1]]),
    # Each candidate is a list of identical (n, 3, 1) tuples repeated 1, 3 or 5
    # times. Presumably the tuple is (filters, kernel_size, stride) -- TODO
    # confirm against the network builder that consumes this config.
    "residual_layers": hp.choice(
        "residual_layers",
        [
            [(16, 3, 1)] * 1,
            [(32, 3, 1)] * 1,
            [(64, 3, 1)] * 1,
            [(128, 3, 1)] * 1,
            [(16, 3, 1)] * 3,
            [(32, 3, 1)] * 3,
            [(64, 3, 1)] * 3,
            [(128, 3, 1)] * 3,
            [(16, 3, 1)] * 5,
            [(32, 3, 1)] * 5,
            [(64, 3, 1)] * 5,
            [(128, 3, 1)] * 5,
            # Deeper stacks (10 and 20 repeats) are currently disabled.
            # [(16, 3, 1)] * 10,
            # [(32, 3, 1)] * 10,
            # [(64, 3, 1)] * 10,
            # [(128, 3, 1)] * 10,
            # [(16, 3, 1)] * 20,
            # [(32, 3, 1)] * 20,
            # [(64, 3, 1)] * 20,
            # [(128, 3, 1)] * 20,
        ],
    ),
    # Pinned to empty lists: no extra conv/dense layers are searched here.
    "conv_layers": hp.choice("conv_layers", [[]]),
    "dense_layers": hp.choice("dense_layers", [[]]),
    # Per-head conv layers: either none, or a single (n, 1, 1) entry.
    "actor_conv_layers": hp.choice(
        "actor_conv_layers", [[], [(32, 1, 1)], [(64, 1, 1)], [(128, 1, 1)]]
    ),
    "critic_conv_layers": hp.choice(
        "critic_conv_layers", [[], [(32, 1, 1)], [(64, 1, 1)], [(128, 1, 1)]]
    ),
    "reward_conv_layers": hp.choice(
        "reward_conv_layers", [[], [(32, 1, 1)], [(64, 1, 1)], [(128, 1, 1)]]
    ),
    # Dense widths for every head are pinned to empty lists.
    "actor_dense_layer_widths": hp.choice("actor_dense_layer_widths", [[]]),
    "critic_dense_layer_widths": hp.choice("critic_dense_layer_widths", [[]]),
    "reward_dense_layer_widths": hp.choice("reward_dense_layer_widths", [[]]),
    "dense_layer_widths": hp.choice("dense_layer_widths", [[]]),
    "noisy_sigma": hp.choice("noisy_sigma", [0.0]),
    "games_per_generation": hp.choice(
        "games_per_generation",
        [
            32,
            64,
            # 128
        ],
    ),
    "value_loss_factor": hp.choice("value_loss_factor", [0.25, 1.0]),
    # Search / exploration settings.
    "root_dirichlet_alpha": hp.choice("root_dirichlet_alpha", [0.3, 1.0, 2.0]),
    "root_exploration_fraction": hp.choice("root_exploration_fraction", [0.25]),
    "num_simulations": hp.choice(
        "num_simulations",
        [
            25,
            50,
            100,
            200,
            # 400,
            # 800
        ],
    ),
    "num_sampling_moves": hp.choice("num_sampling_moves", [0, 1, 2, 3, 5, 9]),
    "exploration_temperature": hp.choice("exploration_temperature", [1.0]),
    "exploitation_temperature": hp.choice("exploitation_temperature", [0.1]),
    "clip_low_prob": hp.choice("clip_low_prob", [0.0]),
    "pb_c_base": hp.choice("pb_c_base", [19652]),
    "pb_c_init": hp.choice("pb_c_init", [1.25]),
    # Loss objects are instantiated once here and stored inside the space.
    "value_loss_function": hp.choice("value_loss_function", [MSELoss()]),
    "reward_loss_function": hp.choice("reward_loss_function", [MSELoss()]),
    "policy_loss_function": hp.choice(
        "policy_loss_function", [CategoricalCrossentropyLoss()]
    ),
    # Training-loop and replay-buffer settings.
    "training_steps": hp.choice("training_steps", [200]),
    "minibatch_size": hp.choice(
        "minibatch_size",
        [
            32,
            64,
            # 128
        ],
    ),
    "min_replay_buffer_size": hp.choice(
        "min_replay_buffer_size",
        [
            32,
            1024,
            # 2048
        ],
    ),
    "replay_buffer_size": hp.choice("replay_buffer_size", [4000, 8000, 16000, 32000]),
    "unroll_steps": hp.choice("unroll_steps", [5]),
    "n_step": hp.choice("n_step", [9]),
    "clipnorm": hp.choice("clipnorm", [0.0, 1.0, 10.0]),
    "weight_decay": hp.choice("weight_decay", [1e-5, 1e-4, 1e-3]),
    "per_alpha": hp.choice("per_alpha", [0.0, 0.5, 1.0]),
    "per_beta": hp.choice("per_beta", [0.0, 0.5, 1.0]),
    "per_beta_final": hp.choice("per_beta_final", [0.0, 0.5, 1.0]),
    "per_epsilon": hp.choice("per_epsilon", [1e-4]),
    # The one-hot encoder defined above is stored as the only candidate.
    "action_function": hp.choice("action_function", [action_function]),
}

# No seed configurations are supplied: a list holding one empty dict is
# pickled as the initial best config.
initial_best_config = [{}]


# Persist both objects through context managers so the file handles are
# flushed and closed deterministically instead of relying on garbage
# collection of the anonymous `open(...)` objects.
with open("./search_spaces/search_space.pkl", "wb") as search_space_file:
    pickle.dump(search_space, search_space_file)
with open("./search_spaces/initial_best_config.pkl", "wb") as best_config_file:
    pickle.dump(initial_best_config, best_config_file)

In [2]:
import pandas as pd
import random
from tqdm import tqdm
import sys

sys.path.append("../../")
from elo.elo import StandingsTable

# Tournament state shared with `play_matches` below: the standings table is
# updated and re-pickled to `file` after every batch of matches.
players = []
games_per_pair = 10

# The table starts with no registered players.
player_names = []
table = StandingsTable(player_names, start_elo=1000)

import pickle

print(table.bayes_elo())
print(table.get_win_table())
print(table.get_draw_table())
file = "tic_tac_toe_1v1_tournament_results.pkl"
# Use a context manager so the pickle is flushed and the handle closed.
with open(file, "wb") as table_file:
    pickle.dump(table, table_file)


def play_1v1_tournament(players, games_per_pair, play_game):
    """Run every player's match series and collect the outcomes in a DataFrame.

    ``play_matches`` is invoked once for each entry of ``players`` (in order)
    with that entry as the focal player, and all returned result tuples are
    concatenated.

    Returns:
        pandas.DataFrame with columns ``player1``, ``player2`` and ``result``,
        one row per recorded game.
    """
    all_rows = [
        row
        for focal_player in players
        for row in play_matches(focal_player, players, games_per_pair, play_game)
    ]
    return pd.DataFrame(all_rows, columns=["player1", "player2", "result"])


def play_matches(player1, players, games_per_pair, play_game):
    """Play ``player1`` against every other player and record the results.

    For each opponent, ``games_per_pair // 2`` games are played with
    ``player1`` passed first to ``play_game``; once all of those are done, the
    same number of games is played per opponent with the opponent passed
    first.

    Side effects: the results are added to the module-level ``table``, the
    updated ``table.bayes_elo()`` is printed, and ``table`` is pickled to the
    module-level path ``file``.

    Args:
        player1: the focal player; must expose ``model_name``.
        players: all participants; entries equal to ``player1`` are skipped.
        games_per_pair: total games per opponent, split evenly between the
            two orderings (an odd value is rounded down).
        play_game: callable ``(first, second) -> result``.

    Returns:
        List of ``(first_name, second_name, result)`` tuples in play order.
    """
    games_per_side = games_per_pair // 2
    # Filter once; both passes below face exactly the same opponents.
    opponents = [candidate for candidate in players if candidate != player1]
    results = []

    # Pass 1: player1 is handed to play_game as the first player.
    for opponent in opponents:
        for game_number in range(1, games_per_side + 1):
            print(
                f"Playing {player1.model_name} vs {opponent.model_name} game {game_number}"
            )
            result = play_game(player1, opponent)
            results.append((player1.model_name, opponent.model_name, result))

    # Pass 2: the opponent is handed to play_game as the first player.
    for opponent in opponents:
        for game_number in range(1, games_per_side + 1):
            print(
                f"Playing {opponent.model_name} vs {player1.model_name} game {game_number}"
            )
            result = play_game(opponent, player1)
            results.append((opponent.model_name, player1.model_name, result))

    table.add_results_from_array(results)
    print(table.bayes_elo())
    # Context manager guarantees the checkpoint is flushed and closed.
    with open(file, "wb") as table_file:
        pickle.dump(table, table_file)
    return results

{'Elo table': Empty DataFrame
Columns: [Elo, Games, Score, Draws]
Index: [], 'eloAdvantage': 0.0, 'eloDraw': 1000.0}
[]
[]


In [None]:
import torch
from packages.utils.utils.utils import process_petting_zoo_obs

from pettingzoo.classic import tictactoe_v3


def play_game(player1, player2):
    """Play one PettingZoo tic-tac-toe game between two agents.

    ``player1`` chooses the action whenever the first agent in the
    environment's agent list is to move; ``player2`` moves otherwise. Each
    player must provide ``predict(state, info, env=env)`` and
    ``select_actions(prediction, info)`` (whose result supports ``.item()``).

    Args:
        player1: agent controlling the first environment agent.
        player2: agent controlling the other environment agent.

    Returns:
        The final reward of the first environment agent, read from
        ``env.rewards``.
    """
    env = tictactoe_v3.env(render_mode="rgb_array")
    try:
        with torch.no_grad():  # No gradient computation during testing
            env.reset()
            # Snapshot the agent order right after reset so player indices and
            # the final reward lookup do not depend on hard-coded agent names
            # (the PettingZoo docs list tic-tac-toe agents as
            # 'player_1'/'player_2', so a literal "player_0" key is fragile).
            agent_names = env.agents.copy()

            state, reward, termination, truncation, info = env.last()
            done = termination or truncation
            current_player = agent_names.index(env.agent_selection)
            state, info = process_petting_zoo_obs(state, info, current_player)

            episode_length = 0
            while not done and episode_length < 1000:  # Safety limit
                episode_length += 1

                # Pick the acting player for this turn and query its action.
                acting_player = player1 if current_player == 0 else player2
                prediction = acting_player.predict(state, info, env=env)
                action = acting_player.select_actions(prediction, info).item()

                # Step environment and observe from the next agent's view.
                env.step(action)
                state, reward, termination, truncation, info = env.last()
                current_player = agent_names.index(env.agent_selection)
                state, info = process_petting_zoo_obs(state, info, current_player)
                done = termination or truncation

            print(env.rewards)
            return env.rewards[agent_names[0]]
    finally:
        # Release environment resources even if a player raises mid-game.
        env.close()

In [None]:
# Hyperopt driver: repeatedly runs `fmin` for one additional trial, reloads the
# saved trials, and overwrites each trial's loss with its negated Elo rating.
# `fmin`, `tpe`, `space_eval`, `objective` and `table` are not defined in this
# cell and must already be in scope from earlier cells.
search_space_path, initial_best_config_path = (
    "./search_spaces/search_space.pkl",
    "./search_spaces/initial_best_config.pkl",
)
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(search_space_path, "rb") as search_space_file:
    search_space = pickle.load(search_space_file)
with open(initial_best_config_path, "rb") as best_config_file:
    initial_best_config = pickle.load(best_config_file)

file_name = "tictactoe_muzero"
trials_path = f"./{file_name}_trials.p"
eval_method = "elo"
assert eval_method in (
    "final_score",
    "rolling_average",
    "final_score_rolling_average",
    "elo",
)
max_trials = 1
trials_step = 64  # how many additional trials to do after loading the last ones

# Resume from a saved Trials object when one exists. Only "no usable file"
# errors start a fresh search; anything else (including KeyboardInterrupt)
# propagates instead of being silently swallowed.
try:
    with open(trials_path, "rb") as trials_file:
        trials = pickle.load(trials_file)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    trials = None  # create a new trials object and start searching
else:
    print("Found saved Trials! Loading...")
    max_trials = len(trials.trials) + 1
    print(
        "Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step
        )
    )

for i in range(trials_step):
    best = fmin(
        fn=objective,  # Objective Function to optimize
        space=search_space,  # Hyperparameter's Search Space
        algo=tpe.suggest,  # Optimization algorithm (representative TPE)
        max_evals=max_trials,  # Number of optimization attempts
        trials=trials,  # Record the results
        # early_stop_fn=no_progress_loss(5, 1),
        trials_save_file=trials_path,
        # points_to_evaluate=initial_best_config,
        show_progressbar=False,
    )

    # fmin persisted the trials to `trials_path`; reload them from disk.
    with open(trials_path, "rb") as trials_file:
        trials = pickle.load(trials_file)
    print("Found saved Trials! Loading and Updating...")
    try:
        elo_table = table.bayes_elo()["Elo table"]
        # NOTE(review): assumes row k of the Elo table belongs to
        # trials.trials[k] -- confirm the row ordering of bayes_elo().
        for position, trial in enumerate(trials.trials):
            trial_elo = elo_table.iloc[position]["Elo"]
            print(f"Trial {trial['tid']} ELO: {trial_elo}")
            trial["result"]["loss"] = -trial_elo
        # Write the updated trials once, after all losses are refreshed,
        # rather than re-pickling the whole object for every trial.
        with open(trials_path, "wb") as trials_file:
            pickle.dump(trials, trials_file)
    except ZeroDivisionError:
        print("Not enough players to calculate elo.")
    max_trials = len(trials.trials) + 1
    print(best)
    best_trial = space_eval(search_space, best)

 91%|█████████ | 58/64 [02:12<00:17,  2.89s/it]

In [1]:
import pickle
import sys

sys.path.append("../../")

# NOTE(review): the tournament cell pickles its standings table to
# "tic_tac_toe_1v1_tournament_results.pkl"; confirm that
# "./tictactoe_table.pkl" is the file intended here.
with open("./tictactoe_table.pkl", "rb") as table_file:
    table = pickle.load(table_file)
file_name = "tictactoe_muzero"

with open(f"./{file_name}_trials.p", "rb") as trials_file:
    trials = pickle.load(trials_file)
print("Found saved Trials! Loading...")

print(trials.trials)
print(table.bayes_elo())

# The Elo updates appear to be kept only on the trial records and not in the
# pickled table, so report each trial's Elo from its stored loss, which holds
# the negated Elo.
for trial in trials.trials:
    trial_elo = trial["result"]["loss"]
    print(f"Trial {trial['tid']} ELO: {-trial_elo}")

Found saved Trials! Loading...
[{'state': 2, 'tid': 0, 'spec': None, 'result': {'status': 'ok', 'loss': -917}, 'misc': {'tid': 0, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'action_function': [0], 'actor_conv_layers': [0], 'actor_dense_layer_widths': [0], 'adam_epsilon': [0], 'clip_low_prob': [0], 'clipnorm': [0], 'conv_layers': [0], 'critic_conv_layers': [0], 'critic_dense_layer_widths': [0], 'dense_layer_widths': [0], 'dense_layers': [0], 'exploitation_temperature': [0], 'exploration_temperature': [0], 'games_per_generation': [0], 'kernel_initializer': [0], 'known_bounds': [0], 'learning_rate': [0], 'min_replay_buffer_size': [0], 'minibatch_size': [0], 'n_step': [0], 'noisy_sigma': [0], 'num_sampling_moves': [0], 'num_simulations': [0], 'pb_c_base': [0], 'pb_c_init': [0], 'per_alpha': [0], 'per_beta': [0], 'per_beta_final': [0], 'per_epsilon': [0], 'policy_loss_function': [0], 'replay_buffer_size': [0], 'residual_layers': [0], 'reward_conv_layers': [0]

  +0.5 * drawTable[i][j] * np.log(
  +0.5 * drawTable[i][j] * np.log(
  return 1.0 / (1 + np.power(10, D / 400))
  l += winTable[i][j] * np.log(f(elos[i] - elos[j] - eloAdvantage + eloDraw))
  df = fun(x) - f0
  l += winTable[i][j] * np.log(f(elos[i] - elos[j] - eloAdvantage + eloDraw))
  +0.5 * drawTable[i][j] * np.log(
