In [None]:
from pettingzoo.classic import leduc_holdem_v4
import copy
from agent_configs.cfr_config import CFRConfig
from active_player import ActivePlayer
from cfr_agent import CFRAgent
import torch
from cfr_network import CFRNetwork
# Module-level Leduc Hold'em environment. Later cells pass this object to
# CFRAgent as `env`; note the training cell rebinds `game` to a new instance.
game = leduc_holdem_v4.env()

In [None]:
# --- Network dimensions ---------------------------------------------------
hidden_dim = 72
input_dim = 36
output_dim = 4

# --- Game / training hyper-parameters --------------------------------------
num_players = 2
replay_buffer_size = 4_000_000
minibatch_size = 512
steps_per_epoch = 200
traversals = 200
training_steps = 100
lr = 1e-3
optimizer = None

# One settings dict reused (same object) for both the policy and value networks.
p_v_networks = dict(
    input_shape=input_dim,
    output_shape=output_dim,
    hidden_size=hidden_dim,
    learning_rate=lr,
    optimizer=optimizer,
)

# Turn tracker handed to the agent through the config.
active_player_obj = ActivePlayer(num_players)

network_settings = dict(
    policy=p_v_networks,
    value=p_v_networks,
    num_players=num_players,
)

training_settings = dict(
    network=network_settings,
    replay_buffer_size=replay_buffer_size,
    minibatch_size=minibatch_size,
    steps_per_epoch=steps_per_epoch,
    traversals=traversals,
    training_steps=training_steps,
    active_player_obj=active_player_obj,
)

# NOTE(review): observation_space is 72 while input_dim is 36 — confirm this
# mismatch is intentional; the value is kept exactly as it was.
game_settings = dict(
    num_players=num_players,
    observation_space=72,
    action_space=4,
)

config = CFRConfig(config_dict=training_settings, game_config=game_settings)

In [None]:
# Train one CFRAgent per sampling scheme, all sharing the same `config` object.
# Each iteration rebinds the globals `game` and `model`, so after the loop they
# refer to the environment/agent of the last scheme in the list ("Full").
sampling = ["MC", "Full"]
for sampling_method in sampling:
    # Fresh environment instance for this training run.
    game=leduc_holdem_v4.env()
    model = CFRAgent(env=game,config=config)
    model.train(sampling=sampling_method)


In [None]:
def _load_policy_agent(checkpoint_path):
    """Build a CFRNetwork and restore its policy weights from a checkpoint.

    Args:
        checkpoint_path: Path to a file previously written with ``torch.save``
            that holds the state dict of a policy network.

    Returns:
        A ``(state_dict, network)`` tuple: the loaded state dict and the
        ``CFRNetwork`` whose ``policy`` now carries those weights.
    """
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code — only load checkpoints from a trusted source.
    # map_location="cpu" lets checkpoints saved on a GPU load on CPU-only
    # machines; evaluation converts outputs with .numpy(), which needs CPU tensors.
    state = torch.load(checkpoint_path, map_location="cpu")
    network = CFRNetwork(
        config={'policy': p_v_networks, 'value': p_v_networks, 'num_players': num_players}
    )
    network.policy.load_state_dict(state)
    return state, network


# Four saved policies: two checkpoints from the "linear" directory and two from
# the "notlinear" directory, each at the same pair of run identifiers.
agent1_state, agent1 = _load_policy_agent('checkpoints/policy/linear/38063/1745086660.608601.pt')
agent2_state, agent2 = _load_policy_agent('checkpoints/policy/linear/233694/1745086660.608601.pt')
agent3_state, agent3 = _load_policy_agent('checkpoints/policy/notlinear/38063/1745086660.608601.pt')
agent4_state, agent4 = _load_policy_agent('checkpoints/policy/notlinear/233694/1745086660.608601.pt')


In [None]:
# Put each loaded policy network into evaluation mode before the matches run.
for _network in (agent1, agent2, agent3, agent4):
    _network.policy.eval()

In [None]:
# Ordered collection of the four loaded networks; the evaluation cell indexes
# into this list by position to build every pairing.
agents = [agent1, agent2, agent3, agent4]

In [None]:
def evaluatebots(agent1, agent2, num_of_eval_games, verbose=False):
    """Play two policy networks against each other and collect per-game rewards.

    ``agent1`` chooses the action whenever ``player_0`` is to move and
    ``agent2`` whenever any other player is to move. Each game is started with
    ``env.reset(seed=...)`` using a freshly drawn random seed.

    Args:
        agent1: Object exposing a callable ``policy`` (a torch module) that maps
            a ``(1, obs_dim)`` float tensor to action scores; plays ``player_0``.
        agent2: Same kind of object; plays ``player_1``.
        num_of_eval_games: Number of games to play.
        verbose: When True, print the mean and standard deviation of both
            players' rewards before returning. Defaults to False.

    Returns:
        A tuple ``(rewards_player_1, rewards_player_2)`` of lists with one entry
        per game, taken from ``env.rewards["player_0"]`` and
        ``env.rewards["player_1"]`` once the game has ended.
    """
    # Local import: the notebook only imports numpy in a later cell.
    import numpy as np

    # A CFRAgent is used only for its environment handle and its
    # action-selection helper (masking + sampling).
    modelselect = CFRAgent(env=game, config=config)
    env = modelselect.env
    rewards_player_1 = []
    rewards_player_2 = []
    try:
        for _ in range(num_of_eval_games):
            # Explicit 64-bit dtype avoids an out-of-bounds error on platforms
            # whose default integer is 32-bit; int() yields a plain Python int.
            random_seed = int(np.random.randint(0, 2**32 - 1, dtype=np.int64))

            # Reset BEFORE inspecting the state. Reading env.last() first would
            # return the previous game's terminal flags and skip the whole game.
            env.reset(seed=random_seed)

            while True:
                observation, reward, termination, truncation, infos = env.last()
                if termination or truncation:
                    break

                # Take the player to move from the environment itself so the
                # chosen policy always matches the observation being fed to it,
                # and keep the agent's own turn tracker in sync.
                active_player = int(env.agent_selection[-1])
                modelselect.active_player_obj.set_active_player(active_player)
                acting_agent = agent1 if active_player == 0 else agent2

                obs_tensor = torch.tensor(
                    observation['observation'], dtype=torch.float32
                ).reshape(1, -1)
                with torch.no_grad():
                    predictions = acting_agent.policy(obs_tensor).detach().numpy()[0]

                action_mask = torch.from_numpy(observation["action_mask"]).type(torch.float)
                sample, policy = modelselect.select_actions(
                    predictions, info=action_mask, mask_actions=True
                )
                env.step(sample)

            rewards_player_1.append(env.rewards["player_0"])
            rewards_player_2.append(env.rewards["player_1"])
    finally:
        # Release the environment once, even if a game raised part-way through.
        env.close()

    # These statistics used to sit after the return statement and never ran.
    if verbose and rewards_player_1:
        print("PLAYER 1 REW MEAN: ", np.mean(rewards_player_1))
        print("PLAYER 1 REW STD: ", np.std(rewards_player_1))
        print("PLAYER 2 REW MEAN: ", np.mean(rewards_player_2))
        print("PLAYER 2 REW STD: ", np.std(rewards_player_2))

    return rewards_player_1, rewards_player_2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Games played for every ordered (row agent, column agent) pairing.
NUM_EVAL_GAMES = 100000

# Derive the matrix size and tick labels from the agent list so that adding or
# removing an agent does not require touching the numbers below.
num_agents = len(agents)
agent_labels = [f"agent{k + 1}" for k in range(num_agents)]

# results[i][j] is the mean reward of agents[i] (passed as the first agent,
# i.e. player_0) when playing against agents[j] (second agent, player_1).
results = np.zeros((num_agents, num_agents))
for i in range(num_agents):
    for j in range(num_agents):
        rewards_player_1, rewards_player_2 = evaluatebots(agents[i], agents[j], NUM_EVAL_GAMES)
        results[i][j] = np.mean(rewards_player_1)

print("RESULTS: ")
# Render the pay-off matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(results, annot=True, fmt=".2f", cmap="YlGnBu", xticklabels=agent_labels, yticklabels=agent_labels)
plt.title("Results")
plt.xlabel("Agent 2")
plt.ylabel("Agent 1")
plt.show()

In [None]:
# `modelselect` exists only as a local variable inside evaluatebots, so
# referencing it here raises NameError on a fresh kernel. Operate on the
# module-level `game` environment instead — the env given to every CFRAgent.
# NOTE(review): assumes CFRAgent stores the passed env as `.env` without
# copying it — confirm against the CFRAgent implementation.
game.close()
game.reset()