In [None]:
from pettingzoo.classic import texas_holdem_v4
import copy
from agent_configs.cfr_config import CFRConfig
from active_player import ActivePlayer
from cfr_agent import CFRAgent
import torch
from cfr_network import CFRNetwork
# Build the two-player Texas Hold'em environment from PettingZoo's classic suite.
# It is passed to CFRAgent as `env` in a later cell.
game = texas_holdem_v4.env(num_players=2)

In [None]:
# --- Network sizes ---------------------------------------------------------
hidden_dim = 256
input_dim = 72    # size passed as the networks' 'input_shape' and the game's 'observation_space'
output_dim = 4    # size passed as the networks' 'output_shape' and the game's 'action_space'
num_players = 2

# --- Training hyperparameters (consumed by CFRConfig / CFRAgent) ------------
replay_buffer_size = 4000000
minibatch_size = 5000
steps_per_epoch = 2000
traversals = 1500
training_steps = 200
lr = 0.001
optimizer = None  # NOTE(review): presumably lets the network fall back to a default optimizer — confirm

# Settings dict shared by the policy and the value network.
# Both entries below reference this same dict object.
p_v_networks = {
    'input_shape': input_dim,
    'output_shape': output_dim,
    'hidden_size': hidden_dim,
    'learning_rate': lr,
    'optimizer': optimizer,
}

active_player_obj = ActivePlayer(num_players)

config = CFRConfig(
    config_dict={
        'network': {'policy': p_v_networks, 'value': p_v_networks, 'num_players': num_players},
        'replay_buffer_size': replay_buffer_size,
        'minibatch_size': minibatch_size,
        'steps_per_epoch': steps_per_epoch,
        'traversals': traversals,
        'training_steps': training_steps,
        'active_player_obj': active_player_obj,
    },
    # Reuse input_dim / output_dim instead of repeating the literals 72 and 4,
    # so the game description cannot drift away from the network sizes.
    game_config={
        'num_players': num_players,
        'observation_space': input_dim,
        'action_space': output_dim,
    },
)

In [None]:
# Wrap the environment and the configuration in the CFR agent; later cells use it
# both for training and for driving the evaluation games.
modelselect = CFRAgent(env=game, config=config)


In [None]:
# Run the agent's training loop. The captured output below shows it printing
# LOSS / PLAYER ID / LEARNING ITERATION lines as it progresses.
modelselect.train()

LOSS 5.649683475494385
PLAYER ID 0
LEARNING ITERATION 1849
LOSS 5.6683783531188965
PLAYER ID 0
LEARNING ITERATION 1850
LOSS 5.506097793579102
PLAYER ID 0
LEARNING ITERATION 1851
LOSS 5.875100612640381
PLAYER ID 0
LEARNING ITERATION 1852
LOSS 5.72903299331665
PLAYER ID 0
LEARNING ITERATION 1853
LOSS 5.659924030303955
PLAYER ID 0
LEARNING ITERATION 1854
LOSS 5.870151519775391
PLAYER ID 0
LEARNING ITERATION 1855
LOSS 5.8641743659973145
PLAYER ID 0
LEARNING ITERATION 1856
LOSS 5.799927711486816
PLAYER ID 0
LEARNING ITERATION 1857
LOSS 5.886231422424316
PLAYER ID 0
LEARNING ITERATION 1858
LOSS 5.7255377769470215
PLAYER ID 0
LEARNING ITERATION 1859
LOSS 5.787532329559326
PLAYER ID 0
LEARNING ITERATION 1860
LOSS 5.663789749145508
PLAYER ID 0
LEARNING ITERATION 1861
LOSS 5.957200050354004
PLAYER ID 0
LEARNING ITERATION 1862


In [None]:
# Load two saved checkpoints; a later cell passes each one to load_state_dict().
#
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source.
#
# map_location="cpu" makes loading work on machines without the device the
# checkpoint was saved from. The evaluation cell converts network outputs with
# .numpy(), which requires CPU tensors anyway.
agent = torch.load('checkpoints/1744571971.415963.pt', map_location="cpu")
agent2 = torch.load('checkpoints/1744755949.354019.pt', map_location="cpu")

In [None]:
# Both evaluation networks are built from the same architecture settings and then
# populated with the weights of their respective checkpoint.
network_config = {'policy': p_v_networks, 'value': p_v_networks, 'num_players': num_players}

# Each network receives its own (shallow) copy of the config dict.
model, model2 = (CFRNetwork(config=dict(network_config)) for _ in range(2))

for network, weights in ((model, agent), (model2, agent2)):
    network.load_state_dict(weights)

In [None]:
# Switch both networks to evaluation mode before they are used for inference.
for network in (model, model2):
    network.eval()

In [None]:
import numpy as np

# Head-to-head evaluation: `model` acts for seat 0 ("player_0") and `model2` acts
# for seat 1 ("player_1"). Each game is played to completion and the final
# per-seat rewards are collected.
eval_games = 100000

# Per-game environment seeds come from a seeded generator so the sequence of
# seeds can be replayed. Generator.integers defaults to a 64-bit dtype, so the
# 2**32 - 1 bound is valid on every platform.
# NOTE(review): select_actions may sample from a different RNG, so full
# reproducibility of the games is not guaranteed — confirm.
eval_seed = 0
seed_rng = np.random.default_rng(eval_seed)

eval_env = modelselect.env

# Naming caveat: rewards_player_1 holds the rewards of env seat "player_1"
# (played by model2) and rewards_player_2 holds those of env seat "player_0"
# (played by model). The print labels below follow the list names.
rewards_player_1 = []
rewards_player_2 = []

for i in range(eval_games):
    # Start a fresh game; the seed fixes the environment's randomness for this game.
    random_seed = int(seed_rng.integers(0, 2**32 - 1))
    eval_env.reset(seed=random_seed)

    while True:
        # Read the state AFTER the reset/step so the termination flags belong to
        # the current game rather than to the previously finished one.
        observation, reward, termination, truncation, infos = eval_env.last()
        if termination or truncation:
            break

        # Take the acting seat from the environment itself (last character of the
        # agent name) instead of assuming strict alternation between the players.
        active_player = int(eval_env.agent_selection[-1])
        modelselect.active_player_obj.set_active_player(active_player)

        policy_net = model if active_player == 0 else model2

        # Add a batch dimension; -1 avoids hard-coding the observation length.
        obs_tensor = torch.tensor(observation['observation'], dtype=torch.float32).reshape(1, -1)
        with torch.no_grad():
            predictions = policy_net.policy(obs_tensor).detach().numpy()[0]

        action_mask = torch.from_numpy(observation["action_mask"]).type(torch.float)
        sample, policy = modelselect.select_actions(predictions, info=action_mask, mask_actions=True)

        eval_env.step(sample)

    # env.rewards is a dict keyed by agent name.
    rewards_player_1.append(eval_env.rewards["player_1"])
    rewards_player_2.append(eval_env.rewards["player_0"])

# Close once, after all games, instead of closing an environment that is reused.
eval_env.close()

print("PLAYER 1 REW MEAN: ", np.mean(rewards_player_1))
print("PLAYER 1 REW STD: ", np.std(rewards_player_1))
print("PLAYER 2 REW MEAN: ", np.mean(rewards_player_2))
print("PLAYER 2 REW STD: ", np.std(rewards_player_2))

In [None]:
# Inspect the environment's current (observation, reward, termination, truncation, info)
# tuple for the agent whose turn it is.
modelselect.env.last()

In [None]:
# Close the environment once it is no longer needed.
modelselect.env.close()