In [None]:
from pettingzoo.classic import texas_holdem_v4
import copy
from agent_configs.cfr_config import CFRConfig
from active_player import ActivePlayer
from cfr_agent import CFRAgent
import torch
from cfr_network import CFRNetwork
# Instantiate the Texas Hold'em environment from pettingzoo.classic.
game = texas_holdem_v4.env()

In [None]:
# --- Network and training hyperparameters for the CFR agent -----------------
hidden_dim = 128
input_dim = 72    # length of the observation vector fed to the networks
output_dim = 4    # number of network outputs (one per action)
num_players = 2
replay_buffer_size = 4000000
minibatch_size = 10000
steps_per_epoch = 3000
traversals = 3000
training_steps = 200
lr = 0.001
optimizer = None  # forwarded unchanged inside the network config

# Shared architecture description. NOTE: the very same dict object is passed
# for both the 'policy' and the 'value' entry below.
p_v_networks = {
    'input_shape': input_dim,
    'output_shape': output_dim,
    'hidden_size': hidden_dim,
    'learning_rate': lr,
    'optimizer': optimizer,
}
active_player_obj = ActivePlayer(num_players)
config = CFRConfig(
    config_dict={
        'network': {
            'policy': p_v_networks,
            'value': p_v_networks,
            'num_players': num_players,
        },
        'replay_buffer_size': replay_buffer_size,
        'minibatch_size': minibatch_size,
        'steps_per_epoch': steps_per_epoch,
        'traversals': traversals,
        'training_steps': training_steps,
        'active_player_obj': active_player_obj,
    },
    # Derived from input_dim / output_dim (previously hard-coded as 72 and 4)
    # so the game description cannot drift away from the network shapes.
    game_config={
        'num_players': num_players,
        'observation_space': input_dim,
        'action_space': output_dim,
    },
)

In [3]:
# Train one CFR agent per sampling scheme, each on a freshly created environment.
sampling = ["MC", "Full"]
for sampling_method in sampling:
    game = texas_holdem_v4.env()
    model = CFRAgent(
        env=game,
        config=config,
        name="CFR_TXAS_HOLD_EM",
    )
    model.train(sampling=sampling_method)


Iteration 123 done
Iteration 124 done
Iteration 125 done
Iteration 126 done
Iteration 127 done
Iteration 128 done
Iteration 129 done
Iteration 130 done
Iteration 131 done
Iteration 132 done
Iteration 133 done
Iteration 134 done


KeyboardInterrupt: 

In [None]:
def _policy_agent_from_state(policy_state):
    """Build a CFRNetwork and load `policy_state` into its policy sub-network."""
    network = CFRNetwork(
        config={'policy': p_v_networks, 'value': p_v_networks, 'num_players': num_players}
    )
    network.policy.load_state_dict(policy_state)
    return network


# Saved policy weights: two checkpoints each from the 'linear' and 'notlinear' runs.
# NOTE(review): torch.load may unpickle arbitrary objects - only load trusted files.
agent1_state = torch.load('checkpoints/policy/linear/38063/1745086660.608601.pt')
agent2_state = torch.load('checkpoints/policy/linear/233694/1745086660.608601.pt')
agent3_state = torch.load('checkpoints/policy/notlinear/38063/1745086660.608601.pt')
agent4_state = torch.load('checkpoints/policy/notlinear/233694/1745086660.608601.pt')

# One network per checkpoint, built in the same order as the states above.
agent1 = _policy_agent_from_state(agent1_state)
agent2 = _policy_agent_from_state(agent2_state)
agent3 = _policy_agent_from_state(agent3_state)
agent4 = _policy_agent_from_state(agent4_state)


In [None]:
# Switch every loaded policy network to evaluation mode.
for _loaded_agent in (agent1, agent2, agent3, agent4):
    _loaded_agent.policy.eval()

In [None]:
# Ordered list of the four loaded networks; the evaluation cell indexes into it.
agents = [agent1, agent2, agent3, agent4]

In [None]:
def evaluatebots(agent1, agent2, num_of_eval_games, verbose=False):
    """Play two policy networks against each other and collect per-game rewards.

    A ``CFRAgent`` built on the notebook-level ``game`` and ``config`` is used
    only as a driver: it supplies the environment, the active-player tracker
    and ``select_actions``. ``agent1.policy`` acts whenever the tracker reports
    player 0, ``agent2.policy`` otherwise.

    Args:
        agent1: object whose ``.policy`` is called on a ``(1, obs_len)`` float
            tensor for player 0.
        agent2: same as ``agent1``, used for the other player.
        num_of_eval_games: number of games to play.
        verbose: when True, print mean/std of both reward lists before
            returning (these prints were unreachable after ``return`` before).

    Returns:
        ``(rewards_player_1, rewards_player_2)``: two lists with one entry per
        game, read from ``env.rewards["player_0"]`` and
        ``env.rewards["player_1"]`` once the game has ended.
    """
    import numpy as np  # local import: the notebook only imports numpy in a later cell

    modelselect = CFRAgent(env=game, config=config)
    rewards_player_1 = []
    rewards_player_2 = []
    for _ in range(num_of_eval_games):
        # Fresh random seed per game, passed to reset so a game can be recreated.
        random_seed = int(np.random.randint(0, 2**32 - 1, dtype=np.int64))

        # Reset BEFORE reading env.last(). The old code read last() first, so
        # from the second game on it saw the previous game's termination=True,
        # skipped the play loop and recorded the post-reset rewards.
        modelselect.env.reset(seed=random_seed)
        active_player = modelselect.env.agent_selection[-1]
        modelselect.active_player_obj.set_active_player(int(active_player))

        while True:
            # Current state as seen by the agent that is about to act.
            observation, reward, termination, truncation, infos = modelselect.env.last()
            if termination or truncation:
                break
            active_player = modelselect.active_player_obj.get_active_player()
            policy_net = agent1.policy if active_player == 0 else agent2.policy

            # reshape(1, -1) instead of the former hard-coded (1, 36): the
            # notebook config declares a 72-long observation, which (1, 36)
            # cannot hold, while -1 works for any observation length.
            obs_tensor = torch.tensor(observation['observation'], dtype=torch.float32).reshape(1, -1)
            predictions = policy_net(obs_tensor).detach().numpy()[0]
            action_mask = torch.from_numpy(observation["action_mask"]).type(torch.float)
            sample, policy = modelselect.select_actions(predictions, info=action_mask, mask_actions=True)

            modelselect.env.step(sample)
            modelselect.active_player_obj.next()

        rewards_player_1.append(modelselect.env.rewards["player_0"])
        rewards_player_2.append(modelselect.env.rewards["player_1"])
        modelselect.env.close()

    if verbose:
        print("PLAYER 1 REW MEAN: ", np.mean(rewards_player_1))
        print("PLAYER 1 REW STD: ", np.std(rewards_player_1))
        print("PLAYER 2 REW MEAN: ", np.mean(rewards_player_2))
        print("PLAYER 2 REW STD: ", np.std(rewards_player_2))
    return rewards_player_1, rewards_player_2

In [None]:
import numpy as np
# Round-robin evaluation: entry [row, col] is the mean reward of agents[row]
# (first argument of evaluatebots) when playing against agents[col].
results = np.zeros((4, 4))
for row in range(4):
    for col in range(4):
        rewards_player_1, rewards_player_2 = evaluatebots(agents[row], agents[col], 100000)
        results[row, col] = np.mean(rewards_player_1)

print("RESULTS: ")
# Render the result matrix as an annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
agent_labels = ["agent1", "agent2", "agent3", "agent4"]
plt.figure(figsize=(10, 8))
sns.heatmap(
    results,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    xticklabels=agent_labels,
    yticklabels=agent_labels,
)
plt.title("Results")
plt.xlabel("Agent 2")
plt.ylabel("Agent 1")
plt.show()

In [None]:
# Persist the result matrix as a labelled CSV (row labels are written too).
import pandas as pd
agent_names = ["agent1", "agent2", "agent3", "agent4"]
df = pd.DataFrame(results, index=agent_names, columns=agent_names)
df.to_csv("results.csv", index=True)

In [None]:
# Read the saved result matrix back (first CSV column is the row index),
# print it and draw the same annotated heatmap from it.
import pandas as pd
df = pd.read_csv("results.csv", index_col=0)
print(df)
import matplotlib.pyplot as plt
import seaborn as sns
heatmap_labels = ["agent1", "agent2", "agent3", "agent4"]
plt.figure(figsize=(10, 8))
sns.heatmap(
    df,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    xticklabels=heatmap_labels,
    yticklabels=heatmap_labels,
)
plt.title("Results")
plt.xlabel("Agent 2")
plt.ylabel("Agent 1")
plt.show()

In [1]:
import pyspiel
import open_spiel
import numpy as np

import open_spiel.python
import open_spiel.python.algorithms
import open_spiel.python.algorithms.nfsp
import tensorflow as tf
with tf.compat.v1.Session() as sess:
    # Two-player universal_poker: 4 suits x 13 ranks, 2 hole cards,
    # 2 rounds with 0 and 3 board cards, "fcpa" betting abstraction.
    game = pyspiel.load_game(
        "universal_poker",
        {
            "numPlayers": 2,
            "numSuits": 4,
            "numRanks": 13,
            "numHoleCards": 2,
            "numBoardCards": "0 3",
            "bettingAbstraction": "fcpa",
            "numRounds": 2,
        },
    )

    # Build a single NFSP agent only so that its variables can be counted.
    agent = open_spiel.python.algorithms.nfsp.NFSP(
        session=sess,
        player_id=0,
        state_representation_size=108,
        num_actions=4,
        hidden_layers_sizes=[128, 128, 128],
        reservoir_buffer_capacity=4000000,
        anticipatory_param=0.2,
        batch_size=10000,
        rl_learning_rate=0.001,
        sl_learning_rate=0.001,
        min_buffer_size_to_learn=1000,
        learn_every=64,
        optimizer_str="sgd",
    )

    # Total scalar count = sum over trainable variables of the product of
    # their shape dimensions (a variable with no dimensions counts as 1).
    total_parameters = sum(
        int(np.prod([dim.value for dim in variable.get_shape()]))
        for variable in tf.compat.v1.trainable_variables()
    )
    print("Total parameters:", total_parameters)


Instructions for updating:
non-resource variables are not supported in the long term


2025-04-21 15:50:41.554086: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-04-21 15:50:41.554423: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-21 15:50:41.554461: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-04-21 15:50:41.554916: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-21 15:50:41.555318: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-04-21 15:50:41.807567: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
2025-04-21 15:50:41.841221: I tensorflow/core/grappler

Total parameters: 142476


In [10]:
import pyspiel
import open_spiel
import numpy as np
# Load universal_poker: 2 players, 4 suits x 13 ranks, 2 hole cards,
# 2 rounds with 0 and 3 board cards, and the "fcpa" betting abstraction.
game = pyspiel.load_game("universal_poker", {"numPlayers":2, "numSuits": 4, "numRanks":13, "numHoleCards": 2, "numBoardCards": "0 3", "bettingAbstraction": "fcpa", "numRounds":2})
import copy
class WrapperEnv:
    """Adapter exposing a pyspiel-style game through reset/step/last calls.

    Chance nodes are never handed to the caller: they are played out
    internally by drawing from the state's legal actions with
    ``np.random.choice``.

    Attributes:
        game: the wrapped game object (must provide ``new_initial_state()``).
        state: the current game state.
        agent_selection: id of the player to act, stored as a string.
        traverser: optional player id; when set, ``obs`` reports that
            player's ``player_reward`` instead of the acting player's
            ``player_return``.
    """

    def __init__(self,game):
        self.game = game
        self.state = game.new_initial_state()
        self.agent_selection = str(self.state.current_player())
        self.traverser = None

    def _sample_chance_outcomes(self):
        """Apply randomly drawn legal actions until the state is no chance node."""
        while self.state.is_chance_node():
            self.state.apply_action(np.random.choice(self.state.legal_actions()))

    def reset(self, seed=None):
        """Start a new game and return its first observation tuple.

        ``seed`` is accepted for interface compatibility but is not used.
        """
        self.state = self.game.new_initial_state()
        self._sample_chance_outcomes()
        self.agent_selection = str(self.state.current_player())
        return self.obs()

    def step(self, action):
        """Advance the game and return the resulting observation tuple.

        If the state is currently a chance node, ``action`` is ignored and
        only chance outcomes are sampled; otherwise ``action`` is applied and
        any chance nodes that follow are sampled. ``agent_selection`` is only
        refreshed while the game is still running, so at a terminal state the
        observation is taken from the last acting player's seat.
        """
        if not self.state.is_chance_node():
            self.state.apply_action(action)
        self._sample_chance_outcomes()

        if not self.state.is_terminal():
            self.agent_selection = str(self.state.current_player())
        return self.obs()

    def last(self):
        """Return the observation tuple for the current state without advancing."""
        return self.obs()

    def obs(self):
        """Build ``(observation, reward, terminal, False, "OPENSPIEL")``.

        ``observation`` is a dict with the selected player's
        ``observation_tensor`` under "observation" and the stacked
        ``legal_actions_mask`` under "action_mask".
        """
        seat = int(self.agent_selection)
        observation = {
            "observation": self.state.observation_tensor(seat),
            "action_mask": np.stack(self.state.legal_actions_mask(seat)),
        }
        if self.traverser is not None:
            reward = self.state.player_reward(self.traverser)
        else:
            reward = self.state.player_return(seat)
        return observation, reward, self.state.is_terminal(), False, "OPENSPIEL"

# Wrap the loaded game so it can be driven through reset/step/last.
env = WrapperEnv(game)

In [13]:
# Debug check: inspect the Python type returned by observation_tensor for player 0.
type(env.state.observation_tensor(0))

list

In [2]:
class NFSPWrapper:
    """Wraps a pyspiel-style game with the attributes the NFSP training loop reads.

    Attributes:
        game: the wrapped game object (the ``env`` constructor argument).
        state: the current game state.
        observations: dict with per-player lists under "info_state" and
            "legal_actions"; entries start as 0 and are overwritten by ``obs``.
        agent_selection: id of the player to act, stored as a string.
        traverser: optional player id used to pick the reward reported by ``obs``.
        rewards: latest per-player rewards recorded by ``obs``.
    """

    def __init__(self,env):
        self.game = env
        self.state = self.game.new_initial_state()
        player_count = self.state.num_players()
        self.observations = {
            "info_state": [0] * player_count,
            "legal_actions": [0] * player_count,
        }
        self.agent_selection = str(self.state.current_player())
        self.traverser = None
        self.rewards = [0] * player_count

    def is_simultaneous_move(self):
        """Return whether the current state is a simultaneous-move node."""
        return self.state.is_simultaneous_node()

    def last(self):
        """Return True once the current state is terminal."""
        return self.state.is_terminal()

    def current_player(self):
        """Return the id of the player to act in the current state."""
        return self.state.current_player()

    def _sample_chance_outcomes(self):
        """Apply randomly drawn legal actions until the state is no chance node."""
        while self.state.is_chance_node():
            self.state.apply_action(np.random.choice(self.state.legal_actions()))

    def step(self, action):
        """Advance the game by ``action`` and sample any chance nodes.

        If the state is currently a chance node, ``action`` is ignored.
        Returns the ``obs()`` tuple while the game is running and ``None``
        once the state is terminal (in which case ``obs`` is not called, so
        ``rewards``/``observations`` keep their previous values).
        """
        if not self.state.is_chance_node():
            self.state.apply_action(action)
        self._sample_chance_outcomes()

        if self.state.is_terminal():
            return None
        self.agent_selection = str(self.state.current_player())
        return self.obs()

    def reset(self, seed=None):
        """Start a new game and return its first observation tuple.

        ``seed`` is accepted for interface compatibility but is not used.
        """
        self.state = self.game.new_initial_state()
        self._sample_chance_outcomes()
        self.agent_selection = str(self.state.current_player())
        return self.obs()

    def obs(self):
        """Refresh the cached per-player data and build the observation tuple.

        Returns ``(observation, reward, terminal, False, "OPENSPIEL")`` where
        "action_mask" holds the stacked list of legal action ids (not a 0/1
        mask) for the selected player.
        """
        state = self.state
        if not state.is_terminal():
            mover = state.current_player()
            self.observations["info_state"][mover] = state.observation_tensor(mover)
            self.observations["legal_actions"][mover] = np.stack(state.legal_actions(mover))

        if state.is_chance_node():
            self.rewards = [0] * state.num_players()
        else:
            self.rewards = state.rewards()

        seat = int(self.agent_selection)
        observation = {
            "observation": state.observation_tensor(seat),
            "action_mask": np.stack(state.legal_actions(seat)),
        }
        if self.traverser is not None:
            reward = state.player_reward(self.traverser)
        else:
            reward = state.player_return(seat)
        return observation, reward, state.is_terminal(), False, "OPENSPIEL"

# Wrap the loaded game for use with the NFSP training loop.
newenv = NFSPWrapper(game)

In [7]:
import os
import copy
def train(agents, env, max_nodes):
    """Run self-play episodes until more than ``max_nodes`` decisions were made.

    Each episode resets ``env`` and, until ``env.last()`` is true, asks the
    agent of the current player for an action (passing it a deep copy of the
    environment) and applies that action. The running node count is printed
    after every episode. Whenever the count reaches the next 10% fraction of
    ``max_nodes`` a checkpoint of every agent is written to
    ``checkpoints/nfsp/<nodes>``.

    Args:
        agents: sequence indexed by player id; each item provides
            ``step(env_copy) -> (action, probs)`` and ``save(directory)``.
        env: environment providing ``reset()``, ``last()``,
            ``current_player()`` and ``step(action)``.
        max_nodes: node budget; the loop stops after the first episode that
            pushes the count above it.

    NOTE(review): agents are never stepped on the terminal state here.
    OpenSpiel's reference NFSP loop performs a final ``agent.step`` per agent
    at episode end - confirm whether that is needed with this wrapper.
    """
    nodes = 0
    checkpoint = 0.1  # next fraction of max_nodes that triggers a save
    while nodes <= max_nodes:
        env.reset()
        while not env.last():
            current_player = env.current_player()
            action, probs = agents[current_player].step(copy.deepcopy(env))
            env.step(action)
            nodes += 1
        print("Nodes:", nodes)
        if nodes >= checkpoint * max_nodes:
            print("Checkpoint reached: ", checkpoint)
            checkpoint += 0.1
            checkpoint_dir = "checkpoints/nfsp/" + str(nodes)
            os.makedirs(checkpoint_dir, exist_ok=True)
            # Save the agents that were actually trained. The previous code
            # called `agent.save(...)` on a global left over from another
            # cell instead of using the `agents` argument.
            for trained_agent in agents:
                trained_agent.save(checkpoint_dir)

In [8]:
import os
# Hide CUDA devices; set ahead of the TensorFlow import below.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

# Run TensorFlow in v1 (graph/session) mode.
tf.compat.v1.disable_v2_behavior()
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
import pyspiel
import open_spiel
import numpy as np
# Make no GPU visible to TensorFlow for this run.
tf.config.set_visible_devices([], 'GPU')
import open_spiel.python
import open_spiel.python.algorithms
import open_spiel.python.algorithms.nfsp

num_players = 2
max_nodes = 4715656
nodes = 0
with tf.compat.v1.Session() as sess:
    # One NFSP agent per seat, all sharing the same session and hyperparameters.
    agents = [
        open_spiel.python.algorithms.nfsp.NFSP(
            session=sess,
            player_id=seat,
            state_representation_size=108,
            num_actions=4,
            hidden_layers_sizes=[1024, 512, 1024, 512],
            reservoir_buffer_capacity=30000000,
            anticipatory_param=0.1,
            batch_size=256,
            rl_learning_rate=0.1,
            sl_learning_rate=0.01,
            min_buffer_size_to_learn=1000,
            learn_every=256,
            optimizer_str="sgd",
            replay_buffer_capacity=600000,
            epsilon_start=0.08,
            epsilon_end=0,
        )
        for seat in range(num_players)
    ]
    sess.run(tf.compat.v1.global_variables_initializer())

    # Same universal_poker setup as used for the wrappers above in the notebook.
    game = pyspiel.load_game(
        "universal_poker",
        {
            "numPlayers": 2,
            "numSuits": 4,
            "numRanks": 13,
            "numHoleCards": 2,
            "numBoardCards": "0 3",
            "bettingAbstraction": "fcpa",
            "numRounds": 2,
        },
    )
    newenv = NFSPWrapper(game)
    newenv.reset()
    train(agents, newenv, max_nodes)
    



Num GPUs Available:  1


2025-04-21 18:33:26.345484: W tensorflow/c/c_api.cc:305] Operation '{name:'mlp_16/bias_4/Assign' id:3955 op device:{requested: '', assigned: ''} def:{{{node mlp_16/bias_4/Assign}} = Assign[T=DT_FLOAT, _class=["loc:@mlp_16/bias_4"], _has_manual_control_dependencies=true, use_locking=true, validate_shape=true](mlp_16/bias_4, mlp_16/zeros_4)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Nodes: 3
Nodes: 5
Nodes: 8
Nodes: 11
Nodes: 14
Nodes: 17
Nodes: 20
Nodes: 23
Nodes: 26
Nodes: 28
Nodes: 31
Nodes: 33
Nodes: 35
Nodes: 37
Nodes: 39
Nodes: 41
Nodes: 43
Nodes: 46
Nodes: 48
Nodes: 51
Nodes: 54
Nodes: 56
Nodes: 58
Nodes: 60
Nodes: 62
Nodes: 65
Nodes: 68
Nodes: 71
Nodes: 74
Nodes: 76
Nodes: 79
Nodes: 82
Nodes: 85
Nodes: 88
Nodes: 90
Nodes: 93
Nodes: 95
Nodes: 97
Nodes: 100
Nodes: 103
Nodes: 106
Nodes: 109
Nodes: 112
Nodes: 114
Nodes: 117
Nodes: 120
Nodes: 123
Nodes: 126
Nodes: 129
Nodes: 132
Nodes: 134
Nodes: 136
Nodes: 139
Nodes: 142
Nodes: 145
Nodes: 147
Nodes: 150
Nodes: 152
Nodes: 155
Nodes: 158
Nodes: 161
Nodes: 164
Nodes: 167
Nodes: 170
Nodes: 172
Nodes: 175
Nodes: 178
Nodes: 180
Nodes: 182
Nodes: 184
Nodes: 186
Nodes: 189
Nodes: 191
Nodes: 193
Nodes: 196
Nodes: 198
Nodes: 201
Nodes: 203
Nodes: 205
Nodes: 208
Nodes: 211
Nodes: 213
Nodes: 215
Nodes: 218
Nodes: 221
Nodes: 224
Nodes: 227
Nodes: 229
Nodes: 232
Nodes: 234
Nodes: 236
Nodes: 238
Nodes: 240
Nodes: 242
Nodes: 

KeyboardInterrupt: 

In [51]:
# Debug check: print the legal-action mask of the wrapper's current state.
print(newenv.state.legal_actions_mask())

[0, 1, 1, 1]
