In [None]:
from agent_configs import RainbowConfig
import gymnasium as gym
import torch
import random
import numpy as np
import torch
from utils import CategoricalCrossentropyLoss, KLDivergenceLoss
from utils.utils import HuberLoss
from cfr_utils import (
    EvalWrapper,
    evaluatebots,
    WrapperEnv,
    load_agents,
    EmptyConf,
    NFSPWrapper,
    NFSPEvalWrapper,
    LoadNFSPAgent,
)
import pyspiel
import copy
from agent_configs.cfr_config import CFRConfig
from active_player import ActivePlayer
from cfr_agent import CFRAgent
from cfr_network import CFRNetwork
import sys

sys.path.append("..")
from dqn.rainbow.rainbow_agent import RainbowAgent
import tensorflow as tf
import os
import open_spiel.python.algorithms.nfsp

# Give TensorFlow an empty list of visible GPU devices for this process.
tf.config.set_visible_devices([], "GPU")
num_players = 2
max_nodes = 10000000

# universal_poker parameters that are identical for both games built below.
_shared_poker_params = {
    "numPlayers": 2,
    "bettingAbstraction": "fcpa",
    "numRounds": 2,
    "blind": "50 100",
}

# Larger game: 4 suits x 13 ranks, 2 hole cards per player.
fhp = pyspiel.load_game(
    "universal_poker",
    dict(
        _shared_poker_params,
        numSuits=4,
        numRanks=13,
        numHoleCards=2,
        numBoardCards="0 3",
    ),
)

# Smaller game: 2 suits x 3 ranks, 1 hole card per player.
leduc = pyspiel.load_game(
    "universal_poker",
    dict(
        _shared_poker_params,
        numSuits=2,
        numRanks=3,
        numHoleCards=1,
        numBoardCards="0 1",
    ),
)

# Per-game size later passed as `state_representation_size`.
leducconfig = {"state_representation_size": 16}
fhpconfig = {"state_representation_size": 108}

# Project wrappers around the raw OpenSpiel games.
leducgame = NFSPWrapper(leduc)
fhpgame = NFSPWrapper(fhp)

active_player_obj = ActivePlayer(2)

# Hyperparameters are grouped by concern; the groups are merged below in the
# same key order as a single flat literal would have.
_network_params = {
    "dense_layer_widths": [128, 256, 256, 128],
    "value_hidden_layer_widths": [128, 128],
    "advantage_hidden_layer_widths": [128, 128],
}
_optimiser_params = {
    "adam_epsilon": 1e-8,
    "learning_rate": 0.002,
    "training_steps": 20000,
}
_priority_params = {
    "per_epsilon": 0.001,
    "per_alpha": 0,
    "per_beta": 0,
    "per_beta_final": 0.5,
}
_replay_params = {
    "minibatch_size": 256,
    "replay_buffer_size": 1000000,
    "min_replay_buffer_size": 256,
    "transfer_interval": 1024,
}
_update_params = {
    "loss_function": KLDivergenceLoss(),
    "clipnorm": 0.0,
    "discount_factor": 0.99,
    "replay_interval": 64,
}
_exploration_params = {
    "eg_epsilon": 1,
    "eg_epsilon_final": 0.0,
    "eg_epsilon_final_step": 5000,
    "eg_epsilon_decay_type": "linear",
}
config_dict = {
    **_network_params,
    **_optimiser_params,
    **_priority_params,
    **_replay_params,
    **_update_params,
    **_exploration_params,
    "num_minibatches": 4,
}

gameconfig = EmptyConf()
config = RainbowConfig(config_dict, gameconfig)
# Value bounds are set on the config object after construction.
config.v_min, config.v_max = -1200, 1200

# NOTE(review): the torch device is picked *before* CUDA_VISIBLE_DEVICES is set
# to "-1", so `device` can be "cuda:0" while CUDA is hidden from anything that
# initialises later -- confirm which of the two is intended.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
tf.compat.v1.disable_v2_behavior()


# Step-number directory of each saved NFSP checkpoint, per game.
_fhp_checkpoint_steps = (1000002, 4000001, 7000001, 8000000, 10000000)
_leduc_checkpoint_steps = (1000000, 4000001, 7000001, 8000000, 10000000)

fhp_agent_paths = [
    "./checkpoints/fhp/nfsp/0/" + str(step) + "/" for step in _fhp_checkpoint_steps
]
leduc_agent_paths = [
    "./checkpoints/leduc/nfsp/0/" + str(step) + "/"
    for step in _leduc_checkpoint_steps
]

# Individual names kept for code that refers to a single checkpoint directly.
mainpath1, mainpath2, mainpath3, mainpath4, mainpath5 = fhp_agent_paths
mainpath6, mainpath7, mainpath8, mainpath9, mainpath10 = leduc_agent_paths

# For every game and every saved NFSP checkpoint of that game: rebuild the NFSP
# agent in a fresh TF session, load its checkpoint, wrap game + agent with
# NFSPEvalWrapper, and train a RainbowAgent on the wrapped environment.
nodes = 0
games = [fhpgame, leducgame]
for game in games:
    # Select the checkpoint list, name tag and state size that belong to this game.
    if game == leducgame:
        agent_paths = leduc_agent_paths
        game_string = "leduc"
        state_size = leducconfig["state_representation_size"]
    else:
        agent_paths = fhp_agent_paths
        game_string = "fhp"
        state_size = fhpconfig["state_representation_size"]

    for number, agent_path in enumerate(agent_paths):
        game.reset()

        with tf.compat.v1.Session() as sess:
            agent = open_spiel.python.algorithms.nfsp.NFSP(
                session=sess,
                player_id=0,
                state_representation_size=state_size,
                num_actions=4,
                hidden_layers_sizes=[1024, 512, 1024, 512],
                reservoir_buffer_capacity=30000000,
                anticipatory_param=0,
                batch_size=256,
                rl_learning_rate=0.1,
                sl_learning_rate=0.01,
                min_buffer_size_to_learn=1000,
                learn_every=256,
                optimizer_str="sgd",
                replay_buffer_capacity=600000,
                epsilon_start=0.08,
                epsilon_end=0,
            )
            # Initialise variables BEFORE loading the checkpoint. Running the
            # global initializer after a restore resets every variable to its
            # initial value and discards the restored weights.
            sess.run(tf.compat.v1.global_variables_initializer())
            LoadNFSPAgent(agent_path, agent, 0)
            agent.restore(agent_path)

            wrapped = NFSPEvalWrapper(game, agent, state_size, 4)
            model_name = "Rainbow2_" + game_string + "_agent_" + str(number)
            evaluator = RainbowAgent(wrapped, config, name=model_name, device=device)
            evaluator.checkpoint_interval = 200

            # Dump the freshly constructed model's parameters before training.
            for param in evaluator.model.parameters():
                print(param)
            evaluator.train()

Using default save_intermediate_weights     : False
Using         training_steps                : 10000
Using         adam_epsilon                  : 1e-08
Using default momentum                      : 0.9
Using         learning_rate                 : 0.002
Using         clipnorm                      : 0.0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.HuberLoss object at 0x37ca44c50>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 32
Using         replay_buffer_size            : 100000
Using         min_replay_buffer_size        : 32
Using         num_minibatches               : 4
Using default training_iterations           : 1
Using default print_interval                : 100
RainbowConfig
Using default residual_layers               : []
Using defa

INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/1000002/q_network_pid0


INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/1000002/avg_network_pid0


INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/1000002/avg_network_pid0
2025-04-25 15:25:58.999927: W tensorflow/c/c_api.cc:305] Operation '{name:'mlp_2/weights_4_12/Assign' id:11368 op device:{requested: '', assigned: ''} def:{{{node mlp_2/weights_4_12/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](mlp_2/weights_4_12, zeros_239)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


num_actions:  4
float32
Parameter containing:
tensor([[-0.0210, -0.0107,  0.0669,  ...,  0.0763,  0.0327,  0.0693],
        [ 0.0704, -0.0636,  0.0076,  ...,  0.0831,  0.0285,  0.0331],
        [-0.0860,  0.0805, -0.0953,  ..., -0.0751, -0.0037, -0.0065],
        ...,
        [ 0.0430,  0.0038, -0.0745,  ...,  0.0873, -0.0270,  0.0783],
        [ 0.0844, -0.0118,  0.0522,  ...,  0.0861, -0.0417, -0.0047],
        [-0.0076, -0.0014,  0.0261,  ..., -0.0094,  0.0428, -0.0741]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0459, -0.0194, -0.0500,  0.0240, -0.0906,  0.0771, -0.0713,  0.0039,
        -0.0744, -0.0070,  0.0511, -0.0768, -0.0493,  0.0387,  0.0398,  0.0811,
         0.0700, -0.0361, -0.0367,  0.0608,  0.0863,  0.0697,  0.0632,  0.0266,
         0.0837,  0.0108,  0.0782,  0.0203,  0.0471, -0.0671,  0.0954, -0.0226,
        -0.0858, -0.0335,  0.0014, -0.0392,  0.0296, -0.0697, -0.0557, -0.0221,
        -0.0602,  0.0584,  0.0012, -0.0616, -0.0734, -0.0822, -0.0303, 

INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/4000001/q_network_pid0


INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/4000001/avg_network_pid0


INFO:tensorflow:Restoring parameters from ./checkpoints/fhp/nfsp/0/4000001/avg_network_pid0
2025-04-25 15:35:02.068903: W tensorflow/c/c_api.cc:305] Operation '{name:'mlp_2/weights_4_13/Assign' id:12318 op device:{requested: '', assigned: ''} def:{{{node mlp_2/weights_4_13/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](mlp_2/weights_4_13, zeros_259)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


num_actions:  4
float32
Parameter containing:
tensor([[-0.0945, -0.0543, -0.0041,  ...,  0.0148,  0.0190,  0.0957],
        [ 0.0186,  0.0478, -0.0467,  ..., -0.0549,  0.0369, -0.0765],
        [ 0.0908,  0.0347, -0.0447,  ..., -0.0317, -0.0721,  0.0171],
        ...,
        [-0.0881,  0.0395, -0.0657,  ..., -0.0833,  0.0429, -0.0590],
        [ 0.0161, -0.0201, -0.0690,  ..., -0.0248, -0.0124,  0.0098],
        [-0.0730,  0.0190, -0.0150,  ..., -0.0519, -0.0776, -0.0876]],
       requires_grad=True)
Parameter containing:
tensor([-0.0874,  0.0922,  0.0084,  0.0405,  0.0739, -0.0826, -0.0634,  0.0572,
         0.0172,  0.0479, -0.0913,  0.0333,  0.0767, -0.0315,  0.0070,  0.0538,
         0.0003, -0.0337,  0.0637,  0.0577, -0.0430, -0.0927, -0.0287,  0.0707,
         0.0333, -0.0910, -0.0787,  0.0558, -0.0243,  0.0158, -0.0546,  0.0739,
         0.0107, -0.0466,  0.0220,  0.0641, -0.0838, -0.0751, -0.0336,  0.0755,
         0.0787,  0.0224, -0.0193,  0.0385, -0.0107, -0.0325,  0.0398, 

KeyboardInterrupt: 