In [None]:
from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig, RainbowConfig
from game_configs import MississippiMarblesConfig, LeducHoldemConfig, TicTacToeConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters handed to NFSPDQNConfig below.
# NOTE(review): judging by the key names, the "sl_"-prefixed entries configure the
# supervised-learning network/buffer and the unprefixed network/replay entries the
# RL (DQN/Rainbow) side -- confirm against NFSPDQNConfig.
config_dict = {
    # --- run-level settings ---
    "shared_networks_and_buffers": True,
    "training_steps": 500000,  # like 2-5M in the paper (1M for initial test and see if rainbow is faster sooner)
    "num_players": 2,
    "anticipatory_param": 0.1,
    "replay_interval": 128,  # 128
    "num_minibatches": 2,  # 2 # 4
    # --- RL network: optimisation ---
    "activation": "relu",
    "kernel_initializer": "he_normal",
    "learning_rate": 0.025,  # maybe should be lower for distributional, loss magnitudes are similar but there are many more weights for distributions (from atom size)? testing alpha/2 and ... # DQN / 4 # 0.1 for DQN so 0.025 for Rainbow (initial test used 0.1 for rainbow)
    "clipnorm": 1.0,
    "optimizer": Adam,  # SGD
    "loss_function": KLDivergenceLoss(),
    # --- RL replay buffer (the per_* keys presumably mean prioritized experience replay -- TODO confirm) ---
    "per_alpha": 0.6,
    "per_beta": 0.5,
    "per_epsilon": 1e-3,
    "training_iterations": 1,  # 1
    "min_replay_buffer_size": 128,
    "minibatch_size": 128,
    "replay_buffer_size": 50000,  # 200000
    "transfer_interval": 300,  # 100
    "n_step": 3,  ##########
    "atom_size": 51,
    # --- RL network: architecture ---
    # Previously tried layout, kept for reference:
    # "conv_layers": [(64, 3, 1), (32, 2, 1)],
    # "dense_layer_widths": [],
    # "value_hidden_layer_widths": [],
    # "advantage_hidden_layer_widths": [],
    # NOTE(review): each conv tuple is presumably (filters, kernel_size, stride) -- TODO confirm.
    "conv_layers": [(64, 3, 1), (64, 2, 1)],
    "dense_layer_widths": [],
    "dueling": True,
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    # --- exploration ---
    "noisy_sigma": 0.12,  ##########
    "eg_epsilon": 0.06,  # 0
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- "sl_" network: optimisation and buffer ---
    "sl_activation": "relu",
    "sl_kernel_initializer": "he_normal",
    "sl_learning_rate": 0.005,  # 0.00005
    "sl_clipnorm": 1.0,
    "sl_optimizer": Adam,  # SGD
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_training_iterations": 1,  # 1
    "sl_min_replay_buffer_size": 128,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 1000000,  # 2000000
    # --- "sl_" network: architecture ---
    # Previously tried layout, kept for reference:
    # "sl_conv_layers": [(64, 3, 1), (32, 2, 1)],
    # "sl_dense_layer_widths": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
}
# Alternative construction kept for reference (Mississippi Marbles with an explicit RL config type):
# config = NFSPConfig(config_dict=config_dict, game_config=MississippiMarblesConfig(), rl_config_type=RainbowConfig)
# Build the agent configuration from the dictionary above, paired with the Tic-Tac-Toe game config.
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=TicTacToeConfig(),
)
# config = NFSPDQNConfig(config_dict=config_dict, game_config=TicTacToeConfig(), rl_config_type=RainbowConfig)

In [None]:
from pathlib import Path

# Run bookkeeping -- TODO: 8, 9, 10, 11, 12 / DONE: 14, 13
dir = "./checkpoints/bad_test_func/NFSPDQN-LeducHoldem-8"

# Rebind `config` to the configuration stored inside that run's checkpoint directory.
# NOTE(review): this overwrites any `config` built earlier in the notebook.
saved_config_path = Path(dir) / "configs/config.yaml"
config = NFSPDQNConfig.load(saved_config_path)

In [None]:
import custom_gym_envs
import gymnasium as gym

# Alternative environments tried previously, kept for reference:
# env = gym.make('custom_gym_envs/MississippiMarbles-v0', render_mode="human", players=2)
# env = gym.make('custom_gym_envs/TicTacToe-v0', render_mode="rgb_array", player_turn_as_plane=False)

# Tic-Tac-Toe environment rendered to RGB arrays, with the player turn encoded in the observation.
tictactoe_options = {"render_mode": "rgb_array", "encode_player_turn": True}
env = gym.make("custom_gym_envs/TicTacToe-v0", **tictactoe_options)

# Agent built on that environment from whichever `config` is currently bound.
agent = NFSPDQN(env, config, name="NFSP-TicTacToe", device="cpu")

In [None]:
# Checkpoint directory and the numeric checkpoint id restored into `agent`.
# NOTE(review): the second argument looks like a training step -- confirm against load_from_checkpoint.
dir = "./checkpoints/NFSPDQN-LeducHoldem"
checkpoint_step = 583000
agent.load_from_checkpoint(dir, checkpoint_step)

In [6]:
# Override the agent's checkpoint-related attributes, then start training.
checkpoint_overrides = {"checkpoint_trials": 100, "checkpoint_interval": 500}
for attribute_name, attribute_value in checkpoint_overrides.items():
    setattr(agent, attribute_name, attribute_value)

agent.config.save_intermediate_weights = True
agent.train()

Trial  0
Prediction tensor([[0.0717, 0.0052, 0.2022, 0.0075, 0.3966, 0.0139, 0.2522, 0.0085, 0.0421]])
Prediction tensor([[0.5401, 0.2361, 0.4697, 0.5437, 0.7020, 0.3445, 0.5548, 0.4340, 0.4882]])
Prediction tensor([[0.2356, 0.0114, 0.1276, 0.0000, 0.0000, 0.2200, 0.2389, 0.0347, 0.1318]])
Prediction tensor([[-0.8445, -0.6851, -0.7433, -0.5714, -0.8644, -0.7232, -0.7098, -0.5209,
         -0.5215]])
Prediction tensor([[0.2123, 0.0726, 0.3360, 0.0000, 0.0000, 0.1522, 0.2270, 0.0000, 0.0000]])
Prediction tensor([[ 0.2650,  0.9944,  0.2098,  0.0902,  0.0024, -0.0961,  0.7773, -0.6895,
          0.3458]])
Trial  1
Prediction tensor([[0.0717, 0.0052, 0.2022, 0.0075, 0.3966, 0.0139, 0.2522, 0.0085, 0.0421]])
Prediction tensor([[0.5401, 0.2361, 0.4697, 0.5437, 0.7020, 0.3445, 0.5548, 0.4340, 0.4882]])
Prediction tensor([[0.3498, 0.0036, 0.0112, 0.0394, 0.0000, 0.0024, 0.0000, 0.0473, 0.5462]])
Prediction tensor([[-0.6991, -0.5690, -0.5929, -0.4508, -0.7398, -0.6074, -0.5324, -0.4454,
        

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_46000/videos/self.model_name_0/46000/NFSP-TicTacToe-episode-4574.mp4
-0.28
Trial  0
Prediction tensor([[0.6040, 0.3619, 0.6467, 0.4750, 0.6597, 0.3672, 0.7268, 0.4600, 0.4459]])
Prediction tensor([[0.1827, 0.0043, 0.1057, 0.0246, 0.5844, 0.0204, 0.0000, 0.0344, 0.0435]])
Prediction tensor([[0.6150, 0.2615, 0.5044, 0.5232, 0.7042, 0.2499, 0.5322, 0.3575, 0.5624]])
Prediction tensor([[0.0000, 0.0863, 0.1500, 0.0880, 0.0000, 0.2885, 0.0000, 0.2559, 0.1313]])
Prediction tensor([[ 0.4466, -0.6930,  1.0000,  0.0040,  0.9546, -0.1280,  0.5237,  0.2716,
          0.3072]])
Trial  1
Prediction tensor([[0.6040, 0.3619, 0.6467, 0.4750, 0.6597, 0.3672, 0.7268, 0.4600, 0.4459]])
Prediction tensor([[0.1827, 0.0043, 0.1057, 0.0246, 0.5844, 0.0204, 0.0000, 0.0344, 0.0435]])
Prediction tensor([[0.6108, 0.2598, 0.5024, 0.5240, 0.7041, 0.2551, 0.5336, 0.3619, 0.5585]])
Prediction tensor([[0.0000, 0.0914, 0.2262, 0.2648, 0.0000, 0.0878

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_46000/videos/self.model_name_1/46000/NFSP-TicTacToe-episode-4599.mp4
0.72
0.43999999999999995


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_46500/videos/self.model_name_0/46500/NFSP-TicTacToe-episode-4624.mp4
-0.12
Trial  0
Prediction tensor([[0.5605, 0.0746, 0.6811, 0.4847, 0.5771, 0.3783, 0.7021, 0.4448, 0.5286]])
Prediction tensor([[0.1613, 0.0053, 0.1017, 0.0238, 0.5945, 0.0238, 0.0000, 0.0363, 0.0534]])
Prediction tensor([[0.6391, 0.0718, 0.4339, 0.4080, 0.5646, 0.1333, 0.4531, 0.2675, 0.5558]])
Prediction tensor([[0.0000, 0.0955, 0.1955, 0.2679, 0.0000, 0.0833, 0.0000, 0.1347, 0.2231]])
Prediction tensor([[ 1.0000, -0.1978, -0.7232,  0.9997,  0.2877, -0.3008,  0.9993, -0.3677,
         -0.8058]])
Trial  1
Prediction tensor([[0.5605, 0.0746, 0.6811, 0.4847, 0.5771, 0.3783, 0.7021, 0.4448, 0.5286]])
Prediction tensor([[0.1613, 0.0053, 0.1017, 0.0238, 0.5945, 0.0238, 0.0000, 0.0363, 0.0534]])
Prediction tensor([[0.6391, 0.0718, 0.4339, 0.4080, 0.5646, 0.1333, 0.4531, 0.2675, 0.5558]])
Prediction tensor([[0.0000, 0.0955, 0.1955, 0.2679, 0.0000, 0.0833

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_46500/videos/self.model_name_1/46500/NFSP-TicTacToe-episode-4649.mp4
0.6
0.48


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_47000/videos/self.model_name_0/47000/NFSP-TicTacToe-episode-4674.mp4
-0.68
Trial  0
Prediction tensor([[0.5537, 0.2423, 0.7687, 0.6466, 0.6503, 0.3753, 0.7056, 0.3999, 0.4659]])
Prediction tensor([[0.1506, 0.0076, 0.0000, 0.0699, 0.4852, 0.0500, 0.0701, 0.0863, 0.0803]])
Prediction tensor([[0.6333, 0.1236, 0.6911, 0.5637, 0.5895, 0.2677, 0.4078, 0.1602, 0.6536]])
Prediction tensor([[0.0000, 0.2539, 0.0000, 0.1356, 0.3913, 0.0506, 0.0969, 0.0717, 0.0000]])
Prediction tensor([[ 0.9796,  0.9958,  0.9967, -0.1959, -0.5506, -0.3449, -0.2187, -0.5696,
         -0.0486]])
Trial  1
Prediction tensor([[0.5537, 0.2423, 0.7687, 0.6466, 0.6503, 0.3753, 0.7056, 0.3999, 0.4659]])
Prediction tensor([[0.1506, 0.0076, 0.0000, 0.0699, 0.4852, 0.0500, 0.0701, 0.0863, 0.0803]])
Prediction tensor([[ 0.3829, -0.0917,  0.9156,  0.4710,  0.5891,  0.8618,  0.3340,  0.0628,
          0.8076]])
Prediction tensor([[0.0000, 0.0428, 0.0000, 0.06

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_47000/videos/self.model_name_1/47000/NFSP-TicTacToe-episode-4699.mp4
0.84
0.15999999999999992


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_47500/videos/self.model_name_0/47500/NFSP-TicTacToe-episode-4724.mp4
-0.76
Trial  0
Prediction tensor([[0.4763, 0.4593, 0.7721, 0.6509, 0.6432, 0.2357, 0.6749, 0.4889, 0.6874]])
Prediction tensor([[0.1531, 0.0107, 0.0000, 0.0756, 0.4906, 0.0509, 0.0576, 0.0799, 0.0816]])
Prediction tensor([[ 0.5846,  0.4247,  0.7297,  0.4631,  0.5790,  0.5971,  0.2626, -0.0238,
          0.7836]])
Prediction tensor([[0.0717, 0.1733, 0.0000, 0.0000, 0.4172, 0.0853, 0.2003, 0.0522, 0.0000]])
Prediction tensor([[-0.2737, -0.5108,  0.9987,  0.0600,  0.5688,  0.9996, -0.2788, -0.9110,
          0.9729]])
Trial  1
Prediction tensor([[0.4763, 0.4593, 0.7721, 0.6509, 0.6432, 0.2357, 0.6749, 0.4889, 0.6874]])
Prediction tensor([[0.1531, 0.0107, 0.0000, 0.0756, 0.4906, 0.0509, 0.0576, 0.0799, 0.0816]])
Prediction tensor([[ 0.5846,  0.4247,  0.7297,  0.4631,  0.5790,  0.5971,  0.2626, -0.0238,
          0.7836]])
Prediction tensor([[0.1082, 0.

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_47500/videos/self.model_name_1/47500/NFSP-TicTacToe-episode-4749.mp4
0.96
0.19999999999999996


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_48000/videos/self.model_name_0/48000/NFSP-TicTacToe-episode-4774.mp4
-0.28
Trial  0
Prediction tensor([[0.6466, 0.4099, 0.7447, 0.3849, 0.5877, 0.3462, 0.6946, 0.6557, 0.4853]])
Prediction tensor([[0.1627, 0.0061, 0.0000, 0.0639, 0.5001, 0.0481, 0.0653, 0.0804, 0.0734]])
Prediction tensor([[ 0.7851,  0.4315,  0.3189, -0.1778,  0.5626,  0.1298,  0.2587,  0.3067,
          0.6802]])
Prediction tensor([[0.0000, 0.1794, 0.0000, 0.1005, 0.1203, 0.0450, 0.4341, 0.0000, 0.1208]])
Prediction tensor([[ 0.9999,  1.0000,  0.9997, -0.5428, -0.1061, -0.6894, -0.2003, -0.1045,
         -0.2133]])
Trial  1
Prediction tensor([[0.6466, 0.4099, 0.7447, 0.3849, 0.5877, 0.3462, 0.6946, 0.6557, 0.4853]])
Prediction tensor([[0.1627, 0.0061, 0.0000, 0.0639, 0.5001, 0.0481, 0.0653, 0.0804, 0.0734]])
Prediction tensor([[ 0.7851,  0.4315,  0.3189, -0.1778,  0.5626,  0.1298,  0.2587,  0.3067,
          0.6802]])
Prediction tensor([[0.0000, 0.

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_48000/videos/self.model_name_1/48000/NFSP-TicTacToe-episode-4799.mp4
0.88
0.6


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_48500/videos/self.model_name_0/48500/NFSP-TicTacToe-episode-4824.mp4
-0.88
Trial  0
Prediction tensor([[0.4868, 0.2591, 0.6761, 0.3738, 0.6894, 0.3761, 0.6070, 0.6923, 0.5742]])
Prediction tensor([[0.1158, 0.0171, 0.1857, 0.0332, 0.4301, 0.0468, 0.1060, 0.0000, 0.0652]])
Prediction tensor([[ 0.2731, -0.1375,  0.1516,  0.2253,  0.3634,  0.2096,  0.2408,  0.4466,
          0.2645]])
Prediction tensor([[0.0000, 0.0824, 0.1346, 0.1211, 0.0000, 0.4220, 0.1297, 0.0000, 0.1102]])
Prediction tensor([[ 0.4448,  0.9967,  0.5500,  0.3630, -0.0078, -0.0351, -0.1434, -0.0692,
         -0.4471]])
Trial  1
Prediction tensor([[0.4868, 0.2591, 0.6761, 0.3738, 0.6894, 0.3761, 0.6070, 0.6923, 0.5742]])
Prediction tensor([[0.1158, 0.0171, 0.1857, 0.0332, 0.4301, 0.0468, 0.1060, 0.0000, 0.0652]])
Prediction tensor([[ 0.2696, -0.1421,  0.1462,  0.2178,  0.3580,  0.2036,  0.2457,  0.4521,
          0.2736]])
Prediction tensor([[0.1077, 0.

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_48500/videos/self.model_name_1/48500/NFSP-TicTacToe-episode-4849.mp4
0.28
-0.6


  axs[row][col].legend()


[{'exploitability': 0.64}, {'exploitability': 0.88}, {'exploitability': 0.040000000000000036}, {'exploitability': 0.12}, {'exploitability': 0.43999999999999995}, {'exploitability': 0.6}, {'exploitability': -0.12}, {'exploitability': -0.76}, {'exploitability': -0.32}, {'exploitability': -0.12}, {'exploitability': 0.24}, {'exploitability': -0.43999999999999995}, {'exploitability': 0.28}, {'exploitability': 0.48}, {'exploitability': -0.16}, {'exploitability': 0.24}, {'exploitability': 0.24}, {'exploitability': -0.56}, {'exploitability': 0.8}, {'exploitability': -0.24}, {'exploitability': -0.08000000000000007}, {'exploitability': -0.16000000000000003}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.08000000000000007}, {'exploitability': 0.32000000000000006}, {'exploitability': 1.0}, {'exploitability': -0.12}, {'exploitability': 0.32000000000000006}, {'exploitability': 0.24}, {'exploitability': 0.88}, {'exploitability': 0.24}, {'exploitability': 0.4}, {'exploitability': 0.679

In [None]:
from pathlib import Path
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import custom_gym_envs
import gymnasium as gym

# Run bookkeeping -- TODO: 8, 9, 10, 11, 12 / DONE: 14, 13
dir = "./checkpoints/Rainbow-TicTacToe"

# Configuration saved with that run.
config = NFSPDQNConfig.load(Path(dir) / "configs/config.yaml")

# Tic-Tac-Toe environment with on-screen ("human") rendering.
render_options = {"render_mode": "human", "encode_player_turn": True}
env = gym.make("custom_gym_envs/TicTacToe-v0", **render_options)

# Build the agent and restore checkpoint 20000 from the same directory.
agent = NFSPDQN(env, config, name="Rainbow-TicTacToe", device="cpu")
agent.load_from_checkpoint(dir, 20000)

In [None]:
# Interactive game: a human types moves for one seat, the loaded agent plays the other.
env = gym.make(
    "custom_gym_envs/TicTacToe-v0", render_mode="human", encode_player_turn=True
)
state, info = env.reset()
player = 0  # seat whose turn it is; toggles between 0 and 1 after every move
agent_player = 1  # seat controlled by the agent; the human controls the other one
done = False
agent.policies = ["best_response", "best_response"]
while not done:
    print(f"Player {player}")
    if player == agent_player:
        prediction = agent.predict(state, info)
        action = agent.select_actions(prediction, info).item()
    else:
        # Re-prompt on non-numeric input instead of letting int() raise ValueError
        # and abort the game in the middle of an episode.
        while True:
            try:
                action = int(input("Enter action: "))
                break
            except ValueError:
                print("Please enter an integer action index.")
    state, reward, terminated, truncated, info = env.step(action)
    print(action)
    player = (player + 1) % 2
    env.render()
    done = terminated or truncated

In [None]:
from pathlib import Path
import yaml
from utils import plot_comparisons
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs

# Runs to compare, in plotting order.
# Each entry: (checkpoint directory, checkpoint id to load, encode_player_turn flag for the env).
# Only the "-7" run uses an environment created with encode_player_turn=False.
CHECKPOINT_RUNS = [
    ("./checkpoints/NFSPDQN-LeducHoldem-7", 525000, False),
    ("./checkpoints/NFSPDQN-LeducHoldem-8", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-9", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-10", 275000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-11", 525000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-12", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-13", 85000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem", 685000, True),
]

stats_list = []  # rebuilt from scratch so re-running the cell does not duplicate entries
envs_by_turn_encoding = {}  # one environment per encode_player_turn value, created on first use

for checkpoint_dir, checkpoint_step, encode_player_turn in CHECKPOINT_RUNS:
    # Reuse the environment across runs that share the same encoding flag.
    if encode_player_turn not in envs_by_turn_encoding:
        envs_by_turn_encoding[encode_player_turn] = gym.make(
            "custom_gym_envs/LeducHoldem-v0", encode_player_turn=encode_player_turn
        )
    env = envs_by_turn_encoding[encode_player_turn]

    # Rebuild the agent from the run's saved config, restore its checkpoint and keep its stats.
    config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))
    agent = NFSPDQN(env, config, name="NFSPDQN-LeducHoldem", device="cpu")
    agent.load_from_checkpoint(checkpoint_dir, checkpoint_step)
    stats_list.append(agent.stats)

In [None]:
# Legend for the entries of stats_list, by index:
# 0: Default
# 1: Default Shared
# 2: PER + Shared
# 3: Dueling + PER + Shared
# 4: Distributional + Dueling + PER + Shared
# 5: Distributional + Dueling + PER + Shared + LR 0.05
# NOTE(review): the loading cell appends eight runs to stats_list, so indices 6 and 7
# have no label here -- TODO add them (and verify the labels above still match the run order).
plot_comparisons(stats_list, "NFSPDQN-LeducHoldem")

In [None]:
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs
from pathlib import Path

# Build two independent agents from the same checkpoint: the first becomes the
# test agent, the second the challenger. Each one gets its own fresh environment.
dir = "./checkpoints/NFSPDQN-LeducHoldem"
restored_agents = []
for _ in range(2):
    env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
    config = NFSPDQNConfig.load(Path(dir, "configs/config.yaml"))
    restored = NFSPDQN(env, config, name="NFSPDQN-LeducHoldem", device="cpu")
    restored.load_from_checkpoint(dir, 750000)
    restored_agents.append(restored)
test_agent, challenger_agent = restored_agents

# The test agent plays its average strategy; the challenger plays best response.
test_agent.policies = ["average_strategy"] * 2
challenger_agent.policies = ["best_response"] * 2

In [None]:
import copy


NUM_TRIALS = 5000  # episodes played per seating arrangement
ACTION_NAMES = {0: "call", 1: "raise", 2: "fold"}  # any other action index is printed as "check"


def play_trials(env, seat_agents, test_player, num_trials=NUM_TRIALS, verbose=True):
    """Play ``num_trials`` episodes and return the opponent's mean reward per episode.

    Args:
        env: environment providing ``reset()`` and ``step(action)``; ``step`` must
            return a per-player reward sequence as its second element.
        seat_agents: pair of agents. ``seat_agents[0]`` acts first in every episode
            and the two then take turns.
        test_player: index into the reward sequence that belongs to the agent under test.
        num_trials: number of episodes to play.
        verbose: when true, print the trial number, every prediction and every
            action name (this matches the output of the original cell).

    Returns:
        The sum over all steps of ``sum(reward) - reward[test_player]`` divided by
        ``num_trials``, i.e. everything the test player did not receive.
    """
    opponent_total = 0
    for trial in range(num_trials):
        if verbose:
            print("Trial ", trial)
        state, info = env.reset()
        done = False
        # NOTE(review): assumes the environment's turn order strictly alternates
        # starting with seat 0 in every episode -- TODO confirm for LeducHoldem.
        seat = 0
        while not done:
            acting_agent = seat_agents[seat]
            prediction = acting_agent.predict(state, info)
            action = acting_agent.select_actions(prediction, info).item()
            if verbose:
                print("Prediction", prediction)
                print(ACTION_NAMES.get(action, "check"))
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            opponent_total += sum(reward) - reward[test_player]
            seat = 1 - seat
    return opponent_total / num_trials


# Pass 1: the test agent sits in seat 0, the challenger in seat 1.
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
score = play_trials(env, (test_agent, challenger_agent), test_player=0)

# Pass 2: seats swapped, on a fresh environment.
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
score += play_trials(env, (challenger_agent, test_agent), test_player=1)

# Sum of the challenger's mean reward over both seatings.
print(score)

In [None]:
# Scripted walkthrough: player_1 (best-response policy) moves, then a hard-coded
# reply is stepped into the environment, and so on; every resulting state is printed.
player_1, player_2 = agent.nfsp_agents[0], agent.nfsp_agents[1]
player_1.policy = "best_response"
state, info = env.reset()
print(state)


def _best_response_turn(observation, step_info):
    """Let player_1 choose and play one action, echoing the action and the new state."""
    chosen_prediction = player_1.predict(observation, step_info)
    chosen_action = player_1.select_actions(chosen_prediction, step_info).item()
    step_result = tuple(env.step(chosen_action))
    print(chosen_action)
    print(step_result[0])
    return chosen_prediction, chosen_action, step_result


# Three rounds of "agent move, then scripted reply".
for scripted_reply in (4, 7, 3):
    prediction, action, step_result = _best_response_turn(state, info)
    state, reward, terminated, truncated, info = step_result
    state, reward, terminated, truncated, info = env.step(scripted_reply)
    print(state)
# Termination flag reported by the last scripted reply.
print(terminated)

# One more agent move after the final reply.
prediction, action, step_result = _best_response_turn(state, info)
state, reward, terminated, truncated, info = step_result

In [None]:
# Print the initial observation, take action 0, then print the new observation
# followed by the initial one again.
# NOTE(review): presumably a check for whether env.step mutates the earlier
# observation object in place -- confirm intent.
state, info = env.reset()
print(state)
step_result = env.step(0)
state_2, reward, terminated, truncated, info = step_result
print(state_2, state, sep="\n")

In [None]:
# Draw one sample from the first NFSP agent's RL replay buffer and show it.
first_rl_buffer = agent.nfsp_agents[0].rl_agent.replay_buffer
samples = first_rl_buffer.sample()
print(samples)

In [None]:
import torch

# Toy example: pick the best action per row while ignoring illegal moves.
q_values = torch.tensor([[1, 0, 0, 0.5, -1], [-1, 1, 1, 1, -1]])
legal_moves = [[0, 1, 3, 4], [2, 3, 4]]

# int8 mask with 1 at every legal (row, action) position and 0 elsewhere.
num_actions = q_values.shape[1]
mask = torch.tensor(
    [
        [1 if action_index in set(allowed) else 0 for action_index in range(num_actions)]
        for allowed in legal_moves
    ],
    dtype=torch.int8,
)
print(mask)

# Illegal entries become -inf in place so argmax can never select them.
q_values.masked_fill_(mask == 0, float("-inf"))
selected_actions = torch.argmax(q_values, dim=1)
print(q_values)