In [44]:
from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig, RainbowConfig
from game_configs import MississippiMarblesConfig, LeducHoldemConfig, TicTacToeConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for this NFSP-DQN run. They are written as three ordered
# groups and merged at the end; the merged key order is the same as a single
# flat dictionary listing general, un-prefixed, then "sl_"-prefixed keys.

# General training-loop settings.
general_settings = {
    "shared_networks_and_buffers": True,
    # The paper trains for roughly 2-5M steps; 1M was the plan for a first
    # test of whether Rainbow gets faster sooner. Kept short here.
    "training_steps": 150000,
    "num_players": 2,
    "anticipatory_param": 0.1,
    "replay_interval": 32,  # earlier value: 128
    "num_minibatches": 4,  # earlier values: 2, 4
}

# Un-prefixed learner/network settings.
# NOTE(review): presumably these configure the RL (best-response) network,
# while the "sl_" keys below configure the supervised one -- confirm against
# NFSPDQNConfig.
rl_settings = {
    "activation": "relu",
    "kernel_initializer": "he_normal",
    # Open question from earlier experiments: a distributional head may want a
    # lower rate (loss magnitudes are similar, but atom_size adds many more
    # weights). Rule of thumb noted at the time: DQN rate / 4, i.e. 0.1 for
    # DQN -> 0.025 for Rainbow; the initial Rainbow test used 0.1.
    "learning_rate": 0.0025,
    "weight_decay": 1e-9,
    "clipnorm": 1.0,
    "optimizer": Adam,  # alternative tried: SGD
    "loss_function": KLDivergenceLoss(),
    "per_alpha": 0.6,
    "per_beta": 0.5,
    "per_epsilon": 1e-3,
    "training_iterations": 1,
    "min_replay_buffer_size": 128,
    "minibatch_size": 128,
    "replay_buffer_size": 200000,
    "transfer_interval": 300,  # earlier value: 100
    "n_step": 3,
    "atom_size": 51,
    "residual_layers": [],
    # Alternative architecture tried: conv_layers [(64, 3, 1), (32, 2, 1)]
    # with empty dense / value / advantage widths.
    "conv_layers": [(64, 3, 1)],
    "dense_layer_widths": [],
    "dueling": True,
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.06,
    "eg_epsilon": 0.06,  # earlier value: 0
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 150000,
}

# "sl_"-prefixed learner/network settings.
sl_settings = {
    "sl_activation": "relu",
    "sl_kernel_initializer": "he_normal",
    "sl_learning_rate": 0.005,  # earlier value: 0.00005
    "sl_weight_decay": 1e-9,
    "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_training_iterations": 1,
    "sl_min_replay_buffer_size": 128,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    # Alternative tried: sl_conv_layers [(64, 3, 1), (32, 2, 1)].
    "sl_conv_layers": [(64, 3, 1)],
    "sl_dense_layer_widths": [],
}

config_dict = {**general_settings, **rl_settings, **sl_settings}

# Bind the hyperparameters to the Tic-Tac-Toe game definition.
# Variants kept for reference: MississippiMarblesConfig() as the game, and
# passing rl_config_type=RainbowConfig.
config = NFSPDQNConfig(
    game_config=TicTacToeConfig(),
    config_dict=config_dict,
)

# from nfsp_agent_clean import NFSPDQN
# from agent_configs import NFSPDQNConfig, RainbowConfig
# from game_configs import MississippiMarblesConfig, LeducHoldemConfig, TicTacToeConfig
# from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
# from torch.optim import Adam, SGD

# config_dict = {
#     "shared_networks_and_buffers": True,
#     "training_steps": 500000,  # like 2-5M in the paper (1M for initial test and see if rainbow is faster sooner)
#     "num_players": 2,
#     "anticipatory_param": 0.1,
#     "replay_interval": 128,  # 128
#     "num_minibatches": 4,  # 2 # 4
#     "activation": "relu",
#     "kernel_initializer": "he_normal",
#     "learning_rate": 0.1,  # maybe should be lower for distributional, loss magnitudes are similar but there are many more weights for distributions (from atom size)? testing alpha/2 and ... # DQN / 4 # 0.1 for DQN so 0.025 for Rainbow (initial test used 0.1 for rainbow)
#     "clipnorm": 1.0,
#     "optimizer": SGD,  # SGD
#     "loss_function": KLDivergenceLoss(),
#     "per_alpha": 0.6,
#     "per_beta": 0.5,
#     "per_epsilon": 1e-3,
#     "training_iterations": 1,  # 1
#     "min_replay_buffer_size": 128,
#     "minibatch_size": 128,
#     "replay_buffer_size": 50000,  # 200000
#     "transfer_interval": 300,  # 100
#     "n_step": 3,  ##########
#     "atom_size": 51,
#     # "conv_layers": [(64, 3, 1), (32, 2, 1)],
#     # "dense_layer_widths": [],
#     # "value_hidden_layer_widths": [],
#     # "advantage_hidden_layer_widths": [],
#     "residual_layers": [(128, 3, 1), (128, 2, 1)],
#     "conv_layers": [],
#     "dense_layer_widths": [],
#     "dueling": True,
#     "value_hidden_layer_widths": [],
#     "advantage_hidden_layer_widths": [],
#     "noisy_sigma": 0.12,  ##########
#     "eg_epsilon": 0.00,  # 0
#     "eg_epsilon_decay_type": "inverse_sqrt",
#     "eg_epsilon_decay_final_step": 0,
#     "sl_activation": "relu",
#     "sl_kernel_initializer": "he_normal",
#     "sl_learning_rate": 0.005,  # 0.00005
#     "sl_clipnorm": 1.0,
#     "sl_optimizer": SGD,  # SGD
#     "sl_loss_function": CategoricalCrossentropyLoss(),
#     "sl_training_iterations": 1,  # 1
#     "sl_min_replay_buffer_size": 128,
#     "sl_minibatch_size": 128,
#     "sl_replay_buffer_size": 1000000,  # 2000000
#     # "sl_conv_layers": [(64, 3, 1), (32, 2, 1)],
#     # "sl_dense_layer_widths": [],
#     "sl_residual_layers": [(128, 3, 1), (128, 2, 1)],
#     "sl_conv_layers": [],
#     "sl_dense_layer_widths": [],
# }
# # config = NFSPConfig(config_dict=config_dict, game_config=MississippiMarblesConfig(), rl_config_type=RainbowConfig)
# config = NFSPDQNConfig(
#     config_dict=config_dict,
#     game_config=TicTacToeConfig(),
# )
# # config = NFSPDQNConfig(config_dict=config_dict, game_config=TicTacToeConfig(), rl_config_type=RainbowConfig)

Using default save_intermediate_weights     : False
Using         training_steps                : 100000
Using default adam_epsilon                  : 1e-06
Using         learning_rate                 : 0.0025
Using         clipnorm                      : 1.0
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Using         weight_decay                  : 1e-09
Using         loss_function                 : <utils.utils.KLDivergenceLoss object at 0x2d065c550>
Using         activation                    : relu
Using         kernel_initializer            : he_normal
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000
Using         min_replay_buffer_size        : 128
Using         num_minibatches               : 4
Using         training_iterations           : 1
Using         num_players                   : 2
Using default save_intermediate_weights     : False
Using         training_steps                : 100000
Us

In [None]:
from pathlib import Path

# TODO: 8, 9, 10, 11, 12
# DONE: 14, 13
# The path is stored in `checkpoint_dir` rather than `dir`: assigning to `dir`
# at notebook level hides the built-in dir() for the rest of the kernel session.
# NOTE(review): this rebinds `config` to a saved Leduc Hold'em run's config,
# replacing any `config` built earlier in the session -- skip this cell if the
# Tic-Tac-Toe agent is meant to use the freshly built one.
checkpoint_dir = "./checkpoints/bad_test_func/NFSPDQN-LeducHoldem-8"
config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))

In [None]:
import custom_gym_envs
import gymnasium as gym
from gymnasium.wrappers import FrameStack

# Tic-Tac-Toe environment that renders to RGB arrays; encode_player_turn=True
# is forwarded to the custom environment.
# Other setups tried here: MississippiMarbles-v0 (render_mode="human",
# players=2), wrapping with FrameStack(env, 3), and TicTacToe-v0 with
# player_turn_as_plane=False.
tictactoe_kwargs = {"render_mode": "rgb_array", "encode_player_turn": True}
env = gym.make("custom_gym_envs/TicTacToe-v0", **tictactoe_kwargs)

# NFSP agent on CPU, driven by whichever `config` is currently bound.
agent = NFSPDQN(env, config, device="cpu", name="NFSP-TicTacToe")

In [None]:
# Restore the agent from the checkpoint saved under this directory
# (10000 is presumably the training step of the checkpoint -- confirm against
# NFSPDQN.load_from_checkpoint).
# `checkpoint_dir` is used instead of `dir` so the built-in dir() is not
# shadowed for the rest of the kernel session.
checkpoint_dir = "./checkpoints/NFSP-TicTacToe"
agent.load_from_checkpoint(checkpoint_dir, 10000)

In [43]:
# Set the checkpoint/evaluation cadence on the agent, then start training.
agent.checkpoint_trials = 100  # presumably evaluation episodes per checkpoint -- confirm in NFSPDQN
agent.checkpoint_interval = 1000  # recorded output shows checkpoints at step_27000, step_28000, ...
agent.config.save_intermediate_weights = True  # the config printout reports False as the default
agent.train()  # long-running; the recorded run was stopped with a KeyboardInterrupt

Trial  0
Prediction tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.9834, 0.0000, 0.0000, 0.0000, 0.0166]])
4
Prediction tensor([[-0.2835, -0.6217, -0.5024, -0.4616,    -inf, -0.4579, -0.5437, -0.3705,
         -0.4581]])
0
Prediction tensor([[0.0000, 0.4221, 0.1910, 0.0412, 0.0000, 0.2666, 0.0129, 0.0662, 0.0000]])
1
Prediction tensor([[   -inf,    -inf, -0.8528, -0.9804,    -inf, -0.8703, -0.9786, -0.3069,
         -0.9826]])
7
Prediction tensor([[0.0000, 0.0000, 0.2141, 0.3376, 0.0000, 0.1047, 0.2276, 0.0000, 0.1160]])
6
Prediction tensor([[   -inf,    -inf, -0.5139, -0.4467,    -inf, -0.5127,    -inf,    -inf,
         -0.4333]])
8
Prediction tensor([[0.0000, 0.0000, 0.3891, 0.5552, 0.0000, 0.0558, 0.0000, 0.0000, 0.0000]])
3
Prediction tensor([[  -inf,   -inf, 0.5548,   -inf,   -inf, 0.4767,   -inf,   -inf,   -inf]])
2
Prediction tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0.]])
5
Trial  1
Prediction tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.9834, 0.0000, 0.0000, 0.0000, 0.0166]])
4
Pred

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_27000/videos/self.model_name_0/27000/NFSP-TicTacToe-episode-5299.mp4
-0.48
Trial  0
Prediction tensor([[-0.2835, -0.6217, -0.5024, -0.4616,  0.5574, -0.4579, -0.5437, -0.3705,
         -0.4581]])
4
Prediction tensor([[0.5017, 0.0718, 0.1579, 0.1391, 0.0000, 0.0144, 0.0220, 0.0370, 0.0561]])
0
Prediction tensor([[   -inf,  0.7724,  0.6878,  0.7331,    -inf, -0.0392,  0.5627,  0.6854,
          0.0906]])
1
Prediction tensor([[0.0000, 0.0000, 0.0433, 0.3828, 0.0000, 0.0327, 0.2478, 0.2756, 0.0178]])
3
Prediction tensor([[  -inf,   -inf, 0.1476,   -inf,   -inf, 0.4859, 0.5511, 1.0000, 0.6482]])
7
Trial  1
Prediction tensor([[-0.2835, -0.6217, -0.5024, -0.4616,  0.5574, -0.4579, -0.5437, -0.3705,
         -0.4581]])
4
Prediction tensor([[0.5017, 0.0718, 0.1579, 0.1391, 0.0000, 0.0144, 0.0220, 0.0370, 0.0561]])
1
Prediction tensor([[ 0.7394,    -inf,  0.8039,  0.6392,    -inf,  0.9049,  0.1840, -0.3459,
          0.6379]]

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_27000/videos/self.model_name_1/27000/NFSP-TicTacToe-episode-5399.mp4
0.81
0.33000000000000007


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}]
Trial  0
Prediction tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.9831, 0.0000, 0.0000, 0.0000, 0.0169]])
4
Prediction tensor([[-0.35

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_28000/videos/self.model_name_0/28000/NFSP-TicTacToe-episode-5499.mp4
-0.2
Trial  0
Prediction tensor([[-0.3595, -0.7126, -0.6297, -0.5135,  0.4708, -0.7676, -0.6592, -0.6696,
         -0.5229]])
4
Prediction tensor([[0.4963, 0.0739, 0.1611, 0.1348, 0.0000, 0.0145, 0.0228, 0.0362, 0.0604]])
0
Prediction tensor([[   -inf,  0.7485,  0.6344,  0.7663,    -inf, -0.2889,  0.5163,  0.4710,
          0.3479]])
3
Prediction tensor([[0.0000, 0.3006, 0.1460, 0.0000, 0.0000, 0.0582, 0.3074, 0.1587, 0.0291]])
6
Prediction tensor([[  -inf, 0.3802, 0.1539,   -inf,   -inf, 0.9980,   -inf, 0.1883, 0.1075]])
5
Trial  1
Prediction tensor([[-0.3595, -0.7126, -0.6297, -0.5135,  0.4708, -0.7676, -0.6592, -0.6696,
         -0.5229]])
4
Prediction tensor([[0.4963, 0.0739, 0.1611, 0.1348, 0.0000, 0.0145, 0.0228, 0.0362, 0.0604]])
8
Prediction tensor([[0.4608, 0.7103, 0.8039, 0.7831,   -inf, 0.5966, 0.1698, 0.1451,   -inf]])
2
Prediction tens

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_28000/videos/self.model_name_1/28000/NFSP-TicTacToe-episode-5599.mp4
0.91
0.71


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}]
Trial  0
Prediction tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.9825, 0.0000, 0.0000, 0.0000, 0.0175]])
4

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_29000/videos/self.model_name_0/29000/NFSP-TicTacToe-episode-5699.mp4
-0.57
Trial  0
Prediction tensor([[-0.2633, -0.5386, -0.4563, -0.3445,  0.5629, -0.7431, -0.3601, -0.6447,
         -0.3340]])
4
Prediction tensor([[0.5086, 0.0658, 0.1545, 0.1400, 0.0000, 0.0130, 0.0212, 0.0352, 0.0617]])
1
Prediction tensor([[0.8916,   -inf, 0.9081, 0.5430,   -inf, 0.9046, 0.3473, 0.0325, 0.8124]])
2
Prediction tensor([[0.4072, 0.0000, 0.0000, 0.1134, 0.0000, 0.1141, 0.1010, 0.1115, 0.1528]])
0
Prediction tensor([[   -inf,    -inf,    -inf,  0.3780,    -inf,  0.3404,  1.0000, -0.4631,
          0.2174]])
6
Trial  1
Prediction tensor([[-0.2633, -0.5386, -0.4563, -0.3445,  0.5629, -0.7431, -0.3601, -0.6447,
         -0.3340]])
4
Prediction tensor([[0.5086, 0.0658, 0.1545, 0.1400, 0.0000, 0.0130, 0.0212, 0.0352, 0.0617]])
3
Prediction tensor([[ 0.2976,  0.6956,  0.6825,    -inf,    -inf, -0.3007,  0.6423,  0.2104,
          0.3226]]

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_29000/videos/self.model_name_1/29000/NFSP-TicTacToe-episode-5799.mp4
0.74
0.17000000000000004


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}]
Trial  0
Prediction tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_30000/videos/self.model_name_0/30000/NFSP-TicTacToe-episode-5899.mp4
-0.17
Trial  0
Prediction tensor([[-0.4371, -0.6451, -0.6138, -0.6003,  0.5418, -0.7040, -0.5135, -0.5568,
         -0.5594]])
4
Prediction tensor([[0.5203, 0.0611, 0.1406, 0.1458, 0.0000, 0.0123, 0.0256, 0.0342, 0.0601]])
8
Prediction tensor([[0.2170, 0.7988, 0.7894, 0.7474,   -inf, 0.3718, 0.6363, 0.5631,   -inf]])
1
Prediction tensor([[0.1188, 0.0000, 0.0915, 0.1040, 0.0000, 0.1134, 0.0908, 0.4816, 0.0000]])
7
Prediction tensor([[-0.0677,    -inf,  0.1420,  0.4877,    -inf,  0.1830,  0.4082,    -inf,
            -inf]])
3
Prediction tensor([[0.1117, 0.0000, 0.1783, 0.0000, 0.0000, 0.2743, 0.4357, 0.0000, 0.0000]])
0
Prediction tensor([[  -inf,   -inf, 0.3205,   -inf,   -inf, 0.8657, 0.7015,   -inf,   -inf]])
5
Trial  1
Prediction tensor([[-0.4371, -0.6451, -0.6138, -0.6003,  0.5418, -0.7040, -0.5135, -0.5568,
         -0.5594]])
4
Prediction ten

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_30000/videos/self.model_name_1/30000/NFSP-TicTacToe-episode-5999.mp4
0.75
0.58


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}]
Trial  0
Prediction tensor([[0.0000, 

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_31000/videos/self.model_name_0/31000/NFSP-TicTacToe-episode-6099.mp4
-0.21
Trial  0
Prediction tensor([[-0.4775, -0.7440, -0.7961, -0.7225,  0.5359, -0.7726, -0.5312, -0.7114,
         -0.5261]])
4
Prediction tensor([[0.5226, 0.0610, 0.1420, 0.1329, 0.0000, 0.0122, 0.0244, 0.0322, 0.0727]])
0
Prediction tensor([[   -inf,  0.5166,  0.2833,  0.5845,    -inf, -0.3944,  0.5969,  0.1930,
          0.4941]])
6
Prediction tensor([[0.0000, 0.1142, 0.0748, 0.6058, 0.0000, 0.0212, 0.0000, 0.1586, 0.0254]])
3
Prediction tensor([[  -inf, 0.1684, 0.4915,   -inf,   -inf, 0.0562,   -inf, 0.0880, 0.0718]])
2
Trial  1
Prediction tensor([[-0.4775, -0.7440, -0.7961, -0.7225,  0.5359, -0.7726, -0.5312, -0.7114,
         -0.5261]])
4
Prediction tensor([[0.5226, 0.0610, 0.1420, 0.1329, 0.0000, 0.0122, 0.0244, 0.0322, 0.0727]])
0
Prediction tensor([[   -inf,  0.5166,  0.2833,  0.5845,    -inf, -0.3944,  0.5969,  0.1930,
          0.4941]]

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_31000/videos/self.model_name_1/31000/NFSP-TicTacToe-episode-6199.mp4
0.61
0.4


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}, {'exploitability': 0.4}]
Trial  0
Pre

                                                           

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_32000/videos/self.model_name_0/32000/NFSP-TicTacToe-episode-6299.mp4
-0.36
Trial  0
Prediction tensor([[-0.4395, -0.7440, -0.7243, -0.6823,  0.5588, -0.7445, -0.5714, -0.5767,
         -0.5041]])
4
Prediction tensor([[0.5202, 0.0622, 0.1310, 0.1347, 0.0000, 0.0115, 0.0306, 0.0352, 0.0747]])
0
Prediction tensor([[   -inf,  0.6662,  0.6515,  0.7790,    -inf, -0.1898,  0.5966,  0.4311,
          0.3102]])
3
Prediction tensor([[0.0000, 0.2414, 0.1188, 0.0000, 0.0000, 0.0570, 0.3158, 0.2387, 0.0283]])
5
Prediction tensor([[  -inf, 0.4074, 0.2571,   -inf,   -inf,   -inf, 0.1458, 0.3084, 0.2528]])
1
Prediction tensor([[0.0000, 0.0000, 0.2485, 0.0000, 0.0000, 0.0000, 0.2830, 0.4014, 0.0672]])
2
Prediction tensor([[  -inf,   -inf,   -inf,   -inf,   -inf,   -inf, 0.1424, 1.0000, 0.4558]])
7
Trial  1
Prediction tensor([[-0.4395, -0.7440, -0.7243, -0.6823,  0.5588, -0.7445, -0.5714, -0.5767,
         -0.5041]])
4
Prediction ten

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_32000/videos/self.model_name_1/32000/NFSP-TicTacToe-episode-6399.mp4
0.92
0.56


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}, {'exploitability': 0.4}, {'exploitabi

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_33000/videos/self.model_name_0/33000/NFSP-TicTacToe-episode-6499.mp4
-0.99
Trial  0
Prediction tensor([[-0.4656, -0.5813, -0.6197, -0.6155,  0.4779, -0.6770, -0.6285, -0.4188,
         -0.4497]])
4
Prediction tensor([[0.5367, 0.0595, 0.1280, 0.1205, 0.0000, 0.0114, 0.0281, 0.0331, 0.0827]])
3
Prediction tensor([[0.2694, 0.6961, 0.7401,   -inf,   -inf, 0.2547, 0.5440, 0.4140, 0.4950]])
2
Prediction tensor([[0.8423, 0.0310, 0.0000, 0.0000, 0.0000, 0.0425, 0.0524, 0.0000, 0.0318]])
0
Prediction tensor([[   -inf, -0.3242,    -inf,    -inf,    -inf, -0.0800,  0.9950,  0.2436,
          0.0217]])
6
Trial  1
Prediction tensor([[-0.4656, -0.5813, -0.6197, -0.6155,  0.4779, -0.6770, -0.6285, -0.4188,
         -0.4497]])
4
Prediction tensor([[0.5367, 0.0595, 0.1280, 0.1205, 0.0000, 0.0114, 0.0281, 0.0331, 0.0827]])
0
Prediction tensor([[   -inf,  0.6766,  0.6933,  0.6563,    -inf, -0.1479,  0.5809,  0.5148,
          0.4687]]

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_33000/videos/self.model_name_1/33000/NFSP-TicTacToe-episode-6599.mp4
0.84
-0.15000000000000002


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}, {'exploitability': 0.4}, {'exploitabi

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_34000/videos/self.model_name_0/34000/NFSP-TicTacToe-episode-6699.mp4
-0.7
Trial  0
Prediction tensor([[-0.4472, -0.7095, -0.6726, -0.6014,  0.5027, -0.6575, -0.5476, -0.4609,
         -0.6865]])
4
Prediction tensor([[0.5298, 0.0544, 0.1253, 0.1257, 0.0000, 0.0118, 0.0319, 0.0333, 0.0878]])
8
Prediction tensor([[0.4816, 0.5111, 0.5048, 0.5833,   -inf, 0.3408, 0.6038, 0.4905,   -inf]])
6
Prediction tensor([[0.2478, 0.0503, 0.2145, 0.2175, 0.0000, 0.0809, 0.0000, 0.1891, 0.0000]])
3
Prediction tensor([[ 0.1108, -0.2151, -0.0929,    -inf,    -inf, -0.1053,    -inf, -0.2063,
            -inf]])
0
Prediction tensor([[0.0000, 0.0570, 0.2279, 0.0000, 0.0000, 0.5047, 0.0000, 0.2104, 0.0000]])
2
Prediction tensor([[   -inf, -0.1621,    -inf,    -inf,    -inf, -0.0508,    -inf, -0.1743,
            -inf]])
5
Prediction tensor([[0.0000, 0.4647, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5353, 0.0000]])
7
Prediction tensor([[  -i

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_34000/videos/self.model_name_1/34000/NFSP-TicTacToe-episode-6799.mp4
0.85
0.15000000000000002


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}, {'exploitability': 0.4}, {'exploitabi

                                                          

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_35000/videos/self.model_name_0/35000/NFSP-TicTacToe-episode-6899.mp4
-0.88
Trial  0
Prediction tensor([[-0.4786, -0.5965, -0.5138, -0.4811,  0.6315, -0.5028, -0.4408, -0.4832,
         -0.4021]])
4
Prediction tensor([[0.5296, 0.0523, 0.1177, 0.1283, 0.0000, 0.0121, 0.0359, 0.0343, 0.0898]])
0
Prediction tensor([[  -inf, 0.6882, 0.8243, 0.8454,   -inf, 0.0071, 0.6612, 0.6120, 0.5859]])
3
Prediction tensor([[0.0000, 0.1989, 0.1073, 0.0000, 0.0000, 0.0751, 0.2793, 0.3072, 0.0322]])
1
Prediction tensor([[  -inf,   -inf, 0.1906,   -inf,   -inf, 1.0000, 0.1320, 0.5963, 0.8244]])
5
Trial  1
Prediction tensor([[-0.4786, -0.5965, -0.5138, -0.4811,  0.6315, -0.5028, -0.4408, -0.4832,
         -0.4021]])
4
Prediction tensor([[0.5296, 0.0523, 0.1177, 0.1283, 0.0000, 0.0121, 0.0359, 0.0343, 0.0898]])
8
Prediction tensor([[0.5539, 0.7268, 0.8544, 0.8351,   -inf, 0.7696, 0.4430, 0.5823,   -inf]])
2
Prediction tensor([[0.2509, 0.06

                                                  

Moviepy - Done !
Moviepy - video ready checkpoints/NFSP-TicTacToe/step_35000/videos/self.model_name_1/35000/NFSP-TicTacToe-episode-6999.mp4
0.91
0.030000000000000027


  axs[row][col].legend()


[{'exploitability': 1.16}, {'exploitability': 1.29}, {'exploitability': 1.38}, {'exploitability': 0.52}, {'exploitability': 0.78}, {'exploitability': 0.47000000000000003}, {'exploitability': 0.79}, {'exploitability': 0.53}, {'exploitability': 0.9}, {'exploitability': 0.31999999999999995}, {'exploitability': 0.77}, {'exploitability': 0.05999999999999994}, {'exploitability': 0.37999999999999995}, {'exploitability': -0.07000000000000006}, {'exploitability': 1.22}, {'exploitability': 0.55}, {'exploitability': 1.03}, {'exploitability': 1.01}, {'exploitability': 0.47000000000000003}, {'exploitability': -0.03999999999999998}, {'exploitability': 0.48000000000000004}, {'exploitability': 0.46}, {'exploitability': 0.45999999999999996}, {'exploitability': -0.08000000000000002}, {'exploitability': 0.37}, {'exploitability': 0.69}, {'exploitability': 0.33000000000000007}, {'exploitability': 0.71}, {'exploitability': 0.17000000000000004}, {'exploitability': 0.58}, {'exploitability': 0.4}, {'exploitabi

KeyboardInterrupt: 

In [None]:
import numpy as np
import torch

def _random_argmax(row):
    """Return the index of the row's maximum, picking uniformly among ties."""
    tied_indices = np.where(row == row.max())[0]
    return np.random.choice(tied_indices)


# Toy score matrix: the first two rows have tied maxima, the last has one.
b = np.array([[1, 0, 1, 0, 1], [0, 1, 0, 1, 0], [1, 2, 3, 4, 5]])

# Per-row flag: does the maximum occur more than once?
is_row_max = b == b.max(axis=1, keepdims=True)
print(is_row_max.sum(axis=1) > 1)

# Variant 1: plain Python loop, one random draw per row.
for row in b:
    print(_random_argmax(row))

# Variant 2: the same draw applied along axis 1 by NumPy.
arr = np.apply_along_axis(_random_argmax, 1, b)
print(arr)

# Variant 3: per-row draws stacked into a torch tensor.
arr = torch.stack([torch.tensor(_random_argmax(row)) for row in b])
print(arr)

In [None]:
# Draw one batch from the first RL agent's replay buffer and unpack its fields.
replay_buffer = agent.rl_agents[0].replay_buffer
samples = replay_buffer.sample()
observations, actions, rewards, next_observations, dones = (
    samples[field]
    for field in (
        "observations",
        "actions",
        "rewards",
        "next_observations",
        "dones",
    )
)
# Uncomment to look at the raw batch:
# print(samples)
# print(observations[0])
# print(next_observations[0])

# Show the n-step buffers: the whole collection, then entries 0 and 1.
n_step_buffers = replay_buffer.n_step_buffers
n_step_buffer = n_step_buffers[1]
print(n_step_buffers)
print(n_step_buffers[0])
print(n_step_buffers[1])

# Show the n-step info for player 1 first, then player 0.
for player_index in (1, 0):
    n_step_info = replay_buffer._get_n_step_info(player=player_index)
    print(n_step_info)

In [None]:
from pathlib import Path
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import custom_gym_envs
import gymnasium as gym

# TODO: 8, 9, 10, 11, 12
# DONE: 14, 13
# Rebuild the Tic-Tac-Toe agent from its saved config and restore a checkpoint.
# The directory lives in `checkpoint_dir` rather than `dir`, so the built-in
# dir() is not shadowed for the rest of the kernel session.
checkpoint_dir = "./checkpoints/NFSP-TicTacToe"
config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))

# render_mode="human" so the board can be watched while playing.
env = gym.make(
    "custom_gym_envs/TicTacToe-v0", render_mode="human", encode_player_turn=True
)
# env = FrameStack(env, 3)

agent = NFSPDQN(env, config, name="NFSP-TicTacToe", device="cpu")

# 30000 is presumably the training step of the checkpoint to restore.
agent.load_from_checkpoint(checkpoint_dir, 30000)

In [None]:
from utils.utils import action_mask


def _describe_prediction(nfsp_agent, prediction, info, policy):
    """Return the value to print for the agent's prediction.

    For the "best_response" policy the prediction is multiplied by the first RL
    agent's support, summed over the last axis, and passed through action_mask
    with the current legal moves and a mask value of -inf. For any other policy
    the prediction is returned unchanged.
    """
    if policy != "best_response":
        return prediction
    return action_mask(
        (prediction * nfsp_agent.rl_agents[0].support).sum(-1, keepdim=False),
        [info["legal_moves"]],
        mask_value=float("-inf"),
    )


def _read_human_action(prompt="Enter action: "):
    """Prompt until the user types an integer, then return it.

    A non-integer entry re-prompts instead of raising ValueError and aborting
    the game mid-episode.
    """
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print(f"{raw!r} is not an integer action, try again.")


# Interactive game: the human moves on even turns (player 0), the agent on odd
# turns (player 1), both agent policies set to "best_response".
env = gym.make(
    "custom_gym_envs/TicTacToe-v0", render_mode="human", encode_player_turn=True
)
# env = FrameStack(env, 3)
try:
    state, info = env.reset()
    player = 0
    agent_player = 1
    done = False
    agent.policies = ["best_response", "best_response"]
    while not done:
        print(f"Player {player}")
        if player % 2 == agent_player:
            prediction = agent.predict(state, info)
            print(
                "Prediction",
                _describe_prediction(
                    agent, prediction, info, agent.policies[agent_player]
                ),
            )
            action = agent.select_actions(prediction, info).item()
        else:
            action = _read_human_action()
        state, reward, terminated, truncated, info = env.step(action)
        print(action)
        player = (player + 1) % 2
        env.render()
        done = terminated or truncated
finally:
    # Release the render window even if the game is interrupted or a step
    # fails; otherwise every rerun of this cell leaves another env open.
    env.close()

In [None]:
# Draw one batch from the first RL agent's replay buffer; as the last
# expression of the cell, the batch is shown as the cell output.
first_rl_agent = agent.rl_agents[0]
first_rl_agent.replay_buffer.sample()

In [None]:
import numpy as np

# Scratch check of boolean-mask assignment along the first axis: entries of
# `test` whose `dones` flag is False are overwritten with the matching entries
# of `probs`; entries flagged True keep their original values.
test = np.arange(1, 13).reshape(2, 2, 3)
probs = 10 * np.arange(1, 13).reshape(2, 2, 3)
dones = np.array([False, True])
not_done = np.logical_not(dones)
test[not_done] = probs[not_done]
print(test)

In [None]:
from pathlib import Path
import yaml
from utils import plot_comparisons
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs

# Runs to compare, in plotting order:
# (checkpoint directory, step passed to load_from_checkpoint, encode_player_turn of the env).
checkpoint_runs = [
    ("./checkpoints/NFSPDQN-LeducHoldem-7", 525000, False),
    ("./checkpoints/NFSPDQN-LeducHoldem-8", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-9", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-10", 275000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-11", 525000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-12", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-13", 85000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem", 685000, True),
]

stats_list = []

# One environment per encode_player_turn setting, created on first use and shared
# by every run that needs it (the first run uses False, all later ones True).
envs_by_encoding = {}

# NOTE: `dir` shadows the builtin; the name is kept because other cells read it.
# After the loop `env`, `dir`, `config` and `agent` refer to the last run.
for dir, checkpoint_step, encode_player_turn in checkpoint_runs:
    if encode_player_turn not in envs_by_encoding:
        envs_by_encoding[encode_player_turn] = gym.make(
            "custom_gym_envs/LeducHoldem-v0", encode_player_turn=encode_player_turn
        )
    env = envs_by_encoding[encode_player_turn]

    config = NFSPDQNConfig.load(Path(dir, "configs/config.yaml"))
    agent = NFSPDQN(env, config, name="NFSPDQN-LeducHoldem", device="cpu")
    agent.load_from_checkpoint(dir, checkpoint_step)
    stats_list.append(agent.stats)

In [None]:
# Legend for the entries of stats_list, by index:
# 0: Default
# 1: Default Shared
# 2: PER + Shared
# 3: Dueling + PER + Shared
# 4: Distributional + Dueling + PER + Shared
# 5: Distributional + Dueling + PER + Shared + LR 0.05
# NOTE(review): the cell that builds stats_list appends eight entries, but only
# indices 0-5 are described here - confirm what indices 6 and 7 correspond to.
plot_comparisons(stats_list, "NFSPDQN-LeducHoldem")

In [None]:
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs
from pathlib import Path

# Build two independent agents from the same checkpoint directory and step, each
# on its own environment instance: the first becomes the test agent, the second
# the challenger.
dir = "./checkpoints/NFSPDQN-LeducHoldem"
loaded_agents = []
for role in ("test", "challenger"):
    env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
    config = NFSPDQNConfig.load(Path(dir, "configs/config.yaml"))
    loaded = NFSPDQN(env, config, name="NFSPDQN-LeducHoldem", device="cpu")
    loaded.load_from_checkpoint(dir, 750000)
    loaded_agents.append(loaded)
test_agent, challenger_agent = loaded_agents

# The test agent uses its average strategy in both seats, the challenger its
# best response in both seats.
test_agent.policies = ["average_strategy"] * 2
challenger_agent.policies = ["best_response"] * 2

In [None]:
import copy


# Number of games played per seating; also the divisor for the per-game average.
NUM_TRIALS = 5000

# Labels printed for each action id; any other id is printed as "check".
ACTION_NAMES = {0: "call", 1: "raise", 2: "fold"}


def play_trials(env, seat_agents, test_player, num_trials):
    """Play ``num_trials`` games and return the summed reward of everyone but ``test_player``.

    Args:
        env: environment to play in; it is reset at the start of every trial.
        seat_agents: pair ``(agent for seat 0, agent for seat 1)``; each must
            offer ``predict(state, info)`` and ``select_actions(prediction, info)``.
        test_player: index into the per-step reward sequence of the player whose
            reward is excluded from the total.
        num_trials: number of games to play.

    Returns:
        The sum over all steps of ``sum(reward) - reward[test_player]``.

    NOTE(review): the inner loop lets seat 0 and seat 1 act in strict alternation,
    starting with seat 0 on every pass - confirm this matches the env's turn order.
    """
    opponent_total = 0
    for trial in range(num_trials):
        print("Trial ", trial)
        state, info = env.reset()
        done = False
        while not done:
            for acting_agent in seat_agents:
                prediction = acting_agent.predict(state, info)
                action = acting_agent.select_actions(prediction, info).item()
                print("Prediction", prediction)
                print(ACTION_NAMES.get(action, "check"))
                state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                # Accumulated on every step, not only on the terminal one.
                opponent_total += sum(reward) - reward[test_player]
                if done:
                    break
    return opponent_total


# Seating 1: the test agent acts in seat 0, the challenger in seat 1.
test_player = 0
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
test_score = play_trials(env, (test_agent, challenger_agent), test_player, NUM_TRIALS)
score = test_score / NUM_TRIALS

# Seating 2: roles swapped, the challenger acts in seat 0.
test_player = 1
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
test_score = play_trials(env, (challenger_agent, test_agent), test_player, NUM_TRIALS)
score += test_score / NUM_TRIALS

# Sum of the two per-game averages of the non-test player's reward.
print(score)

In [None]:
# Step through one game by hand: player_1 picks its moves with the best-response
# policy, and the replies in between are hard-coded action ids.
# Relies on `agent` and `env` left over from whichever earlier cell ran last.
# NOTE(review): this cell uses `agent.nfsp_agents` and a per-agent `.policy`,
# while other cells use `agent.rl_agents` / `agent.policies` - confirm it still
# matches the current NFSPDQN API.
player_1 = agent.nfsp_agents[0]
player_2 = agent.nfsp_agents[1]  # assigned but not used below
player_1.policy = "best_response"
state, info = env.reset()
print(state)
# Agent move 1.
prediction = player_1.predict(state, info)
action = player_1.select_actions(prediction, info).item()
state, reward, terminated, truncated, info = env.step(action)
print(action)
print(state)


# Scripted reply: action 4.
# NOTE(review): nothing checks that this action is legal in the current state.
state, reward, terminated, truncated, info = env.step(4)
print(state)


# Agent move 2.
prediction = player_1.predict(state, info)
action = player_1.select_actions(prediction, info).item()
state, reward, terminated, truncated, info = env.step(action)
print(action)
print(state)


# Scripted reply: action 7.
state, reward, terminated, truncated, info = env.step(7)
print(state)


# Agent move 3.
prediction = player_1.predict(state, info)
action = player_1.select_actions(prediction, info).item()
state, reward, terminated, truncated, info = env.step(action)
print(action)
print(state)


# Scripted reply: action 3; also print whether the episode has terminated.
state, reward, terminated, truncated, info = env.step(3)
print(state)
print(terminated)


# Agent move 4. NOTE(review): this runs even if `terminated` printed above is True.
prediction = player_1.predict(state, info)
action = player_1.select_actions(prediction, info).item()
state, reward, terminated, truncated, info = env.step(action)
print(action)
print(state)

In [None]:
# Print the reset observation, take action 0, then print the new observation
# followed by the reset observation again, so the two can be compared after the step.
state, info = env.reset()
print(state)
state_2, reward, terminated, truncated, info = env.step(0)
for observation in (state_2, state):
    print(observation)

In [None]:
# Draw one sample from the replay buffer of the first NFSP agent's RL agent and print it.
first_rl_agent = agent.nfsp_agents[0].rl_agent
samples = first_rl_agent.replay_buffer.sample()
print(samples)

In [None]:
import torch

# Scratch check of legal-move masking: build a 0/1 mask from per-row lists of
# legal action indices, set every illegal entry of q_values to -inf in place,
# and take the row-wise argmax over what remains.
q_values = torch.tensor([[1, 0, 0, 0.5, -1], [-1, 1, 1, 1, -1]])
legal_moves = [[0, 1, 3, 4], [2, 3, 4]]

mask = torch.zeros_like(q_values, dtype=torch.int8)
for mask_row, allowed in zip(mask, legal_moves):
    # mask_row is a view into mask, so this writes through to mask itself.
    mask_row[allowed] = 1
print(mask)

q_values.masked_fill_(mask == 0, float("-inf"))
selected_actions = torch.argmax(q_values, dim=1)
print(q_values)