In [1]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import random

In [2]:
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)

In [3]:
# TODO: try shared networks with separate (non-shared) replay buffers?
# TODO: compare num_minibatches = 1 vs. 2

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig, RainbowConfig
from game_configs import MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP run on Matching Pennies.  Built with dict(...)
# keyword syntax; the keys appear in the same order as before.  Un-prefixed keys
# and "sl_"-prefixed keys mirror each other (learning rate, optimizer, loss,
# buffer sizes, layer widths) -- presumably one set per learner; confirm
# against NFSPDQNConfig.
config_dict = dict(
    # --- run-level settings ---
    shared_networks_and_buffers=False,
    training_steps=1000,
    anticipatory_param=0.1,
    replay_interval=128,
    # 1 or 2: could mean 2 minibatches per network, or 2 minibatches in total
    # (1 for each network/player).
    num_minibatches=2,
    # --- un-prefixed learner: optimizer, loss, replay buffer ---
    learning_rate=0.1,
    momentum=0.0,
    optimizer=SGD,
    loss_function=HuberLoss(),
    min_replay_buffer_size=500,
    minibatch_size=128,
    replay_buffer_size=1000,
    transfer_interval=300,
    # --- un-prefixed learner: network layout (single dense layer of 128) ---
    residual_layers=[],
    conv_layers=[],
    dense_layer_widths=[128],
    value_hidden_layer_widths=[],
    advantage_hidden_layer_widths=[],
    noisy_sigma=0.0,
    # --- epsilon-greedy settings ("eg_epsilon_final" intentionally not set) ---
    eg_epsilon=0.06,
    eg_epsilon_decay_type="inverse_sqrt",
    eg_epsilon_decay_final_step=0,
    # --- "sl_"-prefixed learner ("sl_weight_decay" intentionally not set) ---
    sl_learning_rate=0.005,
    sl_momentum=0.0,
    sl_optimizer=SGD,
    sl_loss_function=CategoricalCrossentropyLoss(),
    sl_min_replay_buffer_size=500,
    sl_minibatch_size=128,
    sl_replay_buffer_size=20000,
    sl_residual_layers=[],
    sl_conv_layers=[],
    sl_dense_layer_widths=[128],
    sl_clip_low_prob=0.0,
    # --- remaining options: "per_*" values, n-step, atoms, dueling ---
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    per_epsilon=0.00001,
    n_step=1,
    atom_size=1,
    dueling=False,
    # --- gradient clipping for both learners ---
    clipnorm=10.0,
    sl_clipnorm=10.0,
)

config = NFSPDQNConfig(game_config=MatchingPenniesConfig(), config_dict=config_dict)

# Overridden after construction: the constructor itself reports using the
# default (False) for this attribute.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 1000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.HuberLoss object at 0x16bd3a860>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 1000
Using         min_replay_buffer_size        : 500
Using         num_minibatches               : 2
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using      

In [4]:
import custom_gym_envs
import gymnasium as gym
from gymnasium.wrappers import FrameStack

# env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=False)

# Seed the global RNGs *before* the agent is constructed, so that anything the
# constructor or the later training loop draws from them is repeatable across
# "Restart Kernel -> Run All".
# NOTE(review): this only helps as far as the project code uses the global
# `random` / NumPy / torch generators -- confirm against NFSPDQN.
# Imported here so the cell does not depend on names from earlier cells.
import random

import numpy as np
import torch

SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Matching Pennies environment; max_cycles=1 and the "human" render mode are
# passed straight through to the project's env factory.
env = matching_pennies_env(render_mode="human", max_cycles=1)

# NFSP agent on CPU, configured by the `config` object built in the previous cell.
agent = NFSPDQN(env, config, name="NFSP-MatchingPennies", device="cpu")

making test env
Test env: matching_pennies_v0
<class 'method'>
petting zoo
Observation dimensions: (6,)
Observation dtype: float32
num_actions:  2
making test env
Test env: matching_pennies_v0
<class 'method'>
petting zoo
Observation dimensions: (6,)
Observation dtype: float32
num_actions:  2
float32
Max size: 1000
making test env
Test env: matching_pennies_v0
<class 'method'>
petting zoo
Observation dimensions: (6,)
Observation dtype: float32
num_actions:  2
float32
Max size: 1000
making test env
Test env: matching_pennies_v0
<class 'method'>
petting zoo
Observation dimensions: (6,)
Observation dtype: float32
num_actions:  2
Max size: 20000
(20000, 6)
making test env
Test env: matching_pennies_v0
<class 'method'>
petting zoo
Observation dimensions: (6,)
Observation dtype: float32
num_actions:  2
Max size: 20000
(20000, 6)


In [5]:
# Checkpoint settings, assigned left to right (interval first, then trials).
# With an interval of 100 the recorded run reports statistics roughly every 100
# of its 1000 steps; `checkpoint_trials` is presumably the number of evaluation
# games per checkpoint -- confirm against NFSPDQN.
agent.checkpoint_interval, agent.checkpoint_trials = 100, 10000

# Run the training loop (progress bar, buffer statistics and plots are emitted
# by the agent itself).
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/1000 [00:00<00:22, 43.59it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 64/1000
   Player 0 SL buffer: 5/20000
   Player 1 RL buffer: 64/1000
   Player 1 SL buffer: 5/20000


 10%|▉         | 96/1000 [00:02<00:21, 42.72it/s]

P1 SL Buffer Size:  645
P1 SL buffer distribution [179. 466.]
P1 actions distribution [0.27751938 0.72248062]
P2 SL Buffer Size:  628
P2 SL buffer distribution [442. 186.]
P2 actions distribution [0.70382166 0.29617834]
average score: 0.0
Player 0 Prediction: tensor([[0.3419, 0.6581]])
average score: -0.235
Player 1 Prediction: tensor([[0.6145, 0.3855]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}]
Plotting test_score_vs_random...


  axs[row][col].legend()
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].legend()
 20%|█▉        | 198/1000 [00:10<00:21, 37.87it/s]

P1 SL Buffer Size:  1306
P1 SL buffer distribution [839. 467.]
P1 actions distribution [0.6424196 0.3575804]
P2 SL Buffer Size:  1265
P2 SL buffer distribution [841. 424.]
P2 actions distribution [0.66482213 0.33517787]
average score: 0.0
Player 0 Prediction: tensor([[0.5887, 0.4113]])
average score: -0.4746
Player 1 Prediction: tensor([[0.7315, 0.2685]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}]
Plotting test_score_vs_random...


 30%|██▉       | 299/1000 [00:18<00:18, 37.93it/s]

P1 SL Buffer Size:  1955
P1 SL buffer distribution [1252.  703.]
P1 actions distribution [0.64040921 0.35959079]
P2 SL Buffer Size:  1905
P2 SL buffer distribution [ 842. 1063.]
P2 actions distribution [0.44199475 0.55800525]
average score: 0.0
Player 0 Prediction: tensor([[0.6751, 0.3249]])
average score: -0.0664
Player 1 Prediction: tensor([[0.4652, 0.5348]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}]
Plotting test_score_vs_random...


 40%|███▉      | 397/1000 [00:26<00:17, 35.11it/s]

P1 SL Buffer Size:  2585
P1 SL buffer distribution [1253. 1332.]
P1 actions distribution [0.48471954 0.51528046]
P2 SL Buffer Size:  2555
P2 SL buffer distribution [ 968. 1587.]
P2 actions distribution [0.37886497 0.62113503]
average score: 0.0
Player 0 Prediction: tensor([[0.5088, 0.4912]])
average score: -0.2624
Player 1 Prediction: tensor([[0.3673, 0.6327]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}]
Plotting test_score_vs_random...


 50%|█████     | 500/1000 [00:35<00:14, 33.69it/s]

P1 SL Buffer Size:  3255
P1 SL buffer distribution [1365. 1890.]
P1 actions distribution [0.41935484 0.58064516]
P2 SL Buffer Size:  3207
P2 SL buffer distribution [1620. 1587.]
P2 actions distribution [0.505145 0.494855]
average score: 0.0
Player 0 Prediction: tensor([[0.4178, 0.5822]])
average score: 0.0274
Player 1 Prediction: tensor([[0.4795, 0.5205]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}]
Plotting test_score_vs_random...


 60%|██████    | 600/1000 [00:44<00:12, 32.29it/s]

P1 SL Buffer Size:  3906
P1 SL buffer distribution [1948. 1958.]
P1 actions distribution [0.49871992 0.50128008]
P2 SL Buffer Size:  3841
P2 SL buffer distribution [2036. 1805.]
P2 actions distribution [0.53007029 0.46992971]
average score: 0.0
Player 0 Prediction: tensor([[0.4893, 0.5107]])
average score: -0.088
Player 1 Prediction: tensor([[0.5386, 0.4614]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}, {'exploitability': 0.044}]
Plotting test_score_vs_random...


 70%|██████▉   | 697/1000 [00:52<00:09, 32.31it/s]

P1 SL Buffer Size:  4501
P1 SL buffer distribution [2280. 2221.]
P1 actions distribution [0.5065541 0.4934459]
P2 SL Buffer Size:  4443
P2 SL buffer distribution [2264. 2179.]
P2 actions distribution [0.50956561 0.49043439]
average score: 0.0
Player 0 Prediction: tensor([[0.5015, 0.4985]])
average score: -0.0136
Player 1 Prediction: tensor([[0.5109, 0.4891]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}, {'exploitability': 0.044}, {'exploitability': 0.0068}]
Plotting test_score_vs_random...


 80%|████████  | 800/1000 [01:01<00:05, 35.21it/s]

P1 SL Buffer Size:  5182
P1 SL buffer distribution [2622. 2560.]
P1 actions distribution [0.50598225 0.49401775]
P2 SL Buffer Size:  5080
P2 SL buffer distribution [2549. 2531.]
P2 actions distribution [0.50177165 0.49822835]
average score: 0.0
Player 0 Prediction: tensor([[0.5035, 0.4965]])
average score: -0.0178
Player 1 Prediction: tensor([[0.5073, 0.4927]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}, {'exploitability': 0.044}, {'exploitability': 0.0068}, {'exploitability': 0.0089}]
Plotting test_score_vs_random...


 90%|█████████ | 900/1000 [01:09<00:03, 31.64it/s]

P1 SL Buffer Size:  5792
P1 SL buffer distribution [2892. 2900.]
P1 actions distribution [0.49930939 0.50069061]
P2 SL Buffer Size:  5746
P2 SL buffer distribution [2902. 2844.]
P2 actions distribution [0.50504699 0.49495301]
average score: 0.0
Player 0 Prediction: tensor([[0.4962, 0.5038]])
average score: 0.0344
Player 1 Prediction: tensor([[0.5139, 0.4861]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}, {'exploitability': 0.044}, {'exploitability': 0.0068}, {'exploitability': 0.0089}, {'exploitability': -0.0172}]
Plotting test_score_vs_random...


100%|██████████| 1000/1000 [01:18<00:00, 12.80it/s]


average score: 0.0
Player 0 Prediction: tensor([[0.5032, 0.4968]])
average score: -0.0246
Player 1 Prediction: tensor([[0.5016, 0.4984]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.1175}, {'exploitability': 0.2373}, {'exploitability': 0.0332}, {'exploitability': 0.1312}, {'exploitability': -0.0137}, {'exploitability': 0.044}, {'exploitability': 0.0068}, {'exploitability': 0.0089}, {'exploitability': -0.0172}, {'exploitability': 0.0123}]
Plotting test_score_vs_random...


In [None]:
# TODO: try shared networks with separate (non-shared) replay buffers?
# TODO: compare num_minibatches = 1 vs. 2

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP run on Leduc Hold'em.  Un-prefixed keys and
# "sl_"-prefixed keys mirror each other (learning rate, optimizer, loss, buffer
# sizes, layer widths) -- presumably one set per learner; confirm against
# NFSPDQNConfig.
config_dict = {
    # --- run-level settings ---
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    # --- un-prefixed learner: optimizer, loss, replay buffer ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Integer literal on purpose: the previous `2e5` is a float, and the config
    # echoed it back as 200000.0.  A buffer capacity is a count, like every
    # other size in this dict.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    # --- un-prefixed learner: network layout (single dense layer of 128) ---
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    # --- epsilon-greedy settings ---
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- "sl_"-prefixed learner ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- remaining options: "per_*" values, n-step, atoms, dueling ---
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    # --- gradient clipping for both learners ---
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Set explicitly after construction (False is also the reported default).
config.save_intermediate_weights = False

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x104ea1ea0>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [2]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Seed the global RNGs *before* the agent is constructed, so that anything the
# constructor or the later training loop draws from them is repeatable across
# "Restart Kernel -> Run All".
# NOTE(review): this only helps as far as the project code uses the global
# `random` / NumPy / torch generators -- confirm against NFSPDQN.
# Imported here so the cell does not depend on names from earlier cells.
import random

import numpy as np
import torch

SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# PettingZoo Leduc Hold'em environment.
env = leduc_holdem_v4.env()
# Alternative environment (Matching Pennies), kept for quick switching:
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Show what player_0 observes; in the recorded run this is a Dict space with
# 'action_mask' and 'observation' entries.
print(env.observation_space("player_0"))

# NFSP agent on CPU, configured by the `config` object built in the previous cell.
agent = NFSPDQN(env, config, name="NFSP-LeducHoldem-Standard", device="cpu")

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
Test env: leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
Test env: leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
Test env: leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
Test env: leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
Test env: leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [None]:
# Checkpoint settings, assigned left to right (interval first, then trials).
# With an interval of 2000 the recorded run prints its first evaluation
# statistics at step 2000; `checkpoint_trials` is presumably the number of
# evaluation games per checkpoint -- confirm against NFSPDQN.
agent.checkpoint_interval, agent.checkpoint_trials = 2000, 10000

# Run the training loop.  The recorded run was stopped by hand
# (KeyboardInterrupt) at about 4% of the 50000 steps.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 2/50000 [00:00<43:03, 19.35it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 60/200000
   Player 0 SL buffer: 11/2000000
   Player 1 RL buffer: 68/200000
   Player 1 SL buffer: 8/2000000


  2%|▏         | 1003/50000 [01:36<1:08:23, 11.94it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 64282/200000
   Player 0 SL buffer: 7059/2000000
   Player 1 RL buffer: 63844/200000
   Player 1 SL buffer: 6758/2000000


  4%|▍         | 2000/50000 [03:03<1:34:32,  8.46it/s]

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 128144/200000
   Player 0 SL buffer: 14000/2000000
   Player 1 RL buffer: 127982/200000
   Player 1 SL buffer: 13611/2000000
P1 SL Buffer Size:  14000
P1 SL buffer distribution [4229. 7531.  680. 1560.]
P1 actions distribution [0.30207143 0.53792857 0.04857143 0.11142857]
P2 SL Buffer Size:  13611
P2 SL buffer distribution [4520. 7048.  772. 1271.]
P2 actions distribution [0.33208434 0.51781647 0.05671883 0.09338035]
average score: -1.0087
Player 0 Prediction: tensor([[0.0000, 0.7928, 0.0460, 0.1612]])
Player 0 Prediction: tensor([[0.0000, 0.7660, 0.0612, 0.1728]])
average score: -0.97385
Player 1 Prediction: tensor([[0.0000, 0.8649, 0.0416, 0.0935]])
Player 1 Prediction: tensor([[0.0000, 0.7908, 0.0686, 0.1406]])
Plotting rl_loss...
Plotting sl_loss...
Plotting exploitability...
[{'exploitability': 0.9912749999999999}]
Plotting test_score_vs_random...


  axs[row][col].legend()
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].legend()
  4%|▍         | 2224/50000 [04:30<1:36:45,  8.23it/s]  


KeyboardInterrupt: 

: 