In [None]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import random

In [None]:
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)

In [None]:
# Open question: share the networks between players but keep separate replay buffers?
# Open question: train with 1 or 2 minibatches per update?

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig, RainbowConfig
from game_configs import MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP agent on Matching Pennies.
# Written as keyword arguments; the resulting dict has the same keys, values,
# and insertion order as the equivalent literal.
config_dict = dict(
    # --- run-level settings ---
    shared_networks_and_buffers=False,
    training_steps=10000,
    anticipatory_param=0.1,
    replay_interval=128,
    # Ambiguous: 2 minibatches per network, or 2 in total (1 for each network/player).
    num_minibatches=2,
    # --- settings without the "sl_" prefix (presumably the RL / DQN learner) ---
    learning_rate=0.1,
    momentum=0.0,
    optimizer=SGD,
    loss_function=HuberLoss(),
    min_replay_buffer_size=500,
    minibatch_size=128,
    replay_buffer_size=1000,
    transfer_interval=300,
    residual_layers=[],
    conv_layers=[],
    dense_layer_widths=[128],
    value_hidden_layer_widths=[],
    advantage_hidden_layer_widths=[],
    noisy_sigma=0.0,
    # --- epsilon-greedy exploration ---
    eg_epsilon=0.06,
    # eg_epsilon_final=0.06,
    eg_epsilon_decay_type="inverse_sqrt",
    eg_epsilon_decay_final_step=0,
    # --- "sl_"-prefixed settings (presumably the supervised / average-strategy learner) ---
    sl_learning_rate=0.005,
    sl_momentum=0.0,
    # sl_weight_decay=1e-9,
    # sl_clipnorm=1.0,
    sl_optimizer=SGD,
    sl_loss_function=CategoricalCrossentropyLoss(),
    sl_min_replay_buffer_size=500,
    sl_minibatch_size=128,
    sl_replay_buffer_size=20000,
    sl_residual_layers=[],
    sl_conv_layers=[],
    sl_dense_layer_widths=[128],
    sl_clip_low_prob=0.0,
    # --- "per_" settings (all alpha/beta values are zero here) ---
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    per_epsilon=0.00001,
    # --- remaining DQN options ---
    n_step=1,
    atom_size=1,
    dueling=False,
    # --- gradient clipping for both learners ---
    clipnorm=10.0,
    sl_clipnorm=10.0,
)

# Combine the hyperparameters with the Matching Pennies game settings.
config = NFSPDQNConfig(game_config=MatchingPenniesConfig(), config_dict=config_dict)

# Set on the built config object rather than through config_dict.
config.save_intermediate_weights = True

In [None]:
import custom_gym_envs
import gymnasium as gym
from gymnasium.wrappers import FrameStack

# Disabled alternative: a Leduc Hold'em environment created through gymnasium.
# env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=False)

# Matching Pennies environment from custom_gym_envs.
# max_cycles=1 presumably ends each episode after a single round -- TODO confirm.
# NOTE(review): render_mode="human" may render every step during training -- confirm this is intended.
env = matching_pennies_env(render_mode="human", max_cycles=1)

# Build the NFSP agent from this environment and the `config` object defined
# in the configuration cell; device="cpu" is passed explicitly.
agent = NFSPDQN(env, config, name="NFSP-MatchingPennies", device="cpu")

In [None]:
# Checkpoint settings are assigned on the agent before train() is called.
# Presumably: evaluate every 500 training steps using 10000 test episodes
# each time -- TODO confirm against NFSPDQN.train().
agent.checkpoint_interval = 500
agent.checkpoint_trials = 10000
# Start training with the settings above.
agent.train()

In [1]:
# Open question: share the networks between players but keep separate replay buffers?
# Open question: train with 1 or 2 minibatches per update?

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP agent on Leduc Hold'em.
# Keys with the "sl_" prefix presumably configure the supervised
# (average-strategy) learner and the unprefixed ones the RL (DQN) learner
# -- TODO confirm against NFSPDQNConfig.
config_dict = {
    # --- run-level settings ---
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    # 1 or 2; ambiguous whether 2 would mean 2 minibatches per network, or
    # 2 in total (1 for each network/player).
    "num_minibatches": 1,
    # --- RL learner: optimizer, loss, replay buffer ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Integer on purpose: a buffer capacity is a count, and the float literal
    # 2e5 ends up stored in the config as 200000.0.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    # --- RL learner: network architecture ---
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    # --- epsilon-greedy exploration ---
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- SL learner: optimizer, loss, replay buffer, architecture ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- "per_" settings (all alpha/beta values are zero here) ---
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    # --- remaining DQN options ---
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    # --- gradient clipping for both learners ---
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}

# Combine the hyperparameters with the Leduc Hold'em game settings.
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Set on the built config object rather than through config_dict.
# NOTE(review): the constructor logs "Using default save_intermediate_weights : False"
# before this override runs, so that log line does not reflect the final value.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x1037a1ed0>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [2]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Leduc Hold'em environment from PettingZoo's classic collection.
env = leduc_holdem_v4.env()
# Disabled alternative: the custom Matching Pennies environment.
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Print player_0's observation space as a sanity check; for this environment
# it is a Dict with an "action_mask" entry and an "observation" entry.
print(env.observation_space("player_0"))

# Build the NFSP agent from this environment and the `config` object defined
# in the configuration cell; device="cpu" is passed explicitly.
agent = NFSPDQN(env, config, name="NFSP-LeducHoldem-Standard", device="cpu")

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [3]:
# Checkpoint settings are assigned on the agent before train() is called.
# In the recorded run, a test phase is reported at training steps 2000, 4000, ...
# and each one completes 10000 episodes, matching these two values.
agent.checkpoint_interval = 2000
agent.checkpoint_trials = 10000
# Start training; progress, buffer sizes, and test summaries are printed as it runs.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/50000 [00:00<17:54, 46.54it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 65/200000
   Player 0 SL buffer: 8/2000000
   Player 1 RL buffer: 62/200000
   Player 1 SL buffer: 7/2000000


  2%|▏         | 1005/50000 [00:28<23:02, 35.45it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 63883/200000
   Player 0 SL buffer: 7051/2000000
   Player 1 RL buffer: 64245/200000
   Player 1 SL buffer: 7171/2000000


  4%|▍         | 1999/50000 [01:07<28:35, 27.97it/s]  

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 127674/200000
   Player 0 SL buffer: 13775/2000000
   Player 1 RL buffer: 128452/200000
   Player 1 SL buffer: 13827/2000000
P1 SL Buffer Size:  13775
P1 SL buffer distribution [4330. 7703.  629. 1113.]
P1 actions distribution [0.31433757 0.55920145 0.04566243 0.08079855]
P2 SL Buffer Size:  13827
P2 SL buffer distribution [4532. 7118.  874. 1303.]
P2 actions distribution [0.32776452 0.5147899  0.06320966 0.09423592]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.9876,  2.5510, -0.5471,  1.6790]])
Player 0 Prediction: tensor([[0.1569, 0.8225, 0.0207, 0.0000]])
Player 1 Prediction: tensor([[ 2.6371,  3.4609, -2.0920,  2.5180]])
Player 0 Prediction: tensor([[0.0000, 0.7888, 0.0563, 0.1549]])
Player 1 Prediction: tensor([[ 1.2185,  2.0621, -2.9521,  1.1818]])
Player 0 Prediction: tensor([[0.9204, 0.0000, 0.0796

  4%|▍         | 1999/50000 [01:20<28:35, 27.97it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 53909
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5440/10000 (54.4%)
    Average reward: -0.990
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4560/10000 (45.6%)
    Average reward: +0.990
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8532 (32.0%)
    Action 1: 15794 (59.2%)
    Action 2: 1247 (4.7%)
    Action 3: 1104 (4.1%)
  Player 1:
    Action 0: 7128 (26.2%)
    Action 1: 14693 (54.0%)
    Action 2: 3154 (11.6%)
    Action 3: 2257 (8.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-9898.5, 9898.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.974 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.986 (max=1.0 for random)
    → Playing near

  axs[row][col].legend()
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].legend()
  6%|▌         | 3003/50000 [02:21<28:40, 27.31it/s]   

   Player 0 ε: 0.0011 → 0.0011

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 191321/200000
   Player 0 SL buffer: 20269/2000000
   Player 1 RL buffer: 192805/200000
   Player 1 SL buffer: 20218/2000000


  8%|▊         | 4000/50000 [02:55<27:05, 28.29it/s]

   Player 0 ε: 0.0009 → 0.0009

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 26832/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 26913/2000000
P1 SL Buffer Size:  26832
P1 SL buffer distribution [ 7979. 14519.  1859.  2475.]
P1 actions distribution [0.29736881 0.54110763 0.06928295 0.09224061]
P2 SL Buffer Size:  26913
P2 SL buffer distribution [ 9034. 12955.  2229.  2695.]
P2 actions distribution [0.33567421 0.48136588 0.08282243 0.10013748]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.8158,  3.4494, -0.5353,  2.5612]])
Player 0 Prediction: tensor([[0.0451, 0.9454, 0.0094, 0.0000]])
Player 1 Prediction: tensor([[ 2.8776,  3.9054, -2.0001,  2.8937]])
Player 0 Prediction: tensor([[0.0000, 0.7997, 0.0640, 0.1362]])
Player 1 Prediction: tensor([[ 0.5948,  1.3128, -2.9548,  0.7965]])
Player 0 Prediction: tensor([[0.9735, 0.0000

  8%|▊         | 4000/50000 [03:10<27:05, 28.29it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 56531
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5222/10000 (52.2%)
    Average reward: -0.899
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4778/10000 (47.8%)
    Average reward: +0.899
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8874 (31.4%)
    Action 1: 15582 (55.2%)
    Action 2: 1875 (6.6%)
    Action 3: 1920 (6.8%)
  Player 1:
    Action 0: 7388 (26.1%)
    Action 1: 14145 (50.0%)
    Action 2: 2717 (9.6%)
    Action 3: 4030 (14.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-8987.5, 8987.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.998 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.006 (max=1.0 for random)
    → Playing near

 10%|█         | 5005/50000 [04:01<24:41, 30.37it/s]   

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 33465/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 33411/2000000


 12%|█▏        | 5999/50000 [04:40<35:30, 20.66it/s]

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 39924/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40167/2000000
P1 SL Buffer Size:  39924
P1 SL buffer distribution [12419. 20646.  3245.  3614.]
P1 actions distribution [0.31106603 0.51713255 0.08127943 0.09052199]
P2 SL Buffer Size:  40167
P2 SL buffer distribution [13138. 19512.  3533.  3984.]
P2 actions distribution [0.32708442 0.4857719  0.08795778 0.0991859 ]


 12%|█▏        | 5999/50000 [04:51<35:30, 20.66it/s]

   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2065, 0.7743, 0.0192, 0.0000]])
Player 1 Prediction: tensor([[ 0.1156,  0.2856, -0.9552,  0.2312]])
Player 0 Prediction: tensor([[0.9813, 0.0000, 0.0187, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 56057
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4966/10000 (49.7%)
    Average reward: -0.922
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5034/10000 (50.3%)
    Average reward: +0.922
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8956 (31.9%)
    Action 1: 14898 (53.0%)
    Action 2: 2372 (8.4%)
    Action 3: 1874 (6.7%)
  Player 1:
    Action 0: 7009 (25.1%)
    Action 1: 14468 (51.8%)
    Action 2: 2521 (9.0%)
    Action 3: 3959 (14.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per playe

 14%|█▍        | 7004/50000 [05:54<26:36, 26.93it/s]   

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 46371/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 46622/2000000


 16%|█▌        | 8000/50000 [06:32<25:38, 27.29it/s]

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 52896/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 53291/2000000
P1 SL Buffer Size:  52896
P1 SL buffer distribution [16936. 26870.  4439.  4651.]
P1 actions distribution [0.32017544 0.50797792 0.08391939 0.08792725]
P2 SL Buffer Size:  53291
P2 SL buffer distribution [17136. 26299.  4782.  5074.]
P2 actions distribution [0.32155523 0.49349796 0.08973373 0.09521308]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.6025, 0.3543, 0.0432, 0.0000]])
Player 1 Prediction: tensor([[ 0.5172,  1.2355, -0.9843,  1.0505]])
Player 0 Prediction: tensor([[0.3522, 0.5874, 0.0604, 0.0000]])
Player 1 Prediction: tensor([[-1.5048, -1.6517, -2.1242, -1.6446]])
Player 0 Prediction: tensor([[0.0000, 0.3136, 0.2567, 0.4297]])
Player 1 Prediction: tensor([[-3.1221, -4.3363, 

 16%|█▌        | 8000/50000 [06:51<25:38, 27.29it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 48490
Average episode length: 4.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4982/10000 (49.8%)
    Average reward: -0.123
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5018/10000 (50.2%)
    Average reward: +0.123
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2491 (10.7%)
    Action 1: 15812 (67.8%)
    Action 2: 2465 (10.6%)
    Action 3: 2538 (10.9%)
  Player 1:
    Action 0: 20742 (82.4%)
    Action 1: 4442 (17.6%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1232.0, 1232.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.725 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.672 (max=1.0 for random)
    → Strongly prefers Heads
  Average 

 18%|█▊        | 9004/50000 [07:38<22:54, 29.83it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 59250/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 59870/2000000


 20%|██        | 10000/50000 [08:19<28:03, 23.76it/s]

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 65727/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 66440/2000000
P1 SL Buffer Size:  65727
P1 SL buffer distribution [21169. 33483.  5622.  5453.]
P1 actions distribution [0.32207464 0.50942535 0.08553562 0.08296438]
P2 SL Buffer Size:  66440
P2 SL buffer distribution [20768. 33505.  5998.  6169.]
P2 actions distribution [0.31258278 0.50428958 0.09027694 0.09285069]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.5055,  1.3937, -0.6673,  1.7244]])
Player 0 Prediction: tensor([[0.4191, 0.5412, 0.0397, 0.0000]])
Player 1 Prediction: tensor([[ 0.4750,  1.4670, -1.8893,  0.9966]])
Player 0 Prediction: tensor([[0.0000, 0.3120, 0.0995, 0.5885]])
Player 1 Prediction: tensor([[ 0.7111,  1.2275, -3.0509,  0.8566]])
Player 0 Prediction: tensor([[0.1007, 0.19

 20%|██        | 10000/50000 [08:31<28:03, 23.76it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 55768
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4368/10000 (43.7%)
    Average reward: -0.596
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5632/10000 (56.3%)
    Average reward: +0.596
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9598 (33.9%)
    Action 1: 13842 (48.9%)
    Action 2: 3571 (12.6%)
    Action 3: 1306 (4.6%)
  Player 1:
    Action 0: 6317 (23.0%)
    Action 1: 17630 (64.2%)
    Action 2: 2134 (7.8%)
    Action 3: 1370 (5.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5961.5, 5961.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.034 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.898 (max=1.0 for random)
    → Mixed strat

 22%|██▏       | 11003/50000 [09:29<24:30, 26.52it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 72080/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 72683/2000000


 24%|██▍       | 11998/50000 [10:09<25:29, 24.85it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 78339/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 78774/2000000
P1 SL Buffer Size:  78339
P1 SL buffer distribution [25125. 39893.  6771.  6550.]
P1 actions distribution [0.32072148 0.5092355  0.08643205 0.08361097]
P2 SL Buffer Size:  78774
P2 SL buffer distribution [24609. 39605.  7154.  7406.]
P2 actions distribution [0.31240003 0.50276741 0.09081677 0.09401579]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.6880, 0.2840, 0.0280, 0.0000]])
Player 1 Prediction: tensor([[ 0.6141,  1.0103, -1.0565,  1.4589]])
Player 0 Prediction: tensor([[0.0000, 0.4803, 0.1040, 0.4157]])
Player 1 Prediction: tensor([[ 2.1422,  3.3378, -0.8530,  3.9413]])
Player 0 Prediction: tensor([[0.0900, 0.0000, 0.9100, 0.0000]])


 24%|██▍       | 11998/50000 [10:21<25:29, 24.85it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 50708
Average episode length: 5.1 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4813/10000 (48.1%)
    Average reward: -0.752
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5187/10000 (51.9%)
    Average reward: +0.752
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6129 (24.3%)
    Action 1: 14250 (56.4%)
    Action 2: 2963 (11.7%)
    Action 3: 1929 (7.6%)
  Player 1:
    Action 0: 8888 (34.9%)
    Action 1: 11208 (44.1%)
    Action 2: 2680 (10.5%)
    Action 3: 2661 (10.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7515.5, 7515.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.962 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.051 (max=1.0 for random)
    → Playing n

 26%|██▌       | 13002/50000 [11:20<31:35, 19.52it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 84465/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 84765/2000000


 28%|██▊       | 13998/50000 [12:01<24:58, 24.03it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 90932/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 90863/2000000
P1 SL Buffer Size:  90932
P1 SL buffer distribution [29287. 45831.  8008.  7806.]
P1 actions distribution [0.32207584 0.50401399 0.08806581 0.08584437]
P2 SL Buffer Size:  90863
P2 SL buffer distribution [28944. 45004.  8361.  8554.]
P2 actions distribution [0.3185455  0.49529511 0.09201765 0.09414173]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[8.7769e-02, 9.1197e-01, 2.5842e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.0493, -0.4834, -1.0713,  0.1920]])
Player 0 Prediction: tensor([[0.0000, 0.9378, 0.0054, 0.0569]])
Player 1 Prediction: tensor([[ 4.4674,  5.8208, -2.0227,  3.7841]])
Player 0 Prediction: tensor([[0.9856, 0.0000, 0.0144, 0.0000]])

📊 TEST RESULTS SUMMARY
Traini

 28%|██▊       | 13998/50000 [12:21<24:58, 24.03it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 48663
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5111/10000 (51.1%)
    Average reward: -0.043
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4889/10000 (48.9%)
    Average reward: +0.043
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2456 (10.6%)
    Action 1: 15501 (67.1%)
    Action 2: 2235 (9.7%)
    Action 3: 2895 (12.5%)
  Player 1:
    Action 0: 20586 (80.5%)
    Action 1: 4990 (19.5%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-427.0, 427.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.730 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.712 (max=1.0 for random)
    → Strongly prefers Heads
  Average st

 30%|███       | 15004/50000 [13:14<23:33, 24.76it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 96972/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 97096/2000000


 32%|███▏      | 16000/50000 [13:57<23:20, 24.27it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 103451/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 103494/2000000
P1 SL Buffer Size:  103451
P1 SL buffer distribution [33258. 51816.  9270.  9107.]
P1 actions distribution [0.32148553 0.50087481 0.08960764 0.08803202]
P2 SL Buffer Size:  103494
P2 SL buffer distribution [33903. 49807.  9574. 10210.]
P2 actions distribution [0.32758421 0.48125495 0.09250778 0.09865306]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[8.0737e-02, 9.1909e-01, 1.6934e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.4182,  2.3985, -1.3833,  1.8471]])
Player 0 Prediction: tensor([[9.9966e-01, 0.0000e+00, 3.3776e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.3881,  0.2094, -2.9431,  0.5407]])
Player 0 Prediction: tensor([[0.0000e+00, 8.8287e-01, 8.5027e-04, 1.1628e-

 32%|███▏      | 16000/50000 [14:11<23:20, 24.27it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 53889
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4958/10000 (49.6%)
    Average reward: -0.563
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5042/10000 (50.4%)
    Average reward: +0.563
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7625 (28.5%)
    Action 1: 13483 (50.3%)
    Action 2: 2311 (8.6%)
    Action 3: 3367 (12.6%)
  Player 1:
    Action 0: 10293 (38.0%)
    Action 1: 10408 (38.4%)
    Action 2: 2393 (8.8%)
    Action 3: 4009 (14.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5628.5, 5628.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.014 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing n

 34%|███▍      | 17005/50000 [15:08<22:36, 24.33it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 109804/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 109796/2000000


 36%|███▌      | 17999/50000 [15:50<22:20, 23.88it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 116141/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 116337/2000000
P1 SL Buffer Size:  116141
P1 SL buffer distribution [37558. 57432. 10562. 10589.]
P1 actions distribution [0.32338278 0.49450237 0.09094118 0.09117366]
P2 SL Buffer Size:  116337
P2 SL buffer distribution [38579. 54754. 10754. 12250.]
P2 actions distribution [0.33161419 0.47064992 0.09243835 0.10529754]


 36%|███▌      | 17999/50000 [16:01<22:20, 23.88it/s]

   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[7.8044e-02, 9.2184e-01, 1.1283e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.1848,  0.5862, -1.1555,  0.6548]])
Player 0 Prediction: tensor([[9.9977e-01, 0.0000e+00, 2.3173e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.1852, -0.2986, -2.9423,  0.2629]])
Player 0 Prediction: tensor([[0.0000e+00, 8.5002e-01, 4.2567e-04, 1.4956e-01]])
Player 1 Prediction: tensor([[-1.0349, -1.9569, -2.9438, -0.6092]])

📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 53070
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5073/10000 (50.7%)
    Average reward: -0.553
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4927/10000 (49.3%)
    Average reward: +0.553
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7317 (27.6%)
   

 38%|███▊      | 19003/50000 [17:05<21:09, 24.41it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 122289/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 122741/2000000


 40%|███▉      | 19998/50000 [17:47<20:43, 24.13it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 128750/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 128998/2000000
P1 SL Buffer Size:  128750
P1 SL buffer distribution [42116. 62548. 11811. 12275.]
P1 actions distribution [0.32711456 0.48580971 0.09173592 0.09533981]
P2 SL Buffer Size:  128998
P2 SL buffer distribution [43195. 59842. 11935. 14026.]
P2 actions distribution [0.33485015 0.46389867 0.09252081 0.10873037]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8274, 0.1629, 0.0097, 0.0000]])
Player 1 Prediction: tensor([[-0.1278, -0.0700, -1.2387,  0.0555]])
Player 0 Prediction: tensor([[0.9881, 0.0000, 0.0119, 0.0000]])
Player 1 Prediction: tensor([[-2.4140, -3.3377, -2.8360, -2.8283]])
Player 0 Prediction: tensor([[0.0000, 0.2664, 0.0522, 0.6814]])
Player 1 Prediction: tensor([[-4.5218, -6.

 40%|███▉      | 19998/50000 [18:01<20:43, 24.13it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 56195
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5328/10000 (53.3%)
    Average reward: -0.316
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4672/10000 (46.7%)
    Average reward: +0.316
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9734 (34.8%)
    Action 1: 12236 (43.7%)
    Action 2: 2108 (7.5%)
    Action 3: 3928 (14.0%)
  Player 1:
    Action 0: 7708 (27.3%)
    Action 1: 12229 (43.4%)
    Action 2: 2134 (7.6%)
    Action 3: 6118 (21.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3160.0, 3160.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.052 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.034 (max=1.0 for random)
    → Playing ne

 42%|████▏     | 21004/50000 [19:00<21:31, 22.46it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 135285/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 135658/2000000


 44%|████▍     | 22000/50000 [19:46<20:35, 22.67it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 141703/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 142118/2000000
P1 SL Buffer Size:  141703
P1 SL buffer distribution [47040. 67481. 13071. 14111.]
P1 actions distribution [0.33196192 0.47621434 0.09224222 0.09958152]
P2 SL Buffer Size:  142118
P2 SL buffer distribution [47852. 64902. 13178. 16186.]
P2 actions distribution [0.33670612 0.45667685 0.09272576 0.11389127]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.0228, -0.6662, -0.6670,  0.1856]])
Player 0 Prediction: tensor([[0.0000, 0.8741, 0.0020, 0.1240]])
Player 1 Prediction: tensor([[-1.1808, -1.8932, -0.9603, -0.2088]])
Player 0 Prediction: tensor([[0.0000, 0.8035, 0.0019, 0.1946]])
Player 1 Prediction: tensor([[-2.1459, -2.0982, -1.2196, -0.8950]])


 44%|████▍     | 22000/50000 [20:01<20:35, 22.67it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 53078
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5256/10000 (52.6%)
    Average reward: -0.332
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4744/10000 (47.4%)
    Average reward: +0.332
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7427 (28.2%)
    Action 1: 12637 (47.9%)
    Action 2: 2044 (7.8%)
    Action 3: 4265 (16.2%)
  Player 1:
    Action 0: 10009 (37.5%)
    Action 1: 9800 (36.7%)
    Action 2: 2586 (9.7%)
    Action 3: 4310 (16.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3317.0, 3317.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.023 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing ne

 46%|████▌     | 23003/50000 [20:58<18:41, 24.06it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 148274/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 148466/2000000


 48%|████▊     | 23998/50000 [21:42<19:30, 22.21it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 154619/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 154906/2000000
P1 SL Buffer Size:  154619
P1 SL buffer distribution [51820. 72265. 14282. 16252.]
P1 actions distribution [0.33514639 0.46737464 0.09236898 0.10510998]
P2 SL Buffer Size:  154906
P2 SL buffer distribution [52577. 69519. 14514. 18296.]
P2 actions distribution [0.33941229 0.44878184 0.09369553 0.11811034]


 48%|████▊     | 23998/50000 [21:52<19:30, 22.21it/s]

   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8712, 0.1228, 0.0060, 0.0000]])
Player 1 Prediction: tensor([[ 0.0368,  0.4037, -1.2790,  0.5799]])
Player 0 Prediction: tensor([[0.9857, 0.0000, 0.0143, 0.0000]])
Player 1 Prediction: tensor([[-2.2191, -1.7212, -2.8877, -2.4732]])
Player 0 Prediction: tensor([[0.2374, 0.4485, 0.3140, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 51831
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5460/10000 (54.6%)
    Average reward: -0.380
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4540/10000 (45.4%)
    Average reward: +0.380
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7437 (28.9%)
    Action 1: 12325 (47.9%)
    Action 2: 2032 (7.9%)
    Action 3: 3962 (15.4%)
  Player 1:
    Action 0: 8791 (33.7%)

 50%|█████     | 25003/50000 [22:58<17:48, 23.40it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 160975/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 161552/2000000


 52%|█████▏    | 25999/50000 [23:43<17:23, 23.00it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 167280/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 168043/2000000
P1 SL Buffer Size:  167280
P1 SL buffer distribution [56103. 77000. 15443. 18734.]
P1 actions distribution [0.33538379 0.46030607 0.09231827 0.11199187]
P2 SL Buffer Size:  168043
P2 SL buffer distribution [57137. 74479. 15882. 20545.]
P2 actions distribution [0.34001416 0.44321394 0.09451152 0.12226037]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8815, 0.1135, 0.0050, 0.0000]])
Player 1 Prediction: tensor([[ 1.2553,  1.8741, -1.4469,  1.2331]])
Player 0 Prediction: tensor([[0.7192, 0.2763, 0.0045, 0.0000]])
Player 1 Prediction: tensor([[ 2.7551,  2.6467, -1.6116,  3.1129]])
Player 0 Prediction: tensor([[0.0000, 0.2881, 0.0534, 0.6585]])

📊 TEST RESULTS SUMMARY
Training step: 260

 52%|█████▏    | 25999/50000 [24:02<17:23, 23.00it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 50217
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5296/10000 (53.0%)
    Average reward: +0.335
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4704/10000 (47.0%)
    Average reward: -0.335
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3407 (14.2%)
    Action 1: 14394 (59.9%)
    Action 2: 2502 (10.4%)
    Action 3: 3708 (15.4%)
  Player 1:
    Action 0: 19351 (73.8%)
    Action 1: 6855 (26.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3352.0, -3352.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.842 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.829 (max=1.0 for random)
    → Mixed strategy
  Average strateg

 54%|█████▍    | 27003/50000 [24:56<17:07, 22.38it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 173633/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 174397/2000000


 56%|█████▌    | 28000/50000 [25:43<17:09, 21.37it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 179943/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 180747/2000000
P1 SL Buffer Size:  179943
P1 SL buffer distribution [60295. 81684. 16643. 21321.]
P1 actions distribution [0.33507833 0.45394375 0.0924904  0.11848752]
P2 SL Buffer Size:  180747
P2 SL buffer distribution [61678. 79043. 17101. 22925.]
P2 actions distribution [0.34123941 0.43731293 0.09461291 0.12683475]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-0.0124, -1.0357, -0.7435,  0.1161]])
Player 0 Prediction: tensor([[0.0000, 0.8211, 0.0017, 0.1772]])
Player 1 Prediction: tensor([[-0.0646, -0.8831, -1.1111, -0.1230]])
Player 0 Prediction: tensor([[0.0000, 0.7379, 0.0034, 0.2587]])
Player 1 Prediction: tensor([[-1.7281, -1.6744, -2.0075, -1.6134]])

📊 TEST RESULTS SUMMARY
Training step:

 56%|█████▌    | 28000/50000 [26:03<17:09, 21.37it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 50200
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5261/10000 (52.6%)
    Average reward: +0.337
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4739/10000 (47.4%)
    Average reward: -0.337
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3471 (14.5%)
    Action 1: 14037 (58.5%)
    Action 2: 2552 (10.6%)
    Action 3: 3936 (16.4%)
  Player 1:
    Action 0: 19071 (72.8%)
    Action 1: 7133 (27.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3371.0, -3371.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.856 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.845 (max=1.0 for random)
    → Mixed strategy
  Average strateg

 58%|█████▊    | 29003/50000 [26:58<15:04, 23.22it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 186123/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 187011/2000000


 60%|█████▉    | 29998/50000 [27:44<14:45, 22.59it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 192292/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 193313/2000000
P1 SL Buffer Size:  192292
P1 SL buffer distribution [64668. 86016. 17893. 23715.]
P1 actions distribution [0.33630104 0.4473197  0.09305119 0.12332806]
P2 SL Buffer Size:  193313
P2 SL buffer distribution [66540. 83123. 18311. 25339.]
P2 actions distribution [0.34420862 0.42999177 0.09472203 0.13107758]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-0.0067, -0.9224, -0.8240,  0.1210]])
Player 0 Prediction: tensor([[0.0000e+00, 9.1790e-01, 7.0311e-05, 8.2027e-02]])
Player 1 Prediction: tensor([[-0.1721, -0.8329, -1.2005, -0.1157]])
Player 0 Prediction: tensor([[0.0000e+00, 7.3572e-01, 1.4757e-04, 2.6413e-01]])
Player 1 Prediction: tensor([[-4.1660, -5.3212, -1.9571, -2.8022]])

📊 TES

 60%|█████▉    | 29998/50000 [28:03<14:45, 22.59it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 50632
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5226/10000 (52.3%)
    Average reward: +0.359
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4774/10000 (47.7%)
    Average reward: -0.359
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3739 (15.4%)
    Action 1: 13838 (56.9%)
    Action 2: 2693 (11.1%)
    Action 3: 4036 (16.6%)
  Player 1:
    Action 0: 18824 (71.5%)
    Action 1: 7502 (28.5%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3587.0, -3587.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.878 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.862 (max=1.0 for random)
    → Mixed strategy
  Average strateg

 62%|██████▏   | 31003/50000 [28:56<12:59, 24.36it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 198569/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 199897/2000000


 64%|██████▍   | 32000/50000 [29:40<13:03, 22.96it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 205033/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 206164/2000000
P1 SL Buffer Size:  205033
P1 SL buffer distribution [69139. 90320. 19106. 26468.]
P1 actions distribution [0.33720913 0.44051445 0.093185   0.12909141]
P2 SL Buffer Size:  206164
P2 SL buffer distribution [71104. 87595. 19500. 27965.]
P2 actions distribution [0.34489048 0.42488019 0.09458489 0.13564444]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[1.8132e-01, 8.1867e-01, 1.0096e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.0033,  0.3038, -1.1784,  0.4330]])
Player 0 Prediction: tensor([[9.9995e-01, 0.0000e+00, 4.6203e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 5.0903,  5.6133, -3.0150,  4.4055]])
Player 0 Prediction: tensor([[0.5481, 0.4493, 0.0025, 0.0000]])


 64%|██████▍   | 32000/50000 [29:54<13:03, 22.96it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 51576
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5667/10000 (56.7%)
    Average reward: -0.329
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4333/10000 (43.3%)
    Average reward: +0.329
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7976 (30.9%)
    Action 1: 11365 (44.1%)
    Action 2: 2286 (8.9%)
    Action 3: 4159 (16.1%)
  Player 1:
    Action 0: 8609 (33.4%)
    Action 1: 10571 (41.0%)
    Action 2: 2249 (8.7%)
    Action 3: 4361 (16.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3290.5, 3290.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.045 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.056 (max=1.0 for random)
    → Playing ne

 66%|██████▌   | 33004/50000 [30:54<13:39, 20.74it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 211256/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 212495/2000000


 68%|██████▊   | 33998/50000 [31:38<11:34, 23.04it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 217549/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 218782/2000000
P1 SL Buffer Size:  217549
P1 SL buffer distribution [73293. 94933. 20269. 29054.]
P1 actions distribution [0.33690341 0.43637525 0.09316981 0.13355152]
P2 SL Buffer Size:  218782
P2 SL buffer distribution [75648. 92110. 20546. 30478.]
P2 actions distribution [0.34576885 0.4210127  0.09391083 0.13930762]
   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9100, 0.0879, 0.0022, 0.0000]])
Player 1 Prediction: tensor([[ 0.2277,  0.7392, -0.9901,  0.8660]])
Player 0 Prediction: tensor([[0.0000, 0.6906, 0.0097, 0.2997]])
Player 1 Prediction: tensor([[-3.1477, -3.1967, -0.9992, -1.6388]])


 68%|██████▊   | 33998/50000 [31:54<11:34, 23.04it/s]


📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 50821
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5656/10000 (56.6%)
    Average reward: -0.280
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4344/10000 (43.4%)
    Average reward: +0.280
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7575 (29.9%)
    Action 1: 11068 (43.7%)
    Action 2: 2066 (8.1%)
    Action 3: 4641 (18.3%)
  Player 1:
    Action 0: 8138 (32.0%)
    Action 1: 9691 (38.0%)
    Action 2: 2086 (8.2%)
    Action 3: 5556 (21.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2795.5, 2795.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.056 (max=1.0 for random)
    → Playing nea

 70%|███████   | 35003/50000 [32:52<11:20, 22.03it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 223744/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 225031/2000000


 72%|███████▏  | 35999/50000 [33:37<10:03, 23.19it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 230114/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 231205/2000000
P1 SL Buffer Size:  230114
P1 SL buffer distribution [77631. 99076. 21443. 31964.]
P1 actions distribution [0.33735887 0.43055181 0.09318425 0.13890506]
P2 SL Buffer Size:  231205
P2 SL buffer distribution [80197. 96394. 21550. 33064.]
P2 actions distribution [0.34686534 0.41692005 0.09320733 0.14300729]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.5106,  0.1816, -0.6669,  0.3396]])
Player 0 Prediction: tensor([[0.0000, 0.1173, 0.0198, 0.8629]])
Player 1 Prediction: tensor([[ 0.3033,  0.4891, -0.8745,  0.8136]])
Player 0 Prediction: tensor([[0.0000, 0.0312, 0.0123, 0.9565]])


 72%|███████▏  | 35999/50000 [33:54<10:03, 23.19it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 49424
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5882/10000 (58.8%)
    Average reward: -0.282
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4118/10000 (41.2%)
    Average reward: +0.282
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5400 (22.3%)
    Action 1: 11144 (46.0%)
    Action 2: 1725 (7.1%)
    Action 3: 5958 (24.6%)
  Player 1:
    Action 0: 11481 (45.6%)
    Action 1: 6313 (25.1%)
    Action 2: 2133 (8.5%)
    Action 3: 5270 (20.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2817.5, 2817.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.998 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.017 (max=1.0 for random)
    → Playing ne

 74%|███████▍  | 37003/50000 [34:51<09:32, 22.70it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 236527/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 237593/2000000


 76%|███████▌  | 37998/50000 [35:37<08:48, 22.73it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 242828/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 243703/2000000
P1 SL Buffer Size:  242828
P1 SL buffer distribution [ 82291. 102922.  22609.  35006.]
P1 actions distribution [0.33888596 0.42384733 0.09310706 0.14415965]
P2 SL Buffer Size:  243703
P2 SL buffer distribution [ 84875. 100648.  22671.  35509.]
P2 actions distribution [0.34827228 0.41299451 0.09302717 0.14570604]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[2.2840e-01, 7.7132e-01, 2.8320e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.3105,  0.8412, -0.9737,  0.6680]])
Player 0 Prediction: tensor([[4.2671e-01, 5.7284e-01, 4.4181e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.9826,  0.7490, -1.7669,  0.5912]])
Player 0 Prediction: tensor([[0.0794, 0.8229, 0.0977, 0.0000]])
Pl

 76%|███████▌  | 37998/50000 [35:56<08:48, 22.73it/s]


📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 51590
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5184/10000 (51.8%)
    Average reward: +0.483
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4816/10000 (48.2%)
    Average reward: -0.483
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3878 (15.6%)
    Action 1: 13454 (54.1%)
    Action 2: 2838 (11.4%)
    Action 3: 4693 (18.9%)
  Player 1:
    Action 0: 18504 (69.2%)
    Action 1: 8223 (30.8%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [4833.5, -4833.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.898 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.890 (max=1.0 for random)
    → Mixed strategy
  Average strateg

 78%|███████▊  | 39005/50000 [36:50<07:54, 23.20it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 249226/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 250147/2000000


 80%|████████  | 40000/50000 [37:35<10:20, 16.12it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 255708/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 256460/2000000
P1 SL Buffer Size:  255708
P1 SL buffer distribution [ 86934. 106984.  23881.  37909.]
P1 actions distribution [0.33997372 0.41838347 0.09339168 0.14825113]
P2 SL Buffer Size:  256460
P2 SL buffer distribution [ 89486. 105646.  23736.  37592.]
P2 actions distribution [0.34892771 0.41193948 0.09255244 0.14658036]


 80%|████████  | 40000/50000 [37:46<10:20, 16.12it/s]

   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[2.7623e-01, 7.2350e-01, 2.7276e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.3464, -0.7075, -1.0231,  0.0444]])
Player 0 Prediction: tensor([[0.0000, 0.2148, 0.0251, 0.7601]])
Player 1 Prediction: tensor([[ 1.9876,  2.7584, -0.8679,  1.8158]])
Player 0 Prediction: tensor([[0.0320, 0.2791, 0.6889, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 50672
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 6000/10000 (60.0%)
    Average reward: -0.199
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4000/10000 (40.0%)
    Average reward: +0.199
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5969 (23.9%)
    Action 1: 11235 (45.0%)
    Action 2: 1983 (7.9%)
    Action 3: 5791 (23.2%)
  Player 1:
    Action

 82%|████████▏ | 41003/50000 [38:54<06:55, 21.64it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 262137/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 262951/2000000


 84%|████████▍ | 41999/50000 [39:47<06:33, 20.33it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 268472/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 269138/2000000
P1 SL Buffer Size:  268472
P1 SL buffer distribution [ 91552. 110948.  25179.  40793.]
P1 actions distribution [0.34101135 0.41325725 0.09378632 0.15194508]
P2 SL Buffer Size:  269138
P2 SL buffer distribution [ 94022. 110376.  24815.  39925.]
P2 actions distribution [0.34934495 0.41010931 0.09220177 0.14834397]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9313, 0.0672, 0.0015, 0.0000]])
Player 1 Prediction: tensor([[ 0.9838,  1.6198, -1.5109,  1.0142]])
Player 0 Prediction: tensor([[0.8091, 0.1895, 0.0014, 0.0000]])
Player 1 Prediction: tensor([[ 0.0829,  0.2305, -1.9180,  0.0481]])
Player 0 Prediction: tensor([[0.0110, 0.8567, 0.1323, 0.0000]])
Player 1 Prediction: tensor([[-5.1

 84%|████████▍ | 41999/50000 [40:07<06:33, 20.33it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 51787
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5269/10000 (52.7%)
    Average reward: +0.586
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4731/10000 (47.3%)
    Average reward: -0.586
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4319 (17.3%)
    Action 1: 13130 (52.5%)
    Action 2: 2860 (11.4%)
    Action 3: 4717 (18.8%)
  Player 1:
    Action 0: 18144 (67.8%)
    Action 1: 8617 (32.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [5860.0, -5860.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.926 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.907 (max=1.0 for random)
    → Playing nearly r

 86%|████████▌ | 43003/50000 [41:06<05:26, 21.40it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 274733/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 275409/2000000


 88%|████████▊ | 44000/50000 [41:56<04:50, 20.67it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 281037/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 281513/2000000
P1 SL Buffer Size:  281037
P1 SL buffer distribution [ 96032. 114802.  26401.  43802.]
P1 actions distribution [0.3417059  0.40849426 0.09394137 0.15585848]
P2 SL Buffer Size:  281513
P2 SL buffer distribution [ 98173. 115453.  25847.  42040.]
P2 actions distribution [0.34873345 0.41011605 0.09181459 0.14933591]


 88%|████████▊ | 44000/50000 [42:07<04:50, 20.67it/s]

   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[3.1224e-01, 6.8755e-01, 2.0675e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.8420,  1.6756, -1.4897,  1.0445]])
Player 0 Prediction: tensor([[5.2182e-01, 4.7787e-01, 3.0168e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.3808,  2.8859, -1.4689,  3.0824]])
Player 0 Prediction: tensor([[0.0000, 0.7046, 0.0007, 0.2947]])

📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 50458
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5618/10000 (56.2%)
    Average reward: -0.284
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4382/10000 (43.8%)
    Average reward: +0.284
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7783 (31.1%)
    Action 1: 10944 (43.7%)
    Action 2: 2809 (11.2%)
    Action 3: 3512 (14.0%)
  Pla

 90%|█████████ | 45003/50000 [43:15<03:57, 21.05it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 287345/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 287753/2000000


 92%|█████████▏| 45999/50000 [44:04<03:16, 20.33it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 293562/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 294081/2000000
P1 SL Buffer Size:  293562
P1 SL buffer distribution [100389. 118757.  27646.  46770.]
P1 actions distribution [0.34196865 0.40453805 0.09417431 0.15931899]
P2 SL Buffer Size:  294081
P2 SL buffer distribution [102556. 120031.  26979.  44515.]
P2 actions distribution [0.34873385 0.40815626 0.09174003 0.15136986]
   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[3.4688e-01, 6.5293e-01, 1.8080e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.1552, -0.6875, -1.0257,  0.0465]])
Player 0 Prediction: tensor([[0.0000e+00, 2.4313e-01, 3.0114e-04, 7.5657e-01]])
Player 1 Prediction: tensor([[-0.2299, -0.7200, -1.0319, -0.4568]])


 92%|█████████▏| 45999/50000 [44:17<03:16, 20.33it/s]


📊 TEST RESULTS SUMMARY
Training step: 46000
Episodes completed: 10000/10000
Total steps: 49291
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5761/10000 (57.6%)
    Average reward: -0.379
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4239/10000 (42.4%)
    Average reward: +0.379
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5909 (24.2%)
    Action 1: 10757 (44.1%)
    Action 2: 2050 (8.4%)
    Action 3: 5655 (23.2%)
  Player 1:
    Action 0: 9502 (38.1%)
    Action 1: 7321 (29.4%)
    Action 2: 2257 (9.1%)
    Action 3: 5840 (23.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3787.5, 3787.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.016 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.050 (max=1.0 for random)
    → Playing nea

 94%|█████████▍| 47003/50000 [45:27<03:22, 14.78it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 299836/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 300339/2000000


 96%|█████████▌| 47998/50000 [46:18<01:39, 20.18it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 306157/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 306586/2000000
P1 SL Buffer Size:  306157
P1 SL buffer distribution [104877. 122627.  28916.  49737.]
P1 actions distribution [0.34255954 0.40053633 0.09444827 0.16245586]
P2 SL Buffer Size:  306586
P2 SL buffer distribution [107117. 124514.  27915.  47040.]
P2 actions distribution [0.34938647 0.40613074 0.09105112 0.15343166]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.6699,  1.3992, -1.1013,  0.7372]])
Player 0 Prediction: tensor([[0.0000, 0.6679, 0.0009, 0.3312]])
Player 1 Prediction: tensor([[ 0.8588,  1.5757, -1.4352,  1.9783]])
Player 0 Prediction: tensor([[9.9936e-01, 0.0000e+00, 6.4345e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.5229,  4.9582, -3.1189,  3.9750]])
Player 0 Predi

 96%|█████████▌| 47998/50000 [46:38<01:39, 20.18it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 51726
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5043/10000 (50.4%)
    Average reward: +0.501
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4957/10000 (49.6%)
    Average reward: -0.501
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4461 (17.8%)
    Action 1: 12648 (50.3%)
    Action 2: 3126 (12.4%)
    Action 3: 4895 (19.5%)
  Player 1:
    Action 0: 17724 (66.6%)
    Action 1: 8872 (33.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [5015.0, -5015.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.941 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.919 (max=1.0 for random)
    → Playing nearly r

 98%|█████████▊| 49004/50000 [47:50<00:49, 20.27it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 312331/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 312977/2000000


100%|██████████| 50000/50000 [48:40<00:00, 17.12it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[1.9943e-01, 8.0056e-01, 4.1785e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.4926,  0.4304, -1.1620,  0.2424]])
Player 0 Prediction: tensor([[0.0000e+00, 3.0523e-01, 2.4477e-05, 6.9475e-01]])
Player 1 Prediction: tensor([[ 0.4147,  0.9067, -1.6984,  0.3878]])
Player 0 Prediction: tensor([[0.0768, 0.9119, 0.0114, 0.0000]])
Player 1 Prediction: tensor([[-2.3630, -1.8097, -3.9533, -1.2207]])

📊 TEST RESULTS SUMMARY
Training step: 49999
Episodes completed: 10000/10000
Total steps: 50481
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5198/10000 (52.0%)
    Average reward: -0.275
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4802/10000 (48.0%)
    Average reward: +0.275
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7585 (29.9%)
    Action 1: 10607

In [4]:
# shared network but not shared buffer?
# 1 vs 2 minibatches

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP run on Leduc Hold'em.
# Keys without a prefix presumably configure the RL side (they size the
# "RL buffer" reported in the training log); keys prefixed with "sl_"
# presumably configure the SL side (the "SL buffer" in the log).
config_dict = {
    # --- run-level settings ---
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    # --- RL optimisation ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Integer capacity: the previous float literal 2e5 was stored by the
    # config as 200000.0, unlike every other (integer) buffer size here.
    "replay_buffer_size": 200_000,
    "transfer_interval": 300,
    # --- RL network architecture ---
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    # --- epsilon-greedy exploration ---
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- SL optimisation ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    # --- SL network architecture ---
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- prioritised replay / n-step / distributional settings ---
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": True,
    # --- gradient clipping ---
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Set on the constructed config object rather than through config_dict; the
# constructor's own echo still reports the default (False) for this field.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x1037a0130>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [5]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Leduc Hold'em (PettingZoo) is the active game for this run; the
# matching-pennies constructor is kept below as the commented-out alternative.
env = leduc_holdem_v4.env()
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Show what the first player observes before building the agent.
player_0_observation_space = env.observation_space("player_0")
print(player_0_observation_space)

# Build the NFSP agent on CPU from the environment and the config object
# created in the previous cell.
agent = NFSPDQN(
    env,
    config,
    name="NFSP-LeducHoldem-Dueling",
    device="cpu",
)

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [6]:
# Evaluation cadence for this run. In the recorded log a test pass appears
# every 2000 training steps and each pass completes 10000 episodes, which
# matches these two settings (semantics live in NFSPDQN; confirm there).
CHECKPOINT_INTERVAL = 2000
CHECKPOINT_TRIALS = 10000

agent.checkpoint_interval = CHECKPOINT_INTERVAL
agent.checkpoint_trials = CHECKPOINT_TRIALS

# Long-running cell: the recorded 50000-step run took roughly 48 minutes.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/50000 [00:00<17:26, 47.77it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 61/200000
   Player 0 SL buffer: 13/2000000
   Player 1 RL buffer: 66/200000
   Player 1 SL buffer: 6/2000000


  2%|▏         | 1006/50000 [00:33<30:00, 27.22it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 63985/200000
   Player 0 SL buffer: 7262/2000000
   Player 1 RL buffer: 64141/200000
   Player 1 SL buffer: 7441/2000000


  4%|▍         | 1998/50000 [01:12<30:47, 25.98it/s]

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 127781/200000
   Player 0 SL buffer: 14123/2000000
   Player 1 RL buffer: 128345/200000
   Player 1 SL buffer: 14479/2000000
P1 SL Buffer Size:  14123
P1 SL buffer distribution [4247. 7418.  719. 1739.]
P1 actions distribution [0.30071515 0.52524251 0.05090986 0.12313248]
P2 SL Buffer Size:  14479
P2 SL buffer distribution [4405. 7647.  853. 1574.]
P2 actions distribution [0.30423372 0.52814421 0.05891291 0.10870916]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.3075, -0.0023, -0.4798,  0.3222]])
Player 0 Prediction: tensor([[0.0000, 0.7841, 0.0405, 0.1754]])


  4%|▍         | 1998/50000 [01:24<30:47, 25.98it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 56808
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5087/10000 (50.9%)
    Average reward: -1.136
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4913/10000 (49.1%)
    Average reward: +1.136
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10126 (35.4%)
    Action 1: 15295 (53.4%)
    Action 2: 1527 (5.3%)
    Action 3: 1676 (5.9%)
  Player 1:
    Action 0: 7907 (28.1%)
    Action 1: 14750 (52.3%)
    Action 2: 2273 (8.1%)
    Action 3: 3254 (11.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-11356.5, 11356.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.013 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.003 (max=1.0 for random)
    → Playing n

  6%|▌         | 3005/50000 [02:19<29:41, 26.39it/s]   

   Player 0 ε: 0.0011 → 0.0011

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 191680/200000
   Player 0 SL buffer: 20605/2000000
   Player 1 RL buffer: 192446/200000
   Player 1 SL buffer: 21142/2000000


  8%|▊         | 4000/50000 [02:55<33:02, 23.21it/s]

   Player 0 ε: 0.0009 → 0.0009

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 27171/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 27477/2000000
P1 SL Buffer Size:  27171
P1 SL buffer distribution [ 7922. 14010.  1890.  3349.]
P1 actions distribution [0.29156086 0.51562327 0.06955946 0.12325641]
P2 SL Buffer Size:  27477
P2 SL buffer distribution [ 8907. 13605.  2204.  2761.]
P2 actions distribution [0.32416203 0.49514139 0.08021254 0.10048404]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 3.3656,  3.7764, -0.2920,  2.8262]])
Player 0 Prediction: tensor([[0.0275, 0.9647, 0.0078, 0.0000]])
Player 1 Prediction: tensor([[ 3.4237,  3.9113, -1.9008,  2.9533]])
Player 0 Prediction: tensor([[0.0000, 0.7853, 0.0512, 0.1635]])
Player 1 Prediction: tensor([[ 1.2736,  1.9811, -2.7428,  1.4753]])
Player 0 Prediction: tensor([[0.9700, 0.0000

  8%|▊         | 4000/50000 [03:14<33:02, 23.21it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 49225
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5371/10000 (53.7%)
    Average reward: -0.165
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4629/10000 (46.3%)
    Average reward: +0.165
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2862 (12.4%)
    Action 1: 16784 (72.5%)
    Action 2: 1332 (5.8%)
    Action 3: 2169 (9.4%)
  Player 1:
    Action 0: 21758 (83.4%)
    Action 1: 4320 (16.6%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1651.0, 1651.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.709 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.648 (max=1.0 for random)
    → Strongly prefers Heads
  Average st

 10%|█         | 5003/50000 [04:03<24:37, 30.46it/s]   

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 33410/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 34078/2000000


 12%|█▏        | 5997/50000 [04:36<24:03, 30.48it/s]

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 39818/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40906/2000000
P1 SL Buffer Size:  39818
P1 SL buffer distribution [11506. 20107.  2948.  5257.]
P1 actions distribution [0.28896479 0.50497263 0.07403687 0.13202572]
P2 SL Buffer Size:  40906
P2 SL buffer distribution [13366. 19829.  3399.  4312.]
P2 actions distribution [0.32674913 0.48474551 0.08309294 0.10541241]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.3417,  2.8479, -0.6317,  2.1675]])
Player 0 Prediction: tensor([[0.0179, 0.9725, 0.0096, 0.0000]])
Player 1 Prediction: tensor([[ 2.2373,  2.9142, -2.1355,  2.1613]])
Player 0 Prediction: tensor([[0.0000, 0.4424, 0.0618, 0.4958]])

📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 53515
Average episode l

 12%|█▏        | 5997/50000 [04:55<24:03, 30.48it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 49234
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5320/10000 (53.2%)
    Average reward: -0.138
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4680/10000 (46.8%)
    Average reward: +0.138
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2603 (11.2%)
    Action 1: 16727 (72.0%)
    Action 2: 1588 (6.8%)
    Action 3: 2306 (9.9%)
  Player 1:
    Action 0: 21696 (83.4%)
    Action 1: 4314 (16.6%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1384.5, 1384.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.695 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.648 (max=1.0 for random)
    → Strongly prefers Heads
  Average st

 14%|█▍        | 7003/50000 [05:41<23:21, 30.69it/s]   

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 46129/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 47599/2000000


 16%|█▌        | 7999/50000 [06:14<22:50, 30.65it/s]

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 52596/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 54174/2000000
P1 SL Buffer Size:  52596
P1 SL buffer distribution [15140. 26241.  4044.  7171.]
P1 actions distribution [0.28785459 0.49891627 0.07688798 0.13634117]
P2 SL Buffer Size:  54174
P2 SL buffer distribution [17642. 26515.  4446.  5571.]
P2 actions distribution [0.32565437 0.48944143 0.08206889 0.10283531]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.0956,  2.7956, -0.6683,  2.6024]])
Player 0 Prediction: tensor([[0.0380, 0.9413, 0.0207, 0.0000]])
Player 1 Prediction: tensor([[ 1.9828,  2.7846, -2.1717,  2.1021]])
Player 0 Prediction: tensor([[0.0000, 0.1403, 0.0638, 0.7959]])
Player 1 Prediction: tensor([[ 0.3440,  1.5855, -2.8761,  0.8359]])
Player 0 Prediction: tensor([[0.0996, 0.1376

 16%|█▌        | 7999/50000 [06:25<22:50, 30.65it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 53755
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4709/10000 (47.1%)
    Average reward: -1.015
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5291/10000 (52.9%)
    Average reward: +1.015
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5888 (21.8%)
    Action 1: 14999 (55.7%)
    Action 2: 3079 (11.4%)
    Action 3: 2985 (11.1%)
  Player 1:
    Action 0: 9870 (36.8%)
    Action 1: 12458 (46.5%)
    Action 2: 2594 (9.7%)
    Action 3: 1882 (7.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-10147.5, 10147.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.950 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.044 (max=1.0 for random)
    → Playing n

 18%|█▊        | 9004/50000 [07:19<22:27, 30.42it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 58683/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 60389/2000000


 20%|█▉        | 9998/50000 [07:52<22:29, 29.64it/s]

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 64709/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 66555/2000000
P1 SL Buffer Size:  64709
P1 SL buffer distribution [19524. 31100.  5214.  8871.]
P1 actions distribution [0.30172001 0.48061321 0.08057612 0.13709067]
P2 SL Buffer Size:  66555
P2 SL buffer distribution [21712. 32556.  5511.  6776.]
P2 actions distribution [0.32622643 0.48915934 0.0828037  0.10181053]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.9105, -0.5653, -0.5459,  1.5045]])
Player 0 Prediction: tensor([[0.0000, 0.7673, 0.0114, 0.2213]])
Player 1 Prediction: tensor([[-0.2932, -0.5084, -0.9638,  0.7269]])
Player 0 Prediction: tensor([[0.0000, 0.8590, 0.0377, 0.1033]])
Player 1 Prediction: tensor([[-3.3951, -4.8629, -1.8539, -2.9551]])


 20%|█▉        | 9998/50000 [08:05<22:29, 29.64it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 52419
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4996/10000 (50.0%)
    Average reward: -0.814
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5004/10000 (50.0%)
    Average reward: +0.814
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5910 (22.7%)
    Action 1: 14196 (54.5%)
    Action 2: 2603 (10.0%)
    Action 3: 3347 (12.8%)
  Player 1:
    Action 0: 9616 (36.5%)
    Action 1: 11401 (43.2%)
    Action 2: 2530 (9.6%)
    Action 3: 2816 (10.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-8143.5, 8143.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.963 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.054 (max=1.0 for random)
    → Playing n

 22%|██▏       | 11006/50000 [08:58<21:23, 30.37it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 70789/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 72572/2000000


 24%|██▍       | 11999/50000 [09:31<21:33, 29.38it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 76892/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 78800/2000000
P1 SL Buffer Size:  76892
P1 SL buffer distribution [24373. 35376.  6435. 10708.]
P1 actions distribution [0.31697706 0.46007387 0.08368881 0.13926026]
P2 SL Buffer Size:  78800
P2 SL buffer distribution [25657. 38400.  6448.  8295.]
P2 actions distribution [0.32559645 0.48730964 0.08182741 0.1052665 ]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[2.2047e-01, 7.7897e-01, 5.6152e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.4173, -0.2833, -0.9900,  0.3369]])
Player 0 Prediction: tensor([[9.9917e-01, 0.0000e+00, 8.3414e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-2.0388, -2.7416, -2.9573, -2.6345]])
Player 0 Prediction: tensor([[0.0000, 0.7152, 0.0024, 0.2825]])
Player 1 Predi

 24%|██▍       | 11999/50000 [09:45<21:33, 29.38it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 56852
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5096/10000 (51.0%)
    Average reward: -0.558
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4904/10000 (49.0%)
    Average reward: +0.558
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8918 (30.8%)
    Action 1: 12895 (44.5%)
    Action 2: 3068 (10.6%)
    Action 3: 4078 (14.1%)
  Player 1:
    Action 0: 7477 (26.8%)
    Action 1: 15409 (55.2%)
    Action 2: 1857 (6.7%)
    Action 3: 3150 (11.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5577.5, 5577.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.982 (max=1.0 for random)
    → Playing n

 26%|██▌       | 13005/50000 [10:37<21:41, 28.42it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 82770/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 85185/2000000


 28%|██▊       | 13999/50000 [11:12<30:32, 19.64it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 88926/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 91482/2000000
P1 SL Buffer Size:  88926
P1 SL buffer distribution [29295. 39601.  7492. 12538.]
P1 actions distribution [0.32943121 0.44532533 0.08424983 0.14099364]
P2 SL Buffer Size:  91482
P2 SL buffer distribution [29775. 44551.  7568.  9588.]
P2 actions distribution [0.32547386 0.48699198 0.08272666 0.1048075 ]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.9660,  2.3294, -0.9484,  2.0236]])
Player 0 Prediction: tensor([[0.3612, 0.6246, 0.0142, 0.0000]])
Player 1 Prediction: tensor([[ 1.7948,  2.1149, -2.1794,  1.6464]])
Player 0 Prediction: tensor([[0.0000, 0.6803, 0.0480, 0.2717]])
Player 1 Prediction: tensor([[-0.4352, -0.2842, -2.8693,  0.3548]])
Player 0 Prediction: tensor([[0.9695, 0.00

 28%|██▊       | 13999/50000 [11:25<30:32, 19.64it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 52729
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5355/10000 (53.5%)
    Average reward: -0.709
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4645/10000 (46.5%)
    Average reward: +0.709
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7292 (27.7%)
    Action 1: 12986 (49.4%)
    Action 2: 2363 (9.0%)
    Action 3: 3649 (13.9%)
  Player 1:
    Action 0: 9065 (34.3%)
    Action 1: 11912 (45.1%)
    Action 2: 2721 (10.3%)
    Action 3: 2741 (10.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7090.0, 7090.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.016 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.048 (max=1.0 for random)
    → Playing n

 30%|███       | 15003/50000 [12:25<22:49, 25.55it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 95215/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 97814/2000000


 32%|███▏      | 15999/50000 [13:04<21:39, 26.16it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 101539/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 104042/2000000
P1 SL Buffer Size:  101539
P1 SL buffer distribution [34026. 44362.  8580. 14571.]
P1 actions distribution [0.33510277 0.43689617 0.08449955 0.14350151]
P2 SL Buffer Size:  104042
P2 SL buffer distribution [34159. 50016.  8841. 11026.]
P2 actions distribution [0.32831933 0.48072894 0.0849753  0.10597643]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3564, 0.6417, 0.0019, 0.0000]])
Player 1 Prediction: tensor([[-0.2120, -1.3182, -0.8646,  0.5995]])
Player 0 Prediction: tensor([[0.0000, 0.8867, 0.0192, 0.0941]])
Player 1 Prediction: tensor([[-4.3428, -5.6993, -1.9690, -4.1966]])


 32%|███▏      | 15999/50000 [13:15<21:39, 26.16it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 51326
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5380/10000 (53.8%)
    Average reward: -0.667
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4620/10000 (46.2%)
    Average reward: +0.667
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7198 (28.2%)
    Action 1: 12092 (47.4%)
    Action 2: 2196 (8.6%)
    Action 3: 4010 (15.7%)
  Player 1:
    Action 0: 8359 (32.4%)
    Action 1: 11024 (42.7%)
    Action 2: 2407 (9.3%)
    Action 3: 4040 (15.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6668.5, 6668.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.026 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.051 (max=1.0 for random)
    → Playing ne

 34%|███▍      | 17003/50000 [14:10<18:49, 29.22it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 107879/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 110113/2000000


 36%|███▌      | 17999/50000 [14:47<21:18, 25.03it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 114026/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 116595/2000000
P1 SL Buffer Size:  114026
P1 SL buffer distribution [38801. 48807.  9797. 16621.]
P1 actions distribution [0.34028204 0.42803396 0.085919   0.145765  ]
P2 SL Buffer Size:  116595
P2 SL buffer distribution [38204. 55443.  9874. 13074.]
P2 actions distribution [0.32766414 0.47551782 0.08468631 0.11213174]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.4569,  2.4981, -0.5468,  3.0162]])
Player 0 Prediction: tensor([[0.1797, 0.8180, 0.0023, 0.0000]])
Player 1 Prediction: tensor([[ 2.1378,  2.1885, -2.0082,  2.7019]])
Player 0 Prediction: tensor([[0.0000, 0.5591, 0.0035, 0.4374]])
Player 1 Prediction: tensor([[ 0.2963, -1.5062, -3.1928,  0.6306]])

📊 TEST RESULTS SUMMARY
Training step:

 36%|███▌      | 17999/50000 [15:06<21:18, 25.03it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 50522
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5364/10000 (53.6%)
    Average reward: +0.009
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4636/10000 (46.4%)
    Average reward: -0.009
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4382 (18.3%)
    Action 1: 13853 (58.0%)
    Action 2: 1769 (7.4%)
    Action 3: 3901 (16.3%)
  Player 1:
    Action 0: 18928 (71.1%)
    Action 1: 7689 (28.9%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [89.0, -89.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.905 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.867 (max=1.0 for random)
    → Mixed strategy
  Aver

 38%|███▊      | 19003/50000 [15:55<19:48, 26.08it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 120171/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 123000/2000000


 40%|███▉      | 19999/50000 [16:34<19:55, 25.10it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 126310/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 129394/2000000
P1 SL Buffer Size:  126310
P1 SL buffer distribution [43679. 53168. 10995. 18468.]
P1 actions distribution [0.34580793 0.42093263 0.08704774 0.1462117 ]
P2 SL Buffer Size:  129394
P2 SL buffer distribution [42487. 60855. 10997. 15055.]
P2 actions distribution [0.32835371 0.47030774 0.08498848 0.11635006]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.7363,  0.5437, -0.8813,  1.1995]])
Player 0 Prediction: tensor([[0.0000, 0.1441, 0.0320, 0.8240]])
Player 1 Prediction: tensor([[ 0.0151,  0.1999, -1.3014,  0.9210]])
Player 0 Prediction: tensor([[0.9800, 0.0000, 0.0200, 0.0000]])
Player 1 Prediction: tensor([[-0.3383,  0.3828, -2.7269,  0.5760]])
Player 0 Prediction: tensor([[0.0000, 

 40%|███▉      | 19999/50000 [16:46<19:55, 25.10it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 51354
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5435/10000 (54.4%)
    Average reward: -0.592
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4565/10000 (45.6%)
    Average reward: +0.592
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7764 (30.2%)
    Action 1: 10769 (41.9%)
    Action 2: 2012 (7.8%)
    Action 3: 5175 (20.1%)
  Player 1:
    Action 0: 8474 (33.1%)
    Action 1: 9992 (39.0%)
    Action 2: 2115 (8.3%)
    Action 3: 5053 (19.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5924.5, 5924.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.048 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.058 (max=1.0 for random)
    → Playing nea

 42%|████▏     | 21005/50000 [17:42<16:36, 29.09it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 132996/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 135658/2000000


 44%|████▍     | 21998/50000 [18:20<19:01, 24.54it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 139273/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 142078/2000000
P1 SL Buffer Size:  139273
P1 SL buffer distribution [48960. 57343. 12246. 20724.]
P1 actions distribution [0.35153978 0.41173092 0.08792803 0.14880128]
P2 SL Buffer Size:  142078
P2 SL buffer distribution [46567. 66103. 12089. 17319.]
P2 actions distribution [0.32775658 0.46525852 0.08508706 0.12189783]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.2224,  1.1267, -0.7008,  1.1610]])
Player 0 Prediction: tensor([[0.0000e+00, 8.0307e-01, 4.8160e-04, 1.9645e-01]])
Player 1 Prediction: tensor([[ 0.7384,  0.7237, -0.9970,  1.3240]])
Player 0 Prediction: tensor([[0.0000, 0.9888, 0.0018, 0.0094]])
Player 1 Prediction: tensor([[-4.1178, -3.6905, -2.0890, -1.9676]])


 44%|████▍     | 21998/50000 [18:36<19:01, 24.54it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 52306
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5583/10000 (55.8%)
    Average reward: -0.652
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4417/10000 (44.2%)
    Average reward: +0.652
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7857 (29.5%)
    Action 1: 11622 (43.7%)
    Action 2: 2057 (7.7%)
    Action 3: 5056 (19.0%)
  Player 1:
    Action 0: 9467 (36.8%)
    Action 1: 9654 (37.5%)
    Action 2: 2358 (9.2%)
    Action 3: 4235 (16.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6523.5, 6523.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.042 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nea

 46%|████▌     | 23003/50000 [19:28<17:01, 26.42it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 145445/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 148451/2000000


 48%|████▊     | 23999/50000 [20:08<17:35, 24.63it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 151685/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 154633/2000000
P1 SL Buffer Size:  151685
P1 SL buffer distribution [54138. 61329. 13518. 22700.]
P1 actions distribution [0.3569107  0.40431816 0.0891189  0.14965224]
P2 SL Buffer Size:  154633
P2 SL buffer distribution [50868. 70864. 13218. 19683.]
P2 actions distribution [0.32895954 0.45827217 0.08547981 0.12728848]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.5384,  2.4208, -0.5284,  2.4960]])
Player 0 Prediction: tensor([[0.0000e+00, 8.1779e-01, 4.3623e-04, 1.8178e-01]])
Player 1 Prediction: tensor([[-0.1901,  0.4463, -1.3249,  0.2941]])
Player 0 Prediction: tensor([[6.3819e-02, 9.3555e-01, 6.3097e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-1.2967, -1.6362, -3.5983,  0.3722]])

📊 TES

 48%|████▊     | 23999/50000 [20:27<17:35, 24.63it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 50973
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5305/10000 (53.0%)
    Average reward: +0.084
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4695/10000 (46.9%)
    Average reward: -0.084
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5159 (21.2%)
    Action 1: 13040 (53.5%)
    Action 2: 2100 (8.6%)
    Action 3: 4073 (16.7%)
  Player 1:
    Action 0: 18006 (67.7%)
    Action 1: 8595 (32.3%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [844.0, -844.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.957 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.908 (max=1.0 for random)
    → Playing nearly rand

 50%|█████     | 25003/50000 [21:17<15:47, 26.38it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 157903/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 161093/2000000


 52%|█████▏    | 26000/50000 [22:00<16:54, 23.66it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 164339/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 167278/2000000
P1 SL Buffer Size:  164339
P1 SL buffer distribution [59367. 65282. 14740. 24950.]
P1 actions distribution [0.36124718 0.39723985 0.08969265 0.15182032]
P2 SL Buffer Size:  167278
P2 SL buffer distribution [55338. 75370. 14423. 22147.]
P2 actions distribution [0.33081457 0.45056732 0.08622174 0.13239637]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.1983,  0.9842, -0.6681,  1.3891]])
Player 0 Prediction: tensor([[0.0000, 0.3504, 0.0056, 0.6440]])
Player 1 Prediction: tensor([[ 0.4601,  0.1338, -0.8060,  0.8198]])
Player 0 Prediction: tensor([[0.0000, 0.1405, 0.0016, 0.8579]])

📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 48506
Average ep

 52%|█████▏    | 26000/50000 [22:19<16:54, 23.66it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 50895
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5348/10000 (53.5%)
    Average reward: +0.178
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4652/10000 (46.5%)
    Average reward: -0.178
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5420 (22.2%)
    Action 1: 12790 (52.5%)
    Action 2: 2180 (8.9%)
    Action 3: 3988 (16.4%)
  Player 1:
    Action 0: 17786 (67.1%)
    Action 1: 8731 (32.9%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1782.0, -1782.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.971 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.914 (max=1.0 for random)
    → Playing nearly ra

 54%|█████▍    | 27003/50000 [23:09<15:16, 25.09it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 170612/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 173462/2000000


 56%|█████▌    | 27999/50000 [23:51<14:51, 24.69it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 176837/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 180015/2000000
P1 SL Buffer Size:  176837
P1 SL buffer distribution [64838. 69136. 15947. 26916.]
P1 actions distribution [0.36665404 0.39095891 0.09017909 0.15220797]
P2 SL Buffer Size:  180015
P2 SL buffer distribution [59601. 80313. 15719. 24382.]
P2 actions distribution [0.33108908 0.44614615 0.0873205  0.13544427]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.7807,  2.2795, -0.7669,  1.9964]])
Player 0 Prediction: tensor([[6.1455e-02, 9.3852e-01, 2.4197e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.2042,  1.6254, -2.0545,  2.6628]])
Player 0 Prediction: tensor([[0.0000e+00, 9.7901e-01, 3.2333e-04, 2.0666e-02]])
Player 1 Prediction: tensor([[ 0.7382,  0.7720, -2.7645,  1.6331]])
Player

 56%|█████▌    | 27999/50000 [24:10<14:51, 24.69it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 50866
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5247/10000 (52.5%)
    Average reward: +0.190
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4753/10000 (47.5%)
    Average reward: -0.190
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5520 (22.6%)
    Action 1: 12505 (51.1%)
    Action 2: 2425 (9.9%)
    Action 3: 4018 (16.4%)
  Player 1:
    Action 0: 17557 (66.5%)
    Action 1: 8841 (33.5%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1904.5, -1904.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.980 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.920 (max=1.0 for random)
    → Playing nearly ra

 58%|█████▊    | 29006/50000 [25:00<12:38, 27.69it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 183273/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 186197/2000000


 60%|█████▉    | 29999/50000 [25:41<14:07, 23.61it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 189756/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 192611/2000000
P1 SL Buffer Size:  189756
P1 SL buffer distribution [70464. 73165. 17171. 28956.]
P1 actions distribution [0.37134004 0.38557411 0.09048989 0.15259597]
P2 SL Buffer Size:  192611
P2 SL buffer distribution [63817. 85363. 17015. 26416.]
P2 actions distribution [0.33132583 0.4431886  0.08833867 0.13714689]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.7682,  0.9931, -1.0171,  0.5568]])
Player 0 Prediction: tensor([[0.7285, 0.2670, 0.0045, 0.0000]])
Player 1 Prediction: tensor([[ 0.0104, -0.2811, -1.4862,  0.5077]])
Player 0 Prediction: tensor([[0.0000, 0.1083, 0.0242, 0.8675]])
Player 1 Prediction: tensor([[-0.0588,  0.0315, -2.5678,  0.4878]])

📊 TEST RESULTS SUMMARY
Training step:

 60%|█████▉    | 29999/50000 [26:00<14:07, 23.61it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 50984
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5282/10000 (52.8%)
    Average reward: +0.289
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4718/10000 (47.2%)
    Average reward: -0.289
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5779 (23.5%)
    Action 1: 12190 (49.5%)
    Action 2: 2543 (10.3%)
    Action 3: 4127 (16.7%)
  Player 1:
    Action 0: 17233 (65.4%)
    Action 1: 9112 (34.6%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [2889.5, -2889.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.993 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.930 (max=1.0 for random)
    → Playing nearly r

 62%|██████▏   | 31003/50000 [26:51<12:45, 24.81it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 196058/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 199180/2000000


 64%|██████▍   | 31999/50000 [27:33<12:35, 23.83it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 202359/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 205578/2000000
P1 SL Buffer Size:  202359
P1 SL buffer distribution [76007. 77044. 18293. 31015.]
P1 actions distribution [0.37560474 0.3807293  0.09039875 0.15326721]
P2 SL Buffer Size:  205578
P2 SL buffer distribution [68268. 90411. 18299. 28600.]
P2 actions distribution [0.33207834 0.43978928 0.08901244 0.13911994]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.5465,  0.8816, -1.0451,  0.5784]])
Player 0 Prediction: tensor([[0.7293, 0.2667, 0.0040, 0.0000]])
Player 1 Prediction: tensor([[ 0.4700,  1.7493, -2.2605,  1.2679]])
Player 0 Prediction: tensor([[0.0147, 0.0370, 0.9483, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 53043
Average ep

 64%|██████▍   | 31999/50000 [27:51<12:35, 23.83it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 50931
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5217/10000 (52.2%)
    Average reward: +0.331
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4783/10000 (47.8%)
    Average reward: -0.331
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5897 (23.9%)
    Action 1: 11963 (48.6%)
    Action 2: 2705 (11.0%)
    Action 3: 4075 (16.5%)
  Player 1:
    Action 0: 17040 (64.8%)
    Action 1: 9251 (35.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3312.0, -3312.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.000 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.936 (max=1.0 for random)
    → Playing nearly r

 66%|██████▌   | 33003/50000 [28:45<11:36, 24.40it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 208945/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 211752/2000000


 68%|██████▊   | 34000/50000 [29:27<11:18, 23.59it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 215509/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 218157/2000000
P1 SL Buffer Size:  215509
P1 SL buffer distribution [81583. 81674. 19228. 33024.]
P1 actions distribution [0.3785596  0.37898185 0.08922133 0.15323722]
P2 SL Buffer Size:  218157
P2 SL buffer distribution [72451. 95413. 19661. 30632.]
P2 actions distribution [0.33210486 0.43735933 0.09012317 0.14041264]
   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9776, 0.0210, 0.0014, 0.0000]])
Player 1 Prediction: tensor([[ 0.9528,  2.1714, -1.1542,  1.4840]])
Player 0 Prediction: tensor([[0.7529, 0.2437, 0.0034, 0.0000]])
Player 1 Prediction: tensor([[-0.1450,  1.2263, -1.8644,  0.9222]])
Player 0 Prediction: tensor([[0.0233, 0.8539, 0.1227, 0.0000]])
Player 1 Prediction: tensor([[-3.8371, -0.

 68%|██████▊   | 34000/50000 [29:42<11:18, 23.59it/s]


📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 50437
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5741/10000 (57.4%)
    Average reward: -0.339
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4259/10000 (42.6%)
    Average reward: +0.339
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7827 (30.9%)
    Action 1: 10424 (41.1%)
    Action 2: 2171 (8.6%)
    Action 3: 4914 (19.4%)
  Player 1:
    Action 0: 8944 (35.6%)
    Action 1: 9218 (36.7%)
    Action 2: 2740 (10.9%)
    Action 3: 4199 (16.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3388.0, 3388.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.051 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing ne

 70%|███████   | 35003/50000 [30:38<09:55, 25.18it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 221903/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 224677/2000000


 72%|███████▏  | 35999/50000 [31:20<09:55, 23.51it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 228329/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 231214/2000000
P1 SL Buffer Size:  228329
P1 SL buffer distribution [87048. 86143. 20099. 35039.]
P1 actions distribution [0.38123935 0.37727577 0.08802649 0.15345839]
P2 SL Buffer Size:  231214
P2 SL buffer distribution [ 76986. 100411.  20927.  32890.]
P2 actions distribution [0.33296427 0.43427734 0.09050923 0.14224917]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9797, 0.0191, 0.0012, 0.0000]])
Player 1 Prediction: tensor([[ 0.7807,  1.9282, -1.3161,  1.1067]])
Player 0 Prediction: tensor([[0.7763, 0.2207, 0.0031, 0.0000]])
Player 1 Prediction: tensor([[ 1.7014,  1.5152, -2.1002,  2.0770]])
Player 0 Prediction: tensor([[0.0000, 0.0428, 0.0220, 0.9352]])
Player 1 Prediction: tensor([[ 0.1385,

 72%|███████▏  | 35999/50000 [31:32<09:55, 23.51it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 51097
Average episode length: 5.1 steps
Episode length range: 2 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5645/10000 (56.5%)
    Average reward: -0.246
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4355/10000 (43.5%)
    Average reward: +0.246
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9230 (36.0%)
    Action 1: 10236 (39.9%)
    Action 2: 2514 (9.8%)
    Action 3: 3668 (14.3%)
  Player 1:
    Action 0: 7400 (29.1%)
    Action 1: 12136 (47.7%)
    Action 2: 2392 (9.4%)
    Action 3: 3521 (13.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2465.0, 2465.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.059 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.028 (max=1.0 for random)
    → Playing ne

 74%|███████▍  | 37003/50000 [32:33<09:01, 24.02it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 234601/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 237496/2000000


 76%|███████▌  | 37999/50000 [33:16<08:30, 23.52it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 240975/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 243925/2000000
P1 SL Buffer Size:  240975
P1 SL buffer distribution [92457. 90466. 21000. 37052.]
P1 actions distribution [0.3836788  0.37541654 0.08714597 0.15375869]
P2 SL Buffer Size:  243925
P2 SL buffer distribution [ 80954. 105462.  22160.  35349.]
P2 actions distribution [0.3318807  0.43235421 0.0908476  0.1449175 ]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9808, 0.0182, 0.0010, 0.0000]])
Player 1 Prediction: tensor([[ 1.2786,  1.1739, -0.9079,  0.8461]])
Player 0 Prediction: tensor([[0.7889, 0.2081, 0.0031, 0.0000]])
Player 1 Prediction: tensor([[ 0.6794,  2.1547, -2.2265,  1.6711]])
Player 0 Prediction: tensor([[0.0155, 0.0360, 0.9484, 0.0000]])


 76%|███████▌  | 37999/50000 [33:32<08:30, 23.52it/s]


📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 51531
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5756/10000 (57.6%)
    Average reward: -0.180
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4244/10000 (42.4%)
    Average reward: +0.180
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9014 (35.0%)
    Action 1: 9809 (38.1%)
    Action 2: 2094 (8.1%)
    Action 3: 4802 (18.7%)
  Player 1:
    Action 0: 8011 (31.0%)
    Action 1: 10645 (41.2%)
    Action 2: 2456 (9.5%)
    Action 3: 4700 (18.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1802.5, 1802.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.051 (max=1.0 for random)
    → Playing nea

 78%|███████▊  | 39005/50000 [34:29<07:33, 24.23it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 247494/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 249934/2000000


 80%|███████▉  | 39999/50000 [35:12<09:16, 17.96it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 253907/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 256386/2000000
P1 SL Buffer Size:  253907
P1 SL buffer distribution [98065. 94806. 21900. 39136.]
P1 actions distribution [0.38622409 0.37338868 0.08625205 0.15413518]
P2 SL Buffer Size:  256386
P2 SL buffer distribution [ 84829. 110420.  23440.  37697.]
P2 actions distribution [0.3308644  0.43067874 0.09142465 0.14703221]
   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.9583,  2.2157, -0.7832,  2.1076]])
Player 0 Prediction: tensor([[0.7990, 0.1979, 0.0031, 0.0000]])
Player 1 Prediction: tensor([[-0.2314,  1.4309, -1.9316,  1.3015]])
Player 0 Prediction: tensor([[0.0077, 0.0296, 0.9627, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 50499
Averag

 80%|███████▉  | 39999/50000 [35:32<09:16, 17.96it/s]


📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 51287
Average episode length: 5.1 steps
Episode length range: 2 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5256/10000 (52.6%)
    Average reward: +0.451
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4744/10000 (47.4%)
    Average reward: -0.451
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6607 (26.4%)
    Action 1: 11688 (46.7%)
    Action 2: 2839 (11.4%)
    Action 3: 3877 (15.5%)
  Player 1:
    Action 0: 16626 (63.3%)
    Action 1: 9650 (36.7%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [4514.0, -4514.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.020 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.949 (max=1.0 for random)
    → Playing nearly r

 82%|████████▏ | 41003/50000 [36:25<06:17, 23.86it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 260443/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 262703/2000000


 84%|████████▍ | 41999/50000 [37:09<06:36, 20.19it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 266744/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 269172/2000000
P1 SL Buffer Size:  266744
P1 SL buffer distribution [103438.  99444.  22779.  41083.]
P1 actions distribution [0.38778004 0.37280689 0.08539649 0.15401659]
P2 SL Buffer Size:  269172
P2 SL buffer distribution [ 89307. 115126.  24831.  39908.]
P2 actions distribution [0.33178414 0.42770422 0.09224957 0.14826208]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.8396e-01, 1.5276e-02, 7.6693e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.1039, -0.2668, -1.2337,  0.5500]])
Player 0 Prediction: tensor([[0.0000, 0.0157, 0.0102, 0.9741]])
Player 1 Prediction: tensor([[-1.6651, -1.9627, -0.9541, -0.4802]])


 84%|████████▍ | 41999/50000 [37:22<06:36, 20.19it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 51527
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5711/10000 (57.1%)
    Average reward: -0.231
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4289/10000 (42.9%)
    Average reward: +0.231
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9168 (35.6%)
    Action 1: 9649 (37.4%)
    Action 2: 2097 (8.1%)
    Action 3: 4873 (18.9%)
  Player 1:
    Action 0: 7982 (31.0%)
    Action 1: 10696 (41.6%)
    Action 2: 2254 (8.8%)
    Action 3: 4808 (18.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2314.0, 2314.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.050 (max=1.0 for random)
    → Playing nea

 86%|████████▌ | 43005/50000 [38:22<04:54, 23.72it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 273016/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 275744/2000000


 88%|████████▊ | 43998/50000 [39:06<04:24, 22.69it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 279365/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 282273/2000000
P1 SL Buffer Size:  279365
P1 SL buffer distribution [108553. 104155.  23682.  42975.]
P1 actions distribution [0.38857051 0.37282766 0.08477082 0.15383101]
P2 SL Buffer Size:  282273
P2 SL buffer distribution [ 93472. 120385.  26257.  42159.]
P2 actions distribution [0.33114042 0.42648429 0.09301988 0.14935541]
   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.8479e-01, 1.4486e-02, 7.2250e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.1781,  2.1392, -1.3146,  1.8375]])
Player 0 Prediction: tensor([[0.9608, 0.0000, 0.0392, 0.0000]])
Player 1 Prediction: tensor([[ 3.1533,  5.4121, -3.3887,  3.6832]])
Player 0 Prediction: tensor([[0.0716, 0.0371, 0.8913, 0.0000]])


 88%|████████▊ | 43998/50000 [39:22<04:24, 22.69it/s]


📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 51370
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5691/10000 (56.9%)
    Average reward: -0.233
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4309/10000 (43.1%)
    Average reward: +0.233
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9186 (35.7%)
    Action 1: 9536 (37.1%)
    Action 2: 2127 (8.3%)
    Action 3: 4868 (18.9%)
  Player 1:
    Action 0: 7813 (30.5%)
    Action 1: 10738 (41.9%)
    Action 2: 2365 (9.2%)
    Action 3: 4737 (18.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2332.5, 2332.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.048 (max=1.0 for random)
    → Playing nea

 90%|█████████ | 45003/50000 [40:19<03:51, 21.61it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 285805/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 288474/2000000


 92%|█████████▏| 45998/50000 [41:06<03:06, 21.45it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 292224/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 294752/2000000
P1 SL Buffer Size:  292224
P1 SL buffer distribution [113856. 108543.  24551.  45274.]
P1 actions distribution [0.38961892 0.37143766 0.08401432 0.1549291 ]
P2 SL Buffer Size:  294752
P2 SL buffer distribution [ 97651. 124872.  27595.  44634.]
P2 actions distribution [0.33129885 0.42365107 0.09362108 0.151429  ]
   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.3065, -0.8368, -1.0779,  0.3165]])
Player 0 Prediction: tensor([[0.0000, 0.5272, 0.0037, 0.4691]])
Player 1 Prediction: tensor([[-1.1467, -1.8374, -0.8460, -0.5767]])
Player 0 Prediction: tensor([[0.0000, 0.0933, 0.0010, 0.9058]])


 92%|█████████▏| 45998/50000 [41:22<03:06, 21.45it/s]


📊 TEST RESULTS SUMMARY
Training step: 46000
Episodes completed: 10000/10000
Total steps: 50725
Average episode length: 5.1 steps
Episode length range: 2 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5907/10000 (59.1%)
    Average reward: -0.228
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4093/10000 (40.9%)
    Average reward: +0.228
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8035 (31.8%)
    Action 1: 10183 (40.3%)
    Action 2: 2092 (8.3%)
    Action 3: 4957 (19.6%)
  Player 1:
    Action 0: 9199 (36.1%)
    Action 1: 9543 (37.5%)
    Action 2: 2760 (10.8%)
    Action 3: 3956 (15.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2279.0, 2279.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.054 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing ne

 94%|█████████▍| 47003/50000 [42:26<02:33, 19.54it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 298467/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 300867/2000000


 96%|█████████▌| 47999/50000 [43:10<01:28, 22.71it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 305012/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 307409/2000000
P1 SL Buffer Size:  305012
P1 SL buffer distribution [119355. 112750.  25428.  47479.]
P1 actions distribution [0.39131247 0.36965759 0.08336721 0.15566273]
P2 SL Buffer Size:  307409
P2 SL buffer distribution [101980. 129554.  28875.  47000.]
P2 actions distribution [0.33174045 0.42143854 0.09393024 0.15289077]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.1714, -0.7160, -1.1162,  0.4215]])
Player 0 Prediction: tensor([[0.0000e+00, 9.2120e-01, 7.4000e-05, 7.8722e-02]])
Player 1 Prediction: tensor([[ 0.0461, -0.2541, -1.3720,  0.3866]])
Player 0 Prediction: tensor([[0.0000, 0.4799, 0.0006, 0.5195]])
Player 1 Prediction: tensor([[ 4.0380,  5.2459, -1.7208,  2.8642]])
Player 0 Predi

 96%|█████████▌| 47999/50000 [43:23<01:28, 22.71it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 51123
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5709/10000 (57.1%)
    Average reward: -0.299
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4291/10000 (42.9%)
    Average reward: +0.299
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9475 (37.0%)
    Action 1: 9961 (38.9%)
    Action 2: 2278 (8.9%)
    Action 3: 3890 (15.2%)
  Player 1:
    Action 0: 6744 (26.4%)
    Action 1: 11575 (45.4%)
    Action 2: 2594 (10.2%)
    Action 3: 4606 (18.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2989.5, 2989.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.025 (max=1.0 for random)
    → Playing ne

 98%|█████████▊| 49005/50000 [44:25<00:43, 22.71it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 311551/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 314036/2000000


100%|██████████| 50000/50000 [45:11<00:00, 18.44it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.7641,  1.8622, -0.9892,  1.7276]])
Player 0 Prediction: tensor([[0.8300, 0.1674, 0.0026, 0.0000]])
Player 1 Prediction: tensor([[ 2.5820,  3.0082, -2.0954,  3.3029]])
Player 0 Prediction: tensor([[0.0000, 0.1286, 0.0405, 0.8310]])

📊 TEST RESULTS SUMMARY
Training step: 49999
Episodes completed: 10000/10000
Total steps: 50192
Average episode length: 5.0 steps
Episode length range: 2 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5833/10000 (58.3%)
    Average reward: -0.288
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4167/10000 (41.7%)
    Average reward: +0.288
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7681 (31.0%)
    Action 1: 9954 (40.1%)
    Action 2: 2123 (8.6%)
    Action 3: 5050 (20.4%)
  Player 1:
    Action 0: 9769 (38.5%)
    Action 1: 8898 (35.1%)
    Action 2: 2187 (8.6%)
    Action 

In [7]:
# shared network but not shared buffer?
# 1 vs 2 minibatches

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP agent trained on Leduc Hold'em in the cells below.
# NOTE(review): un-prefixed keys appear to configure the RL (best-response)
# learner and "sl_"-prefixed keys the supervised (average-strategy) learner,
# judging by the "RL buffer" / "SL buffer" lines in the training log -- confirm
# against NFSPDQNConfig.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": KLDivergenceLoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Buffer capacities are counts, so keep them integers. The previous float
    # literal (2e5) was echoed by the config as 200000.0 and was inconsistent
    # with "sl_replay_buffer_size" below.
    "replay_buffer_size": 200_000,
    "transfer_interval": 300,
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    # NOTE(review): 51 atoms together with KLDivergenceLoss presumably selects a
    # categorical (distributional) value head -- confirm against the agent code.
    "atom_size": 51,
    "dueling": False,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Set after construction: the key is not in config_dict, so the config echo
# printed by the constructor reports the default (False) for this field.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.KLDivergenceLoss object at 0x36c05d0f0>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : Fals

In [8]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Create the PettingZoo Leduc Hold'em environment used for this run; the
# matching-pennies environment is kept below as a commented-out alternative.
env = leduc_holdem_v4.env()
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Print player_0's observation space as a quick sanity check of the env setup.
print(env.observation_space("player_0"))

# Instantiate the NFSP agent on this environment with the config built above,
# pinned to the CPU device.
# NOTE(review): `name` presumably labels this run's saved artifacts -- confirm
# in NFSPDQN.
agent = NFSPDQN(env, config, name="NFSP-LeducHoldem-Categorical", device="cpu")

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [9]:
# Checkpoint settings are assigned directly on the agent before training.
# NOTE(review): judging by the saved output, a test report is produced every
# 2000 training steps and each report covers 10000 episodes, which matches
# these two values -- confirm the exact semantics in NFSPDQN.
agent.checkpoint_interval = 2000
agent.checkpoint_trials = 10000
# Start the training loop (long-running; progress and periodic test reports
# are printed to the cell output).
# NOTE(review): the saved test reports show "strategy entropy" values above
# the stated "max=1.0 for random" (e.g. 1.012); the entropy normalization in
# the reporting code is worth checking.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/50000 [00:00<17:32, 47.50it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 63/200000
   Player 0 SL buffer: 8/2000000
   Player 1 RL buffer: 65/200000
   Player 1 SL buffer: 2/2000000


  2%|▏         | 1007/50000 [00:29<25:22, 32.18it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 62961/200000
   Player 0 SL buffer: 7194/2000000
   Player 1 RL buffer: 65166/200000
   Player 1 SL buffer: 7573/2000000


  4%|▍         | 1998/50000 [01:05<30:31, 26.21it/s]  

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 126444/200000
   Player 0 SL buffer: 13800/2000000
   Player 1 RL buffer: 129684/200000
   Player 1 SL buffer: 14576/2000000
P1 SL Buffer Size:  13800
P1 SL buffer distribution [4937. 7710.   17. 1136.]
P1 actions distribution [0.35775362 0.55869565 0.00123188 0.08231884]
P2 SL Buffer Size:  14576
P2 SL buffer distribution [5751. 7201.   96. 1528.]
P2 actions distribution [0.39455269 0.49403128 0.00658617 0.10482986]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.0595e-03, 2.2003e-03, 2.6557e-03, 2.7594e-03, 3.7055e-03,
          3.1820e-03, 2.8264e-03, 8.3665e-03, 1.1789e-02, 4.4661e-03,
          2.9042e-02, 1.3249e-02, 6.9857e-03, 9.1324e-03, 3.9682e-03,
          1.7467e-02, 7.2669e-03, 3.2454e-02, 4.2645e-02, 5.7101e-03,
          2.2708e-02, 3.7740e-03, 4.1712e-03, 5.3284e-03, 2.6989e-03,
          1.0

  4%|▍         | 1998/50000 [01:17<30:31, 26.21it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 63807
Average episode length: 6.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5621/10000 (56.2%)
    Average reward: -0.310
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4379/10000 (43.8%)
    Average reward: +0.310
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 12937 (39.6%)
    Action 1: 17470 (53.4%)
    Action 2: 759 (2.3%)
    Action 3: 1530 (4.7%)
  Player 1:
    Action 0: 13057 (42.0%)
    Action 1: 13514 (43.4%)
    Action 2: 0 (0.0%)
    Action 3: 4540 (14.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3104.5, 3104.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.012 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.048 (max=1.0 for random)
    → Playing nearly

  6%|▌         | 3005/50000 [02:18<29:41, 26.38it/s]   

   Player 0 ε: 0.0011 → 0.0011

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 190179/200000
   Player 0 SL buffer: 19743/2000000
   Player 1 RL buffer: 193948/200000
   Player 1 SL buffer: 21075/2000000


  8%|▊         | 3998/50000 [02:56<29:15, 26.21it/s]

   Player 0 ε: 0.0009 → 0.0009

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 25934/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 27666/2000000
P1 SL Buffer Size:  25934
P1 SL buffer distribution [ 9677. 12969.    24.  3264.]
P1 actions distribution [0.37313951 0.50007712 0.00092543 0.12585795]
P2 SL Buffer Size:  27666
P2 SL buffer distribution [10482. 13017.   101.  4066.]
P2 actions distribution [0.3788766  0.47050531 0.00365069 0.1469674 ]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[7.3234e-04, 1.4678e-03, 1.7172e-03, 1.9467e-03, 2.1477e-03,
          2.2074e-03, 1.9232e-03, 8.3176e-03, 1.4013e-02, 9.1333e-03,
          4.1602e-03, 4.3877e-03, 1.0823e-02, 1.6581e-02, 4.5644e-03,
          6.1908e-03, 3.5408e-03, 4.9269e-02, 6.9916e-02, 1.0313e-02,
          2.0159e-02, 4.4234e-03, 8.2399e-03, 1.1958e-02, 2.4005e-03,
     

  8%|▊         | 3998/50000 [03:07<29:15, 26.21it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 63159
Average episode length: 6.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5817/10000 (58.2%)
    Average reward: -0.321
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4183/10000 (41.8%)
    Average reward: +0.321
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 13629 (43.0%)
    Action 1: 15403 (48.6%)
    Action 2: 278 (0.9%)
    Action 3: 2356 (7.4%)
  Player 1:
    Action 0: 11344 (36.0%)
    Action 1: 15685 (49.8%)
    Action 2: 0 (0.0%)
    Action 3: 4464 (14.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3213.0, 3213.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.029 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.031 (max=1.0 for random)
    → Playing nearly

 10%|█         | 5005/50000 [04:11<28:19, 26.47it/s]   

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 32134/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 34209/2000000


 12%|█▏        | 5999/50000 [04:49<28:19, 25.89it/s]

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 38581/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40899/2000000
P1 SL Buffer Size:  38581
P1 SL buffer distribution [15220. 18738.    52.  4571.]
P1 actions distribution [0.3944947  0.48567948 0.00134781 0.11847801]
P2 SL Buffer Size:  40899
P2 SL buffer distribution [16075. 19384.   220.  5220.]
P2 actions distribution [0.39304139 0.47394802 0.0053791  0.12763148]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[5.1450e-04, 9.3794e-04, 1.2124e-03, 1.4097e-03, 1.7152e-03,
          1.9050e-03, 1.4555e-03, 7.0015e-03, 1.0845e-02, 5.2807e-03,
          4.9401e-03, 2.9780e-03, 1.6192e-02, 2.3693e-02, 6.3646e-03,
          8.7985e-03, 3.3530e-03, 6.7310e-02, 8.7203e-02, 1.8726e-02,
          2.2613e-02, 3.4502e-03, 8.5785e-03, 1.0322e-02, 1.7991e-03,
     

 12%|█▏        | 5999/50000 [05:08<28:19, 25.89it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 53497
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5932/10000 (59.3%)
    Average reward: -0.101
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4068/10000 (40.7%)
    Average reward: +0.101
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5187 (20.7%)
    Action 1: 16767 (66.8%)
    Action 2: 139 (0.6%)
    Action 3: 2990 (11.9%)
  Player 1:
    Action 0: 21727 (76.5%)
    Action 1: 6687 (23.5%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1005.5, 1005.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.859 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.787 (max=1.0 for random)
    → Mixed strategy
  Average strategy e

 14%|█▍        | 7003/50000 [06:07<27:59, 25.60it/s]   

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 45130/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 47468/2000000


 16%|█▌        | 7999/50000 [06:46<32:46, 21.35it/s]

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 51610/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 53870/2000000
P1 SL Buffer Size:  51610
P1 SL buffer distribution [20625. 24911.   556.  5518.]
P1 actions distribution [0.39963185 0.48267778 0.01077311 0.10691726]
P2 SL Buffer Size:  53870
P2 SL buffer distribution [20811. 25914.  1042.  6103.]
P2 actions distribution [0.38631892 0.48104696 0.01934286 0.11329126]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[2.0859e-04, 4.0735e-04, 5.4205e-04, 5.8684e-04, 8.0179e-04,
          9.4503e-04, 7.2056e-04, 7.9345e-03, 1.3241e-02, 5.7412e-03,
          3.4837e-03, 1.8124e-03, 8.9063e-03, 1.2539e-02, 2.7762e-03,
          3.8481e-03, 1.3676e-03, 9.6302e-02, 1.2740e-01, 2.0558e-02,
          7.3580e-03, 1.3374e-03, 3.5422e-03, 4.4496e-03, 7.5334e-04,
     

 16%|█▌        | 7999/50000 [06:58<32:46, 21.35it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 58167
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5948/10000 (59.5%)
    Average reward: -0.736
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4052/10000 (40.5%)
    Average reward: +0.736
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9908 (35.3%)
    Action 1: 15390 (54.8%)
    Action 2: 128 (0.5%)
    Action 3: 2645 (9.4%)
  Player 1:
    Action 0: 11278 (37.5%)
    Action 1: 14821 (49.2%)
    Action 2: 1860 (6.2%)
    Action 3: 2137 (7.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7363.5, 7363.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.006 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.034 (max=1.0 for random)
    → Playing nearl

 18%|█▊        | 9003/50000 [08:01<26:48, 25.49it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 58065/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 60303/2000000


 20%|█▉        | 9999/50000 [08:40<26:28, 25.19it/s]

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 64330/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 66973/2000000
P1 SL Buffer Size:  64330
P1 SL buffer distribution [25263. 30909.  1576.  6582.]
P1 actions distribution [0.39270947 0.48047567 0.02449868 0.10231618]
P2 SL Buffer Size:  66973
P2 SL buffer distribution [25073. 32399.  2180.  7321.]
P2 actions distribution [0.37437475 0.48376211 0.03255043 0.10931271]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5848, 0.4099, 0.0053, 0.0000]])
Player 1 Prediction: tensor([[[8.4465e-05, 1.9256e-04, 1.7405e-04, 2.2242e-04, 2.9289e-04,
          3.5555e-04, 2.3783e-04, 1.9045e-03, 2.8010e-03, 1.6182e-03,
          4.8912e-03, 2.4862e-03, 1.8087e-03, 2.2100e-03, 1.1926e-03,
          9.6713e-03, 1.1613e-03, 9.1634e-02, 1.0836e-01, 1.9583e-02,
         

 20%|█▉        | 9999/50000 [08:58<26:28, 25.19it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 52634
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5854/10000 (58.5%)
    Average reward: -0.069
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4146/10000 (41.5%)
    Average reward: +0.069
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4705 (18.9%)
    Action 1: 16535 (66.4%)
    Action 2: 355 (1.4%)
    Action 3: 3312 (13.3%)
  Player 1:
    Action 0: 21535 (77.7%)
    Action 1: 6192 (22.3%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-687.5, 687.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.847 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.766 (max=1.0 for random)
    → Mixed strategy
  Average strategy en

 22%|██▏       | 11004/50000 [09:54<26:06, 24.90it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 70506/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 73240/2000000


 24%|██▍       | 11998/50000 [10:34<24:58, 25.35it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 76799/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 79455/2000000
P1 SL Buffer Size:  76799
P1 SL buffer distribution [30799. 35584.  2898.  7518.]
P1 actions distribution [0.40103387 0.46333937 0.03773487 0.0978919 ]
P2 SL Buffer Size:  79455
P2 SL buffer distribution [28752. 38767.  3259.  8677.]
P2 actions distribution [0.36186521 0.4879114  0.04101693 0.10920647]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.3431e-04, 2.9656e-04, 2.8183e-04, 3.2056e-04, 3.8505e-04,
          4.1719e-04, 4.2869e-04, 2.5726e-03, 3.5419e-03, 2.2243e-03,
          2.8510e-03, 1.5612e-03, 8.3940e-03, 1.0774e-02, 3.2471e-03,
          3.2113e-03, 8.1881e-04, 5.0279e-02, 6.6403e-02, 1.3961e-02,
          1.8475e-02, 8.5945e-04, 1.9310e-03, 2.7044e-03, 4.2060e-04,
   

 24%|██▍       | 11998/50000 [10:48<24:58, 25.35it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 53485
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5654/10000 (56.5%)
    Average reward: -0.753
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4346/10000 (43.5%)
    Average reward: +0.753
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8853 (33.8%)
    Action 1: 13900 (53.1%)
    Action 2: 839 (3.2%)
    Action 3: 2569 (9.8%)
  Player 1:
    Action 0: 9170 (33.6%)
    Action 1: 14248 (52.1%)
    Action 2: 2062 (7.5%)
    Action 3: 1844 (6.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7528.0, 7528.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.014 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.018 (max=1.0 for random)
    → Playing nearl

 26%|██▌       | 13003/50000 [11:49<24:18, 25.36it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 82961/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 85830/2000000


 28%|██▊       | 14000/50000 [12:31<24:25, 24.56it/s]  

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 89274/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 92051/2000000
P1 SL Buffer Size:  89274
P1 SL buffer distribution [36456. 40261.  4283.  8274.]
P1 actions distribution [0.40836078 0.45098237 0.04797589 0.09268096]
P2 SL Buffer Size:  92051
P2 SL buffer distribution [33164. 44848.  4360.  9679.]
P2 actions distribution [0.36027854 0.48720818 0.04736505 0.10514823]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[2.1512e-04, 3.1835e-04, 3.5827e-04, 3.4739e-04, 3.3873e-04,
          4.1172e-04, 3.8842e-04, 2.0222e-03, 3.7835e-03, 2.2394e-03,
          1.8658e-03, 1.9764e-03, 1.6934e-02, 2.2698e-02, 4.4241e-03,
          3.2165e-03, 1.3961e-03, 1.8419e-02, 2.2988e-02, 6.5088e-03,
          5.4874e-03, 1.9163e-03, 1.3338e-03, 1.7228e-03, 3.7199e-04,
   

 28%|██▊       | 14000/50000 [12:49<24:25, 24.56it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 55616
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5404/10000 (54.0%)
    Average reward: -0.665
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4596/10000 (46.0%)
    Average reward: +0.665
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9975 (36.1%)
    Action 1: 13897 (50.3%)
    Action 2: 1340 (4.9%)
    Action 3: 2389 (8.7%)
  Player 1:
    Action 0: 10165 (36.3%)
    Action 1: 13117 (46.8%)
    Action 2: 2787 (9.9%)
    Action 3: 1946 (6.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6655.0, 6655.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.029 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nea

 30%|███       | 15005/50000 [13:50<23:20, 24.99it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 95485/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 98686/2000000


 32%|███▏      | 15998/50000 [14:30<23:17, 24.33it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 102285/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 105367/2000000
P1 SL Buffer Size:  102285
P1 SL buffer distribution [42097. 45763.  5620.  8805.]
P1 actions distribution [0.41156572 0.44740676 0.05494452 0.086083  ]
P2 SL Buffer Size:  105367
P2 SL buffer distribution [37579. 51407.  5582. 10799.]
P2 actions distribution [0.35664867 0.4878852  0.05297674 0.10248939]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5327, 0.4635, 0.0038, 0.0000]])
Player 1 Prediction: tensor([[[1.7201e-05, 3.4906e-05, 3.8118e-05, 4.1507e-05, 5.7156e-05,
          5.0613e-05, 4.7593e-05, 2.9412e-04, 3.3531e-04, 2.0540e-04,
          5.2216e-03, 1.2001e-03, 3.0543e-04, 3.0736e-04, 1.8953e-04,
          1.6805e-02, 8.2637e-04, 2.7390e-03, 3.5277e-03, 1.8065e-03,
     

 32%|███▏      | 15998/50000 [14:49<23:17, 24.33it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 53067
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5035/10000 (50.3%)
    Average reward: -0.179
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4965/10000 (49.6%)
    Average reward: +0.179
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5073 (19.8%)
    Action 1: 14791 (57.6%)
    Action 2: 2129 (8.3%)
    Action 3: 3672 (14.3%)
  Player 1:
    Action 0: 19879 (72.5%)
    Action 1: 7523 (27.5%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1789.5, 1789.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.921 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.848 (max=1.0 for random)
    → Mixed strategy
  

 34%|███▍      | 17003/50000 [15:47<24:06, 22.81it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 108760/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 111950/2000000


 36%|███▌      | 17998/50000 [16:30<21:56, 24.31it/s]  

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 115290/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 118096/2000000
P1 SL Buffer Size:  115290
P1 SL buffer distribution [47323. 51817.  6905.  9245.]
P1 actions distribution [0.41046925 0.44944922 0.05989245 0.08018909]
P2 SL Buffer Size:  118096
P2 SL buffer distribution [41459. 58122.  6780. 11735.]
P2 actions distribution [0.35106185 0.49215892 0.05741092 0.09936831]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5367, 0.4608, 0.0025, 0.0000]])
Player 1 Prediction: tensor([[[4.8083e-05, 9.7348e-05, 9.2443e-05, 1.2677e-04, 1.5379e-04,
          2.1279e-04, 1.2491e-04, 6.5451e-04, 1.0012e-03, 7.6147e-04,
          2.0106e-03, 1.6322e-03, 4.0931e-04, 4.8985e-04, 6.8877e-04,
          2.3079e-02, 2.1538e-03, 2.6303e-02, 2.9145e-02, 1.2150e-02,
     

 36%|███▌      | 17998/50000 [16:49<21:56, 24.31it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 53471
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5014/10000 (50.1%)
    Average reward: -0.139
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4986/10000 (49.9%)
    Average reward: +0.139
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5128 (19.8%)
    Action 1: 14829 (57.2%)
    Action 2: 2270 (8.8%)
    Action 3: 3709 (14.3%)
  Player 1:
    Action 0: 19798 (71.9%)
    Action 1: 7737 (28.1%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1391.5, 1391.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.923 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.857 (max=1.0 for random)
    → Mixed strategy
  

 38%|███▊      | 19003/50000 [17:46<20:59, 24.62it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 121616/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 124109/2000000


 40%|████      | 20000/50000 [18:28<20:42, 24.14it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 128227/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 130651/2000000
P1 SL Buffer Size:  128227
P1 SL buffer distribution [52439. 58000.  8284.  9504.]
P1 actions distribution [0.40895443 0.45232283 0.06460418 0.07411856]
P2 SL Buffer Size:  130651
P2 SL buffer distribution [45326. 64876.  8079. 12370.]
P2 actions distribution [0.34692425 0.49655954 0.0618365  0.09467972]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.0688e-04, 1.7130e-04, 1.7376e-04, 1.7199e-04, 1.5106e-04,
          2.1228e-04, 1.8495e-04, 2.8075e-03, 6.1698e-03, 2.5260e-03,
          3.8008e-04, 5.9018e-04, 1.0874e-02, 1.4457e-02, 3.3493e-03,
          6.0938e-04, 6.8480e-04, 1.3323e-02, 1.7553e-02, 5.4624e-03,
          1.9330e-03, 5.3019e-04, 1.7162e-03, 2.4049e-03, 1.9402e-04,

 40%|████      | 20000/50000 [18:39<20:42, 24.14it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 56099
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5199/10000 (52.0%)
    Average reward: -0.540
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4801/10000 (48.0%)
    Average reward: +0.540
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10148 (36.3%)
    Action 1: 13756 (49.2%)
    Action 2: 1974 (7.1%)
    Action 3: 2103 (7.5%)
  Player 1:
    Action 0: 9931 (35.3%)
    Action 1: 13632 (48.5%)
    Action 2: 2892 (10.3%)
    Action 3: 1663 (5.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5402.5, 5402.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.034 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.037 (max=1.0 for random)
    → Playing ne

 42%|████▏     | 21005/50000 [19:45<19:40, 24.56it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 134713/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 136826/2000000


 44%|████▍     | 21998/50000 [20:27<19:23, 24.07it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 141063/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 143317/2000000
P1 SL Buffer Size:  141063
P1 SL buffer distribution [57125. 64711.  9545.  9682.]
P1 actions distribution [0.4049609  0.45873829 0.0676648  0.068636  ]
P2 SL Buffer Size:  143317
P2 SL buffer distribution [49090. 71824.  9333. 13070.]
P2 actions distribution [0.3425274  0.50115478 0.06512137 0.09119644]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[6.1736e-05, 1.1351e-04, 1.4031e-04, 1.6974e-04, 2.0451e-04,
          3.1551e-04, 1.9388e-04, 1.5120e-03, 2.1968e-03, 1.4434e-03,
          2.6495e-03, 9.3106e-04, 1.8278e-03, 2.4870e-03, 1.4105e-03,
          5.1835e-03, 1.5545e-03, 2.5808e-02, 2.7979e-02, 1.4907e-02,
          2.0501e-02, 2.9650e-03, 6.7293e-03, 7.8982e-03, 2.6039e-04,

 44%|████▍     | 21998/50000 [20:39<19:23, 24.07it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 54754
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4993/10000 (49.9%)
    Average reward: -0.594
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5007/10000 (50.1%)
    Average reward: +0.594
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10078 (37.3%)
    Action 1: 13233 (49.0%)
    Action 2: 2120 (7.8%)
    Action 3: 1599 (5.9%)
  Player 1:
    Action 0: 8427 (30.4%)
    Action 1: 14747 (53.2%)
    Action 2: 2789 (10.1%)
    Action 3: 1761 (6.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5938.5, 5938.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.035 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.007 (max=1.0 for random)
    → Playing ne

 46%|████▌     | 23003/50000 [21:43<18:29, 24.33it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 147474/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 149476/2000000


 48%|████▊     | 23999/50000 [22:24<19:07, 22.67it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 153929/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 155605/2000000
P1 SL Buffer Size:  153929
P1 SL buffer distribution [61436. 71756. 10650. 10087.]
P1 actions distribution [0.39911907 0.46616297 0.06918774 0.06553021]
P2 SL Buffer Size:  155605
P2 SL buffer distribution [52690. 78541. 10635. 13739.]
P2 actions distribution [0.3386138  0.50474599 0.06834613 0.08829408]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.4821, 0.5167, 0.0012, 0.0000]])
Player 1 Prediction: tensor([[[1.0919e-05, 2.2923e-05, 2.3476e-05, 2.4901e-05, 3.6069e-05,
          3.3422e-05, 3.0870e-05, 1.5271e-04, 1.4896e-04, 1.0439e-04,
          2.7868e-03, 5.9554e-04, 1.0953e-04, 1.0151e-04, 8.1348e-05,
          1.3563e-02, 7.1465e-04, 1.6225e-03, 2.4633e-03, 1.3854e-03,
     

 48%|████▊     | 23999/50000 [22:40<19:07, 22.67it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 54831
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5117/10000 (51.2%)
    Average reward: -0.478
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4883/10000 (48.8%)
    Average reward: +0.478
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10192 (37.6%)
    Action 1: 13264 (48.9%)
    Action 2: 2162 (8.0%)
    Action 3: 1510 (5.6%)
  Player 1:
    Action 0: 8218 (29.7%)
    Action 1: 14930 (53.9%)
    Action 2: 2881 (10.4%)
    Action 3: 1674 (6.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4779.0, 4779.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.035 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.001 (max=1.0 for random)
    → Playing ne

 50%|█████     | 25003/50000 [23:44<17:39, 23.60it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 160201/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 162008/2000000


 52%|█████▏    | 26000/50000 [24:28<17:06, 23.38it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 166520/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 168390/2000000
P1 SL Buffer Size:  166520
P1 SL buffer distribution [65781. 78584. 11881. 10274.]
P1 actions distribution [0.39503363 0.47191929 0.07134879 0.06169829]
P2 SL Buffer Size:  168390
P2 SL buffer distribution [56167. 85849. 11982. 14392.]
P2 actions distribution [0.33355306 0.50982244 0.07115624 0.08546826]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5682, 0.4243, 0.0075, 0.0000]])
Player 1 Prediction: tensor([[[5.2634e-05, 1.3288e-04, 8.7420e-05, 7.9048e-05, 1.2672e-04,
          1.0475e-04, 1.0615e-04, 8.3027e-04, 8.0442e-04, 4.5136e-04,
          1.7004e-02, 1.5725e-03, 1.3820e-03, 1.5769e-03, 6.2582e-04,
          1.8388e-02, 1.6065e-03, 4.2160e-03, 6.1015e-03, 1.8476e-03,
     

 52%|█████▏    | 26000/50000 [24:40<17:06, 23.38it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 54550
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5031/10000 (50.3%)
    Average reward: -0.557
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4969/10000 (49.7%)
    Average reward: +0.557
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9843 (36.5%)
    Action 1: 13452 (49.9%)
    Action 2: 2211 (8.2%)
    Action 3: 1440 (5.3%)
  Player 1:
    Action 0: 8292 (30.0%)
    Action 1: 14745 (53.4%)
    Action 2: 2892 (10.5%)
    Action 3: 1675 (6.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5574.5, 5574.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.031 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.004 (max=1.0 for random)
    → Playing nea

 54%|█████▍    | 27003/50000 [25:51<19:29, 19.66it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 172798/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 174791/2000000


 56%|█████▌    | 27998/50000 [26:34<15:44, 23.29it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 179188/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 181043/2000000
P1 SL Buffer Size:  179188
P1 SL buffer distribution [69875. 85800. 13121. 10392.]
P1 actions distribution [0.38995357 0.47882671 0.07322477 0.05799496]
P2 SL Buffer Size:  181043
P2 SL buffer distribution [59721. 92761. 13400. 15161.]
P2 actions distribution [0.32987191 0.51236999 0.07401557 0.08374254]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[6.5435e-05, 1.6684e-04, 1.3133e-04, 1.8115e-04, 1.8488e-04,
          2.1820e-04, 2.2966e-04, 2.2591e-03, 2.6597e-03, 1.2782e-03,
          9.8293e-04, 6.0618e-04, 1.0682e-02, 1.4436e-02, 3.0814e-03,
          1.1904e-03, 6.0820e-04, 1.8254e-02, 2.5245e-02, 7.2471e-03,
          1.6464e-02, 1.7550e-03, 7.0899e-03, 1.0346e-02, 3.5419e-04,

 56%|█████▌    | 27998/50000 [26:50<15:44, 23.29it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 52969
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5192/10000 (51.9%)
    Average reward: -0.415
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4808/10000 (48.1%)
    Average reward: +0.415
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9272 (35.4%)
    Action 1: 13328 (50.9%)
    Action 2: 2325 (8.9%)
    Action 3: 1257 (4.8%)
  Player 1:
    Action 0: 7394 (27.6%)
    Action 1: 15134 (56.5%)
    Action 2: 2995 (11.2%)
    Action 3: 1264 (4.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4150.0, 4150.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.026 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.978 (max=1.0 for random)
    → Playing nea

 58%|█████▊    | 29003/50000 [27:55<15:02, 23.26it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 185700/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 187262/2000000


 60%|██████    | 30000/50000 [28:39<14:21, 23.21it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 191971/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 193829/2000000
P1 SL Buffer Size:  191971
P1 SL buffer distribution [73971. 93104. 14366. 10530.]
P1 actions distribution [0.38532382 0.48498992 0.07483422 0.05485203]
P2 SL Buffer Size:  193829
P2 SL buffer distribution [63101. 99494. 14750. 16484.]
P2 actions distribution [0.32554984 0.51330812 0.076098   0.08504403]


 60%|██████    | 30000/50000 [28:51<14:21, 23.21it/s]

   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.2009e-04, 1.9049e-04, 1.9877e-04, 2.0073e-04, 1.9153e-04,
          2.7263e-04, 2.2496e-04, 1.6667e-03, 2.7017e-03, 1.4851e-03,
          4.5645e-04, 5.5612e-04, 3.5785e-03, 4.5903e-03, 1.6951e-03,
          4.3023e-04, 5.4931e-04, 1.7156e-02, 2.3452e-02, 4.6753e-03,
          2.2599e-03, 9.6974e-04, 5.8232e-03, 7.3060e-03, 3.2839e-04,
          4.7455e-02, 5.1880e-04, 1.1571e-02, 8.3643e-03, 3.1290e-03,
          1.3540e-02, 6.2086e-02, 3.0264e-01, 2.4677e-01, 1.1031e-03,
          2.5576e-03, 1.6882e-02, 3.5388e-02, 2.3246e-02, 3.5220e-03,
          1.0017e-02, 3.3421e-02, 6.0830e-02, 3.4713e-02, 1.8192e-04,
          1.9072e-04, 1.9160e-04, 1.9241e-04, 1.8900e-04, 1.4157e-04,
          7.2223e-05],
         [1.1186e-04, 2.2022e-04, 2.0489e-04, 1.8626e-04, 2.6275e-04,
          2.7803e-04, 2.6672e-04, 1.1893e-03, 1.4961e-03, 8.4428e-04,
    

 62%|██████▏   | 31004/50000 [30:01<16:33, 19.11it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 198231/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 200203/2000000


 64%|██████▍   | 31998/50000 [30:46<12:58, 23.13it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 204682/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 206737/2000000
P1 SL Buffer Size:  204682
P1 SL buffer distribution [ 77852. 100453.  15652.  10725.]
P1 actions distribution [0.38035587 0.49077594 0.07646984 0.05239835]
P2 SL Buffer Size:  206737
P2 SL buffer distribution [ 66395. 106069.  16209.  18064.]
P2 actions distribution [0.32115683 0.51306249 0.07840396 0.08737672]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.0501e-04, 1.6023e-04, 1.7179e-04, 1.7450e-04, 1.6636e-04,
          2.3768e-04, 1.9845e-04, 1.4346e-03, 2.2697e-03, 1.1396e-03,
          3.4513e-04, 4.5246e-04, 1.8551e-03, 2.3284e-03, 1.1273e-03,
          3.3537e-04, 4.5349e-04, 2.1634e-02, 2.7917e-02, 4.6878e-03,
          1.5740e-03, 7.4862e-04, 4.2447e-03, 5.1159e-03, 2.9

 64%|██████▍   | 31998/50000 [31:01<12:58, 23.13it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 59123
Average episode length: 5.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5351/10000 (53.5%)
    Average reward: -0.559
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4649/10000 (46.5%)
    Average reward: +0.559
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10531 (35.9%)
    Action 1: 15292 (52.1%)
    Action 2: 1432 (4.9%)
    Action 3: 2074 (7.1%)
  Player 1:
    Action 0: 8692 (29.2%)
    Action 1: 12754 (42.8%)
    Action 2: 3034 (10.2%)
    Action 3: 5314 (17.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5585.5, 5585.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.020 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.042 (max=1.0 for random)
    → Playing n

 66%|██████▌   | 33003/50000 [32:10<12:13, 23.16it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 211196/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 213520/2000000


 68%|██████▊   | 33999/50000 [32:54<11:48, 22.59it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 217636/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 220220/2000000
P1 SL Buffer Size:  217636
P1 SL buffer distribution [ 81665. 108128.  16914.  10929.]
P1 actions distribution [0.37523663 0.49682957 0.07771692 0.05021688]
P2 SL Buffer Size:  220220
P2 SL buffer distribution [ 70054. 112515.  17586.  20065.]
P2 actions distribution [0.31810916 0.5109209  0.07985651 0.09111343]


 68%|██████▊   | 33999/50000 [33:04<11:48, 22.59it/s]

   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.4143, 0.5814, 0.0043, 0.0000]])
Player 1 Prediction: tensor([[[1.0214e-04, 1.7816e-04, 1.3940e-04, 1.4523e-04, 1.4848e-04,
          2.1858e-04, 1.5087e-04, 5.0162e-04, 8.7070e-04, 6.2018e-04,
          2.3327e-04, 7.9959e-04, 3.5222e-04, 4.0518e-04, 5.5926e-04,
          1.7144e-03, 1.0325e-03, 1.9011e-02, 1.9942e-02, 3.8995e-03,
          4.6852e-03, 1.1705e-03, 4.2025e-03, 4.2234e-03, 2.4791e-04,
          4.4923e-02, 3.8680e-04, 1.4186e-02, 9.4332e-03, 1.0886e-02,
          1.0026e-01, 4.2578e-02, 2.9474e-01, 2.2636e-01, 2.0731e-03,
          1.6584e-02, 1.5027e-02, 1.2238e-02, 7.3501e-03, 9.1427e-03,
          1.9962e-02, 2.0833e-02, 5.5202e-02, 3.1226e-02, 1.6176e-04,
          1.8433e-04, 1.5491e-04, 1.7260e-04, 1.8435e-04, 1.2595e-04,
          7.3647e-05],
         [8.3235e-05, 1.5090e-04, 1.4959e-04, 1.4769e-04, 1.8579e-04,
          

 70%|███████   | 35003/50000 [34:16<10:55, 22.87it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 223933/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 226781/2000000


 72%|███████▏  | 36000/50000 [35:00<10:25, 22.40it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 230606/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 233160/2000000
P1 SL Buffer Size:  230606
P1 SL buffer distribution [ 85462. 115868.  18168.  11108.]
P1 actions distribution [0.37059747 0.50245007 0.07878373 0.04816874]
P2 SL Buffer Size:  233160
P2 SL buffer distribution [ 73406. 118684.  18986.  22084.]
P2 actions distribution [0.31483102 0.50902385 0.08142906 0.09471607]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[9.5917e-05, 1.3955e-04, 1.5178e-04, 1.5260e-04, 1.4970e-04,
          2.0940e-04, 1.8100e-04, 7.9569e-04, 1.2481e-03, 8.4457e-04,
          2.5599e-04, 3.7480e-04, 9.0034e-04, 1.1546e-03, 7.2704e-04,
          2.4455e-04, 3.6774e-04, 1.7115e-02, 2.2753e-02, 3.9772e-03,
          1.3039e-03, 6.2928e-04, 4.0060e-03, 4.5235e-03, 2.8

 72%|███████▏  | 36000/50000 [35:15<10:25, 22.40it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 58213
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5328/10000 (53.3%)
    Average reward: -0.473
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4672/10000 (46.7%)
    Average reward: +0.473
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10845 (36.5%)
    Action 1: 15080 (50.7%)
    Action 2: 2075 (7.0%)
    Action 3: 1744 (5.9%)
  Player 1:
    Action 0: 6210 (21.8%)
    Action 1: 14980 (52.6%)
    Action 2: 3103 (10.9%)
    Action 3: 4176 (14.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4732.5, 4732.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.028 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.967 (max=1.0 for random)
    → Playing n

 74%|███████▍  | 37004/50000 [36:27<09:57, 21.76it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 237024/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 239912/2000000


 76%|███████▌  | 38000/50000 [37:12<08:45, 22.82it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 243384/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 246253/2000000
P1 SL Buffer Size:  243384
P1 SL buffer distribution [ 89206. 123498.  19376.  11304.]
P1 actions distribution [0.36652368 0.50742037 0.07961082 0.04644512]
P2 SL Buffer Size:  246253
P2 SL buffer distribution [ 76691. 124984.  20452.  24126.]
P2 actions distribution [0.31143174 0.50754306 0.0830528  0.09797241]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.4011, 0.5958, 0.0031, 0.0000]])
Player 1 Prediction: tensor([[[9.5256e-06, 1.8736e-05, 2.0525e-05, 2.1437e-05, 3.4816e-05,
          3.1058e-05, 2.8497e-05, 4.8579e-05, 4.2722e-05, 4.4170e-05,
          9.3876e-04, 3.2903e-04, 4.8327e-05, 5.2114e-05, 2.9857e-05,
          1.2133e-02, 9.1439e-04, 1.4695e-03, 2.6115e-03, 1.5010e-0

 76%|███████▌  | 38000/50000 [37:25<08:45, 22.82it/s]


📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 57058
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5365/10000 (53.6%)
    Average reward: -0.441
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4635/10000 (46.4%)
    Average reward: +0.441
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10172 (35.4%)
    Action 1: 14975 (52.1%)
    Action 2: 2135 (7.4%)
    Action 3: 1479 (5.1%)
  Player 1:
    Action 0: 6903 (24.4%)
    Action 1: 14361 (50.8%)
    Action 2: 3141 (11.1%)
    Action 3: 3892 (13.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4413.5, 4413.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.021 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.993 (max=1.0 for random)
    → Playing n

 78%|███████▊  | 39003/50000 [38:36<08:02, 22.77it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 250061/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 252887/2000000


 80%|███████▉  | 39999/50000 [39:20<07:29, 22.23it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 256432/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 259639/2000000
P1 SL Buffer Size:  256432
P1 SL buffer distribution [ 93166. 131118.  20570.  11578.]
P1 actions distribution [0.36331659 0.51131684 0.0802162  0.04515037]
P2 SL Buffer Size:  259639
P2 SL buffer distribution [ 80313. 130912.  21924.  26490.]
P2 actions distribution [0.30932564 0.50420777 0.08444032 0.10202627]
   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[4.3974e-05, 1.0423e-04, 8.4328e-05, 1.1244e-04, 1.3337e-04,
          1.4384e-04, 1.5704e-04, 5.0400e-04, 5.2887e-04, 4.0581e-04,
          4.0942e-04, 3.0995e-04, 9.2278e-04, 1.2503e-03, 6.1345e-04,
          4.2625e-04, 3.1643e-04, 1.8133e-02, 2.7177e-02, 5.0746e-03,
          1.9385e-03, 8.4062e-04, 5.3167e-03, 5.9919e-03, 2.9

 80%|███████▉  | 39999/50000 [39:35<07:29, 22.23it/s]


📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 56535
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5457/10000 (54.6%)
    Average reward: -0.430
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4543/10000 (45.4%)
    Average reward: +0.430
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9358 (33.4%)
    Action 1: 15295 (54.6%)
    Action 2: 1754 (6.3%)
    Action 3: 1607 (5.7%)
  Player 1:
    Action 0: 7817 (27.4%)
    Action 1: 13205 (46.3%)
    Action 2: 3238 (11.4%)
    Action 3: 4261 (14.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4303.5, 4303.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.005 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.026 (max=1.0 for random)
    → Playing ne

 82%|████████▏ | 41003/50000 [40:44<06:38, 22.59it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 262876/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 266633/2000000


 84%|████████▍ | 41999/50000 [41:28<05:52, 22.67it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 269339/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 273388/2000000
P1 SL Buffer Size:  269339
P1 SL buffer distribution [ 97010. 138690.  21663.  11976.]
P1 actions distribution [0.36017807 0.51492728 0.08043024 0.04446441]
P2 SL Buffer Size:  273388
P2 SL buffer distribution [ 84025. 136984.  23401.  28978.]
P2 actions distribution [0.30734707 0.50106076 0.0855963  0.10599587]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[9.2641e-05, 1.3026e-04, 1.4417e-04, 1.4196e-04, 1.4224e-04,
          1.9103e-04, 1.6919e-04, 6.3995e-04, 9.8527e-04, 6.3186e-04,
          2.8395e-04, 3.2543e-04, 5.1108e-04, 6.8329e-04, 5.0622e-04,
          2.5473e-04, 3.6247e-04, 1.1270e-02, 1.5343e-02, 3.0358e-03,
          1.0233e-03, 5.5167e-04, 3.9815e-03, 4.1925e-03, 2.9

 84%|████████▍ | 41999/50000 [41:45<05:52, 22.67it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 60136
Average episode length: 6.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5353/10000 (53.5%)
    Average reward: -0.647
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4647/10000 (46.5%)
    Average reward: +0.647
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10024 (33.7%)
    Action 1: 16338 (54.9%)
    Action 2: 1672 (5.6%)
    Action 3: 1752 (5.9%)
  Player 1:
    Action 0: 8902 (29.3%)
    Action 1: 12725 (41.9%)
    Action 2: 3165 (10.4%)
    Action 3: 5558 (18.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6467.5, 6467.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.004 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.045 (max=1.0 for random)
    → Playing n

 86%|████████▌ | 43003/50000 [42:56<05:19, 21.89it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 275926/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 280266/2000000


 88%|████████▊ | 43999/50000 [43:42<04:34, 21.88it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 282427/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 286964/2000000
P1 SL Buffer Size:  282427
P1 SL buffer distribution [100950. 146279.  22789.  12409.]
P1 actions distribution [0.3574375  0.51793561 0.08068988 0.04393702]
P2 SL Buffer Size:  286964
P2 SL buffer distribution [ 87607. 143101.  24845.  31411.]
P2 actions distribution [0.30528917 0.49867231 0.0865788  0.10945972]
   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[4.3818e-05, 7.3085e-05, 9.3529e-05, 1.1795e-04, 1.6660e-04,
          2.2093e-04, 1.4436e-04, 2.4832e-04, 2.3439e-04, 2.2357e-04,
          2.0850e-03, 5.7159e-04, 2.2556e-04, 3.7657e-04, 1.8262e-04,
          1.4081e-02, 2.6731e-03, 1.5484e-02, 1.6922e-02, 9.4736e-03,
          6.7807e-02, 5.7171e-03, 6.4249e-03, 7.8666e-03, 3.5

 88%|████████▊ | 43999/50000 [43:55<04:34, 21.88it/s]


📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 60232
Average episode length: 6.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5390/10000 (53.9%)
    Average reward: -0.634
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4610/10000 (46.1%)
    Average reward: +0.634
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9957 (33.4%)
    Action 1: 16557 (55.5%)
    Action 2: 1662 (5.6%)
    Action 3: 1644 (5.5%)
  Player 1:
    Action 0: 8855 (29.1%)
    Action 1: 12678 (41.7%)
    Action 2: 3226 (10.6%)
    Action 3: 5653 (18.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6337.5, 6337.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.000 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.045 (max=1.0 for random)
    → Playing ne

 90%|█████████ | 45003/50000 [45:08<03:55, 21.24it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 288994/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 293746/2000000


 92%|█████████▏| 45999/50000 [45:55<03:05, 21.57it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 295672/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 300473/2000000
P1 SL Buffer Size:  295672
P1 SL buffer distribution [104949. 153911.  23898.  12914.]
P1 actions distribution [0.35495076 0.52054642 0.08082605 0.04367678]
P2 SL Buffer Size:  300473
P2 SL buffer distribution [ 91068. 149094.  26288.  34023.]
P2 actions distribution [0.30308214 0.49619766 0.08748873 0.11323147]


 92%|█████████▏| 45999/50000 [46:08<03:05, 21.57it/s]

   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[8.0284e-05, 1.1041e-04, 1.2379e-04, 1.1916e-04, 1.1912e-04,
          1.6062e-04, 1.4389e-04, 5.5092e-04, 8.4408e-04, 4.7311e-04,
          1.9761e-04, 2.7799e-04, 4.6069e-04, 6.6420e-04, 3.8910e-04,
          1.9620e-04, 2.8148e-04, 9.5151e-03, 1.2942e-02, 2.6161e-03,
          6.1381e-04, 4.1945e-04, 3.5708e-03, 3.7511e-03, 2.6232e-04,
          1.9990e-02, 2.5228e-03, 3.7980e-02, 7.2172e-03, 2.6378e-03,
          1.3848e-02, 6.4948e-02, 2.9414e-01, 2.4754e-01, 1.8540e-03,
          1.1845e-02, 4.0436e-02, 5.4783e-02, 2.5605e-02, 2.0126e-03,
          8.0819e-03, 3.2316e-02, 5.9055e-02, 3.3600e-02, 1.0317e-04,
          1.1621e-04, 1.2072e-04, 1.0683e-04, 1.1659e-04, 9.1574e-05,
          4.4872e-05],
         [8.7369e-05, 1.4617e-04, 1.3553e-04, 1.2747e-04, 1.8877e-04,
          1.9513e-04, 1.9094e-04, 6.6191e-04, 7.1175e-04, 3.5648e-04,
    

 94%|█████████▍| 47003/50000 [47:24<02:17, 21.87it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 301975/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 307324/2000000


 96%|█████████▌| 48000/50000 [48:12<01:32, 21.71it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 308447/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 313949/2000000
P1 SL Buffer Size:  308447
P1 SL buffer distribution [108791. 161341.  24910.  13405.]
P1 actions distribution [0.35270565 0.52307528 0.08075942 0.04345965]
P2 SL Buffer Size:  313949
P2 SL buffer distribution [ 94472. 155075.  27779.  36623.]
P2 actions distribution [0.30091512 0.49394965 0.08848252 0.1166527 ]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2981, 0.7004, 0.0015, 0.0000]])
Player 1 Prediction: tensor([[[5.0858e-05, 1.1204e-04, 8.2063e-05, 7.5512e-05, 1.2559e-04,
          9.5202e-05, 1.0882e-04, 3.2073e-04, 2.8771e-04, 1.8283e-04,
          5.8499e-03, 9.6968e-04, 2.7175e-04, 4.3458e-04, 2.3070e-04,
          5.1645e-02, 3.9152e-03, 2.1691e-03, 3.7417e-03, 1.2696e-0

 96%|█████████▌| 48000/50000 [48:28<01:32, 21.71it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 61846
Average episode length: 6.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5359/10000 (53.6%)
    Average reward: -0.673
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4641/10000 (46.4%)
    Average reward: +0.673
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10548 (33.9%)
    Action 1: 16850 (54.2%)
    Action 2: 2016 (6.5%)
    Action 3: 1681 (5.4%)
  Player 1:
    Action 0: 7906 (25.7%)
    Action 1: 13691 (44.5%)
    Action 2: 3233 (10.5%)
    Action 3: 5921 (19.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6727.5, 6727.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.008 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.024 (max=1.0 for random)
    → Playing n

 98%|█████████▊| 49002/50000 [49:40<00:45, 21.70it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 314560/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 320903/2000000


100%|██████████| 50000/50000 [50:26<00:00, 16.52it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[4.8900e-05, 7.9500e-05, 1.0374e-04, 1.2972e-04, 1.8575e-04,
          2.4052e-04, 1.5042e-04, 2.1962e-04, 2.1355e-04, 2.1347e-04,
          2.0205e-03, 6.3517e-04, 2.0658e-04, 4.4728e-04, 2.1706e-04,
          5.2034e-03, 2.2492e-03, 2.8993e-02, 3.1035e-02, 1.5512e-02,
          5.2504e-02, 6.1697e-03, 1.0126e-02, 1.3785e-02, 4.2037e-04,
          3.9420e-01, 2.0761e-03, 3.0631e-02, 1.1344e-02, 1.3774e-02,
          5.6555e-02, 1.3562e-02, 7.9703e-02, 3.2109e-02, 1.0790e-02,
          3.8147e-02, 7.8516e-03, 8.5824e-03, 1.2320e-02, 2.5637e-02,
          5.8011e-02, 8.1349e-03, 1.6014e-02, 8.5961e-03, 1.2562e-04,
          1.2648e-04, 1.3124e-04, 1.0519e-04, 1.7668e-04, 1.3224e-04,
          5.5705e-05],
         [2.1157e-05, 2.4737e-05, 3.3954e-05, 2.8599e-05, 4.1048e-05,
          5.1704e-05, 3.7411e-05, 3.1250e-05, 3.3452e-05, 3.5020e-05,
    

In [1]:
# TODO: try sharing the networks between players while keeping separate buffers?
# TODO: compare 1 vs. 2 minibatches per update.

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for this NFSP run on Leduc Hold'em.
# Un-prefixed keys size/configure the RL side and `sl_`-prefixed keys the SL
# side: the training log reports per-player "RL buffer" capacity equal to
# `replay_buffer_size` and "SL buffer" capacity equal to `sl_replay_buffer_size`.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Buffer capacity is a count, so it must be an int literal. The previous
    # `2e5` is a float and was stored/logged by the config as `200000.0`.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    # This run uses a non-zero noisy sigma with epsilon-greedy set to 0.0
    # (the log shows "Player 0 ε: 0.0000" throughout).
    "noisy_sigma": 0.06,
    "eg_epsilon": 0.0,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # All prioritized-replay exponents are 0.0 here.
    # NOTE(review): presumably this makes sampling uniform -- confirm against the buffer implementation.
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Overridden after construction: the constructor log reports
# "Using default save_intermediate_weights : False" for this field.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x1059a1f00>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [2]:
# Environment modules. Leduc Hold'em is the game used in this cell; the
# matching-pennies names are only referenced by the commented-out alternative.
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import MatchingPenniesGymEnv, env as matching_pennies_env

# Active environment, with the alternative kept for reference.
env = leduc_holdem_v4.env()
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Print the observation space reported for the first player.
print(env.observation_space("player_0"))

# Construct the NFSP agent on CPU using the `config` built in the previous cell.
agent = NFSPDQN(
    env,
    config,
    name="NFSP-LeducHoldem-Noisy",
    device="cpu",
)

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [3]:
# NOTE(review): presumably `checkpoint_interval` is the number of training steps
# between evaluations and `checkpoint_trials` the number of test episodes per
# evaluation — the saved output shows a 10000-episode test summary every 2000
# steps. Confirm against the NFSPDQN implementation.
agent.checkpoint_interval = 2000
agent.checkpoint_trials = 10000
# Long-running call: the saved progress bar shows 50000 training steps, with
# periodic evaluation output printed inline.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/50000 [00:00<19:20, 43.09it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 69/200000
   Player 0 SL buffer: 10/2000000
   Player 1 RL buffer: 59/200000
   Player 1 SL buffer: 4/2000000


  2%|▏         | 1004/50000 [00:30<23:23, 34.91it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 63806/200000
   Player 0 SL buffer: 6890/2000000
   Player 1 RL buffer: 64322/200000
   Player 1 SL buffer: 7497/2000000


  4%|▍         | 1999/50000 [01:00<23:25, 34.15it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 127903/200000
   Player 0 SL buffer: 13604/2000000
   Player 1 RL buffer: 128223/200000
   Player 1 SL buffer: 14402/2000000
P1 SL Buffer Size:  13604
P1 SL buffer distribution [4169. 7357.  669. 1409.]
P1 actions distribution [0.30645398 0.54079682 0.04917671 0.10357248]
P2 SL Buffer Size:  14402
P2 SL buffer distribution [4225. 7959.  608. 1610.]
P2 actions distribution [0.29336203 0.55263158 0.04221636 0.11179003]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2800, 0.6745, 0.0455, 0.0000]])
Player 1 Prediction: tensor([[ 0.9938,  1.4187, -1.0911,  1.1235]])
Player 0 Prediction: tensor([[0.1954, 0.7628, 0.0418, 0.0000]])
Player 1 Prediction: tensor([[ 4.3951,  5.3366, -2.0340,  3.3026]])
Player 0 Prediction: tensor([[0.4007, 0.5268, 0.0725, 0.0000]])
Player 1 Prediction: tensor([[ 5.2755,  6.5749, -4.0791,

  4%|▍         | 1999/50000 [01:20<23:25, 34.15it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 55794
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5742/10000 (57.4%)
    Average reward: +0.848
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4258/10000 (42.6%)
    Average reward: -0.848
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8315 (29.2%)
    Action 1: 13766 (48.3%)
    Action 2: 2850 (10.0%)
    Action 3: 3549 (12.5%)
  Player 1:
    Action 0: 8712 (31.9%)
    Action 1: 15700 (57.5%)
    Action 2: 1326 (4.9%)
    Action 3: 1576 (5.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [8484.5, -8484.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.026 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.985 (max=1.0 for random)
    → Playing nea

  axs[row][col].legend()
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].set_xlim(1, len(values))
  axs[row][col].legend()
  6%|▌         | 3003/50000 [02:01<25:34, 30.64it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 192070/200000
   Player 0 SL buffer: 20199/2000000
   Player 1 RL buffer: 192056/200000
   Player 1 SL buffer: 20860/2000000


  8%|▊         | 3999/50000 [02:35<25:33, 30.00it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 26954/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 27176/2000000
P1 SL Buffer Size:  26954
P1 SL buffer distribution [ 8236. 14397.  1763.  2558.]
P1 actions distribution [0.30555762 0.53413223 0.06540773 0.09490243]
P2 SL Buffer Size:  27176
P2 SL buffer distribution [ 8293. 14331.  1725.  2827.]
P2 actions distribution [0.30515896 0.5273403  0.06347513 0.10402561]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1746, 0.8129, 0.0125, 0.0000]])
Player 1 Prediction: tensor([[ 1.3230,  1.1085, -1.4471,  0.7875]])
Player 0 Prediction: tensor([[0.0000, 0.9096, 0.0312, 0.0592]])
Player 1 Prediction: tensor([[ 4.8197,  5.7805, -2.3817,  3.8012]])
Player 0 Prediction: tensor([[0.9430, 0.0000, 0.0570, 0.0000]])


  8%|▊         | 3999/50000 [02:50<25:33, 30.00it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 49462
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5301/10000 (53.0%)
    Average reward: -1.105
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4699/10000 (47.0%)
    Average reward: +1.105
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4169 (17.5%)
    Action 1: 16282 (68.5%)
    Action 2: 1517 (6.4%)
    Action 3: 1791 (7.5%)
  Player 1:
    Action 0: 14291 (55.6%)
    Action 1: 7438 (28.9%)
    Action 2: 3271 (12.7%)
    Action 3: 703 (2.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-11050.0, 11050.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.814 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.989 (max=1.0 for random)
    → Playing nearly random strat

 10%|█         | 5004/50000 [03:40<24:43, 30.33it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 33470/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 33852/2000000


 12%|█▏        | 6000/50000 [04:22<25:06, 29.21it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 40060/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40311/2000000
P1 SL Buffer Size:  40060
P1 SL buffer distribution [12488. 20950.  3000.  3622.]
P1 actions distribution [0.3117324  0.52296555 0.07488767 0.09041438]
P2 SL Buffer Size:  40311
P2 SL buffer distribution [12583. 20809.  3037.  3882.]
P2 actions distribution [0.31214805 0.51621146 0.07533924 0.09630126]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1926, 0.7929, 0.0145, 0.0000]])
Player 1 Prediction: tensor([[ 0.0186, -0.1028, -1.2825,  0.0452]])
Player 0 Prediction: tensor([[0.0000, 0.7110, 0.0916, 0.1974]])
Player 1 Prediction: tensor([[ 4.6432,  6.1727, -2.4139,  4.0849]])
Player 0 Prediction: tensor([[0.3854, 0.4579, 0.1567, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 6000
Epis

 12%|█▏        | 6000/50000 [04:40<25:06, 29.21it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 47935
Average episode length: 4.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5220/10000 (52.2%)
    Average reward: -0.187
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4780/10000 (47.8%)
    Average reward: +0.187
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2127 (9.4%)
    Action 1: 16974 (74.9%)
    Action 2: 1660 (7.3%)
    Action 3: 1904 (8.4%)
  Player 1:
    Action 0: 21875 (86.6%)
    Action 1: 3395 (13.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1866.0, 1866.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.633 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.569 (max=1.0 for random)
    → Strongly prefers Heads
  Average str

 14%|█▍        | 7003/50000 [05:24<23:22, 30.66it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 46772/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 46712/2000000


 16%|█▌        | 8000/50000 [06:01<29:37, 23.63it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 52903/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 53343/2000000
P1 SL Buffer Size:  52903
P1 SL buffer distribution [16969. 27094.  4137.  4703.]
P1 actions distribution [0.32075686 0.51214487 0.07819972 0.08889855]
P2 SL Buffer Size:  53343
P2 SL buffer distribution [16923. 27245.  4333.  4842.]
P2 actions distribution [0.31724875 0.51075118 0.08122903 0.09077105]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.5723,  2.5163, -0.7040,  1.9326]])
Player 0 Prediction: tensor([[0.0000, 0.7661, 0.0667, 0.1672]])
Player 1 Prediction: tensor([[ 0.4152,  0.8209, -1.4830,  0.7239]])
Player 0 Prediction: tensor([[0.4438, 0.4594, 0.0968, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 58315
Average episode l

 16%|█▌        | 8000/50000 [06:20<29:37, 23.63it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 48396
Average episode length: 4.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5154/10000 (51.5%)
    Average reward: -0.139
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4846/10000 (48.5%)
    Average reward: +0.139
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2287 (10.0%)
    Action 1: 16638 (72.5%)
    Action 2: 1929 (8.4%)
    Action 3: 2091 (9.1%)
  Player 1:
    Action 0: 21647 (85.1%)
    Action 1: 3804 (14.9%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1392.5, 1392.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.668 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.608 (max=1.0 for random)
    → Strongly prefers Heads
  Average st

 18%|█▊        | 9005/50000 [07:11<27:49, 24.56it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 59480/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 60026/2000000


 20%|██        | 10000/50000 [07:55<27:23, 24.35it/s] 

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 65909/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 66512/2000000
P1 SL Buffer Size:  65909
P1 SL buffer distribution [21182. 33490.  5336.  5901.]
P1 actions distribution [0.32138251 0.50812484 0.08096011 0.08953254]
P2 SL Buffer Size:  66512
P2 SL buffer distribution [21302. 34103.  5529.  5578.]
P2 actions distribution [0.32027303 0.51273454 0.08312786 0.08386457]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1485, 0.8470, 0.0045, 0.0000]])
Player 1 Prediction: tensor([[ 1.9417,  3.3663, -0.7288,  2.3240]])
Player 0 Prediction: tensor([[0.9950, 0.0000, 0.0050, 0.0000]])
Player 1 Prediction: tensor([[ 1.5509,  3.1407, -3.4160,  1.3410]])
Player 0 Prediction: tensor([[0.0995, 0.3486, 0.5519, 0.0000]])


 20%|██        | 10000/50000 [08:10<27:23, 24.35it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 55290
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4568/10000 (45.7%)
    Average reward: -0.785
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5432/10000 (54.3%)
    Average reward: +0.785
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8015 (28.7%)
    Action 1: 14805 (53.1%)
    Action 2: 3239 (11.6%)
    Action 3: 1821 (6.5%)
  Player 1:
    Action 0: 7060 (25.8%)
    Action 1: 17223 (62.8%)
    Action 2: 2556 (9.3%)
    Action 3: 571 (2.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7846.0, 7846.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.002 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.925 (max=1.0 for random)
    → Playing near

 22%|██▏       | 11006/50000 [09:03<24:18, 26.73it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 72228/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 72821/2000000


 24%|██▍       | 11999/50000 [09:43<38:53, 16.29it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 78290/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 79201/2000000
P1 SL Buffer Size:  78290
P1 SL buffer distribution [25448. 39455.  6519.  6868.]
P1 actions distribution [0.3250479  0.50395964 0.08326734 0.08772512]
P2 SL Buffer Size:  79201
P2 SL buffer distribution [25572. 40690.  6614.  6325.]
P2 actions distribution [0.32287471 0.51375614 0.08350905 0.0798601 ]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.6293,  1.0014, -0.2749,  0.4436]])
Player 0 Prediction: tensor([[0.0806, 0.9162, 0.0032, 0.0000]])
Player 1 Prediction: tensor([[-0.1302,  0.8135, -2.0038,  0.4742]])
Player 0 Prediction: tensor([[0.0000, 0.5352, 0.0257, 0.4392]])
Player 1 Prediction: tensor([[-2.3985, -1.6215, -2.8956, -2.3110]])
Player 0 Prediction: tensor([[0.1031, 0.29

 24%|██▍       | 11999/50000 [10:00<38:53, 16.29it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 53494
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4833/10000 (48.3%)
    Average reward: -0.825
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5167/10000 (51.7%)
    Average reward: +0.825
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7060 (26.1%)
    Action 1: 14842 (54.9%)
    Action 2: 3094 (11.4%)
    Action 3: 2036 (7.5%)
  Player 1:
    Action 0: 9591 (36.2%)
    Action 1: 13373 (50.5%)
    Action 2: 2688 (10.2%)
    Action 3: 810 (3.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-8254.0, 8254.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.981 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.028 (max=1.0 for random)
    → Playing nea

 26%|██▌       | 13004/50000 [11:06<24:51, 24.80it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 84499/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 85387/2000000


 28%|██▊       | 13998/50000 [11:47<24:18, 24.68it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 90810/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 91727/2000000
P1 SL Buffer Size:  90810
P1 SL buffer distribution [29538. 45709.  7688.  7875.]
P1 actions distribution [0.32527255 0.50334765 0.08466028 0.08671952]
P2 SL Buffer Size:  91727
P2 SL buffer distribution [29620. 47307.  7774.  7026.]
P2 actions distribution [0.32291474 0.51573691 0.08475149 0.07659686]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 3.3602,  2.7599, -0.8758,  2.2154]])
Player 0 Prediction: tensor([[0.0000, 0.9327, 0.0028, 0.0645]])
Player 1 Prediction: tensor([[ 2.6072,  2.4928, -1.2740,  2.3996]])
Player 0 Prediction: tensor([[0.0000, 0.8535, 0.0267, 0.1198]])
Player 1 Prediction: tensor([[ 0.1232, -1.4484, -2.0317, -0.0205]])


 28%|██▊       | 13998/50000 [12:00<24:18, 24.68it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 51476
Average episode length: 5.1 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5008/10000 (50.1%)
    Average reward: -0.671
    Reward range: -7.0 to +6.0
  Player 1:
    Wins: 4992/10000 (49.9%)
    Average reward: +0.671
    Reward range: -6.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6032 (23.4%)
    Action 1: 14628 (56.7%)
    Action 2: 2389 (9.3%)
    Action 3: 2734 (10.6%)
  Player 1:
    Action 0: 11973 (46.6%)
    Action 1: 7691 (29.9%)
    Action 2: 2533 (9.9%)
    Action 3: 3496 (13.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6708.5, 6708.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.954 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.034 (max=1.0 for random)
    → Playing ne

 30%|███       | 15005/50000 [12:57<24:43, 23.59it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 97164/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 97882/2000000


 32%|███▏      | 16000/50000 [13:40<23:29, 24.13it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 103641/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 104074/2000000
P1 SL Buffer Size:  103641
P1 SL buffer distribution [33610. 51805.  8948.  9278.]
P1 actions distribution [0.32429251 0.49985045 0.08633649 0.08952056]
P2 SL Buffer Size:  104074
P2 SL buffer distribution [33456. 53440.  8941.  8237.]
P2 actions distribution [0.32146357 0.51348079 0.08591003 0.07914561]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1013, 0.8976, 0.0011, 0.0000]])
Player 1 Prediction: tensor([[ 0.2153, -0.1568, -1.0992, -0.0085]])
Player 0 Prediction: tensor([[0.0000, 0.8776, 0.0186, 0.1038]])
Player 1 Prediction: tensor([[-3.9014, -4.8112, -1.8431, -2.5627]])


 32%|███▏      | 16000/50000 [13:51<23:29, 24.13it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 53353
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4897/10000 (49.0%)
    Average reward: -0.545
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5103/10000 (51.0%)
    Average reward: +0.545
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7552 (28.2%)
    Action 1: 13755 (51.3%)
    Action 2: 3087 (11.5%)
    Action 3: 2405 (9.0%)
  Player 1:
    Action 0: 9456 (35.6%)
    Action 1: 12650 (47.6%)
    Action 2: 2432 (9.2%)
    Action 3: 2016 (7.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5450.0, 5450.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.009 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.040 (max=1.0 for random)
    → Playing nea

 34%|███▍      | 17002/50000 [14:53<22:59, 23.91it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 109834/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 110324/2000000


 36%|███▌      | 18000/50000 [15:41<23:51, 22.35it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 116082/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 116744/2000000
P1 SL Buffer Size:  116082
P1 SL buffer distribution [37774. 57601. 10252. 10455.]
P1 actions distribution [0.3254079  0.49620958 0.08831688 0.09006564]
P2 SL Buffer Size:  116744
P2 SL buffer distribution [37810. 59330. 10153.  9451.]
P2 actions distribution [0.32387103 0.50820599 0.08696807 0.08095491]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.6231,  2.6715, -0.8754,  1.9466]])
Player 0 Prediction: tensor([[0.0771, 0.9219, 0.0010, 0.0000]])
Player 1 Prediction: tensor([[ 1.8145,  2.5848, -2.4238,  1.6144]])
Player 0 Prediction: tensor([[0.0000, 0.5838, 0.0160, 0.4002]])
Player 1 Prediction: tensor([[ 0.7793,  1.6894, -2.7210,  0.9225]])
Player 0 Prediction: tensor([[0.5007, 

 36%|███▌      | 18000/50000 [16:01<23:51, 22.35it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 49086
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5379/10000 (53.8%)
    Average reward: +0.019
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4621/10000 (46.2%)
    Average reward: -0.019
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2441 (10.6%)
    Action 1: 15999 (69.3%)
    Action 2: 1812 (7.8%)
    Action 3: 2843 (12.3%)
  Player 1:
    Action 0: 21050 (81.0%)
    Action 1: 4941 (19.0%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [194.0, -194.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.710 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.702 (max=1.0 for random)
    → Strongly prefers Heads
  Average st

 38%|███▊      | 19005/50000 [16:56<21:38, 23.87it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 122279/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 123252/2000000


 40%|████      | 20000/50000 [17:40<23:26, 21.33it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 128728/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 129475/2000000
P1 SL Buffer Size:  128728
P1 SL buffer distribution [42365. 62896. 11645. 11822.]
P1 actions distribution [0.32910478 0.48859611 0.09046206 0.09183705]
P2 SL Buffer Size:  129475
P2 SL buffer distribution [42308. 65024. 11484. 10659.]
P2 actions distribution [0.32676578 0.50221278 0.08869666 0.08232477]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7375, 0.2524, 0.0101, 0.0000]])
Player 1 Prediction: tensor([[ 0.1272, -0.2523, -0.6671, -0.1129]])
Player 0 Prediction: tensor([[0.0000, 0.1770, 0.0406, 0.7824]])
Player 1 Prediction: tensor([[-0.7762, -1.8120, -0.7157, -0.3137]])


 40%|████      | 20000/50000 [17:52<23:26, 21.33it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 50055
Average episode length: 5.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5325/10000 (53.2%)
    Average reward: -0.644
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4675/10000 (46.8%)
    Average reward: +0.644
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5151 (20.8%)
    Action 1: 13968 (56.3%)
    Action 2: 2489 (10.0%)
    Action 3: 3197 (12.9%)
  Player 1:
    Action 0: 11570 (45.8%)
    Action 1: 8370 (33.1%)
    Action 2: 2898 (11.5%)
    Action 3: 2412 (9.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6444.5, 6444.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.937 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.044 (max=1.0 for random)
    → Playing n

 42%|████▏     | 21004/50000 [18:58<25:40, 18.82it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 135214/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 135868/2000000


 44%|████▍     | 21999/50000 [19:45<21:35, 21.61it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 141557/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 142420/2000000
P1 SL Buffer Size:  141557
P1 SL buffer distribution [46912. 68440. 13003. 13202.]
P1 actions distribution [0.33140007 0.48348015 0.09185699 0.09326278]
P2 SL Buffer Size:  142420
P2 SL buffer distribution [46376. 71561. 12579. 11904.]
P2 actions distribution [0.32562842 0.50246454 0.08832327 0.08358377]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7638, 0.2272, 0.0090, 0.0000]])
Player 1 Prediction: tensor([[ 2.0299,  2.3067, -1.7591,  1.6610]])
Player 0 Prediction: tensor([[0.9947, 0.0000, 0.0053, 0.0000]])
Player 1 Prediction: tensor([[-0.1840,  0.5255, -2.4578,  0.1539]])
Player 0 Prediction: tensor([[0.0588, 0.0698, 0.8714, 0.0000]])


 44%|████▍     | 21999/50000 [20:02<21:35, 21.61it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 53772
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5445/10000 (54.4%)
    Average reward: -0.389
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4555/10000 (45.6%)
    Average reward: +0.389
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8038 (29.6%)
    Action 1: 13048 (48.1%)
    Action 2: 2471 (9.1%)
    Action 3: 3567 (13.2%)
  Player 1:
    Action 0: 8453 (31.7%)
    Action 1: 11364 (42.6%)
    Action 2: 2369 (8.9%)
    Action 3: 4462 (16.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3894.0, 3894.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.028 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.050 (max=1.0 for random)
    → Playing ne

 46%|████▌     | 23005/50000 [21:05<19:30, 23.06it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 147790/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 149119/2000000


 48%|████▊     | 23999/50000 [21:51<34:58, 12.39it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 154424/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 155594/2000000
P1 SL Buffer Size:  154424
P1 SL buffer distribution [51164. 74329. 14218. 14713.]
P1 actions distribution [0.33132156 0.48133062 0.09207118 0.09527664]
P2 SL Buffer Size:  155594
P2 SL buffer distribution [50545. 78115. 13683. 13251.]
P2 actions distribution [0.32485186 0.50204378 0.08794041 0.08516395]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.6383,  0.2127, -0.5538,  0.1818]])
Player 0 Prediction: tensor([[0.0000, 0.8326, 0.0028, 0.1646]])
Player 1 Prediction: tensor([[ 0.0661,  0.8331, -1.1781,  0.0415]])
Player 0 Prediction: tensor([[9.9918e-01, 0.0000e+00, 8.1848e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.8543, -2.2886, -2.8445, -2.4386]])
Player 0 Prediction: t

 48%|████▊     | 23999/50000 [22:02<34:58, 12.39it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 54732
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5079/10000 (50.8%)
    Average reward: -0.118
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4921/10000 (49.2%)
    Average reward: +0.118
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9902 (35.9%)
    Action 1: 12675 (45.9%)
    Action 2: 2853 (10.3%)
    Action 3: 2158 (7.8%)
  Player 1:
    Action 0: 6699 (24.7%)
    Action 1: 16297 (60.0%)
    Action 2: 2109 (7.8%)
    Action 3: 2039 (7.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1178.0, 1178.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.046 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.940 (max=1.0 for random)
    → Playing nea

 50%|█████     | 25005/50000 [23:17<17:37, 23.63it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 160704/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 161941/2000000


 52%|█████▏    | 25998/50000 [23:59<16:37, 24.07it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 167425/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 168180/2000000
P1 SL Buffer Size:  167425
P1 SL buffer distribution [55464. 80049. 15509. 16403.]
P1 actions distribution [0.33127669 0.47811856 0.09263252 0.09797223]
P2 SL Buffer Size:  168180
P2 SL buffer distribution [54764. 83842. 14649. 14925.]
P2 actions distribution [0.3256273  0.49852539 0.0871031  0.0887442 ]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[1.5136e-01, 8.4861e-01, 2.6657e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.5202,  2.3150, -1.5538,  1.6632]])
Player 0 Prediction: tensor([[9.9996e-01, 0.0000e+00, 3.5407e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.8100,  2.3707, -3.3967,  1.6627]])
Player 0 Prediction: tensor([[0.1056, 0.8718, 0.0227, 0.0000]])
Player 1 P

 52%|█████▏    | 25998/50000 [24:12<16:37, 24.07it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 54355
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5100/10000 (51.0%)
    Average reward: -0.270
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4900/10000 (49.0%)
    Average reward: +0.270
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8867 (32.6%)
    Action 1: 12760 (46.9%)
    Action 2: 2745 (10.1%)
    Action 3: 2838 (10.4%)
  Player 1:
    Action 0: 6649 (24.5%)
    Action 1: 14741 (54.3%)
    Action 2: 2280 (8.4%)
    Action 3: 3475 (12.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2697.0, 2697.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.039 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.975 (max=1.0 for random)
    → Playing n

 54%|█████▍    | 27004/50000 [25:22<16:25, 23.34it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 173860/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 174608/2000000


 56%|█████▌    | 27998/50000 [26:05<15:41, 23.36it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 180162/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 180887/2000000
P1 SL Buffer Size:  180162
P1 SL buffer distribution [59742. 85473. 16756. 18191.]
P1 actions distribution [0.33160156 0.47442302 0.09300518 0.10097024]
P2 SL Buffer Size:  180887
P2 SL buffer distribution [58983. 89530. 15726. 16648.]
P2 actions distribution [0.3260765  0.49494989 0.08693825 0.09203536]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[1.4246e-01, 8.5753e-01, 1.6467e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.1088,  2.8948, -1.7369,  1.7689]])
Player 0 Prediction: tensor([[9.9997e-01, 0.0000e+00, 2.7427e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.4513,  0.7679, -2.6667,  0.6227]])
Player 0 Prediction: tensor([[0.5302, 0.4639, 0.0059, 0.0000]])


 56%|█████▌    | 27998/50000 [26:22<15:41, 23.36it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 54146
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5642/10000 (56.4%)
    Average reward: -0.213
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4358/10000 (43.6%)
    Average reward: +0.213
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8113 (30.0%)
    Action 1: 13229 (49.0%)
    Action 2: 2623 (9.7%)
    Action 3: 3053 (11.3%)
  Player 1:
    Action 0: 9821 (36.2%)
    Action 1: 12691 (46.8%)
    Action 2: 2382 (8.8%)
    Action 3: 2234 (8.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2134.5, 2134.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.026 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nea

 58%|█████▊    | 29003/50000 [27:20<16:39, 21.01it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 186742/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 187339/2000000


 60%|█████▉    | 29999/50000 [28:11<15:42, 21.23it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 193110/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 193945/2000000
P1 SL Buffer Size:  193110
P1 SL buffer distribution [63988. 90981. 17969. 20172.]
P1 actions distribution [0.33135519 0.47113562 0.09305059 0.1044586 ]
P2 SL Buffer Size:  193945
P2 SL buffer distribution [63197. 95410. 16824. 18514.]
P2 actions distribution [0.32585011 0.49194359 0.08674624 0.09546005]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.8210,  0.6164, -0.7845,  0.9469]])
Player 0 Prediction: tensor([[0.0000, 0.8418, 0.0023, 0.1559]])
Player 1 Prediction: tensor([[ 0.5337, -0.1196, -1.6446,  0.8818]])
Player 0 Prediction: tensor([[0.0000, 0.7623, 0.0040, 0.2337]])
Player 1 Prediction: tensor([[-0.7170, -1.2212, -1.9392,  0.5572]])


 60%|█████▉    | 29999/50000 [28:22<15:42, 21.23it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 53463
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5579/10000 (55.8%)
    Average reward: -0.282
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4421/10000 (44.2%)
    Average reward: +0.282
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7606 (28.4%)
    Action 1: 12662 (47.3%)
    Action 2: 2300 (8.6%)
    Action 3: 4197 (15.7%)
  Player 1:
    Action 0: 8801 (33.0%)
    Action 1: 10600 (39.7%)
    Action 2: 2387 (8.9%)
    Action 3: 4910 (18.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2815.5, 2815.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.027 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.057 (max=1.0 for random)
    → Playing ne

 62%|██████▏   | 31005/50000 [29:28<14:26, 21.92it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 199903/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 200545/2000000


 64%|██████▍   | 32000/50000 [30:14<17:17, 17.35it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 206705/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 207179/2000000
P1 SL Buffer Size:  206705
P1 SL buffer distribution [68373. 96760. 19194. 22378.]
P1 actions distribution [0.33077574 0.46810672 0.09285697 0.10826056]
P2 SL Buffer Size:  207179
P2 SL buffer distribution [ 67582. 100966.  17972.  20659.]
P2 actions distribution [0.32620101 0.48733704 0.08674624 0.0997157 ]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[3.2873e-01, 6.7097e-01, 2.9666e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.5712,  2.0326, -1.7051,  1.6029]])
Player 0 Prediction: tensor([[9.9943e-01, 0.0000e+00, 5.7112e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.6712,  2.1692, -3.5326,  2.0472]])
Player 0 Prediction: tensor([[0.0565, 0.1120, 0.8315, 0.0000]])

📊 TES

 64%|██████▍   | 32000/50000 [30:33<17:17, 17.35it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 51381
Average episode length: 5.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5386/10000 (53.9%)
    Average reward: +0.358
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4614/10000 (46.1%)
    Average reward: -0.358
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3325 (13.5%)
    Action 1: 14808 (60.3%)
    Action 2: 2357 (9.6%)
    Action 3: 4061 (16.5%)
  Player 1:
    Action 0: 19788 (73.8%)
    Action 1: 7042 (26.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3580.0, -3580.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.831 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.830 (max=1.0 for random)
    → Mixed strategy
  Average strategy

 66%|██████▌   | 33005/50000 [31:28<12:13, 23.17it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 213093/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 213567/2000000


 68%|██████▊   | 34000/50000 [32:12<11:57, 22.31it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 219604/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 219956/2000000
P1 SL Buffer Size:  219604
P1 SL buffer distribution [ 72347. 102185.  20288.  24784.]
P1 actions distribution [0.329443   0.46531484 0.09238447 0.11285769]
P2 SL Buffer Size:  219956
P2 SL buffer distribution [ 71768. 106102.  19036.  23050.]
P2 actions distribution [0.32628344 0.48237829 0.08654458 0.10479369]


 68%|██████▊   | 34000/50000 [32:23<11:57, 22.31it/s]

   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.7570,  0.5497, -0.4326,  1.0137]])
Player 0 Prediction: tensor([[0.0000e+00, 9.4541e-01, 4.9429e-05, 5.4538e-02]])
Player 1 Prediction: tensor([[ 0.3891, -0.1232, -1.6774,  0.8046]])
Player 0 Prediction: tensor([[0.0000, 0.6672, 0.0012, 0.3316]])
Player 1 Prediction: tensor([[-3.7592, -3.0438, -1.9867, -1.4030]])

📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 52568
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5735/10000 (57.4%)
    Average reward: -0.356
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4265/10000 (42.6%)
    Average reward: +0.356
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7310 (28.1%)
    Action 1: 12288 (47.2%)
    Action 2: 1947 (7.5%)
    Action 3: 4502 (17.3%)
  Player 1:
    Ac

 70%|███████   | 35003/50000 [33:29<10:58, 22.77it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 226070/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 226395/2000000


 72%|███████▏  | 36000/50000 [34:15<10:06, 23.08it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 232734/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 233216/2000000
P1 SL Buffer Size:  232734
P1 SL buffer distribution [ 76765. 107532.  21452.  26985.]
P1 actions distribution [0.32984007 0.46203821 0.0921739  0.11594782]
P2 SL Buffer Size:  233216
P2 SL buffer distribution [ 76164. 111881.  20190.  24981.]
P2 actions distribution [0.32658137 0.47973124 0.0865721  0.10711529]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8492, 0.1484, 0.0024, 0.0000]])
Player 1 Prediction: tensor([[ 1.6797,  1.8261, -1.1415,  1.4476]])
Player 0 Prediction: tensor([[0.5878, 0.4092, 0.0030, 0.0000]])
Player 1 Prediction: tensor([[ 0.1823,  0.2133, -1.7178,  0.6029]])
Player 0 Prediction: tensor([[0.0000, 0.8972, 0.0233, 0.0795]])
Player 1 Prediction: tensor([[ 0.1

 72%|███████▏  | 36000/50000 [34:34<10:06, 23.08it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 51756
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5381/10000 (53.8%)
    Average reward: +0.358
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4619/10000 (46.2%)
    Average reward: -0.358
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3459 (14.0%)
    Action 1: 14688 (59.4%)
    Action 2: 2329 (9.4%)
    Action 3: 4256 (17.2%)
  Player 1:
    Action 0: 19700 (72.9%)
    Action 1: 7324 (27.1%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3577.5, -3577.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.843 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.843 (max=1.0 for random)
    → Mixed strategy
  Average strategy

 74%|███████▍  | 37006/50000 [35:26<07:54, 27.36it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 239267/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 239864/2000000


 76%|███████▌  | 37999/50000 [36:06<08:18, 24.06it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 245587/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 246335/2000000
P1 SL Buffer Size:  245587
P1 SL buffer distribution [ 81456. 112407.  22637.  29087.]
P1 actions distribution [0.33167879 0.45770745 0.09217507 0.11843868]
P2 SL Buffer Size:  246335
P2 SL buffer distribution [ 80248. 118208.  21221.  26658.]
P2 actions distribution [0.32576776 0.47986685 0.08614691 0.10821848]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[4.2426e-01, 5.7555e-01, 1.8438e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.6156,  1.2380, -1.2690,  0.1581]])
Player 0 Prediction: tensor([[9.9959e-01, 0.0000e+00, 4.0652e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.1330, -3.1056, -2.8688, -2.5070]])
Player 0 Prediction: tensor([[0.0000, 0.4792, 0.0012, 0.5196]])
Pl

 76%|███████▌  | 37999/50000 [36:24<08:18, 24.06it/s]


📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 52192
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5348/10000 (53.5%)
    Average reward: +0.373
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4652/10000 (46.5%)
    Average reward: -0.373
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3728 (14.9%)
    Action 1: 14419 (57.7%)
    Action 2: 2431 (9.7%)
    Action 3: 4417 (17.7%)
  Player 1:
    Action 0: 19487 (71.7%)
    Action 1: 7710 (28.3%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3726.5, -3726.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.867 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.860 (max=1.0 for random)
    → Mixed strategy
  Average strategy

 78%|███████▊  | 39003/50000 [37:16<07:16, 25.19it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 251952/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 252854/2000000


 80%|███████▉  | 39999/50000 [37:59<07:13, 23.05it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 258330/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 259570/2000000
P1 SL Buffer Size:  258330
P1 SL buffer distribution [ 86089. 117303.  23781.  31157.]
P1 actions distribution [0.33325204 0.45408199 0.09205667 0.1206093 ]
P2 SL Buffer Size:  259570
P2 SL buffer distribution [ 84686. 124096.  22422.  28366.]
P2 actions distribution [0.32625496 0.47808298 0.08638132 0.10928073]
   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.0065,  0.8147, -0.7819,  0.9083]])
Player 0 Prediction: tensor([[0.0000, 0.1864, 0.0269, 0.7867]])
Player 1 Prediction: tensor([[ 0.7505,  0.6176, -1.4258,  0.9527]])
Player 0 Prediction: tensor([[0.0000, 0.0226, 0.0221, 0.9553]])


 80%|███████▉  | 39999/50000 [38:14<07:13, 23.05it/s]


📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 52040
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5718/10000 (57.2%)
    Average reward: -0.274
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4282/10000 (42.8%)
    Average reward: +0.274
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5119 (19.9%)
    Action 1: 13623 (53.1%)
    Action 2: 2152 (8.4%)
    Action 3: 4772 (18.6%)
  Player 1:
    Action 0: 11980 (45.4%)
    Action 1: 8414 (31.9%)
    Action 2: 2927 (11.1%)
    Action 3: 3053 (11.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2743.0, 2743.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.949 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.043 (max=1.0 for random)
    → Playing n

 82%|████████▏ | 41003/50000 [39:11<06:20, 23.66it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 264776/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 266030/2000000


 84%|████████▍ | 41999/50000 [39:53<05:38, 23.63it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 271364/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 272497/2000000
P1 SL Buffer Size:  271364
P1 SL buffer distribution [ 90712. 122302.  25004.  33346.]
P1 actions distribution [0.33428163 0.45069353 0.09214192 0.12288292]
P2 SL Buffer Size:  272497
P2 SL buffer distribution [ 88679. 130017.  23574.  30227.]
P2 actions distribution [0.32543111 0.47713186 0.08651104 0.11092599]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.8739,  1.0308, -0.5833,  1.1111]])
Player 0 Prediction: tensor([[2.3193e-02, 9.7680e-01, 6.4893e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.0800,  0.5301, -2.1968,  0.8492]])
Player 0 Prediction: tensor([[0.0000e+00, 5.4190e-01, 4.6764e-05, 4.5806e-01]])
Player 1 Prediction: tensor([[ 4.0430,  5.4667, -2.7535,  4.0284]]

 84%|████████▍ | 41999/50000 [40:04<05:38, 23.63it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 53144
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5891/10000 (58.9%)
    Average reward: -0.172
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4109/10000 (41.1%)
    Average reward: +0.172
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7593 (28.8%)
    Action 1: 12804 (48.6%)
    Action 2: 2166 (8.2%)
    Action 3: 3768 (14.3%)
  Player 1:
    Action 0: 8083 (30.1%)
    Action 1: 11470 (42.8%)
    Action 2: 2773 (10.3%)
    Action 3: 4487 (16.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1716.0, 1716.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.023 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.046 (max=1.0 for random)
    → Playing n

 86%|████████▌ | 43004/50000 [41:06<04:47, 24.30it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 277956/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 278977/2000000


 88%|████████▊ | 43998/50000 [41:51<04:27, 22.48it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 284136/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 285554/2000000
P1 SL Buffer Size:  284136
P1 SL buffer distribution [ 95092. 127254.  26247.  35543.]
P1 actions distribution [0.33467072 0.447863   0.09237478 0.12509151]
P2 SL Buffer Size:  285554
P2 SL buffer distribution [ 92762. 135992.  24721.  32079.]
P2 actions distribution [0.32484924 0.47623917 0.08657207 0.11233952]
   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.8136,  2.2219, -0.9083,  1.5354]])
Player 0 Prediction: tensor([[2.2399e-02, 9.7759e-01, 6.0416e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.3179,  2.7596, -2.4163,  2.3236]])
Player 0 Prediction: tensor([[0.0000e+00, 5.0868e-01, 4.0941e-05, 4.9128e-01]])
Player 1 Prediction: tensor([[-1.3896, -3.3907, -2.6126,  0.4007]]

 88%|████████▊ | 43998/50000 [42:04<04:27, 22.48it/s]


📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 54367
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5702/10000 (57.0%)
    Average reward: -0.226
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4298/10000 (43.0%)
    Average reward: +0.226
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8370 (30.9%)
    Action 1: 12802 (47.2%)
    Action 2: 2615 (9.6%)
    Action 3: 3343 (12.3%)
  Player 1:
    Action 0: 8189 (30.1%)
    Action 1: 12975 (47.6%)
    Action 2: 2286 (8.4%)
    Action 3: 3787 (13.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2257.5, 2257.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.035 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.031 (max=1.0 for random)
    → Playing ne

 90%|█████████ | 45003/50000 [43:05<03:41, 22.53it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 290792/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 291871/2000000


 92%|█████████▏| 45999/50000 [43:50<03:02, 21.91it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 297427/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 298021/2000000
P1 SL Buffer Size:  297427
P1 SL buffer distribution [ 99836. 132363.  27571.  37657.]
P1 actions distribution [0.33566556 0.44502685 0.09269838 0.12660922]
P2 SL Buffer Size:  298021
P2 SL buffer distribution [ 96728. 141644.  25772.  33877.]
P2 actions distribution [0.32456773 0.47528194 0.08647713 0.1136732 ]
   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8826, 0.1161, 0.0013, 0.0000]])
Player 1 Prediction: tensor([[ 1.9712,  2.1587, -1.3055,  1.4806]])
Player 0 Prediction: tensor([[0.6379, 0.3602, 0.0019, 0.0000]])
Player 1 Prediction: tensor([[ 2.4835,  2.4098, -2.4032,  3.0783]])
Player 0 Prediction: tensor([[0.0000, 0.4500, 0.0537, 0.4962]])


 92%|█████████▏| 45999/50000 [44:05<03:02, 21.91it/s]


📊 TEST RESULTS SUMMARY
Training step: 46000
Episodes completed: 10000/10000
Total steps: 53576
Average episode length: 5.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5653/10000 (56.5%)
    Average reward: -0.270
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4347/10000 (43.5%)
    Average reward: +0.270
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6336 (23.9%)
    Action 1: 13922 (52.5%)
    Action 2: 2478 (9.3%)
    Action 3: 3768 (14.2%)
  Player 1:
    Action 0: 10921 (40.3%)
    Action 1: 11165 (41.2%)
    Action 2: 2302 (8.5%)
    Action 3: 2684 (9.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2702.0, 2702.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.981 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.055 (max=1.0 for random)
    → Playing ne

 94%|█████████▍| 47005/50000 [45:09<02:10, 22.89it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 303897/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 304433/2000000


 96%|█████████▌| 47998/50000 [45:54<01:29, 22.40it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 310303/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 310844/2000000
P1 SL Buffer Size:  310303
P1 SL buffer distribution [104489. 137295.  28793.  39726.]
P1 actions distribution [0.33673216 0.44245463 0.09278995 0.12802325]
P2 SL Buffer Size:  310844
P2 SL buffer distribution [100891. 147275.  26986.  35692.]
P2 actions distribution [0.32457117 0.47379071 0.08681525 0.11482287]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.6247,  2.2591, -0.3747,  1.4098]])
Player 0 Prediction: tensor([[6.4352e-02, 9.3535e-01, 2.9336e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.2301,  2.5705, -2.6263,  2.1838]])
Player 0 Prediction: tensor([[0.0000e+00, 8.6812e-01, 7.2262e-04, 1.3115e-01]])
Player 1 Prediction: tensor([[-2.0832, -2.0580, -2.8285,  0.2926]]

 96%|█████████▌| 47998/50000 [46:05<01:29, 22.40it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 55502
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5722/10000 (57.2%)
    Average reward: -0.147
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4278/10000 (42.8%)
    Average reward: +0.147
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9312 (33.0%)
    Action 1: 12771 (45.3%)
    Action 2: 2838 (10.1%)
    Action 3: 3261 (11.6%)
  Player 1:
    Action 0: 7480 (27.4%)
    Action 1: 14306 (52.4%)
    Action 2: 2294 (8.4%)
    Action 3: 3240 (11.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1472.5, 1472.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.045 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.000 (max=1.0 for random)
    → Playing n

 98%|█████████▊| 49003/50000 [47:08<00:43, 22.98it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 316828/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 317374/2000000


100%|██████████| 50000/50000 [47:54<00:00, 17.39it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[8.5366e-02, 9.1463e-01, 1.2075e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.0980,  0.9251, -1.1830,  0.2591]])
Player 0 Prediction: tensor([[1.0000e+00, 0.0000e+00, 4.3140e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 4.7427,  5.7934, -3.0773,  4.8057]])
Player 0 Prediction: tensor([[0.3256, 0.6660, 0.0084, 0.0000]])
Player 1 Prediction: tensor([[ 7.0234,  8.7366, -4.9536,  7.8181]])

📊 TEST RESULTS SUMMARY
Training step: 49999
Episodes completed: 10000/10000
Total steps: 54005
Average episode length: 5.4 steps
Episode length range: 2 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5305/10000 (53.0%)
    Average reward: -0.252
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4695/10000 (46.9%)
    Average reward: +0.252
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8204 (30.2%)
    Action 1: 13036

In [4]:
# Open question: share the networks between players but keep separate replay buffers?
# Experiment to run: compare num_minibatches = 1 vs 2.

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters handed to NFSPDQNConfig for the Leduc Hold'em run.
# Keys without a prefix configure the RL (DQN) side; keys prefixed with
# "sl_" configure the supervised-learning side (presumably the
# average-strategy network -- confirm against NFSPDQNConfig).
config_dict = {
    # --- Run-level settings ---
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    # --- RL optimizer, loss and replay buffer ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Must be an int: the previous float literal 2e5 reached the config as
    # 200000.0 (visible in the constructor log), unlike every other size here.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    # --- RL network architecture ---
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    # --- Epsilon-greedy exploration ---
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- SL optimizer, loss and replay buffer ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    # --- SL network architecture ---
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- Prioritized replay / n-step / distributional / dueling options ---
    "per_alpha": 0.0,
    "per_beta": 0.0,
    "per_beta_final": 0.0,
    "per_epsilon": 0.00001,
    "n_step": 3,
    "atom_size": 1,
    "dueling": False,
    # --- Gradient clipping for both networks ---
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
# Wrap the hyperparameter dict and the Leduc Hold'em game settings in the
# agent's config object.
config = NFSPDQNConfig(game_config=LeducHoldemConfig(), config_dict=config_dict)
# The constructor log reports the default (False) for this flag, so it is
# switched on explicitly after construction.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x358adec20>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [5]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Environment under test: PettingZoo's Leduc Hold'em.
# For the Matching Pennies variant, swap in:
#   env = matching_pennies_env(render_mode="human", max_cycles=1)
env = leduc_holdem_v4.env()

# Sanity check: show the observation space the first player sees.
print(env.observation_space("player_0"))

# NFSP agent for this environment/config pair, pinned to the CPU.
# The name presumably labels saved artefacts/logs -- verify in NFSPDQN.
agent = NFSPDQN(
    env,
    config,
    name="NFSP-LeducHoldem-NStep",
    device="cpu",
)

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [6]:
# Evaluation schedule, then launch training.
# Judging by the logged output, a test run happens every `checkpoint_interval`
# training steps (2000, 4000, ...) and plays `checkpoint_trials` episodes
# ("Episodes completed: 10000/10000") -- confirm against NFSPDQN.
agent.checkpoint_interval = 2000
agent.checkpoint_trials = 10000
# NOTE(review): the evaluation summaries print strategy entropies above their
# own stated bound (e.g. "1.049 (max=1.0 for random)"), so the normalisation or
# the label in the reporting code looks off -- verify before relying on it.
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 5/50000 [00:00<16:54, 49.28it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 65/200000
   Player 0 SL buffer: 4/2000000
   Player 1 RL buffer: 59/200000
   Player 1 SL buffer: 1/2000000


  2%|▏         | 1004/50000 [00:28<25:34, 31.93it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 63098/200000
   Player 0 SL buffer: 6688/2000000
   Player 1 RL buffer: 65024/200000
   Player 1 SL buffer: 6901/2000000


  4%|▍         | 1998/50000 [01:04<31:01, 25.79it/s]

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 125693/200000
   Player 0 SL buffer: 12855/2000000
   Player 1 RL buffer: 130429/200000
   Player 1 SL buffer: 13131/2000000
P1 SL Buffer Size:  12855
P1 SL buffer distribution [4030. 4714.  843. 3268.]
P1 actions distribution [0.31349669 0.36670556 0.0655776  0.25422015]
P2 SL Buffer Size:  13131
P2 SL buffer distribution [4348. 3808. 1258. 3717.]
P2 actions distribution [0.33112482 0.29000076 0.09580382 0.2830706 ]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.0981,  1.1902, -0.5269,  0.8025]])
Player 0 Prediction: tensor([[0.1098, 0.8221, 0.0681, 0.0000]])
Player 1 Prediction: tensor([[ 1.8187,  2.3780, -2.0367,  1.9532]])
Player 0 Prediction: tensor([[0.0000, 0.5051, 0.0894, 0.4055]])
Player 1 Prediction: tensor([[ 1.1754,  2.6243, -2.9960,  1.7406]])
Player 0 Prediction: tensor([[0.8088, 0.0000, 0.1912

  4%|▍         | 1998/50000 [01:15<31:01, 25.79it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 48717
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5071/10000 (50.7%)
    Average reward: -1.204
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4929/10000 (49.3%)
    Average reward: +1.204
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7388 (30.2%)
    Action 1: 10083 (41.2%)
    Action 2: 1818 (7.4%)
    Action 3: 5161 (21.1%)
  Player 1:
    Action 0: 8809 (36.3%)
    Action 1: 7357 (30.3%)
    Action 2: 2342 (9.7%)
    Action 3: 5759 (23.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-12040.0, 12040.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.049 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.053 (max=1.0 for random)
    → Playing ne

  6%|▌         | 3006/50000 [02:15<28:11, 27.78it/s]   

   Player 0 ε: 0.0011 → 0.0011

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 188396/200000
   Player 0 SL buffer: 18991/2000000
   Player 1 RL buffer: 195726/200000
   Player 1 SL buffer: 19770/2000000


  8%|▊         | 3999/50000 [02:53<27:51, 27.52it/s]

   Player 0 ε: 0.0009 → 0.0009

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 25011/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 26015/2000000
P1 SL Buffer Size:  25011
P1 SL buffer distribution [7851. 9379. 2153. 5628.]
P1 actions distribution [0.31390188 0.374995   0.08608212 0.22502099]
P2 SL Buffer Size:  26015
P2 SL buffer distribution [8646. 7890. 3256. 6223.]
P2 actions distribution [0.33234672 0.30328657 0.12515856 0.23920815]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.0612, -0.1784, -0.6011,  0.0203]])
Player 0 Prediction: tensor([[0.0000, 0.7450, 0.0496, 0.2054]])
Player 1 Prediction: tensor([[ 2.2879,  2.6934, -1.0574,  1.4925]])
Player 0 Prediction: tensor([[0.1034, 0.8327, 0.0639, 0.0000]])
Player 1 Prediction: tensor([[ 5.4927,  5.0451, -2.9821,  3.9391]])


  8%|▊         | 3999/50000 [03:05<27:51, 27.52it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 40420
Average episode length: 4.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5592/10000 (55.9%)
    Average reward: -0.916
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4408/10000 (44.1%)
    Average reward: +0.916
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5725 (29.4%)
    Action 1: 8327 (42.8%)
    Action 2: 1953 (10.0%)
    Action 3: 3461 (17.8%)
  Player 1:
    Action 0: 7504 (35.8%)
    Action 1: 6181 (29.5%)
    Action 2: 3205 (15.3%)
    Action 3: 4064 (19.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-9163.5, 9163.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.050 (max=1.0 for random)
    → Playing nea

 10%|█         | 5004/50000 [03:54<27:14, 27.53it/s]   

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 31032/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 32227/2000000


 12%|█▏        | 6000/50000 [04:31<28:02, 26.15it/s]

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 36818/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 38554/2000000
P1 SL Buffer Size:  36818
P1 SL buffer distribution [11417. 13630.  3966.  7805.]
P1 actions distribution [0.31009289 0.37019936 0.10771905 0.2119887 ]
P2 SL Buffer Size:  38554
P2 SL buffer distribution [12755. 11914.  5272.  8613.]
P2 actions distribution [0.33083467 0.30902111 0.13674327 0.22340094]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8945, 0.0845, 0.0210, 0.0000]])
Player 1 Prediction: tensor([[ 0.1911,  0.0530, -0.9429,  0.2719]])
Player 0 Prediction: tensor([[0.0000, 0.5529, 0.0923, 0.3548]])
Player 1 Prediction: tensor([[ 0.0521,  0.4460, -0.9701, -0.4915]])
Player 0 Prediction: tensor([[0.9451, 0.0000, 0.0549, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 6000
Epis

 12%|█▏        | 6000/50000 [04:45<28:02, 26.15it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 48724
Average episode length: 4.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4353/10000 (43.5%)
    Average reward: -0.148
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5647/10000 (56.5%)
    Average reward: +0.148
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5656 (23.3%)
    Action 1: 11145 (45.9%)
    Action 2: 3444 (14.2%)
    Action 3: 4013 (16.5%)
  Player 1:
    Action 0: 16101 (65.8%)
    Action 1: 8365 (34.2%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1478.5, 1478.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.005 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.927 (max=1.0 for random)
    → Playing nearly ra

 14%|█▍        | 7004/50000 [05:35<27:09, 26.39it/s]   

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 42481/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 44575/2000000


 16%|█▌        | 8000/50000 [06:14<26:42, 26.21it/s]

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 48078/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 50862/2000000
P1 SL Buffer Size:  48078
P1 SL buffer distribution [14794. 17074.  6203. 10007.]
P1 actions distribution [0.30770831 0.35513125 0.12901951 0.20814094]
P2 SL Buffer Size:  50862
P2 SL buffer distribution [16409. 16304.  7392. 10757.]
P2 actions distribution [0.32261806 0.32055365 0.14533443 0.21149385]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-1.0335, -1.4886, -0.5027, -0.5863]])


 16%|█▌        | 8000/50000 [06:25<26:42, 26.21it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 36619
Average episode length: 3.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5635/10000 (56.4%)
    Average reward: -0.556
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4365/10000 (43.6%)
    Average reward: +0.556
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6473 (35.8%)
    Action 1: 6573 (36.3%)
    Action 2: 2735 (15.1%)
    Action 3: 2305 (12.7%)
  Player 1:
    Action 0: 3628 (19.6%)
    Action 1: 8163 (44.0%)
    Action 2: 3413 (18.4%)
    Action 3: 3329 (18.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5563.0, 5563.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.982 (max=1.0 for random)
    → Playing nea

 18%|█▊        | 9004/50000 [07:17<26:58, 25.33it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 53717/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 57288/2000000


 20%|█▉        | 9999/50000 [07:57<26:16, 25.38it/s]

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 59412/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 63575/2000000
P1 SL Buffer Size:  59412
P1 SL buffer distribution [17999. 20495.  8915. 12003.]
P1 actions distribution [0.30295227 0.34496398 0.15005386 0.20202989]
P2 SL Buffer Size:  63575
P2 SL buffer distribution [19599. 21161.  9838. 12977.]
P2 actions distribution [0.30828156 0.33285096 0.15474636 0.20412112]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9108, 0.0643, 0.0249, 0.0000]])
Player 1 Prediction: tensor([[-0.8228, -1.4471, -1.0146, -0.7227]])
Player 0 Prediction: tensor([[0.0000, 0.4443, 0.0564, 0.4993]])
Player 1 Prediction: tensor([[-0.9886, -2.2841, -0.9839, -0.7153]])

📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 34904
Average episod

 20%|█▉        | 9999/50000 [08:15<26:16, 25.38it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 29497
Average episode length: 2.9 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5149/10000 (51.5%)
    Average reward: +0.174
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4851/10000 (48.5%)
    Average reward: -0.174
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 2045 (13.6%)
    Action 1: 4872 (32.5%)
    Action 2: 4161 (27.7%)
    Action 3: 3923 (26.2%)
  Player 1:
    Action 0: 4127 (28.5%)
    Action 1: 5056 (34.9%)
    Action 2: 3057 (21.1%)
    Action 3: 2256 (15.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1745.0, -1745.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.919 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.046 (max=1.0 for random)
    → Playing ne

 22%|██▏       | 11005/50000 [09:01<25:33, 25.43it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 65021/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 69789/2000000


 24%|██▍       | 11998/50000 [09:40<24:51, 25.48it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 70625/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 75994/2000000
P1 SL Buffer Size:  70625
P1 SL buffer distribution [20484. 24397. 11841. 13903.]
P1 actions distribution [0.29003894 0.34544425 0.16766018 0.19685664]
P2 SL Buffer Size:  75994
P2 SL buffer distribution [22456. 26061. 12532. 14945.]
P2 actions distribution [0.29549701 0.34293497 0.16490776 0.19666026]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8714, 0.0945, 0.0341, 0.0000]])
Player 1 Prediction: tensor([[ 1.0890,  1.8367, -0.9485,  1.0866]])
Player 0 Prediction: tensor([[0.5953, 0.3167, 0.0880, 0.0000]])
Player 1 Prediction: tensor([[ 1.4186,  2.5770, -1.8409,  1.6362]])
Player 0 Prediction: tensor([[0.0000, 0.1509, 0.0870, 0.7621]])
Player 1 Prediction: tensor([[ 1.1527,  2.4093

 24%|██▍       | 11998/50000 [09:55<24:51, 25.48it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 44677
Average episode length: 4.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 3973/10000 (39.7%)
    Average reward: +0.140
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 6027/10000 (60.3%)
    Average reward: -0.140
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4376 (19.4%)
    Action 1: 9095 (40.3%)
    Action 2: 4600 (20.4%)
    Action 3: 4502 (19.9%)
  Player 1:
    Action 0: 14166 (64.1%)
    Action 1: 7938 (35.9%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1398.5, -1398.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.987 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.942 (max=1.0 for random)
    → Playing nearly ra

 26%|██▌       | 13003/50000 [10:45<24:59, 24.68it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 76383/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 82011/2000000


 28%|██▊       | 13999/50000 [11:25<23:11, 25.88it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 82072/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 88079/2000000
P1 SL Buffer Size:  82072
P1 SL buffer distribution [22602. 28856. 14998. 15616.]
P1 actions distribution [0.27539234 0.35159372 0.18274198 0.19027196]
P2 SL Buffer Size:  88079
P2 SL buffer distribution [25346. 30379. 15580. 16774.]
P2 actions distribution [0.28776439 0.34490628 0.17688666 0.19044267]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-0.0360, -0.3641, -0.4552,  0.0345]])
Player 0 Prediction: tensor([[0.0000, 0.3770, 0.0374, 0.5857]])
Player 1 Prediction: tensor([[-0.6918, -1.7028, -0.9971, -0.7395]])
Player 0 Prediction: tensor([[0.0000, 0.4852, 0.1590, 0.3557]])
Player 1 Prediction: tensor([[-3.4351, -5.1402, -1.9966, -3.2314]])


 28%|██▊       | 13999/50000 [11:35<23:11, 25.88it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 33277
Average episode length: 3.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5592/10000 (55.9%)
    Average reward: -0.326
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4408/10000 (44.1%)
    Average reward: +0.326
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3139 (19.2%)
    Action 1: 5988 (36.6%)
    Action 2: 3058 (18.7%)
    Action 3: 4159 (25.4%)
  Player 1:
    Action 0: 6856 (40.5%)
    Action 1: 2853 (16.8%)
    Action 2: 3571 (21.1%)
    Action 3: 3653 (21.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3256.5, 3256.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.988 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.961 (max=1.0 for random)
    → Playing ne

 30%|███       | 15005/50000 [12:28<23:01, 25.33it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 87827/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 94325/2000000


 32%|███▏      | 15998/50000 [13:08<25:55, 21.86it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 93672/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 100576/2000000
P1 SL Buffer Size:  93672
P1 SL buffer distribution [24758. 33304. 18374. 17236.]
P1 actions distribution [0.26430524 0.35553847 0.19615253 0.18400376]
P2 SL Buffer Size:  100576
P2 SL buffer distribution [28359. 34924. 18821. 18472.]
P2 actions distribution [0.28196588 0.3472399  0.18713212 0.18366211]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7388, 0.2159, 0.0452, 0.0000]])
Player 1 Prediction: tensor([[ 1.2942,  1.5134, -0.8833,  1.0787]])
Player 0 Prediction: tensor([[0.4574, 0.2712, 0.2714, 0.0000]])
Player 1 Prediction: tensor([[ 1.6084,  1.8770, -1.8545,  1.6164]])
Player 0 Prediction: tensor([[0.0465, 0.1577, 0.7958, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 16000

 32%|███▏      | 15998/50000 [13:26<25:55, 21.86it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 42894
Average episode length: 4.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 3874/10000 (38.7%)
    Average reward: +0.229
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 6126/10000 (61.3%)
    Average reward: -0.229
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3499 (16.0%)
    Action 1: 8818 (40.3%)
    Action 2: 4919 (22.5%)
    Action 3: 4622 (21.1%)
  Player 1:
    Action 0: 13802 (65.6%)
    Action 1: 7234 (34.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [2289.5, -2289.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.951 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.928 (max=1.0 for random)
    → Playing nearly ra

 34%|███▍      | 17003/50000 [14:16<22:42, 24.22it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 99744/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 106684/2000000


 36%|███▌      | 17998/50000 [14:58<22:27, 23.76it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 105820/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 112726/2000000
P1 SL Buffer Size:  105820
P1 SL buffer distribution [27788. 37351. 21964. 18717.]
P1 actions distribution [0.26259686 0.3529673  0.20756001 0.17687583]
P2 SL Buffer Size:  112726
P2 SL buffer distribution [31105. 39342. 22060. 20219.]
P2 actions distribution [0.27593457 0.34900555 0.19569576 0.17936412]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2277, 0.7695, 0.0028, 0.0000]])
Player 1 Prediction: tensor([[-1.0355, -2.7943, -0.9955, -1.0524]])

📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 28834
Average episode length: 2.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5519/10000 (55.2%)
    Average rewa

 36%|███▌      | 17998/50000 [15:16<22:27, 23.76it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 42217
Average episode length: 4.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 3719/10000 (37.2%)
    Average reward: +0.234
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 6281/10000 (62.8%)
    Average reward: -0.234
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3591 (16.6%)
    Action 1: 8262 (38.1%)
    Action 2: 5221 (24.1%)
    Action 3: 4606 (21.2%)
  Player 1:
    Action 0: 13275 (64.6%)
    Action 1: 7262 (35.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [2335.5, -2335.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.960 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.937 (max=1.0 for random)
    → Playing nearly ra

 38%|███▊      | 19003/50000 [16:06<20:33, 25.14it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 111711/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 118948/2000000


 40%|███▉      | 19999/50000 [16:48<21:16, 23.50it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 117520/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 125154/2000000
P1 SL Buffer Size:  117520
P1 SL buffer distribution [30445. 41645. 25489. 19941.]
P1 actions distribution [0.25906229 0.35436521 0.21689074 0.16968176]
P2 SL Buffer Size:  125154
P2 SL buffer distribution [33950. 44000. 25360. 21844.]
P2 actions distribution [0.2712658  0.35156687 0.20263036 0.17453697]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2278, 0.7699, 0.0023, 0.0000]])
Player 1 Prediction: tensor([[ 0.6951,  1.0573, -1.0019,  0.6878]])
Player 0 Prediction: tensor([[0.9978, 0.0000, 0.0022, 0.0000]])
Player 1 Prediction: tensor([[ 1.2462,  2.4815, -3.2528,  1.5230]])
Player 0 Prediction: tensor([[0.1414, 0.8532, 0.0055, 0.0000]])
Player 1 Prediction: tensor([[ 2.2105,  2.

 40%|███▉      | 19999/50000 [17:06<21:16, 23.50it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 30361
Average episode length: 3.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5227/10000 (52.3%)
    Average reward: +0.078
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4773/10000 (47.7%)
    Average reward: -0.078
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4263 (28.0%)
    Action 1: 5572 (36.6%)
    Action 2: 4315 (28.4%)
    Action 3: 1054 (6.9%)
  Player 1:
    Action 0: 3955 (26.1%)
    Action 1: 4981 (32.9%)
    Action 2: 3670 (24.2%)
    Action 3: 2551 (16.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [780.0, -780.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.045 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.033 (max=1.0 for random)
    → Playing nearl

 42%|████▏     | 21003/50000 [17:57<21:03, 22.94it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 123653/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 131620/2000000


 44%|████▍     | 21999/50000 [18:37<18:15, 25.56it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 130165/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 138263/2000000
P1 SL Buffer Size:  130165
P1 SL buffer distribution [33624. 46131. 28974. 21436.]
P1 actions distribution [0.25831829 0.35440403 0.2225944  0.16468329]
P2 SL Buffer Size:  138263
P2 SL buffer distribution [37115. 48940. 28527. 23681.]
P2 actions distribution [0.26843769 0.3539631  0.20632418 0.17127503]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.0827,  1.1744, -0.5725,  0.9859]])
Player 0 Prediction: tensor([[0.2682, 0.7124, 0.0193, 0.0000]])
Player 1 Prediction: tensor([[ 1.3714,  2.1573, -1.6712,  2.0106]])
Player 0 Prediction: tensor([[0.0000, 0.7730, 0.0045, 0.2225]])
Player 1 Prediction: tensor([[ 0.3240,  1.0344, -2.6073,  1.1587]])
Player 0 Prediction: tensor([[0.9911, 

 44%|████▍     | 21999/50000 [18:56<18:15, 25.56it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 33890
Average episode length: 3.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5634/10000 (56.3%)
    Average reward: +0.265
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4366/10000 (43.7%)
    Average reward: -0.265
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5900 (34.6%)
    Action 1: 3915 (22.9%)
    Action 2: 3787 (22.2%)
    Action 3: 3466 (20.3%)
  Player 1:
    Action 0: 3598 (21.4%)
    Action 1: 6306 (37.5%)
    Action 2: 2812 (16.7%)
    Action 3: 4106 (24.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [2647.5, -2647.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.017 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.007 (max=1.0 for random)
    → Playing ne

 46%|████▌     | 23003/50000 [19:41<17:00, 26.44it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 136928/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 144978/2000000


 48%|████▊     | 23999/50000 [20:18<16:13, 26.70it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 144176/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 151558/2000000
P1 SL Buffer Size:  144176
P1 SL buffer distribution [37766. 51828. 31717. 22865.]
P1 actions distribution [0.26194374 0.35947731 0.21998807 0.15859089]
P2 SL Buffer Size:  151558
P2 SL buffer distribution [39972. 54805. 31306. 25475.]
P2 actions distribution [0.26374061 0.36161074 0.20656118 0.16808746]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-0.7142, -0.7956, -0.4501, -0.8366]])

📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 32787
Average episode length: 3.3 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5753/10000 (57.5%)
    Average reward: -0.203
    Reward range: -6.0 to +6.0
  Player 1:
    Wins: 

 48%|████▊     | 23999/50000 [20:37<16:13, 26.70it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 32302
Average episode length: 3.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5939/10000 (59.4%)
    Average reward: +0.163
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4061/10000 (40.6%)
    Average reward: -0.163
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3240 (20.5%)
    Action 1: 6800 (43.0%)
    Action 2: 3410 (21.6%)
    Action 3: 2351 (14.9%)
  Player 1:
    Action 0: 4159 (25.2%)
    Action 1: 5867 (35.6%)
    Action 2: 4144 (25.1%)
    Action 3: 2331 (14.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1626.0, -1626.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.992 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.032 (max=1.0 for random)
    → Playing ne

 50%|█████     | 25003/50000 [21:22<16:52, 24.68it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 151789/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 158501/2000000


 52%|█████▏    | 26000/50000 [22:05<18:04, 22.14it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 159375/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 165325/2000000
P1 SL Buffer Size:  159375
P1 SL buffer distribution [42744. 58313. 33686. 24632.]
P1 actions distribution [0.26819765 0.36588549 0.21136314 0.15455373]
P2 SL Buffer Size:  165325
P2 SL buffer distribution [43509. 59949. 34154. 27713.]
P2 actions distribution [0.26317254 0.36261303 0.20658703 0.1676274 ]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.8057,  0.9237, -0.5852,  0.8477]])
Player 0 Prediction: tensor([[0.4428, 0.1283, 0.4288, 0.0000]])
Player 1 Prediction: tensor([[ 2.2843,  3.1332, -2.5502,  3.6583]])
Player 0 Prediction: tensor([[0.0000, 0.5818, 0.0625, 0.3558]])


 52%|█████▏    | 26000/50000 [22:17<18:04, 22.14it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 30686
Average episode length: 3.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4881/10000 (48.8%)
    Average reward: -0.263
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5119/10000 (51.2%)
    Average reward: +0.263
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4546 (29.3%)
    Action 1: 5348 (34.5%)
    Action 2: 4161 (26.8%)
    Action 3: 1460 (9.4%)
  Player 1:
    Action 0: 2388 (15.7%)
    Action 1: 7043 (46.4%)
    Action 2: 3706 (24.4%)
    Action 3: 2034 (13.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2626.0, 2626.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.049 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.934 (max=1.0 for random)
    → Playing nea

 54%|█████▍    | 27003/50000 [23:14<16:57, 22.60it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 166755/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 172243/2000000


 56%|█████▌    | 27999/50000 [23:58<15:58, 22.95it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 173955/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 179052/2000000
P1 SL Buffer Size:  173955
P1 SL buffer distribution [46919. 65129. 35533. 26374.]
P1 actions distribution [0.26971918 0.37440143 0.20426547 0.15161392]
P2 SL Buffer Size:  179052
P2 SL buffer distribution [46727. 65089. 36855. 30381.]
P2 actions distribution [0.26096888 0.36352009 0.20583406 0.16967697]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3213, 0.6771, 0.0016, 0.0000]])
Player 1 Prediction: tensor([[-0.5648, -0.6681, -1.0008, -0.5050]])
Player 0 Prediction: tensor([[0.0000, 0.8420, 0.0028, 0.1552]])

📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 38612
Average episode length: 3.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFOR

 56%|█████▌    | 27999/50000 [24:17<15:58, 22.95it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 34328
Average episode length: 3.4 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5280/10000 (52.8%)
    Average reward: +0.190
    Reward range: -6.0 to +6.0
  Player 1:
    Wins: 4720/10000 (47.2%)
    Average reward: -0.190
    Reward range: -6.0 to +6.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7451 (42.3%)
    Action 1: 1937 (11.0%)
    Action 2: 3903 (22.2%)
    Action 3: 4305 (24.5%)
  Player 1:
    Action 0: 2310 (13.8%)
    Action 1: 7041 (42.1%)
    Action 2: 2467 (14.7%)
    Action 3: 4914 (29.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [1901.0, -1901.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.875 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.920 (max=1.0 for random)
    → Playing nearly random stra

 58%|█████▊    | 29003/50000 [25:08<15:37, 22.39it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 181014/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 185640/2000000


 60%|█████▉    | 29999/50000 [25:53<14:54, 22.37it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 188200/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 192500/2000000
P1 SL Buffer Size:  188200
P1 SL buffer distribution [50975. 71919. 37423. 27883.]
P1 actions distribution [0.27085547 0.38214134 0.19884697 0.14815622]
P2 SL Buffer Size:  192500
P2 SL buffer distribution [49686. 70326. 39553. 32935.]
P2 actions distribution [0.25810909 0.36532987 0.20547013 0.17109091]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.0622,  0.9128, -0.5679,  0.5900]])
Player 0 Prediction: tensor([[0.0000, 0.2189, 0.0556, 0.7256]])
Player 1 Prediction: tensor([[-0.0082,  0.2389, -0.9742,  0.0361]])
Player 0 Prediction: tensor([[0.0092, 0.0221, 0.9687, 0.0000]])


 60%|█████▉    | 29999/50000 [26:07<14:54, 22.37it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 37486
Average episode length: 3.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5053/10000 (50.5%)
    Average reward: -0.234
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4947/10000 (49.5%)
    Average reward: +0.234
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4953 (26.3%)
    Action 1: 7127 (37.8%)
    Action 2: 3950 (20.9%)
    Action 3: 2830 (15.0%)
  Player 1:
    Action 0: 7300 (39.2%)
    Action 1: 7314 (39.3%)
    Action 2: 3189 (17.1%)
    Action 3: 823 (4.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2336.0, 2336.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.037 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.059 (max=1.0 for random)
    → Playing near

 62%|██████▏   | 31003/50000 [27:04<13:50, 22.87it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 195507/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 199285/2000000


 64%|██████▍   | 31999/50000 [27:49<13:29, 22.22it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 202613/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 206357/2000000
P1 SL Buffer Size:  202613
P1 SL buffer distribution [55243. 78707. 39228. 29435.]
P1 actions distribution [0.27265279 0.38845977 0.19361048 0.14527696]
P2 SL Buffer Size:  206357
P2 SL buffer distribution [53140. 76260. 41924. 35033.]
P2 actions distribution [0.25751489 0.36955373 0.20316248 0.1697689 ]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.6637, 0.3255, 0.0108, 0.0000]])
Player 1 Prediction: tensor([[-0.4033, -0.1821, -1.0327, -0.4240]])
Player 0 Prediction: tensor([[0.5215, 0.1048, 0.3737, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 41537
Average episode length: 4.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFOR

 64%|██████▍   | 31999/50000 [28:08<13:29, 22.22it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 39648
Average episode length: 4.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 7077/10000 (70.8%)
    Average reward: +0.359
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 2923/10000 (29.2%)
    Average reward: -0.359
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5710 (30.2%)
    Action 1: 9899 (52.3%)
    Action 2: 1616 (8.5%)
    Action 3: 1702 (9.0%)
  Player 1:
    Action 0: 5540 (26.7%)
    Action 1: 7488 (36.1%)
    Action 2: 4954 (23.9%)
    Action 3: 2739 (13.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [3589.0, -3589.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.011 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.039 (max=1.0 for random)
    → Playing near

 66%|██████▌   | 33004/50000 [29:01<12:49, 22.07it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 209609/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 213593/2000000


 68%|██████▊   | 34000/50000 [29:47<12:18, 21.66it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 216557/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 220711/2000000
P1 SL Buffer Size:  216557
P1 SL buffer distribution [59226. 85408. 40874. 31049.]
P1 actions distribution [0.2734892  0.39439039 0.18874476 0.14337565]
P2 SL Buffer Size:  220711
P2 SL buffer distribution [57018. 82158. 44042. 37493.]
P2 actions distribution [0.25833783 0.37224243 0.19954601 0.16987373]
   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3613, 0.6371, 0.0016, 0.0000]])
Player 1 Prediction: tensor([[ 1.0639,  0.8790, -0.9855,  0.9930]])
Player 0 Prediction: tensor([[0.0000, 0.8677, 0.0010, 0.1313]])
Player 1 Prediction: tensor([[-0.2297,  0.9042, -2.0377,  1.1007]])
Player 0 Prediction: tensor([[0.9588, 0.0000, 0.0412, 0.0000]])


 68%|██████▊   | 34000/50000 [29:58<12:18, 21.66it/s]


📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 42701
Average episode length: 4.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4520/10000 (45.2%)
    Average reward: -0.317
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5480/10000 (54.8%)
    Average reward: +0.317
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4218 (19.3%)
    Action 1: 9470 (43.4%)
    Action 2: 3964 (18.2%)
    Action 3: 4162 (19.1%)
  Player 1:
    Action 0: 8722 (41.8%)
    Action 1: 6402 (30.7%)
    Action 2: 2799 (13.4%)
    Action 3: 2964 (14.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3168.0, 3168.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.981 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.049 (max=1.0 for random)
    → Playing ne

 70%|███████   | 35003/50000 [31:00<11:32, 21.65it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 223646/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 227901/2000000


 72%|███████▏  | 35998/50000 [31:49<11:18, 20.63it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 230478/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 235380/2000000
P1 SL Buffer Size:  230478
P1 SL buffer distribution [63215. 91842. 42562. 32859.]
P1 actions distribution [0.27427781 0.39848489 0.18466838 0.14256892]
P2 SL Buffer Size:  235380
P2 SL buffer distribution [61172. 88162. 46140. 39906.]
P2 actions distribution [0.25988614 0.37455179 0.19602345 0.16953862]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.9471,  0.7455, -0.5344,  0.7877]])
Player 0 Prediction: tensor([[0.0000e+00, 7.4358e-01, 4.2815e-04, 2.5599e-01]])
Player 1 Prediction: tensor([[ 0.4537,  0.8644, -0.9556,  0.8803]])
Player 0 Prediction: tensor([[9.9928e-01, 0.0000e+00, 7.1956e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.7733, -0.2589, -2.8182,  0.1293]])
Player

 72%|███████▏  | 35998/50000 [32:08<11:18, 20.63it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 43941
Average episode length: 4.4 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4014/10000 (40.1%)
    Average reward: +0.239
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5986/10000 (59.9%)
    Average reward: -0.239
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3139 (14.2%)
    Action 1: 9996 (45.2%)
    Action 2: 4734 (21.4%)
    Action 3: 4266 (19.3%)
  Player 1:
    Action 0: 15031 (68.9%)
    Action 1: 6775 (31.1%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [2391.0, -2391.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.918 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.894 (max=1.0 for random)
    → Mixed strategy
  

 74%|███████▍  | 37003/50000 [33:05<09:57, 21.77it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 237482/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 242558/2000000


 76%|███████▌  | 37998/50000 [33:55<09:33, 20.93it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 244390/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 249716/2000000
P1 SL Buffer Size:  244390
P1 SL buffer distribution [66771. 98786. 44118. 34715.]
P1 actions distribution [0.27321494 0.40421458 0.18052293 0.14204755]
P2 SL Buffer Size:  249716
P2 SL buffer distribution [65574. 93775. 47997. 42370.]
P2 actions distribution [0.26259431 0.3755266  0.19220635 0.16967275]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5890, 0.4042, 0.0068, 0.0000]])
Player 1 Prediction: tensor([[ 1.4085,  1.1529, -0.6268,  1.0139]])
Player 0 Prediction: tensor([[0.5885, 0.0853, 0.3262, 0.0000]])


 76%|███████▌  | 37998/50000 [34:08<09:33, 20.93it/s]


📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 44876
Average episode length: 4.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4129/10000 (41.3%)
    Average reward: -0.397
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5871/10000 (58.7%)
    Average reward: +0.397
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7251 (31.2%)
    Action 1: 8793 (37.8%)
    Action 2: 4309 (18.5%)
    Action 3: 2903 (12.5%)
  Player 1:
    Action 0: 4663 (21.6%)
    Action 1: 11040 (51.1%)
    Action 2: 2251 (10.4%)
    Action 3: 3666 (17.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3968.0, 3968.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.055 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.972 (max=1.0 for random)
    → Playing n

 78%|███████▊  | 39005/50000 [35:09<07:55, 23.14it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 251095/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 257360/2000000


 80%|███████▉  | 39998/50000 [35:54<07:14, 23.04it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 258035/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 264841/2000000
P1 SL Buffer Size:  258035
P1 SL buffer distribution [ 70324. 105160.  45751.  36800.]
P1 actions distribution [0.27253667 0.40754161 0.1773054  0.14261631]
P2 SL Buffer Size:  264841
P2 SL buffer distribution [70616. 98919. 49989. 45317.]
P2 actions distribution [0.26663545 0.37350335 0.18875099 0.17111021]
   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1032, 0.4060, 0.4908, 0.0000]])


 80%|███████▉  | 39998/50000 [36:08<07:14, 23.04it/s]


📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 42591
Average episode length: 4.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4331/10000 (43.3%)
    Average reward: -0.340
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5669/10000 (56.7%)
    Average reward: +0.340
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5547 (25.6%)
    Action 1: 9102 (42.0%)
    Action 2: 4159 (19.2%)
    Action 3: 2878 (13.3%)
  Player 1:
    Action 0: 6072 (29.0%)
    Action 1: 9062 (43.3%)
    Action 2: 2606 (12.5%)
    Action 3: 3165 (15.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3398.5, 3398.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.029 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.041 (max=1.0 for random)
    → Playing ne

 82%|████████▏ | 41003/50000 [37:04<06:17, 23.84it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 264737/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 272209/2000000


 84%|████████▍ | 41999/50000 [37:46<07:01, 18.96it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 271759/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 279491/2000000
P1 SL Buffer Size:  271759
P1 SL buffer distribution [ 74129. 111462.  47455.  38713.]
P1 actions distribution [0.27277477 0.41015017 0.17462163 0.14245342]
P2 SL Buffer Size:  279491
P2 SL buffer distribution [ 74889. 104247.  52061.  48294.]
P2 actions distribution [0.26794781 0.37298875 0.18627076 0.17279268]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5618, 0.4330, 0.0052, 0.0000]])
Player 1 Prediction: tensor([[-0.3882, -0.2997, -1.0022, -0.3472]])
Player 0 Prediction: tensor([[0.9829, 0.0000, 0.0171, 0.0000]])
Player 1 Prediction: tensor([[-1.5073, -1.7409, -2.9283, -0.7258]])
Player 0 Prediction: tensor([[0.0000, 0.6096, 0.0722, 0.3182]])
Player 1 Prediction: tensor([[-4.0

 84%|████████▍ | 41999/50000 [37:59<07:01, 18.96it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 46156
Average episode length: 4.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4966/10000 (49.7%)
    Average reward: -0.288
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5034/10000 (50.3%)
    Average reward: +0.288
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4856 (20.7%)
    Action 1: 9675 (41.2%)
    Action 2: 3425 (14.6%)
    Action 3: 5532 (23.6%)
  Player 1:
    Action 0: 9255 (40.8%)
    Action 1: 5877 (25.9%)
    Action 2: 2594 (11.4%)
    Action 3: 4942 (21.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2877.0, 2877.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.997 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.033 (max=1.0 for random)
    → Playing ne

 86%|████████▌ | 43004/50000 [39:01<05:15, 22.17it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 278465/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 286723/2000000


 88%|████████▊ | 44000/50000 [39:47<04:58, 20.08it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 285264/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 294112/2000000
P1 SL Buffer Size:  285264
P1 SL buffer distribution [ 77209. 117700.  49137.  41218.]
P1 actions distribution [0.27065806 0.41260026 0.17225097 0.14449072]
P2 SL Buffer Size:  294112
P2 SL buffer distribution [ 78960. 110035.  53790.  51327.]
P2 actions distribution [0.26846915 0.37412618 0.18288951 0.17451515]
   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[-0.1498, -0.0760, -0.6869, -0.0960]])
Player 0 Prediction: tensor([[0.4844, 0.0876, 0.4280, 0.0000]])


 88%|████████▊ | 44000/50000 [39:59<04:58, 20.08it/s]


📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 44028
Average episode length: 4.4 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4874/10000 (48.7%)
    Average reward: -0.293
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5126/10000 (51.3%)
    Average reward: +0.293
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4595 (20.6%)
    Action 1: 9882 (44.3%)
    Action 2: 3460 (15.5%)
    Action 3: 4380 (19.6%)
  Player 1:
    Action 0: 8285 (38.2%)
    Action 1: 6358 (29.3%)
    Action 2: 2577 (11.9%)
    Action 3: 4491 (20.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2933.5, 2933.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.990 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.049 (max=1.0 for random)
    → Playing ne

 90%|█████████ | 45003/50000 [41:00<03:45, 22.14it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 292076/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 301092/2000000


 92%|█████████▏| 45999/50000 [41:48<03:11, 20.94it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 298809/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 308244/2000000
P1 SL Buffer Size:  298809
P1 SL buffer distribution [ 80426. 124018.  50652.  43713.]
P1 actions distribution [0.26915521 0.41504105 0.16951297 0.14629077]
P2 SL Buffer Size:  308244
P2 SL buffer distribution [ 82601. 116122.  55429.  54092.]
P2 actions distribution [0.26797277 0.37672104 0.17982183 0.17548436]


 92%|█████████▏| 45999/50000 [41:59<03:11, 20.94it/s]

   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.8577,  0.7854, -0.5573,  0.7832]])
Player 0 Prediction: tensor([[0.0000, 0.2733, 0.0280, 0.6987]])
Player 1 Prediction: tensor([[ 1.1098,  0.9478, -0.9771,  0.8434]])
Player 0 Prediction: tensor([[0.0000e+00, 9.7227e-01, 9.1846e-04, 2.6808e-02]])
Player 1 Prediction: tensor([[ 0.4648,  1.1376, -1.9941,  1.8880]])
Player 0 Prediction: tensor([[0.7642, 0.0000, 0.2358, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 46000
Episodes completed: 10000/10000
Total steps: 44654
Average episode length: 4.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4805/10000 (48.0%)
    Average reward: -0.312
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5195/10000 (51.9%)
    Average reward: +0.312
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4813 (21.1%)
    Action 1: 9830 (43.1%)
    Acti

 94%|█████████▍| 47002/50000 [43:07<02:29, 20.11it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 305841/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 315174/2000000


 96%|█████████▌| 48000/50000 [43:56<01:35, 20.96it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 312795/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 322073/2000000
P1 SL Buffer Size:  312795
P1 SL buffer distribution [ 84532. 129207.  52177.  46879.]
P1 actions distribution [0.27024729 0.41307246 0.16680893 0.14987132]
P2 SL Buffer Size:  322073
P2 SL buffer distribution [ 86687. 121472.  56983.  56931.]
P2 actions distribution [0.26915327 0.37715673 0.17692573 0.17676427]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5072, 0.4894, 0.0034, 0.0000]])
Player 1 Prediction: tensor([[ 0.6003,  0.9074, -0.7384,  0.5359]])
Player 0 Prediction: tensor([[0.6085, 0.1069, 0.2846, 0.0000]])
Player 1 Prediction: tensor([[-3.8582e-01, -1.4987e-01, -1.3670e+00,  3.9915e-04]])
Player 0 Prediction: tensor([[0.0000, 0.9543, 0.0046, 0.0412]])
Player 1 Predictio

 96%|█████████▌| 48000/50000 [44:09<01:35, 20.96it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 43144
Average episode length: 4.3 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4428/10000 (44.3%)
    Average reward: -0.341
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5572/10000 (55.7%)
    Average reward: +0.341
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7101 (32.1%)
    Action 1: 8560 (38.7%)
    Action 2: 3914 (17.7%)
    Action 3: 2547 (11.5%)
  Player 1:
    Action 0: 3524 (16.8%)
    Action 1: 10719 (51.0%)
    Action 2: 2153 (10.2%)
    Action 3: 4626 (22.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3408.0, 3408.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.056 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.927 (max=1.0 for random)
    → Playing n

 98%|█████████▊| 49003/50000 [45:10<00:45, 21.88it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 319627/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 328955/2000000


100%|██████████| 50000/50000 [46:00<00:00, 18.12it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3612, 0.6380, 0.0008, 0.0000]])
Player 1 Prediction: tensor([[ 0.5521,  1.0086, -0.9731,  0.7150]])
Player 0 Prediction: tensor([[9.9971e-01, 0.0000e+00, 2.9235e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.7822, -0.0607, -2.3400,  0.0263]])
Player 0 Prediction: tensor([[0.0000, 0.4244, 0.0129, 0.5627]])
Player 1 Prediction: tensor([[-2.2157, -2.6945, -2.8170, -0.9776]])

📊 TEST RESULTS SUMMARY
Training step: 49999
Episodes completed: 10000/10000
Total steps: 37688
Average episode length: 3.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5544/10000 (55.4%)
    Average reward: -0.238
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4456/10000 (44.6%)
    Average reward: +0.238
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 6190 (32.8%)
    Action 1: 7370 (39.0%)
    Acti

In [7]:
# shared network but not shared buffer?
# 1 vs 2 minibatches

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP-DQN run on Leduc Hold'em.
# NOTE(review): the values (anticipatory 0.1, RL lr 0.1, SL lr 0.005, 200k / 2M
# buffers, epsilon 0.06 with inverse-sqrt decay, target transfer every 300) look
# like the Leduc setup from Heinrich & Silver (2016) -- confirm before tuning.
# Keys prefixed with "sl_" presumably configure the supervised (average-strategy)
# learner and the unprefixed ones the RL (best-response) learner -- verify against
# NFSPDQNConfig.
config_dict = {
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    "num_minibatches": 1,  # or 2, could be 2 minibatches per network, or 2 minibatches (1 for each network/player)
    # --- RL learner ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": MSELoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Integer on purpose: the previous float literal (2e5) was carried through the
    # config as 200000.0, unlike the integer "sl_replay_buffer_size" below.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    "noisy_sigma": 0.0,
    # Exploration: the training log shows epsilon starting at eg_epsilon and then
    # following roughly eg_epsilon / sqrt(step) (0.06 -> 0.0019 at step 1000).
    "eg_epsilon": 0.06,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- SL learner ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- Replay / value-head options ---
    # presumably prioritized-experience-replay exponents (alpha, beta annealed to
    # per_beta_final) -- TODO confirm against the replay buffer implementation
    "per_alpha": 0.5,
    "per_beta": 0.5,
    "per_beta_final": 1.0,
    "per_epsilon": 0.00001,
    "n_step": 1,
    "atom_size": 1,
    "dueling": False,
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Overridden after construction: the key is not in config_dict, so the constructor
# log reports the default (False) before this assignment takes effect.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.MSELoss object at 0x3711c2e00>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : False
Using  

In [8]:
# PettingZoo's Leduc Hold'em environment module (v4) - the game used in this run.
from pettingzoo.classic import leduc_holdem_v4
# Matching Pennies names are not used by the active code in this cell;
# matching_pennies_env appears only in the commented-out alternative env below.
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Environment handed to the agent.
env = leduc_holdem_v4.env()
# Alternative game, left here so the env can be swapped by toggling comments:
# env = matching_pennies_env(render_mode="human", max_cycles=1)

# Sanity check: print the observation space reported for "player_0" before
# building the agent.
print(env.observation_space("player_0"))

# Construct the NFSP agent on CPU from `env` and the `config` object built in the
# configuration cell. The "PER" tag in the name presumably refers to the
# prioritized-replay settings (per_alpha / per_beta) in config_dict - TODO confirm.
agent = NFSPDQN(env, config, name="NFSP-LeducHoldem-PER", device="cpu")

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [9]:
# Evaluation schedule, set on the agent before training starts. Consistent with the
# log below: test summaries appear at training steps 2000, 4000, ... and each one
# reports 10000 completed episodes.
agent.checkpoint_interval = 2000
agent.checkpoint_trials = 10000
# Start training; progress bars, buffer sizes and test summaries are printed as it runs.
agent.train()

🎯 Initial policies: ['best_response', 'best_response']


  0%|          | 6/50000 [00:00<16:02, 51.93it/s]

   Player 0 ε: 0.0600 → 0.0600

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 63/200000
   Player 0 SL buffer: 4/2000000
   Player 1 RL buffer: 64/200000
   Player 1 SL buffer: 10/2000000


  2%|▏         | 1004/50000 [00:29<28:40, 28.47it/s]

   Player 0 ε: 0.0019 → 0.0019

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 64178/200000
   Player 0 SL buffer: 7318/2000000
   Player 1 RL buffer: 63948/200000
   Player 1 SL buffer: 7214/2000000


  4%|▍         | 1998/50000 [01:04<27:36, 28.98it/s]

   Player 0 ε: 0.0013 → 0.0013

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 128763/200000
   Player 0 SL buffer: 13649/2000000
   Player 1 RL buffer: 127364/200000
   Player 1 SL buffer: 13651/2000000
P1 SL Buffer Size:  13649
P1 SL buffer distribution [5106. 7723.   23.  797.]
P1 actions distribution [0.37409334 0.565829   0.00168511 0.05839256]
P2 SL Buffer Size:  13651
P2 SL buffer distribution [4357. 8340.   48.  906.]
P2 actions distribution [0.31917076 0.61094425 0.00351623 0.06636876]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3588, 0.6271, 0.0141, 0.0000]])
Player 1 Prediction: tensor([[ 0.2222,  0.5493, -1.2328,  0.5609]])
Player 0 Prediction: tensor([[0.0000, 0.9686, 0.0082, 0.0232]])
Player 1 Prediction: tensor([[-0.4809, -0.6424, -2.2558,  0.6025]])

📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 58432
Average episode length: 5

  4%|▍         | 1998/50000 [01:19<27:36, 28.98it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 48448
Average episode length: 4.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5667/10000 (56.7%)
    Average reward: -0.099
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4333/10000 (43.3%)
    Average reward: +0.099
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3090 (13.9%)
    Action 1: 18087 (81.1%)
    Action 2: 465 (2.1%)
    Action 3: 666 (3.0%)
  Player 1:
    Action 0: 22972 (87.9%)
    Action 1: 3168 (12.1%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-992.5, 992.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.640 (max=1.0 for random)
    → Strongly prefers Tails
  Player 1 strategy entropy: 0.533 (max=1.0 for random)
    → Strongly prefers Heads
  Averag

  6%|▌         | 3007/50000 [02:06<25:32, 30.66it/s]   

   Player 0 ε: 0.0011 → 0.0011

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 193112/200000
   Player 0 SL buffer: 20253/2000000
   Player 1 RL buffer: 191014/200000
   Player 1 SL buffer: 20481/2000000


  8%|▊         | 3998/50000 [02:37<24:16, 31.59it/s]

   Player 0 ε: 0.0009 → 0.0009

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 27069/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 27303/2000000
P1 SL Buffer Size:  27069
P1 SL buffer distribution [ 9547. 14691.  1002.  1829.]
P1 actions distribution [0.35269127 0.54272415 0.03701651 0.06756807]
P2 SL Buffer Size:  27303
P2 SL buffer distribution [ 9061. 15196.  1077.  1969.]
P2 actions distribution [0.33186829 0.55656888 0.03944621 0.07211662]
   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5531, 0.4369, 0.0099, 0.0000]])
Player 1 Prediction: tensor([[ 1.5188,  2.0138, -1.3888,  0.9144]])
Player 0 Prediction: tensor([[0.9908, 0.0000, 0.0092, 0.0000]])
Player 1 Prediction: tensor([[-0.2532, -0.2850, -2.8893,  0.9969]])
Player 0 Prediction: tensor([[0.0000, 0.8019, 0.0393, 0.1587]])
Player 1 Prediction: tensor([[-4.2042, -4.2323, 

  8%|▊         | 3998/50000 [02:49<24:16, 31.59it/s]


📊 TEST RESULTS SUMMARY
Training step: 4000
Episodes completed: 10000/10000
Total steps: 57559
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5702/10000 (57.0%)
    Average reward: -0.910
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4298/10000 (43.0%)
    Average reward: +0.910
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8619 (31.6%)
    Action 1: 16749 (61.3%)
    Action 2: 661 (2.4%)
    Action 3: 1280 (4.7%)
  Player 1:
    Action 0: 10327 (34.1%)
    Action 1: 14555 (48.1%)
    Action 2: 3043 (10.1%)
    Action 3: 2325 (7.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-9098.0, 9098.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.958 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.037 (max=1.0 for random)
    → Playing near

 10%|█         | 5008/50000 [03:36<20:28, 36.63it/s]   

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 34182/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 34081/2000000


 12%|█▏        | 6000/50000 [04:04<21:30, 34.08it/s]

   Player 0 ε: 0.0008 → 0.0008

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 40930/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40881/2000000
P1 SL Buffer Size:  40930
P1 SL buffer distribution [14430. 21058.  2205.  3237.]
P1 actions distribution [0.35255314 0.51448815 0.05387247 0.07908624]
P2 SL Buffer Size:  40881
P2 SL buffer distribution [13827. 21567.  2315.  3172.]
P2 actions distribution [0.33822558 0.52755559 0.05662777 0.07759106]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.2406,  1.9991, -0.7791,  0.9238]])
Player 0 Prediction: tensor([[0.0000, 0.9741, 0.0092, 0.0167]])
Player 1 Prediction: tensor([[ 1.9200,  2.3247, -1.3494,  1.0862]])
Player 0 Prediction: tensor([[0.9907, 0.0000, 0.0093, 0.0000]])
Player 1 Prediction: tensor([[ 0.1474, -0.1212, -2.8362,  0.8075]])
Player 0 Prediction: tensor([[0.0000, 0.7187

 12%|█▏        | 6000/50000 [04:19<21:30, 34.08it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 51595
Average episode length: 5.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5638/10000 (56.4%)
    Average reward: -0.052
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4362/10000 (43.6%)
    Average reward: +0.052
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3838 (16.0%)
    Action 1: 18114 (75.7%)
    Action 2: 807 (3.4%)
    Action 3: 1155 (4.8%)
  Player 1:
    Action 0: 23151 (83.6%)
    Action 1: 4530 (16.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-523.0, 523.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.727 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.643 (max=1.0 for random)
    → Strongly prefers Heads
  Average strat

 14%|█▍        | 7004/50000 [05:04<20:17, 35.32it/s]   

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 47790/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 47924/2000000


 16%|█▌        | 8000/50000 [05:32<19:57, 35.07it/s]

   Player 0 ε: 0.0007 → 0.0007

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 54298/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 54755/2000000
P1 SL Buffer Size:  54298
P1 SL buffer distribution [18899. 27432.  3390.  4577.]
P1 actions distribution [0.3480607  0.50521198 0.06243324 0.08429408]
P2 SL Buffer Size:  54755
P2 SL buffer distribution [18849. 28053.  3461.  4392.]
P2 actions distribution [0.34424253 0.51233677 0.06320884 0.08021185]
   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.7698,  1.4084, -0.8246,  0.7808]])
Player 0 Prediction: tensor([[0.0000, 0.9965, 0.0016, 0.0019]])
Player 1 Prediction: tensor([[ 1.3218,  1.6517, -1.3739,  0.9729]])
Player 0 Prediction: tensor([[9.9912e-01, 0.0000e+00, 8.8030e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-1.3871, -1.0063, -2.8738, -0.4257]])
Player 0 Prediction: tensor(

 16%|█▌        | 8000/50000 [05:49<19:57, 35.07it/s]


📊 TEST RESULTS SUMMARY
Training step: 8000
Episodes completed: 10000/10000
Total steps: 52860
Average episode length: 5.3 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5556/10000 (55.6%)
    Average reward: -0.127
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4444/10000 (44.4%)
    Average reward: +0.127
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4101 (16.7%)
    Action 1: 18265 (74.4%)
    Action 2: 914 (3.7%)
    Action 3: 1271 (5.2%)
  Player 1:
    Action 0: 23312 (82.3%)
    Action 1: 4997 (17.7%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1265.5, 1265.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.749 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.672 (max=1.0 for random)
    → Strongly prefers Heads
  Average str

 18%|█▊        | 9006/50000 [06:33<19:03, 35.85it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 60804/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 61388/2000000


 20%|█▉        | 9999/50000 [07:04<22:35, 29.51it/s]

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 67308/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 67865/2000000
P1 SL Buffer Size:  67308
P1 SL buffer distribution [23201. 34048.  4479.  5580.]
P1 actions distribution [0.344699   0.50585369 0.06654484 0.08290248]
P2 SL Buffer Size:  67865
P2 SL buffer distribution [23543. 34620.  4635.  5067.]
P2 actions distribution [0.34690931 0.51013041 0.06829736 0.07466293]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.6253, 0.3628, 0.0119, 0.0000]])
Player 1 Prediction: tensor([[ 1.0818,  1.4611, -1.3857,  0.9214]])
Player 0 Prediction: tensor([[0.9905, 0.0000, 0.0095, 0.0000]])
Player 1 Prediction: tensor([[-1.7289, -0.9601, -3.0185, -1.0058]])
Player 0 Prediction: tensor([[0.0400, 0.2620, 0.6980, 0.0000]])


 20%|█▉        | 9999/50000 [07:19<22:35, 29.51it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 62134
Average episode length: 6.2 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4711/10000 (47.1%)
    Average reward: -0.754
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5289/10000 (52.9%)
    Average reward: +0.754
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10350 (33.4%)
    Action 1: 15920 (51.4%)
    Action 2: 2990 (9.7%)
    Action 3: 1699 (5.5%)
  Player 1:
    Action 0: 10753 (34.5%)
    Action 1: 17407 (55.8%)
    Action 2: 2764 (8.9%)
    Action 3: 251 (0.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7537.5, 7537.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.022 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.999 (max=1.0 for random)
    → Playing nea

 22%|██▏       | 11005/50000 [08:08<20:19, 31.98it/s]   

   Player 0 ε: 0.0006 → 0.0006

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 74043/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 74558/2000000


 24%|██▍       | 11997/50000 [08:39<20:08, 31.45it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 80430/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 80694/2000000
P1 SL Buffer Size:  80430
P1 SL buffer distribution [27515. 41077.  5676.  6162.]
P1 actions distribution [0.34209872 0.51071739 0.07057068 0.0766132 ]
P2 SL Buffer Size:  80694
P2 SL buffer distribution [27988. 41606.  5649.  5451.]
P2 actions distribution [0.34684115 0.51560215 0.0700052  0.06755149]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.6945,  2.2155, -0.7903,  1.7832]])
Player 0 Prediction: tensor([[0.0000e+00, 9.9940e-01, 2.7353e-04, 3.2922e-04]])
Player 1 Prediction: tensor([[ 1.9663,  2.6828, -1.3803,  1.7628]])
Player 0 Prediction: tensor([[9.9987e-01, 0.0000e+00, 1.3291e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.1946,  1.4772, -2.9850,  0.8222]])
Player 0 P

 24%|██▍       | 11997/50000 [08:49<20:08, 31.45it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 59964
Average episode length: 6.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4486/10000 (44.9%)
    Average reward: -0.769
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5514/10000 (55.1%)
    Average reward: +0.769
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10471 (34.8%)
    Action 1: 14985 (49.8%)
    Action 2: 3208 (10.7%)
    Action 3: 1411 (4.7%)
  Player 1:
    Action 0: 10241 (34.3%)
    Action 1: 16170 (54.1%)
    Action 2: 1892 (6.3%)
    Action 3: 1586 (5.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-7693.5, 7693.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.031 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.009 (max=1.0 for random)
    → Playing n

 26%|██▌       | 13007/50000 [09:43<17:56, 34.38it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 86889/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 87031/2000000


 28%|██▊       | 13999/50000 [10:12<19:21, 31.00it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 93223/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 93227/2000000
P1 SL Buffer Size:  93223
P1 SL buffer distribution [31766. 48037.  6679.  6741.]
P1 actions distribution [0.34075282 0.51529129 0.07164541 0.07231048]
P2 SL Buffer Size:  93227
P2 SL buffer distribution [32204. 48360.  6526.  6137.]
P2 actions distribution [0.34543641 0.51873384 0.07000118 0.06582857]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7211, 0.2660, 0.0129, 0.0000]])
Player 1 Prediction: tensor([[ 1.8726,  2.0783, -1.0392,  1.8645]])
Player 0 Prediction: tensor([[0.0096, 0.9806, 0.0097, 0.0000]])
Player 1 Prediction: tensor([[ 2.1706,  1.7169, -2.2007,  1.5396]])
Player 0 Prediction: tensor([[0.0000, 0.5132, 0.0272, 0.4595]])
Player 1 Prediction: tensor([[ 1.1544,  1.5435

 28%|██▊       | 13999/50000 [10:29<19:21, 31.00it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 54707
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5483/10000 (54.8%)
    Average reward: -0.114
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4517/10000 (45.2%)
    Average reward: +0.114
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4626 (18.0%)
    Action 1: 18158 (70.8%)
    Action 2: 1253 (4.9%)
    Action 3: 1615 (6.3%)
  Player 1:
    Action 0: 23130 (79.6%)
    Action 1: 5925 (20.4%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1136.0, 1136.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.798 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.730 (max=1.0 for random)
    → Mixed strategy
  Average strategy 

 30%|███       | 15004/50000 [11:16<19:07, 30.49it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 99541/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 99348/2000000


 32%|███▏      | 15999/50000 [11:49<18:58, 29.87it/s]

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 105987/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 105565/2000000
P1 SL Buffer Size:  105987
P1 SL buffer distribution [35883. 54831.  7577.  7696.]
P1 actions distribution [0.33856039 0.51733703 0.0714899  0.07261268]
P2 SL Buffer Size:  105565
P2 SL buffer distribution [36417. 54944.  7338.  6866.]
P2 actions distribution [0.34497229 0.52047554 0.06951168 0.0650405 ]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.8707, 0.1278, 0.0015, 0.0000]])
Player 1 Prediction: tensor([[ 1.4644,  2.0978, -0.7583,  1.5185]])
Player 0 Prediction: tensor([[0.0309, 0.9663, 0.0027, 0.0000]])
Player 1 Prediction: tensor([[ 1.7686,  2.1568, -1.9482,  1.0331]])
Player 0 Prediction: tensor([[0.0000, 0.9147, 0.0050, 0.0803]])
Player 1 Prediction: tensor([[-0.1402, -1.

 32%|███▏      | 15999/50000 [12:09<18:58, 29.87it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 54695
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5489/10000 (54.9%)
    Average reward: -0.108
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4511/10000 (45.1%)
    Average reward: +0.108
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4792 (18.7%)
    Action 1: 18067 (70.4%)
    Action 2: 1271 (5.0%)
    Action 3: 1541 (6.0%)
  Player 1:
    Action 0: 22992 (79.2%)
    Action 1: 6032 (20.8%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1076.5, 1076.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.809 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.737 (max=1.0 for random)
    → Mixed strategy
  Average strategy 

 34%|███▍      | 17006/50000 [12:54<18:03, 30.44it/s]   

   Player 0 ε: 0.0005 → 0.0005

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 112235/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 111742/2000000


 36%|███▌      | 17998/50000 [13:28<18:19, 29.11it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 118544/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 118063/2000000
P1 SL Buffer Size:  118544
P1 SL buffer distribution [39984. 61316.  8629.  8615.]
P1 actions distribution [0.33729248 0.51724254 0.07279154 0.07267344]
P2 SL Buffer Size:  118063
P2 SL buffer distribution [40826. 61384.  8181.  7672.]
P2 actions distribution [0.34579843 0.5199258  0.06929351 0.06498226]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.7409,  0.1502, -0.8726,  0.3022]])
Player 0 Prediction: tensor([[0.0000, 0.9874, 0.0020, 0.0106]])
Player 1 Prediction: tensor([[ 0.6550,  0.8624, -1.3707,  0.6970]])
Player 0 Prediction: tensor([[9.9912e-01, 0.0000e+00, 8.8299e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.8056, -0.9328, -2.8969, -0.9762]])
Player 0 Prediction: t

 36%|███▌      | 17998/50000 [13:39<18:19, 29.11it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 58726
Average episode length: 5.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4881/10000 (48.8%)
    Average reward: -0.668
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5119/10000 (51.2%)
    Average reward: +0.668
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10387 (35.4%)
    Action 1: 14567 (49.6%)
    Action 2: 2948 (10.0%)
    Action 3: 1480 (5.0%)
  Player 1:
    Action 0: 10303 (35.1%)
    Action 1: 15103 (51.5%)
    Action 2: 2264 (7.7%)
    Action 3: 1674 (5.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6680.0, 6680.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.032 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.023 (max=1.0 for random)
    → Playing n

 38%|███▊      | 19003/50000 [14:32<17:06, 30.19it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 124734/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 124324/2000000


 40%|████      | 20000/50000 [15:06<17:49, 28.06it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 131229/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 130870/2000000
P1 SL Buffer Size:  131229
P1 SL buffer distribution [44026. 67869.  9838.  9496.]
P1 actions distribution [0.33548987 0.51717989 0.07496819 0.07236205]
P2 SL Buffer Size:  130870
P2 SL buffer distribution [45337. 67497.  9376.  8660.]
P2 actions distribution [0.34642775 0.51575609 0.07164362 0.06617254]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 2.3785,  1.9502, -0.8140,  1.7653]])
Player 0 Prediction: tensor([[0.0000, 0.7384, 0.0218, 0.2398]])
Player 1 Prediction: tensor([[ 1.6728,  2.4077, -1.3722,  1.6915]])
Player 0 Prediction: tensor([[0.9947, 0.0000, 0.0053, 0.0000]])
Player 1 Prediction: tensor([[ 1.2915,  1.3030, -3.0058,  0.7730]])
Player 0 Prediction: tensor([[0.0304, 

 40%|████      | 20000/50000 [15:19<17:49, 28.06it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 58881
Average episode length: 5.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5185/10000 (51.8%)
    Average reward: -0.603
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4815/10000 (48.1%)
    Average reward: +0.603
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 10022 (34.2%)
    Action 1: 14917 (50.9%)
    Action 2: 2647 (9.0%)
    Action 3: 1719 (5.9%)
  Player 1:
    Action 0: 10413 (35.2%)
    Action 1: 14171 (47.9%)
    Action 2: 2607 (8.8%)
    Action 3: 2385 (8.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6030.0, 6030.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.025 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.039 (max=1.0 for random)
    → Playing ne

 42%|████▏     | 21006/50000 [16:14<16:44, 28.88it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 137503/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 137241/2000000


 44%|████▍     | 21999/50000 [16:49<16:34, 28.17it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 143752/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 143885/2000000
P1 SL Buffer Size:  143752
P1 SL buffer distribution [48133. 74242. 11045. 10332.]
P1 actions distribution [0.3348336  0.5164589  0.07683371 0.07187378]
P2 SL Buffer Size:  143885
P2 SL buffer distribution [49982. 73588. 10620.  9695.]
P2 actions distribution [0.34737464 0.51143622 0.07380894 0.0673802 ]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.2310e-01, 7.6856e-02, 4.4493e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.1479,  2.0118, -0.7350,  1.1370]])
Player 0 Prediction: tensor([[6.2051e-02, 9.3784e-01, 1.0541e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.6340,  2.0307, -1.8870,  0.8869]])
Player 0 Prediction: tensor([[0.0000e+00, 9.9133e-01, 3.6015e-04, 8.3140e-

 44%|████▍     | 21999/50000 [16:59<16:34, 28.17it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 58609
Average episode length: 5.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5300/10000 (53.0%)
    Average reward: -0.527
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4700/10000 (47.0%)
    Average reward: +0.527
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9969 (34.3%)
    Action 1: 14851 (51.1%)
    Action 2: 2512 (8.6%)
    Action 3: 1743 (6.0%)
  Player 1:
    Action 0: 10385 (35.2%)
    Action 1: 13927 (47.2%)
    Action 2: 2758 (9.3%)
    Action 3: 2464 (8.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5265.5, 5265.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.025 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.042 (max=1.0 for random)
    → Playing nea

 46%|████▌     | 23004/50000 [17:55<15:39, 28.72it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 150114/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 150351/2000000


 48%|████▊     | 23998/50000 [18:31<15:47, 27.43it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 156454/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 156643/2000000
P1 SL Buffer Size:  156454
P1 SL buffer distribution [52187. 80357. 12504. 11406.]
P1 actions distribution [0.3335613  0.51361423 0.07992125 0.07290322]
P2 SL Buffer Size:  156643
P2 SL buffer distribution [54439. 79231. 12077. 10896.]
P2 actions distribution [0.34753548 0.5058062  0.07709888 0.06955944]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.8449,  1.9748, -0.4214,  1.2933]])
Player 0 Prediction: tensor([[0.0065, 0.9814, 0.0121, 0.0000]])
Player 1 Prediction: tensor([[ 1.6359,  1.9602, -1.8238,  0.8802]])
Player 0 Prediction: tensor([[0.0000, 0.6004, 0.0222, 0.3774]])
Player 1 Prediction: tensor([[ 4.0473,  4.7046, -2.6384,  1.9888]])
Player 0 Prediction: tensor([[0.0297, 

 48%|████▊     | 23998/50000 [18:50<15:47, 27.43it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 54597
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5475/10000 (54.8%)
    Average reward: -0.072
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4525/10000 (45.2%)
    Average reward: +0.072
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4898 (19.1%)
    Action 1: 17407 (67.9%)
    Action 2: 1381 (5.4%)
    Action 3: 1959 (7.6%)
  Player 1:
    Action 0: 22373 (77.3%)
    Action 1: 6579 (22.7%)
    Action 2: 0 (0.0%)
    Action 3: 0 (0.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-721.0, 721.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 0.836 (max=1.0 for random)
    → Mixed strategy
  Player 1 strategy entropy: 0.773 (max=1.0 for random)
    → Mixed strategy
  Average strategy en

 50%|█████     | 25004/50000 [19:45<16:05, 25.90it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 162585/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 163161/2000000


 52%|█████▏    | 26000/50000 [20:24<15:18, 26.14it/s]

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 169244/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 169740/2000000
P1 SL Buffer Size:  169244
P1 SL buffer distribution [56266. 86324. 13949. 12705.]
P1 actions distribution [0.33245492 0.51005649 0.08241947 0.07506913]
P2 SL Buffer Size:  169740
P2 SL buffer distribution [59244. 85150. 13538. 11808.]
P2 actions distribution [0.34902793 0.50164958 0.07975728 0.06956522]
   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.9025, 0.0965, 0.0010, 0.0000]])
Player 1 Prediction: tensor([[-0.5424, -0.0938, -0.8855, -0.1064]])
Player 0 Prediction: tensor([[0.0222, 0.9755, 0.0023, 0.0000]])
Player 1 Prediction: tensor([[-0.1713, -0.8772, -2.0447, -0.2883]])
Player 0 Prediction: tensor([[0.0000, 0.9678, 0.0031, 0.0291]])
Player 1 Prediction: tensor([[-4.6246, -5.

 52%|█████▏    | 26000/50000 [20:40<15:18, 26.14it/s]


📊 TEST RESULTS SUMMARY
Training step: 26000
Episodes completed: 10000/10000
Total steps: 60930
Average episode length: 6.1 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5580/10000 (55.8%)
    Average reward: -0.516
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4420/10000 (44.2%)
    Average reward: +0.516
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9221 (30.9%)
    Action 1: 16108 (53.9%)
    Action 2: 2402 (8.0%)
    Action 3: 2139 (7.2%)
  Player 1:
    Action 0: 11843 (38.1%)
    Action 1: 14040 (45.2%)
    Action 2: 3373 (10.9%)
    Action 3: 1804 (5.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5158.5, 5158.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.004 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.048 (max=1.0 for random)
    → Playing ne

 54%|█████▍    | 27003/50000 [21:47<15:57, 24.03it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 175443/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 176536/2000000


 56%|█████▌    | 28000/50000 [22:33<14:04, 26.04it/s]  

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 181983/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 182862/2000000
P1 SL Buffer Size:  181983
P1 SL buffer distribution [60263. 92097. 15402. 14221.]
P1 actions distribution [0.33114632 0.50607474 0.08463428 0.07814466]
P2 SL Buffer Size:  182862
P2 SL buffer distribution [64110. 90760. 14998. 12994.]
P2 actions distribution [0.35059225 0.49633057 0.08201813 0.07105905]
   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.3674e-01, 6.3239e-02, 2.3671e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.3221,  1.9510, -0.8465,  1.5260]])
Player 0 Prediction: tensor([[4.9341e-02, 9.5056e-01, 1.0156e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 2.0800,  1.5167, -2.0635,  1.6596]])
Player 0 Prediction: tensor([[0.0000e+00, 9.8820e-01, 3.5033e-04, 1.1452e-

 56%|█████▌    | 28000/50000 [22:50<14:04, 26.04it/s]


📊 TEST RESULTS SUMMARY
Training step: 28000
Episodes completed: 10000/10000
Total steps: 57764
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5557/10000 (55.6%)
    Average reward: -0.604
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4443/10000 (44.4%)
    Average reward: +0.604
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8843 (31.4%)
    Action 1: 14878 (52.9%)
    Action 2: 2249 (8.0%)
    Action 3: 2175 (7.7%)
  Player 1:
    Action 0: 10756 (36.3%)
    Action 1: 11864 (40.1%)
    Action 2: 3286 (11.1%)
    Action 3: 3713 (12.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-6035.5, 6035.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.011 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.059 (max=1.0 for random)
    → Playing n

 58%|█████▊    | 29005/50000 [23:49<12:47, 27.35it/s]   

   Player 0 ε: 0.0004 → 0.0004

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 188477/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 189331/2000000


 60%|█████▉    | 29998/50000 [24:27<12:31, 26.63it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 194975/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 195698/2000000
P1 SL Buffer Size:  194975
P1 SL buffer distribution [64439. 97830. 16886. 15820.]
P1 actions distribution [0.33049878 0.50175664 0.08660598 0.08113861]
P2 SL Buffer Size:  195698
P2 SL buffer distribution [68668. 95907. 16452. 14671.]
P2 actions distribution [0.35088759 0.49007655 0.08406831 0.07496755]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.3328, -0.1263, -0.6586,  0.1504]])
Player 0 Prediction: tensor([[0.0000, 0.9766, 0.0090, 0.0143]])
Player 1 Prediction: tensor([[ 0.1952,  0.3103, -0.9729,  0.4514]])
Player 0 Prediction: tensor([[9.9955e-01, 0.0000e+00, 4.5291e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 5.6763,  4.7975, -2.9083,  3.7638]])
Player 0 Prediction: t

 60%|█████▉    | 29998/50000 [24:40<12:31, 26.63it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 57711
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5517/10000 (55.2%)
    Average reward: -0.533
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4483/10000 (44.8%)
    Average reward: +0.533
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8970 (31.7%)
    Action 1: 14635 (51.7%)
    Action 2: 2340 (8.3%)
    Action 3: 2369 (8.4%)
  Player 1:
    Action 0: 10479 (35.6%)
    Action 1: 11730 (39.9%)
    Action 2: 3320 (11.3%)
    Action 3: 3868 (13.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-5327.5, 5327.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.017 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.059 (max=1.0 for random)
    → Playing n

 62%|██████▏   | 31002/50000 [25:43<15:06, 20.97it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 201631/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 201978/2000000


 64%|██████▍   | 32000/50000 [26:47<19:26, 15.43it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 207962/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 208474/2000000
P1 SL Buffer Size:  207962
P1 SL buffer distribution [ 68543. 103579.  18359.  17481.]
P1 actions distribution [0.32959387 0.49806695 0.08828055 0.08405863]
P2 SL Buffer Size:  208474
P2 SL buffer distribution [ 73178. 100936.  17909.  16451.]
P2 actions distribution [0.35101739 0.48416589 0.0859052  0.07891152]
   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.6541,  1.7712, -0.2837,  1.3170]])
Player 0 Prediction: tensor([[4.7631e-02, 9.5222e-01, 1.4777e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.4956,  1.7374, -1.5971,  0.9843]])
Player 0 Prediction: tensor([[0.0000e+00, 9.6909e-01, 3.5416e-04, 3.0558e-02]])
Player 1 Prediction: tensor([[-1.0699,  0.5956, -2.8772, -0.4869]]

 64%|██████▍   | 32000/50000 [27:00<19:26, 15.43it/s]


📊 TEST RESULTS SUMMARY
Training step: 32000
Episodes completed: 10000/10000
Total steps: 57651
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5540/10000 (55.4%)
    Average reward: -0.498
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4460/10000 (44.6%)
    Average reward: +0.498
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8816 (31.2%)
    Action 1: 14467 (51.2%)
    Action 2: 2450 (8.7%)
    Action 3: 2505 (8.9%)
  Player 1:
    Action 0: 10412 (35.4%)
    Action 1: 11632 (39.5%)
    Action 2: 3356 (11.4%)
    Action 3: 4013 (13.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4980.5, 4980.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.019 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.060 (max=1.0 for random)
    → Playing n

 66%|██████▌   | 33004/50000 [28:16<13:44, 20.61it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 214406/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 214886/2000000


 68%|██████▊   | 33999/50000 [29:09<16:40, 16.00it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 220740/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 221568/2000000
P1 SL Buffer Size:  220740
P1 SL buffer distribution [ 72345. 109321.  19821.  19253.]
P1 actions distribution [0.32773852 0.4952478  0.08979342 0.08722026]
P2 SL Buffer Size:  221568
P2 SL buffer distribution [ 77872. 106117.  19376.  18203.]
P2 actions distribution [0.35145869 0.47893649 0.08744945 0.08215537]


 68%|██████▊   | 33999/50000 [29:20<16:40, 16.00it/s]

   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.6720,  1.7360, -0.2595,  1.3458]])
Player 0 Prediction: tensor([[0.0035, 0.9881, 0.0084, 0.0000]])
Player 1 Prediction: tensor([[ 1.4882,  1.7035, -1.5626,  1.0123]])
Player 0 Prediction: tensor([[0.0000, 0.1022, 0.0250, 0.8728]])
Player 1 Prediction: tensor([[ 2.1667,  2.3473, -2.7147,  2.0978]])
Player 0 Prediction: tensor([[0.0315, 0.0856, 0.8829, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 57598
Average episode length: 5.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5620/10000 (56.2%)
    Average reward: -0.446
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4380/10000 (43.8%)
    Average reward: +0.446
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8872 (31.4%)
    Action 1: 14543 (51.5%)
    Action 2: 2355 (8.3

 70%|███████   | 35002/50000 [30:51<14:43, 16.97it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 227026/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 228305/2000000


 72%|███████▏  | 36000/50000 [31:51<14:11, 16.44it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 233356/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 234832/2000000
P1 SL Buffer Size:  233356
P1 SL buffer distribution [ 75856. 114751.  21295.  21454.]
P1 actions distribution [0.32506557 0.49174223 0.09125542 0.09193678]
P2 SL Buffer Size:  234832
P2 SL buffer distribution [ 82713. 111497.  20797.  19825.]
P2 actions distribution [0.35222201 0.47479475 0.08856118 0.08442205]


 72%|███████▏  | 36000/50000 [32:01<14:11, 16.44it/s]

   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.5575e-01, 4.4233e-02, 1.8048e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.8068,  1.6556, -0.4066,  1.0504]])
Player 0 Prediction: tensor([[4.0985e-02, 9.5883e-01, 1.8905e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.5096,  1.6790, -1.5301,  1.0390]])
Player 0 Prediction: tensor([[0.0000e+00, 9.5214e-01, 2.3930e-04, 4.7616e-02]])
Player 1 Prediction: tensor([[-5.5575, -3.9848, -3.0380, -1.3433]])

📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 57233
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5532/10000 (55.3%)
    Average reward: -0.466
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4468/10000 (44.7%)
    Average reward: +0.466
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8767 (31.1%)
   

 74%|███████▍  | 37004/50000 [33:32<12:51, 16.85it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 239792/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 241195/2000000


 76%|███████▌  | 38000/50000 [34:31<12:08, 16.48it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 246187/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 247666/2000000
P1 SL Buffer Size:  246187
P1 SL buffer distribution [ 79654. 119703.  22774.  24056.]
P1 actions distribution [0.3235508  0.48622795 0.09250692 0.09771434]
P2 SL Buffer Size:  247666
P2 SL buffer distribution [ 87375. 116572.  22176.  21543.]
P2 actions distribution [0.35279368 0.47068229 0.08953994 0.08698408]


 76%|███████▌  | 38000/50000 [34:42<12:08, 16.48it/s]

   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.3280e-01, 6.6922e-02, 2.7408e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.1872,  2.0605, -1.0536,  1.6591]])
Player 0 Prediction: tensor([[9.9973e-01, 0.0000e+00, 2.7302e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.9270,  2.2616, -3.1323,  1.9467]])
Player 0 Prediction: tensor([[0.0520, 0.1428, 0.8052, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 38000
Episodes completed: 10000/10000
Total steps: 57313
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5513/10000 (55.1%)
    Average reward: -0.436
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4487/10000 (44.9%)
    Average reward: +0.436
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8727 (30.9%)
    Action 1: 13989 (49.5%)
    Action 2: 2528 (9.0%)
    Action 3: 3001 (10.6%)
  Play

 78%|███████▊  | 39002/50000 [36:08<10:51, 16.88it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 252357/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 254181/2000000


 80%|████████  | 40000/50000 [37:14<10:37, 15.68it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 258639/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 260340/2000000
P1 SL Buffer Size:  258639
P1 SL buffer distribution [ 83214. 124608.  24176.  26641.]
P1 actions distribution [0.32173802 0.48178349 0.09347392 0.10300457]
P2 SL Buffer Size:  260340
P2 SL buffer distribution [ 92042. 121435.  23423.  23440.]
P2 actions distribution [0.35354536 0.46644772 0.08997081 0.09003611]
   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.5745e-01, 4.2539e-02, 1.2416e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.4447, -0.0042, -0.5903,  0.0666]])
Player 0 Prediction: tensor([[0.0000, 0.7225, 0.0148, 0.2627]])
Player 1 Prediction: tensor([[-3.2548, -3.1937, -1.3766, -2.0815]])


 80%|████████  | 40000/50000 [37:32<10:37, 15.68it/s]


📊 TEST RESULTS SUMMARY
Training step: 40000
Episodes completed: 10000/10000
Total steps: 56639
Average episode length: 5.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5391/10000 (53.9%)
    Average reward: -0.332
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4609/10000 (46.1%)
    Average reward: +0.332
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9395 (33.9%)
    Action 1: 12602 (45.4%)
    Action 2: 2262 (8.2%)
    Action 3: 3494 (12.6%)
  Player 1:
    Action 0: 10684 (37.0%)
    Action 1: 10490 (36.3%)
    Action 2: 2807 (9.7%)
    Action 3: 4905 (17.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3318.5, 3318.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.046 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing n

 82%|████████▏ | 41001/50000 [39:42<13:50, 10.83it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 264813/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 266719/2000000


 84%|████████▍ | 42000/50000 [41:17<11:50, 11.26it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 270946/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 273333/2000000
P1 SL Buffer Size:  270946
P1 SL buffer distribution [ 86869. 129213.  25671.  29193.]
P1 actions distribution [0.3206137  0.47689577 0.09474582 0.10774472]
P2 SL Buffer Size:  273333
P2 SL buffer distribution [ 96739. 126155.  24788.  25651.]
P2 actions distribution [0.3539236  0.46154325 0.09068792 0.09384524]
   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.4102e-01, 5.8796e-02, 1.8241e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[-0.4547, -0.0102, -0.5757,  0.0653]])
Player 0 Prediction: tensor([[0.0000, 0.9562, 0.0281, 0.0156]])
Player 1 Prediction: tensor([[-3.1892, -3.6140, -1.3541, -1.2717]])


 84%|████████▍ | 42000/50000 [41:33<11:50, 11.26it/s]


📊 TEST RESULTS SUMMARY
Training step: 42000
Episodes completed: 10000/10000
Total steps: 56188
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5300/10000 (53.0%)
    Average reward: -0.429
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4700/10000 (47.0%)
    Average reward: +0.429
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9326 (33.7%)
    Action 1: 12269 (44.3%)
    Action 2: 2350 (8.5%)
    Action 3: 3729 (13.5%)
  Player 1:
    Action 0: 10215 (35.8%)
    Action 1: 10248 (35.9%)
    Action 2: 3062 (10.7%)
    Action 3: 4989 (17.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4285.5, 4285.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.049 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing 

 86%|████████▌ | 43002/50000 [44:13<12:50,  9.08it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 277123/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 279781/2000000


 88%|████████▊ | 43999/50000 [46:14<08:54, 11.22it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 283433/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 286221/2000000
P1 SL Buffer Size:  283433
P1 SL buffer distribution [ 90453. 134086.  27114.  31780.]
P1 actions distribution [0.31913362 0.47307829 0.09566282 0.11212526]
P2 SL Buffer Size:  286221
P2 SL buffer distribution [101343. 130773.  26215.  27890.]
P2 actions distribution [0.35407255 0.4568952  0.09159007 0.09744219]
   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[9.5942e-01, 4.0575e-02, 6.2881e-06, 0.0000e+00]])
Player 1 Prediction: tensor([[ 0.8633,  1.6425, -0.6953,  1.4367]])
Player 0 Prediction: tensor([[3.2150e-02, 9.6776e-01, 9.2012e-05, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.7855,  1.3781, -1.8879,  1.7090]])
Player 0 Prediction: tensor([[0.0000e+00, 9.6585e-01, 4.5828e-05, 

 88%|████████▊ | 43999/50000 [46:33<08:54, 11.22it/s]


📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 56142
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5339/10000 (53.4%)
    Average reward: -0.390
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4661/10000 (46.6%)
    Average reward: +0.390
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9239 (33.4%)
    Action 1: 12233 (44.3%)
    Action 2: 2287 (8.3%)
    Action 3: 3868 (14.0%)
  Player 1:
    Action 0: 10258 (36.0%)
    Action 1: 10161 (35.6%)
    Action 2: 3033 (10.6%)
    Action 3: 5063 (17.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3898.0, 3898.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.049 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing 

 90%|█████████ | 45002/50000 [48:55<11:07,  7.49it/s]   

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 289401/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 292608/2000000


 92%|█████████▏| 46000/50000 [50:28<05:00, 13.33it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 295344/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 298907/2000000
P1 SL Buffer Size:  295344
P1 SL buffer distribution [ 93873. 138569.  28555.  34347.]
P1 actions distribution [0.31784292 0.46917831 0.09668387 0.1162949 ]
P2 SL Buffer Size:  298907
P2 SL buffer distribution [105899. 135290.  27597.  30121.]
P2 actions distribution [0.35428745 0.4526157  0.09232638 0.10077047]
   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.2956, -0.1544, -0.5552,  0.1833]])
Player 0 Prediction: tensor([[0.0000, 0.2222, 0.0814, 0.6965]])
Player 1 Prediction: tensor([[-2.2318, -2.2144, -1.0013, -0.4973]])
Player 0 Prediction: tensor([[0.0000, 0.0637, 0.6040, 0.3323]])


 92%|█████████▏| 46000/50000 [50:43<05:00, 13.33it/s]


📊 TEST RESULTS SUMMARY
Training step: 46000
Episodes completed: 10000/10000
Total steps: 56044
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5470/10000 (54.7%)
    Average reward: -0.334
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4530/10000 (45.3%)
    Average reward: +0.334
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9226 (33.5%)
    Action 1: 12116 (43.9%)
    Action 2: 2255 (8.2%)
    Action 3: 3984 (14.4%)
  Player 1:
    Action 0: 10237 (36.0%)
    Action 1: 10119 (35.6%)
    Action 2: 3070 (10.8%)
    Action 3: 5037 (17.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3343.5, 3343.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.050 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.061 (max=1.0 for random)
    → Playing 

 94%|█████████▍| 47001/50000 [52:48<04:39, 10.74it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 301532/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 305213/2000000


 96%|█████████▌| 47999/50000 [54:35<03:11, 10.44it/s]

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 307731/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 311826/2000000
P1 SL Buffer Size:  307731
P1 SL buffer distribution [ 97556. 143158.  30070.  36947.]
P1 actions distribution [0.31701714 0.465205   0.09771521 0.12006265]
P2 SL Buffer Size:  311826
P2 SL buffer distribution [110553. 139797.  29007.  32469.]
P2 actions distribution [0.35453426 0.44831733 0.09302303 0.10412538]


 96%|█████████▌| 47999/50000 [54:53<03:11, 10.44it/s]

   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 0.2612, -0.1976, -0.5571,  0.1509]])
Player 0 Prediction: tensor([[0.0000, 0.6812, 0.0010, 0.3177]])
Player 1 Prediction: tensor([[-3.0416, -2.5248, -0.9413, -1.4134]])

📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 55686
Average episode length: 5.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5329/10000 (53.3%)
    Average reward: -0.430
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4671/10000 (46.7%)
    Average reward: +0.430
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9151 (33.4%)
    Action 1: 11849 (43.2%)
    Action 2: 2419 (8.8%)
    Action 3: 4009 (14.6%)
  Player 1:
    Action 0: 10148 (35.9%)
    Action 1: 10102 (35.7%)
    Action 2: 3059 (10.8%)
    Action 3: 4949 (17.5%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards 

 98%|█████████▊| 49005/50000 [56:22<00:40, 24.86it/s]  

   Player 0 ε: 0.0003 → 0.0003

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 314236/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 318197/2000000


100%|██████████| 50000/50000 [57:24<00:00, 14.52it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[ 1.7138,  1.6792, -0.6906,  1.7504]])
Player 0 Prediction: tensor([[0.0000, 0.9565, 0.0108, 0.0327]])
Player 1 Prediction: tensor([[ 1.0844,  1.9906, -0.9959,  1.6272]])
Player 0 Prediction: tensor([[9.9988e-01, 0.0000e+00, 1.2268e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[ 1.7069,  2.1028, -3.0975,  1.7288]])
Player 0 Prediction: tensor([[0.0347, 0.0939, 0.8714, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 49999
Episodes completed: 10000/10000
Total steps: 55184
Average episode length: 5.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5488/10000 (54.9%)
    Average reward: -0.344
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4512/10000 (45.1%)
    Average reward: +0.344
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8984 (33.0%)
    Action 1: 11531 (42.4%)
    Act

In [10]:
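# TODO: open question - share the networks between players but not the replay buffers?
# TODO: open question - compare 1 vs. 2 minibatches (see "num_minibatches" below)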

from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig
from game_configs import LeducHoldemConfig, MatchingPenniesConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters handed to NFSPDQNConfig for the Leduc Hold'em run.
# NOTE(review): un-prefixed keys appear to configure the RL learner and "sl_"
# keys the supervised learner (the training log reports separate "RL buffer"
# and "SL buffer" sizes) - confirm against NFSPDQNConfig.
config_dict = {
    # --- run-level settings ---
    "shared_networks_and_buffers": False,
    "training_steps": 50000,
    "anticipatory_param": 0.1,
    "replay_interval": 128,
    # Open question: 1 or 2? 2 could mean 2 minibatches per network, or
    # 2 minibatches in total (1 for each network/player).
    "num_minibatches": 1,
    # --- RL learner: optimizer / loss / replay buffer ---
    "learning_rate": 0.1,
    "momentum": 0.0,
    "optimizer": SGD,
    "loss_function": KLDivergenceLoss(),
    "min_replay_buffer_size": 1000,
    "minibatch_size": 128,
    # Buffer capacity is a count, so keep it an int (was the float literal 2e5,
    # which the config log echoed as 200000.0); same value, and consistent with
    # "sl_replay_buffer_size" below.
    "replay_buffer_size": 200000,
    "transfer_interval": 300,
    # --- RL learner: network architecture ---
    "residual_layers": [],
    "conv_layers": [],
    "dense_layer_widths": [128],
    "value_hidden_layer_widths": [],
    "advantage_hidden_layer_widths": [],
    # --- RL learner: exploration ---
    "noisy_sigma": 0.06,
    "eg_epsilon": 0.0,
    # "eg_epsilon_final": 0.06,
    "eg_epsilon_decay_type": "inverse_sqrt",
    "eg_epsilon_decay_final_step": 0,
    # --- SL learner: optimizer / loss / replay buffer ---
    "sl_learning_rate": 0.005,
    "sl_momentum": 0.0,
    # "sl_weight_decay": 1e-9,
    # "sl_clipnorm": 1.0,  (superseded by the live "sl_clipnorm" entry at the end)
    "sl_optimizer": SGD,
    "sl_loss_function": CategoricalCrossentropyLoss(),
    "sl_min_replay_buffer_size": 1000,
    "sl_minibatch_size": 128,
    "sl_replay_buffer_size": 2000000,
    # --- SL learner: network architecture ---
    "sl_residual_layers": [],
    "sl_conv_layers": [],
    "sl_dense_layer_widths": [128],
    "sl_clip_low_prob": 0.0,
    # --- remaining RL learner options ---
    # NOTE(review): presumably prioritized replay (per_*), n-step returns,
    # distributional atoms and dueling heads - confirm the exact semantics
    # in the agent/config code.
    "per_alpha": 0.5,
    "per_beta": 0.5,
    "per_beta_final": 1.0,
    "per_epsilon": 0.00001,
    "n_step": 3,
    "atom_size": 51,
    "dueling": True,
    # --- gradient clipping for both learners ---
    "clipnorm": 10.0,
    "sl_clipnorm": 10.0,
}

# Build the agent config for Leduc Hold'em from the dictionary above.
config = NFSPDQNConfig(
    config_dict=config_dict,
    game_config=LeducHoldemConfig(),
)
# Set after construction: the config log printed by the constructor reports
# "Using default save_intermediate_weights : False", so this overrides it.
config.save_intermediate_weights = True

Using default save_intermediate_weights     : False
Using         training_steps                : 50000
Using default adam_epsilon                  : 1e-06
Using         momentum                      : 0.0
Using         learning_rate                 : 0.1
Using         clipnorm                      : 10.0
Using         optimizer                     : <class 'torch.optim.sgd.SGD'>
Using default weight_decay                  : 0.0
Using         loss_function                 : <utils.utils.KLDivergenceLoss object at 0x35e1ba9b0>
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 128
Using         replay_buffer_size            : 200000.0
Using         min_replay_buffer_size        : 1000
Using         num_minibatches               : 1
Using default training_iterations           : 1
Using default print_interval                : 100
NFSPDQNConfig
Using default save_intermediate_weights     : Fals

In [11]:
from pettingzoo.classic import leduc_holdem_v4
from custom_gym_envs.envs.matching_pennies import (
    env as matching_pennies_env,
    MatchingPenniesGymEnv,
)


# Alternative environment kept for reference (custom matching-pennies game):
# env = matching_pennies_env(render_mode="human", max_cycles=1)
env = leduc_holdem_v4.env()

# Sanity check: show what player_0 observes. The recorded output is a Dict
# holding an 'action_mask' Box and an 'observation' Box.
player_0_observation_space = env.observation_space("player_0")
print(player_0_observation_space)

# Build the NFSP agent on CPU from the environment and the config defined in
# the previous cell; `agent` is used by the training cell that follows.
agent = NFSPDQN(
    env,
    config,
    name="NFSP-LeducHoldem-Rainbow",
    device="cpu",
)

Dict('action_mask': Box(0, 1, (4,), int8), 'observation': Box(0.0, 1.0, (36,), float32))
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
float32
Max size: 200000
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)
making test env
leduc_holdem_v4
<class 'method'>
petting zoo
Observation dimensions: (36,)
Observation dtype: float32
num_actions:  4
Max size: 2000000
(2000000, 36)


In [12]:
# Checkpoint/evaluation cadence for this run. The recorded output below shows
# a test report at training steps 2000, 4000, 6000, ... and each report covers
# 10000 episodes, which matches these two settings.
CHECKPOINT_INTERVAL_STEPS = 2000
CHECKPOINT_TEST_EPISODES = 10000

agent.checkpoint_interval = CHECKPOINT_INTERVAL_STEPS
agent.checkpoint_trials = CHECKPOINT_TEST_EPISODES

# Long-running: kicks off the full training loop (progress is logged via tqdm).
agent.train()

🎯 Initial policies: ['average_strategy', 'average_strategy']


  0%|          | 1/50000 [00:00<2:17:45,  6.05it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 0:
   Player 0 RL buffer: 61/200000
   Player 0 SL buffer: 10/2000000
   Player 1 RL buffer: 63/200000
   Player 1 SL buffer: 6/2000000


  2%|▏         | 1003/50000 [01:01<46:56, 17.39it/s] 

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 1000:
   Player 0 RL buffer: 64875/200000
   Player 0 SL buffer: 7209/2000000
   Player 1 RL buffer: 63247/200000
   Player 1 SL buffer: 6802/2000000


  4%|▍         | 1999/50000 [02:28<1:08:52, 11.61it/s] 

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 2000:
   Player 0 RL buffer: 129984/200000
   Player 0 SL buffer: 13549/2000000
   Player 1 RL buffer: 126138/200000
   Player 1 SL buffer: 13094/2000000
P1 SL Buffer Size:  13549
P1 SL buffer distribution [6974. 4500.  120. 1955.]
P1 actions distribution [0.51472433 0.33212783 0.00885674 0.14429109]
P2 SL Buffer Size:  13094
P2 SL buffer distribution [5874. 5597.   41. 1582.]
P2 actions distribution [0.44860241 0.42744769 0.00313121 0.1208187 ]
   Testing specific player: 0
   At training step: 2000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7026, 0.2783, 0.0191, 0.0000]])
Player 1 Prediction: tensor([[[2.5290e-04, 1.4157e-04, 3.0309e-04, 1.8416e-04, 2.0543e-04,
          2.6430e-04, 2.4201e-04, 5.2354e-03, 8.6208e-03, 3.8661e-04,
          6.3926e-02, 9.0772e-03, 6.9685e-03, 9.1871e-03, 3.4191e-04,
          8.9136e-02, 4.6315e-03, 1.6274e-02, 1.0595e-02, 4.5660e-04,
          7.0242e-0

  4%|▍         | 1999/50000 [02:44<1:08:52, 11.61it/s]


📊 TEST RESULTS SUMMARY
Training step: 2000
Episodes completed: 10000/10000
Total steps: 48283
Average episode length: 4.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5403/10000 (54.0%)
    Average reward: -0.410
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4597/10000 (46.0%)
    Average reward: +0.410
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 9060 (37.6%)
    Action 1: 11050 (45.9%)
    Action 2: 931 (3.9%)
    Action 3: 3038 (12.6%)
  Player 1:
    Action 0: 11592 (47.9%)
    Action 1: 8002 (33.1%)
    Action 2: 0 (0.0%)
    Action 3: 4610 (19.0%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-4099.5, 4099.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.046 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.037 (max=1.0 for random)
    → Playing nearly 

  6%|▌         | 3003/50000 [04:32<55:35, 14.09it/s]    

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 3000:
   Player 0 RL buffer: 194827/200000
   Player 0 SL buffer: 19497/2000000
   Player 1 RL buffer: 189297/200000
   Player 1 SL buffer: 18931/2000000


  8%|▊         | 4000/50000 [05:45<49:53, 15.37it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 4000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 25061/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 24302/2000000
P1 SL Buffer Size:  25061
P1 SL buffer distribution [11270.  9203.  1153.  3435.]
P1 actions distribution [0.44970273 0.36722397 0.04600774 0.13706556]
P2 SL Buffer Size:  24302
P2 SL buffer distribution [ 9749. 10409.   766.  3378.]
P2 actions distribution [0.4011604  0.42831866 0.03152004 0.13900091]


  8%|▊         | 4000/50000 [05:56<49:53, 15.37it/s]

   Testing specific player: 0
   At training step: 4000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[3.8072e-04, 1.4057e-04, 3.6222e-04, 4.7484e-04, 3.4431e-04,
          3.3333e-04, 5.6834e-04, 4.2412e-03, 1.1121e-02, 4.0170e-03,
          3.1537e-02, 1.2691e-02, 2.8386e-02, 7.4226e-02, 1.6076e-03,
          1.0511e-01, 3.3953e-02, 6.1973e-02, 7.5860e-02, 9.0377e-04,
          2.3057e-02, 2.4501e-03, 2.1864e-02, 1.8802e-02, 1.1966e-03,
          1.5968e-01, 3.3009e-04, 2.2917e-02, 2.5245e-02, 4.8260e-03,
          5.1294e-02, 2.0492e-04, 5.1313e-02, 2.5073e-02, 1.2793e-02,
          3.7480e-02, 3.9596e-04, 3.3597e-02, 1.4708e-02, 1.2406e-02,
          1.5104e-02, 3.9697e-03, 7.0558e-03, 3.6170e-03, 2.9515e-04,
          1.7130e-04, 5.7598e-04, 3.5896e-04, 2.0848e-04, 6.5288e-04,
          1.2192e-04],
         [2.5942e-04, 1.2031e-04, 2.1435e-04, 3.0122e-04, 2.3841e-04,
          2.1895e-04, 2.6800e-04, 1.1812e-02, 5.3763e-02, 1.5979e-03,
     

 10%|█         | 5003/50000 [07:35<46:45, 16.04it/s]    

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 5000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 30769/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 29768/2000000


 12%|█▏        | 5999/50000 [08:43<47:53, 15.31it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 6000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 36481/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 35320/2000000
P1 SL Buffer Size:  36481
P1 SL buffer distribution [15267. 14009.  2560.  4645.]
P1 actions distribution [0.41849182 0.38400811 0.07017351 0.12732655]
P2 SL Buffer Size:  35320
P2 SL buffer distribution [13099. 15443.  2018.  4760.]
P2 actions distribution [0.37086636 0.43723103 0.05713477 0.13476784]
   Testing specific player: 0
   At training step: 6000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.5324, 0.2190, 0.2486, 0.0000]])


 12%|█▏        | 5999/50000 [08:56<47:53, 15.31it/s]


📊 TEST RESULTS SUMMARY
Training step: 6000
Episodes completed: 10000/10000
Total steps: 40295
Average episode length: 4.0 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5939/10000 (59.4%)
    Average reward: -0.389
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4061/10000 (40.6%)
    Average reward: +0.389
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 8847 (45.0%)
    Action 1: 7994 (40.7%)
    Action 2: 1331 (6.8%)
    Action 3: 1486 (7.6%)
  Player 1:
    Action 0: 5389 (26.1%)
    Action 1: 10464 (50.7%)
    Action 2: 2297 (11.1%)
    Action 3: 2487 (12.1%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-3889.0, 3889.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.046 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.003 (max=1.0 for random)
    → Playing near

 14%|█▍        | 7002/50000 [10:39<49:00, 14.62it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 7000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 42509/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 40899/2000000


 16%|█▌        | 8000/50000 [11:55<56:02, 12.49it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 8000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 48470/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 46788/2000000
P1 SL Buffer Size:  48470
P1 SL buffer distribution [19432. 19099.  4172.  5767.]
P1 actions distribution [0.40090778 0.39403755 0.08607386 0.11898081]
P2 SL Buffer Size:  46788
P2 SL buffer distribution [16884. 20315.  3523.  6066.]
P2 actions distribution [0.36086176 0.43419253 0.07529708 0.12964863]


 16%|█▌        | 8000/50000 [12:06<56:02, 12.49it/s]

   Testing specific player: 0
   At training step: 8000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[1.1011e-04, 1.3547e-04, 1.3973e-04, 1.8167e-04, 1.6333e-04,
          1.9065e-04, 1.0640e-04, 2.9107e-03, 8.5736e-03, 6.1852e-03,
          9.6457e-03, 7.2477e-03, 8.5234e-03, 2.5288e-02, 1.2831e-03,
          1.9887e-02, 9.4473e-03, 2.8875e-02, 3.7722e-02, 2.6758e-04,
          7.2139e-03, 1.8706e-03, 1.6815e-02, 1.7529e-02, 7.8824e-04,
          2.6258e-01, 1.3283e-04, 4.9642e-02, 4.3902e-02, 2.4443e-03,
          1.6466e-02, 2.4661e-04, 1.1930e-01, 8.6826e-02, 7.7822e-03,
          2.7574e-02, 5.7752e-04, 6.2809e-02, 2.1759e-02, 8.8487e-03,
          2.5047e-02, 9.9897e-03, 3.0522e-02, 1.1424e-02, 1.7854e-04,
          1.4572e-04, 1.4757e-04, 1.5585e-04, 1.2018e-04, 1.8157e-04,
          9.9595e-05],
         [7.3081e-05, 1.0073e-04, 9.6861e-05, 1.3142e-04, 9.4663e-05,
          9.8355e-05, 6.4968e-05, 6.3053e-03, 2.5694e-02, 3.0441e-03,
     

 18%|█▊        | 9003/50000 [13:50<39:14, 17.41it/s]    

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 9000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 54328/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 52583/2000000


 20%|█▉        | 9998/50000 [14:49<30:36, 21.78it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 10000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 60077/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 58030/2000000
P1 SL Buffer Size:  60077
P1 SL buffer distribution [22919. 23902.  6490.  6766.]
P1 actions distribution [0.38149375 0.39785608 0.10802803 0.11262213]
P2 SL Buffer Size:  58030
P2 SL buffer distribution [20309. 24818.  5624.  7279.]
P2 actions distribution [0.34997415 0.42767534 0.09691539 0.12543512]
   Testing specific player: 0
   At training step: 10000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.2333, 0.7475, 0.0192, 0.0000]])
Player 1 Prediction: tensor([[[2.6057e-05, 2.3875e-05, 1.3601e-05, 1.6577e-05, 1.6388e-05,
          6.7108e-06, 1.0016e-05, 8.5761e-04, 7.0554e-04, 2.1799e-04,
          1.0592e-01, 1.9757e-02, 4.4614e-04, 2.3254e-04, 7.3294e-05,
          2.8553e-01, 4.1990e-02, 2.1007e-03, 1.5396e-03, 4.6977e-05,
         

 20%|█▉        | 9998/50000 [15:07<30:36, 21.78it/s]


📊 TEST RESULTS SUMMARY
Training step: 10000
Episodes completed: 10000/10000
Total steps: 34834
Average episode length: 3.5 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5911/10000 (59.1%)
    Average reward: -0.115
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4089/10000 (40.9%)
    Average reward: +0.115
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 7194 (40.4%)
    Action 1: 6807 (38.2%)
    Action 2: 2057 (11.6%)
    Action 3: 1743 (9.8%)
  Player 1:
    Action 0: 4976 (29.2%)
    Action 1: 7391 (43.4%)
    Action 2: 2821 (16.6%)
    Action 3: 1845 (10.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1147.0, 1147.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.059 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.041 (max=1.0 for random)
    → Playing nea

 22%|██▏       | 11004/50000 [16:19<33:07, 19.62it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 11000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 65742/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 63736/2000000


 24%|██▍       | 11999/50000 [17:12<34:14, 18.50it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 12000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 71430/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 69326/2000000
P1 SL Buffer Size:  71430
P1 SL buffer distribution [25690. 28488.  9504.  7748.]
P1 actions distribution [0.35965281 0.39882402 0.13305334 0.10846983]
P2 SL Buffer Size:  69326
P2 SL buffer distribution [23003. 29264.  8660.  8399.]
P2 actions distribution [0.33180913 0.42212157 0.12491706 0.12115224]
   Testing specific player: 0
   At training step: 12000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1707, 0.0601, 0.7691, 0.0000]])


 24%|██▍       | 11999/50000 [17:27<34:14, 18.50it/s]


📊 TEST RESULTS SUMMARY
Training step: 12000
Episodes completed: 10000/10000
Total steps: 29343
Average episode length: 2.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 6027/10000 (60.3%)
    Average reward: -0.291
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 3973/10000 (39.7%)
    Average reward: +0.291
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5254 (36.3%)
    Action 1: 5247 (36.2%)
    Action 2: 2369 (16.4%)
    Action 3: 1615 (11.1%)
  Player 1:
    Action 0: 3162 (21.3%)
    Action 1: 5437 (36.6%)
    Action 2: 4227 (28.4%)
    Action 3: 2032 (13.7%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2913.5, 2913.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.006 (max=1.0 for random)
    → Playing ne

 26%|██▌       | 13003/50000 [18:43<31:20, 19.68it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 13000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 77000/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 75059/2000000


 28%|██▊       | 13999/50000 [19:43<30:53, 19.42it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 14000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 82877/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 80587/2000000
P1 SL Buffer Size:  82877
P1 SL buffer distribution [28319. 32831. 12887.  8840.]
P1 actions distribution [0.34169914 0.39614127 0.15549549 0.10666409]
P2 SL Buffer Size:  80587
P2 SL buffer distribution [25492. 33466. 12046.  9583.]
P2 actions distribution [0.31632894 0.4152779  0.1494782  0.11891496]
   Testing specific player: 0
   At training step: 14000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1507, 0.8400, 0.0093, 0.0000]])
Player 1 Prediction: tensor([[[3.6558e-05, 1.5771e-05, 1.6131e-05, 1.6074e-05, 1.5079e-05,
          3.9979e-06, 1.9384e-05, 4.6944e-04, 3.3724e-04, 1.8264e-04,
          1.1766e-01, 2.9382e-02, 2.8261e-04, 2.1870e-04, 8.0457e-05,
          3.6823e-01, 3.5940e-02, 4.4984e-03, 4.0529e-03, 1.8445e-05,
         

 28%|██▊       | 13999/50000 [19:57<30:53, 19.42it/s]


📊 TEST RESULTS SUMMARY
Training step: 14000
Episodes completed: 10000/10000
Total steps: 27009
Average episode length: 2.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 6069/10000 (60.7%)
    Average reward: -0.242
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 3931/10000 (39.3%)
    Average reward: +0.242
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4520 (34.2%)
    Action 1: 4653 (35.2%)
    Action 2: 2700 (20.4%)
    Action 3: 1345 (10.2%)
  Player 1:
    Action 0: 3115 (22.6%)
    Action 1: 5123 (37.1%)
    Action 2: 4544 (32.9%)
    Action 3: 1009 (7.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2416.5, 2416.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.060 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.016 (max=1.0 for random)
    → Playing nea

 30%|███       | 15002/50000 [21:18<35:23, 16.48it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 15000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 88854/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 86175/2000000


 32%|███▏      | 16000/50000 [22:21<32:56, 17.20it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 16000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 94721/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 91917/2000000
P1 SL Buffer Size:  94721
P1 SL buffer distribution [30928. 37312. 16424. 10057.]
P1 actions distribution [0.32651682 0.39391476 0.17339344 0.10617498]
P2 SL Buffer Size:  91917
P2 SL buffer distribution [28038. 37462. 15674. 10743.]
P2 actions distribution [0.30503607 0.40756335 0.17052341 0.11687718]
   Testing specific player: 0
   At training step: 16000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[6.7277e-05, 5.4895e-05, 6.7586e-05, 9.8493e-05, 5.5821e-05,
          1.0426e-04, 8.3253e-05, 3.5090e-03, 7.9869e-03, 4.9445e-03,
          2.4285e-02, 1.0643e-02, 7.7447e-03, 1.8669e-02, 7.9279e-04,
          6.2263e-02, 1.2224e-02, 1.2727e-02, 1.9632e-02, 1.7077e-04,
          5.9023e-03, 1.6812e-03, 1.0549e-01, 1.3802e-01, 4.8413e-04,
   

 32%|███▏      | 16000/50000 [22:37<32:56, 17.20it/s]


📊 TEST RESULTS SUMMARY
Training step: 16000
Episodes completed: 10000/10000
Total steps: 28053
Average episode length: 2.8 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 6000/10000 (60.0%)
    Average reward: -0.224
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4000/10000 (40.0%)
    Average reward: +0.224
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4411 (31.8%)
    Action 1: 5049 (36.4%)
    Action 2: 2829 (20.4%)
    Action 3: 1591 (11.5%)
  Player 1:
    Action 0: 3034 (21.4%)
    Action 1: 5078 (35.8%)
    Action 2: 4274 (30.2%)
    Action 3: 1787 (12.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2238.0, 2238.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.056 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.007 (max=1.0 for random)
    → Playing ne

 34%|███▍      | 17004/50000 [24:03<33:04, 16.62it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 17000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 100795/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 97732/2000000


 36%|███▌      | 17999/50000 [25:04<31:58, 16.68it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 18000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 106620/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 103567/2000000
P1 SL Buffer Size:  106620
P1 SL buffer distribution [33514. 42509. 19820. 10777.]
P1 actions distribution [0.31433127 0.3986963  0.18589383 0.1010786 ]
P2 SL Buffer Size:  103567
P2 SL buffer distribution [30606. 42032. 19266. 11663.]
P2 actions distribution [0.29551884 0.40584356 0.18602451 0.11261309]
   Testing specific player: 0
   At training step: 18000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.1101, 0.0415, 0.8484, 0.0000]])


 36%|███▌      | 17999/50000 [25:17<31:58, 16.68it/s]


📊 TEST RESULTS SUMMARY
Training step: 18000
Episodes completed: 10000/10000
Total steps: 27474
Average episode length: 2.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5349/10000 (53.5%)
    Average reward: -0.256
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4651/10000 (46.5%)
    Average reward: +0.256
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4218 (30.8%)
    Action 1: 5280 (38.6%)
    Action 2: 3578 (26.1%)
    Action 3: 617 (4.5%)
  Player 1:
    Action 0: 2461 (17.9%)
    Action 1: 6505 (47.2%)
    Action 2: 3675 (26.7%)
    Action 3: 1140 (8.3%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2556.5, 2556.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.053 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.955 (max=1.0 for random)
    → Playing nearl

 38%|███▊      | 19003/50000 [26:46<28:44, 17.97it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 19000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 112676/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 109574/2000000


 40%|████      | 20000/50000 [27:44<27:03, 18.48it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 20000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 119037/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 115519/2000000
P1 SL Buffer Size:  119037
P1 SL buffer distribution [36710. 48062. 23118. 11147.]
P1 actions distribution [0.30839151 0.40375682 0.19420852 0.09364315]
P2 SL Buffer Size:  115519
P2 SL buffer distribution [33247. 46744. 23080. 12448.]
P2 actions distribution [0.28780547 0.40464339 0.19979397 0.10775717]
   Testing specific player: 0
   At training step: 20000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[3.5970e-05, 5.6299e-05, 3.9084e-05, 5.0357e-05, 3.2483e-05,
          4.6934e-05, 4.1063e-05, 7.6609e-04, 3.1464e-03, 2.5851e-03,
          8.5261e-03, 2.2949e-03, 4.3717e-03, 7.1475e-03, 4.5592e-04,
          4.8618e-03, 5.2342e-03, 1.3213e-02, 2.0700e-02, 1.9779e-04,
          2.4964e-03, 2.2190e-03, 5.6688e-02, 5.6068e-02, 4.0925e-04,

 40%|████      | 20000/50000 [27:57<27:03, 18.48it/s]


📊 TEST RESULTS SUMMARY
Training step: 20000
Episodes completed: 10000/10000
Total steps: 26910
Average episode length: 2.7 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5994/10000 (59.9%)
    Average reward: -0.196
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4006/10000 (40.1%)
    Average reward: +0.196
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3641 (27.9%)
    Action 1: 5110 (39.1%)
    Action 2: 2967 (22.7%)
    Action 3: 1336 (10.2%)
  Player 1:
    Action 0: 3311 (23.9%)
    Action 1: 4539 (32.8%)
    Action 2: 4494 (32.4%)
    Action 3: 1512 (10.9%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1956.5, 1956.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.043 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.021 (max=1.0 for random)
    → Playing ne

 42%|████▏     | 21004/50000 [29:14<25:22, 19.04it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 21000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 125315/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 121389/2000000


 44%|████▍     | 22000/50000 [30:10<28:57, 16.12it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 22000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 131641/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 127187/2000000
P1 SL Buffer Size:  131641
P1 SL buffer distribution [40434. 53261. 26597. 11349.]
P1 actions distribution [0.30715355 0.40459279 0.20204192 0.08621174]
P2 SL Buffer Size:  127187
P2 SL buffer distribution [35625. 51786. 26790. 12986.]
P2 actions distribution [0.28009938 0.40716425 0.21063473 0.10210163]
   Testing specific player: 0
   At training step: 22000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[5.1332e-05, 4.8410e-05, 5.1796e-05, 5.3522e-05, 4.4512e-05,
          4.7541e-05, 5.5032e-05, 9.3538e-04, 2.9781e-03, 2.2827e-03,
          1.2516e-02, 5.1158e-03, 3.5949e-03, 9.8910e-03, 8.3315e-04,
          2.6659e-02, 8.3327e-03, 1.9516e-02, 4.0819e-02, 1.3168e-04,
          1.8115e-02, 1.7014e-03, 3.0762e-01, 3.5279e-01, 3.2017e-04,

 44%|████▍     | 22000/50000 [30:27<28:57, 16.12it/s]


📊 TEST RESULTS SUMMARY
Training step: 22000
Episodes completed: 10000/10000
Total steps: 25856
Average episode length: 2.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5308/10000 (53.1%)
    Average reward: -0.117
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4692/10000 (46.9%)
    Average reward: +0.117
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 3890 (30.2%)
    Action 1: 4712 (36.6%)
    Action 2: 3894 (30.2%)
    Action 3: 385 (3.0%)
  Player 1:
    Action 0: 2667 (20.6%)
    Action 1: 6203 (47.8%)
    Action 2: 3686 (28.4%)
    Action 3: 419 (3.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1174.5, 1174.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.052 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.978 (max=1.0 for random)
    → Playing nearly

 46%|████▌     | 23002/50000 [31:50<28:36, 15.73it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 23000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 137931/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 133171/2000000


 48%|████▊     | 24000/50000 [32:54<26:24, 16.41it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 24000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 144056/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 139269/2000000
P1 SL Buffer Size:  144056
P1 SL buffer distribution [44086. 58230. 30235. 11505.]
P1 actions distribution [0.30603376 0.4042178  0.20988366 0.07986477]
P2 SL Buffer Size:  139269
P2 SL buffer distribution [38274. 57183. 30460. 13352.]
P2 actions distribution [0.27482067 0.41059389 0.21871343 0.09587202]
   Testing specific player: 0
   At training step: 24000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.0818, 0.0281, 0.8900, 0.0000]])


 48%|████▊     | 24000/50000 [33:08<26:24, 16.41it/s]


📊 TEST RESULTS SUMMARY
Training step: 24000
Episodes completed: 10000/10000
Total steps: 25642
Average episode length: 2.6 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5202/10000 (52.0%)
    Average reward: -0.081
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4798/10000 (48.0%)
    Average reward: +0.081
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 4071 (31.6%)
    Action 1: 4359 (33.9%)
    Action 2: 4175 (32.5%)
    Action 3: 258 (2.0%)
  Player 1:
    Action 0: 1335 (10.4%)
    Action 1: 7014 (54.9%)
    Action 2: 4069 (31.8%)
    Action 3: 361 (2.8%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-805.5, 805.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.054 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.815 (max=1.0 for random)
    → Mixed strategy
 

 50%|█████     | 25002/50000 [34:52<28:28, 14.63it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 25000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 150574/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 145392/2000000


 52%|█████▏    | 26000/50000 [35:49<20:32, 19.47it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 26000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 157027/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 151968/2000000
P1 SL Buffer Size:  157027
P1 SL buffer distribution [47944. 63475. 33894. 11714.]
P1 actions distribution [0.30532329 0.40422985 0.21584823 0.07459864]
P2 SL Buffer Size:  151968
P2 SL buffer distribution [41442. 62428. 34414. 13684.]
P2 actions distribution [0.27270215 0.41079701 0.22645557 0.09004527]


 52%|█████▏    | 26000/50000 [35:59<20:32, 19.47it/s]

   Testing specific player: 0
   At training step: 26000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.3443, 0.6543, 0.0014, 0.0000]])
Player 1 Prediction: tensor([[[2.6800e-05, 2.1058e-05, 2.2867e-05, 2.5515e-05, 2.3977e-05,
          1.3581e-05, 2.3576e-05, 4.5140e-04, 5.7211e-04, 6.7523e-04,
          9.0520e-02, 1.0784e-02, 5.0890e-04, 8.3196e-04, 2.6990e-04,
          1.5501e-01, 1.5171e-02, 1.2411e-02, 1.3454e-02, 7.9982e-05,
          1.7051e-02, 1.4263e-03, 2.0070e-01, 9.7618e-02, 4.1432e-05,
          1.3723e-01, 2.1635e-05, 2.3065e-02, 2.4525e-02, 7.7925e-04,
          3.5963e-02, 4.1321e-05, 4.4392e-02, 3.1743e-02, 7.3659e-03,
          6.2580e-02, 1.1479e-04, 1.0768e-03, 5.6907e-04, 3.3192e-03,
          8.2943e-03, 3.9656e-04, 3.6355e-04, 2.6960e-04, 2.6067e-05,
          3.0582e-05, 2.5031e-05, 1.5113e-05, 1.3087e-05, 3.0289e-05,
          1.2991e-05],
         [2.0689e-05, 1.8675e-05, 2.0315e-05, 2.6901e-05, 2.7004e-05,
          

 54%|█████▍    | 27002/50000 [37:54<23:49, 16.08it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 27000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 163312/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 158449/2000000


 56%|█████▌    | 28000/50000 [38:56<19:39, 18.65it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 28000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 169858/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 165071/2000000
P1 SL Buffer Size:  169858
P1 SL buffer distribution [51856. 68598. 37514. 11890.]
P1 actions distribution [0.3052903  0.40385498 0.22085507 0.06999965]
P2 SL Buffer Size:  165071
P2 SL buffer distribution [44981. 67860. 38265. 13965.]
P2 actions distribution [0.27249487 0.41109583 0.23180934 0.08459996]


 56%|█████▌    | 28000/50000 [39:10<19:39, 18.65it/s]

   Testing specific player: 0
   At training step: 28000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[4.6081e-05, 3.8611e-05, 5.3027e-05, 7.0011e-05, 4.7705e-05,
          6.7035e-05, 5.9325e-05, 2.0004e-03, 5.5089e-03, 2.9274e-03,
          1.3008e-02, 5.7142e-03, 8.4385e-03, 1.9358e-02, 9.2126e-04,
          2.9218e-02, 1.0807e-02, 1.2468e-02, 1.6162e-02, 1.3802e-04,
          7.5037e-03, 1.7362e-03, 1.8809e-01, 2.4146e-01, 3.3839e-04,
          1.0786e-01, 6.3606e-05, 8.1501e-02, 7.8508e-02, 1.6834e-03,
          3.2494e-02, 1.2350e-04, 3.2988e-02, 2.9406e-02, 3.1681e-03,
          1.7428e-02, 2.2015e-04, 2.7576e-02, 8.9318e-03, 2.2162e-03,
          4.0208e-03, 1.4444e-03, 2.6343e-03, 1.1734e-03, 5.8662e-05,
          6.3697e-05, 6.1353e-05, 5.3677e-05, 5.0039e-05, 6.1345e-05,
          2.8478e-05],
         [2.7567e-05, 3.0115e-05, 2.8981e-05, 3.3545e-05, 2.4487e-05,
          2.3280e-05, 2.8944e-05, 9.4612e-03, 5.4979e-02, 7.7213e-04,
    

 58%|█████▊    | 29004/50000 [40:28<17:55, 19.52it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 29000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 176011/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 171584/2000000


 60%|█████▉    | 29999/50000 [41:27<19:37, 16.98it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 30000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 182249/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 178288/2000000
P1 SL Buffer Size:  182249
P1 SL buffer distribution [55708. 73317. 41129. 12095.]
P1 actions distribution [0.30566972 0.40229027 0.22567476 0.06636525]
P2 SL Buffer Size:  178288
P2 SL buffer distribution [48773. 72976. 42227. 14312.]
P2 actions distribution [0.273563   0.40931527 0.23684712 0.08027461]
   Testing specific player: 0
   At training step: 30000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7610, 0.2159, 0.0231, 0.0000]])
Player 1 Prediction: tensor([[[3.2479e-05, 1.0233e-05, 2.3553e-05, 9.3061e-06, 7.9549e-06,
          1.0592e-05, 1.0969e-05, 1.5100e-04, 1.3781e-04, 8.3734e-04,
          7.9207e-02, 7.3141e-03, 1.6612e-04, 3.3104e-04, 1.0997e-04,
          8.5110e-02, 9.4988e-03, 1.8257e-02, 2.7519e-02, 3.3682e-05,
     

 60%|█████▉    | 29999/50000 [41:40<19:37, 16.98it/s]


📊 TEST RESULTS SUMMARY
Training step: 30000
Episodes completed: 10000/10000
Total steps: 28658
Average episode length: 2.9 steps
Episode length range: 1 - 8

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 6146/10000 (61.5%)
    Average reward: -0.148
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 3854/10000 (38.5%)
    Average reward: +0.148
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5122 (36.7%)
    Action 1: 4988 (35.8%)
    Action 2: 2718 (19.5%)
    Action 3: 1117 (8.0%)
  Player 1:
    Action 0: 4669 (31.7%)
    Action 1: 4797 (32.6%)
    Action 2: 4601 (31.3%)
    Action 3: 646 (4.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1480.0, 1480.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.061 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 1.053 (max=1.0 for random)
    → Playing nearl

 62%|██████▏   | 31004/50000 [43:00<17:53, 17.69it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 31000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 188674/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 184889/2000000


 64%|██████▍   | 32000/50000 [44:55<29:25, 10.20it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 32000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 194995/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 191554/2000000
P1 SL Buffer Size:  194995
P1 SL buffer distribution [59823. 78026. 44828. 12318.]
P1 actions distribution [0.30679248 0.40014359 0.22989307 0.06317085]
P2 SL Buffer Size:  191554
P2 SL buffer distribution [52705. 77817. 46149. 14883.]
P2 actions distribution [0.27514435 0.40624054 0.24091901 0.07769611]


 64%|██████▍   | 32000/50000 [45:12<29:25, 10.20it/s]

   Testing specific player: 0
   At training step: 32000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[3.5493e-05, 6.3733e-05, 7.2401e-05, 7.3476e-05, 5.5658e-05,
          3.9884e-05, 4.0089e-05, 6.6016e-04, 2.2520e-03, 1.4549e-03,
          1.0484e-02, 5.2327e-03, 3.3846e-03, 1.0809e-02, 9.6552e-04,
          1.7343e-02, 5.3172e-03, 1.5323e-02, 2.3037e-02, 9.2249e-05,
          2.6132e-02, 1.6696e-03, 3.3021e-01, 3.6648e-01, 2.8778e-04,
          2.0727e-02, 5.0954e-05, 3.1889e-02, 2.7484e-02, 9.1040e-04,
          3.6671e-02, 8.5496e-05, 1.5986e-02, 1.0955e-02, 3.0529e-03,
          1.1019e-02, 1.1370e-04, 9.0967e-03, 3.3440e-03, 1.2376e-03,
          2.5036e-03, 1.1643e-03, 1.3043e-03, 5.6720e-04, 5.3029e-05,
          1.9932e-05, 6.5696e-05, 2.6250e-05, 3.7326e-05, 6.5358e-05,
          5.7621e-05],
         [2.0481e-05, 4.2166e-05, 3.7791e-05, 4.1560e-05, 3.2486e-05,
          3.3439e-05, 2.0449e-05, 3.6536e-03, 2.3310e-02, 5.3075e-04,
    

 66%|██████▌   | 33002/50000 [47:14<20:03, 14.12it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 33000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 201398/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 198125/2000000


 68%|██████▊   | 34000/50000 [48:21<14:47, 18.03it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 34000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 207694/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 204827/2000000
P1 SL Buffer Size:  207694
P1 SL buffer distribution [63941. 82549. 48621. 12583.]
P1 actions distribution [0.30786157 0.39745491 0.2340992  0.06058432]
P2 SL Buffer Size:  204827
P2 SL buffer distribution [56125. 83062. 49886. 15754.]
P2 actions distribution [0.27401173 0.40552271 0.24355188 0.07691369]


 68%|██████▊   | 34000/50000 [48:33<14:47, 18.03it/s]

   Testing specific player: 0
   At training step: 34000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.0500, 0.0179, 0.9321, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 34000
Episodes completed: 10000/10000
Total steps: 30302
Average episode length: 3.0 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5843/10000 (58.4%)
    Average reward: -0.195
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4157/10000 (41.6%)
    Average reward: +0.195
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5662 (37.4%)
    Action 1: 5573 (36.8%)
    Action 2: 2707 (17.9%)
    Action 3: 1198 (7.9%)
  Player 1:
    Action 0: 4044 (26.7%)
    Action 1: 5362 (35.4%)
    Action 2: 4211 (27.8%)
    Action 3: 1545 (10.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-1948.0, 1948.0]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY 

 70%|███████   | 35004/50000 [49:41<11:32, 21.66it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 35000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 214004/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 211437/2000000


 72%|███████▏  | 36000/50000 [51:25<22:54, 10.19it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 36000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 220422/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 217904/2000000
P1 SL Buffer Size:  220422
P1 SL buffer distribution [68303. 86670. 52415. 13034.]
P1 actions distribution [0.30987379 0.39320032 0.23779387 0.05913203]
P2 SL Buffer Size:  217904
P2 SL buffer distribution [59080. 88608. 53406. 16810.]
P2 actions distribution [0.27112857 0.40663779 0.24508958 0.07714406]
   Testing specific player: 0
   At training step: 36000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[5.7609e-01, 4.2345e-01, 4.6166e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[[7.9092e-05, 1.8598e-05, 5.0803e-05, 1.9160e-05, 3.8035e-05,
          1.9188e-05, 1.0352e-05, 2.1332e-04, 5.1790e-04, 5.1147e-04,
          8.4276e-02, 1.4030e-02, 6.2507e-04, 5.9251e-04, 1.4507e-04,
          1.1335e-01, 9.9330e-03, 1.1038e-02, 1.4366e-02, 2

 72%|███████▏  | 36000/50000 [51:43<22:54, 10.19it/s]


📊 TEST RESULTS SUMMARY
Training step: 36000
Episodes completed: 10000/10000
Total steps: 30203
Average episode length: 3.0 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5080/10000 (50.8%)
    Average reward: -0.254
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4920/10000 (49.2%)
    Average reward: +0.254
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5684 (37.1%)
    Action 1: 5289 (34.5%)
    Action 2: 3517 (22.9%)
    Action 3: 849 (5.5%)
  Player 1:
    Action 0: 2669 (18.0%)
    Action 1: 7161 (48.2%)
    Action 2: 3780 (25.4%)
    Action 3: 1254 (8.4%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2540.5, 2540.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.060 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.952 (max=1.0 for random)
    → Playing nearl

 74%|███████▍  | 37001/50000 [54:04<21:05, 10.27it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 37000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 226804/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 224260/2000000


 76%|███████▌  | 38000/50000 [55:40<20:20,  9.83it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 38000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 233323/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 230616/2000000
P1 SL Buffer Size:  233323
P1 SL buffer distribution [72878. 90625. 56211. 13609.]
P1 actions distribution [0.31234812 0.38841006 0.24091495 0.05832687]
P2 SL Buffer Size:  230616
P2 SL buffer distribution [61746. 93969. 56989. 17912.]
P2 actions distribution [0.26774378 0.40746956 0.24711642 0.07767024]
   Testing specific player: 0
   At training step: 38000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[2.4412e-05, 5.8551e-05, 4.1470e-05, 4.8414e-05, 4.5014e-05,
          5.1687e-05, 8.0795e-05, 1.0041e-03, 4.1744e-03, 2.0111e-03,
          1.1842e-02, 5.4583e-03, 6.9632e-03, 1.9784e-02, 7.7710e-04,
          2.9149e-02, 6.7316e-03, 8.3662e-03, 1.2818e-02, 1.3657e-04,
          1.5385e-02, 3.2878e-03, 2.5205e-01, 2.8081e-01, 3.2106e-04,

 78%|███████▊  | 39005/50000 [57:12<08:18, 22.05it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 39000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 239795/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 237165/2000000


 80%|███████▉  | 39998/50000 [57:57<07:33, 22.08it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 40000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 246157/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 243855/2000000
P1 SL Buffer Size:  246157
P1 SL buffer distribution [77870. 93968. 60084. 14235.]
P1 actions distribution [0.31634282 0.38174011 0.24408812 0.05782895]
P2 SL Buffer Size:  243855
P2 SL buffer distribution [64379. 99915. 60498. 19063.]
P2 actions distribution [0.26400525 0.40973119 0.24809005 0.0781735 ]


 80%|███████▉  | 39998/50000 [58:10<07:33, 22.08it/s]

   Testing specific player: 0
   At training step: 40000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7977, 0.1853, 0.0170, 0.0000]])
Player 1 Prediction: tensor([[[2.5453e-05, 1.7093e-05, 2.3993e-05, 1.3719e-05, 1.8473e-05,
          1.0758e-05, 2.0230e-05, 1.7764e-04, 1.7282e-04, 5.1313e-04,
          6.0125e-02, 5.7199e-03, 2.1191e-04, 3.8765e-04, 1.8463e-04,
          7.1748e-02, 1.0301e-02, 2.0773e-02, 2.1389e-02, 3.1204e-05,
          8.8381e-02, 5.3905e-03, 3.9596e-01, 1.3951e-01, 2.6748e-05,
          1.1724e-02, 1.3419e-05, 7.3140e-03, 1.0952e-02, 9.8866e-04,
          4.5179e-02, 2.2941e-05, 1.6713e-02, 1.1692e-02, 4.4528e-03,
          5.7703e-02, 5.3751e-05, 4.2207e-04, 2.2920e-04, 1.9410e-03,
          8.6422e-03, 3.9691e-04, 2.3067e-04, 8.3713e-05, 1.9475e-05,
          1.2160e-05, 2.0825e-05, 1.2734e-05, 1.3014e-05, 1.6616e-05,
          1.7860e-05],
         [1.2578e-05, 1.8029e-05, 1.3483e-05, 2.0877e-05, 1.1368e-05,
          

 82%|████████▏ | 41002/50000 [59:20<07:07, 21.04it/s]   

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 41000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 252595/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 250487/2000000


 84%|████████▍ | 41998/50000 [1:00:06<06:11, 21.53it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 42000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 258935/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 257056/2000000
P1 SL Buffer Size:  258935
P1 SL buffer distribution [82972. 96940. 64029. 14994.]
P1 actions distribution [0.32043563 0.37437967 0.24727827 0.05790642]
P2 SL Buffer Size:  257056
P2 SL buffer distribution [ 66758. 106088.  63871.  20339.]
P2 actions distribution [0.25970217 0.41270385 0.24847115 0.07912284]


 84%|████████▍ | 41998/50000 [1:00:20<06:11, 21.53it/s]

   Testing specific player: 0
   At training step: 42000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.7975, 0.1859, 0.0166, 0.0000]])
Player 1 Prediction: tensor([[[1.4588e-05, 1.4758e-05, 1.7788e-05, 1.4602e-05, 1.1834e-05,
          9.1625e-06, 1.2255e-05, 1.1630e-04, 1.3799e-04, 3.8989e-04,
          6.8425e-02, 4.5169e-03, 1.5090e-04, 2.6937e-04, 1.4924e-04,
          4.2222e-02, 6.9352e-03, 1.6040e-02, 2.0409e-02, 2.8327e-05,
          1.1623e-01, 3.7174e-03, 4.0363e-01, 1.7155e-01, 2.1327e-05,
          1.2935e-02, 1.1013e-05, 5.6337e-03, 7.2995e-03, 1.2451e-03,
          2.9896e-02, 2.0426e-05, 1.1783e-02, 1.2007e-02, 3.0218e-03,
          4.8010e-02, 4.3198e-05, 2.9975e-04, 1.7902e-04, 2.1691e-03,
          9.6205e-03, 4.0023e-04, 1.8062e-04, 1.1615e-04, 1.3253e-05,
          9.8280e-06, 1.2679e-05, 1.3481e-05, 1.2095e-05, 1.6017e-05,
          8.8261e-06],
         [1.1349e-05, 1.2016e-05, 1.5379e-05, 2.1154e-05, 1.1656e-05,
          

 86%|████████▌ | 43005/50000 [1:01:28<05:26, 21.44it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 43000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 265239/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 263515/2000000


 88%|████████▊ | 43998/50000 [1:02:18<04:54, 20.37it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 44000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 271455/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 270053/2000000
P1 SL Buffer Size:  271455
P1 SL buffer distribution [87922. 99627. 68137. 15769.]
P1 actions distribution [0.32389162 0.36701111 0.25100661 0.05809066]
P2 SL Buffer Size:  270053
P2 SL buffer distribution [ 69072. 112363.  67064.  21554.]
P2 actions distribution [0.25577202 0.41607758 0.24833644 0.07981396]


 88%|████████▊ | 43998/50000 [1:02:31<04:54, 20.37it/s]

   Testing specific player: 0
   At training step: 44000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[0.0356, 0.0087, 0.9557, 0.0000]])

📊 TEST RESULTS SUMMARY
Training step: 44000
Episodes completed: 10000/10000
Total steps: 29171
Average episode length: 2.9 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 4956/10000 (49.6%)
    Average reward: -0.226
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 5044/10000 (50.4%)
    Average reward: +0.226
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5695 (38.4%)
    Action 1: 4558 (30.7%)
    Action 2: 3918 (26.4%)
    Action 3: 679 (4.6%)
  Player 1:
    Action 0: 2399 (16.8%)
    Action 1: 7010 (48.9%)
    Action 2: 3737 (26.1%)
    Action 3: 1175 (8.2%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2261.5, 2261.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY AN

 90%|█████████ | 45004/50000 [1:03:39<04:17, 19.38it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 45000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 277679/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 276424/2000000


 92%|█████████▏| 46000/50000 [1:04:28<03:10, 20.97it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 46000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 283926/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 282935/2000000
P1 SL Buffer Size:  283926
P1 SL buffer distribution [ 92632. 102392.  72475.  16427.]
P1 actions distribution [0.32625402 0.36062918 0.25526017 0.05785662]
P2 SL Buffer Size:  282935
P2 SL buffer distribution [ 71322. 118508.  70219.  22886.]
P2 actions distribution [0.2520791  0.41885239 0.24818068 0.08088784]


 92%|█████████▏| 46000/50000 [1:04:41<03:10, 20.97it/s]

   Testing specific player: 0
   At training step: 46000
🎯 Test policies: ['average_strategy', 'best_response']
Player 1 Prediction: tensor([[[3.8096e-05, 2.7408e-05, 3.7580e-05, 5.3917e-05, 4.6530e-05,
          6.2943e-05, 5.4353e-05, 1.1441e-03, 2.5728e-03, 2.1451e-03,
          1.0889e-02, 5.3265e-03, 7.4188e-03, 1.7352e-02, 7.1017e-04,
          2.6941e-02, 5.6008e-03, 8.0261e-03, 1.2422e-02, 1.2678e-04,
          2.3912e-02, 2.8105e-03, 2.5361e-01, 2.6351e-01, 2.5066e-04,
          7.8963e-02, 6.6670e-05, 7.2632e-02, 6.6689e-02, 3.6830e-03,
          5.1206e-02, 9.6246e-05, 1.8635e-02, 1.4129e-02, 3.3794e-03,
          1.6227e-02, 1.7615e-04, 1.3838e-02, 6.3916e-03, 1.7125e-03,
          2.4005e-03, 1.5891e-03, 1.9091e-03, 8.8117e-04, 4.5950e-05,
          4.1080e-05, 4.8167e-05, 4.4279e-05, 5.2089e-05, 5.2171e-05,
          2.8115e-05],
         [2.2271e-05, 1.8978e-05, 1.5196e-05, 2.3319e-05, 2.0820e-05,
          1.9341e-05, 2.3455e-05, 3.5983e-03, 1.5529e-02, 4.6508e-04,
    

 94%|█████████▍| 47002/50000 [1:05:49<02:22, 21.02it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 47000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 290143/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 289421/2000000


 96%|█████████▌| 47999/50000 [1:06:37<01:38, 20.29it/s]

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 48000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 296156/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 296096/2000000
P1 SL Buffer Size:  296156
P1 SL buffer distribution [ 96992. 105060.  77011.  17093.]
P1 actions distribution [0.32750307 0.35474547 0.26003525 0.0577162 ]
P2 SL Buffer Size:  296096
P2 SL buffer distribution [ 73621. 125022.  73341.  24112.]
P2 actions distribution [0.24863895 0.42223468 0.24769332 0.08143305]
   Testing specific player: 0
   At training step: 48000
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[6.9559e-01, 3.0401e-01, 3.9439e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[[2.0140e-05, 2.1990e-05, 2.0716e-05, 1.6067e-05, 1.5780e-05,
          1.0759e-05, 1.9826e-05, 1.3324e-04, 1.4147e-04, 5.3747e-04,
          4.8001e-02, 5.9434e-03, 1.9454e-04, 6.5305e-04, 2.2346e-04,
          6.1593e-02, 7.1623e-03, 1.7119e-02, 2.172

 96%|█████████▌| 47999/50000 [1:06:51<01:38, 20.29it/s]


📊 TEST RESULTS SUMMARY
Training step: 48000
Episodes completed: 10000/10000
Total steps: 30232
Average episode length: 3.0 steps
Episode length range: 1 - 7

🏆 PLAYER PERFORMANCE:
  Player 0:
    Wins: 5038/10000 (50.4%)
    Average reward: -0.256
    Reward range: -7.0 to +7.0
  Player 1:
    Wins: 4962/10000 (49.6%)
    Average reward: +0.256
    Reward range: -7.0 to +7.0

🎲 ACTION FREQUENCIES:
  Player 0:
    Action 0: 5423 (35.5%)
    Action 1: 5074 (33.2%)
    Action 2: 3648 (23.9%)
    Action 3: 1131 (7.4%)
  Player 1:
    Action 0: 2871 (19.2%)
    Action 1: 5972 (39.9%)
    Action 2: 3474 (23.2%)
    Action 3: 2639 (17.6%)

⚖️  GAME BALANCE ANALYSIS:
  Total rewards per player: [-2556.5, 2556.5]
  Sum of all rewards: 0.0 (should be ~0 for zero-sum games)
  ✅ Game appears balanced (zero-sum)

🧠 STRATEGY ANALYSIS:
  Player 0 strategy entropy: 1.059 (max=1.0 for random)
    → Playing nearly random strategy
  Player 1 strategy entropy: 0.986 (max=1.0 for random)
    → Playing nea

 98%|█████████▊| 49003/50000 [1:08:06<00:49, 19.98it/s]  

   Player 0 ε: 0.0000 → 0.0000

📊 Buffer sizes at step 49000:
   Player 0 RL buffer: 200000/200000
   Player 0 SL buffer: 302193/2000000
   Player 1 RL buffer: 200000/200000
   Player 1 SL buffer: 302743/2000000


100%|██████████| 50000/50000 [1:08:59<00:00, 12.08it/s]


   Testing specific player: 0
   At training step: 49999
🎯 Test policies: ['average_strategy', 'best_response']
Player 0 Prediction: tensor([[7.0547e-01, 2.9397e-01, 5.6024e-04, 0.0000e+00]])
Player 1 Prediction: tensor([[[1.7324e-05, 2.3145e-05, 2.1082e-05, 1.9964e-05, 6.8945e-06,
          5.6234e-06, 9.0555e-06, 1.3760e-04, 8.8924e-05, 4.7290e-04,
          4.5770e-02, 3.5553e-03, 1.9571e-04, 4.0958e-04, 1.0254e-04,
          6.0145e-02, 6.9117e-03, 1.4872e-02, 1.6750e-02, 2.6700e-05,
          1.2332e-01, 3.4675e-03, 4.1755e-01, 1.4741e-01, 3.0349e-05,
          9.7496e-03, 1.0192e-05, 1.1177e-02, 1.1403e-02, 1.8328e-03,
          3.6821e-02, 3.9972e-05, 1.4431e-02, 1.2398e-02, 3.3394e-03,
          4.2790e-02, 4.0599e-05, 2.7584e-04, 2.1474e-04, 2.3125e-03,
          1.0945e-02, 5.3703e-04, 1.5796e-04, 1.1017e-04, 1.4933e-05,
          1.1742e-05, 8.8214e-06, 1.6140e-05, 2.1943e-05, 9.8688e-06,
          1.1937e-05],
         [1.1016e-05, 7.8675e-06, 3.0418e-05, 1.1255e-05, 1.1574