In [None]:
from nfsp_agent_clean import NFSPDQN
from agent_configs import NFSPDQNConfig, RainbowConfig
from game_configs import MississippiMarblesConfig, LeducHoldemConfig, TicTacToeConfig
from utils import KLDivergenceLoss, CategoricalCrossentropyLoss, HuberLoss, MSELoss
from torch.optim import Adam, SGD

# Hyperparameters for the NFSP-DQN run. Trailing "alt:" notes carry over the
# alternative values the author recorded next to each setting.
config_dict = dict(
    # --- NFSP-level settings ---
    shared_networks_and_buffers=True,
    # Author's note: the paper uses roughly 2-5M steps; 1M is for an initial
    # test (to see whether Rainbow is faster sooner).
    training_steps=1_000_000,
    num_players=2,
    anticipatory_param=0.1,
    replay_interval=128,  # alt: 1
    num_minibatches=2,  # alt: 2, 4
    # --- RL (best-response) network and optimizer ---
    activation="relu",
    kernel_initializer="he_normal",
    # Author's note: possibly should be lower for distributional heads (loss
    # magnitudes are similar, but atom_size adds many more weights). 0.1 was
    # used for DQN, so DQN / 4 = 0.025 was suggested for Rainbow; the initial
    # Rainbow test used 0.1.
    learning_rate=0.05,
    clipnorm=10.0,
    optimizer=SGD,
    loss_function=KLDivergenceLoss(),
    # --- prioritized replay ---
    per_alpha=0.6,
    per_beta=0.5,
    per_epsilon=1e-6,
    training_iterations=1,
    min_replay_buffer_size=128,
    minibatch_size=128,
    replay_buffer_size=200_000,
    transfer_interval=300,  # alt: 100
    n_step=3,
    atom_size=51,
    # --- RL network shape ---
    # Convolutional alternative noted by the author:
    #   conv_layers=[(64, 3, 1), (32, 2, 1)], dense_layer_widths=[],
    #   value_hidden_layer_widths=[], advantage_hidden_layer_widths=[]
    conv_layers=[],
    dense_layer_widths=[64],
    dueling=True,
    value_hidden_layer_widths=[],
    advantage_hidden_layer_widths=[],
    noisy_sigma=0.0,
    # --- epsilon-greedy exploration ---
    eg_epsilon=0.06,  # alt: 0
    eg_epsilon_decay_type="inverse_sqrt",
    eg_epsilon_decay_final_step=1_000_000,
    # --- SL (average-strategy) network and optimizer ---
    sl_activation="relu",
    sl_kernel_initializer="he_normal",
    sl_learning_rate=0.005,  # alt: 0.00005
    sl_clipnorm=10.0,
    sl_optimizer=SGD,
    sl_loss_function=CategoricalCrossentropyLoss(),
    sl_training_iterations=1,
    sl_min_replay_buffer_size=128,
    sl_minibatch_size=128,
    sl_replay_buffer_size=2_000_000,
    # Convolutional alternative noted by the author:
    #   sl_conv_layers=[(64, 3, 1), (32, 2, 1)], sl_dense_layer_widths=[]
    sl_conv_layers=[],
    sl_dense_layer_widths=[64],
)

# Alternative game setups kept for reference:
# config = NFSPConfig(config_dict=config_dict, game_config=MississippiMarblesConfig(), rl_config_type=RainbowConfig)
# config = NFSPDQNConfig(config_dict=config_dict, game_config=TicTacToeConfig(), rl_config_type=RainbowConfig)
config = NFSPDQNConfig(config_dict=config_dict, game_config=LeducHoldemConfig())

In [None]:
from pathlib import Path

# Author's run-tracking notes (presumably checkpoint run numbers -- confirm):
# TODO: 8, 9, 10, 11, 12, 13
# DONE: 14
# Named `checkpoint_dir` rather than `dir` so the `dir()` builtin is not
# shadowed for the rest of the kernel session.
checkpoint_dir = "./checkpoints/bad_test_func/NFSPDQN-LeducHoldem-8"
# Rebinds `config` to the configuration stored alongside that checkpoint.
config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))

In [None]:
import custom_gym_envs
import gymnasium as gym

# Alternative environments kept for reference:
#   gym.make('custom_gym_envs/MississippiMarbles-v0', render_mode="human", players=2)
#   gym.make('custom_gym_envs/TicTacToe-v0', render_mode="rgb_array", player_turn_as_plane=False)
env = gym.make(
    "custom_gym_envs/LeducHoldem-v0",
    encode_player_turn=True,
)

# Build the NFSP agent on CPU from the environment and the active `config`.
agent = NFSPDQN(
    env,
    config,
    name="NFSPDQN-LeducHoldem",
    device="cpu",
)

In [None]:
# Restore the agent from the step-685000 checkpoint. The path variable is
# named `checkpoint_dir` rather than `dir` so the `dir()` builtin stays usable.
checkpoint_dir = "./checkpoints/NFSPDQN-LeducHoldem"
agent.load_from_checkpoint(checkpoint_dir, 685000)

In [3]:
# Set the checkpointing options on the live agent, then start training.
agent.config.save_intermediate_weights = True
agent.checkpoint_interval = 1000
agent.checkpoint_trials = 500

agent.train()

Prediction tensor([[[4.3099e-06, 4.0031e-06, 5.3373e-06, 1.5358e-05, 7.6363e-06,
          6.3296e-06, 6.9780e-06, 8.0038e-05, 1.8200e-04, 2.8943e-04,
          5.9641e-02, 4.2319e-02, 1.4995e-03, 1.3130e-03, 6.1614e-04,
          9.4977e-02, 3.8871e-02, 1.7291e-03, 3.0033e-03, 8.0180e-04,
          8.8404e-03, 4.5527e-03, 1.3290e-01, 1.5752e-01, 2.4039e-03,
          1.2396e-01, 4.5970e-06, 3.0675e-02, 3.0705e-02, 1.0751e-02,
          6.4136e-02, 6.7470e-04, 2.1730e-03, 1.5025e-03, 3.8679e-02,
          9.1927e-02, 8.1480e-04, 2.3887e-03, 1.7303e-03, 2.0010e-02,
          2.6195e-02, 9.4104e-04, 8.8558e-04, 2.2159e-04, 1.1466e-05,
          8.2029e-06, 4.1876e-06, 4.1765e-06, 2.6876e-06, 1.4305e-06,
          1.1203e-05],
         [3.4438e-06, 5.7219e-06, 8.1395e-06, 9.3008e-06, 2.2087e-05,
          1.1508e-05, 1.0576e-05, 2.6313e-03, 8.2057e-03, 6.3442e-03,
          5.4610e-03, 5.4312e-03, 1.7026e-02, 4.2652e-02, 1.1457e-02,
          3.0526e-02, 1.6175e-02, 6.4437e-02, 8.6521e-02

In [None]:
from pathlib import Path
import yaml
from utils import plot_comparisons
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs

# One row per training run to compare:
#   (checkpoint directory, checkpoint step to load, encode_player_turn flag).
# The order of the rows is the order of the entries in `stats_list`.
CHECKPOINT_RUNS = [
    ("./checkpoints/NFSPDQN-LeducHoldem-7", 525000, False),
    ("./checkpoints/NFSPDQN-LeducHoldem-8", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-9", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-10", 275000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-11", 525000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-12", 250000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem-13", 85000, True),
    ("./checkpoints/NFSPDQN-LeducHoldem", 685000, True),
]

stats_list = []

# `env`, `config` and `agent` are deliberately plain top-level names: after the
# loop they stay bound to the last run, which later cells read.
for checkpoint_dir, checkpoint_step, encode_player_turn in CHECKPOINT_RUNS:
    # Each run gets its own environment, built with the flag listed for it.
    env = gym.make(
        "custom_gym_envs/LeducHoldem-v0", encode_player_turn=encode_player_turn
    )
    # Rebuild the agent from the config saved with the run, then restore the
    # requested checkpoint and keep only its recorded stats.
    config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))
    agent = NFSPDQN(env, config, name="NFSPDQN-LeducHoldem", device="cpu")
    agent.load_from_checkpoint(checkpoint_dir, checkpoint_step)
    stats_list.append(agent.stats)

In [None]:
# Legend for the entries of stats_list, by index (author's notes):
# 0: Default
# 1: Default Shared
# 2: PER + Shared
# 3: Dueling + PER + Shared
# 4: Distributional + Dueling + PER + Shared
# 5: Distributional + Dueling + PER + Shared + LR 0.05
# NOTE(review): this legend names six entries (0-5), but eight runs are
# appended to stats_list; confirm what indices 6 and 7 are, and that the
# order still matches, before reading the plot.
plot_comparisons(stats_list, "NFSPDQN-LeducHoldem")

In [None]:
from agent_configs import NFSPDQNConfig
from nfsp_agent_clean import NFSPDQN
import gymnasium as gym
import custom_gym_envs
from pathlib import Path

# Both agents are restored from the same checkpoint; they differ only in the
# `policies` list assigned to them afterwards.
CHECKPOINT_DIR = "./checkpoints/NFSPDQN-LeducHoldem"
CHECKPOINT_STEP = 685000


def load_leduc_agent(
    policies, checkpoint_dir=CHECKPOINT_DIR, checkpoint_step=CHECKPOINT_STEP
):
    """Build an NFSPDQN agent on a fresh Leduc Hold'em env and restore it.

    Args:
        policies: Sequence of policy names assigned to ``agent.policies``
            (one entry per player in this notebook).
        checkpoint_dir: Directory holding ``configs/config.yaml`` and the
            checkpoint to restore.
        checkpoint_step: Checkpoint step passed to ``load_from_checkpoint``.

    Returns:
        The restored agent with ``policies`` set.
    """
    # Every agent gets its own environment instance.
    agent_env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
    agent_config = NFSPDQNConfig.load(Path(checkpoint_dir, "configs/config.yaml"))
    loaded_agent = NFSPDQN(
        agent_env, agent_config, name="NFSPDQN-LeducHoldem", device="cpu"
    )
    loaded_agent.load_from_checkpoint(checkpoint_dir, checkpoint_step)
    loaded_agent.policies = list(policies)
    return loaded_agent


# the test agent
test_agent = load_leduc_agent(["average_strategy", "average_strategy"])
# the challenger agent
challenger_agent = load_leduc_agent(["best_response", "best_response"])

In [None]:
import copy


NUM_TRIALS = 5000
# Labels used when echoing the chosen action; any other id prints "check".
ACTION_NAMES = {0: "call", 1: "raise", 2: "fold"}


def play_matches(env, seat_agents, test_player, num_trials=NUM_TRIALS, verbose=True):
    """Play repeated games and return the opponent's average reward per game.

    Args:
        env: Environment following the Gymnasium ``reset``/``step`` API whose
            ``step`` returns a per-player reward sequence.
        seat_agents: The two agents in acting order; ``seat_agents[0]`` acts
            first in every pass of the turn loop.
        test_player: Index into the reward sequence of the agent under test.
            Its reward is subtracted from the summed reward, so what is
            accumulated is the reward of everyone else.
        num_trials: Number of games to play; must be positive.
        verbose: When True, print the trial number, each prediction and the
            chosen action name.

    Returns:
        The accumulated non-test-player reward divided by ``num_trials``.

    Raises:
        ValueError: If ``num_trials`` is not positive.
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be positive")

    opponent_total = 0
    for trial in range(num_trials):
        if verbose:
            print("Trial ", trial)
        state, info = env.reset()
        done = False
        while not done:
            # NOTE(review): this assumes the two seats strictly alternate,
            # starting with seat 0 -- confirm against the env's turn order.
            for seat_agent in seat_agents:
                prediction = seat_agent.predict(state, info)
                action = seat_agent.select_actions(prediction, info).item()
                if verbose:
                    print("Prediction", prediction)
                    print(ACTION_NAMES.get(action, "check"))
                state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                # Reward is added after every step, not only at the end.
                opponent_total += sum(reward) - reward[test_player]
                if done:
                    break
    return opponent_total / num_trials


# Test agent in seat 0, challenger in seat 1.
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
score = play_matches(env, (test_agent, challenger_agent), test_player=0)

# Seats swapped: challenger in seat 0, test agent in seat 1.
env = gym.make("custom_gym_envs/LeducHoldem-v0", encode_player_turn=True)
score += play_matches(env, (challenger_agent, test_agent), test_player=1)
print(score)

In [None]:
# Manual walkthrough of one game: player_1 picks its action from its
# best-response policy, and a hard-coded action is fed to the env in between.
player_1 = agent.nfsp_agents[0]
player_2 = agent.nfsp_agents[1]
player_1.policy = "best_response"

state, info = env.reset()
print(state)

# Hard-coded actions stepped after each of player_1's moves; None means the
# walkthrough stops right after player_1's move.
# NOTE(review): confirm action ids 4 and 7 are valid for this environment.
scripted_replies = (4, 7, 3, None)

for turn_index, scripted_action in enumerate(scripted_replies):
    prediction = player_1.predict(state, info)
    action = player_1.select_actions(prediction, info).item()
    state, reward, terminated, truncated, info = env.step(action)
    print(action)
    print(state)

    if scripted_action is None:
        break

    state, reward, terminated, truncated, info = env.step(scripted_action)
    print(state)
    # Only the third scripted step also reports the termination flag.
    if turn_index == 2:
        print(terminated)

In [None]:
# Print the reset observation, take action 0, then print the new observation
# followed by the reset observation again so the two can be compared.
state, info = env.reset()
print(state)

state_2, reward, terminated, truncated, info = env.step(0)
print(state_2, state, sep="\n")

In [None]:
# Draw one sample from the first NFSP agent's RL replay buffer and print it.
first_player_buffer = agent.nfsp_agents[0].rl_agent.replay_buffer
samples = first_player_buffer.sample()
print(samples)

In [None]:
import torch

# Toy check of legal-move masking: illegal actions get -inf before the argmax.
q_values = torch.tensor([[1, 0, 0, 0.5, -1], [-1, 1, 1, 1, -1]])
legal_moves = [[0, 1, 3, 4], [2, 3, 4]]

# One row per batch entry: 1 where the action index is legal, 0 elsewhere.
num_actions = q_values.shape[1]
mask = torch.tensor(
    [
        [int(action in row_legal) for action in range(num_actions)]
        for row_legal in legal_moves
    ],
    dtype=torch.int8,
)
print(mask)

q_values = q_values.masked_fill(mask == 0, float("-inf"))
selected_actions = q_values.argmax(dim=1)
print(q_values)