In [1]:
from agent_configs.alphazero_config import AlphaZeroConfig
from game_configs.tictactoe_config import TicTacToeConfig
from alphazero_agent import AlphaZeroAgent
import gymnasium as gym
import numpy as np
import custom_gym_envs
from torch.optim import Adam, SGD


class ClipReward(gym.RewardWrapper):
    def __init__(self, env, min_reward, max_reward):
        super().__init__(env)
        self.min_reward = min_reward
        self.max_reward = max_reward
        self.reward_range = (min_reward, max_reward)

    def reward(self, reward):
        return np.clip(reward, self.min_reward, self.max_reward)


# env = ClipReward(gym.wrappers.AtariPreprocessing(gym.make("MsPacmanNoFrameskip-v4", render_mode="rgb_array"), terminal_on_life_loss=True), -1, 1) # as recommended by the original paper, should already include max pooling
env = gym.make("custom_gym_envs/Connect4-v0", render_mode="rgb_array")
# env = gym.make("MsPacmanNoFrameskip-v4", render_mode="rgb_array")
# env = gym.wrappers.FrameStack(env, 4)


# self.games_per_generation: int = self.parse_field("games_per_generation", 100)
# self.value_loss_factor: float = self.parse_field("value_loss_factor", 1.0)
# self.weight_decay: float = self.parse_field("weight_decay", 1e-4)

# # MCTS
# self.root_dirichlet_alpha: float = self.parse_field(
#     "root_dirichlet_alpha", required=False
# )
# if self.root_dirichlet_alpha is None:
#     print("Root dirichlet alpha should be defined to a game specific value")
# self.root_exploration_fraction: float = self.parse_field(
#     "root_exploration_fraction", 0.25
# )
# self.num_simulations: int = self.parse_field("num_simulations", 800)
# self.num_sampling_moves: int = self.parse_field("num_sampling_moves", 30)
# self.exploration_temperature: float = self.parse_field(
#     "exploration_temperature", 1.0
# )
# self.exploitation_temperature: float = self.parse_field(
#     "exploitation_temperature", 0.1
# )
# self.clip_low_prob: float = self.parse_field("clip_low_prob", 0.0)
# self.pb_c_base: int = self.parse_field("pb_c_base", 19652)
# self.pb_c_init: float = self.parse_field("pb_c_init", 1.25)

config = {
    "optimizer_function": Adam,
    "learning_rate": 0.002,  #
    "adam_epsilon": 1e-8,
    # "momentum": 0.9,
    "clipnorm": 0.5,
    # NORMALIZATION?
    # REWARD CLIPPING
    "training_steps": 100,  #
    "residual_layers": [(128, 3, 1)] * 20,  #
    "critic_conv_layers": [(32, 3, 1)],  #
    "critic_widths": [],  #
    "actor_conv_layers": [(32, 3, 1)],  #
    "actor_widths": [],  #
    "memory_size": 1600,  # 500,000 /  44,000,000 / 24,000,000 / 21,000,000
    "minibatch_size": 1024,  #
    "root_dirichlet_alpha": 1.0,  #
    "root_exploration_fraction": 0.25,
    "pb_c_init": 1.25,
    "pb_c_base": 19652,
    "num_simulations": 800,
    "weight_decay": 1e-4,
    "num_sampling_moves": 30,  #
    "loss_function": None,
    "games_per_generation": 32,  #
}

config = AlphaZeroConfig(config, TicTacToeConfig())

agent = AlphaZeroAgent(env, config, name="alphazero", device="cpu")

Box(0.0, 1.0, (3, 6, 7), float64)
Using default save_intermediate_weights     : False
Using         training_steps                : 100
Using         adam_epsilon                  : 1e-08
Using default momentum                      : 0.9
Using         learning_rate                 : 0.002
Using         clipnorm                      : 0.5
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using         weight_decay                  : 0.0001
Using         loss_function                 : None
Using default activation                    : relu
Using         kernel_initializer            : None
Using         minibatch_size                : 1024
Using default replay_buffer_size            : 5000
Using default min_replay_buffer_size        : 1024
Using default num_minibatches               : 1
Using default training_iterations           : 1
Using         residual_layers               : [(128, 3, 1), (128, 3, 1), (128, 3, 1), (128, 3, 1), (128, 3, 1), (128, 3, 1), (1

  logger.warn(


In [2]:
agent.checkpoint_interval = 1
agent.train()

Training Step  1
Training Game  1
Predicted Policy  tensor([0.0891, 0.0803, 0.1537, 0.1790, 0.1757, 0.1098, 0.2124],
       device='mps:0', grad_fn=<SelectBackward0>)
Predicted Value  tensor(0.2949, device='mps:0', grad_fn=<SelectBackward0>)


  logger.warn(


Target Policy [0.03375    0.03       0.04875    0.04       0.73874998 0.04
 0.06875   ]
Temperature Policy  [0.03375 0.03    0.04875 0.04    0.73875 0.04    0.06875]
Action  4
Predicted Policy  tensor([0.1220, 0.0948, 0.2011, 0.1220, 0.1437, 0.1371, 0.1794],
       device='mps:0', grad_fn=<SelectBackward0>)
Predicted Value  tensor(0.2475, device='mps:0', grad_fn=<SelectBackward0>)
Target Policy [0.07125    0.035      0.0525     0.05875    0.06125    0.04
 0.68124998]
Temperature Policy  [0.07125 0.035   0.0525  0.05875 0.06125 0.04    0.68125]
Action  6
Predicted Policy  tensor([0.1068, 0.0889, 0.1941, 0.1076, 0.1372, 0.1540, 0.2114],
       device='mps:0', grad_fn=<SelectBackward0>)
Predicted Value  tensor(0.1075, device='mps:0', grad_fn=<SelectBackward0>)
Target Policy [0.04875    0.03375    0.04625    0.03625    0.04125    0.0425
 0.75125003]
Temperature Policy  [0.04875 0.03375 0.04625 0.03625 0.04125 0.0425  0.75125]
Action  6
Predicted Policy  tensor([0.1126, 0.1050, 0.1471, 0.14

KeyboardInterrupt: 

In [None]:
array = np.zeros((3, 9))
array[0] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
print(array)

In [None]:
import gymnasium as gym

env = gym.make("custom_gym_envs/TicTacToe-v0")

state, info = env.reset()
agent.predict_no_mcts(state, info)

In [None]:
import gymnasium as gym

env = gym.make("custom_gym_envs/TicTacToe-v0")

state, info = env.reset()
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
env.render()
state, reward, terminated, truncated, info = env.step(0)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(4)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(3)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(6)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(2)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(1)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
env.render()
state, reward, terminated, truncated, info = env.step(7)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(8)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(5)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
print("Truncated:", truncated)
env.render()


env.reset()
state, reward, terminated, truncated, info = env.step(0)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(3)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(7)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(4)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(2)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(6)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
state, reward, terminated, truncated, info = env.step(1)
print(state)
print("Turn: ", state[2][0][0])
print("Legal moves: ", info["legal_moves"])
print("Terminated:", terminated)
print("Truncated:", truncated)
print("Reward:", reward)

In [None]:
from alphazero_agent import AlphaZeroAgent
from agent_configs import AlphaZeroConfig
from game_configs import TicTacToeConfig

# from alphazero_agent import AlphaZeroAgent
import gymnasium as gym
import numpy as np
import custom_gym_envs


class ClipReward(gym.RewardWrapper):
    def __init__(self, env, min_reward, max_reward):
        super().__init__(env)
        self.min_reward = min_reward
        self.max_reward = max_reward
        self.reward_range = (min_reward, max_reward)

    def reward(self, reward):
        return np.clip(reward, self.min_reward, self.max_reward)


# env = ClipReward(gym.wrappers.AtariPreprocessing(gym.make("MsPacmanNoFrameskip-v4", render_mode="rgb_array"), terminal_on_life_loss=True), -1, 1) # as recommended by the original paper, should already include max pooling
# env = TicTacToeEnv(render_mode="rgb_array")
# env = gym.make("MsPacmanNoFrameskip-v4", render_mode="rgb_array")
# env = gym.wrappers.FrameStack(env, 4)
env = gym.make("custom_gym_envs/Connect4-v0", render_mode="rgb_array")


# MODEL SEEMS TO BE UNDERFITTING SO TRY AND GET IT TO OVERFIT THEN FIND A HAPPY MEDIUM
# 1. INCREASE THE NUMBER OF RESIDUAL BLOCKS
# 2. INCREASE THE NUMBER OF FILTERS
# 3. DECREASE REGULARIZATION
# 4. TRY DECREASING LEARNING RATE (maybe its that whole thing where the policy goes to like 1 0 0 0 0... etc and then goes back on the third training step, so maybe the learning rate is too high)
# 5. TO OVERFIT USE LESS DATA (but that is probably just a bad idea)
# config = {
#         'activation': 'relu',
#         'kernel_initializer': 'glorot_uniform',
#         'optimizer': tf.keras.optimizers.legacy.Adam,
#         'learning_rate': 0.001, # 0.00001 could maybe increase by a factor of 10 or 100 and try to do some weights regularization
#         'adam_epsilon': 3.25e-6,
#         'clipnorm': None,
#         # NORMALIZATION?
#         # REWARD CLIPPING
#         'training_steps': 40,
#         'num_filters': 256,
#         'kernel_size': 3,
#         'stride': 1,
#         'num_res_blocks': 20,
#         'critic_conv_filters': 32, # 1
#         'critic_conv_layers': 1,
#         'critic_dense_size': 256,
#         'critic_dense_layers': 1,
#         'actor_conv_filters': 32, #
#         'actor_conv_layers': 1,
#         'actor_dense_size': 0,
#         'actor_dense_layers': 0,
#         'replay_buffer_size': 800, # IN GAMES
#         'replay_batch_size': 50, # IN MOVES
#         'root_dirichlet_alpha': 0.5, # 2 in theory?
#         'root_exploration_fraction': 0, # 0.25 in paper
#         'pb_c_base': 500,
#         'pb_c_init': 2,
#         'num_simulations': 200,
#         # 'two_player': True,
#         'weight_decay': 0.00, # could try setting this to something other than 0 and increasing learning rate
#         'num_sampling_moves': 0,
#         'initial_temperature': 1,
#         'exploitation_temperature': 0.1,
#         'value_loss_factor': 1, # could try setting this to something other than 1
#         'games_per_generation': 10, # times 8 from augmentation
#     }

config = {
    "activation": "relu",
    "kernel_initializer": "glorot_uniform",
    "optimizer": tf.keras.optimizers.legacy.Adam,
    "learning_rate": 0.0005,  # 0.0001 # 0.00001 could maybe increase by a factor of 10 or 100 and try to do some weights regularization
    "number_of_lr_cycles": 1,  # this will determine the step size based on training steps
    # STILL ADD A SCHEDULE FOR BASE LEARNING RATE (MIN LEARNING RATE)
    "adam_epsilon": 3.25e-6,
    "clipnorm": None,
    # NORMALIZATION?
    # REWARD CLIPPING
    "training_steps": 100,  # alpha zero did 700,000, the lessons from alpha zero did 40 generations but 1000 batches per generation, so 40,000 batches (they just had a cyclical learning rate per generation (also they trained twice on the same data every generation))
    "num_filters": 256,
    "kernel_size": 3,
    "stride": 1,
    "residual_blocks": 20,
    "critic_conv_filters": 32,  # 1
    "critic_conv_layers": 1,
    "critic_dense_size": 256,
    "critic_dense_layers": 1,
    "actor_conv_filters": 32,  #
    "actor_conv_layers": 1,
    "actor_dense_size": 0,
    "actor_dense_layers": 0,
    "replay_buffer_size": 100,  # IN GAMES
    "minibatch_size": 24,  # SHOULD BE ROUGHLY SAME AS AVERAGE MOVE PER GENERATION (SO LIKE 7 TIMES NUMBER OF GAMES PLAYED PER GENERATION) <- what was used in the original paper (they played 44M games, 50 moves per game and sampled 700,000 minibatches of size 4096 (so thats like sampling 1 time per move roughly but this was also happening with parrallel data collection i believe))
    "games_per_generation": 1,  # times 8 from augmentation
    "root_dirichlet_alpha": 2.5,  # Less than 1 more random, greater than one more flat # 2 in theory? # 0.3 in alphazero for chess # TRY CHANGING (MAYBE LOWER? (IT SEEMS TO PLAY THE SAME LINE OVER AND OVER AGAIN <- so we want a lesss flat distribution maybe)
    "root_exploration_fraction": 0.25,  # 0.25 in paper
    "pb_c_base": 20000,  # Seems unimportant to be honest (increases puct the more simulations there are)
    "pb_c_init": 1.25,  # 1.25 in paper # MAYBE HIGHER? (IT SEEMS TO PLAY THE SAME LINE OVER AND OVER AGAIN)
    "num_simulations": 50,  # INCREASE THIS since the model is missing 1 move wins (and also 2 and 3 move wins (it wins by luck)))
    # 'two_player': True,
    "weight_decay": 0.00001,  # could try setting this to something other than 0 and increasing learning rate
    "num_sampling_moves": 30,
    "exploration_temperature": 1,
    "exploitation_temperature": 0.1,
    "value_loss_factor": 1,  # could try setting this to something other than 1
    "loss_function": None,
}

config = AlphaZeroConfig(config, TicTacToeConfig())

agent = AlphaZeroAgent(env, config, "alphazero")

In [None]:
agent.checkpoint_interval = 1
agent.train()

In [None]:
agent.model.load_weights("./alphazero.keras")
agent.train()