# Ultimate TTT Model

## Setup

In [1]:
import tensorflow as tf
import keras
from keras import layers
import scipy.signal

# used for fps logging
from datetime import datetime

# sanity check: display the devices TensorFlow can see
# (the output recorded for this cell lists one CPU and one GPU)
tf.config.list_physical_devices()

2023-08-17 21:36:35.331523: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-17 21:36:37.085484: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-17 21:36:37.110091: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import configparser

# Parse the settings from train.ini (the path is relative to the working directory).
# NOTE: ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so the displayed ['train.ini'] is the confirmation
# that the file was found.
config = configparser.ConfigParser()
config.read("train.ini")

['train.ini']

## Action Sampling

In [3]:

# Stochastic policy: draw an action from the actor's logits
@tf.function
def sample_action_value(observation, model: keras.Model):
    """
    Run the model in inference mode and sample an action from its logits.

    Returns the logits, the sampled action(s) and the value output, in that order.
    """
    policy_logits, state_value = model(observation, training=False)
    # one categorical draw per row of logits, then drop the sample axis
    draws = tf.random.categorical(policy_logits, 1)
    sampled_action = tf.squeeze(draws, axis=1)
    return policy_logits, sampled_action, state_value


@tf.function
def best_action_value(observation, model: keras.Model):
    """
    Run the model in inference mode and pick the highest-logit action.

    Returns the logits, the argmax action(s) over the last axis and the value
    output, in that order.
    """
    policy_logits, state_value = model(observation, training=False)
    greedy_action = tf.argmax(policy_logits, axis=-1)
    return policy_logits, greedy_action, state_value

@tf.function
def best_action(observation, model: keras.Model):
    """
    Return the highest-logit action for the first entry of the batch.

    The value output of the model is discarded.
    """
    policy_logits, _unused_value = model(observation, training=False)
    greedy_actions = tf.argmax(policy_logits, axis=-1)
    return greedy_actions[0]


## Env

In [4]:
# Env Constants
# every value below comes from train.ini; the [ENV] section is looked up once
_env_section = config["ENV"]

# cap on the number of agent moves per game
MAX_TIMESTEPS = _env_section.getint("MAX_TIMESTEPS")

# board constants
ROWS = _env_section.getint("ROWS")
COLS = _env_section.getint("COLS")
CELLS = _env_section.getint("CELLS")

# socket constants
S_PORT = _env_section.getint("S_PORT")
A_PORT = _env_section.getint("A_PORT")
R_PORT = _env_section.getint("R_PORT")
MAX_MSG_SIZE = _env_section.getint("MAX_MSG_SIZE")

# reward parameters, from the [REWARD] section
_reward_section = config["REWARD"]
WIN_REWARD = _reward_section.getfloat("WIN_REWARD")
CELL_REWARD = _reward_section.getfloat("CELL_REWARD")
VALID_REWARD = _reward_section.getfloat("VALID_REWARD")
TIMEOUT_PENALTY = _reward_section.getfloat("TIMEOUT_PENALTY")
INVALID_PENALTY = _reward_section.getfloat("INVALID_PENALTY")
LOSS_PENALTY = _reward_section.getfloat("LOSS_PENALTY")
TIE_REWARD = _reward_section.getfloat("TIE_REWARD")

# misc
SLEEP_TIME = _env_section.getfloat("SLEEP_TIME")

#### Opponents

In [5]:
# Opponents

# math
import numpy as np


# opponents
class Opponent:
    """
    Base class for the opponents an environment can play against.

    Attributes:
        env: the environment this opponent is attached to, or None while
            no environment has been assigned.
    """

    def __init__(self, _env=None) -> None:
        self.env = _env

    def get_action(self, obs) -> int:
        """
        Choose a move for the given observation.

        The base class has no policy, so subclasses must override this.
        Raising here replaces a silent `None` return, which contradicted the
        declared `int` return type.

        Raises:
            NotImplementedError: always, when called on the base class.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement get_action()"
        )


class ValidRandomOpponent(Opponent):
    """
    Opponent that plays a random move out of the moves the attached
    environment currently lists as valid (`env.validmoves`).
    """

    # construction is inherited unchanged from Opponent

    def get_action(self, obs: np.ndarray) -> int:
        # the observation itself is not consulted; legality comes from the env
        legal_moves = self.env.validmoves
        return np.random.choice(legal_moves)


class CellWinningRandomOpponent(ValidRandomOpponent):
    """
    Wins a cell if possible

    When the observation marks a current cell and the player to move can
    complete a line in it, one of the completing spaces is played (picked at
    random among them). In every other case the move comes from
    ValidRandomOpponent.
    """

    def __init__(self, _env=None) -> None:
        super().__init__(_env)

    def get_cur_cell(self, obs: np.ndarray) -> tuple[bool, int]:
        """
        Determine whether the observation marks a current cell.

        Returns:
            * bool - whether some cell has its current-cell flag (channel 2) set
            * int - the index of the first such cell, or -1 if none
        """
        for outer in range(obs.shape[0]):
            # the flag is read from space 0 of each cell
            if obs[outer, 0, 2] == 1:
                return True, outer
        return False, -1

    def get_winning_cells(self, cell: list[int], turn: int) -> list[bool]:
        """
        Returns a list of bools, representing whether that space
        will win the given cell
        Note: this works as long as the parameters are the correct types

        `cell` is a flat list of ROWS * COLS owner values (0 meaning unclaimed).
        A position is marked True when it is the only unclaimed entry of a
        row, column or diagonal whose other two entries equal `turn`.
        The same routine is used for the spaces of one cell and for the cell
        owners of the whole board.
        """
        cell = np.array(cell).reshape((ROWS, COLS)).tolist()
        ret = [[False for _ in range(COLS)] for _ in range(ROWS)]

        # check horizontals
        for row in range(ROWS):
            if cell[row].count(turn) == 2 and cell[row].count(0) == 1:
                # this row is winnable
                winning_cell = cell[row].index(0)
                ret[row][winning_cell] = True

        # check verticals
        for c in range(COLS):
            col = [cell[i][c] for i in range(ROWS)]
            if col.count(turn) == 2 and col.count(0) == 1:
                # this col is winnable
                winning_row = col.index(0)
                ret[winning_row][c] = True

        # check diagonal left (top-left to bottom-right)
        left_diagonal = [cell[i][i] for i in range(ROWS)]
        if left_diagonal.count(turn) == 2 and left_diagonal.count(0) == 1:
            winning_space = left_diagonal.index(0)
            ret[winning_space][winning_space] = True

        # check diagonal right (top-right to bottom-left)
        right_diagonal = [cell[i][ROWS - 1 - i] for i in range(ROWS)]
        if right_diagonal.count(turn) == 2 and right_diagonal.count(0) == 1:
            winning_space = right_diagonal.index(0)
            ret[winning_space][ROWS - 1 - winning_space] = True

        return np.array(ret).flatten().tolist()

    def get_turn(self, obs: np.ndarray) -> int:
        """
        Return whose turn it is.

        The observation does not carry the turn: channel 3 is filled by
        UltimateTicTacToeEnv._process_state with the valid-move flag, so
        obs[0, 0, 3] only says whether the very first space is playable.
        The turn is therefore taken from the attached environment, which
        records it each time it observes a new state. `obs` is kept in the
        signature for compatibility and is not used.
        """
        return self.env.turn()

    def _get_cellwinning_action(self, cell: int, obs: np.ndarray) -> tuple[bool, int]:
        """
        Get whether there is a cell winning action, and if there is,
        return a random action from those
        Returns:
            * bool - whether there is a cell winning action
            * int - the action number, or -1 if none
        """
        spaces = [obs[cell, i, 0] for i in range(CELLS)]
        winning_spaces = self.get_winning_cells(spaces, self.get_turn(obs))

        # if any of them are cell-winning spaces, choose one of them
        if any(winning_spaces):
            idxs = [i for i in range(CELLS) if winning_spaces[i]]
            # flat action index = cell index * CELLS + space index
            return True, cell * CELLS + np.random.choice(idxs)
        return False, -1

    def get_action(self, obs: np.ndarray) -> int:
        """
        Play a cell-winning space in the current cell when there is one,
        otherwise defer to the random valid move of the parent class.
        """
        cur_cell_exists, cur_cell = self.get_cur_cell(obs)
        if cur_cell_exists:
            valid, action = self._get_cellwinning_action(cur_cell, obs)
            if valid:
                return action
        return super().get_action(obs)


class WinningRandomOpponent(CellWinningRandomOpponent):
    """
    Opponent that takes a game-winning move when it finds one, then tries to
    win any unclaimed cell when it is free to play anywhere, and otherwise
    behaves like CellWinningRandomOpponent.
    """

    # construction is inherited unchanged from CellWinningRandomOpponent

    def _first_winning_space(self, obs: np.ndarray, cell_idx: int, turn: int) -> int:
        """
        Index of the first space that wins `cell_idx` for `turn`, or -1.
        """
        space_owners = [obs[cell_idx, space, 0] for space in range(obs.shape[1])]
        winning_spaces = self.get_winning_cells(space_owners, turn)
        if True in winning_spaces:
            return winning_spaces.index(True)
        return -1

    def get_winning_action(self, obs: np.ndarray) -> tuple[bool, int]:
        """
        Look for a move that wins a cell which in turn completes a line of
        cells on the board.

        Returns:
            * bool - whether such a move was found
            * int - the flat action index, or -1 if none
        """
        turn = self.get_turn(obs)
        cur_cell_exists, cur_cell = self.get_cur_cell(obs)

        # owner of every cell, read from space 0 of that cell
        owners: list[int] = [obs[outer, 0, 1] for outer in range(obs.shape[0])]

        # cells whose capture would complete a line on the board
        winning_cells = self.get_winning_cells(owners, turn)

        if cur_cell_exists:
            # only the current cell may be played in
            candidates = [cur_cell] if winning_cells[cur_cell] else []
        else:
            # we can go anywhere
            candidates = [idx for idx in range(CELLS) if winning_cells[idx]]

        for candidate in candidates:
            space = self._first_winning_space(obs, candidate, turn)
            if space != -1:
                return True, candidate * CELLS + space

        # winning isn't possible currently
        return False, -1

    def get_action(self, obs: np.ndarray) -> int:
        """
        Pick a move by priority: game-winning move, then (only when no
        current cell is set) a move that wins some unclaimed cell, then
        whatever the parent class chooses.
        """
        winnable, action = self.get_winning_action(obs)
        if winnable:
            return action

        cur_cell_exists, _ = self.get_cur_cell(obs)
        if cur_cell_exists:
            return super().get_action(obs)

        # free choice of cell: check if any of the cells can be won
        for cell in range(CELLS):
            already_claimed = obs[cell, 0, 1] != 0
            if already_claimed:
                continue
            valid, action = self._get_cellwinning_action(cell, obs)
            if valid:
                return action

        return super().get_action(obs)
    
class AIOpponent(ValidRandomOpponent):
    """
    Opponent driven by a saved Keras model.

    Plays the model's highest-logit action when the attached environment
    lists it as valid, and a random valid move otherwise.
    """

    def __init__(self, model_name, _env=None) -> None:
        """
        Args:
            model_name: file stem of the model, loaded from
                ./models/<model_name>.keras
            _env: environment to attach to, if one is already available
        """
        super().__init__(_env)
        self.model = keras.models.load_model(f"./models/{model_name}.keras")

    def get_action(self, obs: np.ndarray) -> int:
        """
        Return the model's preferred move, or a random valid move when the
        preferred one is not currently allowed.
        """
        # the model is called on a batch, so add a leading axis of size 1
        batched_obs = obs.reshape(1, *obs.shape)
        # best_action hands back a scalar tensor; convert it once so the
        # membership test below is a plain integer comparison and the
        # returned value matches the declared int return type
        action = int(best_action(batched_obs, self.model))
        if action not in self.env.validmoves:
            return super().get_action(obs)
        return action
        

#### Env

In [6]:
# used to send data
import os
import time
import socket

# proto definitions
import py.board_pb2 as pb

# misc
from typing import Tuple


# env
class UltimateTicTacToeEnv:
    """
    Client-side environment wrapping the external `uttt` game process.

    The constructor launches `./uttt aivai` in the background and connects to
    it over three TCP sockets: one delivering state messages, one carrying
    actions and one delivering the result of each action. A call to step()
    plays the agent's move and, when that move was valid and the game goes
    on, the opponent's reply as well.
    """

    obs_dim = (9, 9, 4)
    n_actions = CELLS * CELLS

    def __init__(
        self,
        opponent: "Opponent | None" = None,
        max_timesteps: int = 81,
        player1: bool = True,
    ) -> None:
        """
        Args:
            opponent: the opponent to play against; its `env` attribute is
                pointed at this environment. When omitted, a new
                ValidRandomOpponent is created for this environment.
            max_timesteps: number of agent moves after which step() reports
                the episode as done
            player1: when True the agent moves first; otherwise reset() lets
                the opponent move before returning the first observation
        """
        # defined first so that cleanup() / __del__ always find them
        self.s_conn, self.a_conn, self.r_conn = None, None, None
        # A default instance written in the signature would be built once and
        # shared by every environment, and the assignment below would keep
        # re-pointing it at the newest one. Build it per environment instead.
        if opponent is None:
            opponent = ValidRandomOpponent()
        opponent.env = self
        self.opponent = opponent
        self.max_timesteps = max_timesteps
        self.player1 = player1
        os.system("./uttt aivai &")
        self._reset_connection_vars()
        self._reset_vars()
        self.reset()

    def _receive(self, conn: socket.socket, tp: type):
        """
        Read one message from `conn` and parse it into a new `tp` instance.

        NOTE(review): a single recv() of at most MAX_MSG_SIZE bytes is assumed
        to contain exactly one complete message - confirm against the server.

        Raises:
            ConnectionError: if recv() returns no data, i.e. the other side
                closed the connection. Parsing the empty payload would yield
                an all-default message and hide the failure.
        """
        b = conn.recv(MAX_MSG_SIZE)
        if not b:
            raise ConnectionError("connection to the uttt process was closed")
        ret = tp()
        ret.ParseFromString(b)
        return ret

    def _get_return(self) -> pb.ReturnMessage:
        """Read the result of the last action from the return connection."""
        return self._receive(self.r_conn, pb.ReturnMessage)

    def _get_state(self) -> pb.StateMessage:
        """Read the next state message from the state connection."""
        return self._receive(self.s_conn, pb.StateMessage)

    def _make_coord(self, idx) -> pb.Coord:
        """Convert a flat index into a (row, col) coordinate on a COLS-wide grid."""
        return pb.Coord(row=idx // COLS, col=idx % COLS)

    def _send_action(self, move) -> None:
        """Serialize `move` into an ActionMessage and send it on the action connection."""
        action = pb.ActionMessage(move=move)
        self.a_conn.send(action.SerializeToString())

    def _to_idx(self, coord: pb.Coord) -> int:
        """Convert a (row, col) coordinate into a flat index (inverse of _make_coord)."""
        return coord.row * COLS + coord.col

    def _to_multi_idx(self, move: pb.Move) -> int:
        """Convert a move into a flat action index: cell index * CELLS + space index."""
        return self._to_idx(move.large) * CELLS + self._to_idx(move.small)

    def _process_state(self, state: pb.StateMessage) -> np.ndarray:
        """
        The structure of the state:
        (9, 9, 4)
        Outer 9 represent board cells
        inner 9 represent the cell spaces
        each space has 4 values:
            space owner (0, 1, 2) representing if the space is claimed or not
            cell owner (0, 1, 2) representing if the cell the space belongs to is claimed or not
            curcellornot (0, 1); 1 if the space belongs to the current cell, 0 if not
            valid (0, 1); 1 if the space is a valid move, 0 if not

        Also updates self.validmoves with the flat indices of the valid moves.
        """
        board_state = np.zeros(self.obs_dim)
        try:
            cur_cell_idx = self._to_idx(state.board.curCell)
            for cell_idx, cell in enumerate(state.board.cells):
                for space_idx, space in enumerate(cell.spaces):
                    board_state[cell_idx, space_idx, 0] = space.val
                    board_state[cell_idx, space_idx, 1] = state.cellowners[cell_idx]
                    board_state[cell_idx, space_idx, 2] = (
                        1 if cur_cell_idx == cell_idx else 0
                    )
        except Exception:
            # show the offending message before propagating the error
            print(state)
            raise

        self.validmoves = [self._to_multi_idx(move) for move in state.validmoves]
        for idx in self.validmoves:
            board_state[idx // CELLS, idx % CELLS, 3] = 1

        return board_state

    def _get_exploration_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """VALID_REWARD when the move was accepted, INVALID_PENALTY otherwise."""
        if msg.valid:
            return VALID_REWARD
        return INVALID_PENALTY

    def _get_win_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Gets the reward for winning if the game was won
        THIS ONLY APPLIES WHEN it is the player turn;
        everything else is passed indirectly

        Sets self.won / self.lost / self.tied as a side effect. Relies on
        self.done having been updated from `msg` already.
        """
        # the turn sent in the return message should still be the caller's turn
        if msg.state.winner == msg.state.turn:
            if self.player_turn:
                self.won = True
                return WIN_REWARD
            else:
                self.lost = True
                return LOSS_PENALTY
        elif self.done:
            # the game ended without the mover winning
            self.tied = True
            return TIE_REWARD
        return 0

    def _get_cell_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Gets the reward for claiming a cell if a cell was claimed

        Returns CELL_REWARD when the player named by msg.state.turn owns more
        cells than in the last recorded snapshot; the snapshot
        (self.prev_cellowners) is refreshed only in that case.
        """
        if self.prev_cellowners == msg.state.cellowners:
            return 0
        elif list(msg.state.cellowners).count(
            msg.state.turn
        ) > self.prev_cellowners.count(msg.state.turn):
            self.prev_cellowners = list(msg.state.cellowners)
            return CELL_REWARD
        return 0

    def _get_timeout_reward(self):
        """TIMEOUT_PENALTY once the timestep counter exceeds max_timesteps, else 0."""
        if self.cur_timestep > self.max_timesteps:
            return TIMEOUT_PENALTY
        return 0

    def _get_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """Sum of the exploration, cell, win and timeout reward components."""
        return (
            self._get_exploration_reward(action, msg)
            + self._get_cell_reward(msg)
            + self._get_win_reward(msg)
            + self._get_timeout_reward()
        )

    def _step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Updates self.done

        Sends one move (for whichever side is acting) and reads its result.

        Returns:
            - observation after the move
            - reward computed for the move
            - done / not done
            - valid / invalid
        """
        # send action and get response
        self._send_action(self.to_move(action))
        ret_message = self._get_return()

        # return information
        self.done = ret_message.state.done
        reward = self._get_reward(action, ret_message)
        if not self.done:
            # game goes on: the next state is read from the state connection
            return self.observe(), reward, self.done, ret_message.valid
        else:
            # game over: the final state is taken from the return message
            return (
                self._process_state(ret_message.state),
                reward,
                self.done,
                ret_message.valid,
            )

    def _take_opponent_turn(self) -> Tuple[np.ndarray, float, bool, bool]:
        """Ask the opponent for moves until one of them is accepted as valid."""
        valid = False
        while not valid:
            obs, reward, done, valid = self._step(
                self.opponent.get_action(self.cur_state)
            )
        return obs, reward, done, valid

    def _reset_vars(self):
        """Reset the per-game bookkeeping."""
        self.prev_cellowners = [pb.NONE] * 9
        self.cur_state = None  # the current state; used for debugging
        self.won = False  # whether or not the player won
        self.done = False  # if the game is over
        self.lost = False  # whether or not the player lost
        self.tied = False  # whether or not the player tied
        self.player_turn = True  # whether or not it is the player's turn
        self.cur_timestep = 0

    def _reset_connection_vars(self):
        """Open the three sockets to the game process and read the first state."""
        self.s_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.a_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.r_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        # wait before connecting; the game process was only just launched
        # NOTE(review): S_PORT / A_PORT / R_PORT and SLEEP_TIME are read from
        # the config, yet the literals below are what is actually used -
        # confirm they agree with train.ini.
        time.sleep(2)
        self.s_conn.connect(("", 8000))
        self.a_conn.connect(("", 8001))
        self.r_conn.connect(("", 8002))

        # drain the state connection
        self.observe()

    # public section
    def observe(self) -> np.ndarray:
        """
        Updates self.cur_state and self._turn

        Blocks until a state message arrives on the state connection.
        """
        state = self._get_state()
        self._turn = state.turn
        self.cur_state = self._process_state(state)
        return self.cur_state

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Updates current timestep

        Plays the agent's move; if it was valid and the episode is not over,
        the opponent moves too and the agent's reward is adjusted by the
        outcome of that reply.

        Returns:
            - next state
            - reward for the action
            - done / not done
            - valid / invalid
        """
        self.player_turn = True
        self.cur_timestep += 1
        obs, reward, done, valid = self._step(action)
        done = done or self.cur_timestep > self.max_timesteps

        # take opponents turn
        if valid and not done:
            self.player_turn = False
            obs, reward2, done, _ = self._take_opponent_turn()

            # add the "lost" penalty
            if self.done:
                if self.lost:
                    return obs, reward + reward2, done, valid
                else:
                    # tie
                    assert self.tied
                    return obs, reward + TIE_REWARD, done, valid
            # penalize losing cells
            elif reward2 == CELL_REWARD + VALID_REWARD:
                return obs, reward - CELL_REWARD * 0.5, done, valid
            # nothing special, just return the reward
            return obs, reward, done, valid
        return obs, reward, done, valid

    def turn(self):
        """Return the turn value of the most recently observed state."""
        return self._turn

    def reset(self) -> np.ndarray:
        """
        Start a new game and return its first observation for the agent.

        If the previous game has not finished, a move built from index -1 is
        sent first (presumably this makes the game process start over -
        NOTE(review): confirm against the server).
        """
        if not self.done:
            # send invalid move
            self._send_action(self.to_move(-1))
        self._reset_vars()
        if self.player1:
            return self.observe()
        else:
            # the opponent opens the game; the agent sees the state after it
            self.observe()
            obs, _, _, _ = self._take_opponent_turn()
            return obs

    def cleanup(self):
        """Close whichever sockets have been opened; safe to call repeatedly."""
        for attr in ("s_conn", "r_conn", "a_conn"):
            # getattr: the attribute may be missing if construction never ran
            conn = getattr(self, attr, None)
            if conn is not None:
                conn.close()

    def __del__(self):
        # NOTE: this terminates every process named uttt, not only the one
        # launched by this environment
        os.system("killall -q uttt")
        self.cleanup()

    def to_move(self, idx: int) -> pb.Move:
        """Convert a flat action index into a Move (cell coordinate, space coordinate)."""
        outer_idx = idx // CELLS
        inner_idx = idx % CELLS

        return pb.Move(
            large=self._make_coord(outer_idx), small=self._make_coord(inner_idx)
        )

In [7]:
# Build the environment: the agent moves first (player1=True) against an
# opponent driven by the saved "attenppo8_p2" model.
# NOTE(review): the output recorded for this cell is a TypeError raised while
# deserializing that model ('keras.src.engine.functional cannot be imported').
env = UltimateTicTacToeEnv(
    max_timesteps=MAX_TIMESTEPS,
    opponent= AIOpponent("attenppo8_p2"), # WinningRandomOpponent(), # AIOpponent("attenppo8"),
    player1=True,
)

TypeError: Could not deserialize class 'Functional' because its parent module keras.src.engine.functional cannot be imported. Full object config: {'module': 'keras.src.engine.functional', 'class_name': 'Functional', 'config': {'name': 'model', 'trainable': True, 'layers': [{'module': 'keras.layers', 'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, 9, 9, 4], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'input_1'}, 'registered_name': None, 'name': 'input_1', 'inbound_nodes': []}, {'module': 'keras.layers', 'class_name': 'Reshape', 'config': {'name': 'reshape', 'trainable': True, 'dtype': 'float32', 'target_shape': [9, 36]}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 9, 4]}, 'name': 'reshape', 'inbound_nodes': [[['input_1', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense', 'trainable': True, 'dtype': 'float32', 'units': 256, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 36]}, 'name': 'dense', 'inbound_nodes': [[['reshape', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_1', 'trainable': True, 'dtype': 'float32', 'units': 256, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 
'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 36]}, 'name': 'dense_1', 'inbound_nodes': [[['reshape', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_2', 'trainable': True, 'dtype': 'float32', 'units': 256, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 36]}, 'name': 'dense_2', 'inbound_nodes': [[['reshape', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Add', 'config': {'name': 'add', 'trainable': True, 'dtype': 'float32'}, 'registered_name': None, 'build_config': {'input_shape': [[None, 9, 256], [None, 9, 256], [None, 9, 256]]}, 'name': 'add', 'inbound_nodes': [[['dense', 0, 0, {}], ['dense_1', 0, 0, {}], ['dense_2', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'LayerNormalization', 'config': {'name': 'layer_normalization', 'trainable': True, 'dtype': 'float32', 'axis': [2], 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 256]}, 'name': 'layer_normalization', 'inbound_nodes': [[['add', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Flatten', 'config': 
{'name': 'flatten', 'trainable': True, 'dtype': 'float32', 'data_format': 'channels_last'}, 'registered_name': None, 'build_config': {'input_shape': [None, 9, 256]}, 'name': 'flatten', 'inbound_nodes': [[['layer_normalization', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_3', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2304]}, 'name': 'dense_3', 'inbound_nodes': [[['flatten', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'AlphaDropout', 'config': {'name': 'alpha_dropout', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'seed': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'alpha_dropout', 'inbound_nodes': [[['dense_3', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_4', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 
'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_4', 'inbound_nodes': [[['alpha_dropout', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_5', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_5', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_6', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_6', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_7', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 
'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_7', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Add', 'config': {'name': 'add_1', 'trainable': True, 'dtype': 'float32'}, 'registered_name': None, 'build_config': {'input_shape': [[None, 2048], [None, 2048], [None, 2048]]}, 'name': 'add_1', 'inbound_nodes': [[['dense_5', 0, 0, {}], ['dense_6', 0, 0, {}], ['dense_7', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'LayerNormalization', 'config': {'name': 'layer_normalization_1', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'layer_normalization_1', 'inbound_nodes': [[['add_1', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_8', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': 
{'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_8', 'inbound_nodes': [[['layer_normalization_1', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'AlphaDropout', 'config': {'name': 'alpha_dropout_1', 'trainable': True, 'dtype': 'float32', 'rate': 0.3, 'seed': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'alpha_dropout_1', 'inbound_nodes': [[['dense_8', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_9', 'trainable': True, 'dtype': 'float32', 'units': 2048, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_9', 'inbound_nodes': [[['alpha_dropout_1', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'AlphaDropout', 'config': {'name': 'alpha_dropout_2', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'seed': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'alpha_dropout_2', 'inbound_nodes': [[['dense_9', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 
'dense_10', 'trainable': True, 'dtype': 'float32', 'units': 1024, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 2048]}, 'name': 'dense_10', 'inbound_nodes': [[['alpha_dropout_2', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_12', 'trainable': True, 'dtype': 'float32', 'units': 512, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1024]}, 'name': 'dense_12', 'inbound_nodes': [[['dense_10', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_14', 'trainable': True, 'dtype': 'float32', 'units': 512, 'activation': 'selu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1024]}, 'name': 
'dense_14', 'inbound_nodes': [[['dense_10', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_11', 'trainable': True, 'dtype': 'float32', 'units': 81, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dense_11', 'inbound_nodes': [[['dense_12', 0, 0, {}]]]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_13', 'trainable': True, 'dtype': 'float32', 'units': 1, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dense_13', 'inbound_nodes': [[['dense_14', 0, 0, {}]]]}], 'input_layers': [['input_1', 0, 0]], 'output_layers': [['dense_11', 0, 0], ['dense_13', 0, 0]]}, 'registered_name': 'Functional', 'build_config': {'input_shape': [None, 9, 9, 4]}, 'compile_config': {'optimizer': {'module': 'keras.optimizers', 'class_name': 'Adam', 'config': {'name': 'Adam', 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'jit_compile': True, 'is_legacy_optimizer': False, 'learning_rate': 
4.999999873689376e-06, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}, 'registered_name': None}, 'loss': None, 'metrics': None, 'loss_weights': None, 'weighted_metrics': None, 'run_eagerly': None, 'steps_per_execution': None, 'jit_compile': None}}

## Buffer

In [None]:
# buffer related hyperparameters, read from the [BUFFER] section of train.ini
_buffer_section = config["BUFFER"]
gamma = _buffer_section.getfloat("GAMMA")
lam = _buffer_section.getfloat("LAM")

In [None]:
def discounted_cumulative_sums(x, discount):
    """Return the discounted cumulative sums of `x` along axis 0.

    Element `i` of the result equals
    `x[i] + discount * x[i + 1] + discount**2 * x[i + 2] + ...`,
    which is the form used for rewards-to-go and advantage estimates.
    """
    # A first-order recursive filter over the time-reversed sequence accumulates
    # the discounted tail; reversing again restores the original ordering.
    time_reversed = x[::-1]
    feedback_coefficients = [1, float(-discount)]
    accumulated = scipy.signal.lfilter(
        [1], feedback_coefficients, time_reversed, axis=0
    )
    return accumulated[::-1]


def special_discounted_cumulative_sums(x, discount, invalid_locats):
    """
    Discounted cumulative sums in which invalid entries are not accumulated.

    Entries flagged in `invalid_locats` contribute nothing to the sums of
    earlier steps and keep their own original value in the result.

    invalid_locats: np.ndarray[bool], True if invalid, False if valid
    """
    # blank out the invalid entries so they do not stack into earlier steps
    masked = np.where(invalid_locats, 0, x)
    # discount the masked sequence back-to-front, then restore the order
    reversed_sums = scipy.signal.lfilter(
        [1], [1, float(-discount)], masked[::-1], axis=0
    )
    discounted = reversed_sums[::-1]
    # invalid positions get their untouched value back, valid ones the discounted sum
    return np.where(invalid_locats, x, discounted)


class Buffer:
    """Fixed-size store for one epoch of agent-environment interaction.

    Steps are appended with `store`, each trajectory is closed with
    `finish_trajectory` (which fills in advantages and returns for the steps
    added since the previous call), and `get` hands back the whole buffer with
    normalized advantages and rewinds the write position.
    """

    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        """Allocate zero-filled storage for `size` steps.

        Arguments:
            * observation_dimensions: shape of a single observation
            * size: number of steps the buffer can hold
            * gamma: discount factor for the returns
            * lam: multiplied with gamma to discount the advantage estimates
        """
        self.observation_buffer = np.zeros(
            (size, *observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        # one boolean per action and step: True where the action was valid
        self.valid_action_buffer = np.zeros(
            (size, UltimateTicTacToeEnv.n_actions), dtype=np.bool_
        )
        self.gamma, self.lam = gamma, lam
        # pointer: next free row; trajectory_start_index: first row of the open trajectory
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability, valid_actions):
        """Append one step of agent-environment interaction at the write position."""
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.valid_action_buffer[self.pointer] = valid_actions
        self.pointer += 1

    def finish_trajectory(self, last_value=0, use_special: bool = False):
        """Close the open trajectory by computing advantage estimates and rewards-to-go.

        Arguments:
            * last_value: bootstrap value appended after the final step
              (0 by default)
            * use_special: when True, steps whose stored reward equals
              INVALID_PENALTY are excluded from the accumulation via
              `special_discounted_cumulative_sums`
        """
        if self.pointer == self.trajectory_start_index:
            # nothing was stored since the last call, so there is nothing to compute
            return

        path_slice = slice(self.trajectory_start_index, self.pointer)
        step_rewards = self.reward_buffer[path_slice]
        rewards = np.append(step_rewards, last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        # one-step temporal-difference residuals
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        if use_special:
            # Rewards live in a float32 buffer, so the penalty has to be rounded
            # to float32 as well; comparing against the raw Python float never
            # matches when the penalty is not exactly representable in float32.
            invalid_steps = step_rewards == np.float32(INVALID_PENALTY)
            self.advantage_buffer[path_slice] = special_discounted_cumulative_sums(
                deltas, self.gamma * self.lam, invalid_steps
            )
            # the appended bootstrap value is never an invalid move
            self.return_buffer[path_slice] = special_discounted_cumulative_sums(
                rewards, self.gamma, np.append(invalid_steps, False)
            )[:-1]
        else:
            self.advantage_buffer[path_slice] = discounted_cumulative_sums(
                deltas, self.gamma * self.lam
            )
            self.return_buffer[path_slice] = discounted_cumulative_sums(
                rewards, self.gamma
            )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        """Return all buffered data with normalized advantages and rewind the buffer.

        Returns:
            (observations, actions, advantages, returns, logprobabilities,
            valid_actions)
        """
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean = np.mean(self.advantage_buffer)
        advantage_std = np.std(self.advantage_buffer)
        if not advantage_std > 0:
            # all advantages are identical: only centre them, otherwise 0 / 0
            # would fill the whole batch with NaN
            advantage_std = 1.0
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
            self.valid_action_buffer,
        )

## Loss

In [None]:
# loss related hyperparameters, read from the [LOSS] section of the parsed config
# clip_ratio: half-width of the interval the probability ratio is clipped to in train_mod
clip_ratio = config["LOSS"].getfloat("CLIP_RATIO")
# target_kl: train_model stops an update early once the measured KL exceeds 1.5x this value
target_kl = config["LOSS"].getfloat("TARGET_KL")
# the remaining values are the weights of the individual terms in train_mod's total loss
clip_coef = config["LOSS"].getfloat("CLIP_COEF")
v_coef = config["LOSS"].getfloat("VALUE_COEFFICIENT")
entropy_coef = config["LOSS"].getfloat("ENTROPY_COEFFICIENT")
invalid_coef = config["LOSS"].getfloat("INVALID_COEFFICIENT")
reg_coef = config["LOSS"].getfloat("REGULARIZER_COEFFICIENT")

In [None]:
@tf.function
def logprobabilities(logits, a):
    """Log-probability of each chosen action `a` under the policy given by `logits`.

    `logits` is the actor output; the result has one entry per row of `logits`.
    """
    # one-hot mask that keeps only the column of the chosen action in every row
    chosen_action_mask = tf.one_hot(a, UltimateTicTacToeEnv.n_actions)
    log_policy = tf.nn.log_softmax(logits)
    return tf.reduce_sum(chosen_action_mask * log_policy, axis=1)


@tf.function(reduce_retracing=True)
def vector_slice(A: tf.Tensor, B: tf.Tensor):
    """Pick one element per row of A: row i contributes the value at column B[i].

    A is a 2D Tensor with shape [None, D];
    B is a 1D Tensor with shape [None] holding int32 column indices in [0, D).

    Example:
      A =[[1,2], B = [0,1], vector_slice(A,B) -> [1,4]
          [3,4]]

    Credit:
        https://stackoverflow.com/questions/38492608/tensorflow-indexing-into-2d-tensor-with-1d-tensor
    """
    dims = tf.shape(A)
    num_rows, num_cols = dims[0], dims[1]
    # position of the first element of every row once A is flattened
    row_offsets = num_cols * tf.range(0, num_rows)
    flattened = tf.reshape(A, [-1])
    return tf.gather(flattened, B + row_offsets)


# KL-divergence loss object; train_mod_on_probs uses it as the actor loss
# between the desired action probabilities and the model's softmax output
kl_loss = tf.keras.losses.KLDivergence()


@tf.function
def train_mod_on_probs(
    observation_buffer, desired_probs, return_buffer, model: keras.Model
):
    """
    One supervised (pretraining) gradient step.

    The actor is pulled towards `desired_probs` with a KL-divergence loss and
    the critic towards `return_buffer` with a mean-squared-error loss; the
    model's own regularization losses are added on top.
    Arguments:
        * observation_buffer: batch of observations fed to the model
        * desired_probs: target action probabilities for each observation
        * return_buffer: target values for the critic
        * model: the actor-critic model; its compiled optimizer applies the update
    Returns:
        (actor_loss, critic_loss, regularizer_loss, total loss)
    """
    with tf.GradientTape() as tape:
        logits, values = model(observation_buffer, training=True)

        # actor loss = KL divergence between the targets and the predicted distribution
        predicted_probs = tf.nn.softmax(logits)
        actor_loss = kl_loss(desired_probs, predicted_probs)

        regularizer_loss = tf.reduce_sum(model.losses)

        # critic loss = MSE between the target returns and the predicted values
        critic_loss = tf.keras.losses.mse(tf.squeeze(return_buffer), tf.squeeze(values))

        loss = actor_loss + critic_loss + regularizer_loss

    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    model.optimizer.apply_gradients(zip(gradients, trainable))
    return actor_loss, critic_loss, regularizer_loss, loss


# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_mod(
    observation_buffer,
    action_buffer,
    logprobability_buffer,
    advantage_buffer,
    return_buffer,
    valid_moves_buffer,
    model: keras.Model,
):
    """
    One PPO gradient step on a minibatch.

    The total loss is a weighted sum of the clipped surrogate loss, the critic
    MSE, an entropy term, the model's regularization losses and a penalty on
    the probability mass assigned to invalid moves.
    Arguments:
        * observation_buffer: batch of observations
        * action_buffer: actions that were taken
        * logprobability_buffer: log-probabilities of those actions when they were sampled
        * advantage_buffer: advantage estimates
        * return_buffer: targets for the critic
        * valid_moves_buffer: boolean mask, True where an action was valid
        * model: the actor-critic model; its compiled optimizer applies the update
    Returns:
        (kl, clip_loss, critic_loss, invalid_loss, entropy_loss, regularizer_loss)
        where kl is measured after the update
    """
    with tf.GradientTape() as tape:
        logits, values = model(observation_buffer, training=True)
        action_probs = tf.nn.softmax(logits)

        # ratio = new_prob / old_prob, computed in log space
        new_logprobs = logprobabilities(logits, action_buffer)
        ratio = tf.exp(new_logprobs - logprobability_buffer)

        # L_clip = E_t * ( min( r_t*A_t, clip(r_t, 1-e, 1+e)*A_t ) )
        unclipped_objective = ratio * advantage_buffer
        clipped_objective = (
            tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio) * advantage_buffer
        )
        clip_loss = -tf.reduce_mean(tf.minimum(unclipped_objective, clipped_objective))

        regularizer_loss = tf.reduce_sum(model.losses)

        # critic loss = MSE
        critic_loss = tf.keras.losses.mse(return_buffer, tf.squeeze(values))

        # entropy loss to encourage exploration
        entropy_loss = -tf.reduce_mean(-new_logprobs)

        # penalize invalids: total probability placed on moves that were not valid
        invalid_probs = tf.where(
            valid_moves_buffer,
            tf.constant(0, dtype=tf.float32),  # valid moves are not penalized
            action_probs,  # invalid moves are penalized by their probability
        )
        invalid_loss = tf.reduce_mean(tf.reduce_sum(invalid_probs, axis=-1))

        # full loss
        loss = (
            clip_loss * clip_coef
            + critic_loss * v_coef
            + entropy_loss * entropy_coef
            + regularizer_loss * reg_coef
            + invalid_loss * invalid_coef
        )

    # apply gradients
    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    model.optimizer.apply_gradients(zip(gradients, trainable))

    # check new kl divergence with the updated weights
    updated_logits, _ = model(observation_buffer, training=False)
    updated_logprobs = logprobabilities(updated_logits, action_buffer)
    kl = tf.reduce_mean(logprobability_buffer - updated_logprobs)
    kl = tf.reduce_sum(kl)

    return kl, clip_loss, critic_loss, invalid_loss, entropy_loss, regularizer_loss

## Model

In [None]:
# model_name selects both the saved model file (models/<model_name>.keras)
# and the summary log directory (./logs/<model_name>)
model_name = "attenppo8"
# when True, an existing saved model is loaded instead of building a new one
load_model = True

In [None]:
def create_shared_layers(input_layer):
    """Build the trunk shared by the policy and value heads.

    Takes the model's input tensor and returns the output tensor of the last
    shared Dense layer.
    """
    # imitation attention 1: three parallel projections of the input, summed
    narrow_branches = [
        layers.Dense(32, activation="selu", use_bias=True)(input_layer)
        for _ in range(3)
    ]
    trunk = layers.Add()(narrow_branches)
    trunk = layers.LayerNormalization()(trunk)

    # feedforward 1:
    trunk = layers.Flatten()(trunk)
    trunk = layers.Dense(2592, activation="selu", activity_regularizer="l2")(trunk)
    trunk = layers.AlphaDropout(0.2)(trunk)
    trunk = layers.Dense(2592, activation="selu", activity_regularizer="l2")(trunk)

    # imitation attention 2: same pattern on the flattened features
    wide_branches = [
        layers.Dense(
            3000, activation="selu", use_bias=True, activity_regularizer="l2"
        )(trunk)
        for _ in range(3)
    ]
    trunk = layers.Add()(wide_branches)
    trunk = layers.LayerNormalization()(trunk)

    # feedforward 2
    trunk = layers.Dense(2048, activation="selu", activity_regularizer="l2")(trunk)
    trunk = layers.AlphaDropout(0.2)(trunk)
    return layers.Dense(1024, activation="selu")(trunk)


def create_model():
    """Create the actor-critic model.

    The shared trunk feeds two heads, each with its own 512-unit hidden layer:
    one producing a logit per action and one producing a single value.
    """
    # model inputs
    inputs = tf.keras.Input(shape=UltimateTicTacToeEnv.obs_dim)
    shared = create_shared_layers(inputs)

    # Each output layer is constructed before its hidden layer so that the
    # layers are created in the same order as before.
    policy_output = layers.Dense(UltimateTicTacToeEnv.n_actions)
    policy_hidden = layers.Dense(512, activation="selu")(shared)
    logits = policy_output(policy_hidden)

    value_output = tf.keras.layers.Dense(1)
    value_hidden = layers.Dense(512, activation="selu")(shared)
    values = value_output(value_hidden)

    return tf.keras.Model(inputs=inputs, outputs=(logits, values))


# Resume from the saved model when loading is enabled and the file exists;
# otherwise build a fresh, untrained network with create_model().
if load_model and os.path.exists(f"models/{model_name}.keras"):
    print("loading model...")
    model = tf.keras.models.load_model(f"models/{model_name}.keras")
else:
    print("creating model...")
    model = create_model()

loading model...


2023-08-17 21:23:01.618953: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-17 21:23:01.619465: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-17 21:23:01.619641: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [None]:
# print the layer-by-layer architecture of the loaded or newly created model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9, 9, 4)]    0           []                               
                                                                                                  
 reshape (Reshape)              (None, 9, 36)        0           ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 9, 256)       9472        ['reshape[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 9, 256)       9472        ['reshape[0][0]']                
                                                                                              

## Optimizers

In [None]:
# learning rate hyperparams, read from the [OPTIMIZER] section of the parsed config
learning_rate = config["OPTIMIZER"].getfloat("LEARNING_RATE")

In [None]:
# Same condition as the cell that loads or creates the model.
if load_model and os.path.exists(f"models/{model_name}.keras"):
    # loaded model: reuse the optimizer attached to it, but override its
    # learning rate with the configured value
    optim: tf.keras.optimizers.Adam = model.optimizer
    optim.learning_rate = learning_rate
else:
    # fresh model: create Adam and attach it via compile; no loss is passed
    # because the custom train steps compute their losses themselves
    optim = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optim)

In [None]:
# show the learning rate the optimizer is actually using
print(optim.learning_rate)

<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=5e-06>


## Train

In [None]:
# training time hyperparameters, read from the [TRAIN] section of the parsed config
# train_iterations: maximum number of passes over the buffer per epoch in train_model
train_iterations = config["TRAIN"].getint("TRAIN_ITERATIONS")
# epochs: number of epochs a single train_model call runs
epochs = config["TRAIN"].getint("EPOCHS")
# minibatch_size: number of steps per train_mod call
minibatch_size = config["TRAIN"].getint("MINIBATCH_SIZE")
# steps_per_epoch: environment steps collected per epoch (also the Buffer size)
steps_per_epoch = config["TRAIN"].getint("STEPS_PER_EPOCH")

In [None]:
# epoch index to start from; passed to train_model as epoch_start and used as the summary step
epoch = config["TRAIN"].getint("EPOCH")
# summaries for this model are written under ./logs/<model_name>
summary_writer = tf.summary.create_file_writer(f"./logs/{model_name}")

In [None]:
# rollout buffer holding exactly one epoch (steps_per_epoch steps) of experience
buffer = Buffer(UltimateTicTacToeEnv.obs_dim, steps_per_epoch, gamma=gamma, lam=lam)

#### Pretrain

In [None]:
# pretraining switches and sizes
pretrain = False  # when False, the RL training cell runs instead (`if not pretrain`)
load_timesteps = True  # load previously saved pretraining arrays instead of allocating empty ones
pretrain_timesteps = 600000  # number of environment steps in the pretraining dataset
pretrain_batch_size = 10000  # minibatch size used by pretrain_model

if pretrain:
    if load_timesteps:
        # files with these names are written by collect_pretrain_trajectories via np.save
        pretrain_observations = np.load("pretrain_observations.npy")
        pretrain_desired_probs = np.load("pretrain_desired_probs.npy")
        pretrain_rewards = np.load("pretrain_rewards.npy")
    else:
        # zero-filled arrays that collect_pretrain_trajectories fills in
        pretrain_observations = np.zeros((pretrain_timesteps, *env.obs_dim))
        pretrain_desired_probs = np.zeros((pretrain_timesteps, env.n_actions))
        pretrain_rewards = np.zeros((pretrain_timesteps, 1))
else:
    # not pretraining: allocate only one epoch's worth of rows; train_model
    # writes into pretrain_desired_probs when just_train_critic is set
    pretrain_observations = np.zeros((steps_per_epoch, *env.obs_dim))
    pretrain_desired_probs = np.zeros((steps_per_epoch, env.n_actions))
    pretrain_rewards = np.zeros((steps_per_epoch, 1))

In [None]:
import sklearn.utils as utils


def collect_pretrain_trajectories():
    """Fill the pretraining arrays by letting a scripted opponent play the environment.

    For `pretrain_timesteps` steps the moves of a `WinningRandomOpponent` are
    recorded. Afterwards:
        * pretrain_observations[t] holds the observation the move was chosen from
        * pretrain_rewards[t] holds the discounted reward-to-go of that step
        * pretrain_desired_probs[t] holds the empirical distribution of the
          moves that were chosen from that observation
    The three arrays are also written to disk with np.save.

    Raises:
        Exception: if the opponent ever plays a move the environment reports as invalid
    """

    def preprocess_obs_action_use_map():
        """Turn the per-observation action counts into probabilities and store them."""
        global pretrain_observations, pretrain_desired_probs
        # initialize obs:probs map and calculate probs
        observations_probs_map = {}
        for hashable_obs, action_use_map in observations_actions_uses.items():
            # initialize the probability vector
            probs = np.zeros((env.n_actions))

            # calculate the probability of each action
            total_uses = sum(action_use_map.values())
            for action, uses in action_use_map.items():
                probs[action] = uses / total_uses

            assert round(sum(probs), 4) == 1.0
            observations_probs_map[hashable_obs] = probs

        # put probs into pretrain_desired_probs
        for i in range(pretrain_timesteps):
            pretrain_desired_probs[i] = observations_probs_map[
                pretrain_observations[i].tobytes()
            ]

    t = 0
    num_valid = 0
    start_timestep = 0
    start_time = datetime.now()
    opponent = WinningRandomOpponent(env.n_actions)

    # map observations -> map of actions -> uses
    observations_actions_uses = {}
    while t < pretrain_timesteps:
        # Iterate over the steps of each epoch
        observation = env.reset()
        start_timestep = t
        while not env.done and t < pretrain_timesteps:
            # get the action
            action = opponent.get_action(observation)

            # step
            observation_new, reward, done, valid = env.step(action)
            if valid:
                num_valid += 1
            else:
                print("action was", action)
                print(observation)
                raise Exception("failed")

            # update the pretrain buffers
            pretrain_observations[t] = observation
            pretrain_rewards[t] = reward

            # Build the map key from the stored row rather than from the
            # environment's array: the later lookup hashes the stored rows, so
            # the bytes (and thus the dtype) must be the same on both sides.
            hashable = pretrain_observations[t].tobytes()
            action_uses = observations_actions_uses.setdefault(hashable, {})
            action_uses[action] = action_uses.get(action, 0) + 1

            # Update the observation
            observation = observation_new

            # Finish trajectory if reached to a terminal state
            t += 1
            if done:
                observation = env.reset()

                # do the discounting
                pretrain_rewards[start_timestep:t] = discounted_cumulative_sums(
                    pretrain_rewards[start_timestep:t], gamma
                )
                start_timestep = t

            # log
            print(
                f"Step {t} / {pretrain_timesteps};\t"
                + f"% valid = {num_valid / (t):.4f};\t"
                + f"fps: {(t)/(datetime.now()-start_time).total_seconds():.2f};\t",
                end="\r",
            )
    print()

    # The last trajectory may have been cut off by the timestep budget before
    # it ended; discount it as well so every row holds a reward-to-go.
    if start_timestep < t:
        pretrain_rewards[start_timestep:t] = discounted_cumulative_sums(
            pretrain_rewards[start_timestep:t], gamma
        )

    preprocess_obs_action_use_map()

    np.save("pretrain_observations", pretrain_observations)
    np.save("pretrain_desired_probs", pretrain_desired_probs)
    np.save("pretrain_rewards", pretrain_rewards)


def pretrain_model(
    model: keras.Model, num_epochs=15, collect_trajectories: bool = True
):
    """Pretrain `model` on the scripted-opponent dataset.

    Arguments:
        * model: keras.Model - the model to pretrain
        * num_epochs: number of passes over the pretraining arrays
        * collect_trajectories: bool - whether to (re)collect the dataset first
          with collect_pretrain_trajectories
    Raises:
        ValueError: if pretrain_timesteps is smaller than pretrain_batch_size,
            so that no minibatch could be formed
    """
    global pretrain_rewards, pretrain_desired_probs, pretrain_observations
    if collect_trajectories:
        collect_pretrain_trajectories()

    num_batches = pretrain_timesteps // pretrain_batch_size
    if num_epochs > 0 and num_batches == 0:
        raise ValueError(
            "pretrain_timesteps must be at least pretrain_batch_size to form a minibatch"
        )

    for epoch in range(1, 1 + num_epochs):
        # reshuffle all three arrays consistently before every pass
        pretrain_observations, pretrain_desired_probs, pretrain_rewards = utils.shuffle(
            pretrain_observations, pretrain_desired_probs, pretrain_rewards
        )
        actor_loss, critic_loss, regularizer_loss, loss = 0, 0, 0, 0
        # do minibatches
        for batch in range(num_batches):
            start_idx = batch * pretrain_batch_size
            end_idx = start_idx + pretrain_batch_size
            _actor_loss, _critic_loss, _regularizer_loss, _loss = train_mod_on_probs(
                pretrain_observations[start_idx:end_idx],
                pretrain_desired_probs[start_idx:end_idx],
                pretrain_rewards[start_idx:end_idx],
                model,
            )
            actor_loss += _actor_loss
            critic_loss += _critic_loss
            regularizer_loss += _regularizer_loss
            loss += _loss

        # average over the minibatches of this pass
        actor_loss /= num_batches
        critic_loss /= num_batches
        regularizer_loss /= num_batches
        loss /= num_batches

        # write summaries
        with summary_writer.as_default():
            tf.summary.scalar("pretrain/actor loss", actor_loss, step=epoch)
            tf.summary.scalar("pretrain/critic loss", critic_loss, step=epoch)
            tf.summary.scalar("pretrain/regularizer loss", regularizer_loss, step=epoch)
            tf.summary.scalar("pretrain/loss", loss, step=epoch)

        print(
            f"epoch {epoch}: actor loss: {actor_loss}; critic loss: {critic_loss}; regularizer_loss: {regularizer_loss}, total loss {loss}"
        )

In [None]:
# reset the environment; the returned initial observation is displayed as the cell output
env.reset()

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 1., 1.],
        [0., 0., 1., 1.],
    

#### Train on real observations

In [None]:
# main training function
def train_model(
    model: keras.Model,
    epoch_start: int,
    use_kl: bool = True,
    use_adaptive_kl: bool = True,
    shuffle: bool = True,
    use_special: bool = True,
    just_train_critic: bool = False,
) -> int:
    """
    Train the model for `epochs` epochs

    Every epoch collects `steps_per_epoch` environment steps with the current
    policy into `buffer`, then runs up to `train_iterations` passes of
    minibatch updates with `train_mod` and logs the results.
    Arguments:
        * model: keras.Model - the model to train
        * epoch_start: int - the epoch to start logging at
        * use_kl: bool - whether or not to stop early if kl divergence > target_kl
        * use_adaptive_kl: bool - whether or not to adapt kl (increase it if divergence > target_kl, decrease it otherwise)
        * shuffle: bool - whether or not to shuffle the observations
        * use_special: bool -whether or not to use special version of discounted sums
        * just_train_critic: bool - whether or not to just train the critic; defaults to False
    Returns:
        the epoch index to continue from (epoch_start + number of epochs run)
    Raises:
        ValueError: in normal (non critic-only) mode, if no minibatch can be
            formed or train_iterations is smaller than 1
    """
    global target_kl, pretrain_desired_probs, WIN_REWARD

    # Fail fast on configurations that would otherwise only blow up after a
    # whole rollout has been collected (the averages below divide by the
    # number of minibatches / iterations that ran).
    num_minibatches = 0 if just_train_critic else steps_per_epoch // minibatch_size
    if not just_train_critic and (num_minibatches < 1 or train_iterations < 1):
        raise ValueError(
            "train_model needs steps_per_epoch >= minibatch_size and train_iterations >= 1"
        )

    # Iterate over the number of epochs
    for epoch in range(epoch_start, epoch_start + epochs):
        # Initialize the sum of the returns, lengths and number of episodes for each epoch
        sum_return = 0
        num_episodes = 0
        episode_return = 0
        episode_length = 0
        lengths = []

        # logging variables
        num_valid = 0
        num_wins = 0
        num_losses = 0
        num_ties = 0

        # Iterate over the steps of each epoch
        observation = env.reset()
        start_time = datetime.now()
        for t in range(steps_per_epoch):
            # boolean mask over all actions: True where the move is currently valid
            valid_moves = env.validmoves
            valid_actions = np.array([i in valid_moves for i in range(env.n_actions)])

            # Get the logits, action, and take one step in the environment
            observation = observation.reshape(1, *env.obs_dim)

            logits, action, value_t = sample_action_value(observation, model)

            observation_new, reward, done, valid = env.step(action[0].numpy())
            episode_return += reward
            episode_length += 1

            # logging variables
            num_valid += 1 if valid else 0
            num_wins += 1 if env.won else 0
            num_losses += 1 if env.lost else 0
            num_ties += 1 if env.tied else 0

            # Get the value and log-probability of the action
            logprobability_t = logprobabilities(logits, action)

            if just_train_critic:
                # remember the current policy so the critic-only update keeps it fixed
                pretrain_desired_probs[t] = tf.nn.softmax(logits)

            # Store obs, act, rew, v_t, logp_pi_t
            buffer.store(
                observation, action, reward, value_t, logprobability_t, valid_actions
            )

            # Update the observation
            observation = observation_new

            # Finish trajectory if reached to a terminal state
            if done or t == steps_per_epoch - 1:
                # bootstrap with the critic's estimate when the episode was cut off
                last_value = (
                    0
                    if done
                    else model(observation.reshape(1, *env.obs_dim), training=False)[1]
                )
                buffer.finish_trajectory(last_value, use_special=use_special)
                sum_return += episode_return
                lengths.append(episode_length)
                num_episodes += 1
                observation, episode_return, episode_length = env.reset(), 0, 0

            # log
            print(
                f"Step {t+1} / {steps_per_epoch};\t"
                + f"% valid = {num_valid / (t+1):.4f};\t"
                + f"fps: {(t+1)/(datetime.now()-start_time).total_seconds():.2f};\t"
                + f"win rate: {(num_wins/num_episodes if num_episodes > 0 else 0):.2f}\t"
                + f"tie rate: {(num_ties/num_episodes if num_episodes > 0 else 0):.2f}\t"
                + f"loss rate: {(num_losses/num_episodes if num_episodes > 0 else 0):.2f}",
                end="\r",
            )
        print()

        (
            observation_buffer,
            action_buffer,
            advantage_buffer,
            return_buffer,
            logprobability_buffer,
            valid_action_buffer,
        ) = buffer.get()

        if just_train_critic:
            # critic-only mode: imitate the recorded policy, fit the returns,
            # and skip the PPO update, logging and checkpointing below
            for it in range(train_iterations):
                train_mod_on_probs(
                    observation_buffer,
                    pretrain_desired_probs[:steps_per_epoch],
                    return_buffer,
                    model,
                )
            continue

        # Update the policy and implement early stopping using KL divergence
        kl = 0
        clip_loss = 0
        critic_loss = 0
        invalid_loss = 0
        entropy_loss = 0
        regularizer_loss = 0
        for it in range(train_iterations):
            epoch_data = (
                observation_buffer,
                action_buffer,
                advantage_buffer,
                return_buffer,
                logprobability_buffer,
                valid_action_buffer,
            )
            if shuffle:
                # shuffle all arrays with the same permutation
                epoch_data = utils.shuffle(*epoch_data)
            (
                _observation_buffer,
                _action_buffer,
                _advantage_buffer,
                _return_buffer,
                _logprobability_buffer,
                _valid_action_buffer,
            ) = epoch_data

            temp_clip_loss = 0
            temp_critic_loss = 0
            temp_invalid_loss = 0
            temp_entropy_loss = 0
            temp_regularizer_loss = 0
            stopped_early = False
            # do minibatches
            for i in range(num_minibatches):
                batch = slice(i * minibatch_size, (i + 1) * minibatch_size)
                # get the parts of the kl and losses
                (
                    kl,
                    clip_loss_part,
                    critic_loss_part,
                    invalid_loss_part,
                    entropy_loss_part,
                    regularizer_loss_part,
                ) = train_mod(
                    tf.constant(_observation_buffer[batch]),  # obs
                    tf.constant(_action_buffer[batch]),  # act
                    tf.constant(_logprobability_buffer[batch]),  # logprobs
                    tf.constant(_advantage_buffer[batch]),  # advantages
                    tf.constant(_return_buffer[batch]),  # returns
                    tf.constant(_valid_action_buffer[batch]),  # valid actions
                    model,
                )
                # update the temps
                temp_clip_loss += clip_loss_part
                temp_critic_loss += critic_loss_part
                temp_invalid_loss += invalid_loss_part
                temp_entropy_loss += entropy_loss_part
                temp_regularizer_loss += regularizer_loss_part
                if use_kl and kl > 1.5 * target_kl:
                    stopped_early = True
                    if use_adaptive_kl:
                        target_kl *= 1.5
                    break

            # average the temps over the minibatches that actually ran in this pass
            batches_run = i + 1
            temp_clip_loss /= batches_run
            temp_critic_loss /= batches_run
            temp_invalid_loss /= batches_run
            temp_entropy_loss /= batches_run
            temp_regularizer_loss /= batches_run

            # update the main counts
            clip_loss += temp_clip_loss
            critic_loss += temp_critic_loss
            invalid_loss += temp_invalid_loss
            entropy_loss += temp_entropy_loss
            regularizer_loss += temp_regularizer_loss

            if stopped_early:
                # Early Stopping
                break
        else:
            # every pass completed without exceeding the KL threshold
            if use_kl and use_adaptive_kl:
                target_kl /= 1.2

        # average over the passes that actually ran
        iterations_run = it + 1
        clip_loss /= iterations_run
        critic_loss /= iterations_run
        invalid_loss /= iterations_run
        entropy_loss /= iterations_run
        regularizer_loss /= iterations_run

        # Print mean return and length for each epoch
        print(
            f"Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum(lengths) / num_episodes}. STD Length: {np.std(lengths)}"
        )
        print("=" * 64)

        # log scalars
        with summary_writer.as_default():
            # episode info
            tf.summary.scalar("episode/win rate", num_wins / num_episodes, step=epoch)
            tf.summary.scalar("episode/tie rate", num_ties / num_episodes, step=epoch)
            tf.summary.scalar(
                "episode/loss rate", num_losses / num_episodes, step=epoch
            )
            # divide by the number of steps taken (the loop index alone is one short)
            tf.summary.scalar(
                "episode/valid percentage", num_valid / steps_per_epoch, step=epoch
            )
            tf.summary.scalar(
                "episode/mean reward", sum_return / num_episodes, step=epoch
            )
            tf.summary.scalar(
                "episode/mean length", sum(lengths) / num_episodes, step=epoch
            )
            tf.summary.scalar("episode/std length", np.std(lengths), step=epoch)

            # training info
            tf.summary.scalar("train/clip_loss", clip_loss, step=epoch)
            tf.summary.scalar("train/critic_loss", critic_loss, step=epoch)
            tf.summary.scalar("train/invalid_loss", invalid_loss, step=epoch)
            tf.summary.scalar("train/entropy_loss", entropy_loss, step=epoch)
            tf.summary.scalar("train/regularizer_loss", regularizer_loss, step=epoch)
            # kl of the last minibatch that was processed
            tf.summary.scalar("train/kl", kl, step=epoch)
            # same weighting as the loss optimized in train_mod
            tf.summary.scalar(
                "train/total_loss",
                clip_loss * clip_coef
                + critic_loss * v_coef
                + invalid_loss * invalid_coef
                + entropy_loss * entropy_coef
                + regularizer_loss * reg_coef,
                step=epoch,
            )
            tf.summary.scalar("train/num_iterations", iterations_run, step=epoch)

        # save every 5
        if epoch % 5 == 0:
            model.save("model_temp.keras")

        # slowly raise the module-level win reward after every epoch
        WIN_REWARD += 0.1 / 240

    # equals "last epoch + 1" and stays defined when no epoch ran
    return epoch_start + max(epochs, 0)

In [None]:
# Run PPO training only when pretraining is switched off; the returned value
# replaces `epoch` so a later call continues from where this one stopped.
if not pretrain:
    epoch = train_model(
        model,
        epoch,
        use_kl=True,  # stop an update early when the KL threshold is exceeded
        use_adaptive_kl=False,  # keep target_kl fixed
        shuffle=True,
        use_special=False,  # plain discounted sums in Buffer.finish_trajectory
        just_train_critic=False,
    )

2023-08-17 21:23:04.312953: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-17 21:23:04.333480: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600


Step 2560 / 2560;	% valid = 0.9723;	fps: 176.24;	win rate: 0.39	tie rate: 0.20	loss rate: 0.38


2023-08-17 21:23:20.467192: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fca6cfd2750 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-17 21:23:20.467240: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2023-08-17 21:23:20.476333: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-17 21:23:20.632821: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch: 2292. Mean Return: 0.03677083333333338. Mean Length: 26.666666666666668. STD Length: 5.3864542037307706
Step 2560 / 2560;	% valid = 0.9852;	fps: 173.41;	win rate: 0.41	tie rate: 0.23	loss rate: 0.33
Epoch: 2293. Mean Return: 0.4299609375000002. Mean Length: 26.666666666666668. STD Length: 4.799450199993977
Step 2560 / 2560;	% valid = 0.9680;	fps: 179.54;	win rate: 0.38	tie rate: 0.26	loss rate: 0.30
Epoch: 2294. Mean Return: -0.0720199275362318. Mean Length: 27.82608695652174. STD Length: 4.986086501569565
Step 2560 / 2560;	% valid = 0.9648;	fps: 177.14;	win rate: 0.33	tie rate: 0.21	loss rate: 0.40
Epoch: 2295. Mean Return: -0.23911842105263176. Mean Length: 26.94736842105263. STD Length: 5.892668810171771
Step 2560 / 2560;	% valid = 0.9641;	fps: 194.31;	win rate: 0.46	tie rate: 0.14	loss rate: 0.34
Epoch: 2296. Mean Return: -0.1560992907801418. Mean Length: 27.23404255319149. STD Length: 5.787781576101271
Step 2560 / 2560;	% valid = 0.9906;	fps: 190.01;	win rate: 0.41	tie rate

KeyboardInterrupt: 

## Save

In [None]:
# remove previous (pure Python instead of shelling out to `rm`, so it works on
# any platform and does not pass the model name through a shell)
model_path = f"models/{model_name}.keras"
os.makedirs("models", exist_ok=True)  # a freshly created model may not have the folder yet
if os.path.exists(model_path):
    os.remove(model_path)

# save
model.save(model_path)

## Evaluate

In [None]:
def evaluate_model(model: keras.Model, num_episodes: int = 10):
    """Play `num_episodes` episodes with the greedy policy and print summary statistics.

    Actions come from `best_action_value`. Printed are the mean reward per
    episode, the mean episode length, and the win and loss rates.
    """
    total_reward = 0
    total_steps = 0
    wins = 0
    losses = 0
    for _ in range(num_episodes):
        # start a fresh episode
        obs = env.reset()
        done = False
        steps = 0

        # play until the environment reports the episode as finished
        while not done:
            batched_obs = obs.reshape(1, *obs.shape)
            _, action, _ = best_action_value(batched_obs, model)
            obs, reward, done, _ = env.step(action[0].numpy())

            total_reward += reward
            steps += 1
        total_steps += steps

        # outcome flags are read from the environment once the episode is over
        wins += 1 if env.won else 0
        losses += 1 if env.lost else 0

    # per-episode averages
    mean_reward = total_reward / num_episodes
    mean_len = total_steps / num_episodes
    print("Episode mean reward:", mean_reward)
    print("Episode mean length:", mean_len)
    print(f"win rate: {wins / num_episodes}, loss rate: {losses / num_episodes}")

In [None]:
# evaluate the greedy policy over 200 episodes and print the statistics
evaluate_model(model, num_episodes=200)