# Ultimate TTT Model

## Setup

In [1]:
import tensorflow as tf
import keras
from keras import layers
import scipy.signal

# used for fps logging
from datetime import datetime

# sanity check
tf.config.list_physical_devices()

2023-07-18 10:04:01.946831: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-18 10:04:04.145988: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-18 10:04:04.268508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import configparser

# parser holding the settings that the later cells read section by section
config = configparser.ConfigParser()
# read() returns the list of files it parsed successfully (echoed as the cell
# output); a file that cannot be found is skipped silently instead of raising
config.read("train.ini")

['train.ini']

## Env

In [3]:
# Env Constants
# Everything in this cell is pulled from the [ENV] and [REWARD] sections of the
# parsed config; integers via getint, floats via getfloat.
MAX_TIMESTEPS = config["ENV"].getint("MAX_TIMESTEPS")

# board constants
# ROWS/COLS are used to convert between flat indexes and (row, col) coords,
# CELLS is the multiplier between an outer cell index and a flat action index
ROWS = config["ENV"].getint("ROWS")
COLS = config["ENV"].getint("COLS")
CELLS = config["ENV"].getint("CELLS")

# socket constants
S_PORT = config["ENV"].getint("S_PORT")
A_PORT = config["ENV"].getint("A_PORT")
R_PORT = config["ENV"].getint("R_PORT")
# upper bound, in bytes, passed to recv() when reading one message
MAX_MSG_SIZE = config["ENV"].getint("MAX_MSG_SIZE")

# reward parameters
WIN_REWARD = config["REWARD"].getfloat("WIN_REWARD")
CELL_REWARD = config["REWARD"].getfloat("CELL_REWARD")
VALID_REWARD = config["REWARD"].getfloat("VALID_REWARD")
INVALID_PENALTY = config["REWARD"].getfloat("INVALID_PENALTY")
LOSS_PENALTY = config["REWARD"].getfloat("LOSS_PENALTY")

# misc
# seconds to wait after launching the game process before connecting to it
SLEEP_TIME = config["ENV"].getfloat("SLEEP_TIME")

In [4]:
# Opponents

# math
import numpy as np


# opponents
class Opponent:
    """
    Common interface for the scripted opponents the environment plays against
    """

    def get_action(self, obs) -> int:
        """
        Choose the action to play for the observation `obs`

        The base class has no policy of its own and yields None; subclasses
        override this to return an action index
        """
        return None


class RandomOpponent(Opponent):
    """
    Opponent that ignores the board and draws any action index at random
    """

    def __init__(self, n_actions) -> None:
        # size of the action space the opponent samples from
        self.n_actions = n_actions

    def get_action(self, obs) -> int:
        # a single bound makes randint sample from [0, n_actions)
        return np.random.randint(self.n_actions)


class ValidCurCellRandomOpponent(RandomOpponent):
    """
    Makes valid moves when there is a cur cell
    """

    def __init__(self, n_actions) -> None:
        super().__init__(n_actions)

    def get_cur_cell(self, obs: np.ndarray) -> tuple[bool, int]:
        """
        Find the cell the next move is restricted to.

        Channel 2 of the observation is 1 for spaces that belong to the
        current cell, so only the first space of every cell is inspected.

        Returns:
            (True, cell index) for the first flagged cell, or (False, -1)
            when no cell is flagged
        """
        for outer in range(obs.shape[0]):
            if obs[outer, 0, 2] == 1:
                return True, outer
        return False, -1

    def get_action(self, obs: np.ndarray) -> int:
        """
        Play a random empty space of the current cell when there is one,
        otherwise fall back to a completely random action.
        """
        cur_cell_exists, cur_cell = self.get_cur_cell(obs)
        if cur_cell_exists:
            # channel 0 holds the space owner; 0 means nobody has played there
            empty_spaces = [
                space for space in range(obs.shape[1]) if obs[cur_cell, space, 0] == 0
            ]
            # np.random.choice raises ValueError on an empty sequence, so a
            # flagged cell without free spaces falls through to the random move
            if empty_spaces:
                return cur_cell * CELLS + np.random.choice(empty_spaces)

        return super().get_action(obs)


class WinningRandomOpponent(ValidCurCellRandomOpponent):
    """
    Plays a move that claims a cell completing a line of claimed cells when
    one is available, otherwise behaves like ValidCurCellRandomOpponent
    """

    def __init__(self, n_actions) -> None:
        super().__init__(n_actions)

    def get_winning_cells(self, cell: list[int], turn: int) -> list[bool]:
        """
        Mark the positions of a flat ROWS x COLS grid that would complete a line.

        A line (row, column or either diagonal) qualifies when `turn` owns
        exactly two of its positions and exactly one position is 0 (unowned).

        Returns:
            a flat list of booleans, True at every position that finishes a
            qualifying line
        """
        grid = np.array(cell).reshape((ROWS, COLS)).tolist()
        completes_line = [[False] * COLS for _ in range(ROWS)]

        # every line expressed as (row, col) coordinates:
        # horizontals, then verticals, then the two diagonals
        lines = [[(r, c) for c in range(COLS)] for r in range(ROWS)]
        lines += [[(r, c) for r in range(ROWS)] for c in range(COLS)]
        lines.append([(i, i) for i in range(ROWS)])
        lines.append([(i, ROWS - 1 - i) for i in range(ROWS)])

        for line in lines:
            line_owners = [grid[r][c] for r, c in line]
            if line_owners.count(turn) == 2 and line_owners.count(0) == 1:
                # the single unowned position is the one that wins the line
                win_row, win_col = line[line_owners.index(0)]
                completes_line[win_row][win_col] = True

        return np.array(completes_line).flatten().tolist()

    def get_winning_action(self, obs: np.ndarray) -> tuple[bool, int]:
        """
        Look for an action that claims a cell which in turn completes a line
        of claimed cells.

        Returns:
            (True, flat action index) when such a move exists, (False, -1)
            otherwise
        """
        # whose turn it is
        turn = obs[0, 0, 3]
        cur_cell_exists, cur_cell = self.get_cur_cell(obs)

        # owner of every cell, read from the first space of each cell
        owners = [obs[outer, 0, 1] for outer in range(obs.shape[0])]
        winning_cells = self.get_winning_cells(owners, turn)

        # winning isn't possible currently
        if True not in winning_cells:
            return False, -1

        # with a current cell only that cell may be played, otherwise any cell
        candidates = [cur_cell] if cur_cell_exists else range(CELLS)
        for cell_idx in candidates:
            if not winning_cells[cell_idx]:
                continue
            space_owners = [obs[cell_idx, i, 0] for i in range(obs.shape[1])]
            winning_spaces = self.get_winning_cells(space_owners, turn)
            if True in winning_spaces:
                return True, cell_idx * CELLS + winning_spaces.index(True)

        return False, -1

    def get_action(self, obs: np.ndarray) -> int:
        """
        Take the winning action when there is one, else defer to the parent
        """
        can_win, winning_action = self.get_winning_action(obs)
        return winning_action if can_win else super().get_action(obs)

In [5]:
# used to send data
import os
import time
import socket

# proto definitions
import py.board_pb2 as pb

# misc
from typing import Tuple


# env
class UltimateTicTacToeEnv:
    """
    Environment wrapper around the external `uttt` game process.

    The game is launched as a background process and spoken to over three TCP
    sockets carrying protobuf messages: one for states (S_PORT), one for
    actions (A_PORT) and one for return messages (R_PORT). Every call to
    `step` plays the agent's move and, when that move was valid and the game
    is not over, the opponent's reply.
    """

    obs_dim = (9, 9, 4)
    n_actions = CELLS * CELLS

    def __init__(
        self, opponent: Opponent = RandomOpponent(n_actions), max_timesteps: int = 81
    ) -> None:
        """
        Arguments:
            * opponent: Opponent - policy used for the second player's moves
            * max_timesteps: int - number of agent steps after which `step`
              reports the episode as done

        Note: constructing the env calls `reset`, which starts the game process.
        """
        # connections must exist before reset() because cleanup() inspects them
        self.s_conn, self.a_conn, self.r_conn = None, None, None
        self.opponent = opponent
        self.max_timesteps = max_timesteps
        self.reset()

    def _receive(self, conn: socket.socket, tp: type):
        """
        Read one message from `conn` and parse it into a fresh instance of `tp`
        """
        ret = tp()
        # a single recv of at most MAX_MSG_SIZE bytes is treated as one message
        b = conn.recv(MAX_MSG_SIZE)
        ret.ParseFromString(b)
        return ret

    def _get_return(self) -> pb.ReturnMessage:
        """Receive the return message that answers the last action sent"""
        return self._receive(self.r_conn, pb.ReturnMessage)

    def _get_state(self) -> pb.StateMessage:
        """Receive the next state message"""
        return self._receive(self.s_conn, pb.StateMessage)

    def _make_coord(self, idx) -> pb.Coord:
        """Convert a flat index into a (row, col) coordinate"""
        return pb.Coord(row=idx // COLS, col=idx % COLS)

    def _send_action(self, move) -> None:
        """Serialize `move` into an ActionMessage and send it on the action socket"""
        action = pb.ActionMessage(move=move)
        self.a_conn.send(action.SerializeToString())

    def _to_idx(self, coord: pb.Coord) -> int:
        """Convert a (row, col) coordinate into a flat index"""
        return coord.row * COLS + coord.col

    def _to_multi_idx(self, move: pb.Move) -> int:
        """Convert a move (outer cell + inner space) into a flat action index"""
        return self._to_idx(move.large) * CELLS + self._to_idx(move.small)

    def _process_state(self, state: pb.StateMessage) -> np.ndarray:
        """
        The structure of the state:
        (9, 9, 4)
        Outer 9 represent board cells
        inner 9 represent the cell spaces
        each space has 4 values:
            space owner (0, 1, 2) representing if the space is claimed or not
            cell owner (0, 1, 2) representing if the cell the space belongs to is claimed or not
            curcellornot (0, 1); 1 if the space belongs to the current cell, 0 if not
            turn (1, 2) 1 if the current turn is player1, 2 if the current turn is player2
        """
        board_state = np.zeros(self.obs_dim)
        for cell_idx in range(len(state.board.cells)):
            for space_idx in range(len(state.board.cells[cell_idx].spaces)):
                board_state[cell_idx, space_idx, 0] = (
                    state.board.cells[cell_idx].spaces[space_idx].val
                )
                board_state[cell_idx, space_idx, 1] = state.cellowners[cell_idx]
                board_state[cell_idx, space_idx, 2] = (
                    1 if self._to_idx(state.board.curCell) == cell_idx else 0
                )
                board_state[cell_idx, space_idx, 3] = state.turn

        return board_state

    def _get_exploration_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """
        Reward for the legality of the move: VALID_REWARD or INVALID_PENALTY
        """
        if msg.valid:
            return VALID_REWARD
        return INVALID_PENALTY

    def _get_win_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Gets the reward for winning if the game was won

        Also records the outcome in self.won / self.lost depending on whose
        move produced the win
        """
        # the turn sent in the return message should still be the caller's turn
        if msg.state.winner == msg.state.turn:
            if self.player_turn:
                self.won = True
                return WIN_REWARD
            else:
                self.lost = True
                return LOSS_PENALTY
        return 0

    def _get_cell_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Gets the reward for claiming a cell if a cell was claimed

        self.prev_cellowners is only refreshed when the side to move gained a cell
        """
        if self.prev_cellowners == msg.state.cellowners:
            return 0
        elif list(msg.state.cellowners).count(
            msg.state.turn
        ) > self.prev_cellowners.count(msg.state.turn):
            self.prev_cellowners = list(msg.state.cellowners)
            return CELL_REWARD
        return 0

    def _get_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """Total reward of one move: legality + cell claim + win/loss"""
        return (
            self._get_exploration_reward(action, msg)
            + self._get_cell_reward(msg)
            + self._get_win_reward(msg)
        )

    def _step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Plays a single move for whichever side is to move

        Updates self.done

        Returns:
            - next state
            - reward for the action
            - done / not done
            - valid / invalid
        """
        # send action and get response
        self._send_action(self.to_move(action))
        ret_message = self._get_return()

        # return information
        reward = self._get_reward(action, ret_message)
        self.done = ret_message.state.done
        if not self.done:
            # while the game is running the next state arrives on the state socket
            return self.observe(), reward, self.done, ret_message.valid
        else:
            # once the game is over the final state is taken from the return message
            return (
                self._process_state(ret_message.state),
                reward,
                self.done,
                ret_message.valid,
            )

    def _take_opponent_turn(self) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Keep asking the opponent for actions until one of them is valid and
        return the result of that valid move
        """
        valid = False
        while not valid:
            obs, reward, done, valid = self._step(
                self.opponent.get_action(self.cur_state)
            )
        return obs, reward, done, valid

    def _reset_vars(self):
        """
        Reset the per-episode bookkeeping and open fresh connections to the game
        """
        self.prev_cellowners = [pb.NONE] * 9
        self.cur_state = None  # the current state; used for debugging
        self.won = False  # whether or not the player won
        self.done = False  # if the game is over
        self.lost = False  # whether or not the player lost
        self.player_turn = True  # whether or not it is the player's turn
        self.cur_timestep = 0

        self.s_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.a_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.r_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        # ports come from the config instead of being hard-coded
        self.s_conn.connect(("", S_PORT))
        self.a_conn.connect(("", A_PORT))
        self.r_conn.connect(("", R_PORT))

    # public section
    def observe(self) -> np.ndarray:
        """
        Updates self.cur_state and self._turn
        """
        state = self._get_state()
        self._turn = state.turn
        self.cur_state = self._process_state(state)
        return self.cur_state

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Plays the agent's action and, if it was valid and the game goes on,
        the opponent's reply

        Updates current timestep

        Returns:
            - next state
            - reward for the action
            - done / not done
            - valid / invalid
        """
        self.player_turn = True
        obs, reward, done, valid = self._step(action)
        self.cur_timestep += 1
        done = done or self.cur_timestep > self.max_timesteps

        # take opponents turn
        if valid and not done:
            self.player_turn = False
            obs, reward2, done, _ = self._take_opponent_turn()

            # add the "lost" penalty
            if self.done and self.lost:
                return obs, reward + reward2, done, valid
            # penalize losing cells
            elif reward2 == CELL_REWARD + VALID_REWARD:
                return obs, reward - reward2, done, valid
            # nothing special, just return the reward
            return obs, reward, done, valid
        return obs, reward, done, valid

    def turn(self):
        """The turn reported by the most recently observed state"""
        return self._turn

    def reset(self) -> np.ndarray:
        """
        Restart the game process and return the first observation

        Retries for as long as the freshly started process refuses connections
        """
        while True:
            try:
                self.cleanup()
                # launch the game in the background and give it time to start listening
                os.system("./uttt aivai &")
                time.sleep(SLEEP_TIME)
                self._reset_vars()
                break
            except ConnectionRefusedError:
                # the process was not accepting connections yet; start over
                pass
        return self.observe()

    def cleanup(self):
        """
        Kill any running game process and close whichever sockets are open
        """
        os.system("killall -q uttt")
        # close each connection on its own so one missing socket can't block the rest
        for conn in (self.s_conn, self.r_conn, self.a_conn):
            if conn is not None:
                conn.close()

    def __del__(self):
        self.cleanup()

    def to_move(self, idx: int) -> pb.Move:
        """
        Convert a flat action index into a move: the outer cell is idx // CELLS
        and the space inside it is idx % CELLS
        """
        outer_idx = idx // CELLS
        inner_idx = idx % CELLS

        return pb.Move(
            large=self._make_coord(outer_idx), small=self._make_coord(inner_idx)
        )

In [6]:
# Constructing the env calls reset() from __init__, so running this cell
# already launches the game process and connects to it.
env = UltimateTicTacToeEnv(
    max_timesteps=MAX_TIMESTEPS,
    opponent=WinningRandomOpponent(UltimateTicTacToeEnv.n_actions),
)

## Buffer

In [7]:
# buffer related hyperparameters
# both are forwarded to the Buffer created further down: gamma discounts the
# rewards, gamma * lam discounts the advantage deltas
gamma = config["BUFFER"].getfloat("GAMMA")
lam = config["BUFFER"].getfloat("LAM")

In [8]:
def discounted_cumulative_sums(x, discount):
    """
    Discounted cumulative sums of vectors, used for rewards-to-go and
    advantage estimates: out[t] = x[t] + discount * out[t + 1]
    """
    # run the recursive filter over the reversed sequence so that it
    # accumulates from the end, then flip the result back
    reversed_input = x[::-1]
    accumulated = scipy.signal.lfilter(
        [1], [1, float(-discount)], reversed_input, axis=0
    )
    return accumulated[::-1]


def special_discounted_cumulative_sums(x, discount, invalid_locats):
    """
    Discounted cumulative sums that keep invalid-move entries out of the sum

    invalid_locats: np.ndarray[bool], True if invalid, False if valid

    Entries flagged invalid contribute 0 to the discounted sums of the other
    steps and keep their own original value in the output.
    """
    # blank out the invalid entries so they are not accumulated
    masked = np.where(invalid_locats, 0, x)
    numerator, denominator = [1], [1, float(-discount)]
    discounted = scipy.signal.lfilter(numerator, denominator, masked[::-1], axis=0)[
        ::-1
    ]
    # invalid positions get their untouched value back, valid ones the discounted sum
    return np.where(invalid_locats, x, discounted)


class Buffer:
    """
    Buffer for storing trajectories

    Holds `size` steps of agent-environment interaction in preallocated
    arrays, and computes advantages and rewards-to-go per trajectory.
    """

    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        """
        Arguments:
            * observation_dimensions - shape of a single observation
            * size - number of steps the buffer can hold
            * gamma - discount applied to rewards
            * lam - extra discount applied (as gamma * lam) to advantage deltas
        """
        # Buffer initialization
        self.observation_buffer = np.zeros(
            (size, *observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # pointer: next free slot; trajectory_start_index: first slot of the open trajectory
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        """Append one step of agent-environment interaction"""
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        """
        Finish the trajectory by computing advantage estimates and rewards-to-go

        Arguments:
            * last_value - value appended after the last stored step of the
              trajectory, for both the rewards and the values
        """
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        # Rewards are stored as float32, so the penalty has to be rounded to
        # float32 as well: comparing against the float64 constant never matches
        # for values that float32 cannot represent exactly (e.g. -0.1).
        invalid_steps = self.reward_buffer[path_slice] == np.float32(INVALID_PENALTY)
        # the appended last_value is not a move, so it is never treated as invalid
        invalid_with_last = np.append(invalid_steps, False)

        self.advantage_buffer[path_slice] = special_discounted_cumulative_sums(
            deltas, self.gamma * self.lam, invalid_steps
        )
        self.return_buffer[path_slice] = special_discounted_cumulative_sums(
            rewards, self.gamma, invalid_with_last
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        """
        Get all data of the buffer and normalize the advantages

        Resets the write position so the buffer can be filled again.

        Returns:
            (observations, actions, normalized advantages, returns, log-probabilities)
        """
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        # identical advantages have zero spread; dividing by it would turn the
        # whole buffer into NaN, so only center the values in that case
        if advantage_std == 0:
            advantage_std = 1.0
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )

## Loss

In [9]:
# loss related hyperparameters
# clip_ratio bounds the probability ratio inside train_mod; v_coef,
# entropy_coef and reg_coef weight the critic, entropy and regularizer terms
# of its total loss. target_kl is declared global by train_model.
clip_ratio = config["LOSS"].getfloat("CLIP_RATIO")
target_kl = config["LOSS"].getfloat("TARGET_KL")
v_coef = config["LOSS"].getfloat("VALUE_COEFFICIENT")
entropy_coef = config["LOSS"].getfloat("ENTROPY_COEFFICIENT")
reg_coef = config["LOSS"].getfloat("REGULARIZER_COEFFICIENT")

In [10]:
@tf.function
def logprobabilities(logits, a):
    """
    Log-probabilities of the actions `a` under the distribution described by
    `logits` (i.e. the output of the actor)
    """
    all_log_probs = tf.nn.log_softmax(logits)
    # the one-hot mask zeroes every entry except the one of the chosen action
    chosen_mask = tf.one_hot(a, UltimateTicTacToeEnv.n_actions)
    return tf.reduce_sum(chosen_mask * all_log_probs, axis=1)


# Sample action from actor
@tf.function
def sample_action_value(observation, model: keras.Model):
    """
    Run the model on `observation` and sample one action per row of logits

    Returns:
        (logits, sampled action, value estimate)
    """
    logits, value = model(observation)
    # categorical yields one sample per row; drop the sample axis
    sampled = tf.random.categorical(logits, 1)
    action = tf.squeeze(sampled, axis=1)
    return logits, action, value


# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_mod(
    observation_buffer,
    action_buffer,
    logprobability_buffer,
    advantage_buffer,
    return_buffer,
    model: keras.Model,
):
    """
    Apply one gradient step of the combined PPO loss to `model`

    The loss is the clipped surrogate objective plus the weighted critic,
    entropy and regularizer terms.

    Returns:
        (kl estimate after the update, clip loss, critic loss, entropy loss,
        regularizer loss)
    """
    with tf.GradientTape() as grad_tape:
        policy_logits, value_preds = model(observation_buffer)

        new_logprobs = logprobabilities(policy_logits, action_buffer)
        # ratio of new to old action probabilities, computed in log space
        prob_ratio = tf.exp(new_logprobs - logprobability_buffer)

        # L_clip = E_t * ( min( r_t*A_t, clip(r_t, 1-e, 1+e)*A_t ) )
        unclipped_term = prob_ratio * advantage_buffer
        clipped_term = (
            tf.clip_by_value(prob_ratio, 1 - clip_ratio, 1 + clip_ratio)
            * advantage_buffer
        )
        clip_loss = -tf.reduce_mean(tf.minimum(unclipped_term, clipped_term))

        # activity regularizers collected by the model
        regularizer_loss = tf.reduce_sum(model.losses)

        # critic loss = MSE between the returns and the predicted values
        critic_loss = tf.keras.losses.mse(return_buffer, tf.squeeze(value_preds))

        # entropy term to encourage exploration: the mean log-probability of
        # the actions that were taken
        entropy_loss = -tf.reduce_mean(-new_logprobs)

        # full loss
        total_loss = (
            clip_loss
            + critic_loss * v_coef
            + entropy_loss * entropy_coef
            + regularizer_loss * reg_coef
        )

    trainable = model.trainable_variables
    grads = grad_tape.gradient(total_loss, trainable)
    model.optimizer.apply_gradients(zip(grads, trainable))

    # measure how far the updated policy moved away from the one that acted
    updated_logits, _ = model(observation_buffer)
    kl = tf.reduce_sum(
        tf.reduce_mean(
            logprobability_buffer - logprobabilities(updated_logits, action_buffer)
        )
    )

    return kl, clip_loss, critic_loss, entropy_loss, regularizer_loss

## Model

In [11]:
# model_name selects the saved model file (models/<model_name>.keras) and the
# log directory (./logs/<model_name>)
model_name = "ppo15"
# when True, an existing saved model is loaded instead of building a new one
load_model = True

In [12]:
def create_shared_layers(input_layer):
    """
    Build the layers shared by the policy and value heads

    Returns the output tensor of the last shared layer
    """
    # the layers are created and applied strictly in the order listed
    trunk = [
        # conv layers
        layers.Conv1D(128, 9, padding="valid", activation="selu"),
        layers.Conv2D(
            2048, (9, 1), padding="valid", activation="selu", activity_regularizer="l2"
        ),
        layers.Dropout(0.4),
        # dense layers
        layers.Flatten(),
        layers.Dense(2048, activation="selu", activity_regularizer="l2"),
        layers.Dropout(0.4),
        layers.BatchNormalization(),
        layers.Dense(2048, activation="selu", activity_regularizer="l2"),
        layers.Dropout(0.4),
        layers.BatchNormalization(),
        layers.Dense(1024, activation="selu", activity_regularizer="l2"),
        layers.Dropout(0.3),
        layers.Dense(1024, activation="selu", activity_regularizer="l2"),
    ]

    features = input_layer
    for layer in trunk:
        features = layer(features)
    return features


def create_model():
    """
    Assemble the actor-critic model: a shared trunk feeding a logits head
    (one output per action) and a single-unit value head
    """
    board_input = tf.keras.Input(shape=UltimateTicTacToeEnv.obs_dim)
    shared_features = create_shared_layers(board_input)

    # policy head first, value head second
    policy_head = tf.keras.layers.Dense(UltimateTicTacToeEnv.n_actions)
    value_head = tf.keras.layers.Dense(1)
    logits = policy_head(shared_features)
    values = value_head(shared_features)

    return tf.keras.Model(inputs=board_input, outputs=(logits, values))


# Resume from the saved model when loading is enabled and the file exists;
# otherwise start from a freshly built model.
if load_model and os.path.exists(f"models/{model_name}.keras"):
    print("loading model...")
    model = tf.keras.models.load_model(f"models/{model_name}.keras")
else:
    print("creating model...")
    model = create_model()

loading model...


2023-07-18 10:04:06.926157: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-18 10:04:06.926398: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-18 10:04:06.926534: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [13]:
# print the layer-by-layer summary of the model that was loaded or created above
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9, 9, 4)]            0         []                            
                                                                                                  
 conv1d (Conv1D)             (None, 9, 1, 128)            4736      ['input_1[0][0]']             
                                                                                                  
 conv2d (Conv2D)             (None, 1, 1, 2048)           2361344   ['conv1d[0][0]']              
                                                                                                  
 dropout (Dropout)           (None, 1, 1, 2048)           0         ['conv2d[0][0]']              
                                                                                              

## Optimizers

In [14]:
# learning rate hyperparams
# applied to the optimizer in the next cell, for loaded and new models alike
learning_rate = config["OPTIMIZER"].getfloat("LEARNING_RATE")

In [15]:
# Same condition as the cell that loaded the model: a loaded model keeps its
# own optimizer and only gets the configured learning rate, while a new model
# is compiled with a fresh Adam optimizer.
if load_model and os.path.exists(f"models/{model_name}.keras"):
    # NOTE(review): assumes the saved model was compiled, i.e. model.optimizer is set
    optim: tf.keras.optimizers.Adam = model.optimizer
    optim.learning_rate = learning_rate
else:
    optim = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optim)

In [16]:
# confirm the learning rate the optimizer ended up with
print(optim.learning_rate)

<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=3e-06>


## Train

In [17]:
# training time hyperparameters
# epochs: number of epochs train_model runs per call
# steps_per_epoch: environment steps collected per epoch (also the Buffer size)
train_iterations = config["TRAIN"].getint("TRAIN_ITERATIONS")
epochs = config["TRAIN"].getint("EPOCHS")
minibatch_size = config["TRAIN"].getint("MINIBATCH_SIZE")
steps_per_epoch = config["TRAIN"].getint("STEPS_PER_EPOCH")

In [18]:
# presumably the epoch number to resume counting from — TODO confirm against the caller of train_model
epoch = config["TRAIN"].getint("EPOCH")
# summaries for this model are written below ./logs/<model_name>
summary_writer = tf.summary.create_file_writer(f"./logs/{model_name}")

In [19]:
# one buffer slot per environment step of an epoch
buffer = Buffer(UltimateTicTacToeEnv.obs_dim, steps_per_epoch, gamma=gamma, lam=lam)

In [20]:
# shuffle
def get_shuffled_values(*buffers):
    """
    Shuffle the first five buffers in unison, keeping entries of one step together

    Expected order of the buffers:
        0 - observation_buffer,
        1 - action_buffer,
        2 - advantage_buffer,
        3 - return_buffer,
        4 - logprobability_buffer

    Returns a list with one tuple per buffer, all in the same shuffled order
    """
    # one row per step: (observation, action, advantage, return, logprobability)
    per_step_rows = list(zip(*[buffers[i] for i in range(5)]))
    np.random.shuffle(per_step_rows)
    # transpose back to one sequence per buffer
    return list(zip(*per_step_rows))


# main training function
def train_model(
    model: keras.Model,
    epoch_start: int,
    use_kl: bool = True,
    use_adaptive_kl: bool = True,
    shuffle: bool = True,
) -> int:
    """
    Train the model for `epochs` epochs.

    Each epoch:
        1. steps `env` for `steps_per_epoch` steps with actions obtained from
           `sample_action_value`, storing every transition in `buffer`
        2. runs up to `train_iterations` passes over the collected data, each
           split into `steps_per_epoch // minibatch_size` minibatches that are
           handed to `train_mod`
        3. prints episode statistics and writes scalars to `summary_writer`
        4. saves the model to "model_temp.keras" when `epoch % 5 == 0`

    Arguments:
        * model: keras.Model - the model to train
        * epoch_start: int - the epoch to start logging at
        * use_kl: bool - whether or not to stop early if kl divergence > 1.5 * target_kl
        * use_adaptive_kl: bool - whether or not to adapt target_kl (multiply it by 1.5
          after an early stop, divide it by 1.2 after an epoch without one);
          only has an effect when use_kl is set
        * shuffle: bool - whether or not to shuffle the observations before each pass
    Returns:
        * int - `epoch_start + epochs`, i.e. the value to pass as `epoch_start`
          to continue training (`epoch_start` itself when `epochs` <= 0)
    Raises:
        * ValueError - if `minibatch_size`, `steps_per_epoch` or `train_iterations`
          would result in zero minibatches or zero update passes
    Side effects:
        * rescales the global `target_kl` when use_kl and use_adaptive_kl are set
    """
    global target_kl

    # Fail fast on settings that would otherwise only crash (undefined loop
    # variables) after a full rollout has already been collected
    if minibatch_size < 1 or steps_per_epoch < minibatch_size:
        raise ValueError(
            "steps_per_epoch must be >= minibatch_size >= 1, got "
            + f"steps_per_epoch={steps_per_epoch}, minibatch_size={minibatch_size}"
        )
    if train_iterations < 1:
        raise ValueError(f"train_iterations must be >= 1, got {train_iterations}")

    # samples past num_minibatches * minibatch_size are left out of each pass
    num_minibatches = steps_per_epoch // minibatch_size

    # Iterate over the number of epochs
    for epoch in range(epoch_start, epoch_start + epochs):
        # Initialize the sum of the returns, lengths and number of episodes for each epoch
        sum_return = 0
        num_episodes = 0
        episode_return = 0
        episode_length = 0
        lengths = []

        # logging variables
        num_valid = 0
        num_wins = 0
        num_losses = 0

        # Iterate over the steps of each epoch
        observation = env.reset()
        start_time = datetime.now()
        for t in range(steps_per_epoch):
            # Get the logits, action, and take one step in the environment
            observation = observation.reshape(1, *env.obs_dim)
            logits, action, value_t = sample_action_value(observation, model)
            observation_new, reward, done, valid = env.step(action[0].numpy())
            episode_return += reward
            episode_length += 1

            # logging variables
            num_valid += 1 if valid else 0
            num_wins += 1 if env.won else 0
            num_losses += 1 if env.lost else 0

            # Get the value and log-probability of the action
            logprobability_t = logprobabilities(logits, action)

            # Store obs, act, rew, v_t, logp_pi_t
            buffer.store(observation, action, reward, value_t, logprobability_t)

            # Update the observation
            observation = observation_new

            # Finish trajectory if reached to a terminal state
            # (or if the epoch's step budget is used up mid-episode)
            if done or t == steps_per_epoch - 1:
                # a cut-off episode is bootstrapped with the model's second output
                # for the next observation; a finished one gets 0
                last_value = (
                    0 if done else model(observation.reshape(1, *env.obs_dim))[1]
                )
                buffer.finish_trajectory(last_value)
                sum_return += episode_return
                lengths.append(episode_length)
                num_episodes += 1
                observation, episode_return, episode_length = env.reset(), 0, 0

            # log (end="\r" keeps the progress on a single, overwritten line)
            print(
                f"Step {t+1} / {steps_per_epoch};\t"
                + f"% valid = {num_valid / (t+1):.4f};\t"
                + f"fps: {(t+1)/(datetime.now()-start_time).total_seconds():.2f};\t"
                + f"win rate: {(num_wins/num_episodes if num_episodes > 0 else 0):.2f}\t"
                + f"loss rate: {(num_losses/num_episodes if num_episodes > 0 else 0):.2f}",
                end="\r",
            )
        print()

        # order: observations, actions, advantages, returns, logprobabilities
        rollout = buffer.get()
        (
            observation_buffer,
            action_buffer,
            advantage_buffer,
            return_buffer,
            logprobability_buffer,
        ) = rollout

        # Update the policy and implement early stopping using KL divergence
        kl = 0
        clip_loss = 0
        critic_loss = 0
        entropy_loss = 0
        regularizer_loss = 0
        for it in range(train_iterations):
            (
                _observation_buffer,
                _action_buffer,
                _advantage_buffer,
                _return_buffer,
                _logprobability_buffer,
            ) = (
                get_shuffled_values(
                    observation_buffer,
                    action_buffer,
                    advantage_buffer,
                    return_buffer,
                    logprobability_buffer,
                )
                if shuffle
                else (
                    observation_buffer,
                    action_buffer,
                    advantage_buffer,
                    return_buffer,
                    logprobability_buffer,
                )
            )
            temp_clip_loss = 0
            temp_regularizer_loss = 0
            temp_critic_loss = 0
            temp_entropy_loss = 0
            stopped_early = False
            # do minibatches
            for i in range(num_minibatches):
                window = slice(i * minibatch_size, (i + 1) * minibatch_size)
                # get the parts of the kl and losses
                (
                    kl,
                    clip_loss_part,
                    critic_loss_part,
                    entropy_loss_part,
                    regularizer_loss_part,
                ) = train_mod(
                    tf.constant(_observation_buffer[window]),  # obs
                    tf.constant(_action_buffer[window]),  # act
                    tf.constant(_logprobability_buffer[window]),  # logprobs
                    tf.constant(_advantage_buffer[window]),  # advantages
                    tf.constant(_return_buffer[window]),  # returns
                    model,
                )
                # update the temps
                temp_clip_loss += clip_loss_part
                temp_critic_loss += critic_loss_part
                temp_entropy_loss += entropy_loss_part
                temp_regularizer_loss += regularizer_loss_part
                if use_kl and kl > 1.5 * target_kl:
                    stopped_early = True
                    if use_adaptive_kl:
                        target_kl *= 1.5
                    break

            # average the temps to get the amount per that pass
            # (i + 1 is the number of minibatches actually run, also after a break)
            batches_run = i + 1
            temp_clip_loss /= batches_run
            temp_regularizer_loss /= batches_run
            temp_critic_loss /= batches_run
            temp_entropy_loss /= batches_run

            # update the main counts
            clip_loss += temp_clip_loss
            regularizer_loss += temp_regularizer_loss
            critic_loss += temp_critic_loss
            entropy_loss += temp_entropy_loss

            if stopped_early:
                # Early Stopping
                break
        else:
            # for/else: only reached when no pass stopped early
            if use_kl and use_adaptive_kl:
                target_kl /= 1.2

        # average over the passes that actually ran
        passes_run = it + 1
        clip_loss /= passes_run
        critic_loss /= passes_run
        regularizer_loss /= passes_run
        entropy_loss /= passes_run

        # Print mean return and length for each epoch
        print(
            f"Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum(lengths) / num_episodes}. STD Length: {np.std(lengths)}"
        )
        print("=" * 64)

        # log scalars
        with summary_writer.as_default():
            # episode info
            tf.summary.scalar("episode/win rate", num_wins / num_episodes, step=epoch)
            tf.summary.scalar(
                "episode/loss rate", num_losses / num_episodes, step=epoch
            )
            # divide by the number of steps taken (not the last step index) so the
            # value matches the "% valid" figure of the progress line
            tf.summary.scalar(
                "episode/valid percentage", num_valid / steps_per_epoch, step=epoch
            )
            tf.summary.scalar(
                "episode/mean reward", sum_return / num_episodes, step=epoch
            )
            tf.summary.scalar(
                "episode/mean length", sum(lengths) / num_episodes, step=epoch
            )
            tf.summary.scalar("episode/std length", np.std(lengths), step=epoch)

            # training info
            tf.summary.scalar("train/clip_loss", clip_loss, step=epoch)
            tf.summary.scalar("train/regularizer_loss", regularizer_loss, step=epoch)
            # kl of the last minibatch processed in this epoch
            tf.summary.scalar("train/kl", kl, step=epoch)
            tf.summary.scalar("train/critic_loss", critic_loss, step=epoch)
            tf.summary.scalar("train/entropy_loss", entropy_loss, step=epoch)
            tf.summary.scalar(
                "train/total_loss",
                clip_loss + regularizer_loss + critic_loss,
                step=epoch,
            )
            tf.summary.scalar("train/num_iterations", passes_run, step=epoch)

        # save every 5
        if epoch % 5 == 0:
            model.save("model_temp.keras")

    # one past the last epoch that ran; stays at epoch_start when no epoch ran
    return epoch_start + max(epochs, 0)

In [21]:
# run training with KL early stopping disabled; the returned value is stored
# back into `epoch` so running this cell again continues from the next epoch
epoch = train_model(model, epoch, use_kl=False, use_adaptive_kl=False, shuffle=True)

Step 2 / 75000;	% valid = 1.0000;	fps: 12.76;	win rate: 0.00	loss rate: 0.00

2023-07-17 21:11:22.961783: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Step 75000 / 75000;	% valid = 0.9906;	fps: 420.82;	win rate: 0.92	loss rate: 0.07


2023-07-17 21:14:22.926908: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2023-07-17 21:14:24.802308: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.18GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-07-17 21:14:33.566615: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.18GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


Epoch: 4261. Mean Return: 0.6873828330700431. Mean Length: 19.747235387045812. STD Length: 3.8245949570453277
Step 75000 / 75000;	% valid = 0.9911;	fps: 419.98;	win rate: 0.91	loss rate: 0.08
Epoch: 4262. Mean Return: 0.6857199159222304. Mean Length: 19.705727798213346. STD Length: 3.806624171255592
Step 75000 / 75000;	% valid = 0.9929;	fps: 421.30;	win rate: 0.92	loss rate: 0.07
Epoch: 4263. Mean Return: 0.7355340314136154. Mean Length: 19.63350785340314. STD Length: 3.563886305807701
Step 75000 / 75000;	% valid = 0.9938;	fps: 420.73;	win rate: 0.92	loss rate: 0.07
Epoch: 4264. Mean Return: 0.7487706422018342. Mean Length: 19.65923984272608. STD Length: 3.546397644155295
Step 75000 / 75000;	% valid = 0.9928;	fps: 419.57;	win rate: 0.92	loss rate: 0.07
Epoch: 4265. Mean Return: 0.7315800470096674. Mean Length: 19.587359623922694. STD Length: 3.557241067597956
Step 75000 / 75000;	% valid = 0.9926;	fps: 419.42;	win rate: 0.93	loss rate: 0.06
Epoch: 4266. Mean Return: 0.7354569258968351. 

2023/07/17 22:45:09 failed to listen on state port


Step 75000 / 75000;	% valid = 0.9931;	fps: 421.56;	win rate: 0.92	loss rate: 0.07
Epoch: 4290. Mean Return: 0.7408468799160995. Mean Length: 19.664394336654432. STD Length: 3.5813427549454153
Step 75000 / 75000;	% valid = 0.9904;	fps: 423.72;	win rate: 0.92	loss rate: 0.07
Epoch: 4291. Mean Return: 0.6849341412012667. Mean Length: 19.75763962065332. STD Length: 3.874378445375159
Step 75000 / 75000;	% valid = 0.9909;	fps: 423.53;	win rate: 0.92	loss rate: 0.07
Epoch: 4292. Mean Return: 0.6925039370078745. Mean Length: 19.68503937007874. STD Length: 3.7785847467985567
Step 75000 / 75000;	% valid = 0.9917;	fps: 423.12;	win rate: 0.92	loss rate: 0.07
Epoch: 4293. Mean Return: 0.7123730597211256. Mean Length: 19.73164956590371. STD Length: 3.715701963720229
Step 75000 / 75000;	% valid = 0.9916;	fps: 422.26;	win rate: 0.93	loss rate: 0.07
Epoch: 4294. Mean Return: 0.7106703325477899. Mean Length: 19.638648860958366. STD Length: 3.7112080311630167
Step 75000 / 75000;	% valid = 0.9919;	fps: 42

2023/07/18 01:53:19 failed to listen on state port


Step 75000 / 75000;	% valid = 0.9930;	fps: 420.63;	win rate: 0.93	loss rate: 0.07
Epoch: 4350. Mean Return: 0.7415127598000533. Mean Length: 19.73164956590371. STD Length: 3.5894362005966323
Step 75000 / 75000;	% valid = 0.9929;	fps: 421.82;	win rate: 0.92	loss rate: 0.07
Epoch: 4351. Mean Return: 0.7373555672268917. Mean Length: 19.695378151260503. STD Length: 3.5920020376545208
Step 75000 / 75000;	% valid = 0.9917;	fps: 421.97;	win rate: 0.92	loss rate: 0.07
Epoch: 4352. Mean Return: 0.70928384736017. Mean Length: 19.602718243596446. STD Length: 3.7339001572461683
Step 75000 / 75000;	% valid = 0.9941;	fps: 422.16;	win rate: 0.92	loss rate: 0.07
Epoch: 4353. Mean Return: 0.7570083463745436. Mean Length: 19.561815336463223. STD Length: 3.4937849941139825
Step 75000 / 75000;	% valid = 0.9942;	fps: 422.15;	win rate: 0.93	loss rate: 0.07
Epoch: 4354. Mean Return: 0.7665005219206685. Mean Length: 19.572025052192068. STD Length: 3.445221535913576
Step 75000 / 75000;	% valid = 0.9916;	fps: 4

2023/07/18 03:38:50 failed to listen on state port


Step 75000 / 75000;	% valid = 0.9921;	fps: 422.02;	win rate: 0.91	loss rate: 0.08
Epoch: 4384. Mean Return: 0.7101990570979583. Mean Length: 19.64379256155055. STD Length: 3.652376318515895
Step 75000 / 75000;	% valid = 0.9936;	fps: 421.33;	win rate: 0.93	loss rate: 0.07
Epoch: 4385. Mean Return: 0.7542633228840125. Mean Length: 19.592476489028215. STD Length: 3.4688591999580716
Step 29644 / 75000;	% valid = 0.9966;	fps: 417.97;	win rate: 0.92	loss rate: 0.08

## Save

In [21]:
# remove previous
# os.remove instead of shelling out to `rm`: model_name is never interpreted by
# a shell, it works on every platform, and only the expected "no previous file"
# case is ignored (any other failure now raises instead of passing silently)
try:
    os.remove(f"models/{model_name}.keras")
except FileNotFoundError:
    pass
# kept from the original cell; presumably gives the filesystem a moment to
# settle before writing - TODO confirm it is still needed
time.sleep(1)

# save
model.save(f"models/{model_name}.keras")

2023-07-18 10:04:18.974878: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 16777216 exceeds 10% of free system memory.
2023-07-18 10:04:18.995663: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 16777216 exceeds 10% of free system memory.


## Evaluate

In [22]:
def evaluate_model(model: keras.Model, num_episodes: int = 10):
    """
    Play `num_episodes` episodes on the global `env`, always picking the action
    with the highest logit, and print the resulting statistics.

    Arguments:
        * model: keras.Model - called on a batch of one observation; its first
          output is used as the action logits, the second is ignored
        * num_episodes: int - how many episodes to play

    Prints the mean reward per episode, the mean episode length, and the
    win / loss rates (taken from `env.won` / `env.lost` after each episode).
    Nothing is returned.
    """
    total_reward = 0
    total_steps = 0
    wins = 0
    losses = 0

    for _ in range(num_episodes):
        # start a fresh episode
        obs = env.reset()
        finished = False
        steps_taken = 0

        # play until the environment reports the episode as done
        while not finished:
            policy_logits, _ = model(np.expand_dims(obs, axis=0))
            # greedy action: index of the largest logit
            action = np.argmax(policy_logits.numpy().flatten())
            obs, reward, finished, _ = env.step(action)

            total_reward += reward
            steps_taken += 1

        total_steps += steps_taken
        wins += 1 if env.won else 0
        losses += 1 if env.lost else 0

    # turn the totals into per-episode averages
    mean_reward = total_reward / num_episodes
    mean_len = total_steps / num_episodes
    print("Episode mean reward:", mean_reward)
    print("Episode mean length:", mean_len)
    print(f"win rate: {wins / num_episodes}, loss rate: {losses / num_episodes}")

In [23]:
# greedy evaluation over 200 episodes; the statistics are printed below
evaluate_model(model, num_episodes=200)

2023-07-18 10:04:30.265398: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Episode mean reward: 0.5906000000000351
Episode mean length: 19.63
win rate: 0.93, loss rate: 0.055
