# Ultimate TTT Model

## Setup

In [1]:
import tensorflow as tf
import keras
from keras import layers
import scipy.signal

# used for fps logging
from datetime import datetime

# sanity check: list the physical devices TensorFlow can see (CPU / GPU)
tf.config.list_physical_devices()

2023-07-07 20:05:54.643969: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-07 20:05:55.897233: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-07 20:05:55.914236: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Env

In [2]:
# Env Constants
MAX_TIMESTEPS = 50  # step cap handed to UltimateTicTacToeEnv when the env is created

# board constants
ROWS, COLS = 3, 3
CELLS = ROWS * COLS  # 9

# socket constants
S_PORT, A_PORT, R_PORT = 8000, 8001, 8002  # state / action / return ports
MAX_MSG_SIZE = 512  # byte count requested per socket recv

# reward parameters
WIN_REWARD = 1  # 1
CELL_REWARD = 0.1  # 0.3
VALID_REWARD = 0.01
INVALID_PENALTY = -1
# the valid-move and cell rewards are subtracted on top of the base loss penalty
LOSS_PENALTY = -0.3 - VALID_REWARD - CELL_REWARD  # 0.3

In [3]:
# used to send data
import os
import time
import socket

# proto definitions
import py.board_pb2 as pb

# misc
from typing import Tuple

# math
import numpy as np


# opponents
class Opponent:
    """Interface for opponent policies used by the environment.

    Subclasses override `get_action` to map an observation to an action index.
    """

    def get_action(self, obs) -> int:
        """Return the action index to play for `obs`.

        Raises:
            NotImplementedError: always, in this base class. Previously the
                method fell through and returned None despite the `int`
                annotation, which only surfaced later as an unrelated error.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement get_action()"
        )


class RandomOpponent(Opponent):
    """Opponent that ignores the observation and plays a uniformly random action index."""

    def __init__(self, n_actions) -> None:
        # number of distinct action indices to draw from
        self.n_actions = n_actions

    def get_action(self, obs) -> int:
        """Return a random integer in [0, n_actions); `obs` is not used."""
        upper_bound = self.n_actions
        return np.random.randint(upper_bound)


# env
class UltimateTicTacToeEnv:
    """Ultimate Tic-Tac-Toe environment backed by an external `uttt` process.

    Every `reset` kills any running `uttt` process, launches `./uttt aivai` in
    the background and opens three TCP connections to it: one for states
    (`s_conn`), one for actions (`a_conn`) and one for return messages
    (`r_conn`). Messages are the protobuf types from `py.board_pb2`.

    Actions are integers in `[0, n_actions)`: `action // CELLS` selects the
    large-board cell and `action % CELLS` the space inside that cell.
    """

    obs_dim = (9, 9, 4)
    n_actions = CELLS * CELLS

    def __init__(
        self, opponent: "Opponent | None" = None, max_timesteps: int = 81
    ) -> None:
        """
        Args:
            opponent: policy that plays the opposing side; when omitted a
                `RandomOpponent` over all `n_actions` actions is created.
            max_timesteps: `step` reports done once more than this many
                player steps have been taken in the episode.
        """
        self.s_conn, self.a_conn, self.r_conn = None, None, None
        # The default is built here rather than in the signature so that no
        # opponent is instantiated at class-definition time and shared
        # between environments.
        self.opponent = (
            RandomOpponent(self.n_actions) if opponent is None else opponent
        )
        self.max_timesteps = max_timesteps
        self.reset()

    def _receive(self, conn: socket.socket, tp: type):
        """Read one message from `conn` and parse it into a new `tp` instance."""
        # NOTE(review): a single recv() of at most MAX_MSG_SIZE bytes is
        # assumed to contain exactly one complete message - confirm against
        # the server's framing.
        ret = tp()
        ret.ParseFromString(conn.recv(MAX_MSG_SIZE))
        return ret

    def _get_return(self) -> pb.ReturnMessage:
        """Read the next `ReturnMessage` from the return connection."""
        return self._receive(self.r_conn, pb.ReturnMessage)

    def _get_state(self) -> pb.StateMessage:
        """Read the next `StateMessage` from the state connection."""
        return self._receive(self.s_conn, pb.StateMessage)

    def _make_coord(self, idx) -> pb.Coord:
        """Convert a flat index into a (row, col) `Coord` (row-major, COLS wide)."""
        return pb.Coord(row=idx // COLS, col=idx % COLS)

    def _send_action(self, move) -> None:
        """Serialize `move` into an `ActionMessage` and send it on the action connection."""
        action = pb.ActionMessage(move=move)
        self.a_conn.send(action.SerializeToString())

    def _to_idx(self, coord: pb.Coord) -> int:
        """Convert a (row, col) `Coord` into a flat row-major index."""
        return coord.row * COLS + coord.col

    def _to_multi_idx(self, move: pb.Move) -> int:
        """Convert a `Move` into its flat action index (inverse of `to_move`)."""
        return self._to_idx(move.large) * CELLS + self._to_idx(move.small)

    def _process_state(self, state: pb.StateMessage) -> np.ndarray:
        """
        Convert a state message into an array of shape `obs_dim` = (9, 9, 4).

        The outer 9 index the board cells, the inner 9 index the spaces of a
        cell, and each space carries 4 channels:
            0: space owner (0, 1, 2) - whether/by whom the space is claimed
            1: cell owner (0, 1, 2) - whether/by whom the enclosing cell is claimed
            2: current-cell flag (0, 1) - 1 if the space belongs to the current cell
            3: turn (1, 2) - 1 if it is player1's turn, 2 if it is player2's
        """
        board_state = np.zeros(self.obs_dim)
        # identical for every space, so compute it once instead of per space
        cur_cell_idx = self._to_idx(state.board.curCell)
        for cell_idx, cell in enumerate(state.board.cells):
            in_cur_cell = 1 if cur_cell_idx == cell_idx else 0
            for space_idx, space in enumerate(cell.spaces):
                board_state[cell_idx, space_idx, 0] = space.val
                board_state[cell_idx, space_idx, 1] = state.cellowners[cell_idx]
                board_state[cell_idx, space_idx, 2] = in_cur_cell
                board_state[cell_idx, space_idx, 3] = state.turn

        return board_state

    def _get_exploration_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """Return VALID_REWARD for a valid move and INVALID_PENALTY otherwise."""
        if msg.valid:
            return VALID_REWARD
        return INVALID_PENALTY

    def _get_win_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Get the reward for the game being won, if it was won.

        Sets `self.won` (player's move) or `self.lost` (opponent's move) as a
        side effect.
        """
        # the turn sent in the return message should still be the caller's turn
        if msg.state.winner == msg.state.turn:
            if self.player_turn:
                self.won = True
                return WIN_REWARD
            self.lost = True
            return LOSS_PENALTY
        return 0

    def _get_cell_reward(self, msg: pb.ReturnMessage) -> float:
        """
        Get the reward for claiming a cell, if a cell was claimed.

        `self.prev_cellowners` is updated only when the side named by
        `msg.state.turn` owns more cells than it did in the previous snapshot.
        """
        current_owners = list(msg.state.cellowners)
        if self.prev_cellowners == current_owners:
            return 0
        mover = msg.state.turn
        if current_owners.count(mover) > self.prev_cellowners.count(mover):
            self.prev_cellowners = current_owners
            return CELL_REWARD
        return 0

    def _get_reward(self, action: int, msg: pb.ReturnMessage) -> float:
        """Sum the exploration, cell and win rewards for one move."""
        return (
            self._get_exploration_reward(action, msg)
            + self._get_cell_reward(msg)
            + self._get_win_reward(msg)
        )

    def _step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Send one move and read its result. Updates self.done.

        Returns (observation, reward, done, valid).
        """
        # send action and get response
        self._send_action(self.to_move(action))
        ret_message = self._get_return()

        # return information
        reward = self._get_reward(action, ret_message)
        self.done = ret_message.state.done
        if self.done:
            # game over: build the observation from the state embedded in the
            # return message instead of reading the state connection
            obs = self._process_state(ret_message.state)
        else:
            obs = self.observe()
        return obs, reward, self.done, ret_message.valid

    def _take_opponent_turn(self) -> Tuple[np.ndarray, float, bool, bool]:
        """Ask the opponent for actions until one is reported valid; return that step's result."""
        valid = False
        while not valid:
            obs, reward, done, valid = self._step(
                self.opponent.get_action(self.cur_state)
            )
        return obs, reward, done, valid

    def _reset_vars(self):
        """Reset the per-episode bookkeeping and open fresh connections to the game process."""
        self.prev_cellowners = [pb.NONE] * 9
        self.cur_state = None  # the current state; used for debugging
        self.won = False  # whether or not the player won
        self.done = False  # if the game is over
        self.lost = False  # whether or not the player lost
        self.player_turn = True  # whether or not it is the player's turn
        self.cur_timestep = 0

        self.s_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.a_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.r_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        # use the module-level port constants instead of repeating the literals
        self.s_conn.connect(("", S_PORT))
        self.a_conn.connect(("", A_PORT))
        self.r_conn.connect(("", R_PORT))

    # public section
    def observe(self) -> np.ndarray:
        """
        Read the next state from the state connection and return it as an array.

        Updates self.cur_state and self._turn.
        """
        state = self._get_state()
        self.cur_state = state
        self._turn = state.turn
        return self._process_state(state)

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """
        Play the player's action and, if it was valid and the episode is not
        over, the opponent's reply. Updates the current timestep.

        Returns:
            - next state
            - reward for the action
            - done / not done
            - valid / invalid
        """
        self.player_turn = True
        obs, reward, done, valid = self._step(action)
        self.cur_timestep += 1
        done = done or self.cur_timestep > self.max_timesteps

        # take opponents turn
        if valid and not done:
            self.player_turn = False
            obs, opponent_reward, done, _ = self._take_opponent_turn()
            # the opponent's reward only counts when its move ended the game
            # with a loss for the player
            if self.done and self.lost:
                reward = reward + opponent_reward
        return obs, reward, done, valid

    def turn(self):
        """Return the turn value of the most recently observed state."""
        return self._turn

    def reset(self) -> np.ndarray:
        """Restart the game process, reconnect and return the first observation."""
        while True:
            try:
                self.cleanup()
                os.system("./uttt aivai &")
                time.sleep(0.01)
                self._reset_vars()
                break
            except ConnectionRefusedError:
                # the process was not accepting connections yet: tear down and retry
                continue
        return self.observe()

    def cleanup(self):
        """Kill any running `uttt` process and close the open connections."""
        os.system("killall -q uttt")
        for conn in (self.s_conn, self.r_conn, self.a_conn):
            if conn is not None:
                conn.close()

    def __del__(self):
        self.cleanup()

    def to_move(self, idx: int) -> pb.Move:
        """Convert a flat action index into a `Move` (large-board cell + space within it)."""
        outer_idx = idx // CELLS
        inner_idx = idx % CELLS

        return pb.Move(
            large=self._make_coord(outer_idx), small=self._make_coord(inner_idx)
        )

In [4]:
# shared environment instance; constructing it calls reset(), which launches the game process and connects to it
env = UltimateTicTacToeEnv(max_timesteps=MAX_TIMESTEPS)

## Buffer

In [5]:
# buffer related hyperparameters
gamma = 0.99  # discount factor passed to Buffer (used for rewards-to-go)
lam = 0.97  # passed to Buffer; gamma * lam is the discount applied to the advantage sums

In [6]:
def discounted_cumulative_sums(x, discount):
    """Return the discounted cumulative sums of `x` along axis 0.

    Element i of the result is x[i] + discount * x[i+1] + discount**2 * x[i+2] + ...
    Used for computing rewards-to-go and advantage estimates.
    """
    backwards = x[::-1]
    # the recursive filter y[n] = x[n] + discount * y[n-1], run over the
    # reversed sequence, accumulates from the end of the trajectory
    accumulated = scipy.signal.lfilter(
        [1], [1, float(-discount)], backwards, axis=0
    )
    return accumulated[::-1]


class Buffer:
    """Fixed-size store for trajectories collected during one epoch.

    Holds observations, actions, rewards, value estimates and action
    log-probabilities, and derives advantages and rewards-to-go per trajectory.
    """

    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        """
        Args:
            observation_dimensions: shape of a single observation.
            size: number of steps the buffer can hold.
            gamma: discount factor for rewards-to-go.
            lam: multiplied with `gamma` to discount the advantage sums.
        """
        # Buffer initialization
        self.observation_buffer = np.zeros(
            (size, *observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # pointer: next free slot; trajectory_start_index: first slot of the
        # trajectory currently being collected
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        """Append one step of agent-environment interaction at the current pointer."""
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        """Close the current trajectory by computing its advantages and rewards-to-go.

        Args:
            last_value: value appended after the last stored step for
                bootstrapping (0 by default).
        """
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        # one-step temporal-difference residuals
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        # drop the trailing entry that belongs to the appended last_value
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        """Reset the write pointers and return all buffers with normalized advantages.

        Returns:
            (observations, actions, advantages, returns, log-probabilities).
            Advantages are shifted to zero mean and scaled to unit standard
            deviation; when the standard deviation is zero (all advantages
            equal) only the mean is removed, so the result never contains
            NaN/inf caused by a division by zero.
        """
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean = np.mean(self.advantage_buffer)
        advantage_std = np.std(self.advantage_buffer)
        # guard the division; the scale is unchanged for any non-degenerate buffer
        safe_std = advantage_std if advantage_std > 0 else 1.0
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / safe_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )

## Loss

In [7]:
# loss related hyperparameters
clip_ratio = 0.1  # the probability ratio is clipped to [1 - clip_ratio, 1 + clip_ratio]
target_kl = 0.03  # KL threshold for early stopping; train_model rescales it while training
v_coef = 0.5  # weight of the critic (value) loss in the total loss
reg_coef = 1  # weight of the summed model regularization losses
entropy_coef = 0.05  # weight of the entropy term

In [8]:
@tf.function
def logprobabilities(logits, a):
    """Return the log-probability of each action in `a` under the policy given by `logits`.

    `logits` is the actor output; the one-hot mask keeps, per row, only the
    log-softmax entry of the chosen action.
    """
    all_log_probs = tf.nn.log_softmax(logits)
    chosen_mask = tf.one_hot(a, UltimateTicTacToeEnv.n_actions)
    return tf.reduce_sum(chosen_mask * all_log_probs, axis=1)


# Sample action from actor
@tf.function
def sample_action_value(observation, model: keras.Model):
    """Run the model on `observation` and sample one action per row from the logits.

    Returns (logits, sampled action, value estimate).
    """
    logits, value = model(observation)
    # one categorical draw per row, then drop the sample axis
    draws = tf.random.categorical(logits, 1)
    action = tf.squeeze(draws, axis=1)
    return logits, action, value


# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_mod(
    observation_buffer,
    action_buffer,
    logprobability_buffer,
    advantage_buffer,
    return_buffer,
    model: keras.Model,
):
    """Apply one gradient step of the combined PPO loss on a minibatch.

    The loss is the clipped surrogate objective plus the weighted critic,
    entropy and regularization terms (weights: v_coef, entropy_coef, reg_coef).

    Returns:
        (kl, clip_loss, critic_loss, entropy_loss, regularizer_loss), where
        `kl` is measured after the update as the mean difference between the
        stored and the new log-probabilities of the taken actions.
    """
    # NOTE(review): the model is called without an explicit `training=`
    # argument - confirm whether Dropout/BatchNormalization are meant to be
    # active during this update.
    with tf.GradientTape() as tape:
        logits, values = model(observation_buffer)

        # log-probabilities of the stored actions under the current policy
        new_probs = logprobabilities(logits, action_buffer)
        # ratio = E(new_probs / old_probs)
        # both are log-probabilities, so the ratio is the exp of their difference
        ratio = tf.exp(new_probs - logprobability_buffer)

        # L_clip = E_t * ( min( r_t*A_t, clip(r_t, 1-e, 1+e)*A_t ) )
        # negated because the optimizer minimizes
        clip_loss = -tf.reduce_mean(
            tf.minimum(
                ratio * advantage_buffer,
                tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio)
                * advantage_buffer,
            )
        )

        # sum of the losses registered on the model (e.g. the activity regularizers)
        regularizer_loss = tf.reduce_sum(model.losses)

        # critic loss = MSE
        critic_loss = tf.keras.losses.mse(return_buffer, tf.squeeze(values))

        # entropy loss to encourage exploration
        # this equals the mean log-probability of the taken actions, i.e. the
        # negative of a sample-based entropy estimate
        entropy_loss = -tf.reduce_mean(-new_probs)

        # full loss
        loss = (
            clip_loss
            + critic_loss * v_coef
            + entropy_loss * entropy_coef
            + regularizer_loss * reg_coef
        )

    policy_grads = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(policy_grads, model.trainable_variables))

    # re-evaluate the updated policy to estimate how far it moved
    logits, _ = model(observation_buffer)
    kl = tf.reduce_mean(logprobability_buffer - logprobabilities(logits, action_buffer))
    # kl is already a scalar here, so this reduction leaves it unchanged
    kl = tf.reduce_sum(kl)

    return kl, clip_loss, critic_loss, entropy_loss, regularizer_loss

## Model

In [9]:
model_name = "ppo17"  # base name used for models/<model_name>.keras and ./logs/<model_name>
load_model = True  # when True, an existing saved model file is loaded instead of building a new model

In [10]:
def create_shared_layers(input_layer):
    """Build the trunk shared by the policy and value heads and return its output tensor.

    Two convolutions followed by three dense stages; layers are created in
    the order conv1d, conv2d, dropout, flatten, then per dense stage
    dense -> dropout (-> batch normalization for the first two stages).
    """
    # conv layers
    features = layers.Conv1D(128, 9, padding="valid", activation="selu")(input_layer)
    conv2d = layers.Conv2D(
        2048, (9, 1), padding="valid", activation="selu", activity_regularizer="l2"
    )
    features = conv2d(features)
    features = layers.Dropout(0.4)(features)

    # dense layers: (units, dropout rate, followed by batch normalization?)
    features = layers.Flatten()(features)
    dense_stages = ((2048, 0.4, True), (2048, 0.4, True), (1024, 0.3, False))
    for units, drop_rate, normalize in dense_stages:
        features = layers.Dense(
            units, activation="selu", activity_regularizer="l2"
        )(features)
        features = layers.Dropout(drop_rate)(features)
        if normalize:
            features = layers.BatchNormalization()(features)
    return features


def create_model():
    """Build the actor-critic model: shared trunk, a logits head over all actions and a scalar value head."""
    # model inputs
    board_input = tf.keras.Input(shape=UltimateTicTacToeEnv.obs_dim)
    trunk = create_shared_layers(board_input)

    # heads: policy logits first, then the state value
    policy_logits = tf.keras.layers.Dense(UltimateTicTacToeEnv.n_actions)(trunk)
    state_value = tf.keras.layers.Dense(1)(trunk)

    return tf.keras.Model(inputs=board_input, outputs=(policy_logits, state_value))


# load the saved model when requested and present on disk; otherwise build a fresh one
if load_model and os.path.exists(f"models/{model_name}.keras"):
    model = tf.keras.models.load_model(f"models/{model_name}.keras")
else:
    print("creating model...")
    model = create_model()

2023-07-07 20:05:56.008574: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


2023-07-07 20:05:56.008823: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-07 20:05:56.008978: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-07 20:05:56.064145: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [11]:
# print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9, 9, 4)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 9, 1, 128)    4736        ['input_1[0][0]']                
                                                                                                  
 conv2d (Conv2D)                (None, 1, 1, 2048)   2361344     ['conv1d[0][0]']                 
                                                                                                  
 dropout (Dropout)              (None, 1, 1, 2048)   0           ['conv2d[0][0]']                 
                                                                                              

## Optimizers

In [12]:
# learning rate hyperparams
learning_rate = 1e-4  # learning rate given to the Adam optimizer

In [13]:
# same condition as the model-loading cell: the model came from disk
if load_model and os.path.exists(f"models/{model_name}.keras"):
    # reuse the optimizer attached to the loaded model and override its learning rate
    # NOTE(review): assumes the saved model carries a compiled optimizer - confirm
    optim: tf.keras.optimizers.Adam = model.optimizer
    optim.learning_rate = learning_rate
else:
    # new model: create the optimizer and attach it via compile()
    optim = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optim)

In [14]:
# show the learning rate currently held by the optimizer
print(optim.learning_rate)

<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=1e-04>


## Train

In [15]:
# training time hyperparameters
train_iterations = 3  # passes over the collected buffer per epoch
epochs = 12  # epochs run by each train_model call
minibatch_size = 4096 * 2  # samples per gradient step
steps_per_epoch = minibatch_size * 2  # should be a multiple of minibatch_size

In [16]:
# epoch counter to resume from; passed to train_model as epoch_start
epoch = 163
# tf.summary file writer that logs under ./logs/<model_name>
summary_writer = tf.summary.create_file_writer(f"./logs/{model_name}")

In [17]:
# trajectory buffer sized to hold exactly one epoch of environment steps
buffer = Buffer(UltimateTicTacToeEnv.obs_dim, steps_per_epoch, gamma=gamma, lam=lam)

In [18]:
# shuffle
def get_shuffled_values(*buffers):
    """Shuffle several parallel buffers with one shared random permutation.

    Typical call order of the buffers:
        0 - observation_buffer,
        1 - action_buffer,
        2 - advantage_buffer,
        3 - return_buffer,
        4 - logprobability_buffer

    Args:
        *buffers: any number of equally long array-likes whose rows belong
            together. If the lengths differ, only the first `min(len)` rows
            of each buffer take part.

    Returns:
        A list with one shuffled NumPy array per input buffer; row i of every
        returned array comes from the same original row. The inputs are not
        modified.
    """
    if not buffers:
        return []
    # A single index permutation applied to every buffer keeps the rows
    # aligned without building a Python list of per-row tuples.
    n_rows = min(len(buf) for buf in buffers)
    order = np.random.permutation(n_rows)
    return [np.asarray(buf)[order] for buf in buffers]


# main training function
def train_model(model, epoch_start) -> int:
    """Collect rollouts and run PPO updates for `epochs` epochs.

    Each epoch fills the global `buffer` with `steps_per_epoch` environment
    steps, then performs up to `train_iterations` shuffled passes of
    minibatch updates via `train_mod`, stopping early when the measured KL
    exceeds 1.5 * target_kl. The global `target_kl` is adapted: multiplied
    by 1.5 after an early stop, divided by 1.2 after a full set of passes.
    Episode and loss statistics are written to `summary_writer`, and the
    model is saved to "model_temp.keras" whenever `epoch % 10 == 0`.

    Args:
        model: compiled actor-critic model returning (logits, value).
        epoch_start: index of the first epoch (used for logging steps).

    Returns:
        The index of the next epoch to run, i.e. `epoch_start + epochs`.
    """
    global target_kl
    # Iterate over the number of epochs
    for epoch in range(epoch_start, epoch_start + epochs):
        # Initialize the sum of the returns, lengths and number of episodes for each epoch
        sum_return = 0
        sum_length = 0
        num_episodes = 0
        episode_return = 0
        episode_length = 0

        # logging variables
        num_valid = 0
        num_wins = 0
        num_losses = 0

        # Iterate over the steps of each epoch
        observation = env.reset()
        start_time = datetime.now()
        for t in range(steps_per_epoch):
            # Get the logits, action, and take one step in the environment
            observation = observation.reshape(1, *env.obs_dim)
            logits, action, value_t = sample_action_value(observation, model)
            observation_new, reward, done, valid = env.step(action[0].numpy())
            episode_return += reward
            episode_length += 1

            # logging variables
            num_valid += 1 if valid else 0
            num_wins += 1 if env.won else 0
            num_losses += 1 if env.lost else 0

            # Get the value and log-probability of the action
            logprobability_t = logprobabilities(logits, action)

            # Store obs, act, rew, v_t, logp_pi_t
            buffer.store(observation, action, reward, value_t, logprobability_t)

            # Update the observation
            observation = observation_new

            # Finish trajectory if reached to a terminal state
            terminal = done
            if terminal or t == steps_per_epoch - 1:
                # bootstrap with the critic's estimate when the epoch ends mid-episode
                last_value = (
                    0 if done else model(observation.reshape(1, *env.obs_dim))[1]
                )
                buffer.finish_trajectory(last_value)
                sum_return += episode_return
                sum_length += episode_length
                num_episodes += 1
                observation, episode_return, episode_length = env.reset(), 0, 0

            # log
            print(
                f"Step {t+1} / {steps_per_epoch};\t"
                + f"% valid = {num_valid / (t+1):.4f};\t"
                + f"fps: {(t+1)/(datetime.now()-start_time).total_seconds():.2f};\t"
                + f"win rate: {(num_wins/num_episodes if num_episodes > 0 else 0):.2f}\t"
                + f"loss rate: {(num_losses/num_episodes if num_episodes > 0 else 0):.2f}",
                end="\r",
            )
        print()

        (
            observation_buffer,
            action_buffer,
            advantage_buffer,
            return_buffer,
            logprobability_buffer,
        ) = buffer.get()

        # Update the policy and implement early stopping using KL divergence
        kl = 0
        clip_loss = 0
        critic_loss = 0
        entropy_loss = 0
        regularizer_loss = 0
        for it in range(train_iterations):
            (
                _observation_buffer,
                _action_buffer,
                _advantage_buffer,
                _return_buffer,
                _logprobability_buffer,
            ) = get_shuffled_values(
                observation_buffer,
                action_buffer,
                advantage_buffer,
                return_buffer,
                logprobability_buffer,
            )
            temp_clip_loss = 0
            temp_regularizer_loss = 0
            temp_critic_loss = 0
            temp_entropy_loss = 0
            stopped_early = False
            # do minibatches
            for i in range(steps_per_epoch // minibatch_size):
                # rows of the shuffled buffers that form this minibatch
                batch = slice(i * minibatch_size, (i + 1) * minibatch_size)
                # get the parts of the kl and losses
                (
                    kl,
                    clip_loss_part,
                    critic_loss_part,
                    entropy_loss_part,
                    regularizer_loss_part,
                ) = train_mod(
                    tf.constant(_observation_buffer[batch]),  # obs
                    tf.constant(_action_buffer[batch]),  # act
                    tf.constant(_logprobability_buffer[batch]),  # logprobs
                    tf.constant(_advantage_buffer[batch]),  # advantages
                    tf.constant(_return_buffer[batch]),  # returns
                    model,
                )
                # update the temps
                temp_clip_loss += clip_loss_part
                temp_critic_loss += critic_loss_part
                temp_entropy_loss += entropy_loss_part
                temp_regularizer_loss += regularizer_loss_part
                if kl > 1.5 * target_kl:
                    stopped_early = True
                    target_kl *= 1.5
                    break

            # average the temps to get the amount per that pass
            # (i + 1 is the number of minibatches actually processed)
            temp_clip_loss /= i + 1
            temp_regularizer_loss /= i + 1
            temp_critic_loss /= i + 1
            temp_entropy_loss /= i + 1

            # update the main counts
            clip_loss += temp_clip_loss
            regularizer_loss += temp_regularizer_loss
            critic_loss += temp_critic_loss
            entropy_loss += temp_entropy_loss

            if stopped_early:
                # Early Stopping
                break
        else:
            # every pass completed without exceeding the KL limit: tighten it
            target_kl /= 1.2

        # it + 1 is the number of passes actually run
        clip_loss /= it + 1
        critic_loss /= it + 1
        regularizer_loss /= it + 1
        entropy_loss /= it + 1

        # Print mean return and length for each epoch
        print(
            f"Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
        )
        print("=" * 64)

        # log scalars
        with summary_writer.as_default():
            # episode info
            tf.summary.scalar("episode/win rate", num_wins / num_episodes, step=epoch)
            tf.summary.scalar(
                "episode/loss rate", num_losses / num_episodes, step=epoch
            )
            # divide by the number of steps taken; `t` ends one short of it
            tf.summary.scalar(
                "episode/valid percentage", num_valid / steps_per_epoch, step=epoch
            )
            tf.summary.scalar(
                "episode/mean reward", sum_return / num_episodes, step=epoch
            )
            tf.summary.scalar(
                "episode/mean length", sum_length / num_episodes, step=epoch
            )

            # training info
            tf.summary.scalar("train/clip_loss", clip_loss, step=epoch)
            tf.summary.scalar("train/regularizer_loss", regularizer_loss, step=epoch)
            tf.summary.scalar("train/kl", kl, step=epoch)
            tf.summary.scalar("train/critic_loss", critic_loss, step=epoch)
            tf.summary.scalar("train/entropy_loss", entropy_loss, step=epoch)
            # same weighting as the loss optimized in train_mod, including
            # the entropy term
            tf.summary.scalar(
                "train/total_loss",
                clip_loss
                + critic_loss * v_coef
                + entropy_loss * entropy_coef
                + regularizer_loss * reg_coef,
                step=epoch,
            )
            tf.summary.scalar("train/num_iterations", it + 1, step=epoch)

        # save every 10
        if epoch % 10 == 0:
            model.save("model_temp.keras")

    # equal to `epoch + 1` after the loop, and also defined when epochs == 0
    return epoch_start + epochs

In [19]:
# run one round of training and keep the returned epoch counter for the next call
epoch = train_model(model, epoch)

2023-07-07 20:05:56.713243: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Step 16384 / 16384;	% valid = 0.8950;	fps: 196.94;	win rate: 0.17	loss rate: 0.79


2023-07-07 20:07:21.175423: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8902
2023-07-07 20:07:21.592552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fedf0903d10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-07 20:07:21.592574: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-07-07 20:07:21.595314: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-07 20:07:21.698572: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch: 164. Mean Return: -1.831153324287654. Mean Length: 22.230664857530527
Step 16384 / 16384;	% valid = 0.8954;	fps: 194.34;	win rate: 0.16	loss rate: 0.79
Epoch: 165. Mean Return: -1.8453415300546476. Mean Length: 22.382513661202186
Step 16384 / 16384;	% valid = 0.8810;	fps: 201.42;	win rate: 0.18	loss rate: 0.77
Epoch: 166. Mean Return: -2.148517006802723. Mean Length: 22.291156462585032
Step 16384 / 16384;	% valid = 0.8780;	fps: 197.27;	win rate: 0.17	loss rate: 0.78
Epoch: 167. Mean Return: -2.233036935704516. Mean Length: 22.41313269493844
Step 16384 / 16384;	% valid = 0.8694;	fps: 198.50;	win rate: 0.15	loss rate: 0.80
Epoch: 168. Mean Return: -2.4615890410958885. Mean Length: 22.443835616438356
Step 16384 / 16384;	% valid = 0.8824;	fps: 200.71;	win rate: 0.16	loss rate: 0.80
Epoch: 169. Mean Return: -2.165958620689657. Mean Length: 22.59862068965517
Step 16384 / 16384;	% valid = 0.8792;	fps: 196.50;	win rate: 0.16	loss rate: 0.80
Epoch: 170. Mean Return: -2.242782369146009. M

## Save

In [None]:
# remove previous
# (done in-process instead of shelling out to `rm`: nothing is printed when
# the file is missing and no sleep is needed to wait for the removal)
model_path = f"models/{model_name}.keras"
if os.path.exists(model_path):
    os.remove(model_path)

# save
os.makedirs("models", exist_ok=True)  # first save: the directory may not exist yet
model.save(model_path)

## Evaluate

In [None]:
def evaluate_model(model: keras.Model, num_episodes: int = 100):
    """Play `num_episodes` games greedily and print the mean episode reward and length.

    At every step the action with the highest logit is played (no sampling).
    """
    total_reward = 0
    total_steps = 0
    for _ in range(num_episodes):
        # start a fresh game
        obs = env.reset()
        finished = False

        # play until the environment reports the episode as done
        while not finished:
            policy_logits, _value = model(obs[np.newaxis, ...])
            # argmax over the flattened logits = greedy action index
            greedy_action = np.argmax(policy_logits.numpy())
            obs, reward, finished, _valid = env.step(greedy_action)

            # update counts
            total_reward += reward
            total_steps += 1

    # average them
    mean_reward = total_reward / num_episodes
    mean_len = total_steps / num_episodes
    print("Episode mean reward:", mean_reward)
    print("Episode mean length:", mean_len)

In [None]:
# greedy evaluation over the default number of episodes
evaluate_model(model)

Episode mean reward: -1.6377999999999826
Episode mean length: 21.34
