In [1]:
import tensorflow as tf
from IPython.display import clear_output
%run GameFunctions.ipynb
%run Player.ipynb
%run Stats.ipynb
%run TrainCritic.ipynb
%run Breed.ipynb

In [2]:
class ActorCritic():
    """Stateless collection of actor-critic training helpers.

    Every method is a static method and is meant to be called through the
    class itself (e.g. ``ActorCritic.trainStep(p1, p2)``).

    NOTE(review): ``np`` is used below but is not imported in this notebook's
    visible cells; presumably one of the ``%run`` notebooks provides it --
    confirm.
    """

    @staticmethod
    def genRewards(size, win):
        """Build a sparse per-step reward vector for one episode.

        Args:
            size: Number of steps in the episode.
            win: Truthy if the player won the game.

        Returns:
            A 1-D float32 tensor of length ``size`` that is zero everywhere
            except the last entry, which is +1 on a win and -1 otherwise.

        Raises:
            IndexError: If ``size`` is less than 1 (there is no final step
                to attach the reward to).
        """
        rewardList = [0] * size
        # Only the terminal step carries a reward signal.
        rewardList[size - 1] = 1 if win else -1
        return tf.convert_to_tensor(np.array(rewardList), dtype=tf.float32)

    @staticmethod
    def compute_loss(action_probs: tf.Tensor,  values: tf.Tensor,  returns: tf.Tensor) -> tf.Tensor:
        """Compute the combined actor + critic loss for one episode.

        The critic values are standardized (zero mean, unit std) before
        the advantage ``returns - values`` is formed.

        Args:
            action_probs: Probabilities of the actions that were taken.
            values: Critic value estimates.
            returns: Expected returns, e.g. from ``get_expected_return``.
            (Presumably all three have one entry per step and matching
            shapes -- TODO confirm against ``GameFunctions.runGame``.)

        Returns:
            Scalar tensor: ``actor_loss + critic_loss``.
        """
        # Smallest float32 increment; keeps the division below finite when
        # the values have zero spread.
        eps = np.finfo(np.float32).eps.item()
        returns = tf.cast(returns, float)
        values = tf.cast(values, float)
        # NOTE(review): this standardizes the critic *values*, not the
        # returns; kept as-is to preserve training behaviour -- confirm
        # this is intended.
        values = ((values - tf.math.reduce_mean(values)) /
                  (tf.math.reduce_std(values) + eps))
        advantage = returns - values
        action_log_probs = tf.math.log(action_probs)
        actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)
        huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
        critic_loss = huber_loss(values, returns)
        return actor_loss + critic_loss

    @staticmethod
    def get_expected_return(rewards, gamma = 0.95, standardize = False) -> tf.Tensor:
        """Turn per-step rewards into discounted returns.

        For each step ``t`` the result holds
        ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

        Args:
            rewards: 1-D tensor of per-step rewards.
            gamma: Discount factor applied per step.
            standardize: If true, shift and scale the returns to zero mean
                and unit standard deviation before returning them.

        Returns:
            A float32 tensor of discounted returns, one entry per reward.
        """
        eps = np.finfo(np.float32).eps.item()
        n = tf.shape(rewards)[0]
        returns = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        # Start from the end of `rewards` and accumulate reward sums
        # into the `returns` array
        rewards = tf.cast(rewards[::-1], dtype=tf.float32)
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            discounted_sum = reward + gamma * discounted_sum
            # Pin the static shape so the accumulator stays a scalar.
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        # Undo the reversal so entry t lines up with step t again.
        returns = returns.stack()[::-1]

        if standardize:
            returns = ((returns - tf.math.reduce_mean(returns)) /
                       (tf.math.reduce_std(returns) + eps))
        return returns

    @staticmethod
    def trainStep(player1, player2):
        """Play one game between two players and train both on the result.

        Runs the game under a gradient tape, derives discounted returns from
        a terminal win/loss reward, applies one optimizer step per player,
        updates and saves each player, and writes the statistics.

        Args:
            player1: First player; must expose ``name``, ``net``, ``opt``,
                ``updateStats`` and ``save``.
            player2: Second player, with the same interface.

        Returns:
            The dict returned by ``GameFunctions.runGame`` with its ``"p1"``
            and ``"p2"`` entries replaced by the respective player objects.
        """
        # Persistent tape: gradients are taken twice (once per player).
        with tf.GradientTape(persistent = True) as tape:
            returnDict = GameFunctions.runGame(player1, player2, name1 = player1.name, name2 = player2.name)

            winner = returnDict["winner"]

            # Episode length per player = number of recorded value estimates.
            steps1 = tf.size(returnDict['p1']['values']).numpy()
            steps2 = tf.size(returnDict['p2']['values']).numpy()

            returns1 = ActorCritic.get_expected_return(ActorCritic.genRewards(steps1, winner == 1))
            returns2 = ActorCritic.get_expected_return(ActorCritic.genRewards(steps2, winner == 2))

            loss1 = ActorCritic.compute_loss(returnDict['p1']['probs'], returnDict['p1']['values'], returns1)
            loss2 = ActorCritic.compute_loss(returnDict['p2']['probs'], returnDict['p2']['values'], returns2)
        grads1 = tape.gradient(loss1, player1.net.trainable_variables)
        grads2 = tape.gradient(loss2, player2.net.trainable_variables)
        # A persistent tape holds its resources until it is garbage
        # collected; drop the reference as soon as both gradients exist.
        del tape
        player1.opt.apply_gradients(zip(grads1, player1.net.trainable_variables))
        player2.opt.apply_gradients(zip(grads2, player2.net.trainable_variables))
        player1.updateStats(steps1, winner == 1)
        player2.updateStats(steps2, winner == 2)
        player1.save()
        player2.save()
        returnDict["p1"] = player1
        # Previously this stored player1 under both keys.
        returnDict["p2"] = player2
        Statistics.writeStatistics()
        return returnDict

    @staticmethod
    def getCriticData(player1, player2):
        """Play one game and record its states and returns for critic training.

        Args:
            player1: First player; must expose ``actor``, ``critic`` and
                ``name``.
            player2: Second player, with the same interface.

        Returns:
            The number of value estimates recorded for player 2, as an int.
        """
        CriticTrainingFunctions.formatInitialCsv()
        returnDict = GameFunctions.runGame(player1.actor, player2.actor, player1.critic, player2.critic, name1 = player1.name, name2 = player2.name)
        winner = returnDict['winner']
        steps1 = tf.size(returnDict['p1']['values']).numpy()
        steps2 = tf.size(returnDict['p2']['values']).numpy()
        returns1 = ActorCritic.get_expected_return(ActorCritic.genRewards(steps1, winner == 1))
        returns2 = ActorCritic.get_expected_return(ActorCritic.genRewards(steps2, winner == 2))
        CriticTrainingFunctions.writeStates(returnDict['p1']['states'], returns1, winner == 1, steps1)
        CriticTrainingFunctions.writeStates(returnDict['p2']['states'], returns2, winner == 2, steps2)
        return int(steps2)

    @staticmethod
    def maxDamageTest(player):
        """Run ``GameFunctions.testAgainstMaxDamage`` for ``player``.

        Args:
            player: Player under test; must expose ``name``.

        Returns:
            The ``"winner"`` entry of the dict returned by the test game.
        """
        returnDict = GameFunctions.testAgainstMaxDamage(player, name1 = player.name)
        return returnDict["winner"]