In [None]:
from ppo_agent import PPOAgent

In [4]:
import tensorflow as tf
import gymnasium as gym
from agent_configs import PPOConfig, PPOActorConfig, PPOCriticConfig
from game_configs import CartPoleConfig

# CartPole environment; frames are requested as RGB arrays at construction time.
env = gym.make('CartPole-v1', render_mode='rgb_array')

# Hyperparameters handed to PPOConfig further down.
# NOTE(review): the output stored with this cell ends in
# "ValueError: Missing required field without default value: relu", raised after
# the "PPO Config" printout. The config classes are not visible here, so check
# which keys PPOConfig / PPOAgent actually require.
config_dict = dict(
    clip_param=0.2,
    activation='relu',
    kernel_initializer='orthogonal',
    # NORMALIZATION?
    discount_factor=0.99,
    gae_lambda=0.98,
    conv_layers=[],
    conv_layers_noisy=False,
    critic_width=64,
    critic_dense_layers=2,
    critic_dense_layers_noisy=False,
    actor_width=64,
    actor_dense_layers=2,
    actor_dense_layers_noisy=False,
    # REWARD CLIPPING
    steps_per_epoch=4800,
    train_policy_iterations=5,
    train_value_iterations=5,
    target_kl=0.02,
    entropy_coefficient=0.01,
    num_minibatches=4,
)

# Optimizer settings for the actor network (kept as a separate dict from the critic's).
actor_config_dict = dict(
    optimizer=tf.keras.optimizers.legacy.Adam,
    learning_rate=0.0005,
    adam_epsilon=1e-7,
    clipnorm=0.5,
)

# Optimizer settings for the critic network; currently the same values as the actor.
critic_config_dict = dict(
    optimizer=tf.keras.optimizers.legacy.Adam,
    learning_rate=0.0005,
    adam_epsilon=1e-7,
    clipnorm=0.5,
)

# Each config constructor echoes the values it picked up, so label the sections.
print("Actor Config")
actor_config = PPOActorConfig(actor_config_dict)
print("Critic Config")
critic_config = PPOCriticConfig(critic_config_dict)

print("PPO Config")
config = PPOConfig(
    config_dict,
    CartPoleConfig(),
    actor_config=actor_config,
    critic_config=critic_config,
)

agent = PPOAgent(env, config=config, name='PPOAgent')


Actor Config
Using optimizer: <class 'keras.src.optimizers.legacy.adam.Adam'>
Using adam_epsilon: 1e-07
Using learning_rate: 0.0005
Using clipnorm: 0.5
Critic Config
Using optimizer: <class 'keras.src.optimizers.legacy.adam.Adam'>
Using adam_epsilon: 1e-07
Using learning_rate: 0.0005
Using clipnorm: 0.5
PPO Config
Using optimizer: -1
Using adam_epsilon: -1
Using learning_rate: -1
Using clipnorm: -1
Using loss_function: -1
Using training_iterations: -1
Using num_minibatches: 4
Using minibatch_size: 4800
Using replay_buffer_size: 4800
Using min_replay_buffer_size: -1
Using default training_steps: 10000


ValueError: Missing required field without default value: relu

In [None]:
# agent.actor.summary()

In [None]:
# Disabled sanity check: reset the environment and print the first observation
# together with the first element of what agent.select_action returns.
# state, _ = env.reset()
# print(state)
# print(agent.select_action(state)[0])
# Run the agent's training routine (implemented in ppo_agent, not visible here).
agent.train()

In [None]:
# Run the agent's test routine (implemented in ppo_agent, not visible here);
# presumably evaluates the agent trained in the previous cell — confirm.
agent.test()

In [None]:
import tensorflow_probability as tfp

# Scratch checks for the log-probability arithmetic, using a fixed 2x2 table.
table = [[0.1, 0.9], [0.9, 0.1]]
picked_actions = [1, 0]
reference_log_probs = [-0.6, -0.6]

# One-hot encoding of three action indices over two actions.
print(tf.one_hot([0, 1, 1], 2))

# Treat the table as per-row action probabilities and draw one sample per row.
distribution = tfp.distributions.Categorical(probs=table)
print(distribution.sample())

# Mean (reference - log-prob of the picked action), first via tfp ...
diff_via_tfp = reference_log_probs - distribution.log_prob(picked_actions)
print(tf.reduce_sum(tf.reduce_mean(diff_via_tfp)))

# ... then by hand: mask the log of the table with a one-hot of the actions.
picked_log_probs = tf.reduce_sum(
    tf.one_hot(picked_actions, 2) * tf.math.log(table), axis=1
)
print(tf.reduce_mean(reference_log_probs - picked_log_probs))

# Treat the same table as logits: log(softmax(x)) next to log_softmax(x).
print(tf.math.log(tf.nn.softmax(table)))
print(tf.nn.log_softmax(table))

def logprobabilities(logits, a):
    """Return the log-probability of the chosen action for each row.

    Args:
        logits: Despite the name, this must hold per-action *probabilities*
            (one row per sample, one column per action). The values go
            straight into ``tf.math.log`` with no softmax, so raw logits must
            be passed through ``tf.nn.softmax`` first.
        a: Integer action indices, one per row of ``logits``.

    Returns:
        A tensor with one entry per row: ``log(logits[i, a[i]])``.
    """
    probabilities = tf.convert_to_tensor(logits)
    logprobabilities_all = tf.math.log(probabilities)

    # Take the action count from the input instead of hard-coding 2; with a
    # fixed depth, any action index >= 2 produced an all-zero mask and a
    # silent log-probability of 0.
    num_actions = probabilities.shape[-1]
    if num_actions is None:
        # Static shape unknown (e.g. inside a traced function): use the dynamic one.
        num_actions = tf.shape(probabilities)[-1]

    # Match the mask dtype to the log-probabilities so float64 inputs also work.
    action_mask = tf.one_hot(a, num_actions, dtype=logprobabilities_all.dtype)
    logprobability = tf.reduce_sum(action_mask * logprobabilities_all, axis=-1)
    return logprobability


# Mean gap between fixed reference log-probs and the log-probs of the chosen
# actions, computed with the hand-written helper.
reference_logp = [-0.6, -0.6]
chosen_logp = logprobabilities([[0.1, 0.9], [0.9, 0.1]], [1, 0])
kl = tf.reduce_mean(reference_logp - chosen_logp)
# kl = tf.reduce_sum(kl)
print(kl)


In [None]:
import numpy as np
# Build the numbers 1..9, view them as a 3x3 grid, rotate the grid by 180
# degrees, then flatten it again. Each stage is printed.
flat_values = np.arange(1, 10)
grid = flat_values.reshape(3, 3)
print(grid)

rotated_grid = np.rot90(grid, k=2)
print(rotated_grid)

# Keep the final result under the original name.
arr = rotated_grid.flatten()
print(arr)