In [3]:
!pip install -q gymnasium
import gymnasium as gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Hyperparameters
LEARNING_RATE = 0.0003
GAMMA = 0.99
LAMBDA = 0.95
CLIP_EPSILON = 0.2
ENTROPY_COEF = 0.01
NUM_EPOCHS = 10
BATCH_SIZE = 64
HIDDEN_SIZE = 64

# Define the Actor-Critic model
class ActorCritic(tf.keras.Model):
    def __init__(self, action_dim):
        super(ActorCritic, self).__init__()
        self.common = layers.Dense(HIDDEN_SIZE, activation="relu")
        self.actor = layers.Dense(action_dim, activation="softmax")
        self.critic = layers.Dense(1)

    def call(self, inputs):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

# Proximal Policy Optimization (PPO) implementation
class PPOAgent:
    def __init__(self, state_dim, action_dim):
        self.actor_critic = ActorCritic(action_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
        self.action_dim = action_dim

    def select_action(self, state):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        prob, _ = self.actor_critic(state)
        action = np.random.choice(self.action_dim, p=prob.numpy()[0])
        return action, prob[0][action].numpy()

    def compute_advantages(self, rewards, values, dones):
        advantages = []
        discounted_sum = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + GAMMA * (1 - dones[t]) * (values[t + 1] if t + 1 < len(values) else 0) - values[t]
            discounted_sum = delta + GAMMA * LAMBDA * (1 - dones[t]) * discounted_sum
            advantages.insert(0, discounted_sum)
        return advantages

    def train(self, states, actions, rewards, dones, old_probs):
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.int32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)
        old_probs = tf.convert_to_tensor(old_probs, dtype=tf.float32)

        # Calculate advantages
        _, values = self.actor_critic(states)
        values = tf.squeeze(values, axis=1)
        advantages = self.compute_advantages(rewards, values.numpy(), dones.numpy())
        advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

        for _ in range(NUM_EPOCHS):
            with tf.GradientTape() as tape:
                new_probs, new_values = self.actor_critic(states)
                new_values = tf.squeeze(new_values, axis=1)

                action_probs = tf.gather(new_probs, actions, axis=1, batch_dims=1)
                ratios = tf.exp(tf.math.log(action_probs) - tf.math.log(old_probs))
                clipped_ratios = tf.clip_by_value(ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)
                policy_loss = -tf.reduce_mean(tf.minimum(ratios * advantages, clipped_ratios * advantages))

                value_loss = tf.reduce_mean(tf.square(rewards + GAMMA * (1 - dones) * new_values - values))
                entropy_loss = -tf.reduce_mean(new_probs * tf.math.log(new_probs + 1e-10))

                total_loss = policy_loss + 0.5 * value_loss - ENTROPY_COEF * entropy_loss

            grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))

# Main training loop
env = gym.make("CartPole-v1")
agent = PPOAgent(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)

for episode in range(500):
    state, _ = env.reset()
    states, actions, rewards, dones, old_probs = [], [], [], [], []
    total_reward = 0

    while True:
        action, prob = agent.select_action(state)
        next_state, reward, done, _ ,_ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        old_probs.append(prob)

        state = next_state
        total_reward += reward

        if done:
            break

    agent.train(states, actions, rewards, dones, old_probs)
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

env.close()

2025-01-05 01:06:02.428126: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ValueError: Can't convert non-rectangular Python sequence to Tensor.