In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Step 1: Create the BipedalWalker-v3 environment
SEED = 42  # Fixed seed so that training runs are reproducible across kernel restarts
env_name = "BipedalWalker-v3"
# Vectorized environment with 4 copies of the env. With make_vec_env's default
# vec env class (DummyVecEnv) the copies are stepped one after another in this
# process; they batch rollouts rather than run in parallel.
env = make_vec_env(env_name, n_envs=4, seed=SEED)

# Step 2: Define the PPO model
model = PPO(
    "MlpPolicy",         # Multi-layer perceptron policy
    env,                 # Environment
    verbose=1,           # Print training information
    learning_rate=3e-4,  # Learning rate for the optimizer
    n_steps=2048,        # Steps per environment per update (x4 envs = 8192 per iteration)
    batch_size=64,       # Minibatch size for gradient updates
    n_epochs=10,         # Number of epochs to optimize the surrogate loss
    gamma=0.99,          # Discount factor
    gae_lambda=0.95,     # GAE (Generalized Advantage Estimation) lambda
    clip_range=0.2,      # Clipping parameter for PPO
    seed=SEED,           # Seed the policy/optimizer RNGs as well as the envs
)

# Step 3: Train the PPO agent
timesteps = 100_000  # Total training steps requested
print("Training the PPO agent...")
model.learn(total_timesteps=timesteps)
print("Training complete!")

# Step 4: Evaluate the agent
def evaluate_agent(model, env, n_episodes=5):
    """
    Evaluate the trained PPO agent in the given environment.

    Runs ``n_episodes`` full episodes with the deterministic policy, printing
    the reward of each episode and the average over all of them.

    Args:
        model: Object whose ``predict(obs, deterministic=True)`` returns an
            ``(action, state)`` pair.
        env: A single (non-vectorized) environment following the Gymnasium
            API, i.e. ``reset()`` returns ``(obs, info)`` and ``step(action)``
            returns ``(obs, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to run; must be at least 1.

    Returns:
        The average episode reward over ``n_episodes`` episodes.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        # Without this guard the average below would divide by zero.
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    total_rewards = []
    for episode in range(n_episodes):
        obs, _ = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)  # Use the trained policy
            obs, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward
            # An episode ends either naturally (terminated) or because a
            # time/step limit was hit (truncated). Ignoring the latter would
            # keep stepping a finished episode forever.
            done = terminated or truncated
        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward:.2f}")
    avg_reward = sum(total_rewards) / n_episodes
    print(f"Average reward over {n_episodes} episodes: {avg_reward:.2f}")
    return avg_reward

# Evaluate the trained model
# NOTE: the evaluate_agent call below is commented out, so this cell only
# prints the banner and does not actually run any evaluation episodes.
print("Evaluating the PPO agent...")
#evaluate_agent(model, gym.make(env_name))

# Step 5: Save the model
# NOTE(review): the file name still says "lunarlander" although env_name is
# "BipedalWalker-v3". The later cell loads the model under this same name, so
# the two must be renamed together if the name is ever corrected.
model.save("ppo_lunarlander")
print("Model saved as 'ppo_lunarlander.zip'.")

Using cpu device
Training the PPO agent...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 300      |
|    ep_rew_mean     | -110     |
| time/              |          |
|    fps             | 7038     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 378          |
|    ep_rew_mean          | -113         |
| time/                   |              |
|    fps                  | 4745         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0057234652 |
|    clip_fraction        | 0.0629       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.68        |
|    explained_

In [2]:
# Optional: reload the saved model and continue training (uncomment the lines below to use)
# model = PPO.load("ppo_lunarlander", env=env)
# model.learn(total_timesteps=100000)
# model.save("ppo_lunarlander")
# print("Model saved as 'ppo_lunarlander.zip'.")

In [None]:
# Step 1: Load the saved model and create a rendering environment
test_model = PPO.load("ppo_lunarlander")
env = gym.make("BipedalWalker-v3", render_mode="human")  # Enable rendering

try:
    # Step 2: Roll out a single episode with the trained policy
    obs, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Step 3: Predict action using the trained model
        action, _ = test_model.predict(obs, deterministic=True)  # Use deterministic policy for evaluation
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        # Stop on natural termination AND on time-limit truncation; checking
        # only the former can leave the loop running after the episode ended.
        done = terminated or truncated

    print(f"Total reward: {total_reward:.2f}")
finally:
    # Step 4: Always close the environment (and its render window), even if
    # the rollout raised or was interrupted.
    env.close()
