In [2]:
import gymnasium as gym

# Baseline demo: run a single LunarLander episode with uniformly random
# actions, rendered in a window, to see what an untrained agent looks like.
env = gym.make("LunarLander-v3", render_mode="human")

# try/finally guarantees the render window is released even if the episode
# is interrupted (e.g. Kernel -> Interrupt) or a step raises.
try:
    observation, info = env.reset()

    episode_over = False
    while not episode_over:
        # Random action: the observation is ignored here; a learned policy
        # would map `observation` to an action instead.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # An episode ends on a terminal state (landed/crashed) or a time limit.
        episode_over = terminated or truncated
finally:
    env.close()

### Step 1: Install Required Libraries

In [None]:
!pip install stable-baselines3 gymnasium

### Step 2: Write the Training Script

In [19]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Fixed seed so that repeated runs of this cell are comparable.
SEED = 0

# Create and wrap the environment.
# make_vec_env builds 4 copies of the env behind one vectorized interface.
# With the default vec_env_cls (DummyVecEnv) the copies are stepped
# sequentially in this process; pass vec_env_cls=SubprocVecEnv for true
# multi-process parallelism.
env = make_vec_env("LunarLander-v3", n_envs=4, seed=SEED)

# try/finally ensures the environments are closed even if training is
# interrupted or fails part-way through.
try:
    # Define the PPO model (MLP policy; metrics are written for TensorBoard).
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        seed=SEED,
        tensorboard_log="./ppo_lunarlander_tensorboard/",
    )

    # Train the agent
    timesteps = 1_000_000  # Adjust as needed
    model.learn(total_timesteps=timesteps)

    # Save the trained model (SB3 writes ppo_lunarlander.zip)
    model.save("ppo_lunarlander")
finally:
    env.close()


Using cpu device
Logging to ./ppo_lunarlander_tensorboard/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.7     |
|    ep_rew_mean     | -207     |
| time/              |          |
|    fps             | 4008     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 95          |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 2260        |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.011105313 |
|    clip_fraction        | 0.0948      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    exp

### Step 3: Evaluate the Trained Agent

In [27]:
# This cell is self-contained so it can be run on a fresh kernel without
# re-running the (long) training cell: it imports everything it uses and
# loads the policy from disk.
import gymnasium as gym
from stable_baselines3 import PPO

# Load the trained model
model = PPO.load("ppo_lunarlander")

# Create the environment for evaluation (rendered in a window)
env = gym.make("LunarLander-v3", render_mode="human")

# try/finally guarantees the render window is released even if the episode
# is interrupted or a step raises.
try:
    # Run the trained model in the environment for one episode
    observation, info = env.reset()
    episode_over = False
    episode_return = 0.0

    while not episode_over:
        # Use the trained model to predict actions; deterministic=True picks
        # the policy's most likely action instead of sampling.
        action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, info = env.step(action)
        episode_return += float(reward)
        episode_over = terminated or truncated
finally:
    env.close()

# Report how the agent did; a single episode is only a rough indication.
print(f"Episode return: {episode_return:.1f}")

### Step 4: Monitor Training with TensorBoard (Optional)

In [None]:
tensorboard_log="./ppo_lunarlander_tensorboard/"