### Step 1 : Training Without Vectorize

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO

import os

models_dir = "models/PPO"
logdir = "logs"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)

env = gym.make('LunarLander-v3')
env.reset()

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 10000
iters = 0
for i in range(30):
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

### Step 2 : Training with Vectorize

In [14]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import os

models_dir = "models/PPO"
logdir = "logs"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)
    
# Create and wrap the environment
env = make_vec_env("LunarLander-v3", n_envs=4)  # Parallel environments for faster training

# Define the PPO model
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)

# Train the agent
TIMESTEPS = 10000
iters = 0
for i in range(30):
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

env.close()

Using cpu device
Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93.8     |
|    ep_rew_mean     | -196     |
| time/              |          |
|    fps             | 3830     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 99           |
|    ep_rew_mean          | -173         |
| time/                   |              |
|    fps                  | 2236         |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0053854156 |
|    clip_fraction        | 0.0417       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_vari

### Step 2 :Test and Evaluate the Model

In [25]:
from stable_baselines3 import PPO

# Load the trained model
models_dir = "models/PPO"
model_path = f"{models_dir}/290000"
model = PPO.load(model_path, env=env)

# Create the environment for evaluation
env = gym.make("LunarLander-v3", render_mode="human")

# Run the trained model in the environment
observation, info = env.reset()
episode_over = False

while not episode_over:
    # Use the trained model to predict actions
    action, _ = model.predict(observation, deterministic=True)
    observation, reward, terminated, truncated, info = env.step(action)
    episode_over = terminated or truncated

env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
