Sample Game

In [None]:
import gymnasium as gym


# Play LunarLander with uniformly random actions, rendering to a window.
N_RANDOM_STEPS = 1000  # total environment steps to take across all episodes

env = gym.make("LunarLander-v2", render_mode="human")

# try/finally guarantees the render window is released even when the loop is
# interrupted (e.g. kernel interrupt) or env.step raises.
try:
    observation, info = env.reset()

    for _ in range(N_RANDOM_STEPS):
        # Sample an action from the environment's action space.
        action = env.action_space.sample()
        print(f"Action: {action}")
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a fresh episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
            print("Environment reset")
finally:
    env.close()

Action/Observation Space


In [None]:
# Build a non-rendering environment only to inspect its observation/action spaces.
env = gym.make("LunarLander-v2")
# A vectorised alternative would be:
#   make_vec_env("LunarLander-v2", n_envs=16)
# (make_vec_env is imported from stable_baselines3.common.env_util further down).

# Close the environment when done so it does not linger in the kernel.
try:
    env.reset()
    print("_____OBSERVATION SPACE_____ \n")
    print("Observation Space Shape", env.observation_space.shape)
    print("Sample observation", env.observation_space.sample())  # Get a random observation

    print("\n _____ACTION SPACE_____ \n")
    # The value printed here is the action space's `.n` attribute.
    print("Action Space Shape", env.action_space.n)
    print("Action Space Sample", env.action_space.sample())  # Take a random action
finally:
    env.close()

Model

In [8]:
import gymnasium as gym
import time

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [33]:
# Fixed seed so that a Restart & Run All reproduces the same training run.
SEED = 0

# Create the (vectorised) training environment, seeded for reproducibility.
env = make_vec_env('LunarLander-v2', seed=SEED)

# Instantiate the agent; hyperparameters are unchanged, only `seed` is added.
model = PPO(
    policy='MlpPolicy',
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
    seed=SEED,
    device='cpu',
)

# Train. perf_counter() is a monotonic clock intended for measuring elapsed
# time, so the reported duration is not affected by system clock adjustments.
start_time = time.perf_counter()
model.learn(total_timesteps=int(2e6))
end_time = time.perf_counter()

# Save the trained model to disk under this name.
model_name = "ppo-LunarLander-v2"
model.save(model_name)

# Report how long the call to learn() took.
print(f"Training time: {end_time - start_time} seconds")

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.5     |
|    ep_rew_mean     | -178     |
| time/              |          |
|    fps             | 2633     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1024     |
---------------------------------
--------------------------------------------
| rollout/                |                |
|    ep_len_mean          | 87.2           |
|    ep_rew_mean          | -168           |
| time/                   |                |
|    fps                  | 2387           |
|    iterations           | 2              |
|    time_elapsed         | 0              |
|    total_timesteps      | 2048           |
| train/                  |                |
|    approx_kl            | 0.0019579632   |
|    clip_fraction        | 0              |
|    clip_range           | 0.2            |
|    entropy_loss         | -1.39          |
|    explaine

Evaluate

In [37]:
# Evaluate the trained policy on a fresh environment.
# Wrapping in Monitor lets evaluate_policy read the true episode rewards and
# lengths; without it stable-baselines3 warns that the evaluation environment
# is not wrapped with a Monitor wrapper.
eval_env = Monitor(gym.make("LunarLander-v2"))
try:
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=10,
        deterministic=True,
    )
finally:
    # Release the evaluation environment even if evaluation fails.
    eval_env.close()

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=261.88 +/- 18.90349049355859


Run Model

In [43]:
# Watch the trained agent play a few rendered episodes.
N_DEMO_EPISODES = 5

# Create a new environment for inference
env = gym.make("LunarLander-v2", render_mode="human")

# try/finally ensures the render window is closed even if the loop is
# interrupted or a step raises.
try:
    for episode in range(N_DEMO_EPISODES):
        # Reset at the start of each episode, so no reset is wasted after the
        # final episode.
        obs, _ = env.reset()
        episode_reward = 0
        terminated = False
        truncated = False

        while not (terminated or truncated):
            # Get the model's action
            action, _ = model.predict(obs, deterministic=True)

            # Take the action in the environment
            obs, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward

        print(f"Episode {episode + 1} reward: {episode_reward}")
finally:
    # Close the environment
    env.close()


Episode 1 reward: 295.45620828117933
Episode 2 reward: 272.68909744880295
Episode 3 reward: 249.66976016254492
Episode 4 reward: 255.21906640260153
Episode 5 reward: 266.1475719949203
