In [1]:
import gymnasium as gym

# Smoke test: play a single LunarLander episode with uniformly random actions,
# rendered in a window, to confirm the environment is installed and working.
env = gym.make("LunarLander-v3", render_mode="human")
try:
    observation, info = env.reset()

    episode_over = False
    while not episode_over:
        # Random action; a real agent would choose it from `observation`/`info`.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # The episode ends when either end-of-episode flag returned by step() is set.
        episode_over = terminated or truncated
finally:
    # Always release the environment (and its render window), even when the
    # cell is interrupted or step()/reset() raises part-way through.
    env.close()

In [5]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
import os

# Train an A2C agent on LunarLander in fixed-size chunks, writing a checkpoint
# after every chunk so any intermediate model can be loaded later.
models_dir = "models/A2C"
# NOTE(review): absolute, machine-specific path — consider making this relative
# or configurable before sharing the notebook.
logdir = "/home/jlukas/Desktop/My_Project/AI_Stable_GYM/logs"

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

# Vectorised environment made of 4 copies of LunarLander.
env = make_vec_env("LunarLander-v3", n_envs=4)
try:
    # A2C with an MLP policy; training metrics are written to `logdir` for TensorBoard.
    model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=logdir)

    TIMESTEPS = 10000      # environment steps requested per learn() call
    N_CHECKPOINTS = 30     # number of learn()/save() rounds
    for iters in range(1, N_CHECKPOINTS + 1):
        # reset_num_timesteps=False keeps the step counter running across calls,
        # so all chunks are logged as one continuous "A2C" run.
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="A2C")
        # Name the checkpoint after the cumulative number of requested steps:
        # the first file is "10000" (not "0"), the last is "300000".
        model.save(f"{models_dir}/{TIMESTEPS * iters}")
finally:
    # Release the vectorised environments even if training is interrupted.
    env.close()

Using cuda device
Logging to /home/jlukas/Desktop/My_Project/AI_Stable_GYM/logs/A2C_0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 117      |
|    ep_rew_mean        | -203     |
| time/                 |          |
|    fps                | 493      |
|    iterations         | 100      |
|    time_elapsed       | 4        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -1.09    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -5.24    |
|    value_loss         | 58.2     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 145      |
|    ep_rew_mean        | -245     |
| time/                 |          |
|    fps                | 453      |
|    iterations         | 200      |
|    time_elapsed       | 

In [15]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
import os

# Replay one rendered LunarLander episode using a saved A2C checkpoint.
models_dir = "models/A2C"
# Checkpoint file name as produced by model.save() in the training cell.
model_path = f"{models_dir}/210000"

# Load the model first: if the checkpoint is missing or unreadable the cell
# fails here, before any environment has been created that would need closing.
model = A2C.load(model_path)

# Single (non-vectorised) environment with on-screen rendering for evaluation.
env = gym.make("LunarLander-v3", render_mode="human")
try:
    observation, info = env.reset()
    episode_over = False

    while not episode_over:
        # deterministic=True asks the policy for its deterministic action
        # instead of sampling; the second return value (state) is unused here.
        action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, info = env.step(action)
        # The episode ends when either end-of-episode flag returned by step() is set.
        episode_over = terminated or truncated
finally:
    # Always release the environment (and its render window), even when the
    # cell is interrupted or an exception is raised mid-episode.
    env.close()