In [None]:
import gymnasium as gym
import torch as th
# Algorithms from stable-baselines3
from stable_baselines3 import PPO, A2C, DDPG, SAC, TD3
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

from LunarLander3DEnv.envs import LunarLander3DEnv  # Make sure the environment is registered

# Algorithm to train with: "PPO", "A2C", "DDPG", "SAC" or "TD3".
algorithm = "A2C"

# Maps each supported algorithm name to its class and the algorithm-specific
# constructor arguments. Arguments common to all algorithms (policy type,
# environment, verbosity) are supplied where the model is built below.
SUPPORTED_ALGORITHMS = {
    "PPO": (
        PPO,
        # net_arch is passed as a plain dict: the list-wrapped form
        # [dict(pi=..., vf=...)] is deprecated since SB3 v1.8.0.
        dict(policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256]))),
    ),
    "A2C": (A2C, {}),
    "DDPG": (DDPG, dict(policy_kwargs=dict(net_arch=[256, 256]))),
    "SAC": (
        SAC,
        dict(
            policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[256, 256])),
            use_sde=True,
            ent_coef="auto_0.1",
        ),
    ),
    "TD3": (TD3, dict(policy_kwargs=dict(net_arch=[256, 256]))),
}

# Validate the choice before creating the environment so that an invalid name
# does not leave an open environment behind.
if algorithm not in SUPPORTED_ALGORITHMS:
    raise ValueError("Algoritma yang dipilih tidak didukung. Pilih salah satu: PPO, A2C, DDPG, SAC, TD3.")

# Create the training environment.
env = gym.make("LunarLander3DEnv-v0", render_mode=None)

# Instantiate the model for the selected algorithm.
algorithm_class, algorithm_kwargs = SUPPORTED_ALGORITHMS[algorithm]
model = algorithm_class("MlpPolicy", env, verbose=1, **algorithm_kwargs)

# Set up callbacks that periodically checkpoint and evaluate the model, then train.
total_timesteps_learning = 1_000_000
# Checkpoint and evaluate ten times over the run. Integer division keeps both
# frequencies as ints, which is what the callbacks compare step counts against.
callback_interval = total_timesteps_learning // 10

checkpoint_callback = CheckpointCallback(
    save_freq=callback_interval,
    save_path='./models/',
    name_prefix=f'{algorithm}_checkpoint_v1_'
)

# Evaluate on a dedicated environment instead of the training `env`: running
# evaluation episodes on the training environment would reset and step it in
# the middle of a training rollout. The environment is wrapped in Monitor,
# which SB3's evaluation helper uses to read episode rewards and lengths.
eval_env = Monitor(gym.make("LunarLander3DEnv-v0", render_mode=None))
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=callback_interval,
    n_eval_episodes=10,
    deterministic=True,
)

# Start training; the evaluation environment is released even if training fails.
try:
    model.learn(total_timesteps=total_timesteps_learning, callback=[checkpoint_callback, eval_callback])
finally:
    eval_env.close()

# Persist the trained model under a name that includes the algorithm.
model.save(f"ll3d_{algorithm}")
print("Training is finished")

# Final evaluation of the trained agent on the model's own environment.
n_eval_episodes = 10
render = False  # Set to True to visualise the evaluation episodes
mean_reward, std_reward = evaluate_policy(
    model,
    model.get_env(),
    n_eval_episodes=n_eval_episodes,
    render=render,
)

print(f"Mean Reward over {n_eval_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release the environment and drop the reference to the model.
env.close()
del model


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-------------------------------------
| time/                 |           |
|    fps                | 141       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -29.6     |
|    explained_variance | -0.000136 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 433       |
|    std                | 0.993     |
|    value_loss         | 254       |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 690      |
|    ep_rew_mean        | 2.53e+03 |
| time/                 |          |
|    fps                | 148      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -29.7    |
|    explained_variance



Eval num_timesteps=100000, episode_reward=5012.26 +/- 7419.30
Episode length: 1832.00 +/- 2728.60
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 1.83e+03 |
|    mean_reward        | 5.01e+03 |
| time/                 |          |
|    total_timesteps    | 100000   |
| train/                |          |
|    entropy_loss       | -28.7    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 19999    |
|    policy_loss        | -247     |
|    std                | 0.954    |
|    value_loss         | 95.1     |
------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 243      |
|    ep_rew_mean     | 565      |
| time/              |          |
|    fps             | 143      |
|    iterations      | 20000    |
|    time_elapsed    | 697      |
|    total_timesteps | 100000   |
-------------------