In [None]:
import gymnasium as gym
import torch as th
from LunarLander3DEnv.envs import LunarLander3DEnv  # Pastikan environment sudah terdaftar (register)
# Import the algorithms from stable-baselines3.
# The sb3-contrib package is additionally required when using TRPO.
from stable_baselines3 import PPO, A2C, DDPG, SAC, TD3
from sb3_contrib import TRPO

# TRPO is provided by sb3-contrib (imported above).
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

# Select the algorithm to train. Every name listed here has a matching branch
# below: "PPO", "A2C", "DDPG", "SAC", "TD3", or "TRPO".
algorithm = "TRPO"

# Build the training environment (no rendering while training).
env = gym.make("LunarLander3DEnv-v0", render_mode=None)

# Instantiate the model for the selected algorithm.
# NOTE(review): SAC, TD3 and DDPG only support continuous (Box) action
# spaces -- confirm the environment's action space before selecting them.
if algorithm == "PPO":
    model = PPO(
        "MlpPolicy",
        env,
        learning_rate=3e-4,  # optimizer step size
        n_steps=2048,  # rollout length collected before each update
        batch_size=64,  # minibatch size
        n_epochs=10,  # optimization epochs per rollout
        gamma=0.99,  # discount factor
        gae_lambda=0.95,  # Generalized Advantage Estimation lambda
        clip_range=0.2,  # PPO clipping range
        ent_coef=0.01,  # entropy bonus weight (encourages exploration)
        vf_coef=0.5,  # value-function loss weight
        policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
        verbose=1,
        device="cuda" if th.cuda.is_available() else "cpu"
    )

elif algorithm == "A2C":
    # Library defaults for everything except the network size, which mirrors
    # the PPO branch.
    model = A2C(
        "MlpPolicy",
        env,
        policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
        verbose=1,
    )

elif algorithm == "DDPG":
    # Mirrors the TD3 branch minus the TD3-specific settings; the learning
    # rate is left at the library default.
    model = DDPG(
        "MlpPolicy",
        env,
        buffer_size=500_000,
        batch_size=128,
        tau=0.005,
        gamma=0.99,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )

elif algorithm == "SAC":
    model = SAC(
        "MlpPolicy",
        env,
        learning_rate=3e-4,
        buffer_size=500_000,  # replay buffer capacity
        batch_size=128,
        tau=0.005,  # soft target-update coefficient
        gamma=0.99,
        ent_coef="auto_0.1",  # learned entropy coefficient, initial value 0.1
        use_sde=True,  # generalized State-Dependent Exploration
        policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[256, 256])),
        verbose=1,
    )

elif algorithm == "TD3":
    model = TD3(
        "MlpPolicy",
        env,
        learning_rate=1e-3,
        buffer_size=500_000,
        batch_size=128,
        tau=0.005,
        gamma=0.99,
        policy_delay=2,  # update the policy once every 2 critic updates
        target_policy_noise=0.2,  # std of the target-policy smoothing noise
        target_noise_clip=0.5,  # clip range for that smoothing noise
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )

elif algorithm == "TRPO":
    # In TRPO, learning_rate applies to the value-function optimizer only; the
    # policy step is bounded by the KL constraint, whose keyword is `target_kl`
    # (left at the library default). TRPO accepts no ent_coef / vf_coef.
    model = TRPO(
        "MlpPolicy",
        env,
        learning_rate=1e-4,
        gamma=0.99,
        gae_lambda=0.95,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )

else:
    raise ValueError("Algoritma yang dipilih tidak didukung. Pilih salah satu: PPO, A2C, DDPG, SAC, TD3, TRPO.")

# Train the model with periodic checkpoints and evaluations, save it, then
# report its mean reward over a few evaluation episodes.
from stable_baselines3.common.monitor import Monitor

total_timesteps_learning = 100_000

# Checkpoint and evaluate ten times over the run. Integer division keeps both
# frequencies whole step counts; the floor of 1 avoids a zero frequency for
# very short runs.
callback_interval = max(total_timesteps_learning // 10, 1)

# Evaluation uses its own environment instance. Passing the training `env`
# here would let each periodic evaluation reset and step the same simulator
# the model is collecting rollouts from, corrupting the episode in progress.
# Keep this construction in sync with the training environment.
# Monitor records the episode statistics that the evaluation helpers expect.
eval_env = Monitor(gym.make("LunarLander3DEnv-v0", render_mode=None))

checkpoint_callback = CheckpointCallback(
    save_freq=callback_interval,
    save_path='./models/',
    name_prefix=f'{algorithm}_checkpoint_v1_'
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=callback_interval,
    n_eval_episodes=10,
    deterministic=True,
)

n_eval_episodes = 10
render = False  # set to True to visualize the final evaluation

try:
    # Start training.
    model.learn(total_timesteps=total_timesteps_learning, callback=[checkpoint_callback, eval_callback])

    # Persist the trained model.
    model.save(f"ll3d_{algorithm}")
    print("Training is finished")

    # Final evaluation on the model's own (wrapped) environment.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=n_eval_episodes, render=render)

    print(f"Mean Reward over {n_eval_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")
finally:
    # Release both simulators even if training is interrupted or fails.
    eval_env.close()
    env.close()

del model


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 328       |
|    ep_rew_mean     | -9.47e+04 |
| time/              |           |
|    fps             | 241       |
|    iterations      | 1         |
|    time_elapsed    | 8         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                  |           |
|    ep_len_mean            | 338       |
|    ep_rew_mean            | -9.08e+04 |
| time/                     |           |
|    fps                    | 211       |
|    iterations             | 2         |
|    time_elapsed           | 19        |
|    total_timesteps        | 4096      |
| train/                    |           |
|    explained_variance     | -0.000102 |
|    is_line_search_success | 1         |
|    kl_divergence_loss     | 0.00412   |
|    learning_rate          | 0.0001    |
|    n_updates              | 1         |
|    policy_objectiv



In [None]:
import gymnasium as gym
import time
from LunarLander3DEnv.envs import LunarLander3DEnv
# Name of the algorithm that was used for training (e.g. "PPO"). It selects
# both the saved-model file and the class used to load it.
algorithm = "TRPO"
model_path = f"ll3d_{algorithm}"  # file name of the saved model

# Create the environment with render_mode="human" so the rollout is visible.
# NOTE(review): action_type must match the action space the model was trained
# with -- confirm against the training cell.
env = gym.make("LunarLander3DEnv-v0", action_type="discrete", render_mode="human")

# Resolve the model class lazily, so that only the package that provides the
# selected algorithm has to be importable.
if algorithm == "TRPO":
    import sb3_contrib
    model_class = sb3_contrib.TRPO
elif algorithm in ("PPO", "A2C", "DDPG", "SAC", "TD3"):
    import stable_baselines3
    model_class = getattr(stable_baselines3, algorithm)
else:
    raise ValueError("Algoritma yang dipilih tidak didukung.")

# Load the saved weights and attach the rendering environment.
model = model_class.load(model_path, env=env)

# Run a few rendered episodes with the loaded policy and print each episode's
# total reward.
num_episodes = 10
# Seconds to pause after each rendered step; 0 disables the pause. Use e.g.
# 1 / 60 to slow the playback down.
frame_delay = 0.0

try:
    for episode in range(1, num_episodes + 1):
        obs, info = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            # Deterministic action selection (no sampling noise).
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            env.render()  # called explicitly so the camera and visualization update
            if frame_delay > 0:
                time.sleep(frame_delay)
            # An episode ends on either natural termination or truncation.
            done = terminated or truncated
        print(f"Episode {episode}: Total Reward = {total_reward:.2f}")
finally:
    # Always release the environment (and its window), even when the loop is
    # interrupted or an error is raised mid-episode.
    env.close()


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Episode 1: Total Reward = -67353.34
Episode 2: Total Reward = -4242.19
Episode 3: Total Reward = -7493.94
Episode 4: Total Reward = -8167.36
