In [None]:
import gymnasium as gym
import torch as th
from LunarLander3DEnv.envs import LunarLander3DEnv  # Pastikan environment sudah terdaftar (register)

# Import algoritma dari stable-baselines3
from stable_baselines3 import PPO, A2C, DDPG, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

# Import tambahan untuk TRPO, RecurrentPPO, dan HER
from sb3_contrib import TRPO
# Untuk RecurrentPPO, import dilakukan hanya jika diperlukan

# Pilih algoritma yang ingin digunakan:
# Pilihan: "PPO", "A2C", "DDPG", "SAC", "TD3", "TRPO", "RecurrentPPO", atau "HER"
algorithm = "RecurrentPPO"  # Ganti sesuai kebutuhan, misal "RecurrentPPO" atau "HER"

# Buat environment
env = gym.make("LunarLander3DEnv-v0", render_mode=None)

# Inisialisasi model sesuai algoritma yang dipilih
model = None

if algorithm == "PPO":
    model = PPO(
        "MlpPolicy",
        env,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        vf_coef=0.5,
        policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
        verbose=1,
        device="cuda" if th.cuda.is_available() else "cpu"
    )

elif algorithm == "SAC":
    model = SAC(
        "MlpPolicy",
        env,
        learning_rate=3e-4,
        buffer_size=500_000,
        batch_size=128,
        tau=0.005,
        gamma=0.99,
        ent_coef="auto_0.1",
        use_sde=True,
        policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[256, 256])),
        verbose=1,
    )

elif algorithm == "TD3":
    model = TD3(
        "MlpPolicy",
        env,
        learning_rate=1e-3,
        buffer_size=500_000,
        batch_size=128,
        tau=0.005,
        gamma=0.99,
        policy_delay=2,
        target_policy_noise=0.2,
        target_noise_clip=0.5,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )

elif algorithm == "TRPO":
    model = TRPO(
        "MlpPolicy",
        env,
        learning_rate=1e-4,
        gamma=0.99,
        gae_lambda=0.95,
        #max_kl=0.01,
        #ent_coef=0.01,
        #vf_coef=0.5,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )

elif algorithm == "RecurrentPPO":
    from sb3_contrib import RecurrentPPO  # Pastikan sudah install sb3-contrib versi terbaru
    model = RecurrentPPO(
        "MlpLstmPolicy",  # Menggunakan policy dengan LSTM
        env,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        vf_coef=0.5,
        policy_kwargs=dict(lstm_hidden_size=256, net_arch=[256, 256]),
        verbose=1,
        device="cuda" if th.cuda.is_available() else "cpu"
    )

elif algorithm == "HER":
    # HER biasanya digunakan pada environment goal-oriented. Pastikan env sudah sesuai
    from stable_baselines3 import HER, SAC  # Contoh: gunakan SAC sebagai algoritma dasar untuk HER
    model = HER(
        "MlpPolicy",
        env,
        SAC,
        n_sampled_goal=4,
        goal_selection_strategy="future",  # Bisa juga 'final' atau 'episode'
        online_sampling=True,
        verbose=1,
        device="cuda" if th.cuda.is_available() else "cpu"
    )

else:
    raise ValueError("Algoritma yang dipilih tidak didukung. Pilih salah satu: PPO, SAC, TD3, TRPO, RecurrentPPO, atau HER.")

# Setup callbacks untuk menyimpan model dan evaluasi secara berkala
total_timesteps_learning = 100_000

checkpoint_callback = CheckpointCallback(
    save_freq=int(total_timesteps_learning / 10),
    save_path='./models/',
    name_prefix=f'{algorithm}_checkpoint_v1_'
)
eval_callback = EvalCallback(
    env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=total_timesteps_learning // 10,
    n_eval_episodes=10,
    deterministic=True,
)

# Mulai proses pelatihan
model.learn(total_timesteps=total_timesteps_learning, callback=[checkpoint_callback, eval_callback])

# Simpan model yang sudah dilatih
model.save(f"ll3d_{algorithm}")
print("Training is finished")

# Evaluasi agent
n_eval_episodes = 10
render = False  # Ubah ke True jika ingin melihat visualisasi evaluasi
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=n_eval_episodes, render=render)

print(f"Mean Reward over {n_eval_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

env.close()
del model


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 378       |
|    ep_rew_mean     | -1.06e+05 |
| time/              |           |
|    fps             | 172       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 318           |
|    ep_rew_mean          | -8.8e+04      |
| time/                   |               |
|    fps                  | 77            |
|    iterations           | 2             |
|    time_elapsed         | 52            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00035241139 |
|    clip_fraction        | 0             |
|    clip_range           |



Eval num_timesteps=10000, episode_reward=-144567.72 +/- 69496.74
Episode length: 474.00 +/- 214.61
-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 474           |
|    mean_reward          | -1.45e+05     |
| time/                   |               |
|    total_timesteps      | 10000         |
| train/                  |               |
|    approx_kl            | 0.00018357154 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -29.9         |
|    explained_variance   | 5.13e-06      |
|    learning_rate        | 0.0003        |
|    loss                 | 1.38e+07      |
|    n_updates            | 40            |
|    policy_gradient_loss | -0.000971     |
|    std                  | 1             |
|    value_loss           | 2.77e+07      |
-------------------------------------------
New best mean reward!
----------------------------------
| rollou

In [None]:
import gymnasium as gym
import torch as th
from LunarLander3DEnv.envs import LunarLander3DEnv
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.callbacks import BaseCallback

class CurriculumCallback(BaseCallback):
    def __init__(self, total_timesteps, verbose=0):
        super(CurriculumCallback, self).__init__(verbose)
        self.total_timesteps = total_timesteps

    def _on_step(self) -> bool:
        # Hitung progres (0 hingga 1)
        progress = self.num_timesteps / self.total_timesteps
        # Misal, kita naikkan difficulty secara linier dari 0 ke 1
        for env in self.model.get_env().envs:
            if hasattr(env, "difficulty"):
                env.difficulty = progress
        return True

# Fungsi untuk membuat environment
def make_env(render_mode=None):
    def _init():
        env = gym.make("LunarLander3DEnv-v0", render_mode=render_mode)
        # Inisialisasi difficulty ke 0
        env.difficulty = 0.0
        return env
    return _init

num_envs = 8
train_envs = DummyVecEnv([make_env(render_mode=None) for _ in range(num_envs)])
eval_env = DummyVecEnv([make_env(render_mode=None)])

total_timesteps_learning = 1_000_000

# Inisialisasi model PPO (misalnya)
model = RecurrentPPO(
    "MlpLstmPolicy",
    train_envs,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    vf_coef=0.5,    
    policy_kwargs=dict(
        net_arch=dict(pi=[256, 256], vf=[256, 256]),
        lstm_hidden_size=256,
        n_lstm_layers=2,
        shared_lstm=True,
        enable_critic_lstm=False,  # Pastikan LSTM kritikus tidak diaktifkan
    ),
    verbose=1,
    device="cuda" if th.cuda.is_available() else "cpu"
)

checkpoint_callback = CheckpointCallback(
    save_freq=int(total_timesteps_learning / 10),
    save_path='./models/',
    name_prefix='rppo_curriculum_checkpoint_'
)

eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=int(total_timesteps_learning / 10),
    n_eval_episodes=10,
    deterministic=True,
)

curriculum_callback = CurriculumCallback(total_timesteps=total_timesteps_learning)

# Training dengan callback gabungan
model.learn(
    total_timesteps=total_timesteps_learning,
    callback=[checkpoint_callback, eval_callback, curriculum_callback],
    progress_bar=True
)

model.save("ll3d_rppo_curriculum")
train_envs.close()
eval_env.close()


Using cuda device


Output()

------------------------------
| time/              |       |
|    fps             | 58    |
|    iterations      | 1     |
|    time_elapsed    | 281   |
|    total_timesteps | 16384 |
------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 2           |
|    time_elapsed         | 603         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.010321844 |
|    clip_fraction        | 0.0705      |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.9       |
|    explained_variance   | 3.1e-06     |
|    learning_rate        | 0.0003      |
|    loss                 | 57.1        |
|    n_updates            | 10          |
|    policy_gradient_loss | 0.00135     |
|    std                  | 1.01        |
|    value_loss           | 9.18e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 3           |
|    time_elapsed         | 926         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.004619338 |
|    clip_fraction        | 0.0496      |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.9       |
|    explained_variance   | 5.19e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 187         |
|    n_updates            | 20          |
|    policy_gradient_loss | 0.00224     |
|    std                  | 1.01        |
|    value_loss           | 1.38e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 4           |
|    time_elapsed         | 1247        |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.017658498 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.1       |
|    explained_variance   | -2.03e-06   |
|    learning_rate        | 0.0003      |
|    loss                 | 79.5        |
|    n_updates            | 30          |
|    policy_gradient_loss | 0.00672     |
|    std                  | 1.01        |
|    value_loss           | 1.64e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 5           |
|    time_elapsed         | 1566        |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.006667882 |
|    clip_fraction        | 0.0651      |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.1       |
|    explained_variance   | -1.07e-06   |
|    learning_rate        | 0.0003      |
|    loss                 | 193         |
|    n_updates            | 40          |
|    policy_gradient_loss | 0.00165     |
|    std                  | 1.01        |
|    value_loss           | 1.65e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 6           |
|    time_elapsed         | 1884        |
|    total_timesteps      | 98304       |
| train/                  |             |
|    approx_kl            | 0.008830693 |
|    clip_fraction        | 0.0825      |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.1       |
|    explained_variance   | 8.34e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.03e+03    |
|    n_updates            | 50          |
|    policy_gradient_loss | 0.00216     |
|    std                  | 1.02        |
|    value_loss           | 1.92e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 7           |
|    time_elapsed         | 2202        |
|    total_timesteps      | 114688      |
| train/                  |             |
|    approx_kl            | 0.009169063 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.2       |
|    explained_variance   | 3.99e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.67        |
|    n_updates            | 60          |
|    policy_gradient_loss | 0.00322     |
|    std                  | 1.02        |
|    value_loss           | 1.93e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 8           |
|    time_elapsed         | 2531        |
|    total_timesteps      | 131072      |
| train/                  |             |
|    approx_kl            | 0.006642438 |
|    clip_fraction        | 0.092       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.3       |
|    explained_variance   | 1.19e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 50.9        |
|    n_updates            | 70          |
|    policy_gradient_loss | 0.00233     |
|    std                  | 1.03        |
|    value_loss           | 2.38e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 9           |
|    time_elapsed         | 2856        |
|    total_timesteps      | 147456      |
| train/                  |             |
|    approx_kl            | 0.014058206 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.4       |
|    explained_variance   | 6.03e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 22.4        |
|    n_updates            | 80          |
|    policy_gradient_loss | 0.00382     |
|    std                  | 1.04        |
|    value_loss           | 2.47e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 10          |
|    time_elapsed         | 3183        |
|    total_timesteps      | 163840      |
| train/                  |             |
|    approx_kl            | 0.012218054 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.6       |
|    explained_variance   | 0.625       |
|    learning_rate        | 0.0003      |
|    loss                 | 6.6e+03     |
|    n_updates            | 90          |
|    policy_gradient_loss | 0.00424     |
|    std                  | 1.04        |
|    value_loss           | 2.31e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 11          |
|    time_elapsed         | 3507        |
|    total_timesteps      | 180224      |
| train/                  |             |
|    approx_kl            | 0.010915666 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.7       |
|    explained_variance   | 0.701       |
|    learning_rate        | 0.0003      |
|    loss                 | 7.58e+03    |
|    n_updates            | 100         |
|    policy_gradient_loss | 0.00364     |
|    std                  | 1.05        |
|    value_loss           | 1.86e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 12          |
|    time_elapsed         | 3831        |
|    total_timesteps      | 196608      |
| train/                  |             |
|    approx_kl            | 0.015336487 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -30.7       |
|    explained_variance   | 0.637       |
|    learning_rate        | 0.0003      |
|    loss                 | 59.6        |
|    n_updates            | 110         |
|    policy_gradient_loss | 0.00378     |
|    std                  | 1.05        |
|    value_loss           | 2.32e+04    |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 13           |
|    time_elapsed         | 4158         |
|    total_timesteps      | 212992       |
| train/                  |              |
|    approx_kl            | 0.0096529685 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.2          |
|    entropy_loss         | -30.8        |
|    explained_variance   | 0.736        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.6e+03      |
|    n_updates            | 120          |
|    policy_gradient_loss | 0.00378      |
|    std                  | 1.05         |
|    value_loss           | 1.95e+04     |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 14          |
|    time_elapsed         | 4477        |
|    total_timesteps      | 229376      |
| train/                  |             |
|    approx_kl            | 0.009262716 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31         |
|    explained_variance   | 0.666       |
|    learning_rate        | 0.0003      |
|    loss                 | 94.1        |
|    n_updates            | 130         |
|    policy_gradient_loss | 0.00364     |
|    std                  | 1.06        |
|    value_loss           | 2.31e+04    |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 15           |
|    time_elapsed         | 4792         |
|    total_timesteps      | 245760       |
| train/                  |              |
|    approx_kl            | 0.0127731655 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.2          |
|    entropy_loss         | -31.1        |
|    explained_variance   | 0.657        |
|    learning_rate        | 0.0003       |
|    loss                 | 44.4         |
|    n_updates            | 140          |
|    policy_gradient_loss | 0.00339      |
|    std                  | 1.07         |
|    value_loss           | 3.42e+04     |
------------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 16           |
|    time_elapsed         | 5109         |
|    total_timesteps      | 262144       |
| train/                  |              |
|    approx_kl            | 0.0077388664 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.2          |
|    entropy_loss         | -31.1        |
|    explained_variance   | 0.605        |
|    learning_rate        | 0.0003       |
|    loss                 | 98.9         |
|    n_updates            | 150          |
|    policy_gradient_loss | 0.00298      |
|    std                  | 1.07         |
|    value_loss           | 2.35e+04     |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 17          |
|    time_elapsed         | 5428        |
|    total_timesteps      | 278528      |
| train/                  |             |
|    approx_kl            | 0.014930856 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.2       |
|    explained_variance   | 0.357       |
|    learning_rate        | 0.0003      |
|    loss                 | 152         |
|    n_updates            | 160         |
|    policy_gradient_loss | 0.0058      |
|    std                  | 1.07        |
|    value_loss           | 3.09e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 18          |
|    time_elapsed         | 5742        |
|    total_timesteps      | 294912      |
| train/                  |             |
|    approx_kl            | 0.012742024 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.3       |
|    explained_variance   | 0.0944      |
|    learning_rate        | 0.0003      |
|    loss                 | 9.73e+03    |
|    n_updates            | 170         |
|    policy_gradient_loss | 0.00287     |
|    std                  | 1.08        |
|    value_loss           | 2.88e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 19          |
|    time_elapsed         | 6055        |
|    total_timesteps      | 311296      |
| train/                  |             |
|    approx_kl            | 0.009056149 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.4       |
|    explained_variance   | 0.00502     |
|    learning_rate        | 0.0003      |
|    loss                 | 27          |
|    n_updates            | 180         |
|    policy_gradient_loss | 0.00214     |
|    std                  | 1.08        |
|    value_loss           | 2.26e+04    |
-----------------------------------------


----------------------------------------
| time/                   |            |
|    fps                  | 51         |
|    iterations           | 20         |
|    time_elapsed         | 6378       |
|    total_timesteps      | 327680     |
| train/                  |            |
|    approx_kl            | 0.00838717 |
|    clip_fraction        | 0.119      |
|    clip_range           | 0.2        |
|    entropy_loss         | -31.5      |
|    explained_variance   | 0.0783     |
|    learning_rate        | 0.0003     |
|    loss                 | 525        |
|    n_updates            | 190        |
|    policy_gradient_loss | 0.00256    |
|    std                  | 1.09       |
|    value_loss           | 2.8e+04    |
----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 21          |
|    time_elapsed         | 6692        |
|    total_timesteps      | 344064      |
| train/                  |             |
|    approx_kl            | 0.010467581 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.6       |
|    explained_variance   | 0.323       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.01e+04    |
|    n_updates            | 200         |
|    policy_gradient_loss | 0.00295     |
|    std                  | 1.09        |
|    value_loss           | 3.01e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 22          |
|    time_elapsed         | 7006        |
|    total_timesteps      | 360448      |
| train/                  |             |
|    approx_kl            | 0.008994521 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.7       |
|    explained_variance   | 0.37        |
|    learning_rate        | 0.0003      |
|    loss                 | 59.3        |
|    n_updates            | 210         |
|    policy_gradient_loss | 0.00181     |
|    std                  | 1.1         |
|    value_loss           | 2.25e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 23          |
|    time_elapsed         | 7317        |
|    total_timesteps      | 376832      |
| train/                  |             |
|    approx_kl            | 0.007767984 |
|    clip_fraction        | 0.0891      |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.7       |
|    explained_variance   | 0.521       |
|    learning_rate        | 0.0003      |
|    loss                 | 518         |
|    n_updates            | 220         |
|    policy_gradient_loss | 0.000277    |
|    std                  | 1.1         |
|    value_loss           | 2.62e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 24          |
|    time_elapsed         | 7626        |
|    total_timesteps      | 393216      |
| train/                  |             |
|    approx_kl            | 0.013704658 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.2         |
|    entropy_loss         | -31.9       |
|    explained_variance   | 0.335       |
|    learning_rate        | 0.0003      |
|    loss                 | 239         |
|    n_updates            | 230         |
|    policy_gradient_loss | 0.00307     |
|    std                  | 1.11        |
|    value_loss           | 1.71e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 25          |
|    time_elapsed         | 7935        |
|    total_timesteps      | 409600      |
| train/                  |             |
|    approx_kl            | 0.010133015 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32         |
|    explained_variance   | 0.469       |
|    learning_rate        | 0.0003      |
|    loss                 | 9.47e+03    |
|    n_updates            | 240         |
|    policy_gradient_loss | 0.00248     |
|    std                  | 1.11        |
|    value_loss           | 2.02e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 26          |
|    time_elapsed         | 8247        |
|    total_timesteps      | 425984      |
| train/                  |             |
|    approx_kl            | 0.009702092 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.1       |
|    explained_variance   | 0.439       |
|    learning_rate        | 0.0003      |
|    loss                 | 622         |
|    n_updates            | 250         |
|    policy_gradient_loss | 0.00191     |
|    std                  | 1.12        |
|    value_loss           | 1.78e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 27          |
|    time_elapsed         | 8567        |
|    total_timesteps      | 442368      |
| train/                  |             |
|    approx_kl            | 0.012865976 |
|    clip_fraction        | 0.0921      |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.1       |
|    explained_variance   | 0.698       |
|    learning_rate        | 0.0003      |
|    loss                 | 36.5        |
|    n_updates            | 260         |
|    policy_gradient_loss | 0.0011      |
|    std                  | 1.12        |
|    value_loss           | 2.18e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 28          |
|    time_elapsed         | 8877        |
|    total_timesteps      | 458752      |
| train/                  |             |
|    approx_kl            | 0.011585547 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.3       |
|    explained_variance   | 0.5         |
|    learning_rate        | 0.0003      |
|    loss                 | 58.5        |
|    n_updates            | 270         |
|    policy_gradient_loss | 0.00449     |
|    std                  | 1.13        |
|    value_loss           | 1.75e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 29          |
|    time_elapsed         | 9182        |
|    total_timesteps      | 475136      |
| train/                  |             |
|    approx_kl            | 0.008631409 |
|    clip_fraction        | 0.0735      |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.4       |
|    explained_variance   | 0.618       |
|    learning_rate        | 0.0003      |
|    loss                 | 15.2        |
|    n_updates            | 280         |
|    policy_gradient_loss | 0.000629    |
|    std                  | 1.13        |
|    value_loss           | 1.38e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 30          |
|    time_elapsed         | 9480        |
|    total_timesteps      | 491520      |
| train/                  |             |
|    approx_kl            | 0.022669379 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.4       |
|    explained_variance   | 0.621       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.57e+04    |
|    n_updates            | 290         |
|    policy_gradient_loss | 0.00316     |
|    std                  | 1.14        |
|    value_loss           | 1.59e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 31          |
|    time_elapsed         | 9771        |
|    total_timesteps      | 507904      |
| train/                  |             |
|    approx_kl            | 0.015722105 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.5       |
|    explained_variance   | 0.054       |
|    learning_rate        | 0.0003      |
|    loss                 | 91.6        |
|    n_updates            | 300         |
|    policy_gradient_loss | 0.00464     |
|    std                  | 1.14        |
|    value_loss           | 2.19e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 32          |
|    time_elapsed         | 10067       |
|    total_timesteps      | 524288      |
| train/                  |             |
|    approx_kl            | 0.012682409 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.6       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.0003      |
|    loss                 | 8.17        |
|    n_updates            | 310         |
|    policy_gradient_loss | 0.00343     |
|    std                  | 1.14        |
|    value_loss           | 3.66e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 33          |
|    time_elapsed         | 10384       |
|    total_timesteps      | 540672      |
| train/                  |             |
|    approx_kl            | 0.010651436 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.7       |
|    explained_variance   | 0.631       |
|    learning_rate        | 0.0003      |
|    loss                 | 48.9        |
|    n_updates            | 320         |
|    policy_gradient_loss | 0.00123     |
|    std                  | 1.15        |
|    value_loss           | 749         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 34          |
|    time_elapsed         | 10677       |
|    total_timesteps      | 557056      |
| train/                  |             |
|    approx_kl            | 0.015830554 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.8       |
|    explained_variance   | 0.338       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.17        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.00106    |
|    std                  | 1.16        |
|    value_loss           | 1.52e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 35          |
|    time_elapsed         | 10968       |
|    total_timesteps      | 573440      |
| train/                  |             |
|    approx_kl            | 0.011646215 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.2         |
|    entropy_loss         | -32.9       |
|    explained_variance   | 0.324       |
|    learning_rate        | 0.0003      |
|    loss                 | 57.4        |
|    n_updates            | 340         |
|    policy_gradient_loss | -0.00151    |
|    std                  | 1.16        |
|    value_loss           | 1.5e+03     |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 52           |
|    iterations           | 36           |
|    time_elapsed         | 11278        |
|    total_timesteps      | 589824       |
| train/                  |              |
|    approx_kl            | 0.0109805055 |
|    clip_fraction        | 0.151        |
|    clip_range           | 0.2          |
|    entropy_loss         | -33.1        |
|    explained_variance   | 0.549        |
|    learning_rate        | 0.0003       |
|    loss                 | 18.7         |
|    n_updates            | 350          |
|    policy_gradient_loss | -0.00201     |
|    std                  | 1.17         |
|    value_loss           | 861          |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 37          |
|    time_elapsed         | 11615       |
|    total_timesteps      | 606208      |
| train/                  |             |
|    approx_kl            | 0.014380917 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.2       |
|    explained_variance   | 0.882       |
|    learning_rate        | 0.0003      |
|    loss                 | 39.5        |
|    n_updates            | 360         |
|    policy_gradient_loss | 0.0008      |
|    std                  | 1.18        |
|    value_loss           | 121         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 38          |
|    time_elapsed         | 11941       |
|    total_timesteps      | 622592      |
| train/                  |             |
|    approx_kl            | 0.012782279 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.2       |
|    explained_variance   | 0.437       |
|    learning_rate        | 0.0003      |
|    loss                 | 4.35        |
|    n_updates            | 370         |
|    policy_gradient_loss | -0.000973   |
|    std                  | 1.18        |
|    value_loss           | 1.49e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 39          |
|    time_elapsed         | 12233       |
|    total_timesteps      | 638976      |
| train/                  |             |
|    approx_kl            | 0.013933467 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.4       |
|    explained_variance   | 0.323       |
|    learning_rate        | 0.0003      |
|    loss                 | 9.33        |
|    n_updates            | 380         |
|    policy_gradient_loss | 0.000653    |
|    std                  | 1.19        |
|    value_loss           | 1.54e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 40          |
|    time_elapsed         | 12523       |
|    total_timesteps      | 655360      |
| train/                  |             |
|    approx_kl            | 0.009796882 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.5       |
|    explained_variance   | 0.647       |
|    learning_rate        | 0.0003      |
|    loss                 | 22.8        |
|    n_updates            | 390         |
|    policy_gradient_loss | 0.000332    |
|    std                  | 1.2         |
|    value_loss           | 238         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 41          |
|    time_elapsed         | 12813       |
|    total_timesteps      | 671744      |
| train/                  |             |
|    approx_kl            | 0.010488497 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.5       |
|    explained_variance   | 0.214       |
|    learning_rate        | 0.0003      |
|    loss                 | 5.58        |
|    n_updates            | 400         |
|    policy_gradient_loss | 0.0019      |
|    std                  | 1.2         |
|    value_loss           | 174         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 42          |
|    time_elapsed         | 13105       |
|    total_timesteps      | 688128      |
| train/                  |             |
|    approx_kl            | 0.008121267 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.6       |
|    explained_variance   | 0.154       |
|    learning_rate        | 0.0003      |
|    loss                 | 10.8        |
|    n_updates            | 410         |
|    policy_gradient_loss | 0.000478    |
|    std                  | 1.2         |
|    value_loss           | 778         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 43          |
|    time_elapsed         | 13396       |
|    total_timesteps      | 704512      |
| train/                  |             |
|    approx_kl            | 0.012301281 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.7       |
|    explained_variance   | 0.328       |
|    learning_rate        | 0.0003      |
|    loss                 | 32.5        |
|    n_updates            | 420         |
|    policy_gradient_loss | 0.000129    |
|    std                  | 1.21        |
|    value_loss           | 840         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 44          |
|    time_elapsed         | 13687       |
|    total_timesteps      | 720896      |
| train/                  |             |
|    approx_kl            | 0.009650338 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.2         |
|    entropy_loss         | -33.9       |
|    explained_variance   | 0.291       |
|    learning_rate        | 0.0003      |
|    loss                 | 23.1        |
|    n_updates            | 430         |
|    policy_gradient_loss | 6.73e-05    |
|    std                  | 1.22        |
|    value_loss           | 1.54e+03    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 45          |
|    time_elapsed         | 13976       |
|    total_timesteps      | 737280      |
| train/                  |             |
|    approx_kl            | 0.010400822 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.2         |
|    entropy_loss         | -34         |
|    explained_variance   | 0.478       |
|    learning_rate        | 0.0003      |
|    loss                 | 58.4        |
|    n_updates            | 440         |
|    policy_gradient_loss | -0.000454   |
|    std                  | 1.22        |
|    value_loss           | 106         |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 46          |
|    time_elapsed         | 14265       |
|    total_timesteps      | 753664      |
| train/                  |             |
|    approx_kl            | 0.012366281 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.2         |
|    entropy_loss         | -34.1       |
|    explained_variance   | 0.685       |
|    learning_rate        | 0.0003      |
|    loss                 | 107         |
|    n_updates            | 450         |
|    policy_gradient_loss | -0.000826   |
|    std                  | 1.23        |
|    value_loss           | 130         |
-----------------------------------------


----------------------------------------
| time/                   |            |
|    fps                  | 52         |
|    iterations           | 47         |
|    time_elapsed         | 14554      |
|    total_timesteps      | 770048     |
| train/                  |            |
|    approx_kl            | 0.00970849 |
|    clip_fraction        | 0.114      |
|    clip_range           | 0.2        |
|    entropy_loss         | -34.2      |
|    explained_variance   | 0.799      |
|    learning_rate        | 0.0003     |
|    loss                 | 3.37       |
|    n_updates            | 460        |
|    policy_gradient_loss | -0.00152   |
|    std                  | 1.24       |
|    value_loss           | 90.2       |
----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 48          |
|    time_elapsed         | 14843       |
|    total_timesteps      | 786432      |
| train/                  |             |
|    approx_kl            | 0.009372419 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -34.3       |
|    explained_variance   | 0.536       |
|    learning_rate        | 0.0003      |
|    loss                 | 42.8        |
|    n_updates            | 470         |
|    policy_gradient_loss | -0.00243    |
|    std                  | 1.24        |
|    value_loss           | 139         |
-----------------------------------------


In [None]:
import gymnasium as gym
import torch as th
from LunarLander3DEnv.envs import LunarLander3DEnv
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

num_envs = 8  # Jumlah environment paralel

def make_env(render_mode=None):
    def _init():
        return gym.make("LunarLander3DEnv-v0", render_mode=render_mode)
    return _init

train_envs = DummyVecEnv([make_env(render_mode=None) for _ in range(num_envs)])
eval_env = DummyVecEnv([make_env(render_mode=None)])

model = RecurrentPPO(
    "MlpLstmPolicy",
    train_envs,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    vf_coef=0.5,    
    policy_kwargs=dict(
        net_arch=dict(pi=[256, 256], vf=[256, 256]),
        lstm_hidden_size=256,
        n_lstm_layers=2,
        shared_lstm=True,
        enable_critic_lstm=False,  # Pastikan LSTM kritikus tidak diaktifkan
    ),
    verbose=1,
    device="cuda" if th.cuda.is_available() else "cpu"
)

total_timesteps_learning = 500_000

checkpoint_callback = CheckpointCallback(
    save_freq=int(total_timesteps_learning / 10),
    save_path='./models/',
    name_prefix='RecurrentPPO_parallel_checkpoint_'
)

eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=int(total_timesteps_learning / 10),
    n_eval_episodes=10,
    deterministic=True,
)

model.learn(
    total_timesteps=total_timesteps_learning,
    callback=[checkpoint_callback, eval_callback],
    progress_bar=True
)

model.save("ll3d_RecurrentPPO_parallel")
train_envs.close()
eval_env.close()


Using cuda device


Output()

------------------------------
| time/              |       |
|    fps             | 53    |
|    iterations      | 1     |
|    time_elapsed    | 304   |
|    total_timesteps | 16384 |
------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 2           |
|    time_elapsed         | 660         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.007973919 |
|    clip_fraction        | 0.0737      |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.9       |
|    explained_variance   | 2.8e-05     |
|    learning_rate        | 0.0003      |
|    loss                 | 1.37e+05    |
|    n_updates            | 10          |
|    policy_gradient_loss | 0.00118     |
|    std                  | 1.01        |
|    value_loss           | 9.13e+04    |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 3           |
|    time_elapsed         | 992         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.039660174 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.9       |
|    explained_variance   | 1.38e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.61e+04    |
|    n_updates            | 20          |
|    policy_gradient_loss | 0.00674     |
|    std                  | 1.01        |
|    value_loss           | 9.28e+04    |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 50           |
|    iterations           | 4            |
|    time_elapsed         | 1305         |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0072901165 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.2          |
|    entropy_loss         | -30          |
|    explained_variance   | 9.24e-06     |
|    learning_rate        | 0.0003       |
|    loss                 | 2.56e+04     |
|    n_updates            | 30           |
|    policy_gradient_loss | 0.00625      |
|    std                  | 1.01         |
|    value_loss           | 6.23e+04     |
------------------------------------------


In [2]:
import gymnasium as gym
import torch as th
from LunarLander3DEnv.envs import LunarLander3DEnv
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import time
algorithm = "RPPO"
model_path = f"ll3d_rppo_curriculum"  # Nama file model yang telah disimpan

# Buat environment dengan render_mode "human" agar dapat melihat visualisasi
env = gym.make("LunarLander3DEnv-v0", render_mode="human")
model = RecurrentPPO.load(model_path, env=env)

# Jalankan beberapa episode untuk inferensi
num_episodes = 10
for episode in range(1, num_episodes+1):
    obs, info = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        # Prediksi aksi dengan deterministik
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()  # Pastikan render dipanggil agar kamera dan visualisasi diupdate
        #time.sleep(1/60)  # Optional: delay untuk melambatkan tampilan
        done = terminated or truncated
        time.sleep(0.01)
    print(f"Episode {episode}: Total Reward = {total_reward:.2f}")

env.close()


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Episode 1: Total Reward = -47063.38


error: Not connected to physics server.