# Environment Check

In [1]:
import gymnasium as gym
import torch as th
import numpy as np
from twsbr_env.envs import TwsbrEnv  # Import environment
from stable_baselines3 import SAC, A2C, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, BaseCallback
from stable_baselines3.common.logger import configure

# Create the training environment: rendering disabled, continuous action type.
env = gym.make(
    "TwsbrEnv-v0",
    render_mode=None,
    action_type="continuous",
)

# Callback that updates ent_coef while training is running
class EntropyScheduleCallback(BaseCallback):
    """Linearly anneal the model's entropy coefficient during training.

    On every callback step ``model.ent_coef`` is set to a value interpolated
    between ``start_ent`` (at timestep 0) and ``end_ent`` (at
    ``total_timesteps`` and beyond).

    Args:
        start_ent: Entropy coefficient applied at the start of training.
        end_ent: Entropy coefficient applied once ``total_timesteps`` is reached.
        total_timesteps: Number of timesteps over which the coefficient is
            annealed; should match the value passed to ``model.learn``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, start_ent=0.05, end_ent=0.001, total_timesteps=1e5, verbose=0):
        super().__init__(verbose)
        self.start_ent = start_ent
        self.end_ent = end_ent
        self.total_timesteps = total_timesteps

    def _on_step(self) -> bool:
        """Update ``model.ent_coef`` for the current timestep.

        Returns:
            Always ``True`` so that training is never interrupted by this callback.
        """
        if self.total_timesteps > 0:
            # Clamp so the coefficient stays at end_ent if training runs
            # longer than the configured horizon.
            progress = min(self.num_timesteps / self.total_timesteps, 1.0)
        else:
            # Degenerate horizon: avoid dividing by zero and use the final value.
            progress = 1.0

        # NOTE(review): this attribute is read by the on-policy losses (PPO/A2C);
        # SAC appears to keep its own entropy tensor, so the assignment probably
        # has no effect there -- confirm before relying on it with SAC.
        self.model.ent_coef = self.start_ent + (self.end_ent - self.start_ent) * progress
        return True

# Select the algorithm (SAC, A2C, PPO)
algorithm = "PPO"  # Options: "SAC", "PPO", "A2C"


def lr_schedule(progress_remaining: float) -> float:
    """Return the A2C learning rate for the given remaining training progress.

    Stable-Baselines3 calls a learning-rate schedule with a value that goes
    from 1.0 (start of training) down to 0.0 (end of training).

    NOTE(review): the A2C branch referenced ``lr_schedule`` without any visible
    definition, which raises ``NameError`` as soon as A2C is selected. A linear
    decay starting at 7e-4 is used here as a stand-in -- tune as needed.
    """
    return 7e-4 * progress_remaining


if algorithm == "PPO":
    model = PPO(
        "MlpPolicy", env,
        n_steps=2048, batch_size=128, n_epochs=10,
        target_kl=0.05,
        ent_coef=0.1,
        verbose=1,
    )
elif algorithm == "A2C":
    model = A2C(
        "MlpPolicy", env,
        learning_rate=lr_schedule, gamma=0.99,
        ent_coef=0.1,
        verbose=1,
    )
else:  # Any other value falls back to SAC
    model = SAC(
        "MlpPolicy", env,
        buffer_size=int(1e6),
        use_sde=True, ent_coef="auto_0.1",
        verbose=1,
    )

# Setup TensorBoard logger
#log_dir = "./tensorboard_logs/"
#logger = configure(log_dir, ['tensorboard'])
#model.set_logger(logger)

# Set up callbacks
total_timesteps_learning = 1_000_000
# Integer division keeps the frequency an int; the callbacks compare it against
# an integer call counter. max(..., 1) guards against a zero frequency when the
# training budget is smaller than 10 steps.
save_freq_learning = max(total_timesteps_learning // 10, 1)

# Evaluate on a dedicated environment. Sharing the training environment with
# EvalCallback would reset it in the middle of a rollout each time an
# evaluation runs, corrupting the data collected for training.
eval_env = gym.make("TwsbrEnv-v0", render_mode=None, action_type="continuous")

checkpoint_callback = CheckpointCallback(
    save_freq=save_freq_learning,
    save_path='./models/',
    name_prefix=f'{algorithm.lower()}_checkpoint',
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=save_freq_learning,
    n_eval_episodes=5,
    deterministic=True,
)
entropy_callback = EntropyScheduleCallback(
    start_ent=0.1,
    end_ent=0.001,
    total_timesteps=total_timesteps_learning,
)

# Train the model
model.learn(
    total_timesteps=total_timesteps_learning,
    callback=[checkpoint_callback, eval_callback, entropy_callback],
)

# Save the model under a name matching the selected algorithm
# (this is still "twsbr_ppo" when algorithm == "PPO").
model.save(f"twsbr_{algorithm.lower()}")
print("Training is finished")

# Evaluate the trained model
n_eval_episodes = 10
mean_reward, std_reward = evaluate_policy(
    model,
    model.get_env(),
    n_eval_episodes=n_eval_episodes,
    render=False,
)
print(f"Mean Reward over {n_eval_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments and drop the model reference
env.close()
eval_env.close()
del model


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 9.53     |
|    ep_rew_mean     | -25.4    |
|    success_rate    | 0        |
| time/              |          |
|    fps             | 7        |
|    iterations      | 1        |
|    time_elapsed    | 271      |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 10.2        |
|    ep_rew_mean          | -25.3       |
|    success_rate         | 0           |
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 2           |
|    time_elapsed         | 536         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011504446 |
|    clip_fraction        | 0.0777      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.87     



Eval num_timesteps=100000, episode_reward=-26.97 +/- 1.83
Episode length: 27.80 +/- 15.20
Success rate: 0.00%
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 27.8        |
|    mean_reward          | -27         |
|    success_rate         | 0           |
| time/                   |             |
|    total_timesteps      | 100000      |
| train/                  |             |
|    approx_kl            | 0.006928888 |
|    clip_fraction        | 0.0502      |
|    clip_range           | 0.2         |
|    entropy_loss         | -6.33       |
|    explained_variance   | 0.00746     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.47        |
|    n_updates            | 480         |
|    policy_gradient_loss | -0.00325    |
|    std                  | 5.83        |
|    value_loss           | 3.73        |
-----------------------------------------
New best mean reward!
----------------------------

In [None]:
# -------------------------------------------------------------
# Inferensi dan evaluasi dengan custom evaluasi metrik (untuk PPO)
# -------------------------------------------------------------
import numpy as np  # Tambahkan numpy untuk analisis statistik
import gymnasium as gym
from twsbr_env.envs import TwsbrEnv  # Import environment
from stable_baselines3 import PPO

# Reload the environment (with on-screen rendering) and the trained model
env = gym.make(
    "TwsbrEnv-v0",
    render_mode="human",
    action_type="continuous",  # Action type
)

# Per-episode statistics collected during the rollout
total_rewards = []
episode_lengths = []
episode_reward = 0
episode_length = 0

try:
    model = PPO.load("twsbr_ppo")

    obs, info = env.reset()

    for _ in range(10000):  # Total number of environment steps to evaluate
        # Pick the action from the trained policy
        action, _states = model.predict(obs, deterministic=True)

        # Apply the action to the environment
        obs, reward, terminated, truncated, info = env.step(action)

        episode_reward += reward
        episode_length += 1

        # Record the finished episode and start a new one
        if terminated or truncated:
            #print(f"{episode_length} : {episode_reward} : {truncated} or {terminated} : {obs}")  # Uncomment for debugging
            total_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            obs, info = env.reset()
            episode_reward = 0
            episode_length = 0
finally:
    # Always close the environment, even if loading or stepping fails,
    # so the render window is not left open.
    env.close()

# Analyse the evaluation results. An episode still in progress when the step
# budget runs out is not counted.
if total_rewards:
    average_reward = np.mean(total_rewards)
    average_length = np.mean(episode_lengths)

    print(f"Average Reward per Episode: {average_reward:.2f}")
    print(f"Average Episode Length: {average_length:.2f}")
else:
    # np.mean([]) would emit a RuntimeWarning and return NaN; report it plainly.
    average_reward = float("nan")
    average_length = float("nan")
    print("No episode finished within the evaluation steps; no averages to report.")
