In [None]:
# %%
import os

import numpy as np
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback, CallbackList

from utils.env import CogSatEnv

# Output locations: every artifact written by this notebook lives under one directory
saved_folder = "saved_data"
os.makedirs(saved_folder, exist_ok=True)
print(f"Folder '{saved_folder}' is ready.")

# Path handed to EvalCallback as its best-model save location
best_model_path = os.path.join(saved_folder, "best_model")

# %%
class RewardLoggerCallback(BaseCallback):
    """
    Logs epoch-based mean and median training rewards.
    Optional use alongside EvalCallback.

    An epoch is ``epoch_length`` timesteps as counted by ``self.num_timesteps``.
    On every step the reward found under ``"rewards"`` in ``self.locals`` is
    buffered; whenever ``num_timesteps`` is a multiple of ``epoch_length`` the
    buffer is summarised (mean and median), stored, and cleared. When training
    ends the collected statistics are written to ``.npy`` files.

    Limitations:
      * Only the first entry of the rewards array (``rewards[0]``) is recorded,
        so with more than one sub-environment the other rewards are ignored.
      * Rewards buffered after the last completed epoch boundary are not
        included in the saved statistics.

    Parameters
    ----------
    epoch_length : int
        Number of timesteps that make up one epoch. Must be positive.
    verbose : int, default 0
        When non-zero, print the mean and median at the end of every epoch.
    save_dir : str or None, default None
        Directory the ``.npy`` files are written to. When ``None`` the
        module-level ``saved_folder`` is used, which keeps existing call sites
        working unchanged.

    Raises
    ------
    ValueError
        If ``epoch_length`` is not greater than zero.
    """
    def __init__(self, epoch_length, verbose=0, save_dir=None):
        super().__init__(verbose)
        # A zero epoch length would raise ZeroDivisionError on the first step and
        # a negative one is meaningless, so fail early with a clear message.
        if epoch_length <= 0:
            raise ValueError(
                f"epoch_length must be a positive number of timesteps, got {epoch_length!r}"
            )
        self.epoch_length = epoch_length
        self.save_dir = save_dir
        self.epoch_rewards = []          # per-epoch mean (same values as epoch_mean_rewards)
        self.epoch_mean_rewards = []     # per-epoch mean
        self.epoch_median_rewards = []   # per-epoch median
        self.epoch_all_rewards = []      # per-epoch copy of the raw reward buffer
        self.current_rewards = []        # rewards buffered since the last epoch boundary

    def _on_step(self) -> bool:
        """Buffer the current reward and close the epoch at each boundary.

        Always returns True, so this callback never requests that training stop.
        """
        rewards = self.locals.get("rewards")
        if rewards is not None:
            # Only the first sub-environment's reward is tracked.
            self.current_rewards.append(rewards[0])

        if self.num_timesteps % self.epoch_length == 0:
            if self.current_rewards:
                mean_reward = np.mean(self.current_rewards)
                median_reward = np.median(self.current_rewards)

                self.epoch_rewards.append(mean_reward)
                self.epoch_mean_rewards.append(mean_reward)
                self.epoch_median_rewards.append(median_reward)
                self.epoch_all_rewards.append(self.current_rewards.copy())

                if self.verbose:
                    print(f"[Epoch {len(self.epoch_rewards)}] Mean: {mean_reward:.2f}, Median: {median_reward:.2f}")

                self.current_rewards = []

        return True

    def _on_training_end(self):
        """Write the per-epoch statistics to ``.npy`` files in the output directory."""
        # Fall back to the notebook-level folder when no directory was supplied.
        target_dir = saved_folder if self.save_dir is None else self.save_dir
        # Make sure the directory exists so a long run is not lost at save time.
        os.makedirs(target_dir, exist_ok=True)

        np.save(os.path.join(target_dir, 'epoch_rewards.npy'), self.epoch_rewards)
        np.save(os.path.join(target_dir, 'epoch_mean_rewards.npy'), self.epoch_mean_rewards)
        np.save(os.path.join(target_dir, 'epoch_median_rewards.npy'), self.epoch_median_rewards)
        np.save(os.path.join(target_dir, 'epoch_all_rewards.npy'), self.epoch_all_rewards)
        print("Reward stats saved to disk.")


In [None]:

# %%
# Expose the custom environment through gymnasium's registry so it can be built by id
gymnasium.register(id='CogSatEnv-v1', entry_point='utils.env:CogSatEnv')

# Training environment: a single-copy vectorized env built from the registered id.
# The same seed value is reused below for the model and, offset, for evaluation.
seed = 42
env = make_vec_env("CogSatEnv-v1", seed=seed, n_envs=1)

# %%
# Training schedule: the run is expressed as a number of fixed-length epochs
epoch_length = 62
epoch_numbers = 1000
total_timesteps = epoch_numbers * epoch_length

# Callback that records mean/median training reward once per epoch
reward_logger = RewardLoggerCallback(epoch_length, verbose=1)

# Evaluation uses its own environment, seeded differently from the training one
eval_env = make_vec_env("CogSatEnv-v1", seed=seed + 100, n_envs=1)

# Evaluate once per epoch; the best model and evaluation logs go to the output folder
eval_callback = EvalCallback(
    eval_env,
    eval_freq=epoch_length,
    deterministic=True,
    render=False,
    best_model_save_path=best_model_path,
    log_path=saved_folder,
    verbose=1,
)

# %%
# Define and train model
model = A2C(
    "MultiInputPolicy",
    env,
    ent_coef=0.01,
    learning_rate=1e-5,
    seed=seed,
    verbose=1,
    tensorboard_log="./a2c_dsa_tensorboard/"
)

# Combine both callbacks so reward logging and periodic evaluation run together
callback = CallbackList([reward_logger, eval_callback])

try:
    # Train the model
    model.learn(total_timesteps=total_timesteps, callback=callback)

    # Save final model
    model.save(os.path.join(saved_folder, "a2c_cogsatenv_final"))
finally:
    # Release both environments even if training or saving fails; the
    # evaluation environment was previously never closed.
    env.close()
    eval_env.close()

# %%
# Plotting
def plot_rewards(filename, label, ylabel):
    """
    Plot a saved per-epoch reward series as a line chart.

    Parameters
    ----------
    filename : str
        Name of the ``.npy`` file inside ``saved_folder`` to load.
    label : str
        Legend entry for the curve; also embedded in the figure title.
    ylabel : str
        Text for the y-axis.
    """
    series = np.load(os.path.join(saved_folder, filename))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(series, label=label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(f'A2C Training Performance on CogSatEnv ({label})')
    ax.grid(True)
    ax.legend()

    fig.tight_layout()
    plt.show()

# One figure per saved statistic: median first, then mean
for reward_file, series_name in (
    ("epoch_median_rewards.npy", "Median Reward"),
    ("epoch_rewards.npy", "Mean Reward"),
):
    plot_rewards(reward_file, label=series_name, ylabel=series_name)
