In [None]:
import os
import gymnasium
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback, CallbackList
from utils.env import CogSatEnv

In [None]:
# %% utils/callbacks.py
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback

class RewardLoggerCallback(BaseCallback):
    """Collect per-step rewards and summarise them in fixed-size epochs.

    An epoch is a window that closes whenever ``self.num_timesteps`` is a
    multiple of ``epoch_length``. On each step the first entry of the
    ``rewards`` value found in ``self.locals`` is recorded (presumably the
    reward of the first sub-environment of a vectorized env -- confirm if
    more than one env is used). When a window closes, its mean, its median
    and a copy of its raw rewards are appended to the history lists.

    Rewards collected after the last closed window are kept in
    ``current_rewards`` and are not part of the saved files.

    Args:
        epoch_length: Number of timesteps that make up one epoch.
        verbose: Verbosity level forwarded to ``BaseCallback``.
        save_dir: Directory the ``.npy`` files are written to when training
            ends. Defaults to the current working directory, which matches
            the file locations used before this parameter existed.
    """

    def __init__(self, epoch_length: int, verbose: int = 0, save_dir: str = "."):
        super().__init__(verbose)
        self.epoch_length = epoch_length
        self.save_dir = save_dir
        # ``epoch_rewards`` and ``epoch_mean_rewards`` hold the same values;
        # both are kept because both are written to disk under their own name.
        self.epoch_rewards = []
        self.epoch_mean_rewards = []
        self.epoch_median_rewards = []
        # One list of raw rewards per closed epoch.
        self.epoch_all_rewards = []
        # Rewards of the epoch that is currently being filled.
        self.current_rewards = []

    def _on_step(self) -> bool:
        """Record the current reward and close the epoch when it is full.

        Returns:
            Always ``True``, so this callback never requests an early stop.
        """
        rewards = self.locals.get("rewards")
        if rewards is not None:
            self.current_rewards.append(rewards[0])

        # Every epoch_length steps, calculate the statistics and reset.
        epoch_finished = self.num_timesteps % self.epoch_length == 0
        if epoch_finished and self.current_rewards:
            mean_reward = np.mean(self.current_rewards)
            median_reward = np.median(self.current_rewards)
            self.epoch_rewards.append(mean_reward)
            self.epoch_mean_rewards.append(mean_reward)
            self.epoch_median_rewards.append(median_reward)
            self.epoch_all_rewards.append(self.current_rewards.copy())
            self.current_rewards = []

        return True

    def _on_training_end(self) -> None:
        """Write the collected epoch statistics to ``.npy`` files in ``save_dir``."""
        import os  # local import keeps the class usable as a standalone module

        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)

        def _target(filename):
            return os.path.join(self.save_dir, filename)

        np.save(_target("epoch_rewards.npy"), self.epoch_rewards)
        np.save(_target("epoch_mean_rewards.npy"), self.epoch_mean_rewards)
        np.save(_target("epoch_median_rewards.npy"), self.epoch_median_rewards)

        # Epochs normally all have the same length and are stored as a regular
        # 2-D array. If they differ, NumPy >= 1.24 refuses to build an array
        # from the nested lists unless dtype=object is requested, so fall back
        # to an object array (loading it requires allow_pickle=True).
        epoch_sizes = {len(rewards) for rewards in self.epoch_all_rewards}
        if len(epoch_sizes) > 1:
            all_rewards = np.array(self.epoch_all_rewards, dtype=object)
        else:
            all_rewards = self.epoch_all_rewards
        np.save(_target("epoch_all_rewards.npy"), all_rewards)


In [None]:



# Allow duplicate OpenMP libs (only needed in some environments)
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Set random seed for reproducibility
seed = 42

# Environment ID, used both for registration and for building the vec envs
env_id = "CogSatEnv-v1"

# Register the custom environment
gymnasium.register(
    id=env_id,
    entry_point='utils.env:CogSatEnv',
)

# Training configuration
epoch_length = 62
epoch_numbers = 500
total_timesteps = epoch_length * epoch_numbers

# The environments are created inside the try block and closed in ``finally``
# so they are released even if training fails or the cell is interrupted.
train_env = None
eval_env = None
try:
    train_env = make_vec_env(env_id, n_envs=1, seed=seed)
    eval_env = make_vec_env(env_id, n_envs=1, seed=seed + 1)  # separate eval env

    # Define callbacks
    reward_logger = RewardLoggerCallback(epoch_length=epoch_length)
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path="./logs/best_model/",
        log_path="./logs/results/",
        eval_freq=epoch_length,  # Evaluate once every epoch
        deterministic=True,
        render=False
    )
    callback = CallbackList([reward_logger, eval_callback])

    # Initialize DQN model with multi-input policy
    # NOTE(review): the tensorboard directory is named "a2c_..." although the
    # model is a DQN; the path is kept unchanged so existing logs stay together.
    model = DQN(
        policy="MultiInputPolicy",
        env=train_env,
        verbose=1,
        tensorboard_log="./a2c_dsa_tensorboard/",
        seed=seed,
        learning_rate=0.0001
    )

    # Train the model
    model.learn(total_timesteps=total_timesteps, callback=callback)

    # Save the final model (note: best model saved separately via EvalCallback)
    model.save("models/final_dqn_cogsatenv")
finally:
    # Cleanup
    for env in (train_env, eval_env):
        if env is not None:
            env.close()
