### DQN

In [None]:
import gymnasium
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from utils.env import CogSatEnv


In [None]:
# Print the `env_name` value exported by utils.env so the environment in use
# is visible in the notebook output.
from utils.env import env_name
print(f"Using environment: {env_name}")

In [None]:

import os

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several numeric libraries are loaded in one
# process — confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Seed for reproducibility; used when resetting the environment.
seed = 42

# Register the environment under a Gymnasium ID (use the same ID everywhere
# the environment is referred to by name).  The entry point names the module
# the class is actually imported from in this notebook (utils.env), so that
# gymnasium.make(env_id) resolves to the same class that is instantiated
# below.
env_id = "CogSatEnv-v1"
gymnasium.register(
    id=env_id,
    entry_point="utils.env:CogSatEnv",
)

# Instantiate the environment directly; the registration above only matters
# for code that creates the environment through gymnasium.make.
env = CogSatEnv()

In [None]:
# %% utils/callbacks.py
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback

class RewardLoggerCallback(BaseCallback):
    """Collect per-step rewards during training and summarise them per epoch.

    An epoch is a window that closes whenever ``num_timesteps`` is a multiple
    of ``epoch_length``.  For each closed window the mean, the median and the
    raw rewards are stored.  Rewards gathered after the last closed window are
    not summarised.  When training ends, every series is written to its own
    ``.npy`` file in the current working directory.
    """

    def __init__(self, epoch_length, verbose=0):
        """Store the epoch size and create the empty reward series.

        Args:
            epoch_length: Number of timesteps that make up one epoch.
            verbose: Verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        self.epoch_length = epoch_length
        # Rewards seen since the last epoch boundary.
        self.current_rewards = []
        # One entry per completed epoch.
        self.epoch_rewards = []  # mean reward (same values as epoch_mean_rewards)
        self.epoch_mean_rewards = []
        self.epoch_median_rewards = []
        self.epoch_all_rewards = []  # raw rewards of each epoch

    def _on_step(self) -> bool:
        """Record the latest reward and close the epoch when it is complete.

        Returns:
            Always ``True``, so training is never interrupted by this callback.
        """
        step_rewards = self.locals.get("rewards")
        if step_rewards is not None:
            # Only the first entry of the reward vector is recorded.
            self.current_rewards.append(step_rewards[0])

        at_epoch_boundary = self.num_timesteps % self.epoch_length == 0
        if not at_epoch_boundary or not self.current_rewards:
            return True

        # Summarise the finished epoch, then start collecting the next one.
        epoch_mean = np.mean(self.current_rewards)
        epoch_median = np.median(self.current_rewards)
        self.epoch_rewards.append(epoch_mean)
        self.epoch_mean_rewards.append(epoch_mean)
        self.epoch_median_rewards.append(epoch_median)
        self.epoch_all_rewards.append(list(self.current_rewards))
        self.current_rewards = []
        return True

    def _on_training_end(self):
        """Write every per-epoch series to its own ``.npy`` file."""
        outputs = {
            "epoch_rewards.npy": self.epoch_rewards,
            "epoch_mean_rewards.npy": self.epoch_mean_rewards,
            "epoch_median_rewards.npy": self.epoch_median_rewards,
            "epoch_all_rewards.npy": self.epoch_all_rewards,
        }
        for filename, series in outputs.items():
            np.save(filename, series)


In [None]:
# Reset the environment with the notebook seed; the value returned by reset()
# is displayed as this cell's output.
env.reset(seed=seed)

In [None]:
# Display the environment's stored initial observation.
# NOTE(review): the attribute is spelled "intial_obs" (sic) — presumably this
# matches the name defined in CogSatEnv; confirm before renaming either side.
env.intial_obs

In [None]:
# Wrap the single environment instance in a DummyVecEnv.
# NOTE(review): `dummy_env` is not referenced by the other cells shown here —
# the model and evaluate_policy both receive `env` directly; confirm whether
# this wrapper is still needed.
dummy_env = DummyVecEnv([lambda: env])

In [None]:

# Training budget: `epoch_numbers` logging epochs of `epoch_length` steps each.
epoch_length = 62  # value obtained through experiment
epoch_numbers = 500

# For a quick smoke test, shrink the budget instead, e.g.:
# epoch_length = 5
# epoch_numbers = 5

total_steps = epoch_length * epoch_numbers

# Optional: check that the environment follows the interface SB3 expects.
check_env(env, warn=True)

# Instantiate the model.  `seed` (defined in the environment-setup cell) is
# passed so that network initialisation and exploration are seeded too;
# seeding only env.reset() leaves the agent itself non-reproducible.
model = DQN(
    policy="MultiInputPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=50000,
    learning_starts=10,
    batch_size=16,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10,
    seed=seed,
    verbose=1,
)

# Instantiate the callback that aggregates rewards per epoch.
reward_logger = RewardLoggerCallback(epoch_length=epoch_length)

# Train the agent.
model.learn(total_timesteps=total_steps, callback=reward_logger)

# Save the per-epoch mean rewards for plotting.  The callback writes the same
# file when training ends; saving here as well keeps this cell's output
# independent of that.
rewards = reward_logger.epoch_rewards
np.save("epoch_rewards.npy", rewards)

# measure performance of training
# Save the model.
model.save("dqn_cogsat")
# The environment is deliberately left open here: it is reused by the
# evaluation cell and closed in the final cell.




In [None]:
# %% Plotting
import matplotlib.pyplot as plt
import numpy as np

# Load the per-epoch mean rewards saved during training and plot them.
rewards = np.load("epoch_rewards.npy")

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(rewards, label='Mean Reward per Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean Reward')
ax.set_title('DQN Training Performance on CogSatEnv')
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# %% Plotting
import matplotlib.pyplot as plt
import numpy as np

# Load the per-epoch median rewards saved during training and plot them.
rewards = np.load("epoch_median_rewards.npy")

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(rewards, label='Median Reward per Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Median Reward')
ax.set_title('DQN Training Performance on CogSatEnv')
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Intended to be run from a separate file.
# Evaluate the trained agent over 10 episodes and print the mean and standard
# deviation that evaluate_policy returns.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

In [None]:
# Close the environment once training and evaluation are finished.
env.close()