### DQN

In [1]:
import gymnasium
from stable_baselines3 import A2C, DQN
from stable_baselines3.common.env_util import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from utils.env import CogSatEnv


In [2]:
# Show the environment ID exposed by utils.env.
# NOTE(review): the ID printed here (see recorded output) is not the same as the
# 'CogSatEnv-v1' ID registered in a later cell -- confirm which one is intended.
from utils.env import env_name
print(f"Using environment: {env_name}")

Using environment: NermineCogSatEnv-v1


In [3]:

import os

# Workaround for the "duplicate OpenMP runtime" abort that can occur when
# several packages each bundle their own copy of the Intel OpenMP library.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Seed shared by env.reset() and the agents created in later cells.
seed = 42

# Single source of truth for the Gymnasium ID of this environment.
env_id = "CogSatEnv-v1"

# Register the environment so it can also be built with gymnasium.make(env_id).
# The entry point must name the module the class really lives in: CogSatEnv is
# imported from `utils.env` at the top of the notebook, so a bare `env:` module
# path would fail to import when the ID is instantiated.
gymnasium.register(
    id=env_id,
    entry_point="utils.env:CogSatEnv",
)

# Initialize the environment (direct construction; the registration above is
# only needed for gymnasium.make-based workflows).
env = CogSatEnv()

In [4]:
# Seeded reset; the returned (observation, info) pair is displayed as the cell output.
env.reset(seed=seed)  # Reset the environment with the seed

++++===== ENV RESET+++===


({'utc_time': array([1744250400], dtype=int64),
  'freq_lgs_leo': array([21.,  2., 23.,  2., 23., 23.,  8., 24.,  9., 24.])},
 {})

In [5]:
# Inspect the observation stored on the environment. The attribute is spelled
# `intial_obs` (sic); the recorded output shows the lookup succeeds with that name.
# NOTE(review): the recorded value (utc_time 0, float channel values) differs from
# what reset() returned in the previous cell -- confirm this is expected.
env.intial_obs

{'utc_time': array([0], dtype=int64),
 'freq_lgs_leo': array([21.31080281, 20.50352931, 20.88618926, 21.859923  , 21.33061667,
        20.10466071, 20.5875362 , 20.8283225 , 21.69223925, 20.23678501])}

In [6]:
# Vectorised wrapper holding the single, already-constructed environment instance.
dummy_env = DummyVecEnv(env_fns=[lambda: env])

In [None]:

# Training budget: `epoch_length` steps per epoch (value obtained experimentally,
# per the original note) repeated for `epoch_numbers` epochs.
epoch_length = 180
epoch_numbers = 100

total_steps = epoch_length * epoch_numbers

# Optional: validate that the environment follows the Gymnasium API.
check_env(env, warn=True)

# Instantiate the model.
# `seed` is forwarded so that exploration, replay sampling and network
# initialisation are reproducible, matching the seeded A2C run further down.
model = DQN(
    policy="MultiInputPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=50000,
    learning_starts=10,
    batch_size=16,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10,
    seed=seed,
    verbose=1,
)

# Train the agent.
model.learn(total_timesteps=total_steps)

# Save the trained model.
model.save("dqn_cogsat")

# The environment is intentionally NOT closed here: the evaluation cell and the
# A2C cell below reuse `env`, and the final cell of the notebook closes it.




++++===== ENV RESET+++===
++++===== ENV RESET+++===
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
Step Started
Action taken:  17
Current LEO User ID:  0
self.tIndex:  0
Current LEO Satellite ID:  583
Updated ChannelListLeo:  17.0
Next Observation:  {'utc_time': array([1744250400], dtype=int64), 'freq_lgs_leo': array([17., 11.,  4.,  2.,  9.,  2.,  9., 21., 13.,  6.])}
SINR[:,self.tIndex]:  [18.6320472  19.48774564 17.69953941 19.75407402 18.27789436 18.44491618
 19.17220831 18.07122335 18.4869541  16.84949809 18.34457316 20.14168138
 17.39988693 18.60699242 21.40963998 18.45710648 20.35807666 18.75110374
 19.78965549 19.0766706 ]
Reward:  377.2114875187826
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
++++===== ENV RESET+++===
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
Step Started
Action taken:  18
Current LEO User ID: 

In [None]:
# Evaluate the trained agent over 10 episodes and report mean/std episode reward.
# (Original note: this step is meant to be run from a separate file.)
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

In [None]:

# Training budget: `epoch_length` steps per epoch (value obtained experimentally,
# per the original note) repeated for `epoch_numbers` epochs.
epoch_length = 884
epoch_numbers = 100

# Define the total number of timesteps to train the model.
total_timesteps = epoch_length * epoch_numbers

# Save a checkpoint under ./logs/ every `epoch_length` calls to env.step().
checkpoint_callback = CheckpointCallback(
    save_freq=epoch_length,
    save_path='./logs/',
    name_prefix='rl_model_A2C',
)

# Build the A2C agent with the default multi-input policy network.
# The raw `env` is passed directly (Stable-Baselines3 vectorises a plain
# Gymnasium env itself), so no separate DummyVecEnv wrapper is created here.
# `A2C` comes from the notebook's import cell.
model = A2C(
    "MultiInputPolicy",
    env,
    ent_coef=0.01,
    verbose=1,
    tensorboard_log="./a2c_leogeo_tensorboard/",
    seed=seed,
    learning_rate=0.0001,
)

# Train the model, checkpointing along the way.
model.learn(total_timesteps=total_timesteps, callback=checkpoint_callback)

# Save the final model.
model.save("a2c_leogeoenv_1")

# Last use of the environment in this notebook: release its resources.
env.close()
