### DQN

In [1]:
import os

import gymnasium
from stable_baselines3 import A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

from utils.env import CogSatEnv

# Tell the OpenMP runtime to tolerate a duplicate copy of itself instead of
# aborting the process.
# NOTE(review): presumably needed because a library loaded by the environment
# ships its own OpenMP runtime -- confirm which one before removing this.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Seed shared by every cell that needs reproducible randomness.
seed = 42

# Register the environment under a single ID so it can also be created with
# gymnasium.make(env_id).  The entry point is the class imported from
# `utils.env` above; the previous string form ('env:CogSatEnv') named a
# different module path than the one this notebook actually imports from.
env_id = "CogSatEnv-v1"
gymnasium.register(
    id=env_id,
    entry_point=CogSatEnv,
)

# Initialize the environment (instantiated directly, without gymnasium.make
# wrappers, exactly as before).
env = CogSatEnv()

In [2]:
# Seeded reset; the (observation, info) pair is echoed as the cell output.
initial_obs, reset_info = env.reset(seed=seed)
initial_obs, reset_info

({'utc_time': array([1744250400], dtype=int64),
  'leo_pos': array([-65.25349693, 131.19641504]),
  'geo_freq': array([1.5e+09]),
  'leo_freq': array([0.]),
  'leo_access': array([0., 0.])},
 {})

In [3]:
def _make_env():
    """Return the already-constructed environment instance."""
    return env


# DummyVecEnv expects a list of zero-argument factories, one per sub-environment.
dummy_env = DummyVecEnv([_make_env])

In [None]:

epoch_length = 884  # value obtained experimentally
epoch_numbers = 100

# Number of environment steps for this DQN run.
# NOTE(review): 180 is smaller than epoch_length -- looks like a smoke-test
# budget; confirm whether epoch_length * epoch_numbers was intended.
dqn_total_timesteps = 180

# Optional: validate that the environment follows the Gymnasium API
check_env(env, warn=True)

# Instantiate the model.  `seed` is forwarded so the run is reproducible,
# matching how the A2C model in this notebook is constructed.
model = DQN(
    policy="MultiInputPolicy",
    env=env,
    learning_rate=1e-3,
    buffer_size=50000,
    learning_starts=10,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10,
    verbose=1,
    seed=seed,
)

# Train the agent
model.learn(total_timesteps=dqn_total_timesteps)
# TODO: measure performance of training
# Save the model
model.save("dqn_cogsat")


# TODO: run the evaluation in a separate file
# Evaluate the agent on the environment the model itself holds (the
# Monitor-wrapped vectorised env created by the constructor above), so episode
# statistics come from the Monitor wrapper rather than the raw environment.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")


Action taken:  7
Reward:  0.0
Action taken:  6
Reward:  0.0
Action taken:  1
Reward:  0.0
Action taken:  2
Reward:  0.0
Action taken:  4
Reward:  0.0
Action taken:  1
Reward:  0.0
Action taken:  5
Reward:  0.0
Action taken:  8
Reward:  -31.70839275978682
Action taken:  4
Reward:  -31.434871430345908
Action taken:  7
Reward:  -31.278070443890726
Action taken:  4
Reward:  -30.974284558562857
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Action taken:  4
Reward:  0.0
Action taken:  7
Reward:  0.0
Action taken:  1
Reward:  0.0
Action taken:  2
Reward:  0.0
Action taken:  9
Reward:  0.0
Action taken:  1
Reward:  0.0
Action taken:  9
Reward:  -31.728637861795193
Action taken:  2
Reward:  -31.394433600868865
Action taken:  6
Reward:  -31.25785057343643
Action taken:  2
Reward:  -30.933912372121824
Action taken:  5
Reward:  -30.724831052674205
Action taken:  8
Reward:  -30.48835320359086
Action taken:  9
Reward:  -30.183507302314325
Action taken

In [None]:

# Vectorised wrapper around the environment.  Note that it is not passed to the
# model below, which receives the raw `env`.
dummy_env = DummyVecEnv([lambda: env])

epoch_length = 884  # value obtained experimentally
epoch_numbers = 100

# Total number of timesteps to train the model for
total_timesteps = epoch_length * epoch_numbers

# Checkpoint callback: writes a snapshot every `epoch_length` steps to ./logs/
checkpoint_callback = CheckpointCallback(
    save_freq=epoch_length,
    save_path='./logs/',
    name_prefix='rl_model_A2C',
)

# Policy network architecture: the default MultiInputPolicy
model = A2C(
    "MultiInputPolicy",
    env,
    ent_coef=0.01,
    verbose=1,
    tensorboard_log="./a2c_leogeo_tensorboard/",
    seed=seed,
    learning_rate=0.0001,
)

try:
    # Train the model
    model.learn(total_timesteps=total_timesteps, callback=checkpoint_callback)

    # Save the model
    model.save("a2c_leogeoenv_1")
finally:
    # Release the environment even if training or saving raises or is
    # interrupted from the notebook.
    env.close()
