### DQN

In [1]:
import gymnasium
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from utils.env import CogSatEnv

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# set the seed
seed = 42

gymnasium.register(
    id='CogSatEnv-v1',  # Use the same ID here as you used in the script
    entry_point='env:CogSatEnv',
)

# Initialize the environment
env_id = "CogSatEnv-v1"
env = CogSatEnv()

In [3]:
env.LeoChannels

25.0

In [None]:
env.reset(seed=seed)  # Reset the environment with the seed

++++===== ENV RESET+++===


({'utc_time': array([1744250400], dtype=int64),
  'leo_pos': array([-65.25349693, 131.19641504]),
  'geo_freq': array([1.5e+09]),
  'leo_freq': array([0.]),
  'leo_access': array([0., 0.])},
 {})

In [3]:
dummy_env = DummyVecEnv([lambda: env])  # Wrap the environment with DummyVecEnv

In [None]:

epoch_length = 180 ## got through experiment
epoch_numbers = 100

total_steps = epoch_length * epoch_numbers

# Optional: Check the environment
check_env(env, warn=True)

# Instantiate the model
model = DQN(
    policy="MultiInputPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=50000,
    learning_starts=10,
    batch_size=16,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10,
    verbose=1
)

# Train the agent
model.learn(total_timesteps=total_steps)
# measure perofmance of training
# Save the model
model.save("dqn_cogsat")
env.close()




++++===== ENV RESET+++===
++++===== ENV RESET+++===
Action taken:  1
Step Scenario 2.0
Reward:  -32.4115512468957
++++===== ENV RESET+++===
Action taken:  7
Step Scenario 2.0
Reward:  -32.4115512468957
Action taken:  4
Step Scenario 3.0
Reward:  -32.4115512468957
Action taken:  3
Step Scenario 4.0
Reward:  -32.4115512468957
Action taken:  3
Step Scenario 5.0
Reward:  -32.4115512468957
Action taken:  8
Step Scenario 6.0
Reward:  -32.4115512468957
Action taken:  1
Step Scenario 7.0
Reward:  -32.4115512468957
Action taken:  8
Step Scenario 8.0
Reward:  -31.70839275978682
Action taken:  2
Step Scenario 9.0
Reward:  -31.394433600868865
Action taken:  8
Step Scenario 10.0
Reward:  -31.298294162502515
Action taken:  1
Step Scenario 11.0
Reward:  -30.91373204662449
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
++++===== ENV RESET+++===
Action taken:  7
Step Scenario 2.0
Reward:  -32.4115512468957
Action taken:  1
Step Scenario 3.0
Reward:  -32.4

In [5]:
# Run it in sepratae file
# Evaluate the agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")



RejectedExecutionError: MATLAB has already terminated

In [None]:

dummy_env = DummyVecEnv([lambda: env])  # Wrap the environment with DummyVecEnv

epoch_length = 884 ## got through experiment
epoch_numbers = 100

# Set up the checkpoint callback
checkpoint_callback = CheckpointCallback(save_freq=epoch_length, save_path='./logs/', name_prefix='rl_model_A2C')

# Specify the policy network architecture, here we are using the default MIP
model = A2C("MultiInputPolicy", env, ent_coef=0.01, verbose=1, tensorboard_log="./a2c_leogeo_tensorboard/",
            seed=seed, learning_rate=0.0001)

# Define the total number of timesteps to train the model
total_timesteps = epoch_length*epoch_numbers

# Train the model
model.learn(total_timesteps=total_timesteps, callback=checkpoint_callback)

# Save the model
model.save("a2c_leogeoenv_1")

env.close()
