In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback

# %% utils/callbacks.py
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback

class RewardLoggerCallback(BaseCallback):
    def __init__(self, epoch_length, verbose=0):
        super().__init__(verbose)
        self.epoch_length = epoch_length
        self.epoch_rewards = []
        self.epoch_mean_rewards = []
        self.epoch_median_rewards = []
        self.epoch_all_rewards = []
        self.current_rewards = []

    def _on_step(self) -> bool:
        if self.locals.get("rewards") is not None:
            self.current_rewards.append(self.locals["rewards"][0])
        
        # Every epoch_length steps, calculate mean and reset
        if self.num_timesteps % self.epoch_length == 0:
            if self.current_rewards:
                mean_reward = np.mean(self.current_rewards)
                median_reward = np.median(self.current_rewards)
                self.epoch_rewards.append(mean_reward)
                self.epoch_mean_rewards.append(mean_reward)
                self.epoch_median_rewards.append(median_reward)
                self.epoch_all_rewards.append(self.current_rewards.copy())
                self.current_rewards = []

        return True

    def _on_training_end(self):
        # Save rewards to file (optional)
        np.save("epoch_rewards.npy", self.epoch_rewards)
        np.save("epoch_mean_rewards.npy", self.epoch_mean_rewards)
        np.save("epoch_median_rewards.npy", self.epoch_median_rewards)
        np.save("epoch_all_rewards.npy", self.epoch_all_rewards)


# List of environments
env_ids = ['CartPole-v1', 'MountainCar-v0']
timesteps = 50000
eval_freq = 5000
n_eval_episodes = 5

# Store results
all_results = {}

for env_id in env_ids:
    print(f"\n=== Training on {env_id} ===")
    train_env = gym.make(env_id)
    eval_env = gym.make(env_id)

    model = DQN("MlpPolicy", train_env, verbose=0, learning_rate=1e-3)
    
    reward_logger = RewardLoggerCallback(epoch_length=timesteps)
    
    model.learn(total_timesteps=timesteps, callback=reward_logger)
    
    all_results[env_id] = reward_logger.epoch_rewards

    train_env.close()
    eval_env.close()



=== Training on CartPole-v1 ===

=== Training on MountainCar-v0 ===


In [2]:
all_results

{'CartPole-v1': [9.0, 9.0, 10.0, 10.0, 12.0, 26.0, 358.0, 187.0, 240.0, 207.0],
 'MountainCar-v0': [-200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0,
  -200.0]}