1. Rainbow and Ape-X Experiments
    2. We release a set of hyperparameters for CartPole-v1, Classic Control, and Atari
    2. We release code for Rainbow that can train X steps in Y minutes on a Mac M2 Chip
    3. We also release a version of Ape-X as described in the original paper, and an Ape-X with rainbow
        1. Compare results of each 
        2. Compare Ape-X with different Rainbow components added or removed
    4. We compare the DQN variants, as described in their original papers, to Rainbow; the individual components to Rainbow; and Rainbow with individual components removed
    10. Compare rainbow training speeds with different levels of numerical precision and datatypes
        1. Mixed precision using torch.amp 
        2. Lower matmul precision
            1. comparing medium, high, and highest 
            2. https://pytorch.org/docs/master/generated/torch.set_float32_matmul_precision.html?highlight=precision#torch.set_float32_matmul_precision
    11. Ape-X hyperparameter sweep and sensitivities
    12. Exploration methods for Rainbow Ape-X
        1. Just noisy nets (same for all actors)
        2. Noisy nets and varying epsilon 
        3. Adding a constant that changes variance of noisy nets for action selection
        4. AlphaStar Agents

## Rainbow on CartPole-v1
Hyperparameters are based on the hyperopt experiments (quantized trial 27), with some minor changes.

In [None]:
import gymnasium as gym
import sys

from utils import CategoricalCrossentropy, KLDivergence
sys.path.append('../..')
from dqn.rainbow.rainbow_agent import RainbowAgent
from agent_configs import RainbowConfig
from game_configs import CartPoleConfig

# Train a Rainbow agent on CartPole-v1: build the environment, assemble the
# hyperparameter dict, wrap it in a RainbowConfig, then construct and train
# the agent.
env = gym.make("CartPole-v1")

# Hyperparameters handed to RainbowConfig. The keys are looked up by
# RainbowConfig (not visible in this file); the per-key notes below are
# inferred from the key names and should be confirmed against RainbowConfig.
config_dict = {
  "dense_layers_widths": [128, 128],
  "value_hidden_layers_widths": [64, 64],
  # NOTE(review): key is spelled "advatage" (sic). Left unchanged because
  # RainbowConfig presumably reads it under this exact spelling — confirm
  # before renaming.
  "advatage_hidden_layers_widths": [64, 64],
  "adam_epsilon": 0.00375,
  "learning_rate": 0.005,
  "training_steps": 10000,
  # per_* presumably configure prioritized experience replay — confirm.
  "per_epsilon": 0.05,
  "per_alpha": 0.8,
  "per_beta": 0.45,
  "minibatch_size": 128,
  "replay_buffer_size": 10000,
  "min_replay_buffer_size": 1250,
  "transfer_interval": 10,
  "n_step": 9,
  "kernel_initializer": "glorot_uniform",
  "loss_function": KLDivergence(), # CategoricalCrossentropy (imported above) is the alternative
  "clipnorm": 2.0,
  "discount_factor": 0.99,
  "atom_size": 81,
  "replay_interval": 4,
}
game_config = CartPoleConfig()
config = RainbowConfig(config_dict, game_config)
agent = RainbowAgent(env, config, name="Rainbow_CartPole-v1")

# Debug aid: dump every parameter of the freshly built model before training.
for param in agent.model.parameters():
  print(param)
print("start")
agent.train()

## Rainbow on Classic Control
Hyperparameters come from the "Revisiting Rainbow" paper.

In [None]:
import gymnasium as gym
import sys

from utils import CategoricalCrossentropy, KLDivergence
sys.path.append('../..')
from dqn.rainbow.rainbow_agent import RainbowAgent
from agent_configs import RainbowConfig
from game_configs import ClassicControlConfig

# Shared Rainbow hyperparameters for the Classic Control environments below.
# The keys are looked up by RainbowConfig (not visible in this file); confirm
# their meanings there. The bare trailing "#" markers were left by the author
# and their meaning is not recorded here (possibly values that were chosen
# rather than copied from the reference — confirm).
config_dict = {
  "dense_layers_widths": [512, 512],
  "value_hidden_layers_widths": [], # 
  # NOTE(review): key is spelled "advatage" (sic). Left unchanged because
  # RainbowConfig presumably reads it under this exact spelling — confirm
  # before renaming.
  "advatage_hidden_layers_widths": [], # 
  "adam_epsilon": 3.125e-4,
  "learning_rate": 0.001,
  "training_steps": 30000, #
  "per_epsilon": 1e-6, # 
  "per_alpha": 0.5, # 
  "per_beta": 0.4, # 
  "minibatch_size": 128,
  "replay_buffer_size": 50000,
  "min_replay_buffer_size": 500,
  "transfer_interval": 100,
  "n_step": 3,
  "kernel_initializer": "orthogonal", #
  "loss_function": KLDivergence(), # 
  "clipnorm": 2.0, # 
  "discount_factor": 0.99,
  "atom_size": 51,
  "replay_interval": 2,
}
game_config = ClassicControlConfig()
config = RainbowConfig(config_dict, game_config)

# One stanza per environment; only one is meant to be active at a time.
# Each disabled stanza overrides game_config.v_min / game_config.v_max
# (presumably the support bounds of the value distribution — confirm) before
# building its environment and agent.

# game_config.v_min = 0
# game_config.v_max = 500
# env = gym.make("CartPole-v1", render_mode="rgb_array")
# agent = RainbowAgent(env, config, name="Rainbow_ClassicControl_CartPole-v1")
# agent.train()

# game_config.v_min = -500
# game_config.v_max = 0
# env = gym.make("Acrobot-v1", render_mode="rgb_array")
# agent = RainbowAgent(env, config, name="Rainbow_ClassicControl_Acrobot-v1")
# agent.train()

# game_config.v_min = -200
# game_config.v_max = 200
# env = gym.make("LunarLander-v2", render_mode="rgb_array")
# agent = RainbowAgent(env, config, name="Rainbow_ClassicControl_LunarLander-v2")
# agent.train()

# Active run: MountainCar-v0.
# NOTE(review): unlike the stanzas above, the v_min / v_max overrides are
# commented out here, so this run uses whatever ClassicControlConfig provides
# — confirm that is intended.
# game_config.v_min = -200
# game_config.v_max = -100
env = gym.make("MountainCar-v0", render_mode="rgb_array")
agent = RainbowAgent(env, config, name="Rainbow_ClassicControl_MountainCar-v0")
agent.train()

## Rainbow on Atari

In [1]:
import gymnasium as gym
import sys

from utils import CategoricalCrossentropy, KLDivergence
sys.path.append('../..')
from dqn.rainbow.rainbow_agent import RainbowAgent
from agent_configs import RainbowConfig
from game_configs import AtariConfig
from gymnasium.wrappers import AtariPreprocessing, FrameStack
import numpy as np

# Rainbow hyperparameters for the Atari run. The keys are looked up by
# RainbowConfig (not visible in this file); confirm their meanings there. The
# bare trailing "#" markers were left by the author and their meaning is not
# recorded here.
config_dict = {
  # Three convolutional layers; each tuple is presumably
  # (filters, kernel_size, stride) — confirm against the model builder.
  "conv_layers": [
      (32, 8, 4),
      (64, 4, 2),
      (64, 3, 1),
  ],
  "dense_layers_widths": [512], 
  "value_hidden_layers_widths": [], # 
  # NOTE(review): key is spelled "advatage" (sic). Left unchanged because
  # RainbowConfig presumably reads it under this exact spelling — confirm
  # before renaming.
  "advatage_hidden_layers_widths": [], # 
  "adam_epsilon": 1.5e-4,
  "learning_rate": 0.00025/4, # evaluates to 6.25e-5
  "training_steps": 50000000, # Agent saw 200,000,000 frames (4 frames per step — presumably the frame skip; confirm)
  "per_epsilon": 1e-6, # 
  "per_alpha": 0.5, 
  "per_beta": 0.4, 
  "minibatch_size": 32,
  "replay_buffer_size": 1000000,
  "min_replay_buffer_size": 80000,
  "transfer_interval": 32000,
  "n_step": 3,
  "kernel_initializer": "orthogonal", #
  "loss_function": KLDivergence(),
  # NOTE(review): 0.0 here versus 2.0 in the other cells' configs —
  # presumably disables gradient-norm clipping; confirm how 0 is handled.
  "clipnorm": 0.0, # 
  "discount_factor": 0.99,
  "atom_size": 51,
  "replay_interval": 4,
}
game_config = AtariConfig()
config = RainbowConfig(config_dict, game_config)

class ClipReward(gym.RewardWrapper):
    """Reward wrapper that limits every reward to ``[min_reward, max_reward]``."""

    def __init__(self, env, min_reward, max_reward):
        """Wrap ``env`` and store the clipping bounds.

        Args:
            env: Environment to wrap.
            min_reward: Lower bound applied to each reward.
            max_reward: Upper bound applied to each reward.
        """
        super().__init__(env)
        bounds = (min_reward, max_reward)
        self.min_reward, self.max_reward = bounds
        # Advertise the clipped range on the wrapper itself.
        self.reward_range = bounds

    def reward(self, reward):
        """Return ``reward`` clipped to the configured bounds via ``np.clip``."""
        lower, upper = self.min_reward, self.max_reward
        return np.clip(reward, lower, upper)
# Build the Ms. Pac-Man environment, apply Atari preprocessing and 4-frame
# stacking, then create the Rainbow agent and start training.
# NOTE(review): ClipReward is defined in this cell but is never applied to
# `env`, so rewards are not clipped by a wrapper here — confirm whether
# wrapping env with ClipReward was intended or the agent handles it.
# max_episode_steps=108000 caps the episode length as passed to gym.make
# (presumably counted in raw frames, before frame skipping — confirm).
env = gym.make("MsPacmanNoFrameskip-v4", render_mode="rgb_array", max_episode_steps=108000)
# Only terminal_on_life_loss is overridden; every other preprocessing option
# keeps the wrapper's default.
env = AtariPreprocessing(env, terminal_on_life_loss=True)
env = FrameStack(env, 4)
agent = RainbowAgent(env, config, name="Rainbow_Atari_MsPacmanNoFrameskip-v4")
# Override the agent's checkpoint_interval attribute before training starts
# (units are defined by RainbowAgent, not visible here).
agent.checkpoint_interval = 1000
agent.train()

filling replay buffer 57377
filling replay buffer 57378
filling replay buffer 57379
filling replay buffer 57380
filling replay buffer 57381
filling replay buffer 57382
filling replay buffer 57383
filling replay buffer 57384
filling replay buffer 57385
filling replay buffer 57386
filling replay buffer 57387
filling replay buffer 57388
filling replay buffer 57389
filling replay buffer 57390
filling replay buffer 57391
filling replay buffer 57392
filling replay buffer 57393
filling replay buffer 57394
filling replay buffer 57395
filling replay buffer 57396
filling replay buffer 57397
filling replay buffer 57398
filling replay buffer 57399
filling replay buffer 57400
filling replay buffer 57401
filling replay buffer 57402
filling replay buffer 57403
filling replay buffer 57404
filling replay buffer 57405
filling replay buffer 57406
filling replay buffer 57407
filling replay buffer 57408
filling replay buffer 57409
filling replay buffer 57410
filling replay buffer 57411
filling replay buffe

KeyboardInterrupt: 

In [None]:
# Manually write a checkpoint for the current agent. All four positional
# arguments are 0; their meanings are defined by the agent's save_checkpoint
# method, which is not visible in this file.
agent.save_checkpoint(0, 0, 0, 0)

: 