In [None]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

# Baseline run: train DQN on unmodified CartPole-v1, evaluate it, then save it.
BASELINE_DQN_KWARGS = {
    "learning_rate": 5e-4,
    "buffer_size": 200000,
    "learning_starts": 5000,
    "batch_size": 64,
    "gamma": 0.99,
    "train_freq": 4,
    "target_update_interval": 1000,
    "exploration_fraction": 0.4,
    "exploration_final_eps": 0.05,
    "verbose": 1,
}

env = Monitor(gym.make("CartPole-v1"))
model = DQN("MlpPolicy", env, **BASELINE_DQN_KWARGS)

model.learn(total_timesteps=400000)

# Evaluation reuses the same Monitor-wrapped env the model was trained on.
eval_stats = evaluate_policy(model, env, n_eval_episodes=20)
mean_reward, std_reward = eval_stats
print("Baseline eval:", mean_reward, "+/-", std_reward)

model.save("dqn_cartpole_baseline")
env.close()




In [None]:
import numpy as np
import gymnasium as gym
from collections import deque

class ObsNoise(gym.ObservationWrapper):
    """Observation wrapper that adds zero-mean Gaussian noise to every observation.

    Args:
        env: Environment to wrap.
        sigma: Standard deviation of the noise added to each observation element.
    """

    def __init__(self, env, sigma=0.02):
        super().__init__(env)
        self.sigma = sigma

    def observation(self, obs):
        """Return ``obs`` with i.i.d. N(0, sigma**2) noise added to each element.

        Floating-point observations keep their original dtype.
        """
        noise = np.random.normal(0, self.sigma, size=obs.shape)
        noisy = obs + noise
        # np.random.normal produces float64, so e.g. a float32 observation would
        # be promoted to float64 by the addition. Cast back so the wrapper does
        # not change the observation dtype. Non-float observations are left as
        # the promoted result so the noise is not truncated away.
        if np.issubdtype(obs.dtype, np.floating):
            noisy = noisy.astype(obs.dtype, copy=False)
        return noisy

class ActionDelay(gym.Wrapper):
    """Wrapper that executes each action ``delay_steps`` steps after it was issued.

    The first ``delay_steps`` steps of every episode execute action ``0``, the
    placeholder used to pre-fill the queue.

    Args:
        env: Environment to wrap.
        delay_steps: Non-negative number of steps an action is held back;
            ``0`` passes actions straight through.

    Raises:
        ValueError: If ``delay_steps`` is negative.
    """

    def __init__(self, env, delay_steps=2):
        super().__init__(env)
        if delay_steps < 0:
            raise ValueError(f"delay_steps must be >= 0, got {delay_steps}")
        self.delay_steps = delay_steps
        self.queue = self._fresh_queue()

    def _fresh_queue(self):
        """Build a queue pre-filled with ``delay_steps`` placeholder actions."""
        # Capacity is one more than the delay: once the incoming action has been
        # appended, index 0 holds the action issued exactly ``delay_steps`` steps
        # earlier. A capacity equal to the delay would evict that action during
        # the append (shortening the delay by one) and, for a delay of 0, would
        # leave nothing to read at index 0.
        return deque([0] * self.delay_steps, maxlen=self.delay_steps + 1)

    def reset(self, **kwargs):
        """Reset the wrapped env and refill the queue with placeholder actions."""
        obs, info = self.env.reset(**kwargs)
        self.queue = self._fresh_queue()
        return obs, info

    def step(self, action):
        """Queue ``action`` and step the wrapped env with the delayed action."""
        self.queue.append(action)
        delayed_action = self.queue[0]
        return self.env.step(delayed_action)
def make_noisy_delayed_env(sigma=0.02, delay_steps=2):
    """Create CartPole-v1 wrapped in ObsNoise and then ActionDelay (outermost).

    Args:
        sigma: Passed to ObsNoise as its ``sigma``.
        delay_steps: Passed to ActionDelay as its ``delay_steps``.
    """
    noisy_env = ObsNoise(gym.make("CartPole-v1"), sigma=sigma)
    return ActionDelay(noisy_env, delay_steps=delay_steps)


In [None]:
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym

# Compare the saved baseline on the unmodified task and on a noisy, delayed variant.
# Both envs are wrapped in Monitor, like the env used for training, so that
# evaluate_policy reads episode statistics from Monitor instead of warning
# about an unmonitored environment.
clean_env = Monitor(gym.make("CartPole-v1"))
hard_env = Monitor(make_noisy_delayed_env(sigma=0.03, delay_steps=2))

try:
    baseline = DQN.load("dqn_cartpole_baseline", env=clean_env)

    print("Baseline on clean:", evaluate_policy(baseline, clean_env, n_eval_episodes=20))
    print("Baseline on hard :", evaluate_policy(baseline, hard_env, n_eval_episodes=20))
finally:
    # Release both environments even if loading or evaluation fails.
    clean_env.close()
    hard_env.close()


In [None]:
import random
import gymnasium as gym

class DomainRandomize(gym.Wrapper):
    """Wrapper that rebuilds its inner env with newly sampled noise/delay on every reset.

    Args:
        env_fn: Zero-argument callable that returns a *fresh* base environment.
        sigma_range: ``(low, high)`` bounds for the uniformly sampled ObsNoise sigma.
        delay_range: ``(low, high)`` inclusive integer bounds for the sampled
            action delay.
    """

    def __init__(self, env_fn, sigma_range=(0.0, 0.05), delay_range=(0, 2)):
        # env_fn should make a *fresh* base env
        self.env_fn = env_fn
        self.sigma_range = sigma_range
        self.delay_range = delay_range
        super().__init__(env_fn())

    def reset(self, **kwargs):
        """Close the current env, build a newly randomized one, and reset it.

        ``kwargs`` are forwarded unchanged to the new environment's ``reset``.
        """
        # swap env each episode
        self.env.close()
        sigma = random.uniform(*self.sigma_range)
        delay = random.randint(*self.delay_range)  # inclusive on both ends
        wrapped = ObsNoise(self.env_fn(), sigma=sigma)
        # A sampled delay of 0 means actions take effect immediately, so no
        # ActionDelay layer is added for that episode.
        if delay > 0:
            wrapped = ActionDelay(wrapped, delay_steps=delay)
        self.env = wrapped
        return self.env.reset(**kwargs)

    def step(self, action):
        """Forward ``action`` to the env built by the most recent reset."""
        return self.env.step(action)

def env_fn():
    """Build and return a new CartPole-v1 environment via ``gym.make``."""
    base_env = gym.make("CartPole-v1")
    return base_env

# Training environment: ObsNoise sigma and action delay are re-sampled on every reset.
train_env = DomainRandomize(
    env_fn,
    sigma_range=(0.0, 0.05),
    delay_range=(0, 2),
)

In [None]:
from stable_baselines3 import DQN

# Hyperparameters for the DQN trained on the domain-randomized environment.
ROBUST_DQN_KWARGS = {
    "learning_rate": 1e-3,
    "buffer_size": 200_000,
    "learning_starts": 5_000,
    "batch_size": 64,
    "gamma": 0.99,
    "train_freq": 4,
    "target_update_interval": 1_000,
    "exploration_fraction": 0.3,
    "exploration_final_eps": 0.05,
    "verbose": 1,
}

robust = DQN("MlpPolicy", train_env, **ROBUST_DQN_KWARGS)

# Train on the randomized env, then save the resulting model to disk.
robust.learn(total_timesteps=300_000)
robust.save("dqn_cartpole_robust")
