This notebook contains code examples for the experiments that I ran. I have not included all of the variations that I did, as that would result in a lot of code duplication. I've just given examples for each phase. 

In [None]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
from gymnasium import Wrapper


# Phase 0: State and action space

In [None]:
# Build a throwaway LunarLander-v3 instance purely to inspect its interface.
env = gym.make("LunarLander-v3")

# Show the action space definition and one value drawn from it.
print(f"Action space: {env.action_space}")
print(f"Sample action: {env.action_space.sample()}")

# Show the observation space definition and one value drawn from it.
# The sample comes from the space object itself, not from reset()/step(),
# so it is not a state observed during an actual episode.
print(f"Observation space: {env.observation_space}")
print(f"Sample observation: {env.observation_space.sample()}")

# Nothing else uses this instance, so release it straight away.
env.close()

# Phase 1: Baselines

In [None]:
# Random Agent
# Baseline: play 100 episodes choosing every action at random, to establish
# the score that an untrained policy achieves on the unmodified environment.
random_env = gym.make("LunarLander-v3")
random_rewards = []

for _ in range(100):
    obs, info = random_env.reset()
    episode_reward = 0
    done = False

    while not done:
        # Random agent ignores the observation and just samples the action space
        action = random_env.action_space.sample()
        obs, reward, terminated, truncated, info = random_env.step(action)
        episode_reward += reward
        # The episode is over on either a terminal state or a truncation
        done = terminated or truncated

    random_rewards.append(episode_reward)

random_env.close()

# Compute the summary statistics once and report exactly those values
# (same mean_reward / std_reward naming as the DQN evaluation cells).
mean_reward = np.mean(random_rewards)
std_reward = np.std(random_rewards)

print(f"Random agent mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# DQN baseline: every hyperparameter is left at the library default; only the
# training budget (total_timesteps) is changed between runs of this cell.
train_env = gym.make("LunarLander-v3")
model = DQN(policy="MlpPolicy", env=train_env, verbose=0)
model.learn(total_timesteps=100_000)

# Score the greedy policy over 100 episodes on a separate environment instance.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

# Phase 2: Hyperparameter tuning

In [None]:
# Experiment: change only the exploration fraction (0.5 here); all other DQN
# settings stay at their defaults.
dqn_settings = {
    "exploration_fraction": 0.5,
    "verbose": 0,
}

train_env = gym.make("LunarLander-v3")
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=300_000)

# Score the greedy policy over 100 episodes on a separate environment instance.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

In [None]:
# Experiment: change the final exploration epsilon (0.25 here), with the
# learning rate and exploration fraction pinned explicitly.
dqn_settings = {
    "learning_rate": 0.0001,
    "exploration_fraction": 0.5,
    "exploration_final_eps": 0.25,
    "verbose": 0,
}

train_env = gym.make("LunarLander-v3")
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=300_000)

# Score the greedy policy over 100 episodes on a separate environment instance.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

In [None]:
# Experiment: change the learning rate (1e-5 here) while keeping the
# exploration schedule from the previous cell; trained for 1M timesteps.
dqn_settings = {
    "learning_rate": 0.00001,
    "exploration_fraction": 0.5,
    "exploration_final_eps": 0.25,
    "verbose": 0,
}

train_env = gym.make("LunarLander-v3")
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=1_000_000)

# Score the greedy policy over 100 episodes on a separate environment instance.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

# Phase 3: Reward Shaping

In [None]:
class LandingIncentiveWrapper(gym.Wrapper):
    """Reward-shaping wrapper that nudges a LunarLander agent towards touching down.

    Two adjustments are applied on top of the base reward:

    * a constant penalty on every step where neither leg touches the ground;
    * a scaled leg-contact bonus, paid when a leg's contact state changes.

    The base environment pays its leg-contact points through a shaping
    difference between consecutive steps, i.e. +10 once when a leg touches
    down and -10 when it lifts off again. The extra bonus here follows the
    same pattern. Paying it on every step a leg is down would instead create
    an unbounded per-step reward for simply staying on the ground.
    """

    def __init__(self, env, airborne_penalty=0.0, leg_contact_bonus_multiplier=1.0):
        """
        Wrapper to modify LunarLander rewards to encourage landing.

        Args:
            env: The base LunarLander environment
            airborne_penalty: Additional penalty per timestep while airborne (e.g., 0.1)
            leg_contact_bonus_multiplier: Multiply the leg contact reward by this factor (e.g., 2.0)
        """
        super().__init__(env)
        self.airborne_penalty = airborne_penalty
        self.leg_contact_bonus_multiplier = leg_contact_bonus_multiplier
        # Leg contact flags (left, right) from the previous observation; used to
        # detect touchdown / lift-off transitions between consecutive steps.
        self._prev_leg_contacts = (0.0, 0.0)

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped environment and re-seed the contact tracking.

        Returns:
            The (observation, info) pair produced by the wrapped environment.
        """
        observation, info = self.env.reset(seed=seed, options=options)
        # Start each episode from the contact state of its first observation so
        # no bonus leaks across episode boundaries.
        self._prev_leg_contacts = (float(observation[6]), float(observation[7]))
        return observation, info

    def step(self, action):
        """Step the wrapped environment and return the transition with a shaped reward.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            (observation, modified_reward, terminated, truncated, info), where
            only the reward differs from what the wrapped environment returned.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)

        # The LunarLander observation is an 8-dimensional vector:
        # [x_pos, y_pos, x_vel, y_vel, angle, angular_vel, left_leg_contact, right_leg_contact]
        # Indices 6 and 7 are 1.0 / 0.0 flags indicating leg contact
        leg_contacts = (float(observation[6]), float(observation[7]))

        # Start with the original reward
        modified_reward = reward

        # Apply airborne penalty if no legs are touching ground
        if not any(leg_contacts):
            modified_reward -= self.airborne_penalty

        # Extra points per leg on top of the base environment's 10, so that the
        # total leg-contact reward becomes 10 * multiplier.
        extra_per_leg = 10 * (self.leg_contact_bonus_multiplier - 1.0)

        # Pay (or claw back) the extra only when a leg's contact state changes:
        # +extra on touchdown, -extra on lift-off, nothing while it is unchanged.
        for previous, current in zip(self._prev_leg_contacts, leg_contacts):
            modified_reward += extra_per_leg * (current - previous)

        self._prev_leg_contacts = leg_contacts

        return observation, modified_reward, terminated, truncated, info

In [None]:
# Training environment: plain LunarLander wrapped with the landing-incentive
# reward shaping (0.1 airborne penalty per step, leg-contact multiplier of 2).
base_env = gym.make("LunarLander-v3")
shaping_settings = {
    "airborne_penalty": 0.1,
    "leg_contact_bonus_multiplier": 2.0,
}
train_env = LandingIncentiveWrapper(base_env, **shaping_settings)

# Train on the shaped reward and keep a copy of the resulting model on disk.
dqn_settings = {
    "verbose": 0,
    "exploration_fraction": 0.5,
    "exploration_final_eps": 0.25,
}
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=500_000)
model.save("dqn_lunar_stable6")

# Evaluate on an unwrapped environment so the score uses the original reward.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

In [None]:
class EngineIncentiveWrapper(gym.Wrapper):
    """Reward-shaping wrapper that makes firing the LunarLander engines costlier."""

    def __init__(self, env, main_engine_penalty_multiplier=1.0,
                 side_engine_penalty_multiplier=1.0,):
        """
        Wrapper to modify LunarLander engine penalties to discourage hovering.

        Reference costs assumed for the original LunarLander:
        - 0.3 per frame while the main engine fires
        - 0.03 per frame while a side engine fires

        Args:
            env: The base LunarLander environment
            main_engine_penalty_multiplier: Factor for the main engine cost (e.g., 3.0 gives 0.9 per frame)
            side_engine_penalty_multiplier: Factor for the side engine cost (e.g., 3.0 gives 0.09 per frame)
        """
        super().__init__(env)
        self.side_engine_penalty_multiplier = side_engine_penalty_multiplier
        self.main_engine_penalty_multiplier = main_engine_penalty_multiplier

    def step(self, action):
        """Step the wrapped environment and add the extra engine cost to its reward.

        Discrete actions: 0 = do nothing, 1 = left orientation engine,
        2 = main engine, 3 = right orientation engine.

        Returns:
            (observation, modified_reward, terminated, truncated, info), where
            only the reward differs from what the wrapped environment returned.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)

        # The wrapped reward already contains the base engine cost once, so only
        # the (multiplier - 1) share is added here; idling adds nothing.
        additional_penalty = (
            -0.3 * (self.main_engine_penalty_multiplier - 1.0)
            if action == 2
            else -0.03 * (self.side_engine_penalty_multiplier - 1.0)
            if action == 1 or action == 3
            else 0.0
        )

        return observation, reward + additional_penalty, terminated, truncated, info

In [None]:
# Training environment: plain LunarLander wrapped so that both engine types
# cost three times their usual penalty.
base_env = gym.make("LunarLander-v3")
shaping_settings = {
    "main_engine_penalty_multiplier": 3.0,
    "side_engine_penalty_multiplier": 3.0,
}
train_env = EngineIncentiveWrapper(base_env, **shaping_settings)

# Train on the shaped reward.
dqn_settings = {
    "verbose": 0,
    "exploration_fraction": 0.5,
    "exploration_final_eps": 0.25,
}
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=500_000)

# Evaluate on an unwrapped environment so the score uses the original reward.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

In [None]:
class CombinedLandingWrapper(gym.Wrapper):
    """Reward-shaping wrapper combining landing incentives and engine surcharges.

    The leg-contact bonus is paid on contact *transitions* (touchdown /
    lift-off). The base environment pays its own leg-contact points through a
    shaping difference between consecutive steps (+10 once on touchdown, -10
    on lift-off), so scaling that reward means scaling the same transitions.
    Paying the bonus on every step a leg is down would instead create an
    unbounded per-step reward for staying on the ground.
    """

    def __init__(self, env,
                 airborne_penalty=0.0,
                 leg_contact_bonus_multiplier=1.0,
                 main_engine_penalty_multiplier=1.0,
                 side_engine_penalty_multiplier=1.0):
        """
        Combined wrapper to modify LunarLander rewards to encourage landing.

        This wrapper applies multiple reward modifications:
        1. Airborne penalty: small cost per frame when no legs touch ground
        2. Leg contact bonus: amplifies the reward for getting legs on the ground
        3. Engine penalties: increases the cost of firing engines

        Args:
            env: The base LunarLander environment
            airborne_penalty: Additional penalty per timestep while airborne (e.g., 0.1)
            leg_contact_bonus_multiplier: Multiply leg contact reward by this (e.g., 2.0)
            main_engine_penalty_multiplier: Multiply main engine penalty by this (e.g., 3.0)
            side_engine_penalty_multiplier: Multiply side engine penalty by this (e.g., 3.0)
        """
        super().__init__(env)
        self.airborne_penalty = airborne_penalty
        self.leg_contact_bonus_multiplier = leg_contact_bonus_multiplier
        self.main_engine_penalty_multiplier = main_engine_penalty_multiplier
        self.side_engine_penalty_multiplier = side_engine_penalty_multiplier
        # Leg contact flags (left, right) from the previous observation; used to
        # detect touchdown / lift-off transitions between consecutive steps.
        self._prev_leg_contacts = (0.0, 0.0)

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped environment and re-seed the contact tracking.

        Returns:
            The (observation, info) pair produced by the wrapped environment.
        """
        observation, info = self.env.reset(seed=seed, options=options)
        # Start each episode from the contact state of its first observation so
        # no bonus leaks across episode boundaries.
        self._prev_leg_contacts = (float(observation[6]), float(observation[7]))
        return observation, info

    def step(self, action):
        """Step the wrapped environment and return the transition with a shaped reward.

        Args:
            action: Discrete action forwarded unchanged to the wrapped
                environment (0=nothing, 1=left engine, 2=main engine, 3=right engine).

        Returns:
            (observation, modified_reward, terminated, truncated, info), where
            only the reward differs from what the wrapped environment returned.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)

        # Extract leg contact information from observation
        # Observation: [x_pos, y_pos, x_vel, y_vel, angle, angular_vel, left_leg, right_leg]
        leg_contacts = (float(observation[6]), float(observation[7]))

        # Start with the original reward from the environment
        modified_reward = reward

        # If neither leg is touching, agent pays a small penalty per frame
        # This makes hovering accumulate costs over time
        if not any(leg_contacts):
            modified_reward -= self.airborne_penalty

        # Extra points per leg on top of the base environment's 10, so that the
        # total leg-contact reward becomes 10 * multiplier.
        extra_per_leg = 10 * (self.leg_contact_bonus_multiplier - 1.0)

        # Pay (or claw back) the extra only when a leg's contact state changes:
        # +extra on touchdown, -extra on lift-off, nothing while it is unchanged.
        for previous, current in zip(self._prev_leg_contacts, leg_contacts):
            modified_reward += extra_per_leg * (current - previous)

        self._prev_leg_contacts = leg_contacts

        # Original penalties: -0.3 for main engine, -0.03 for side engines
        # The base reward already includes them once, so only the
        # (multiplier - 1) share is added to make engine use more expensive
        if action == 2:  # Main engine fired
            additional_penalty = -0.3 * (self.main_engine_penalty_multiplier - 1.0)
            modified_reward += additional_penalty
        elif action == 1 or action == 3:  # Side engine fired
            additional_penalty = -0.03 * (self.side_engine_penalty_multiplier - 1.0)
            modified_reward += additional_penalty

        return observation, modified_reward, terminated, truncated, info

In [None]:
# Training environment: plain LunarLander wrapped with all shaping terms at
# once (airborne penalty, leg-contact multiplier, tripled engine costs).
base_env = gym.make("LunarLander-v3")
shaping_settings = {
    "airborne_penalty": 0.1,
    "leg_contact_bonus_multiplier": 2.0,
    "main_engine_penalty_multiplier": 3.0,
    "side_engine_penalty_multiplier": 3.0,
}
train_env = CombinedLandingWrapper(base_env, **shaping_settings)

# Train on the shaped reward.
dqn_settings = {
    "verbose": 0,
    "exploration_fraction": 0.5,
    "exploration_final_eps": 0.25,
}
model = DQN("MlpPolicy", train_env, **dqn_settings)
model.learn(total_timesteps=300_000)

# Evaluate on an unwrapped environment so the score uses the original reward.
eval_env = gym.make("LunarLander-v3")
mean_reward, std_reward = evaluate_policy(
    model, eval_env, deterministic=True, n_eval_episodes=100
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, training one first.
for finished_env in (train_env, eval_env):
    finished_env.close()

# Phase 5: Research-based Hyperparameters

In [None]:
# Create training and evaluation environments
env = gym.make("LunarLander-v3")
eval_env = gym.make("LunarLander-v3")

# Network architecture: [256, 128] hidden layers
policy_kwargs = dict(
    net_arch=[256, 128]
)

# Create the DQN agent with paper's hyperparameters
model = DQN(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    learning_rate=0.0005,
    buffer_size=65536,
    batch_size=32,
    gamma=0.99,
    exploration_fraction=0.75,
    exploration_initial_eps=0.5,
    exploration_final_eps=0.0,
    target_update_interval=1000,
    learning_starts=1000,
    verbose=0
)

# Train for 2000 episodes (paper used 2000)
# Each episode is roughly 200 steps, so ~400,000 timesteps
total_timesteps = 400000

print("Starting training...")
model.learn(total_timesteps=total_timesteps)

# Test the trained model
print("\nTesting...")
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=100,
    deterministic=True
)
print(f"Mean reward over 100 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, as the earlier experiment cells do
env.close()
eval_env.close()

# Phase 6: Ablation Tests

In [None]:
# Create training and evaluation environments
env = gym.make("LunarLander-v3")
eval_env = gym.make("LunarLander-v3")

# Ablation: smaller network ([64, 64] hidden layers instead of [256, 128])
policy_kwargs = dict(
    net_arch=[64, 64]
)

# Create the DQN agent; apart from the network size, the hyperparameters are
# the same as in the research-based configuration
model = DQN(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    learning_rate=0.0005,
    buffer_size=65536,
    batch_size=32,
    gamma=0.99,
    exploration_fraction=0.75,
    exploration_initial_eps=0.5,
    exploration_final_eps=0.0,
    target_update_interval=1000,
    learning_starts=1000,
    verbose=0
)

# Train for 2000 episodes (paper used 2000)
# Each episode is roughly 200 steps, so ~400,000 timesteps
total_timesteps = 400000

print("Starting training...")
model.learn(total_timesteps=total_timesteps)

# Test the trained model
print("\nTesting...")
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=100,
    deterministic=True
)
print(f"Mean reward over 100 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

# Release both environments, as the earlier experiment cells do
env.close()
eval_env.close()