# **Mario**
### This project aims to create a Mario agent that will beat the entirety of Super Mario Bros using Deep Reinforcement Learning and Double Deep Q-Networks

**Eve Collier\
AI II - Spring 2025\
Final Project**

![Mario](https://media3.giphy.com/media/v1.Y2lkPTc5MGI3NjExdmFvcTg0dDBnbnQ1b3BjbzlnemJnbGVrdm02a29pbDF4a3ExcGF2eiZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/DqqHabAaTHRII/giphy.gif)

First, we need to install the game and import some stuff:

In [1]:
!pip install gym_super_mario_bros==7.3.0 nes_py
import gym_super_mario_bros
import nes_py
from nes_py.wrappers import JoypadSpace # Wrap the game 
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT # simplify what a Mario agent can do, 256 actions it can do otherwise
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import cv2
import random
from collections import deque, namedtuple


# Compute device: CUDA when PyTorch reports an available GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One replay-memory transition recorded by the Mario agent
Experience = namedtuple(
    "Experience",
    ["state", "action", "next_state", "reward", "done"],
)



Cool.\
Now, setup the game (AKA our environment):

In [2]:
#env = gym_super_mario_bros.make('SuperMarioBros-v0',apply_api_compatibility=True, render_mode = 'human')#,render_mode=\"human\
#env = JoypadSpace(env, SIMPLE_MOVEMENT)
#env.observation_space.shape # gives us frame of game
#env.action_space # actions we can take (simplemovement actions)
#env.reset() 
#nextState, reward, done, trunc, info = env.step(action=0)

Now we will initialize our Double Deep Q-Network:

In [3]:
# Enhanced Frame Stacker with pit detection
class EnhancedFrameStacker:
    """Rolling window of preprocessed, four-plane 84x84 game frames.

    ``preprocess`` turns one RGB frame into four planes (normalized
    grayscale, enemy mask, obstacle mask, pit mask); the most recent
    ``stack_size`` results are kept in a deque and returned as one tensor.
    """

    def __init__(self, stack_size=4):
        """Create an empty window holding at most ``stack_size`` frames."""
        self.stack_size = stack_size
        self.frames = deque(maxlen=stack_size)

    def _detect_pits(self, frame):
        """Return a float32 84x84 mask of Canny edges in the bottom 20 rows.

        Not called by ``preprocess``, which builds its pit plane from dark
        pixels instead.
        """
        small_gray = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY), (84, 84))
        edge_map = cv2.Canny(small_gray, 50, 150)
        bottom_only = np.zeros_like(edge_map)
        bottom_only[-20:, :] = edge_map[-20:, :]  # keep only the bottom band
        return (bottom_only > 0).astype(np.float32)

    def preprocess(self, frame):
        """Convert one frame into a float32 array of shape [4, 84, 84].

        Args:
            frame: RGB image array, or a tuple whose first element is one.

        Returns:
            Planes stacked on axis 0: grayscale / 255, enemy color mask,
            obstacle color mask, and a mask of dark (< 50) pixels in the
            bottom 20 rows.
        """
        raw = frame[0] if isinstance(frame, tuple) else frame

        # Shrink first so every plane below is computed at 84x84
        small = cv2.resize(raw, (84, 84), interpolation=cv2.INTER_AREA)
        luma = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)

        # Pit plane: dark pixels, restricted to the bottom of the screen
        pits = np.zeros_like(luma)
        pits[-20:, :] = luma[-20:, :] < 50

        planes = [
            luma / 255,
            cv2.inRange(small, (200, 0, 0), (255, 50, 50)) / 255.0,
            cv2.inRange(small, (100, 60, 0), (180, 120, 80)) / 255.0,
            pits,
        ]
        return np.stack(planes, axis=0).astype(np.float32)

    def _detect_enemies(self, frame):
        """Return a float32 84x84 mask of pixels inside the red color range."""
        red_lo, red_hi = np.array([200, 0, 0]), np.array([255, 50, 50])
        hits = cv2.inRange(frame, red_lo, red_hi)
        return (cv2.resize(hits, (84, 84)) > 0).astype(np.float32)

    def _detect_obstacles(self, frame):
        """Return a float32 84x84 mask of pixels in the brown or gold ranges."""
        brown = cv2.inRange(frame, np.array([100, 60, 0]), np.array([180, 120, 80]))
        gold = cv2.inRange(frame, np.array([200, 180, 50]), np.array([255, 220, 150]))
        hits = brown | gold
        return (cv2.resize(hits, (84, 84)) > 0).astype(np.float32)

    def reset(self, frame):
        """Refill the whole window with copies of ``frame`` and return it."""
        self.frames.clear()
        first = self.preprocess(frame)
        self.frames.extend([first] * self.stack_size)
        return self._get_stacked_frames()

    def append(self, frame):
        """Push ``frame`` (evicting the oldest entry) and return the window."""
        self.frames.append(self.preprocess(frame))
        return self._get_stacked_frames()

    def _get_stacked_frames(self):
        """Return the window as a tensor [stack_size, channels, height, width]."""
        window = np.stack(self.frames, axis=0)
        # No batch axis is added here
        return torch.FloatTensor(window).to(device)

In [4]:
class MarioDQN(nn.Module):
    """Dueling Q-network over 84x84 frames.

    A three-layer convolutional feature extractor feeds two fully connected
    heads: a scalar state value and one advantage per action. ``forward``
    combines them as ``V + (A - mean(A))``.
    """

    def _get_conv_out(self, shape):
        """Return the number of flattened features ``self.conv`` yields.

        Args:
            shape: ``(channels, height, width)`` of a single input frame.

        Returns:
            Feature count as a Python int.
        """
        # Build the probe on the device the conv weights currently live on,
        # so this works both before and after the module is moved with .to().
        probe_device = next(self.conv.parameters()).device
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape, device=probe_device))
        return int(np.prod(o.size()))

    def __init__(self, input_channels, output_dim):
        """Build the conv stack and the value/advantage heads.

        Args:
            input_channels: Number of channels in each input frame.
            output_dim: Number of discrete actions (advantage head width).
        """
        super().__init__()
        self.output_dim = output_dim

        # Convolutional layers - expects `input_channels` channels per frame
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # Size of the flattened conv output for an 84x84 frame. The weights
        # are still on the CPU at this point, so the probe must not be pushed
        # to a CUDA device (that raised a device-mismatch error on GPU hosts).
        self.conv_out_size = self._get_conv_out((input_channels, 84, 84))

        # Value stream
        self.value_stream = nn.Sequential(
            nn.Linear(self.conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

        # Advantage stream
        self.advantage_stream = nn.Sequential(
            nn.Linear(self.conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim)
        )

    def forward(self, x):
        """Return Q-values, one row per frame fed through the conv stack.

        Args:
            x: 4-D tensor ``[n, channels, h, w]``, or 5-D tensor
                ``[batch, stack, channels, h, w]`` whose first two axes are
                merged before the convolution.

        Returns:
            Tensor ``[n, output_dim]`` (``n = batch * stack`` for 5-D input).

        Raises:
            ValueError: If ``x`` is neither 4-D nor 5-D.
        """
        if x.dim() == 5:  # [batch, stack, channels, h, w]
            # reshape (not view) so non-contiguous inputs are accepted too
            x = x.reshape(-1, x.size(2), x.size(3), x.size(4))
        elif x.dim() != 4:
            raise ValueError(f"Unexpected input dimension: {x.dim()}")

        features = self.conv(x)
        values = self.value_stream(features)
        advantages = self.advantage_stream(features)

        # Dueling aggregation: centre the advantages so V stays identifiable
        return values + (advantages - advantages.mean(dim=1, keepdim=True))


import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import random
import cv2

# Compute device: prefer CUDA when PyTorch can see a GPU, else use the CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Replay-memory record: one (state, action, next_state, reward, done) transition
Experience = namedtuple('Experience', 'state action next_state reward done')
class FrameStacker:
    """Keep the last ``stack_size`` grayscale 84x84 frames, without gym wrappers."""

    def __init__(self, stack_size=4):
        """Create an empty window holding at most ``stack_size`` frames."""
        self.stack_size = stack_size
        self.frames = deque(maxlen=stack_size)

    def preprocess(self, frame):
        """Return ``frame`` as an 84x84 grayscale array scaled by 1/255.

        Args:
            frame: Image array, or a tuple whose first element is the image.
                Three-dimensional arrays are converted from RGB to grayscale;
                other arrays are resized as they are.
        """
        if isinstance(frame, tuple):
            frame = frame[0]  # keep only the observation

        is_color = len(frame.shape) == 3
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) if is_color else frame
        return cv2.resize(gray, (84, 84)) / 255.0

    def reset(self, frame):
        """Fill the whole window with the preprocessed ``frame`` and return it."""
        self.frames.clear()
        first = self.preprocess(frame)

        # Force float32 regardless of what preprocess handed back
        if isinstance(first, np.ndarray):
            first = first.astype(np.float32)
        else:
            first = np.array(first, dtype=np.float32)

        self.frames.extend(first for _ in range(self.stack_size))
        return self._get_stacked_frames()

    def _get_stacked_frames(self):
        """Return the window as a float tensor with a leading batch axis of 1."""
        window = np.array(self.frames, dtype=np.float32)
        batched = torch.FloatTensor(window).unsqueeze(0)
        return batched.to(device)

    def append(self, frame):
        """Push the preprocessed ``frame`` (as float32) and return the window."""
        latest = self.preprocess(frame).astype(np.float32)
        self.frames.append(latest)
        return self._get_stacked_frames()

class EnhancedFrameStacker(FrameStacker):
    """FrameStacker whose frames carry two extra color-threshold mask planes."""

    @staticmethod
    def _threshold_mask(frame, low, high):
        """Return a float32 84x84 mask of pixels whose color lies in [low, high]."""
        in_range = cv2.inRange(frame, low, high)
        return (cv2.resize(in_range, (84, 84)) > 0).astype(np.float32)

    def preprocess(self, frame):
        """Return a float32 array [3, 84, 84]: grayscale, enemy mask, obstacle mask.

        Args:
            frame: RGB image array, or a tuple whose first element is one.
        """
        if isinstance(frame, tuple):
            frame = frame[0]

        # Plane 0: grayscale, resized and scaled by 1/255
        luma = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY), (84, 84))

        # Planes 1-2: binary masks computed from the full-size color frame
        planes = (
            luma / 255.0,
            self._detect_enemies(frame),
            self._detect_obstacles(frame),
        )
        return np.stack(planes, axis=0).astype(np.float32)

    def _detect_enemies(self, frame):
        """Binary mask of pixels in the red range used to flag enemies."""
        return self._threshold_mask(
            frame, np.array([200, 0, 0]), np.array([255, 50, 50])
        )

    def _detect_obstacles(self, frame):
        """Binary mask of pixels in the brown range used to flag pipes/blocks."""
        return self._threshold_mask(
            frame, np.array([100, 60, 0]), np.array([180, 120, 80])
        )

class MarioDQN(nn.Module):
    """Dueling Q-network: shared conv features, value head and advantage head."""

    def __init__(self, input_channels, output_dim):
        """Build the network for ``input_channels``-plane 84x84 inputs.

        Args:
            input_channels: Channel count of the input tensor.
            output_dim: Number of actions scored by the advantage head.
        """
        super().__init__()
        self.output_dim = output_dim

        # Shared feature extractor
        feature_layers = [
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # Probe with a blank 84x84 frame to learn the flattened feature width
        with torch.no_grad():
            probe = self.conv(torch.zeros(1, input_channels, 84, 84))
        self.conv_out = probe.shape[1]

        # Dueling heads; value first so parameter creation order is unchanged
        self.value_stream = self._make_head(1)
        self.advantage_stream = self._make_head(output_dim)

    def _make_head(self, width):
        """Return a Linear(conv_out, 512) -> ReLU -> Linear(512, width) head."""
        return nn.Sequential(
            nn.Linear(self.conv_out, 512),
            nn.ReLU(),
            nn.Linear(512, width),
        )

    def forward(self, x):
        """Return Q-values ``V + (A - mean(A))`` for a batch ``x``."""
        features = self.conv(x)
        state_value = self.value_stream(features)
        advantage = self.advantage_stream(features)
        centred = advantage - advantage.mean(dim=1, keepdim=True)
        return state_value + centred

Now we are ready to initialize our Mario agent!

In [5]:
class MarioAgent:
    """DQN agent for Super Mario Bros with scripted safety overrides.

    Frames are stacked by ``EnhancedFrameStacker``; every stacked frame is fed
    through the network separately and the resulting Q-values are averaged
    over the stack, both when acting greedily and when training. Two
    hand-written rules can pre-empt the network: a forced jump when the newest
    frame shows a hazard, and a forced turn when Mario stops making progress.
    """

    def __init__(self, env):
        """Build networks, replay memory and exploration state.

        Args:
            env: Environment exposing ``action_space.n``, ``reset``, ``step``
                and ``render``.
        """
        self.env = env
        self.stacker = EnhancedFrameStacker(stack_size=4)
        self.num_actions = env.action_space.n  # Store number of actions
        self.stuck_threshold = 50  # frames without x-position change
        self.stuck_counter = 0
        self.last_x_pos = 0
        # NOTE(review): originally labelled "Left and Right", but in
        # gym_super_mario_bros SIMPLE_MOVEMENT index 0 is NOOP and index 2 is
        # right+A (left is 6, right is 1). Values kept as-is - confirm intent.
        self.turn_actions = [0, 2]

        # Networks
        self.policy_net = MarioDQN(4, self.num_actions).to(device)
        self.target_net = MarioDQN(4, self.num_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Training params
        self.gamma = 0.99
        self.batch_size = 64
        self.memory = deque(maxlen=100000)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.0001)

        # Target-network sync is driven by its own environment-step counter
        self.env_steps = 0
        self.target_update_interval = 10000

        # Exploration
        self.eps_start = 1.0
        self.eps_end = 0.01
        self.eps_decay = 1000000
        self.steps_done = 0
        self.best_reward = -np.inf

        # Action mapping
        # NOTE(review): originally labelled "Jump right", but SIMPLE_MOVEMENT
        # index 5 is plain 'A' (jump in place); right+A is index 2. Value kept.
        self.jump_action = 5
        self.lives = 3
        self.current_world = 1
        self.current_stage = 1

    def _is_stuck(self, current_x_pos):
        """Return True once x has moved < 2 px for more than ``stuck_threshold`` calls."""
        if abs(current_x_pos - self.last_x_pos) < 2:  # Minimal movement
            self.stuck_counter += 1
        else:
            self.stuck_counter = 0
        self.last_x_pos = current_x_pos
        return self.stuck_counter > self.stuck_threshold

    def _should_jump(self, state):
        """Return True when the newest frame shows a hazard near screen centre.

        Args:
            state: Tensor whose last three axes are ``[channels, h, w]``; any
                leading batch/stack axes are flattened and the last (newest)
                frame is inspected.

        Returns:
            bool: True if the enemy or pit plane is active in the bottom 20
            rows, or the obstacle plane is active in the top 20 rows, within
            the four centre columns.
        """
        # Pick the newest frame explicitly. Indexing only the batch axis of a
        # [1, stack, channels, h, w] state leaves a 4-D array, which shifted
        # every channel/row/column index by one axis and made this check read
        # the grayscale plane instead of the masks.
        frames = state.reshape(-1, *state.shape[-3:])
        latest = frames[-1].cpu().numpy()  # Shape: [channels, height, width]

        # Channel indices
        ENEMY_CHANNEL = 1
        OBSTACLE_CHANNEL = 2
        PIT_CHANNEL = 3

        # Danger zones, clamped to the frame bounds
        height, width = latest.shape[1], latest.shape[2]
        front = slice(max(0, height - 20), height)   # Bottom 20 pixels
        above = slice(0, min(20, height))            # Top 20 pixels
        cols = slice(max(0, width // 2 - 2), min(width // 2 + 2, width))  # Center 4 pixels

        def zone_hit(channel, rows):
            # A missing plane or an empty zone counts as "no hazard"
            if channel >= latest.shape[0]:
                return False
            zone = latest[channel, rows, cols]
            return zone.size > 0 and bool(zone.max() > 0.5)

        return (
            zone_hit(ENEMY_CHANNEL, front)
            or zone_hit(OBSTACLE_CHANNEL, above)
            or zone_hit(PIT_CHANNEL, front)
        )

    def _greedy_action(self, state):
        """Return the argmax action as a ``[1, 1]`` long tensor.

        Q-values are averaged over the stacked frames, matching how
        ``optimize_model`` reduces the network output, so exactly one action
        index remains regardless of stack size.
        """
        frames = state.reshape(-1, *state.shape[-3:])  # [stack, channels, h, w]
        with torch.no_grad():
            q_values = self.policy_net(frames).mean(dim=0, keepdim=True)
        return q_values.max(1)[1].view(1, 1)

    @staticmethod
    def _unpack_step(step_result):
        """Normalise a 4- or 5-tuple ``env.step`` result to (obs, reward, done, info).

        Raises:
            ValueError: If the result has any other length.
        """
        if len(step_result) == 4:  # (obs, reward, done, info)
            return step_result
        if len(step_result) == 5:  # (obs, reward, terminated, truncated, info)
            next_frame, reward, terminated, truncated, info = step_result
            # Either flag ends the episode; stepping on requires a reset
            return next_frame, reward, bool(terminated or truncated), info
        raise ValueError(f"Unexpected step return format: {step_result}")

    def select_action(self, state, info):
        """Choose an action: safety overrides first, then epsilon-greedy.

        Args:
            state: Stacked frames, ``[stack, channels, h, w]`` with or without
                a leading batch axis.
            info: Dict with at least ``'x_pos'`` for the stuck detector.

        Returns:
            ``[1, 1]`` long tensor holding the action index.
        """
        # Ensure state has dimensions [1, stack_size, channels, h, w]
        if state.dim() == 4:
            state = state.unsqueeze(0)  # Add batch dimension if missing

        # Immediate danger response
        if self._should_jump(state):
            return torch.tensor([[self.jump_action]], device=device, dtype=torch.long)

        # Force turn-around when stuck
        if self._is_stuck(info['x_pos']):
            self.stuck_counter = 0
            return torch.tensor([[random.choice(self.turn_actions)]],
                                device=device, dtype=torch.long)

        # Epsilon-greedy action selection
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            np.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1

        if sample > eps_threshold:
            return self._greedy_action(state)
        return torch.tensor([[random.randrange(self.num_actions)]],
                            device=device, dtype=torch.long)

    def optimize_model(self):
        """Run one gradient step on a random replay batch (no-op if memory is short)."""
        if len(self.memory) < self.batch_size:
            return

        # Sample batch from memory
        batch = random.sample(self.memory, self.batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)

        # [batch, stack, channels, h, w] -> [batch*stack, channels, h, w]
        state_batch = torch.stack(states).view(-1, 4, 84, 84)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)  # [batch_size]
        done_batch = torch.tensor(dones, dtype=torch.float32, device=device)  # [batch_size]

        # Q(s, .) per frame, averaged across the stacked frames
        q_values = self.policy_net(state_batch)
        q_values = q_values.view(self.batch_size, -1, self.num_actions).mean(dim=1)

        # Gather the Q-values for taken actions
        state_action_values = q_values.gather(1, action_batch.view(-1, 1))  # [batch_size, 1]

        # Bootstrap values; terminal transitions keep 0
        next_state_values = torch.zeros(self.batch_size, device=device)
        non_final_mask = torch.tensor(
            [s is not None for s in next_states],
            device=device, dtype=torch.bool
        )

        # Only stack next states when at least one exists: stacking an empty
        # list (an all-terminal batch) raises.
        if non_final_mask.any():
            non_final_next_states = torch.stack(
                [s for s in next_states if s is not None]
            ).view(-1, 4, 84, 84)

            with torch.no_grad():
                # NOTE(review): the target is the max over target_net outputs
                # (plain DQN target); the action is not chosen by policy_net.
                next_q_values = self.target_net(non_final_next_states)
                next_q_values = next_q_values.view(-1, 4, self.num_actions).max(dim=2)[0]  # Max over actions
                next_state_values[non_final_mask] = next_q_values.mean(dim=1)  # Average over frames

        # Compute expected state-action values
        expected_state_action_values = (next_state_values * self.gamma * (1 - done_batch)) + reward_batch

        # Compute loss
        loss = nn.SmoothL1Loss()(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
        self.optimizer.step()

    def _calculate_reward(self, reward, info, prev_info):
        """Return the environment reward plus shaping terms.

        Args:
            reward: Raw reward from ``env.step``.
            info: Current info dict; missing keys fall back to ``prev_info``.
            prev_info: Dict with ``life``, ``x_pos``, ``status``, ``score``
                and ``y_pos`` from the previous step.

        Returns:
            Sum of the raw reward, 0.2 per pixel of x progress, power-up
            bonuses (+10 tall, +15 fireball), -0.1 when x is unchanged, -1 when
            y < 50, +1 when the score rose, and -25 when a life was lost.
        """
        # Get values with defaults from prev_info
        current_life = info.get('life', prev_info['life'])
        current_x = info.get('x_pos', prev_info['x_pos'])
        current_status = info.get('status', prev_info['status'])
        current_score = info.get('score', prev_info['score'])
        current_y = info.get('y_pos', prev_info['y_pos'])

        # Calculate reward components
        life_penalty = -25 if current_life < prev_info['life'] else 0
        x_reward = (current_x - prev_info['x_pos']) * 0.2

        status_bonus = 0
        if current_status == 'tall' and prev_info['status'] == 'small':
            status_bonus = 10
        elif current_status == 'fireball' and prev_info['status'] != 'fireball':
            status_bonus = 15

        time_penalty = -0.1 if current_x == prev_info['x_pos'] else 0
        danger_penalty = -1 if current_y < 50 else 0  # Simple danger detection
        block_hit_bonus = 1 if current_score > prev_info['score'] else 0

        return (reward + x_reward + status_bonus +
                time_penalty + danger_penalty + block_hit_bonus +
                life_penalty)

    def train(self, total_episodes=10000):
        """Train for ``total_episodes`` episodes, saving the best policy weights.

        The policy network is written to ``mario_best.pth`` whenever an
        episode's shaped return beats the best seen so far.
        """
        for episode in range(total_episodes):
            # Defaults used until the environment reports real values
            prev_info = {
                'life': 3,
                'x_pos': 0,
                'status': 'small',
                'score': 0,
                'y_pos': 0
            }
            info = prev_info

            # Reset environment (once per episode)
            observation = self.env.reset()
            state = self.stacker.reset(observation)
            if state.dim() == 4:  # [stack, channels, h, w]
                state = state.unsqueeze(0)  # Add batch dimension
            total_reward = 0
            done = False

            while not done:
                action = self.select_action(state, info)
                next_frame, reward, done, info = self._unpack_step(
                    self.env.step(action.item())
                )

                # Calculate reward with safe dictionary access
                reward = self._calculate_reward(reward, info, prev_info)

                # Store experience with properly shaped states
                next_state = self.stacker.append(next_frame) if not done else None
                self.memory.append(Experience(
                    state.squeeze(0),  # Remove batch dim if present [stack, channels, h, w]
                    action,
                    next_state.squeeze(0) if next_state is not None else None,
                    torch.FloatTensor([reward]).to(device),
                    done
                ))

                # Update state and info
                state = next_state if next_state is not None else state
                prev_info = {
                    'life': info.get('life', prev_info['life']),
                    'x_pos': info.get('x_pos', prev_info['x_pos']),
                    'status': info.get('status', prev_info['status']),
                    'score': info.get('score', prev_info['score']),
                    'y_pos': info.get('y_pos', prev_info['y_pos'])
                }
                total_reward += reward

                # Train and update networks
                if len(self.memory) > self.batch_size:
                    self.optimize_model()

                # Sync on a dedicated per-step counter. steps_done is also
                # advanced inside select_action, so testing it here saw mostly
                # odd values and the sync almost never fired.
                self.env_steps += 1
                if self.env_steps % self.target_update_interval == 0:
                    self.target_net.load_state_dict(self.policy_net.state_dict())

                # NOTE(review): steps_done is bumped here as well as in
                # select_action, so epsilon decays up to twice per step; kept
                # to preserve the existing exploration schedule.
                self.steps_done += 1

            # Save best model
            if total_reward > self.best_reward:
                self.best_reward = total_reward
                torch.save(self.policy_net.state_dict(), 'mario_best.pth')

            # prev_info always carries a 'life' value, even if info lacks one
            print(f"Episode {episode}, Reward: {total_reward:.1f}, Lives: {prev_info['life']}")

    def run_best_model(self):
        """Run the trained model with visualization.

        Loads ``mario_best.pth`` into the policy network, plays one rendered
        episode greedily and prints a summary.
        """
        # map_location lets a GPU-trained checkpoint load on a CPU-only host
        self.policy_net.load_state_dict(
            torch.load('mario_best.pth', map_location=device)
        )
        self.policy_net.eval()

        state = self.stacker.reset(self.env.reset())
        done = False
        total_reward = 0

        while not done:
            self.env.render()
            action = self._greedy_action(state)

            next_frame, reward, done, info = self._unpack_step(
                self.env.step(action.item())
            )
            if not done:
                state = self.stacker.append(next_frame)
            total_reward += reward

        print(f"Final Reward: {total_reward}")
        print(f"World {info['world']}-{info['stage']} {'Completed!' if info['flag_get'] else 'Failed'}")
        print(f"Lives remaining: {info['life']}")


import torch
import numpy as np
from collections import deque
#!pip install gym_super_mario_bros==7.3.0 nes_py
import gym_super_mario_bros
import nes_py
from nes_py.wrappers import JoypadSpace # Wrap the game 
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT # simplify what a Mario agent can do, 256 actions it can do otherwise

class MarioAgent:
    """DQN agent that learns from stacked grayscale frames.

    Frames come from ``FrameStacker`` (shape ``[1, stack, 84, 84]``), so the
    stack axis doubles as the network's channel axis. A separate target
    network is synced every 10 episodes.
    """

    def __init__(self, env, stack_frames=4):
        """Build the stacker, networks, replay memory and exploration state.

        Args:
            env: Environment exposing ``action_space.n``, ``reset``, ``step``
                and ``render``.
            stack_frames: Number of frames kept in the stack; also the
                network's input channel count.
        """
        self.env = env
        self.stack_size = stack_frames
        # Honour stack_frames for both the stacker and the network input so
        # the two always agree (both were hard-coded to 4 before).
        self.stacker = FrameStacker(stack_size=stack_frames)

        # Policy/target network pair
        self.policy_net = MarioDQN(stack_frames, env.action_space.n).to(device)
        self.target_net = MarioDQN(stack_frames, env.action_space.n).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Training parameters
        self.gamma = 0.99
        self.batch_size = 32
        self.memory = deque(maxlen=100000)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.00025)

        # Exploration
        self.eps_start = 1.0
        self.eps_end = 0.01
        self.eps_decay = 500000
        self.steps_done = 0
        self.best_reward = -np.inf

    def select_action(self, state):
        """Pick an action: scripted override first, then epsilon-greedy.

        Args:
            state: Tensor ``[1, planes, h, w]``.

        Returns:
            ``[1, 1]`` long tensor holding the action index.
        """
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            np.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1

        # Convert state tensor to numpy for processing
        state_np = state.cpu().numpy()[0]  # Shape: [planes, height, width]

        # NOTE(review): planes 1 and 2 are read as enemy/obstacle masks, but
        # FrameStacker only stacks grayscale frames, so with the default
        # stacker this thresholds older grayscale frames - confirm intent.
        # Skipped when the state has fewer than three planes.
        if state_np.shape[0] > 2:
            front_zone = state_np[1, -10:, 40:44]  # Bottom center (immediate front)
            above_zone = state_np[2, :20, 40:44]   # Top center (above Mario)

            enemy_in_front = (front_zone > 0.5).any()
            block_above = (above_zone > 0.5).any()

            # Forced action overrides epsilon-greedy
            if enemy_in_front or block_above:
                # NOTE(review): SIMPLE_MOVEMENT index 3 is right+B (run), not a
                # jump; right+A is index 2. Value kept - confirm intent.
                jump_action = 3
                return torch.tensor([[jump_action]], device=device, dtype=torch.long)

        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        return torch.tensor([[random.randrange(self.env.action_space.n)]],
                            device=device, dtype=torch.long)

    def optimize_model(self):
        """Run one gradient step on a random replay batch (no-op if memory is short)."""
        if len(self.memory) < self.batch_size:
            return

        # Sample a batch from memory
        batch = random.sample(self.memory, self.batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)

        # Convert to tensors
        state_batch = torch.cat(states)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        # Compute Q values for current states
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Bootstrap values; terminal transitions (next_state is None) keep 0
        next_state_values = torch.zeros(self.batch_size, device=device)
        live_next_states = [s for s in next_states if s is not None]
        non_final_mask = torch.tensor(
            [s is not None for s in next_states], device=device
        )
        if live_next_states:
            non_final_next_states = torch.cat(live_next_states)
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
        self.optimizer.step()

    def train(self, total_episodes=1000):
        """Train for ``total_episodes`` episodes, saving the best policy weights.

        Prints one summary line per episode and writes ``mario_best.pth``
        whenever an episode's total reward beats the best seen so far.
        """
        for episode in range(total_episodes):
            raw_frame = self.env.reset()
            state = self.stacker.reset(raw_frame)
            total_reward = 0
            done = False

            while not done:
                action = self.select_action(state)
                next_raw_frame, reward, terminated, truncated, info = self.env.step(action.item())
                # Either flag ends the episode; stepping on requires a reset
                done = bool(terminated or truncated)

                total_reward += reward
                reward = torch.FloatTensor([reward]).to(device)

                # Store experience
                next_state = self.stacker.append(next_raw_frame) if not done else None
                self.memory.append(Experience(state, action, next_state, reward, done))

                state = next_state if next_state is not None else state
                # Train
                self.optimize_model()

            # Update target network
            if episode % 10 == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            # Save best model
            if total_reward > self.best_reward:
                self.best_reward = total_reward
                torch.save(self.policy_net.state_dict(), 'mario_best.pth')

            eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
                np.exp(-1. * self.steps_done / self.eps_decay)
            print(f"Episode {episode}, Reward: {total_reward}, Epsilon: {eps_threshold:.2f}")

    def run_best_model(self):
        """Run the best saved model with rendering.

        Loads ``mario_best.pth``, plays one episode greedily and prints a
        summary.
        """
        # map_location lets a GPU-trained checkpoint load on a CPU-only host
        self.policy_net.load_state_dict(
            torch.load('mario_best.pth', map_location=device)
        )
        self.policy_net.eval()

        # The stacker preprocesses raw frames itself, exactly as in train()
        state = self.stacker.reset(self.env.reset())
        done = False
        total_reward = 0

        while not done:
            self.env.render()
            with torch.no_grad():
                action = self.policy_net(state).max(1)[1].view(1, 1)

            next_frame, reward, terminated, truncated, info = self.env.step(action.item())
            done = bool(terminated or truncated)
            if not done:
                state = self.stacker.append(next_frame)
            total_reward += reward

        print(f"Final Reward: {total_reward}")
        print(f"Final Position: {info['x_pos']}")
        print(f"World {info['world']}-{info['stage']} {'Completed!' if info['flag_get'] else 'Failed'}")


class EnhancedMarioAgent(MarioAgent):
    """MarioAgent with a scripted jump reflex and extra reward shaping.

    ``select_action`` forces a jump whenever ``_should_jump`` flags danger in
    the current state and otherwise defers to the base class. ``train`` adds
    the bonus/penalty from ``_calculate_strategic_reward`` to the
    environment reward before storing each transition.
    """

    # Ordering of the values reported in info['status']; a higher rank is a
    # stronger Mario. Used to tell a power-up from a power-down.
    _STATUS_RANK = {'small': 0, 'tall': 1, 'fireball': 2}

    def __init__(self, env, stack_frames=4):
        """Create the agent.

        Args:
            env: environment to act in; its action indices are assumed to
                follow SIMPLE_MOVEMENT (see ``jump_action``).
            stack_frames: number of frames passed to the frame stacker and
                to the base class initialiser.
        """
        # Use enhanced frame stacker
        # NOTE(review): this is assigned before the base initialiser runs; if
        # MarioAgent.__init__ also assigns self.stacker it replaces this one.
        self.stacker = EnhancedFrameStacker(stack_size=stack_frames)
        super().__init__(env, stack_frames)

        # Index of the "jump to the right" button combination. Looking it up
        # avoids a hard-coded index: position 1 in SIMPLE_MOVEMENT is plain
        # ['right'], which walks into the danger instead of jumping over it.
        self.jump_action = SIMPLE_MOVEMENT.index(['right', 'A'])

    def select_action(self, state):
        """Return the action for ``state`` as a 1x1 long tensor.

        The jump action is returned when ``_should_jump`` reports danger;
        otherwise the base class's ``select_action`` decides.
        """
        # Danger detection first
        if self._should_jump(state):
            return torch.tensor([[self.jump_action]], device=device, dtype=torch.long)

        # Fall back to the base class policy
        return super().select_action(state)

    def _should_jump(self, state):
        """Return True when the object channels show danger near the centre.

        Looks at a 4-pixel-wide centre column of the first batch element:
        the bottom rows of the enemy channel and the top rows of the
        obstacle channel. Any value above 0.5 counts as a detection.
        """
        # assumes state is (batch, channels, height, width) with 84x84
        # frames - TODO confirm against the frame stacker
        state_np = state.cpu().numpy()[0]

        # Channel indices (0=grayscale, 1=enemies, 2=obstacles)
        ENEMY_CHANNEL = 1
        OBSTACLE_CHANNEL = 2

        # Danger zones (adjust based on Mario's position in frame)
        FRONT_RANGE = slice(70, 84)  # Rows 70-83
        ABOVE_RANGE = slice(0, 20)   # Rows 0-19
        WIDTH_RANGE = slice(40, 44)  # Columns 40-43

        # Check for immediate dangers
        enemy_in_front = state_np[ENEMY_CHANNEL, FRONT_RANGE, WIDTH_RANGE].max() > 0.5
        block_above = state_np[OBSTACLE_CHANNEL, ABOVE_RANGE, WIDTH_RANGE].max() > 0.5

        return bool(enemy_in_front or block_above)

    def train(self, total_episodes=1000):
        """Train for ``total_episodes`` episodes with shaped rewards.

        Each step selects an action, advances the environment, adds the
        strategic reward (from the second step of the episode onwards),
        stores the transition in ``self.memory`` and calls
        ``self.optimize_model()``. Every 10th episode the policy weights are
        copied into the target network, and the policy weights are saved to
        ``mario_best.pth`` whenever an episode beats ``self.best_reward``.
        """
        for episode in range(total_episodes):
            state = self.stacker.reset(self.env.reset())
            total_reward = 0
            done = False
            prev_info = None

            while not done:
                action = self.select_action(state)
                next_frame, reward, terminated, truncated, info = self.env.step(action.item())
                # Either flag ends the episode; ignoring `truncated` would
                # keep stepping an environment that has already finished.
                done = terminated or truncated

                # Enhanced reward shaping (needs a previous step to compare to)
                if prev_info is not None:
                    reward += self._calculate_strategic_reward(info, prev_info)

                # Store experience; a finished episode has no next state
                next_state = self.stacker.append(next_frame) if not done else None
                self.memory.append(Experience(
                    state, action, next_state,
                    torch.FloatTensor([reward]).to(device),
                    done
                ))

                # Train
                self.optimize_model()

                state = next_state if next_state is not None else state
                prev_info = info
                total_reward += reward

            # Periodic target-network sync
            if episode % 10 == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            # Keep the weights of the best-scoring episode so far
            if total_reward > self.best_reward:
                self.best_reward = total_reward
                torch.save(self.policy_net.state_dict(), 'mario_best.pth')

            print(f"Episode {episode}, Reward: {total_reward}")

    def _calculate_strategic_reward(self, info, prev_info):
        """Return the extra reward for the step from ``prev_info`` to ``info``.

        Args:
            info: info dict of the current step; reads 'status' and 'x_pos'.
            prev_info: info dict of the previous step; same keys.

        Returns:
            +2 when Mario's status improved, -0.1 when x_pos did not change,
            the sum when both apply, otherwise 0.
        """
        reward = 0

        # Reward only an upgrade. A plain "status changed" test would also pay
        # out when Mario shrinks after being hit. Unknown statuses rank lowest.
        rank = self._STATUS_RANK
        if rank.get(info['status'], 0) > rank.get(prev_info['status'], 0):
            reward += 2  # Got bigger

        # Penalty for being stuck
        if info['x_pos'] == prev_info['x_pos']:
            reward -= 0.1

        return reward

# Initialize the environment (restricted to the SIMPLE_MOVEMENT buttons) and
# the enhanced agent.
env = gym_super_mario_bros.make('SuperMarioBros-v0',
                               apply_api_compatibility=True,
                               render_mode='human')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
agent = EnhancedMarioAgent(env)

try:
    # Silent training
    print("Training silently...")
    agent.train(total_episodes=5000)

    # Final output
    print("\nTraining complete. Running best model:")
    # No manual env.step() here: the last training episode has just ended and
    # a finished environment must be reset before it is stepped again.
    # run_best_model() performs that reset itself.
    agent.run_best_model()
finally:
    # Close the emulator even when training is interrupted or fails.
    env.close()

Putting it all together: create the environment and the agent, train it, then run the best saved model:

In [6]:
# Initialize the environment (restricted to the SIMPLE_MOVEMENT buttons) and
# the baseline agent.
env = gym_super_mario_bros.make('SuperMarioBros-v0',
                               apply_api_compatibility=True,
                               render_mode='human')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
agent = MarioAgent(env)

try:
    # Silent training
    print("Training silently...")
    agent.train(total_episodes=5000)

    # Final output
    print("\nTraining complete. Running best model:")
    # No manual env.step() here: the last training episode has just ended and
    # a finished environment must be reset before it is stepped again.
    # run_best_model() performs that reset itself.
    agent.run_best_model()
finally:
    # Close the emulator even when training is interrupted or fails.
    env.close()

  logger.warn(
  logger.warn(
  from .autonotebook import tqdm as notebook_tqdm


Training silently...


KeyboardInterrupt: 

In [None]:
# Flag to keep track of when to restart the game. Once game is done, it must be started
#done = True
# Loop thru all frames of game:
#for step in range (100000):
    # Are we done?
    #if done:
        #env.reset() # Restart for the new game
    # Do random actions and get info back:
    #action = env.action_space.sample() # random action
    #obs, reward, terminated, truncated , info = env.step(action) # 'Pressing a button', step allows to pass action into game
    #done = terminated or truncated
    #env.render() # Display the game on screen
#env.close()

The first thing we want to do is define our neural network.

# WORKS CITED: #
@misc{gym-super-mario-bros,
  author = {Christian Kauten},
  howpublished = {GitHub},
  title = {{S}uper {M}ario {B}ros for {O}pen{AI} {G}ym},
  URL = {https://github.com/Kautenja/gym-super-mario-bros},
  year = {2018},
}