# **Mario**
### This project aims to create a Mario agent that will beat the entirety of Super Mario Bros using Deep Reinforcement Learning and Double Deep Q-Networks

**Eve Collier\
AI II - Spring 2025\
Final Project**

![Mario](https://media3.giphy.com/media/v1.Y2lkPTc5MGI3NjExdmFvcTg0dDBnbnQ1b3BjbzlnemJnbGVrdm02a29pbDF4a3ExcGF2eiZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/DqqHabAaTHRII/giphy.gif)

First, we need to install the game and import some stuff:

In [3]:
!pip install gym_super_mario_bros==7.3.0 nes_py torch opencv-python
import gym
import gym_super_mario_bros
import numpy as np
import torch
import torch.nn as nn
import cv2
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from collections import deque
from gym import Wrapper




In [4]:
!pip install gym_super_mario_bros==7.3.0 nes_py
import gym_super_mario_bros
import nes_py
from nes_py.wrappers import JoypadSpace # Wrap the game 
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT # simplify what a Mario agent can do, 256 actions it can do otherwise



Cool.
Now, wrappers:

In [5]:
# Custom Wrappers
class SkipFrame(Wrapper):
    """Repeat every action for ``skip`` consecutive frames.

    The rewards of the repeated frames are summed and only the last
    observation/info pair is returned. Repetition stops early as soon as the
    wrapped environment reports the episode is over.

    Args:
        env: Environment to wrap.
        skip: Number of times each action is applied; must be at least 1.

    Raises:
        ValueError: If ``skip`` is smaller than 1 (``step`` would otherwise
            have no observation to return).
    """

    def __init__(self, env, skip=4):
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip!r}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times.

        Returns:
            A 4-tuple ``(obs, total_reward, done, info)`` regardless of
            whether the wrapped environment uses the 4- or 5-value step API.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            step_result = self.env.step(action)
            if len(step_result) == 4:  # Old API
                obs, reward, done, info = step_result
            else:  # New API: fold terminated/truncated into one flag
                obs, reward, terminated, truncated, info = step_result
                done = terminated or truncated

            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments.

        The return value of the wrapped ``reset`` is passed through unchanged.
        """
        return self.env.reset(**kwargs)

class GrayScaleObservation(Wrapper):
    """Convert RGB frames to grayscale and resize them.

    Args:
        env: Environment to wrap; its observations are passed to
            ``cv2.cvtColor(..., cv2.COLOR_RGB2GRAY)``.
        shape: ``(height, width)`` of the emitted frames. Defaults to
            ``(84, 84)``.
    """

    def __init__(self, env, shape=(84, 84)):
        super().__init__(env)
        self._shape = tuple(shape)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=self._shape, dtype=np.uint8)

    def observation(self, obs):
        """Return ``obs`` as a grayscale frame of the configured shape."""
        if isinstance(obs, tuple):  # Handle newer gym API: (obs, info)
            obs = obs[0]
        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        height, width = self._shape
        # cv2.resize takes the target size as (width, height).
        obs = cv2.resize(obs, (width, height), interpolation=cv2.INTER_AREA)
        return obs

    def step(self, action):
        """Step the wrapped env and convert the resulting observation.

        Returns:
            A 4-tuple ``(obs, reward, done, info)`` for both the 4- and
            5-value step APIs.
        """
        step_result = self.env.step(action)
        if len(step_result) == 4:  # Old API
            obs, reward, done, info = step_result
        else:  # New API
            obs, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        return self.observation(obs), reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env and return only the converted observation.

        Keyword arguments are forwarded to the wrapped ``reset``. If that
        returns a tuple, everything after the first element is dropped.
        """
        obs = self.env.reset(**kwargs)
        return self.observation(obs)

class FrameStack(Wrapper):
    """Stack the most recent ``num_stack`` observations along a new axis 0.

    Args:
        env: Environment to wrap.
        num_stack: Number of consecutive frames kept in the stack.
    """

    def __init__(self, env, num_stack=4):
        super().__init__(env)
        self.num_stack = num_stack
        self.frames = deque(maxlen=num_stack)
        # Declare the space from the wrapped env's frame shape so it matches
        # what np.stack actually produces; fall back to the 84x84 frames this
        # notebook uses when the wrapped space exposes no shape.
        frame_shape = getattr(env.observation_space, "shape", None) or (84, 84)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(num_stack, *frame_shape), dtype=np.uint8)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the stack with the first frame.

        Keyword arguments are forwarded to the wrapped ``reset``.

        Returns:
            The stacked frames (``num_stack`` copies of the first frame).
        """
        obs = self.env.reset(**kwargs)
        if isinstance(obs, tuple):  # Newer gym API returns (obs, info)
            obs = obs[0]
        # The deque has maxlen=num_stack, so this replaces any old frames.
        for _ in range(self.num_stack):
            self.frames.append(obs)
        return np.stack(self.frames)

    def step(self, action):
        """Step the wrapped env and push the new frame onto the stack.

        Returns:
            A 4-tuple ``(stacked_frames, reward, done, info)`` for both the
            4- and 5-value step APIs.
        """
        step_result = self.env.step(action)
        if len(step_result) == 4:  # Old API
            obs, reward, done, info = step_result
        else:  # New API
            obs, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        self.frames.append(obs)  # oldest frame is evicted automatically
        return np.stack(self.frames), reward, done, info

Define our Neural Net

In [6]:
# Neural Network 
class MarioNet(nn.Module):
    """Convolutional network mapping stacked frames to one value per action.

    Three conv layers are followed by two fully connected layers. The first
    fully connected layer expects 7*7*64 features, which is what the conv
    stack produces for 84x84 inputs.

    Args:
        input_channels: Number of channels of the input tensor.
        num_actions: Size of the output vector.
    """

    def __init__(self, input_channels, num_actions):
        super().__init__()
        flat_features = 7 * 7 * 64
        feature_layers = [
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        head_layers = [
            nn.Flatten(),
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        ]
        # Kept as a single Sequential so parameter names stay `online.<idx>.*`.
        self.online = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Run ``x`` through the network and return the per-action outputs."""
        action_values = self.online(x)
        return action_values

Now our Mario agent:

In [7]:
# Mario Agent with jumping heuristic
class Mario:
    """Agent that picks the network's best action, with a jump heuristic.

    Args:
        state_shape: Shape of one state; ``state_shape[0]`` is used as the
            network's input channel count.
        num_actions: Number of discrete actions the network scores.
    """

    # Action indices the heuristic samples from. The original code calls
    # these "jumping actions".
    # NOTE(review): assuming the SIMPLE_MOVEMENT action list, index 3 looks
    # like right+B (run) rather than a jump -- confirm against the library.
    JUMP_ACTIONS = (2, 3, 4, 5)
    JUMP_PROBABILITY = 0.6  # chance of forcing a jump when the heuristic fires
    JUMP_COOLDOWN = 15  # act() calls before the next forced jump

    def __init__(self, state_shape, num_actions):
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.net = MarioNet(state_shape[0], num_actions).to(self.device)
        self.jump_cooldown = 0
        self.last_status = 'small'
        self.last_x_pos = 0
        self.stuck_counter = 0

    def act(self, state, info=None):
        """Choose an action for ``state``.

        The action with the highest network output is used unless the jump
        heuristic fires, in which case a random entry of ``JUMP_ACTIONS``
        replaces it.

        Previously an unconditional ``np.random.choice([2, 3, 4, 5])`` just
        before the return overwrote every decision made above it, so the
        network and the cooldown logic never influenced the result. That
        override has been removed.

        Args:
            state: NumPy array of stacked frames with values in 0..255.
            info: Optional info dict from the previous step; its ``status``
                and ``x_pos`` entries feed the heuristic.

        Returns:
            The chosen action index as a plain ``int``.
        """
        # Update status and position tracking if info is provided
        if info is not None:
            self.update_status(info)
            current_x_pos = info.get('x_pos', 0)

            # Track if we're stuck (x position unchanged since the last call)
            if current_x_pos == self.last_x_pos:
                self.stuck_counter += 1
            else:
                self.stuck_counter = 0
            self.last_x_pos = current_x_pos

        # Scale pixels to [0, 1] and add the batch dimension
        state_t = torch.from_numpy(state).float().to(self.device) / 255.0
        state_t = state_t.unsqueeze(0)

        with torch.no_grad():
            action_values = self.net(state_t)

        # Greedy action: index of the highest network output
        action = torch.argmax(action_values).item()

        # Jump heuristic: when off cooldown and either the status is 'small'
        # or 'tall', or x_pos has been unchanged for more than 10 calls,
        # force a random jump action with probability JUMP_PROBABILITY.
        if self.jump_cooldown <= 0:
            vulnerable = self.last_status in ('small', 'tall')
            if vulnerable or self.stuck_counter > 10:
                if np.random.random() < self.JUMP_PROBABILITY:
                    action = int(np.random.choice(self.JUMP_ACTIONS))
                    self.jump_cooldown = self.JUMP_COOLDOWN

        if self.jump_cooldown > 0:
            self.jump_cooldown -= 1
        return action

    def update_status(self, info):
        """Remember ``info['status']``, defaulting to ``'small'`` if absent."""
        self.last_status = info.get('status', 'small')

Putting it all together:

In [8]:
# Initialize Environment with rendering
env = gym_super_mario_bros.make('SuperMarioBros-v0', 
                               apply_api_compatibility=True,
                               render_mode='human')
# Wrapper order matters: restrict the action set, repeat each action for
# 4 frames, convert frames to 84x84 grayscale, then stack the last 4 frames.
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = FrameStack(env, num_stack=4)

# Initialize Mario
mario = Mario(state_shape=(4, 84, 84), num_actions=env.action_space.n)

# Main game loop
done = True  # forces a reset on the first iteration
try:
    for step in range(100000):
        if done:
            # reset() yields only the stacked frames, so start with empty info
            state, info = env.reset(), {}

        action = mario.act(state, info)  # Pass info to act()

        next_state, reward, done, info = env.step(action)
        state = next_state

        env.render()
finally:
    # Always release the emulator/window, even when the run is interrupted
    # (e.g. KeyboardInterrupt from stopping the notebook cell).
    env.close()

KeyboardInterrupt: 