## [TRAIN A MARIO-PLAYING RL AGENT](https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html#train-a-mario-playing-rl-agent)

In [None]:
#%%bash
#pip install gym-super-mario-bros==7.4.0

In [1]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os, copy

In [2]:
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

In [3]:
from nes_py.wrappers import JoypadSpace

In [4]:
import gym_super_mario_bros

### [RL Definitions](https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html#rl-definitions)

**Environment** The world that an agent interacts with and learns from.

**Action** $a$ : How the Agent responds to the Environment. The set of all possible Actions is called action-space.

**State** $s$ : The current characteristic of the Environment. The set of all possible States the Environment can be in is called state-space.

**Reward** $r$ : Reward is the key feedback from Environment to Agent. It is what drives the Agent to learn and to change its future action. An aggregation of rewards over multiple time steps is called Return.

**Optimal Action-Value function** $Q^*(s,a)$ : Gives the expected return if you start in state $s$, take an arbitrary action $a$, and then for each future time step take the action that maximizes returns. $Q$ can be said to stand for the “quality” of the action in a state. We try to approximate this function.

In [5]:
# Create the base Super Mario Bros environment from its registered id.
# NOTE(review): the saved output of this cell shows a `logger.warn(` line — confirm which
# argument triggers it (possibly render_mode='rgb') for the installed gym version.
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v3", render_mode='rgb', apply_api_compatibility=True)

  logger.warn(


In [6]:
# Restrict the agent to two button combinations: index 0 -> ['right'], index 1 -> ['right', 'A']
# (presumably "walk right" and "jump right" — confirm against the nes-py button mapping).
env = JoypadSpace(env, [['right'], ['right', 'A']])

In [None]:
# Reset the environment so that it can be stepped in the next cell.
env.reset()

In [8]:
# Take a single step with action index 0 and unpack the 5-element step result.
next_state, reward, done, trunc, info = env.step(action=0)

In [9]:
# Inspect the raw observation shape, the reward, the done flag and the info dict of that step.
print(f'{next_state.shape} \t {reward} \t {done}\t{info}')

(240, 256, 3) 	 0.0 	 False	{'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'y_pos': 79}


### [Preprocess Environment](https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html#preprocess-environment)

Environment data is returned to the agent in `next_state`. As you saw above, each state is represented by a `[240, 256, 3]` size array (height, width, color channels). Often that is more information than our agent needs; for instance, Mario’s actions do not depend on the color of the pipes or the sky!

In [10]:
class SkipFrame(gym.Wrapper):
    """Repeat every action for ``skip`` consecutive frames and sum the rewards.

    Only the observation, flags and info of the last executed frame are returned.
    """

    def __init__(self, env, skip):
        """Wrap ``env`` so that each action is repeated ``skip`` times.

        Args:
            env: Environment to wrap.
            skip: Number of frames each action is repeated for; must be at least 1.

        Raises:
            ValueError: If ``skip`` is smaller than 1.
        """
        super().__init__(env)
        if skip < 1:
            # With zero iterations step() would have no observation to return.
            raise ValueError(f"skip must be >= 1, got {skip}")
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times and accumulate the reward.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            Tuple ``(obs, total_reward, done, trunc, info)`` where ``total_reward``
            is the sum over the executed frames and the other entries come from
            the last executed frame.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, trunc, info = self.env.step(action)
            total_reward += reward
            # Stop as soon as the episode is over, whether it terminated or was
            # truncated; the environment must be reset before it is stepped again.
            if done or trunc:
                break
        return obs, total_reward, done, trunc, info


In [11]:
# Repeat each chosen action for 4 frames and hand the agent the summed reward.
env = SkipFrame(env, skip=4)

In [12]:
class GrayScaleObservation(gym.ObservationWrapper):
    """Convert each observation to a grayscale float tensor.

    NOTE(review): the declared observation space is a 2-D uint8 Box, while
    ``observation`` returns a float tensor produced by ``T.Grayscale`` —
    confirm whether downstream code relies on the declared space.
    """

    def __init__(self, env):
        """Wrap ``env`` and declare an observation space without the last axis.

        Args:
            env: Environment to wrap.
        """
        super().__init__(env)
        # Keep only the first two dimensions of the wrapped observation shape.
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Build the transform once instead of on every observation.
        self._to_grayscale = T.Grayscale()

    def permute_orientation(self, observation):
        """Move the last axis of a 3-D array to the front and return a float tensor.

        Args:
            observation: 3-D array (presumably [H, W, C] — confirm against the
                wrapped environment).

        Returns:
            ``torch.float`` tensor with axes reordered to (2, 0, 1).
        """
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        """Return the grayscale version of ``observation`` as a float tensor."""
        observation = self.permute_orientation(observation)
        return self._to_grayscale(observation)

In [13]:
# Drop the color information: the wrapper returns grayscale frames.
env = GrayScaleObservation(env)

In [14]:
class ResizeObservation(gym.ObservationWrapper):
    """Resize each observation to a fixed shape and scale its values by 1/255."""

    def __init__(self, env, shape):
        """Wrap ``env`` and declare the resized observation space.

        Args:
            env: Environment to wrap.
            shape: Target size; an int ``n`` means ``(n, n)``, any other value
                is converted with ``tuple(shape)``.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # Keep any trailing dimensions of the wrapped observation space.
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

        # `T.Compose` (capital C) is the torchvision class; build the pipeline once.
        # Normalize(0, 255) subtracts 0 and divides by 255.
        self._transforms = T.Compose(
            [T.Resize(self.shape), T.Normalize(0, 255)]
        )

    def observation(self, observation):
        """Resize and normalize ``observation``, then squeeze dimension 0.

        Args:
            observation: Tensor accepted by the torchvision transforms
                (presumably [1, H, W] from the grayscale wrapper — confirm).

        Returns:
            The transformed tensor with dimension 0 removed if it has size 1.
        """
        return self._transforms(observation).squeeze(0)


In [15]:
# Downsample every frame to 84x84.
env = ResizeObservation(env, shape=84)

In [16]:
# Wrap with gym's FrameStack using num_stack=4, so each observation combines 4 frames.
env = FrameStack(env, num_stack=4)

![](https://pytorch.org/tutorials/_images/mario_env.png)

### [Agent](https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html#agent)

In [None]:
class Mario:
    """Skeleton of the agent; every method is a placeholder that does nothing yet."""

    def __init__(self):
        # `self` is required: without it `Mario()` raises TypeError.
        pass

    def act(self, state):
        """Choose an action for the given state (not implemented yet)."""
        pass

    def cache(self, experience):
        """Add the experience to memory"""
        pass

    def recall(self):
        """Sample experiences from memory"""
        pass

    def learn(self):
        """Update online action value Q function with a batch of experiences"""
        pass