# Mario with Policy Gradient

In [12]:
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from tqdm import trange
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import random, datetime, os, copy

# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

gym.__version__

'0.26.0'

In [18]:
# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace
# Super Mario environment for OpenAI Gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT,COMPLEX_MOVEMENT 
# Build world 1-1 with off-screen ("rgb_array") rendering and the gym>=0.26
# compatibility shim, then restrict the controller to the SIMPLE_MOVEMENT set.
env = JoypadSpace(
    gym_super_mario_bros.make(
        "SuperMarioBros-1-1-v1",
        render_mode='rgb_array',
        apply_api_compatibility=True,
    ),
    SIMPLE_MOVEMENT,
)
n_actions = env.action_space.n
print("Action :", n_actions)

Action : 7


In [19]:
# Reset once and inspect the raw frame; the trailing 3 in the shape is the
# colour-channel axis of the unprocessed RGB image.
obs, info = env.reset()
(obs.shape, info)

((240, 256, 3), {})

## Test that the Gym environment works

In [20]:
# Smoke test: drive the emulator with random actions for a fixed number of
# steps to confirm that stepping and rendering work without raising.
env.reset()
for step in range(500):
    # gym 0.26 step API: (obs, reward, terminated, truncated, info).
    next_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
    env.render()
    # An episode can end by termination OR truncation; restart in either case.
    done = terminated or truncated
    if done:
        # reset() returns an (observation, info) pair, so unpack it.
        next_state, info = env.reset()

# The environment is closed here, so later cells must build a fresh one.
env.close()
print('Done')

Done


## Observation Preprocessing Wrappers (Frame Skip, Grayscale, Resize)

In [21]:
class SkipFrame(gym.Wrapper):
    """Repeat each action for up to `skip` frames and return only the last frame."""

    def __init__(self, env, skip):
        """Return only every `skip`-th frame.

        Args:
            env: environment to wrap.
            skip: number of times each action is repeated; must be at least 1.

        Raises:
            ValueError: if `skip` is smaller than 1 (step() would otherwise have
                no observation to return).
        """
        if skip < 1:
            raise ValueError("skip must be >= 1, got {}".format(skip))
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward.

        Returns:
            The last observation seen, the summed reward over the repeated
            steps, and the termination flag, truncation flag and info dict of
            the last inner step.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            # Stop as soon as the episode is over for either reason; stepping
            # a terminated or truncated environment again is not valid.
            if done or trunk:
                break
        return obs, total_reward, done, trunk, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Convert [H, W, C] colour frames into grayscale float tensors."""

    def __init__(self, env):
        """Wrap `env` and declare a 2-D (height, width) observation space."""
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        # NOTE(review): the declared space is uint8 (H, W), while observation()
        # returns a float tensor produced by torchvision; the declaration is
        # kept unchanged because ResizeObservation derives its own shape from it.
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # The transform has no per-frame state, so build it once here instead
        # of on every observation.
        self._to_grayscale = T.Grayscale()

    def permute_orientation(self, observation):
        """Return `observation` as a float tensor with the channel axis first."""
        # permute [H, W, C] array to [C, H, W] tensor
        observation = np.transpose(observation, (2, 0, 1))
        # copy() gives torch a contiguous array that it owns after the transpose.
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        """Reorder the frame to channel-first and apply the grayscale transform."""
        observation = self.permute_orientation(observation)
        return self._to_grayscale(observation)

class ResizeObservation(gym.ObservationWrapper):
    """Resize each observation to a fixed size and divide pixel values by 255."""

    def __init__(self, env, shape):
        """Wrap `env`.

        Args:
            env: environment to wrap.
            shape: target size; an int `n` means `(n, n)`, any other value is
                converted with `tuple(shape)`.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # Keep any axes beyond the first two of the wrapped space unchanged.
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

        # Build the pipeline once rather than on every observation.
        # Normalize(0, 255) computes (x - 0) / 255.
        self._transforms = T.Compose(
            [T.Resize(self.shape), T.Normalize(0, 255)]
        )

    def observation(self, observation):
        """Resize and rescale `observation`, then drop a leading axis of size 1."""
        return self._transforms(observation).squeeze(0)

In [22]:
# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace
# Super Mario environment for OpenAI Gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT,COMPLEX_MOVEMENT 
# The smoke-test cell closed its environment, so construct a fresh one here
# with the same level, render mode and action set.
env = JoypadSpace(
    gym_super_mario_bros.make(
        "SuperMarioBros-1-1-v1",
        render_mode='rgb_array',
        apply_api_compatibility=True,
    ),
    SIMPLE_MOVEMENT,
)
n_actions = env.action_space.n
print("Action :", n_actions)

# Confirm the raw observation is still an RGB frame (last axis = 3 channels).
obs, info = env.reset()
(obs.shape, info)

Action : 7


((240, 256, 3), {})

In [23]:
# Preprocessing stack, innermost first: repeat every action for 4 frames,
# convert to grayscale, resize to 84x84 and divide by 255, then stack the 4
# most recent frames into one observation.
# NOTE: this cell re-wraps `env` in place; re-create `env` before running it again.
env = FrameStack(ResizeObservation(GrayScaleObservation(SkipFrame(env, skip=4)), shape=84), num_stack=4)

# Seed every Python-side random number generator this notebook imports.
# torch.random.manual_seed is the same function as torch.manual_seed, so one
# call is enough.
# NOTE(review): the emulator and the action space are not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

obs, info = env.reset()
obs.shape, info  # stacked frames: (num_stack, height, width)

((4, 84, 84), {})

In [24]:
import torch
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

## Neural Network

In [25]:
class MarioSolver:
    """Policy-gradient agent: a convolutional policy network plus per-episode buffers.

    The network maps a stack of 4 frames to a probability distribution over
    actions. During an episode the caller appends the log-probability of each
    sampled action to `episode_actions` and the reward to `episode_rewards`;
    `backward()` then performs one gradient update and clears both buffers.

    Relies on the module-level names `device`, `gamma` and (only when
    `n_actions` is not given) `env`.
    """

    def __init__(self, learning_rate, n_actions=None):
        """Build the policy network and its Adam optimizer.

        Args:
            learning_rate: step size passed to Adam.
            n_actions: size of the output layer; defaults to the action count
                of the module-level `env`.
        """
        if n_actions is None:
            n_actions = env.action_space.n
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            # 3136 = 64 channels * 7 * 7, the conv output size for 84x84 input.
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            nn.Softmax(dim=-1)
        ).to(device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate, eps=1e-4)
        self.reset()

    def forward(self, x):
        """Return the action probabilities the network assigns to batch `x`."""
        return self.model(x)

    def reset(self):
        """Clear the per-episode log-probability and reward buffers."""
        self.episode_actions = torch.tensor([], requires_grad=True).to(device)
        self.episode_rewards = []

    def save_checkpoint(self, directory, episode):
        """Write the model weights to `<directory>/checkpoint_<episode>.pth`."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, 'checkpoint_{}.pth'.format(episode))
        torch.save(self.model.state_dict(), f=filename)
        print('Checkpoint saved to \'{}\''.format(filename))

    def load_checkpoint(self, directory, filename):
        """Load model weights and return the episode number encoded in `filename`.

        `filename` is expected to look like `checkpoint_<episode>.pth`; the
        episode number is the text between the last underscore and the
        extension.

        Raises:
            ValueError: if that text is not an integer.
        """
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(os.path.join(directory, filename), map_location=device)
        self.model.load_state_dict(state_dict)
        print('Resuming training from checkpoint \'{}\'.'.format(filename))
        stem = os.path.splitext(filename)[0]
        return int(stem.rsplit('_', 1)[-1])

    def backward(self):
        """Run one policy-gradient update from the finished episode, then reset.

        Rewards are turned into discounted returns (using the module-level
        `gamma`), standardized, and used to weight the stored log-probabilities.
        Episodes with fewer than two steps are skipped: the standard deviation
        of a single return is NaN, and stepping the optimizer on a NaN loss
        would overwrite the weights with NaN.
        """
        if len(self.episode_rewards) < 2:
            self.reset()
            return

        # Accumulate discounted returns from the last step backwards.
        future_reward = 0
        rewards = []
        for r in self.episode_rewards[::-1]:
            future_reward = r + gamma * future_reward
            rewards.append(future_reward)
        rewards = torch.tensor(rewards[::-1], dtype=torch.float32).to(device)
        # Standardize the returns; eps guards against a zero standard deviation.
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
        # Negated so that minimizing the loss raises the probability of
        # actions followed by above-average returns.
        loss = torch.sum(torch.mul(self.episode_actions, rewards).mul(-1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.reset()

## Model

In [26]:
from torch.distributions import Categorical

# --- Hyperparameters ---
gamma = 0.95       # discount factor read by MarioSolver.backward()
batch_size = 10    # number of episodes averaged in each progress report

# --- Checkpointing ---
save_directory = "./mario_pg"
load_filename = None  # name of a checkpoint file inside save_directory to resume from

# --- Running statistics ---
episode = 0
batch_rewards = []
all_episode_rewards = []
all_mean_rewards = []

# --- Agent ---
model = MarioSolver(learning_rate=0.00025)
if load_filename is not None:
    # Resume the episode counter from the number stored in the file name.
    episode = model.load_checkpoint(save_directory, load_filename)

## Train

In [27]:
# Train until interrupted: one policy update per finished episode, a progress
# line every `batch_size` episodes, a plot and a checkpoint every 500 episodes.
while True:
    observation, info = env.reset()
    done = False
    while not done:
        # No env.render() here: in 'rgb_array' mode it only returns a frame,
        # which this loop never used.
        observation = torch.tensor(observation.__array__()).unsqueeze(0).to(device)
        distribution = Categorical(model.forward(observation))
        action = distribution.sample()
        observation, reward, terminated, truncated, info = env.step(action.item())
        # The episode is over on termination OR truncation.
        done = terminated or truncated
        model.episode_actions = torch.cat([model.episode_actions, distribution.log_prob(action).reshape(1)])
        model.episode_rewards.append(reward)

    # Record the episode return before backward() clears the reward buffer.
    episode_return = np.sum(model.episode_rewards)
    all_episode_rewards.append(episode_return)
    batch_rewards.append(episode_return)
    model.backward()
    episode += 1

    if episode % batch_size == 0:
        print('Batch: {}, average reward: {}'.format(episode // batch_size, np.array(batch_rewards).mean()))
        batch_rewards = []
        all_mean_rewards.append(np.mean(all_episode_rewards[-batch_size:]))
        plt.plot(all_mean_rewards)
        if episode % 500 == 0 and save_directory is not None:
            # The plot is written before the first checkpoint, so the
            # directory may not exist yet; create it here.
            os.makedirs(save_directory, exist_ok=True)
            plt.savefig("{}/mean_reward_{}.png".format(save_directory, episode))
        plt.clf()
    if episode % 500 == 0 and save_directory is not None:
        model.save_checkpoint(save_directory, episode)

Batch: 1, average reward: 662.7
Batch: 2, average reward: 799.1
Batch: 3, average reward: 643.4
Batch: 4, average reward: 569.9
Batch: 5, average reward: 663.5
Batch: 6, average reward: 666.8
Batch: 7, average reward: 1142.7
Batch: 8, average reward: 463.7
Batch: 9, average reward: 631.8
Batch: 10, average reward: 457.9
Batch: 11, average reward: 667.8
Batch: 12, average reward: 808.3
Batch: 13, average reward: 659.1
Batch: 14, average reward: 736.4
Batch: 15, average reward: 845.2
Batch: 16, average reward: 743.4
Batch: 17, average reward: 626.7
Batch: 18, average reward: 873.7
Batch: 19, average reward: 760.9
Batch: 20, average reward: 613.4
Batch: 21, average reward: 600.8
Batch: 22, average reward: 666.9
Batch: 23, average reward: 607.1
Batch: 24, average reward: 746.2
Batch: 25, average reward: 978.7
Batch: 26, average reward: 771.6
Batch: 27, average reward: 824.8
Batch: 28, average reward: 609.6
Batch: 29, average reward: 690.4
Batch: 30, average reward: 675.6
Batch: 31, average

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>