# Street Fighter Tutorial
This notebook accompanies the YouTube tutorial on <a href='https://www.youtube.com/c/NicholasRenotte'>Nicholas Renotte</a>

# Setup StreetFighter

In [None]:
# !pip install gym[all] gym-retro
# !pip install pygame
# !pip install opencv-python
# !pip install matplotlib
# !pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# !pip install stable-baselines3[extra] optuna

In [1]:
# Import retro to play Street Fighter using a ROM
import retro
# Import time to slow down game
import time

In [None]:
# See the different retro games
retro.data.list_games()

In [None]:
# python -m retro.import . # Run this from the roms folder, or where you have your game roms 

# Setup Environment
## What we are going to do! FUNNN
- Observation Preprocess - grayscale (DONE), frame delta, resize the frame so we have less pixels  (DONE) 
- Filter the action - parameter DONE
- Reward function - set this to the score

In [2]:
# Import environment base class for a wrapper 
from gym import Env 
# Import the space shapes for the environment
from gym.spaces import MultiBinary, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import opencv for grayscaling
import cv2
# Import matplotlib for plotting the image
from matplotlib import pyplot as plt

In [3]:
# Create custom environment 
class StreetFighter(Env): 
    def __init__(self):
        super().__init__()
        # Specify action space and observation space 
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Startup and instance of the game 
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
    
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs 
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def preprocess(self, observation): 
        # Grayscaling 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize 
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84,84,1))
        return channels 
    
    def step(self, action): 
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 
        
        # Frame delta 
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs 
        
        # Reshape the reward function
        reward = info['score'] - self.score 
        self.score = info['score'] 
        
        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()

# Hyperparameter tune

In [4]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

In [5]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [6]:
# Function to return test hyperparameters - define the object function
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [7]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [8]:
# Run a training loop and return mean reward 
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial) 

        # Create environment 
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create algo 
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        #model.learn(total_timesteps=30000)
        model.learn(total_timesteps=50000)

        # Evaluate model 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        print(e)
        return -1000

In [9]:
# Creating the experiment 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=50, n_jobs=1)
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

[32m[I 2022-02-20 12:11:11,581][0m A new study created in memory with name: no-name-e8703b8c-eaea-49b4-8f25-af561edabea7[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4907 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 12:17:26,992][0m Trial 0 finished with value: 2700.0 and parameters: {'n_steps': 4907, 'gamma': 0.8128099333446631, 'learning_rate': 2.5434615459478538e-05, 'clip_range': 0.29328195874236473, 'gae_lambda': 0.8650270744429742}. Best is trial 0 with value: 2700.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7495 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 12:22:55,060][0m Trial 1 finished with value: 2000.0 and parameters: {'n_steps': 7495, 'gamma': 0.8421510608912435, 'learning_rate': 5.920472143790282e-05, 'clip_range': 0.28111880704291303, 'gae_lambda': 0.9011007801571661}. Best i

Expected parameter logits (Tensor of shape (64, 12)) of distribution Bernoulli(logits: torch.Size([64, 12])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5380 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 16:09:06,761][0m Trial 40 finished with value: 2000.0 and parameters: {'n_steps': 5380, 'gamma': 0.9167225400922788, 'learning_rate': 4.93084417949993e-05, 'clip_range': 0.3214016980588021, 'gae_lambda': 0.8712019866883192}. Best is trial 29 with value: 35300.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7308 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 16:14:17,286][0m Trial 41 finished with value: 2300.0 and parameters: {'n_steps': 7308, 'gamma': 0.9107392037741536, 'learning_rate': 1.043060830123758e-05, 'clip_range': 0.3559459070300301, 'gae_lambda': 0.8225105211921662}. Best is trial 29 with value: 35300.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=66

Expected parameter logits (Tensor of shape (64, 12)) of distribution Bernoulli(logits: torch.Size([64, 12])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6129 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 16:32:00,094][0m Trial 45 finished with value: 2000.0 and parameters: {'n_steps': 6129, 'gamma': 0.9553057708166062, 'learning_rate': 1.5071874330716572e-05, 'clip_range': 0.326947936673088, 'gae_lambda': 0.9008657589294249}. Best is trial 29 with value: 35300.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5923 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-02-20 16:37:31,082][0m Trial 46 finished with value: 500.0 and parameters: {'n_steps': 5923, 'gamma': 0.8090218293939454, 'learning_rate': 1.8172980184505882e-05, 'clip_range': 0.3890063519160743, 'gae_lambda': 0.885893420469851}. Best is trial 29 with value: 35300.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=80

In [10]:
study.best_params

{'n_steps': 5024,
 'gamma': 0.8189709083805853,
 'learning_rate': 2.650192325308733e-05,
 'clip_range': 0.2947393915843855,
 'gae_lambda': 0.9161058440080572}

In [11]:
study.best_trial

FrozenTrial(number=29, values=[35300.0], datetime_start=datetime.datetime(2022, 2, 20, 14, 59, 27, 584115), datetime_complete=datetime.datetime(2022, 2, 20, 15, 6, 21, 910247), params={'n_steps': 5024, 'gamma': 0.8189709083805853, 'learning_rate': 2.650192325308733e-05, 'clip_range': 0.2947393915843855, 'gae_lambda': 0.9161058440080572}, distributions={'n_steps': IntUniformDistribution(high=8192, low=2048, step=1), 'gamma': LogUniformDistribution(high=0.9999, low=0.8), 'learning_rate': LogUniformDistribution(high=0.0001, low=1e-05), 'clip_range': UniformDistribution(high=0.4, low=0.1), 'gae_lambda': UniformDistribution(high=0.99, low=0.8)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=29, state=TrialState.COMPLETE, value=None)

In [12]:
model = PPO.load(os.path.join(OPT_DIR, 'trial_29_best_model.zip'))

# Setup Callback

In [6]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [7]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [8]:
CHECKPOINT_DIR = './train/'

In [9]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [10]:
# Create environment 
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [18]:
model_params = study.best_params
model_params['n_steps'] = 4992  # set n_steps to 7488 or a factor of 64
model_params['learning_rate'] = 5e-7
model_params

{'n_steps': 4992,
 'gamma': 0.8189709083805853,
 'learning_rate': 5e-07,
 'clip_range': 0.2947393915843855,
 'gae_lambda': 0.9161058440080572}

In [12]:
#model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [13]:
# Reload previous weights from HPO
model.load(os.path.join('train', 'best_model_5000000.zip'))

<stable_baselines3.ppo.ppo.PPO at 0x20ba6a80088>

In [14]:
# Kick off training 
model.learn(total_timesteps=5000000, callback=callback)
# model.learn(total_timestep=5000000) 

Logging to ./logs/PPO_52
-----------------------------
| time/              |      |
|    fps             | 276  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 216        |
|    iterations           | 2          |
|    time_elapsed         | 18         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.60310304 |
|    clip_fraction        | 0.629      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.73      |
|    explained_variance   | 2.46e-05   |
|    learning_rate        | 0.0003     |
|    loss                 | 5.34e+04   |
|    n_updates            | 10         |
|    policy_gradient_loss | 0.106      |
|    value_loss           | 5.15e+06   |
----------------------------------------
---------------------------

# Evaluate the Model

In [None]:
model = PPO.load('./opt/trial_5_best_model.zip')

In [None]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)

In [None]:
mean_reward

# Test out the Model

In [None]:
obs = env.reset()

In [None]:
obs.shape

In [None]:
env.step(model.predict(obs)[0])

In [None]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        print(reward)