In [1]:
# Import vizdoom for game env
from vizdoom import *
# Import radom action for sampling random actions
import random
# Import time for sleep
import time
# Import numpy for identity matrix
import numpy as np
# Import environment base class from OpenAI gym
from gym import Env
# Import gym spaces for defining the action space, Box returns a n-dim array, Discrete is a set of binary values
from gym.spaces import Discrete, Box
# Import opencv for image processing (grayscaling)
import cv2
# Import matplotlib 
from matplotlib import pyplot as plt

In [None]:
# Setup the game
game = DoomGame()
game.load_config('ViZDoom/gtihub/ViZDoom/scenarios/deadly_corridor_s1.cfg')
game.init()

In [None]:
actions = np.identity(7, dtype=np.uint8)

In [None]:
# loop through the episodes
episodes = 10
for episode in range(episodes):
    game.new_episode()
    while not game.is_episode_finished():
        state = game.get_state()
        img = state.screen_buffer
        info = state.game_variables        
        reward = game.make_action(random.choice(actions),4)
        print('reward', reward)
        time.sleep(0.02)
    print('Result', game.get_total_reward())
    time.sleep(2)

In [None]:
game.close()

In [2]:
class VizDoomEnv(Env):
    # function that is called when we start env
    def __init__(self, render = False, config='ViZDoom/gtihub/ViZDoom/scenarios/deadly_corridor.cfg'):
        # Inherit from Env
        super().__init__()
        
        self.game = DoomGame()
        self.game.load_config(config)

        self.observation_space = Box(low =0, high = 255, shape = (100, 160, 1), dtype = np.uint8)
        self.action_space = Discrete(7)

        # game variables : HEALTH DAMAGE_TAKEN HITCOUNT SELECTED_WEAPON_AMMO
        self.damage_taken = 0
        self.hitcount = 0
        self.ammo = 52

        # render frame logic
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        # start the game
        self.game.init()

        
    # this is how we take a step in the environment    
    def step(self, action):
        # actions = array([[1, 0, 0],
        ##                 [0, 1, 0],
        ##                 [0, 0, 1]], dtype=uint8)
        
        actions = np.identity(7, dtype = np.uint8)

        # take an action with a frameskip of 4 - frameskip is needed for AI to get feedback
        movement_reward = self.game.make_action(actions[action], 4)

        reward = 0
        if self.game.get_state():
            # get the screen image and game variables
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)

            ##### perform reward shaping ####
            game_variables = self.game.get_state().game_variables
            health, damage_taken, hitcount, ammo = game_variables

            # calculate deltas, damage starts with past 10, current frame = 20 delta = -20 + 10 (disint
            damage_taken_delta = self.damage_taken - damage_taken
            self.damage_taken = damage_taken
            hitcount_delta = - self.hitcount + hitcount
            self.hitcount = hitcount
            ammo_delta = - self.ammo + ammo
            self.ammo = ammo

            reward = movement_reward + damage_taken_delta*10 + hitcount_delta*200 + ammo_delta*5
            info = ammo
        # handling game end scenario when get_state and ammo are zeros
        else:
            state = np.zeros(self.observation_space.shape)
            info = 0

        info = {"info":info}
        done = self.game.is_episode_finished()
        return state, reward, done, info

    # reset the instance 
    def reset(self):
        self.game.new_episode()
        state = self.game.get_state().screen_buffer            

        # game variables : HEALTH DAMAGE_TAKEN HITCOUNT SELECTED_WEAPON_AMMO
        self.damage_taken = 0
        self.hitcount = 0
        self.ammo = 52
        
        return self.grayscale(state)
        
 
    # grayscale the game frame and resize it 
    ## observation will be the gameframe
    def grayscale(self, observation):
        ## moveaxis will rearange the order in a way that cvtColor can accept
        ### in this instance from (3, 240, 320) to (240, 320, 3) - color channels at the end
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)

        ##cut the frame down to scale down and reduces the pixels
        resize = cv2.resize(gray, (160, 100), interpolation = cv2.INTER_CUBIC)
        state = np.reshape(resize, (100, 160, 1))
        return state
        
   # this is already defined to render but vizdoom does this for us
    def render():
        pass 
        
    def close(self):
        self.game.close()

In [None]:
env = VizDoomEnv(render = True)

In [15]:
env.close()

In [None]:
# Import Environment checker
from stable_baselines3.common import env_checker

In [None]:
env_checker.check_env(env)

# 3. View State

In [None]:
plt.imshow(cv2.cvtColor(state, cv2.COLOR_BGR2RGB))

# 4. Set up CallBack

In [3]:
# Import os for file navigation
import os
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback

In [4]:
class TrainingLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainingLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [5]:
CHECKPOINT_DIR = './train/train_deadly_corridor'
LOG_DIR = './logs/log_deadly_corridor'

In [6]:
# Setup the call back
callback = TrainingLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# 5 Train Model using Curriculum

In [7]:
# Import PPO (Proximal Policy Optimization) for algorithm 
from stable_baselines3 import PPO 

In [8]:
# Non rendered environment
env = VizDoomEnv(config='ViZDoom/gtihub/ViZDoom/scenarios/deadly_corridor_s1.cfg')

In [9]:
#n_steps = how many time_frames that are passed as a part of initial run - larger the value more info is passed and longer to learn
model = PPO('CnnPolicy', env, tensorboard_log = LOG_DIR, verbose = 1, learning_rate = 0.00001, n_steps = 8192, clip_range=.1, gamma=.95, gae_lambda = .9)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [10]:
#model = PPO.load('./train/train_basic/best_model_100000', tensorboard_log=LOG_DIR)
#model.set_env(env)
#model.learning_rate = 0.0000001

In [11]:
model.learn(total_timesteps = 400000, callback = callback)

Logging to ./logs/log_deadly_corridor\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 205      |
|    ep_rew_mean     | -561     |
| time/              |          |
|    fps             | 26       |
|    iterations      | 1        |
|    time_elapsed    | 311      |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 209          |
|    ep_rew_mean          | -542         |
| time/                   |              |
|    fps                  | 19           |
|    iterations           | 2            |
|    time_elapsed         | 826          |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0028062842 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.94        |
|    explained

<stable_baselines3.ppo.ppo.PPO at 0x26cfa93d1c0>

# 6 Level up and retrain

In [16]:
model.load('./train/train_deadly_corridor/best_model_340000.zip')

<stable_baselines3.ppo.ppo.PPO at 0x26ca2bcd880>

In [17]:
env = VizDoomEnv(config='ViZDoom/gtihub/ViZDoom/scenarios/deadly_corridor_s2.cfg')
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_corridor\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 31.6     |
|    ep_rew_mean     | -89.8    |
| time/              |          |
|    fps             | 26       |
|    iterations      | 1        |
|    time_elapsed    | 305      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 31.2        |
|    ep_rew_mean          | -83.9       |
| time/                   |             |
|    fps                  | 19          |
|    iterations           | 2           |
|    time_elapsed         | 833         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.004194458 |
|    clip_fraction        | 0.00747 

<stable_baselines3.ppo.ppo.PPO at 0x26cfa93d1c0>

In [None]:
# reload model from disk
model = PPO.load('./train/train_basic/best_model_170000')

In [None]:
env = VizDoomEnv(render = True)

In [None]:
# evaluate mean reward for 10 games
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes = 10)

In [None]:
mean_reward