## Initialize VizDoom

In [16]:
#necessary
!pip install vizdoom
!pip install opencv-python
!pip install pandas



In [17]:
# import VizDoom for game env
from vizdoom import *
# Import random for action sampling
import random
# Import time for sleeping
import time
# import numpy for identity matrix
import numpy as np

from matplotlib import pyplot as plt

## Make it a Gym Env

In [18]:
# Import environment base class from OpenAI Gym
from gymnasium import Env
# Import gym spaces
from gymnasium.spaces import Discrete, Box
# Import Opencv for greyscaling observations
import cv2

LEVEL = 'defend_the_center'
DOOM_SKILL = ''

In [19]:
# Create VizDoom OpenAI Gym Environment
class VizDoomGym(Env): 
    def __init__(self, render=False):
        """
        Function called when we start the env.
        """

        # Inherit from Env
        super().__init__()
        
        # Set up game
        self.game = DoomGame()
        self.game.load_config('VizDoom/scenarios/defend_the_center.cfg')
        

        # Whether we want to render the game 
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        # Start the game
        self.game.init()
        
        # Create action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)

    
    def step(self, action):
        """
        How we take a step in the environment.
        """
        # Specify action and take step
        actions = np.identity(3, dtype=np.uint8)
        prev_kill_count = self.game.get_game_variable(GameVariable.KILLCOUNT) if self.game.get_state() else 0
        reward = self.game.make_action(actions[action], 6)  # get action using index -> left, right, shoot

        # Check for new kills
        new_kill_count = self.game.get_game_variable(GameVariable.KILLCOUNT) if self.game.get_state() else 0
        if new_kill_count > prev_kill_count:
            reward *= 10  # Multiply reward by 10 if a kill has been made

        reward += 1  # Add a constant reward of 1 at every timestep

        # Get all the other stuff we need to return 
        if self.game.get_state():  # if nothing is
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)  # Apply Grayscale
            ammo = self.game.get_state().game_variables[0] 
            info = {"ammo": ammo, "kills": new_kill_count}
        # If we dont have anything turned from game.get_state
        else:
            # Return a numpy zero array
            state = np.zeros(self.observation_space.shape)
            # Return info (game variables) as zero
            info = {"ammo": 0, "kills": 0}

        done = self.game.is_episode_finished()
        truncated = False  # Assuming it's not truncated, modify if applicable
        return state, reward, done, info


    
    def render(self):
        """
        Define how to render the game environment.
        """
        pass

    
    def reset(self, seed=None):
        """
        Function for defining what happens when we start a new game.
        """
        if seed is not None:
            self.game.set_seed(seed)
            
        self.game.new_episode()
        state = self.game.get_state().screen_buffer  # Apply Grayscale

        return self.grayscale(state), {}

    
    def grayscale(self, observation):
        """
        Function to grayscale the game frame and resize it.
        observation: gameframe
        """
        # Change colour channels 
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)

        # Reduce image pixel size for faster training
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize,(100, 160,1))
        return state

    def close(self):
        """
        Call to close down the game.
        """
        self.game.close()


## Custom PPO model

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

In [21]:

class ActorCriticNetwork(nn.Module):
    def __init__(self, in_channels, n_output, gae_lambda):
        super(ActorCriticNetwork, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Use the conv2d_size_out function to compute the feature size
        def conv2d_size_out(size, kernel_size, stride):
            return (size - kernel_size) // stride + 1
        
        # Calculate output dimensions for both width and height
        height = conv2d_size_out(conv2d_size_out(conv2d_size_out(100, 8, 4), 4, 2), 3, 1)
        width = conv2d_size_out(conv2d_size_out(conv2d_size_out(160, 8, 4), 4, 2), 3, 1)
        self.feature_count = 64 * height * width

        self.fc = nn.Linear(self.feature_count, 512)
        self.actor = nn.Linear(512, n_output)
        self.critic = nn.Linear(512, 1)
        self.gae_lambda = gae_lambda 

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.reshape(-1, self.feature_count)  # Use reshape instead of view
        x = F.relu(self.fc(x))
        action_probs = F.softmax(self.actor(x), dim=1)
        value = self.critic(x)
        return action_probs, value

## Training loop

In [22]:
import os
import pandas as pd
from IPython.display import clear_output

def train(env, model, num_timesteps, device, batch_size=16, save_dir="E:\RLModelTraining\Doom_PPO", save_interval=100):
    optimizer = optim.Adam(model.parameters(), lr=0.0000000000001)
    model.to(device)
    gamma = 0.95

    total_timesteps = 0
    total_batches = 0
    episode_lengths = []
    episode_rewards = []

    checkpoint_timesteps = []
    checkpoint_avg_rewards = []
    checkpoint_avg_lengths = []

    while total_timesteps < num_timesteps:
        state = env.reset()
        state_array = np.array(state[0])
        state = torch.from_numpy(state_array).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
        done = False
        total_reward = 0
        episode_length = 0

        while not done and total_timesteps < num_timesteps:
            policy_dist, value = model(state)
            action = policy_dist.multinomial(num_samples=1).detach()
            next_state, reward, done, _ = env.step(action.item())
            next_state = torch.from_numpy(next_state).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)

            total_reward += reward
            episode_length += 1
            total_timesteps += 1

            # Calculate and update loss here

            state = next_state

            if total_timesteps % batch_size == 0:
                total_batches += 1
                episode_lengths.append(episode_length)
                episode_rewards.append(total_reward)

                print(f"Batch {total_batches}: Length = {episode_length}, Reward = {total_reward}")

                # Calculate running averages of episode length and reward
                avg_episode_lengths = [np.mean(episode_lengths)]
                avg_episode_rewards = [np.mean(episode_rewards)]

                checkpoint_timesteps.append(total_timesteps)
                checkpoint_avg_rewards.append(avg_episode_rewards[-1])
                checkpoint_avg_lengths.append(avg_episode_lengths[-1])

                clear_output(wait=True)

                # Plotting
                fig, ax1 = plt.subplots()

                ax1.set_xlabel('Number of Timesteps')
                ax1.set_ylabel('Mean Episode Length', color='tab:blue')
                ax1.plot(checkpoint_timesteps, checkpoint_avg_lengths, color='tab:blue', label='Episode Length')
                ax1.tick_params(axis='y', labelcolor='tab:blue')

                ax2 = ax1.twinx()
                ax2.set_ylabel('Mean Reward per Episode', color='tab:red')
                ax2.plot(checkpoint_timesteps, checkpoint_avg_rewards, color='tab:red', label='Episode Reward')
                ax2.tick_params(axis='y', labelcolor='tab:red')

                fig.tight_layout()
                ax1.legend(loc='upper right')
                ax2.legend(loc='upper left')
                plt.title('Training Curve')

                # Save the plot with batch name
                plot_save_path = os.path.join(save_dir, f"plot_batch_{total_batches}_timestep_batch_{total_batches}.png")
                plt.savefig(plot_save_path)
                print(f"Plot saved at batch {total_batches} to {plot_save_path}")

                # Save the model
                model_save_path = os.path.join(save_dir, f"model_batch_{total_batches}_timestep_batch_{total_batches}.pt")
                torch.save(model.state_dict(), model_save_path)
                print(f"Model saved at timestep {total_timesteps} to {model_save_path}")

                plt.show()

    return pd.DataFrame({
        "Timestep": checkpoint_timesteps,
        "Mean Reward": checkpoint_avg_rewards,
        "Mean Episode Length": checkpoint_avg_lengths
    })

if __name__ == "__main__":
    env = VizDoomGym(render=False)
    in_channels = 1  # Assuming grayscale input
    n_actions = env.action_space.n
    gae_lambda = .9
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ActorCriticNetwork(in_channels, n_actions, gae_lambda)
    num_timesteps = 500000
    training_data = train(env, model, num_timesteps, device)


Plot saved at batch 56 to E:\RLModelTraining\Doom_PPO\plot_batch_56_timestep_batch_56.png
Model saved at timestep 896 to E:\RLModelTraining\Doom_PPO\model_batch_56_timestep_batch_56.pt


KeyboardInterrupt: 

In [None]:
if __name__ == "__main__":
    env = VizDoomGym(render=True)
    in_channels = 1  # Assuming grayscale input
    n_actions = env.action_space.n
    gae_lambda = .9
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ActorCriticNetwork(in_channels, n_actions, gae_lambda)
    num_episodes = 10000
    training_data = train(env, model, num_episodes, device)

In [None]:
training_data