## Initialize VizDoom

In [7]:
#necessary
#!pip install vizdoom
#!pip install opencv-python
#!pip install pandas
#!pip install torch


In [8]:
# import VizDoom for game env
from vizdoom import *
# Import random for action sampling
import random
# Import time for sleeping
import time
# import numpy for identity matrix
import numpy as np

from matplotlib import pyplot as plt

## Make it a Gym Env

In [9]:
# Import environment base class from OpenAI Gym
from gymnasium import Env
# Import gym spaces
from gymnasium.spaces import Discrete, Box
# Import Opencv for greyscaling observations
import cv2

LEVEL = 'defend_the_center'
DOOM_SKILL = ''

In [15]:
# Create VizDoom OpenAI Gym Environment
class VizDoomGym(Env): 
    def __init__(self, render=False):
        """
        Function called when we start the env.
        """

        # Inherit from Env
        super().__init__()
        
        # Set up game
        self.game = DoomGame()
        self.game.load_config('VizDoom/scenarios/defend_the_center.cfg')
        

        # Whether we want to render the game 
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        # Start the game
        self.game.init()
        
        # Create action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)

    
    def step(self, action):
        """
        How we take a step in the environment.
        """

        # Specify action and take step
        actions = np.identity(3, dtype=np.uint8)
        reward = self.game.make_action(actions[action], 4) # get action using index -> left, right, shoot
        
        # Get all the other stuff we need to return 
        if self.game.get_state():  # if nothing is
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)  # Apply Grayscale
            ammo = self.game.get_state().game_variables[0] 
            info = ammo
        # If we dont have anything turned from game.get_state
        else:
            # Return a numpy zero array
            state = np.zeros(self.observation_space.shape)
            # Return info (game variables) as zero
            info = 0

        info = {"info":info}
        done = self.game.is_episode_finished()
        truncated = False  # Assuming it's not truncated, modify if applicable
        
        return state, reward, done, truncated, info

    
    def render(self):
        """
        Define how to render the game environment.
        """
        pass

    
    def reset(self, seed=None):
        """
        Function for defining what happens when we start a new game.
        """
        if seed is not None:
            self.game.set_seed(seed)
            
        self.game.new_episode()
        state = self.game.get_state().screen_buffer  # Apply Grayscale

        return self.grayscale(state), {}

    
    def grayscale(self, observation):
        """
        Function to grayscale the game frame and resize it.
        observation: gameframe
        """
        # Change colour channels 
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)

        # Reduce image pixel size for faster training
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize,(100, 160,1))
        return state

    def close(self):
        """
        Call to close down the game.
        """
        self.game.close()


## Custom PPO model

In [11]:
# IMPORTS
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

from torch.distributions import Categorical
from torch.optim import Adam

import matplotlib.pyplot as plt
from IPython.display import clear_output
import os

In [12]:
# Convolutional output size calculator
def conv2d_size_out(size, kernel_size = 3, stride = 2, padding = 0):
    return (size + 2 * padding - (kernel_size - 1) - 1) // stride  + 1


class ActorCriticNetwork(nn.Module):
    def __init__(self, in_channels, n_output):
        super(ActorCriticNetwork, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Temporarily assume some output size after convolution
        # This should ideally be calculated based on input size
        self.feature_count = 64 * conv2d_size_out(conv2d_size_out(conv2d_size_out(100, 8, 4), 4, 2), 3, 1) * \
                        conv2d_size_out(conv2d_size_out(conv2d_size_out(160, 8, 4), 4, 2), 3, 1)

        self.fc = nn.Linear(self.feature_count, 512)
        self.actor = nn.Linear(512, n_output)
        self.critic = nn.Linear(512, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.reshape(-1, self.feature_count)  # Use reshape instead of view
        x = F.relu(self.fc(x))
        action_probs = F.softmax(self.actor(x), dim=1)
        value = self.critic(x)
        return action_probs, value

In [13]:
## PPO 
    
class PPO:
    def __init__(self, env, in_channels, n_actions, device):
        self.env = env
        self.device = device
        self._init_hyperparameters()
        self.actor = ActorCriticNetwork(in_channels, n_actions)
        self.critic = ActorCriticNetwork(in_channels, 1)

        #Initialise optimizer
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    def _init_hyperparameters(self):
        self.timesteps_per_batch = 1
        self.max_timesteps_per_episode = 1600
        self.gamma = 0.95
        self.n_updates_per_iteration = 5
        self.lr = 0.005
        self.clip = 0.2  # Clipping parameter for PPO

    def train(self, num_episodes):
        for episode in range(num_episodes):
            batch_data = self.collect_data()
            self.update_policy(batch_data)
            print(f"Episode {episode}: Data = {batch_data}")

    def collect_data(self):
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []
        batch_values = []
        t = 0

        while t < self.timesteps_per_batch:
            state, _ = self.env.reset()
            state = torch.from_numpy(state).float().unsqueeze(0).permute(0, 3, 1, 2).to(self.device)
            done = False
            while not done and t < self.timesteps_per_batch:
                policy_dist, value = self.actor(state)  # Changed here
                action = policy_dist.multinomial(num_samples=1).detach()
                log_prob = torch.log(policy_dist.squeeze(0)[action])
                next_state, reward, done, _, _ = self.env.step(action.item())

                batch_obs.append(state)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)
                batch_rews.append(reward)
                batch_values.append(value)

                state = torch.from_numpy(next_state).float().unsqueeze(0).permute(0, 3, 1, 2).to(self.device)
                t += 1

        rewards_to_go = PPO.calculate_rewards_to_go(torch.tensor(batch_rews, dtype=torch.float), self.gamma)
        return {
            "obs": torch.cat(batch_obs),
            "acts": torch.stack(batch_acts),
            "log_probs": torch.stack(batch_log_probs),
            "rewards": torch.tensor(batch_rews, dtype=torch.float),
            "values": torch.cat(batch_values),
            "rewards_to_go": rewards_to_go  # Add this line
        }

    def update_policy(self, batch_data):
        batch_obs = batch_data['obs']
        batch_acts = batch_data['acts']
        batch_log_probs = batch_data['log_probs']
        batch_rtgs = batch_data['rewards_to_go']  # Assuming rewards to go are calculated and passed

        # Calculate current V and log probs
        current_V, current_log_probs = self.evaluate(batch_obs, batch_acts)

        # Compute advantages and normalize
        A_k = batch_rtgs - current_V.detach()
        A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

        for _ in range(self.n_updates_per_iteration):
            # Recalculate V and log_probs for the current policy
            V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

            # Calculate the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(curr_log_probs - batch_log_probs)

            # Actor Loss: PPO's clipped objective
            surr1 = ratios * A_k
            surr2 = torch.clamp(ratios, 1.0 - self.clip, 1.0 + self.clip) * A_k
            actor_loss = -torch.min(surr1, surr2).mean()

            # Critic Loss
            critic_loss = F.mse_loss(V, batch_rtgs)

            # Perform backward propagation for critic
            self.critic_optim.zero_grad()
            critic_loss.backward(retain_graph=True)
            self.critic_optim.step()

            # Perform backward propagation for actor
            self.actor_optim.zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor_optim.step()

    def calculate_rewards_to_go(rewards, gamma):
        n = len(rewards)
        rewards_to_go = torch.zeros_like(rewards)
        for i in reversed(range(n)):
            rewards_to_go[i] = rewards[i] + (gamma * rewards_to_go[i + 1] if i + 1 < n else 0)
        return rewards_to_go

    def evaluate(self, batch_obs, batch_acts):
        # Get the policy distribution and value estimate for the given observations
        policy_dist, value = self.actor(batch_obs)  # Changed here
        dist = Categorical(policy_dist)
        log_probs = dist.log_prob(batch_acts)

        return value, log_probs

if __name__ == "__main__":
    env = VizDoomGym(render=True)
    in_channels = 1  # Assuming grayscale input
    n_actions = env.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ppo_agent = PPO(env, in_channels, n_actions, device)
    ppo_agent.train(500)  # Train for 500 episodes


ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

## Test

In [19]:
def load_model(model_path):
    model = ActorCriticNetwork(in_channels=1, n_output=3)  # Assuming grayscale images and 3 action outputs
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Ensure to load on CPU if GPU is not available
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Load the PyTorch model
model = load_model('E:\RLModelTraining\Doom_PPO\model_batch_25713_timestep_batch_25713.pt')  # Replace with your actual model path

# Helper function to preprocess observations if needed
def preprocess(observation):
    # Extract the array part of the observation if it's in a tuple
    if isinstance(observation, tuple):
        observation = observation[0]

    # Check if the observation is a numpy array and has the expected number of dimensions
    if not isinstance(observation, np.ndarray) or observation.ndim != 3:
        raise ValueError("Observation is not in the expected format or shape")

    # Convert to grayscale (assuming the model expects single-channel inputs)
    if observation.shape[2] == 3:
        observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)

    # Resize the image to 96x96, which should match the model's input layer requirement
    observation = cv2.resize(observation, (96, 96))

    # Normalize and add necessary dimensions for PyTorch
    observation = np.expand_dims(observation, axis=0)  # Add batch dimension
    observation = np.expand_dims(observation, axis=0)  # Add channel dimension
    observation = torch.tensor(observation, dtype=torch.float32) / 255.0  # Normalize

    return observation



env = VizDoomGym(render=True)
obs = env.reset()
done = False
while not done:
    obs = preprocess(obs)
    print("Processed observation shape (should be [1, 1, 96, 96]):", obs.shape)
    with torch.no_grad():
        action = model(obs).max(1)[1].item()
    obs, reward, done, _, info = env.step(action)
    if done:
        obs = env.reset()
env.close()



Processed observation shape (should be [1, 1, 96, 96]): torch.Size([1, 1, 96, 96])


RuntimeError: shape '[-1, 9216]' is invalid for input of size 4096

## Training loop