## Initialize VizDoom

In [None]:
# Required packages (uncomment the lines below to install them)
#!pip install vizdoom
#!pip install opencv-python
#!pip install pandas
#!pip install torch


In [None]:
# import VizDoom for game env
from vizdoom import *
# Import random for action sampling
import random
# Import time for sleeping
import time
# import numpy for identity matrix
import numpy as np

from matplotlib import pyplot as plt

## Make it a Gym Env

In [None]:
# Import environment base class from Gymnasium (the maintained successor to OpenAI Gym)
from gymnasium import Env
# Import gym spaces
from gymnasium.spaces import Discrete, Box
# Import Opencv for greyscaling observations
import cv2

# Name of the VizDoom scenario.
# NOTE(review): not referenced by the code visible in this notebook --
# VizDoomGym.__init__ hard-codes the defend_the_center config path.
LEVEL = 'defend_the_center'
# NOTE(review): presumably a difficulty/skill setting; it is empty and not
# referenced by the code visible here -- TODO confirm whether it is still needed.
DOOM_SKILL = ''

In [None]:
# Create VizDoom OpenAI Gym Environment
class VizDoomGym(Env):
    """Gym-style wrapper around a VizDoom ``DoomGame`` for the defend_the_center scenario.

    Observations are grayscale frames resized to 100x160 with a single
    channel (``uint8``); the action space has three discrete actions, each
    mapped to a one-hot button vector.
    """

    def __init__(self, render=False):
        """
        Create and initialise the underlying Doom game.

        render: any truthy value shows the game window; falsy keeps it hidden.
        """
        # Inherit from Env
        super().__init__()

        # Set up game
        self.game = DoomGame()
        self.game.load_config('VizDoom/scenarios/defend_the_center.cfg')

        # Whether we want to render the game
        self.game.set_window_visible(bool(render))

        # Start the game
        self.game.init()

        # Create action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)

    def step(self, action):
        """
        Apply one action and advance the game.

        action: integer index into the 3x3 identity matrix of button presses.

        Returns the classic 4-tuple ``(state, reward, done, info)``. The arity
        is kept at four (rather than Gymnasium's five) for compatibility with
        existing callers that unpack four values.
        """
        # One-hot encode the chosen action; the action is repeated for 4 tics.
        actions = np.identity(3, dtype=np.uint8)
        reward = self.game.make_action(actions[action], 4)

        # Query the game state once; it is None when the episode has finished.
        game_state = self.game.get_state()
        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)
            # First configured game variable (treated as ammo here; which
            # variable this is depends on the scenario .cfg).
            info = game_state.game_variables[0]
        else:
            # No frame available: return an all-zero observation with the
            # same shape AND dtype as the declared observation space.
            state = np.zeros(self.observation_space.shape,
                             dtype=self.observation_space.dtype)
            info = 0

        info = {"info": info}
        done = self.game.is_episode_finished()

        return state, reward, done, info

    def render(self):
        """
        Rendering is handled by the VizDoom window itself, so this is a no-op.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Start a new episode and return ``(first_observation, info_dict)``.

        seed: optional seed forwarded to the game before the new episode.
        options: accepted for Gymnasium API compatibility; currently unused.
        """
        if seed is not None:
            self.game.set_seed(seed)

        self.game.new_episode()
        state = self.game.get_state().screen_buffer

        return self.grayscale(state), {}

    def grayscale(self, observation):
        """
        Convert a game frame to grayscale and shrink it.

        observation: game frame with the channel axis first; it is moved to
        the last axis before colour conversion.

        Returns an array of shape (100, 160, 1).
        """
        # Channels-first -> channels-last, then collapse colour to gray.
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)

        # Reduce image pixel size for faster training (cv2 takes (width, height)).
        resize = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100, 160, 1))
        return state

    def close(self):
        """
        Shut down the underlying Doom game.
        """
        self.game.close()


## Custom PPO model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

from torch.distributions import Categorical
from torch.optim import Adam

import matplotlib.pyplot as plt
from IPython.display import clear_output
import os

In [None]:
class FeedForwardNN(nn.Module):
    """Three-layer fully connected network (ReLU activations, linear output).

    The same module is used for both the actor (``out_dim`` = number of
    actions, output = logits) and the critic (``out_dim`` = 1, output = value).

    in_dim: either an int (flat feature count) or a shape tuple, in which
        case the input size is the product of its entries.
    out_dim: size of the output layer.
    """

    def __init__(self, in_dim, out_dim):
        super(FeedForwardNN, self).__init__()

        if isinstance(in_dim, int):
            total_input_size = in_dim
        else:
            # Total number of elements in the observation shape.
            total_input_size = int(np.prod(in_dim))

        # Remembered so forward() can recognise already-flat inputs instead
        # of relying on a hard-coded feature count.
        self.in_features = total_input_size

        self.layer1 = nn.Linear(total_input_size, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)

    def forward(self, obs):
        """Run a forward pass.

        obs: numpy array or tensor. Accepted layouts are a single flat
            observation ``(features,)``, a flat batch ``(N, features)``, a
            single unflattened observation (any shape with ``features``
            elements) or a batch of unflattened observations.

        Returns a tensor of shape ``(out_dim,)`` for a single observation or
        ``(N, out_dim)`` for a batch.
        """
        # Ensure the observation is a float tensor (Linear layers need floats).
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)
        elif not obs.is_floating_point():
            obs = obs.float()

        # Flatten only when the trailing dimension is not already the flat
        # feature size expected by the first layer.
        if obs.dim() > 1 and obs.size(-1) != self.in_features:
            if obs.numel() == self.in_features:
                # A single unflattened observation.
                obs = obs.reshape(-1)
            else:
                # A batch of unflattened observations.
                obs = obs.reshape(-1, self.in_features)

        # Pass observation through the neural network
        activation1 = F.relu(self.layer1(obs))
        activation2 = F.relu(self.layer2(activation1))
        output = self.layer3(activation2)
        return output
    
    


# Default PPO hyperparameters; PPO._init_hyperparameters copies them onto the agent.
# Rollouts keep starting new episodes until at least this many timesteps are collected.
TIMESTEPS_PER_BATCH = 100
# Upper bound on the number of steps taken in a single episode during a rollout.
MAX_TIMESTEPS_PER_EPISODE = 1600
# Discount factor applied when computing rewards-to-go.
GAMMA = 0.95
# Number of actor/critic gradient updates performed on each collected batch.
N_UPDATES_PER_ITERATION = 5
# Learning rate passed to the Adam optimisers of both the actor and the critic.
LEARNING_RATE = 0.005

from torch.optim import Adam
#class
class PPO:
    """Minimal PPO-Clip agent for environments with a discrete action space.

    The actor outputs one logit per action and actions are sampled from the
    resulting categorical distribution; the critic outputs a scalar value
    estimate. Input/output sizes are read from the environment passed to the
    constructor, so they are stored as instance attributes.
    """

    def __init__(self, env):
        """Build actor/critic networks and their optimisers for ``env``.

        env: environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and ``step(action)``.
        """
        # init hyperparameters
        self._init_hyperparameters()

        # Extract environment information
        self.env = env
        obs_shape = env.observation_space.shape
        # Flat observation spaces give an int; image-like ones keep the shape tuple.
        self.obs_dim = obs_shape[0] if len(obs_shape) == 1 else obs_shape
        self.act_dim = env.action_space.n

        # Running count of episodes started across all rollouts.
        self.episode_number = 0

        # ALG STEP 1
        # Initialize actor and critic networks
        self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
        self.critic = FeedForwardNN(self.obs_dim, 1)

        # Adam adapts the step size per parameter; one optimiser per network
        # so the actor and critic are updated independently.
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    def learn(self, total_timesteps):
        """Train until at least ``total_timesteps`` environment steps are collected.

        Each iteration collects a batch with the current policy, estimates
        advantages as rewards-to-go minus the critic's value, then performs
        ``n_updates_per_iteration`` clipped-surrogate updates on that batch.
        See https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#advantage-functions
        """
        mse_loss = nn.MSELoss()
        t_so_far = 0  # Timesteps simulated so far
        while t_so_far < total_timesteps:              # ALG STEP 2
            # ALG STEP 3
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()
            # Calculate how many timesteps we collected this batch
            t_so_far += sum(batch_lens)

            # Calculate V_{phi, k}; only its value is needed here, so no graph is built.
            with torch.no_grad():
                V, _ = self.evaluate(batch_obs, batch_acts)

            # ALG STEP 5
            # Calculate advantage
            A_k = batch_rtgs - V

            # Normalize advantages; 1e-10 avoids dividing by zero. A single
            # sample has no standard deviation (it would yield NaN), so it is
            # left unnormalised.
            if A_k.numel() > 1:
                A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

            for _ in range(self.n_updates_per_iteration):
                # Calculate V_phi and pi_theta(a_t | s_t) with the current parameters
                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                # Probability ratio between the current and the data-collecting policy
                ratios = torch.exp(curr_log_probs - batch_log_probs)

                # Calculate surrogate losses
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                # Actor loss: negative of the pessimistic (minimum) surrogate
                actor_loss = (-torch.min(surr1, surr2)).mean()
                # Critic loss: MSE between predicted values and rewards-to-go
                critic_loss = mse_loss(V, batch_rtgs)

                # Gradient step for the critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Gradient step for the actor network
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

    def rollout(self):
        """Collect one batch of experience by running the current policy.

        Episodes are run until at least ``timesteps_per_batch`` steps have
        been taken (each episode capped at ``max_timesteps_per_episode``).

        Returns ``(batch_obs, batch_acts, batch_log_probs, batch_rtgs,
        batch_lens)``: the first four are float tensors with one entry per
        timestep (observations flattened per step); ``batch_lens`` is a list
        of episode lengths.
        """
        # Batch data
        batch_obs = []             # flattened observation per timestep
        batch_acts = []            # action index per timestep
        batch_log_probs = []       # log prob of each taken action
        batch_rews = []            # per-episode lists of rewards
        batch_lens = []            # episodic lengths in batch

        # Log probabilities (rather than raw probabilities) are stored because
        # the PPO ratio is computed as exp(new_log_prob - old_log_prob):
        # https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient

        # Number of timesteps run so far this batch
        t = 0
        while t < self.timesteps_per_batch:
            # Rewards this episode
            ep_rews = []
            obs, _ = self.env.reset()
            done = False
            self.episode_number += 1
            for _ in range(self.max_timesteps_per_episode):
                # Increment timesteps ran this batch so far
                t += 1

                # Collect observation (flattened so every row has the same layout)
                batch_obs.append(np.asarray(obs).reshape(-1))

                action, log_prob = self.get_action(obs)

                # Accept both the classic 4-tuple and Gymnasium's 5-tuple.
                step_result = self.env.step(action)
                if len(step_result) == 5:
                    obs, rew, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    obs, rew, done, _ = step_result

                # Collect reward, action, and log prob
                ep_rews.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                if done:
                    break

            # Collect episodic length and rewards
            print(f"Episode {self.episode_number}: Total Reward = {sum(ep_rews)}, Length = {len(ep_rews)}")

            batch_lens.append(len(ep_rews))
            batch_rews.append(ep_rews)

        # Convert to tensors. Going through one contiguous ndarray avoids the
        # very slow element-wise conversion of a list of arrays.
        batch_obs_tensor = torch.tensor(np.array(batch_obs), dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)

        # ALG STEP #4
        batch_rtgs = self.compute_rtgs(batch_rews)

        # Return the batch data
        return batch_obs_tensor, batch_acts, batch_log_probs, batch_rtgs, batch_lens

    def get_action(self, obs):
        """Sample an action for a single observation from the current policy.

        The actor's output is treated as logits of a categorical distribution
        over the discrete actions; sampling from it provides the exploration
        needed during training.

        Returns ``(action, log_prob)`` where ``action`` is a Python int and
        ``log_prob`` is a detached 0-d tensor.
        """
        # No gradients are needed while collecting data; the log prob is
        # recomputed with gradients in evaluate() during the update.
        with torch.no_grad():
            logits = self.actor(obs)

            # Building the distribution from logits is numerically more
            # stable than applying softmax first.
            dist = Categorical(logits=logits)
            action = dist.sample()

            # Log probability of the sampled action
            log_prob = dist.log_prob(action)

        return action.item(), log_prob.detach()

    def compute_rtgs(self, batch_rews):
        """Compute discounted rewards-to-go for every timestep in the batch.

        batch_rews: list of episodes, each a list of per-step rewards.

        Returns a 1-D float tensor with one entry per timestep, in the same
        order as the timesteps were collected.
        """
        batch_rtgs = []
        for ep_rews in batch_rews:
            ep_rtgs = []
            discounted_reward = 0  # The discounted reward so far
            # Walk the episode backwards so each step can reuse the return
            # of the step that follows it.
            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                ep_rtgs.append(discounted_reward)
            # Restore chronological order for this episode.
            batch_rtgs.extend(reversed(ep_rtgs))
        # Convert the rewards-to-go into a tensor
        return torch.tensor(batch_rtgs, dtype=torch.float)

    # default values for hyperparameters. can change in config
    def _init_hyperparameters(self):
        """Copy the module-level default hyperparameters onto the instance."""
        self.timesteps_per_batch = TIMESTEPS_PER_BATCH               # timesteps per batch
        self.max_timesteps_per_episode = MAX_TIMESTEPS_PER_EPISODE   # timesteps per episode
        self.gamma = GAMMA                                           # discount factor
        self.n_updates_per_iteration = N_UPDATES_PER_ITERATION       # updates per batch
        self.clip = 0.2                                              # PPO clip range
        self.lr = LEARNING_RATE                                      # Adam learning rate

    def evaluate(self, batch_obs, batch_acts):
        """Evaluate the critic and the current policy on a batch.

        batch_obs: tensor of flattened observations, one row per timestep.
        batch_acts: tensor of the action indices taken at those timesteps.

        Returns ``(V, log_probs)``: the critic's value per observation and
        the log probability of each taken action under the current actor.
        """
        # Drop only the trailing size-1 value dimension so a batch containing
        # a single timestep keeps shape (1,) instead of collapsing to 0-d.
        V = self.critic(batch_obs).squeeze(-1)

        # Log probabilities of batch actions using most recent actor network.
        logits = self.actor(batch_obs)
        dist = Categorical(logits=logits)
        log_probs = dist.log_prob(batch_acts)

        # Return predicted values V and log probs
        return V, log_probs

        

## Training loop

In [None]:
# Build the environment with the game window visible so training can be watched.
env = VizDoomGym(render=True)
# Create the PPO agent; network sizes are derived from env's observation/action spaces.
model = PPO(env)
# Train until at least 10000 environment timesteps have been collected.
# NOTE(review): env is left open after training -- call env.close() once it is
# no longer needed to shut the game down.
model.learn(10000)