## Initialize VizDoom

In [309]:
# Required packages -- uncomment the lines below to install them
#!pip install vizdoom
#!pip install opencv-python
#!pip install pandas
#!pip install torch
#!pip install gym
#!pip install pyglet==1.5.11

In [310]:
# import VizDoom for game env
from vizdoom import *
# Import random for action sampling
import random
# Import time for sleeping
import time
# import numpy for identity matrix
import numpy as np

from matplotlib import pyplot as plt

## Make it a Gym Env

In [311]:
# Import environment base class from OpenAI Gym
from gymnasium import Env
# Import gym spaces
from gymnasium.spaces import Discrete, Box
# Import Opencv for greyscaling observations
import cv2

# Scenario name. Not read by any code in the visible cells: VizDoomGym loads
# 'VizDoom/scenarios/defend_the_center.cfg' without consulting this constant.
LEVEL = 'defend_the_center'
# Empty placeholder; not read anywhere in the visible cells -- TODO confirm intended use.
DOOM_SKILL = ''

In [312]:
# Create VizDoom OpenAI Gym Environment
class VizDoomGym(Env):
    """
    Gymnasium-style wrapper around a ViZDoom scenario.

    Observations are single-channel uint8 frames of shape (100, 160, 1) and the
    action space is Discrete(3); each action index is translated into a one-hot
    button vector before being handed to DoomGame.make_action.
    """

    def __init__(self, render=False,
                 config_path='VizDoom/scenarios/defend_the_center.cfg'):
        """
        Function called when we start the env.
        render: show the game window when truthy, hide it otherwise
        config_path: scenario .cfg file handed to DoomGame.load_config
        """
        # Inherit from Env
        super().__init__()

        # Set up game
        self.game = DoomGame()
        self.game.load_config(config_path)

        # Whether we want to render the game
        self.game.set_window_visible(bool(render))

        # Start the game
        self.game.init()

        # Create action space and observation space
        n_actions = 3
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(n_actions)

        # Row i is the one-hot button vector for discrete action i; built once
        # instead of on every step.
        self._actions = np.identity(n_actions, dtype=np.uint8)

    def step(self, action):
        """
        How we take a step in the environment.
        action: index into the one-hot action table
        Returns (state, reward, done, truncated, info).
        """
        # Second argument of make_action is the number of tics the action is
        # held for (see the ViZDoom DoomGame API).
        reward = self.game.make_action(self._actions[action], 4)

        # get_state() yields nothing once the episode is over, so query it once
        # and branch on the result.
        game_state = self.game.get_state()
        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)
            # First game variable; the original code treats it as ammo, which
            # depends on the scenario config -- TODO confirm.
            ammo = game_state.game_variables[0]
        else:
            # Blank frame with the dtype declared by observation_space
            # (np.zeros alone would give float64, not uint8).
            state = np.zeros(self.observation_space.shape,
                             dtype=self.observation_space.dtype)
            ammo = 0

        info = {"info": ammo}
        done = self.game.is_episode_finished()
        truncated = False  # Assuming it's not truncated, modify if applicable

        return state, reward, done, truncated, info

    def render(self):
        """
        Define how to render the game environment.
        Intentionally a no-op; window visibility is chosen in __init__.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Function for defining what happens when we start a new game.
        seed: optional seed forwarded to the game and to the base Env
        options: accepted for Gymnasium API compatibility; unused
        Returns (state, info) where info is an empty dict.
        """
        # Let the base class seed its own RNG, as the Gymnasium API recommends.
        super().reset(seed=seed)
        if seed is not None:
            self.game.set_seed(seed)

        self.game.new_episode()
        frame = self.game.get_state().screen_buffer

        return self.grayscale(frame), {}

    def grayscale(self, observation):
        """
        Function to grayscale the game frame and resize it.
        observation: gameframe; the moveaxis below assumes a channels-first
                     buffer -- TODO confirm against the scenario's screen format
        Returns an array of shape (100, 160, 1).
        """
        # Move channels last for OpenCV, then collapse to a single channel
        channels_last = np.moveaxis(observation, 0, -1)
        gray = cv2.cvtColor(channels_last, cv2.COLOR_BGR2GRAY)

        # Reduce image pixel size for faster training (cv2 takes (width, height))
        resized = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_CUBIC)
        return np.reshape(resized, (100, 160, 1))

    def close(self):
        """
        Call to close down the game.
        """
        self.game.close()


## Custom PPO model

In [313]:
# Imports

import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [314]:
# PPO Algorithm

"""
https://www.youtube.com/watch?v=hlv79rcHws0
"""

class PPOMemory:
    """
    Rollout buffer: six parallel lists (states, actions, log-probs, values,
    rewards, dones) plus the mini-batch size used when sampling.
    """

    def __init__(self, batch_size):
        self.batch_size = batch_size
        # Start from the same empty lists that clear_memory() produces.
        self.clear_memory()

    def generate_batches(self):
        """
        Return the stored columns as numpy arrays followed by a list of
        shuffled index arrays, each holding at most batch_size indices.
        """
        sample_count = len(self.states)
        starts = np.arange(0, sample_count, self.batch_size)
        order = np.arange(sample_count, dtype=np.int64)
        np.random.shuffle(order)
        batches = [order[lo:lo + self.batch_size] for lo in starts]

        columns = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        return (*(np.array(column) for column in columns), batches)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the buffer."""
        targets = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        items = (state, action, probs, vals, reward, done)
        for column, item in zip(targets, items):
            column.append(item)

    def clear_memory(self):
        """Drop every stored transition."""
        self.states, self.probs, self.actions = [], [], []
        self.rewards, self.dones, self.vals = [], [], []

class ActorNetwork(nn.Module):
    """
    Policy network: a three-layer MLP whose softmax output is wrapped in a
    Categorical distribution over n_actions.
    """

    def __init__(self, n_actions, input_dims, alpha,
                 fc1_dims=256, fc2_dims=256, checkpoint_dir='tmp/ppo'):
        """
        n_actions: number of discrete actions (size of the output layer)
        input_dims: tuple unpacked into the first nn.Linear, e.g. (4,)
        alpha: learning rate of the Adam optimizer
        fc1_dims, fc2_dims: widths of the two hidden layers
        checkpoint_dir: directory that receives the 'actor_torch_ppo' file
        """
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(checkpoint_dir, 'actor_torch_ppo')
        # An empty checkpoint_dir means "current directory"; os.makedirs('')
        # would raise, so only create a directory when there is one to create.
        parent_dir = os.path.dirname(self.checkpoint_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.actor = nn.Sequential(
            nn.Linear(*input_dims, fc1_dims),
            nn.LeakyReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.LeakyReLU(),
            nn.Linear(fc2_dims, n_actions),
            nn.Softmax(dim=-1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the Categorical action distribution for a batch of states."""
        action_probs = self.actor(state)
        return Categorical(action_probs)

    def save_checkpoint(self):
        """Write the network weights to self.checkpoint_file."""
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from self.checkpoint_file onto this network's device."""
        # map_location lets a checkpoint written on a GPU machine be restored
        # on a CPU-only one (and vice versa).
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class CriticNetwork(nn.Module):
    """
    Value network: a three-layer MLP that maps a state to one scalar estimate.
    """

    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256,
                 checkpoint_dir='tmp/ppo'):
        """
        input_dims: tuple unpacked into the first nn.Linear, e.g. (4,)
        alpha: learning rate of the Adam optimizer
        fc1_dims, fc2_dims: widths of the two hidden layers
        checkpoint_dir: directory that receives the 'critic_torch_ppo' file
        """
        super(CriticNetwork, self).__init__()

        self.checkpoint_file = os.path.join(checkpoint_dir, 'critic_torch_ppo')
        # An empty checkpoint_dir means "current directory"; os.makedirs('')
        # would raise, so only create a directory when there is one to create.
        parent_dir = os.path.dirname(self.checkpoint_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.critic = nn.Sequential(
            nn.Linear(*input_dims, fc1_dims),
            nn.LeakyReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.LeakyReLU(),
            nn.Linear(fc2_dims, 1),
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the value estimate(s) for a batch of states, shape (..., 1)."""
        return self.critic(state)

    def save_checkpoint(self):
        """Write the network weights to self.checkpoint_file."""
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from self.checkpoint_file onto this network's device."""
        # map_location lets a checkpoint written on a GPU machine be restored
        # on a CPU-only one (and vice versa).
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class PPOAgent:
    """
    PPO agent with separate actor and critic networks and a rollout memory.

    Typical use: choose_action() -> env.step() -> remember(), with learn()
    called every few steps to run n_epochs of clipped-surrogate updates over
    the stored transitions and then empty the memory.
    """

    def __init__(self, n_actions, input_dims, gamma, alpha, gae_lambda,
                 policy_clip, batch_size, N, n_epochs):
        """
        n_actions: number of discrete actions
        input_dims: observation shape tuple forwarded to both networks
        gamma: discount factor
        alpha: learning rate shared by actor and critic
        gae_lambda: lambda of Generalized Advantage Estimation
        policy_clip: PPO clipping range epsilon
        batch_size: mini-batch size used by the memory
        N: number of environment steps between learn() calls; kept on the
           instance as self.N for the training loop, not read by the agent
        n_epochs: passes over the stored rollout per learn() call
        """
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.N = N

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)

        # One entry per learn() call that processed at least one mini-batch.
        self.actor_losses = []
        self.critic_losses = []
        self.values = []

    def remember(self, state, action, probs, vals, reward, done):
        """Store one transition in the rollout memory."""
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        """Checkpoint both networks."""
        print('...saving models...')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Restore both networks from their checkpoints."""
        print('...loading models...')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()

    def choose_action(self, observation):
        """
        Sample an action for a single observation.
        Returns (action, log-probability of that action, critic value), all as
        plain Python numbers.
        """
        # Convert via numpy first: building a tensor from a list of ndarrays
        # is slow and triggers a PyTorch warning.
        state = T.as_tensor(np.asarray(observation), dtype=T.float)
        state = state.unsqueeze(0).to(self.actor.device)

        # Only numbers leave this method, so no autograd graph is needed.
        with T.no_grad():
            dist = self.actor(state)
            value = self.critic(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        return (T.squeeze(action).item(),
                T.squeeze(log_prob).item(),
                T.squeeze(value).item())

    def _compute_advantages(self, rewards, values, dones):
        """
        Generalized Advantage Estimation over one stored rollout.

        Uses the backward recursion
            delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
            A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        so the estimate is discounted by gamma * lambda and is cut at episode
        boundaries. For the final stored step no V(s_{t+1}) is available: if
        that step ended its episode the advantage is r_t - V(s_t), otherwise
        it is left at 0.

        Returns a float32 numpy array with one entry per transition.
        """
        n_steps = len(rewards)
        advantages = np.zeros(n_steps, dtype=np.float32)
        gae = 0.0
        for t in reversed(range(n_steps)):
            not_done = 1.0 - float(dones[t])
            if t == n_steps - 1:
                if not_done:
                    # Rollout stopped mid-episode: nothing to bootstrap from.
                    continue
                next_value = 0.0
            else:
                next_value = values[t + 1]
            delta = rewards[t] + self.gamma * next_value * not_done - values[t]
            gae = delta + self.gamma * self.gae_lambda * not_done * gae
            advantages[t] = gae
        return advantages

    def learn(self):
        """
        Run n_epochs of PPO updates on the stored rollout, record the mean
        actor loss / critic loss / value estimate over all mini-batches, and
        clear the memory. Does nothing when the memory is empty.
        """
        entropy_coefficient = 0.01  # weight of the entropy bonus
        value_coefficient = 0.5     # weight of the critic loss in the total loss
        device = self.actor.device

        batch_actor_losses = []
        batch_critic_losses = []
        batch_values = []

        for _ in range(self.n_epochs):
            state_arr, action_arr, old_probs_arr, vals_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()

            if len(batches) == 0:
                # Empty memory: nothing to train on and nothing to log.
                return

            advantage = T.tensor(
                self._compute_advantages(reward_arr, vals_arr, dones_arr),
                dtype=T.float).to(device)
            # float32 throughout, so the losses do not silently mix precisions.
            values = T.tensor(vals_arr, dtype=T.float).to(device)

            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(device)
                old_probs = T.tensor(old_probs_arr[batch], dtype=T.float).to(device)
                actions = T.tensor(action_arr[batch]).to(device)

                dist = self.actor(states)
                # Drop only the trailing unit dimension so that a mini-batch
                # of size one keeps its batch axis.
                critic_value = T.squeeze(self.critic(states), dim=-1)

                new_probs = dist.log_prob(actions)
                # exp(new - old) equals exp(new) / exp(old) but is better
                # behaved for very small probabilities.
                prob_ratio = (new_probs - old_probs).exp()

                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = T.clamp(
                    prob_ratio, 1 - self.policy_clip, 1 + self.policy_clip
                ) * advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                # Entropy bonus rewards exploration
                entropy = dist.entropy().mean()
                actor_loss = actor_loss - entropy_coefficient * entropy

                returns = advantage[batch] + values[batch]
                critic_loss = ((returns - critic_value) ** 2).mean()

                total_loss = actor_loss + value_coefficient * critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

                # Collect diagnostics for every mini-batch
                batch_actor_losses.append(actor_loss.item())
                batch_critic_losses.append(critic_loss.item())
                batch_values.append(critic_value.mean().item())

        # Store the averages for this learn() call
        if batch_actor_losses:
            self.actor_losses.append(np.mean(batch_actor_losses))
            self.critic_losses.append(np.mean(batch_critic_losses))
            self.values.append(np.mean(batch_values))

        self.memory.clear_memory()

    def reset_learning_debug_data(self):
        """Discard the recorded loss / value diagnostics."""
        self.actor_losses = []
        self.critic_losses = []
        self.values = []

## Define Hyperparameters

In [315]:
def evaluate_hyperparameters(env, agent_params, n_games=50):
    """
    Score one hyperparameter configuration.

    A fresh PPOAgent is built from agent_params and trained while it plays
    n_games episodes (learn() runs every agent_params['N'] environment steps);
    the mean episode return over those games is returned. Without the training
    calls the learning rate, gamma and GAE lambda could not influence the
    score, because an agent that never learns just plays its random
    initial policy.

    NOTE: a later cell defines a function with the same name (default
    n_games=100), which replaces this one when the notebook is run top to
    bottom.

    env: environment whose step() returns
         (observation, reward, terminated, truncated, info)
    agent_params: keyword arguments for PPOAgent; must contain 'N'
    n_games: number of episodes to play
    """
    agent = PPOAgent(**agent_params)
    rollout_length = agent_params['N']
    total_rewards = []
    n_steps = 0
    for _ in range(n_games):
        observation, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action, prob, val = agent.choose_action(observation)
            next_observation, reward, terminated, truncated, _ = env.step(action)
            # An episode ends on termination OR on truncation (e.g. a time
            # limit); checking only the first flag can loop past the limit.
            done = terminated or truncated
            n_steps += 1
            total_reward += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % rollout_length == 0:
                agent.learn()
            observation = next_observation
        total_rewards.append(total_reward)
    avg_reward = np.mean(total_rewards)
    return avg_reward

def hyperparameter_tuning():
    """
    Grid-search learning rate, gamma and GAE lambda on CartPole-v1.

    Every combination is scored with evaluate_hyperparameters(); each result is
    printed and the parameter dict with the highest average reward is returned.
    The environment created here is closed before returning, also on error.
    """
    env = gym.make('CartPole-v1')
    learning_rates = [0.001, 0.0005, 0.0001]
    gammas = [0.99, 0.95, 0.90]
    gae_lambdas = [0.95, 0.97, 0.99]

    best_avg_reward = -np.inf
    best_params = {}

    # Flatten the three nested loops into one list of combinations; the
    # iteration order is unchanged (alpha outermost, gae_lambda innermost).
    grid = [(alpha, gamma, gae_lambda)
            for alpha in learning_rates
            for gamma in gammas
            for gae_lambda in gae_lambdas]

    try:
        for alpha, gamma, gae_lambda in grid:
            agent_params = {
                'n_actions': env.action_space.n,
                'input_dims': env.observation_space.shape,
                'alpha': alpha,
                'gamma': gamma,
                'gae_lambda': gae_lambda,
                'policy_clip': 0.5,
                'batch_size': 5,
                'N': 2048,
                'n_epochs': 4
            }
            avg_reward = evaluate_hyperparameters(env, agent_params)
            print(f'Tested {agent_params} -> Avg Reward: {avg_reward}')

            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                best_params = agent_params
    finally:
        # Release the environment's resources even if an evaluation fails.
        env.close()

    print(f"Best Hyperparameters: {best_params} with Average Reward: {best_avg_reward}")
    return best_params

## Main Loop

In [316]:
import gym
import numpy as np

def plot_learning_curves(timesteps, scores, figure_file):
    """
    Plot the raw per-episode scores together with their cumulative average,
    save the figure to figure_file and show it.

    timesteps: x values, one per score
    scores: sequence of episode rewards
    figure_file: output image path; its parent directory is created if needed
    """
    # A bare filename has no directory part, and os.makedirs('') would raise.
    out_dir = os.path.dirname(figure_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()

    scores = np.asarray(scores)
    # Mean of the first k scores, for k = 1..len(scores)
    cumulative_avg = np.cumsum(scores) / np.arange(1, len(scores) + 1)

    ax.plot(timesteps, scores, label='Reward per Episode', alpha=0.3)  # Plot raw scores
    ax.plot(timesteps, cumulative_avg, label='Cumulative Average', color='red')  # Plot cumulative average
    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    ax.set_title('Training Curve')
    ax.legend()
    # Save through the figure object rather than pyplot's "current figure".
    fig.savefig(figure_file)
    plt.show()

def plot_curve_smooth(x, scores, figure_file, window=100):
    """
    Plot the running average of scores and save it to figure_file.

    Point i is the mean of the last `window` scores up to and including
    score i (fewer at the start of the series). The previous slice
    scores[i-100:i+1] averaged 101 values although the title says 100.

    x: x values, one per score
    scores: sequence of episode scores
    figure_file: output image path; its parent directory is created if needed
    window: number of scores in each average (default 100)
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    scores = np.asarray(scores, dtype=float)
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])

    # A bare filename has no directory part, and os.makedirs('') would raise.
    out_dir = os.path.dirname(figure_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Draw on a fresh figure so earlier pyplot state cannot leak into the plot.
    fig, ax = plt.subplots()
    ax.plot(x, running_avg)
    ax.set_title(f'Running average of previous {window} scores')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Score')
    fig.savefig(figure_file)

def plot_additional_metrics(episodes, actor_losses, critic_losses, values, figure_file_prefix):
    """
    Draw actor loss, critic loss and value estimates as three stacked panels,
    save them to '<figure_file_prefix>_additional_metrics.png' and show them.

    episodes: x values; replaced by 0..len(actor_losses)-1 when its length does
              not match actor_losses
    actor_losses, critic_losses, values: series to plot, one per panel
    figure_file_prefix: path prefix of the saved image
    """
    fig, axs = plt.subplots(3, 1, figsize=(5, 10))

    # Fall back to a plain index when the supplied x values do not line up
    if len(episodes) == len(actor_losses):
        episodes = list(episodes)
    else:
        episodes = list(range(len(actor_losses)))

    # (series, legend label, line colour, panel title, y-axis label)
    panels = (
        (actor_losses, 'Actor Loss', 'blue', 'Actor Loss Over Time', 'Loss'),
        (critic_losses, 'Critic Loss', 'green', 'Critic Loss Over Time', 'Loss'),
        (values, 'Value Estimates', 'red', 'Value Estimates Over Time', 'Value'),
    )
    for panel_ax, (series, label, colour, title, y_label) in zip(axs, panels):
        panel_ax.plot(episodes, series, label=label, color=colour)
        panel_ax.set_title(title)
        panel_ax.set_xlabel('Episode')
        panel_ax.set_ylabel(y_label)
        panel_ax.legend()

    plt.tight_layout()
    plt.savefig(f"{figure_file_prefix}_additional_metrics.png")
    plt.show()

def evaluate_hyperparameters(env, agent_params, n_games=100):
    """
    Score one hyperparameter configuration.

    A fresh PPOAgent is built from agent_params and trained while it plays
    n_games episodes (learn() runs every agent_params['N'] environment steps);
    the mean episode return over those games is returned. Without the training
    calls the learning rate, gamma and GAE lambda could not influence the
    score, because an agent that never learns just plays its random
    initial policy.

    env: environment whose step() returns
         (observation, reward, terminated, truncated, info)
    agent_params: keyword arguments for PPOAgent; must contain 'N'
    n_games: number of episodes to play
    """
    agent = PPOAgent(**agent_params)
    rollout_length = agent_params['N']
    total_rewards = []
    n_steps = 0
    for _ in range(n_games):
        observation, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action, prob, val = agent.choose_action(observation)
            next_observation, reward, terminated, truncated, _ = env.step(action)
            # An episode ends on termination OR on truncation (e.g. a time
            # limit); checking only the first flag can loop past the limit.
            done = terminated or truncated
            n_steps += 1
            total_reward += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % rollout_length == 0:
                agent.learn()
            observation = next_observation
        total_rewards.append(total_reward)
    avg_reward = np.mean(total_rewards)
    return avg_reward

def hyperparameter_tuning():
    """
    Grid-search learning rate, gamma and GAE lambda on CartPole-v1.

    Every combination is scored with evaluate_hyperparameters(); each result is
    printed and the parameter dict with the highest average reward is returned.
    A default configuration is returned if no combination beats -inf (for
    example when every score is NaN). The environment created here is closed
    before returning, also on error.
    """
    env = gym.make('CartPole-v1')
    learning_rates = [0.001, 0.0005, 0.0001, 0.00001, 0.000001]
    gammas = [0.99, 0.95, 0.90]
    gae_lambdas = [0.95, 0.97, 0.99]

    best_avg_reward = -np.inf
    # Initialize best_params with a default configuration
    best_params = {
        'n_actions': env.action_space.n,
        'input_dims': env.observation_space.shape,
        'alpha': 0.0003,  # Default learning rate
        'gamma': 0.99,    # Default discount factor
        'gae_lambda': 0.95,  # Default GAE lambda
        'policy_clip': 0.2,
        'batch_size': 5,
        'N': 2048,
        'n_epochs': 4
    }

    # Flatten the three nested loops into one list of combinations; the
    # iteration order is unchanged (alpha outermost, gae_lambda innermost).
    grid = [(alpha, gamma, gae_lambda)
            for alpha in learning_rates
            for gamma in gammas
            for gae_lambda in gae_lambdas]

    try:
        for alpha, gamma, gae_lambda in grid:
            agent_params = {
                'n_actions': env.action_space.n,
                'input_dims': env.observation_space.shape,
                'alpha': alpha,
                'gamma': gamma,
                'gae_lambda': gae_lambda,
                'policy_clip': 0.2,
                'batch_size': 5,
                'N': 2048,
                'n_epochs': 4
            }
            avg_reward = evaluate_hyperparameters(env, agent_params)
            print(f'Tested {agent_params} -> Avg Reward: {avg_reward}')

            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                best_params = agent_params
    finally:
        # Release the environment's resources even if an evaluation fails.
        env.close()

    print(f"Best Hyperparameters: {best_params} with Average Reward: {best_avg_reward}")
    return best_params


if __name__ == '__main__':
    # Train a PPO agent on CartPole-v1 with the best configuration found by the
    # grid search, checkpoint it whenever the 100-episode average improves, and
    # plot the learning curves at the end.
    env = gym.make('CartPole-v1')
    best_params = hyperparameter_tuning()

    # Create the agent with the best hyperparameters
    agent = PPOAgent(**best_params)
    n_games = 1000
    # Number of environment steps between PPO updates. It was referenced below
    # without ever being defined, which raised NameError on the first step.
    N = best_params['N']

    # Directory for saving plots and model checkpoints
    # NOTE: PPOAgent takes no checkpoint directory, so save_models() writes to
    # the networks' default 'tmp/ppo'; checkpoint_dir is created but unused.
    plot_dir = './logs/ppo/plots'
    checkpoint_dir = './logs/ppo/checkpoints'
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    figure_file = os.path.join(plot_dir, 'cartpole.png')

    # Any first average beats -inf; avoids env.reward_range, which is not
    # available in every gym / gymnasium release.
    best_score = -np.inf
    score_history = []

    learn_iters = 0
    avg_score = 0
    n_steps = 0

    for i in range(n_games):
        observation, _ = env.reset()
        done = False
        score = 0

        while not done:
            action, prob, val = agent.choose_action(observation)
            # step() returns (obs, reward, terminated, truncated, info); the
            # episode is over when either flag is set.
            observation_, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            agent.save_models()

        print("Episode ", i, ": score %.1f" % score, " avg score %.1f" % avg_score,\
                " timesteps", n_steps, " learning steps", learn_iters)

    env.close()

    x = [i+1 for i in range(len(score_history))]
    plot_curve_smooth(x, score_history, figure_file)
    episodes = range(1, n_games + 1)
    # Pass a file prefix inside plot_dir; passing the directory itself saved
    # the image next to it as './logs/ppo/plots_additional_metrics.png'.
    metrics_prefix = os.path.join(plot_dir, 'cartpole')
    plot_additional_metrics(episodes, agent.actor_losses, agent.critic_losses, agent.values, metrics_prefix)

    # TODO: report average values over a fixed number of runs.

    # Notes:
    # - Switched to LeakyReLU so negative pre-activations still pass gradient.
    # - Added an entropy bonus so the agent does not converge too early; it
    #   rewards exploration and gave much better initial results.
    # - Added hyperparameter tuning because the model seemed quite sensitive.
    # - Report: add a section on catastrophic forgetting. PPO is very
    #   susceptible to it and often converges on bad policies.

Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.99, 'gae_lambda': 0.95, 'policy_clip': 0.2, 'batch_size': 5, 'N': 2048, 'n_epochs': 4} -> Avg Reward: 22.83
Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.99, 'gae_lambda': 0.97, 'policy_clip': 0.2, 'batch_size': 5, 'N': 2048, 'n_epochs': 4} -> Avg Reward: 22.51
Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.99, 'gae_lambda': 0.99, 'policy_clip': 0.2, 'batch_size': 5, 'N': 2048, 'n_epochs': 4} -> Avg Reward: 23.11
Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.95, 'gae_lambda': 0.95, 'policy_clip': 0.2, 'batch_size': 5, 'N': 2048, 'n_epochs': 4} -> Avg Reward: 22.3
Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.95, 'gae_lambda': 0.97, 'policy_clip': 0.2, 'batch_size': 5, 'N': 2048, 'n_epochs': 4} -> Avg Reward: 20.41
Tested {'n_actions': 2, 'input_dims': (4,), 'alpha': 0.001, 'gamma': 0.95, 'gae_lambda': 0.99, 'policy_cli