## Initialize VizDoom

In [66]:
#necessary
!pip install vizdoom
!pip install opencv-python
!pip install pandas
!pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install gym
!pip install pyglet==1.5.11
!pip install joblib

# also need to install pytorch-cpu on anaconda

Looking in links: https://download.pytorch.org/whl/cu113/torch_stable.html


ERROR: Could not find a version that satisfies the requirement torch==1.10.1+cu113 (from versions: 1.11.0, 1.11.0+cu113, 1.12.0, 1.12.0+cu113, 1.12.1, 1.12.1+cu113, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0)
ERROR: No matching distribution found for torch==1.10.1+cu113




In [67]:
# import VizDoom for game env
from vizdoom import *
# Import environment base class from OpenAI Gym
from gymnasium import Env
# Import gym spaces
from gymnasium.spaces import Discrete, Box
# Import Opencv for greyscaling observations
import cv2

# Extra imports
import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from matplotlib import pyplot as plt


## VizDoom Environment

In [68]:
# Create VizDoom OpenAI Gym Environment
class VizDoomGym(Env): 
    def __init__(self, render=False):
        """
        Function called when we start the env.
        """

        # Inherit from Env
        super().__init__()
        
        # Set up game
        self.game = DoomGame()
        self.game.load_config('VizDoom/scenarios/defend_the_center.cfg')
        

        # Whether we want to render the game 
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        # Start the game
        self.game.init()
        
        # Create action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)

    
    def step(self, action, frame_skip=4):
        """
        How we take a step in the environment.
        """

        # Specify action and take step
        actions = np.identity(3, dtype=np.uint8)
        total_reward = 0
        for _ in range(frame_skip):
            reward = self.game.make_action(actions[action], 2)  # Increase frame skip value here
            total_reward += reward
            
            # Break the loop if the game ends during frame skipping
            if self.game.is_episode_finished():
                break
        
        if self.game.get_state():  # if nothing is
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)  # Apply Grayscale
            ammo = self.game.get_state().game_variables[0] 
            info = ammo
        # If we don't have anything turned from game.get_state
        else:
            # Return a numpy zero array
            state = np.zeros(self.observation_space.shape)
            # Return info (game variables) as zero
            info = 0

        info = {"info": info}
        done = self.game.is_episode_finished()
        truncated = False  # Assuming it's not truncated, modify if applicable
        
        print("Step state shape:", state.shape)
        return state, total_reward, done, truncated, info


    
    def render(self):
        """
        Define how to render the game environment.
        """
        pass

    
    def reset(self, seed=None):
        """
        Function for defining what happens when we start a new game.
        """
        if seed is not None:
            self.game.set_seed(seed)
            
        self.game.new_episode()
        state = self.game.get_state().screen_buffer  # Apply Grayscale

        print("Reset state shape:", state.shape)
        return self.grayscale(state)

    
    def grayscale(self, observation):
        """
        Function to grayscale the game frame and resize it.
        observation: gameframe
        """
        # Change colour channels 
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        
        # Reduce image pixel size for faster training
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize,(100, 160,1))
        return state

    def close(self):
        """
        Call to close down the game.
        """
        self.game.close()


## Custom PPO model

### PPO Algorithm

In [69]:
# PPO Algorithm

"""
https://www.youtube.com/watch?v=hlv79rcHws0

https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/PolicyGradient/PPO/torch/main.py
"""

class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs),\
                np.array(self.vals),\
                np.array(self.rewards),\
                np.array(self.dones),\
                batches

    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []

class ActorNetwork(nn.Module):
    def __init__(self, n_actions, input_dims, alpha, fc1_dims=64, fc2_dims=64, checkpoint_dir='tmp/ppo', noise_std=0.5):
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(checkpoint_dir, 'actor_torch_ppo')
        os.makedirs(os.path.dirname(self.checkpoint_file), exist_ok=True)
        
        self.noise_std = noise_std  # Standard deviation of the Gaussian noise
        total_input_size = int(T.prod(T.tensor(input_dims)))  # Flatten the input dimensions
        
        self.actor = nn.Sequential(
            nn.Linear(total_input_size, fc1_dims),
            nn.ReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.ReLU(),
            nn.Linear(fc2_dims, n_actions),
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        if state.dim() > 1:
            state = state.view(state.size(0), -1)  # Flatten the state
        
        logits = self.actor(state)
        if self.training:  # Only add noise during training
            noise = T.randn_like(logits) * self.noise_std
            logits = logits + noise
        dist = Categorical(logits=logits.softmax(dim=-1))

        return dist 
    
    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))

class CriticNetwork(nn.Module):
    def __init__(self, input_dims, alpha, fc1_dims=64, fc2_dims=64, checkpoint_dir='tmp/ppo'):
        super(CriticNetwork, self).__init__()

        self.checkpoint_file = os.path.join(checkpoint_dir, 'critic_torch_ppo')
        os.makedirs(os.path.dirname(self.checkpoint_file), exist_ok=True)
        
        total_input_size = int(T.prod(T.tensor(input_dims)))  # Flatten the input dimensions

        self.critic = nn.Sequential(
            nn.Linear(total_input_size, fc1_dims),
            nn.ReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.ReLU(),
            nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        if state.dim() > 1:
            state = state.view(state.size(0), -1)  # Flatten the state
        
        value = self.critic(state)
        return value
    
    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))
    

class PPOAgent:
    def __init__(self, n_actions, input_dims, gamma, alpha, gae_lambda,
                 policy_clip, batch_size, N, n_epochs, entropy_coefficient, save_dir, actor, critic):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.alpha = alpha
        self.batch_size = batch_size
        self.N = N
        self.n_epochs - n_epochs
        self.gae_lambda = gae_lambda
        self.entropy_coefficient = entropy_coefficient
        self.save_dir = save_dir

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)

        self.actor_losses = []
        self.critic_losses = []
        self.values = []

    def print_params(self):
        # Print statement accessing the attributes with descriptive names
        print(f"Agent Parameters:\n"
              f"  Discount Factor (Gamma): {self.gamma}\n"
              f"  Learning Rate (Alpha): {self.alpha}\n"
              f"  GAE Lambda: {self.gae_lambda}\n"
              f"  Policy Clipping Range: {self.policy_clip}\n"
              f"  Batch Size: {self.batch_size}\n"
              f"  Steps per Batch (N): {self.N}\n"
              f"  Number of Epochs per Update: {self.n_epochs}\n"
              f"  Entropy Coefficient: {self.entropy_coefficient}")

    def remember(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('...saving models...')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        T.save(self.actor.state_dict(), os.path.join(self.save_dir, 'actor.pt'))
        T.save(self.critic.state_dict(), os.path.join(self.save_dir, 'critic.pt'))
        print("Models saved!")

    def load_models(self, actor_path, critic_path):
        self.actor.load_state_dict(T.load(actor_path))
        self.critic.load_state_dict(T.load(critic_path))
        self.actor.eval()
        self.critic.eval()

    def choose_action(self, observation):
        print("Observation shape in choose_action:", observation.shape)
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        
        dist = self.actor(state)
        value = self.critic(state)
        action = dist.sample()

        probs = T.squeeze(dist.log_prob(action)).item()
        action = T.squeeze(action).item()
        value = T.squeeze(value).item()
        
        return action

    def learn(self):
        episode_actor_losses = []
        episode_critic_losses = []
        episode_values = []
        
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_probs_arr, vals_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()
            
            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    a_t += discount * (reward_arr[k] + self.gamma * values[k+1] * (1 - int(dones_arr[k])) - values[k])
                advantage[t] = a_t
            advantage = T.tensor(advantage).to(self.actor.device)

            values = T.tensor(values).to(self.actor.device)

            # sources explaining why we keep track of raw action probabilities:
            # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
            # https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient
            # essentially, makes gradient ascient easier
       
            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
                old_probs = T.tensor(old_probs_arr[batch]).to(self.actor.device)
                actions = T.tensor(action_arr[batch]).to(self.actor.device)

                dist = self.actor(states)
                critic_value = self.critic(states)

                critic_value = T.squeeze(critic_value)

                new_probs = dist.log_prob(actions)
                prob_ratio = new_probs.exp() / old_probs.exp()

                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                # Calculate entropy bonus
                entropy = dist.entropy().mean()
                actor_loss -= self.entropy_coefficient * entropy  # Adding entropy bonus

                returns = advantage[batch] + values[batch]
                critic_loss = (returns - critic_value) ** 2
                critic_loss = critic_loss.mean()

                total_loss = actor_loss + 0.5 * critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

            # Collect losses for each batch
            episode_actor_losses.append(actor_loss.item())
            episode_critic_losses.append(critic_loss.item())
            episode_values.append(critic_value.mean().item())

        # Store average loss and value for the episode
        self.actor_losses.append(np.mean(episode_actor_losses))
        self.critic_losses.append(np.mean(episode_critic_losses))
        self.values.append(np.mean(episode_values))

        self.memory.clear_memory()

    # Reset stored data after each episode or training session
    def reset_learning_debug_data(self):
        self.actor_losses = []
        self.critic_losses = []
        self.values = []

    def reset_learning_debug_data(self):
        self.actor_losses = []
        self.critic_losses = []
        self.values = []

### Main Loop

In [70]:
import gym
import numpy as np
from IPython.display import clear_output

if __name__ == '__main__':
    # Create environment
    env = VizDoomGym(render=True)

    best_params = {
                        'n_actions': env.action_space.n,
                        'input_dims': env.observation_space.shape,
                        'alpha': 0.0003,
                        'gamma': 0.99,
                        'gae_lambda': 0.92,
                        'policy_clip': 0.2,
                        'batch_size': 32,
                        'N': 16,
                        'n_epochs': 15,
                        'entropy_coefficient': 0.01,
                        'save_dir': "./output",
                        'actor': actor,
                        'critic': critic,
                    }
    
    # Load the models
    actor = ActorNetwork(n_actions=3, input_dims=[100, 160, 1], alpha=0.0003)
    critic = CriticNetwork(input_dims=[100, 160, 1], alpha=0.0003)
    agent = PPOAgent(**best_params)
    agent.load_models('./models/actor.pt', './models/critic.pt')

    episodes = 10
    for _ in range(episodes):
        observation = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = agent.choose_action(observation)
            observation, reward, done, info, _ = env.step(action)
            total_reward += reward

        print(f"Total reward: {total_reward}")

    env.close()

Reset state shape: (3, 240, 320)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shape: (100, 160, 1)
Observation shape in choose_action: (100, 160, 1)
Step state shap