# Entrenamiento de un Agente PPO con VizDoom

En este notebook, implementaremos un agente de Reinforcement Learning usando el algoritmo Proximal Policy Optimization (PPO) para jugar al juego de VizDoom.
Trabajo realizado por:
* Damián Pramparo
* Federico Domínguez Gómez

## Importación de bibliotecas
Usaremos las siguientes bibliotecas:
* **vizdoom**: Librería para interactuar con el entorno de VizDoom.
* **pandas**: Usada para crear un DataFrame y guardar datos en un archivo CSV.
* **torch**: Biblioteca para el aprendizaje profundo con PyTorch.
* **numpy**: Usada para manipular matrices numéricas.
* **Pool** (de `multiprocessing`): Para poder entrenar varios agentes al mismo tiempo, cada uno en su propio proceso.

In [1]:
import vizdoom
import pandas as pd
import torch
import numpy as np
from multiprocessing import Pool

# Definición de la clase PPOAgent
Creamos una clase **PPOAgent** que representa nuestro agente PPO. Esta clase se encarga de definir la arquitectura de la política y de implementar los métodos necesarios para el entrenamiento.

In [2]:
class PPOAgent:
    """Policy-gradient agent backed by a small fully connected network.

    NOTE: despite the class name, the loss implemented here is the plain
    ``-mean(log_prob * advantage)`` policy-gradient objective; there is no
    probability-ratio clipping and no value network in this class.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.001, alpha=0.99, epsilon=0.1):
        """Build the policy network and its RMSprop optimizer.

        Args:
            state_dim: Number of input features of the policy network.
            action_dim: Number of logits (one per discrete action).
            learning_rate: RMSprop learning rate.
            alpha: RMSprop smoothing constant.
            epsilon: RMSprop ``eps`` term added to the denominator.
        """
        self.policy_net = torch.nn.Sequential(
            torch.nn.Linear(state_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, action_dim),
        )

        self.optimizer = torch.optim.RMSprop(
            self.policy_net.parameters(), lr=learning_rate, alpha=alpha, eps=epsilon
        )

    def policy(self, state):
        """Sample an action index from the softmax over the network logits.

        Args:
            state: Input tensor for ``policy_net``; the logits are squeezed,
                so a single state (optionally with a batch axis of 1) is
                expected.

        Returns:
            The sampled action index as a Python ``int``.
        """
        # Sampling only needs the probabilities, so no autograd graph is built.
        with torch.no_grad():
            logits = torch.squeeze(self.policy_net(state))
            action_probs = torch.softmax(logits, dim=0)

        return torch.multinomial(action_probs, 1).item()

    def learn(self, experiences):
        """Run one optimizer step on the loss computed from ``experiences``.

        An empty batch is skipped: there is nothing to average over and no
        gradient to apply.
        """
        if not experiences:
            return

        loss = self.ppo_loss(experiences)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def ppo_loss(self, experiences):
        """Return ``-mean(log_prob * advantage)`` over ``experiences``.

        Args:
            experiences: Sequence of dicts with a ``'log_probs'`` entry (a
                scalar tensor, ideally still attached to ``policy_net``'s
                autograd graph, or a plain number) and an ``'advantage'``
                entry (a number or scalar tensor).

        Returns:
            A scalar tensor that supports ``backward()``. For an empty
            sequence a zero loss is returned.
        """
        if not experiences:
            return torch.zeros((), requires_grad=True)

        # torch.stack keeps every log-probability attached to the graph that
        # produced it. Re-wrapping the values with torch.tensor(...) would
        # create fresh leaf tensors, and backward() would then never reach
        # the policy network parameters.
        log_probs_tensor = torch.stack([
            torch.as_tensor(experience['log_probs'], dtype=torch.float32).reshape(())
            for experience in experiences
        ])

        # Advantages are constants of the objective; they need no gradient.
        advantages_tensor = torch.tensor(
            [float(experience['advantage']) for experience in experiences],
            dtype=torch.float32,
        )

        # Plain numbers carry no graph; mark the stacked tensor as a leaf so
        # that backward() still works for such callers (no parameter update
        # results in that case).
        if not log_probs_tensor.requires_grad:
            log_probs_tensor.requires_grad_(True)

        return -torch.mean(log_probs_tensor * advantages_tensor)

In [3]:
# Labels of the seven buttons, one per one-hot action vector below.
# Presumably this is the button order of the loaded scenario config -
# verify against deadly_corridor.cfg.
_ACTION_LABELS = (
    "MOVE_LEFT",
    "MOVE_RIGHT",
    "ATTACK",
    "MOVE_FORWARD",
    "MOVE_BACKWARD",
    "TURN_LEFT",
    "TURN_RIGHT",
)

# Row i presses only button i (a 7x7 identity matrix as nested lists).
sample_actions = [
    [1 if column == row else 0 for column in range(len(_ACTION_LABELS))]
    for row in range(len(_ACTION_LABELS))
]

In [6]:
def train_agent(agent_id, learning_rate, alpha, epsilon, sample_actions):
    """Train one PPOAgent on the deadly_corridor scenario for 100 episodes.

    Args:
        agent_id: Identifier returned unchanged alongside the trained agent.
        learning_rate: Learning rate forwarded to ``PPOAgent``.
        alpha: RMSprop smoothing constant forwarded to ``PPOAgent``.
        epsilon: RMSprop ``eps`` forwarded to ``PPOAgent``.
        sample_actions: List of button vectors; the sampled action index
            selects the vector passed to ``game.make_action``.

    Returns:
        The tuple ``(agent_id, agent)``.
    """
    state_dim = 4
    action_dim = 7
    num_episodes = 100

    agent = PPOAgent(state_dim, action_dim, learning_rate, alpha, epsilon)

    game = vizdoom.DoomGame()
    game.load_config("deadly_corridor.cfg")
    game.set_doom_scenario_path("deadly_corridor.wad")
    game.init()

    # try/finally so the game instance is released even if training raises.
    try:
        for episode in range(num_episodes):
            game.new_episode()

            # NOTE(review): the observation is a constant all-zeros vector;
            # nothing is read from the game, so the policy cannot react to
            # what happens in the episode. Replace with real features.
            state = torch.zeros(state_dim)

            experiences = []
            while not game.is_episode_finished():
                action = agent.policy(state)

                game.make_action(sample_actions[action])

                next_state = torch.zeros(state_dim)
                reward = game.get_last_reward()

                # The immediate reward is used directly as the advantage.
                advantage = reward

                # The network outputs raw logits, which may be negative, so
                # torch.log on a single logit yields NaN. log_softmax gives
                # the actual log-probability of the chosen action.
                logits = torch.squeeze(agent.policy_net(state))
                log_prob = torch.log_softmax(logits, dim=-1)[action]

                experiences.append({
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'next_state': next_state,
                    'advantage': advantage,
                    'log_probs': log_prob
                })

                state = next_state

            agent.learn(experiences)

            # NOTE(review): every episode (and every parallel worker that
            # shares the working directory) overwrites this same file, so
            # only the last write survives.
            df = pd.DataFrame(experiences)
            df.to_csv('datos.csv')
            print('Episode {}: {}'.format(episode, game.get_total_reward()))
    finally:
        game.close()

    return agent_id, agent

In [7]:
if __name__ == '__main__':
    # Hyperparameter sets to try, one tuple per agent:
    # (agent id, learning rate, RMSprop alpha, RMSprop eps).
    hyperparameter_combinations = [
        (0, 0.001, 0.99, 0.1),
        (1, 0.001, 0.95, 0.2),
        (2, 0.0005, 0.99, 0.05),
        (3, 0.002, 0.98, 0.15)
    ]

    # Argument tuples for train_agent; every worker gets the same action table.
    worker_args = [
        (agent_id, lr, a, eps, sample_actions)
        for agent_id, lr, a, eps in hyperparameter_combinations
    ]

    # Train the agents in parallel, one worker process per combination.
    with Pool(4) as pool:
        results = pool.starmap(train_agent, worker_args)

    # train_agent returns (agent_id, agent) pairs; index the agents by id.
    trained_agents = dict(results)

Episode 0: -103.22137451171875Episode 0: -103.23233032226562

Episode 0: -107.21281433105469
Episode 0: -114.01736450195312
Episode 1: -113.34738159179688
Episode 1: -99.25469970703125
Episode 1: -115.98941040039062
Episode 2: -103.3702392578125
Episode 1: -101.48481750488281
Episode 3: -112.37275695800781
Episode 4: -115.9073486328125
Episode 2: -112.25404357910156Episode 2: -110.64141845703125

Episode 2: -100.48384094238281
Episode 5: -113.77871704101562
Episode 6: -115.99784851074219
Episode 3: -105.92947387695312
Episode 3: -98.95620727539062
Episode 7: -115.99856567382812
Episode 8: -109.82630920410156
Episode 4: -112.47196960449219
Episode 9: -115.9708251953125
Episode 3: -79.02413940429688
Episode 10: -105.54454040527344Episode 4: -101.91645812988281

Episode 4: -78.50425720214844Episode 11: -83.68807983398438

Episode 5: -92.35569763183594
Episode 5: -102.18661499023438
Episode 12: -82.40225219726562
Episode 5: -113.17874145507812Episode 13: -114.45501708984375

Episode 14: -1

In [None]:
# NOTE(review): the original author flagged this cell as not working without
# identifying the cause; check the driver loop below before relying on it.
# Helper that evaluates an agent in the environment.
def evaluate_agent(agent, game, sample_actions, num_evaluation_episodes):
    """Play evaluation episodes and return the mean total reward per episode.

    Args:
        agent: Object whose ``policy(state)`` returns an action index.
        game: Initialised game exposing ``new_episode``,
            ``is_episode_finished``, ``make_action`` and ``get_last_reward``.
        sample_actions: List of button vectors indexed by the action index.
        num_evaluation_episodes: Number of episodes to play.

    Returns:
        ``np.mean`` of the per-episode reward sums.
    """
    episode_returns = []

    for _ in range(num_evaluation_episodes):
        game.new_episode()
        # The observation fed to the policy is a constant zero vector.
        observation = torch.zeros(4, dtype=torch.float32)
        accumulated = 0

        while True:
            if game.is_episode_finished():
                break
            chosen = agent.policy(observation)
            game.make_action(sample_actions[chosen])
            accumulated = accumulated + game.get_last_reward()
            observation = torch.zeros(4, dtype=torch.float32)

        episode_returns.append(accumulated)

    return np.mean(episode_returns)

# Evaluation driver: scores every trained agent and records the best one.
best_agent_id = None
best_average_reward = float("-inf")

evaluation_results = {}

# Number of episodes played per agent during evaluation.
num_evaluation_episodes = 10

# Guarded like the training cell: hyperparameter_combinations and
# trained_agents only exist when the file runs as the main module.
if __name__ == '__main__':
    # train_agent closes its own game, so evaluation needs a fresh instance.
    game = vizdoom.DoomGame()
    game.load_config("deadly_corridor.cfg")
    game.set_doom_scenario_path("deadly_corridor.wad")
    game.init()

    try:
        for agent_id, learning_rate, alpha, epsilon in hyperparameter_combinations:
            print(f"Evaluando Agente {agent_id} con hiperparámetros: lr={learning_rate}, alpha={alpha}, epsilon={epsilon}")

            # Reuse the agent trained in the parallel step instead of
            # retraining. train_agent returns an (agent_id, agent) tuple, so
            # its result cannot be passed to evaluate_agent directly.
            trained_agent = trained_agents[agent_id]

            average_reward = evaluate_agent(trained_agent, game, sample_actions, num_evaluation_episodes)
            evaluation_results[agent_id] = average_reward

            if average_reward > best_average_reward:
                best_agent_id = agent_id
                best_average_reward = average_reward
    finally:
        game.close()

print(evaluation_results)