# Multi Agent Reinforcement Learning
In this notebook, we create several reinforcement learning environments based on the FrozenLake game from *OpenAI Gym* (now maintained as *Gymnasium*):
- a single-agent frozen lake environment
- a multi-agent, single-goal environment
- a multi-agent, four-goal environment

In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
import time
import random
import sys
import pygame
from collections import defaultdict
import os
import colorsys

from environments import MAPS, FrozenLakeOneGoal, createMap
from algorithms import SingleGoalCentralQLearning, RandomPolicy

## Single agent

## Multi agents
### Common goal
- **Running a learning algorithm**

In [None]:
def run_simulation(agent, map_, num_agent, num_episodes=10000, silent=True,
                   success_window_size=200, smoothing_window=500):
    """Train ``agent`` on a ``FrozenLakeOneGoal`` environment and plot the learning curves.

    Args:
        agent: Learner exposing ``select_action(state)`` and
            ``update(state, action, reward, next_state, done)``. Its
            ``epsilon`` attribute is read only when ``silent`` is False.
        map_: Map passed through to ``FrozenLakeOneGoal``.
        num_agent: Number of agents passed to the environment.
        num_episodes: Number of training episodes to run.
        silent: When False, print a progress line every 100 episodes.
        success_window_size: Number of most recent episodes used for the
            running success rate.
        smoothing_window: Width of the moving average applied to the episode
            rewards before plotting; clamped to the number of episodes run.

    Returns:
        The same ``agent`` object after training.

    Raises:
        ValueError: If ``success_window_size`` or ``smoothing_window`` is < 1.
    """
    if success_window_size < 1:
        raise ValueError("success_window_size must be at least 1")
    if smoothing_window < 1:
        raise ValueError("smoothing_window must be at least 1")

    # Create environment
    env = FrozenLakeOneGoal(map_=map_, max_steps=100, num_agents=num_agent)

    # Tracking metrics
    episode_rewards = []
    success_rate = []
    success_window = []

    try:
        for episode in range(num_episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False
            truncated = False

            # Run episode
            while not done and not truncated:
                action = agent.select_action(state)
                next_state, reward, done, truncated, _ = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

            # An episode counts as a success when its total reward exceeds 0.5
            success = total_reward > 0.5
            success_window.append(success)
            if len(success_window) > success_window_size:
                success_window.pop(0)

            # Success rate over the last `success_window_size` episodes
            current_success_rate = sum(success_window) / len(success_window)
            success_rate.append(current_success_rate)

            episode_rewards.append(total_reward)

            # Print progress
            if not silent and episode % 100 == 0:
                print(f"Episode: {episode}, Total Reward: {total_reward}, Success Rate: {current_success_rate:.2f}, Epsilon: {agent.epsilon:.4f}")
    finally:
        # Release environment resources even if training is interrupted
        env.close()

    # Nothing to plot (and np.convolve rejects empty input) when no episode ran
    if not episode_rewards:
        return agent

    # Clamp the kernel: with a kernel longer than the data, mode='valid'
    # silently swaps the operands and the result is no longer a moving average.
    window_size = min(smoothing_window, len(episode_rewards))
    mean_rewards_smooth = np.convolve(episode_rewards, np.ones(window_size) / window_size, mode='valid')
    # The k-th smoothed value averages the window ending at episode k + window_size - 1
    smoothed_episodes = np.arange(window_size - 1, len(episode_rewards))

    # Plot learning curves
    fig, (reward_ax, success_ax) = plt.subplots(1, 2, figsize=(12, 5))

    reward_ax.plot(smoothed_episodes, mean_rewards_smooth)
    reward_ax.axhline(y=1, color='black', linestyle='--', linewidth=2)
    reward_ax.set_title('Episode Rewards')
    reward_ax.set_xlabel('Episode')
    reward_ax.set_ylabel('Total Reward')

    success_ax.plot(success_rate)
    success_ax.set_title(f'Success Rate ({success_window_size}-episode moving average)')
    success_ax.set_xlabel('Episode')
    success_ax.set_ylabel('Success Rate')

    fig.tight_layout()
    plt.show()

    return agent

- **Visualizing the learned policy**

In [None]:
def _find_overlaps(state, num_agents):
    """Return the agent index pairs ``(a, b)``, ``a < b``, that share a position.

    Entries ``2*k`` and ``2*k + 1`` of ``state`` are read as agent ``k``'s position.
    """
    return [
        (a, b)
        for a in range(num_agents)
        for b in range(a + 1, num_agents)
        if (state[2 * a], state[2 * a + 1]) == (state[2 * b], state[2 * b + 1])
    ]


def visualize_policy(map_, agent, num_episodes=2, max_steps=20, use_pygame=True, num_agents=2):
    """Replay the greedy policy stored in ``agent.q_table`` and render each step.

    Args:
        map_: Map passed through to ``FrozenLakeOneGoal``.
        agent: Trained agent; ``agent.q_table[state]`` is flattened-argmaxed and
            unravelled into one action per agent using ``agent.action_size``.
        num_episodes: Number of test episodes to replay.
        max_steps: Maximum number of steps shown per episode.
        use_pygame: Render with ``env.render_pygame()`` when True, otherwise
            with ``env.render()``.
        num_agents: Number of agents in the environment.
    """
    env = FrozenLakeOneGoal(map_=map_, num_agents=num_agents)

    # Action names for better visualization
    action_names = {0: "LEFT", 1: "DOWN", 2: "RIGHT", 3: "UP"}
    joint_action_shape = tuple([agent.action_size] * num_agents)

    try:
        for episode in range(num_episodes):
            state, _ = env.reset()
            done = False
            truncated = False
            total_reward = 0
            steps = 0

            print(f"\n=== Test Episode {episode+1} ===")
            if use_pygame:
                env.render_pygame()
            else:
                print("Initial state:")
                env.render()

            while not done and not truncated and steps < max_steps:
                # Greedy joint action (no exploration): the flat argmax over the
                # Q-values is unravelled into one action index per agent.
                state_tuple = tuple(state)
                action = np.unravel_index(
                    np.argmax(agent.q_table[state_tuple]),
                    joint_action_shape
                )

                # Take action
                next_state, reward, done, truncated, _ = env.step(action)

                # Pairs of agents that ended up on the same cell
                overlaps = _find_overlaps(next_state, num_agents)

                # Update state and reward
                state = next_state
                total_reward += reward
                steps += 1

                # Render with action information
                print(f"Step {steps}:")
                for agent_idx in range(num_agents):
                    print(f"Agent {agent_idx+1}: {action_names[action[agent_idx]]}")
                print(f"Reward: {reward}")

                if overlaps:
                    print("Overlaps detected between agents:", overlaps)

                if use_pygame:
                    env.render_pygame()
                else:
                    env.render()
                # Slow the replay down so each step is visible
                time.sleep(0.5)

            print(f"Episode finished after {steps} steps with total reward: {total_reward}")
            if done and total_reward > 0:
                print("Success! At least one agent reached the goal.")
            elif done and total_reward <= 0:
                print("Failed. Agents fell into holes or couldn't reach the goal.")
            else:
                print("Truncated. Maximum steps reached.")

            # Short pause between episodes
            time.sleep(1)

    finally:
        # Close the environment exactly once, even if an exception was raised
        env.close()
        if pygame.get_init():  # Check if pygame is still initialized
            pygame.quit()  # Quit pygame completely

- **Main**

In [None]:
def joint_state_size(map_name, map_size, num_agent):
    """Return the number of joint states for ``num_agent`` agents on a square map.

    Args:
        map_name: Name of a predefined map such as ``'4x4'`` whose leading
            number is the side length, or None to use ``map_size``.
        map_size: Side length used when ``map_name`` is None.
        num_agent: Number of agents sharing the map.

    Returns:
        ``(side * side) ** num_agent``.

    Raises:
        ValueError: If the leading part of ``map_name`` is not an integer.
    """
    if map_name is None:
        side = map_size
    else:
        # '4x4' -> 4; multiplying the raw characters would raise TypeError
        side = int(map_name.lower().split('x')[0])
    return (side * side) ** num_agent


if __name__ == "__main__":
    print("Training the agents...")

    num_agent = 2
    n_ep            = 10000
    learning_rate   = 0.1
    # NOTE(review): 0.1 is an unusually low discount factor for a sparse-reward
    # task; confirm it is intentional.
    discount_factor = 0.1
    explo_rate      = 1.0
    explo_decay     = 0.999
    min_explo_rate  = 0.05
    map_name        = None #'4x4'
    map_size        = 4

    state_size      = joint_state_size(map_name, map_size, num_agent)
    action_size     = 4

    seed            = 0
    # Seed the global RNGs so that repeated runs of this cell are reproducible
    random.seed(seed)
    np.random.seed(seed)

    map_ = createMap(num_agent, map_size, map_name, seed)
    agent = SingleGoalCentralQLearning(state_size=state_size, action_size=action_size,num_agents=num_agent, 
                           learning_rate=learning_rate, discount_factor=discount_factor, exploration_rate=explo_rate,
                           exploration_decay=explo_decay, min_exploration_rate=min_explo_rate)

    trained_agent   = run_simulation(agent, map_, num_agent, num_episodes=n_ep)
    print("Training complete!")

    # Visualize the learned policy
    print("Visualizing the learned policy...")
    visualize_policy(map_, trained_agent, num_episodes=3, num_agents=num_agent)

### 4 goals

In [None]:
from environments import Frozen4goals
