In [1]:
!pip install pettingzoo[classic,mpe] supersuit torch matplotlib seaborn pandas scipy > /dev/null 2>&1

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pettingzoo.mpe import simple_spread_v3
import random
import os
from google.colab import drive
from scipy.optimize import linprog

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
_accelerator = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_accelerator)
print(f"Using device: {device}")

# A single seed shared by every RNG the notebook draws from
# (PyTorch first, then NumPy, then the stdlib `random` module).
SEED = 42
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(SEED)

  from pettingzoo.mpe import simple_spread_v3


Using device: cuda


In [2]:
class MatrixGame:
    """Repeated two-player matrix game with a fixed episode length.

    `payoffs` is converted to a float32 tensor indexed as
    `payoffs[a1, a2, player]`; each player's observation is the other
    player's most recent action, stored as a one-element float tensor.
    """

    def __init__(self, payoffs, max_steps=50):
        self.payoffs = torch.tensor(payoffs, dtype=torch.float32, device=device)
        self.n_actions = self.payoffs.shape[0]
        self.max_steps = max_steps
        self.current_step = 0
        self.last_actions = (0, 0)

    @staticmethod
    def _as_obs(action):
        """Wrap an action index in a one-element float tensor on `device`."""
        return torch.tensor([float(action)], device=device)

    def reset(self):
        """Restart the episode; both players initially observe action 0."""
        self.last_actions = (0, 0)
        self.current_step = 0
        return (self._as_obs(0), self._as_obs(0))

    def step(self, a1, a2):
        """Play one joint action.

        Returns `((obs1, obs2), r1, r2, done)`, where the rewards are 0-dim
        tensors taken from the payoff table and `done` becomes True once
        `max_steps` steps have been played since the last reset.
        """
        joint_payoff = self.payoffs[a1, a2]
        r1, r2 = joint_payoff[0], joint_payoff[1]

        self.current_step += 1
        self.last_actions = (a1, a2)
        done = self.current_step >= self.max_steps

        # Each player sees what the *other* player just did.
        observations = (self._as_obs(a2), self._as_obs(a1))
        return observations, r1, r2, done

# Payoff tables are nested lists indexed as PAYOFF[a1][a2] = [r1, r2], where a1
# is player 1's action, a2 is player 2's action and r1/r2 are their rewards
# (this is how MatrixGame.step reads them).

# 3x3 zero-sum game; presumably rock-paper-scissors (the name suggests it, the
# action labelling is not stated here).
RPS_PAYOFF = [
    [[0, 0], [-1, 1], [1, -1]],
    [[1, -1], [0, 0], [-1, 1]],
    [[-1, 1], [1, -1], [0, 0]]
]

# 2x2 zero-sum game: player 1 gets +1 when the actions match, player 2 gets +1
# when they differ (matching-pennies structure).
MP_PAYOFF = [
    [[1, -1], [-1, 1]],
    [[-1, 1], [1, -1]]
]

# 2x2 general-sum game with prisoner's-dilemma payoffs: (0, 0) pays 3 each,
# (1, 1) pays 1 each, and a lone action 1 pays 5 against 0.
# Presumably action 0 = cooperate and action 1 = defect.
IPD_PAYOFF = [
    [[3, 3], [0, 5]],
    [[5, 0], [1, 1]]
]

# 2x2 coordination game: both players are rewarded only when their actions
# match, but player 1 prefers (0, 0) and player 2 prefers (1, 1).
BOS_PAYOFF = [
    [[3, 2], [0, 0]],
    [[0, 0], [2, 3]]
]


In [3]:
def solve_minimax(Q_values):
    """Solve the maximin linear program for the row player of a matrix game.

    Args:
        Q_values: 2-D tensor where `Q_values[i, j]` is the row player's payoff
            when the row player picks `i` and the column player picks `j`.

    Returns:
        `(probs, value)`: `probs` is a float32 tensor (on the same device as
        `Q_values`) holding the row player's maximin mixed strategy, and
        `value` is the guaranteed payoff as a Python float. If the LP cannot
        be solved, the uniform strategy and `0.0` are returned instead.
    """
    Q_np = Q_values.detach().cpu().numpy()
    rows, cols = Q_np.shape
    out_device = Q_values.device

    # Decision variables are [x_0 .. x_{rows-1}, v]; linprog minimises, so
    # maximising the game value v means minimising -v.
    c = np.zeros(rows + 1)
    c[-1] = -1
    # For every opponent column j:  v - sum_i x_i * Q[i, j] <= 0.
    A_ub = np.hstack([-Q_np.T, np.ones((cols, 1))])
    b_ub = np.zeros(cols)
    # The strategy must sum to one (v is excluded from the sum).
    A_eq = np.ones((1, rows + 1))
    A_eq[0, -1] = 0
    b_eq = np.array([1])
    bounds = [(0, 1)] * rows + [(None, None)]

    try:
        res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')
    except Exception:
        # Best-effort solver: e.g. non-finite Q entries make linprog raise.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        res = None

    if res is not None and res.success:
        probs = np.maximum(res.x[:rows], 0)  # clip tiny negative round-off
        total = probs.sum()
        if total > 0:
            strategy = torch.tensor(probs / total, dtype=torch.float32, device=out_device)
            return strategy, float(res.x[-1])

    return torch.ones(rows, device=out_device) / rows, 0.0

def solve_ce(Q1, Q2):
    """Find a correlated equilibrium that maximises the sum of both payoffs.

    Args:
        Q1: 2-D tensor of the row player's payoffs, `Q1[i, j]`.
        Q2: 2-D tensor of the column player's payoffs, same shape as `Q1`.

    Returns:
        `(joint, val1, val2)`: `joint` is a float32 tensor (on the same device
        as `Q1`) with the joint action distribution, and `val1` / `val2` are
        the players' expected payoffs under it as Python floats. If the LP
        cannot be solved, the uniform joint distribution and `0.0, 0.0` are
        returned instead.
    """
    Q1_np = Q1.detach().cpu().numpy()
    Q2_np = Q2.detach().cpu().numpy()
    n_a1, n_a2 = Q1_np.shape
    out_device = Q1.device

    # linprog minimises, so negate the total payoff to maximise it.
    c = -(Q1_np + Q2_np).flatten()

    A_ub = []
    # Row player: when told to play i, deviating to k must not pay off:
    #   sum_j p(i, j) * (Q1[i, j] - Q1[k, j]) >= 0
    for i in range(n_a1):
        for k in range(n_a1):
            if i == k:
                continue
            gain = np.zeros((n_a1, n_a2))
            gain[i, :] = Q1_np[i, :] - Q1_np[k, :]
            A_ub.append(-gain.flatten())

    # Column player: when told to play j, deviating to k must not pay off:
    #   sum_i p(i, j) * (Q2[i, j] - Q2[i, k]) >= 0
    for j in range(n_a2):
        for k in range(n_a2):
            if j == k:
                continue
            gain = np.zeros((n_a1, n_a2))
            gain[:, j] = Q2_np[:, j] - Q2_np[:, k]
            A_ub.append(-gain.flatten())

    # With a single action per player there are no deviation constraints;
    # linprog must then be given None rather than an empty array.
    if A_ub:
        A_ub_arr, b_ub_arr = np.array(A_ub), np.zeros(len(A_ub))
    else:
        A_ub_arr, b_ub_arr = None, None

    A_eq = np.ones((1, n_a1 * n_a2))
    b_eq = np.array([1])
    bounds = [(0, 1)] * (n_a1 * n_a2)

    try:
        res = linprog(c, A_ub=A_ub_arr, b_ub=b_ub_arr,
                      A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')
    except Exception:
        # Best-effort solver: e.g. non-finite Q entries make linprog raise.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        res = None

    if res is not None and res.success:
        probs = np.maximum(res.x.reshape(n_a1, n_a2), 0)  # clip round-off
        total = probs.sum()
        if total > 0:
            probs = probs / total
            val1 = float(np.sum(probs * Q1_np))
            val2 = float(np.sum(probs * Q2_np))
            return torch.tensor(probs, dtype=torch.float32, device=out_device), val1, val2

    uni = torch.ones((n_a1, n_a2), device=out_device) / (n_a1 * n_a2)
    return uni, 0.0, 0.0


In [4]:
class MinimaxQAgent:
    """Stateless minimax-Q learner for a two-player matrix game.

    `Q[own_action, opponent_action]` estimates the agent's return; the policy
    `pi` and value `V` are whatever `solve_minimax` returns for that table.
    """

    def __init__(self, n_actions, lr=0.1, gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        self.lr, self.gamma, self.epsilon = lr, gamma, epsilon
        self.Q = torch.zeros((n_actions, n_actions), device=device)
        self.V = 0.0
        self.pi = torch.ones(n_actions, device=device) / n_actions

    def get_action(self, obs=None):
        """Return `(action, None, pi)`; `obs` is accepted but not used."""
        exploring = random.random() < self.epsilon
        if exploring:
            chosen = random.randint(0, self.n_actions - 1)
        else:
            chosen = torch.distributions.Categorical(self.pi).sample().item()
        return torch.tensor(chosen, device=device), None, self.pi

    def update(self, prev_a, prev_opp_a, reward):
        """Re-solve the minimax LP on the current table, then do one TD update."""
        # Policy and value come from Q as it was *before* this step's update.
        self.pi, self.V = solve_minimax(self.Q)
        td_target = reward + self.gamma * self.V
        td_error = td_target - self.Q[prev_a, prev_opp_a]
        self.Q[prev_a, prev_opp_a] += self.lr * td_error

class CEQAgent:
    """Stateless correlated-equilibrium Q-learner for a two-player matrix game.

    Both tables are indexed `[own_action, opponent_action]`: `Q_own` holds this
    agent's payoff estimates and `Q_opp` its estimate of the opponent's.
    `joint_pi` is the joint distribution returned by `solve_ce(Q_own, Q_opp)`
    and therefore shares that `[own, opponent]` layout.
    """

    def __init__(self, n_actions, player_id, lr=0.1, gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        # Kept for interface compatibility; the tables are already expressed
        # from this agent's own point of view, so it does not affect indexing.
        self.player_id = player_id
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q_own = torch.zeros((n_actions, n_actions), device=device)
        self.Q_opp = torch.zeros((n_actions, n_actions), device=device)
        self.joint_pi = torch.ones((n_actions, n_actions), device=device) / (n_actions * n_actions)

    def get_action(self, obs=None):
        """Return `(action, None, marginals)`; `obs` is accepted but not used.

        With probability `epsilon` a uniformly random action is taken and the
        uniform distribution is reported; otherwise the action is sampled from
        this agent's marginal of `joint_pi`.
        """
        out_device = self.joint_pi.device
        uniform = torch.ones(self.n_actions, device=out_device) / self.n_actions

        if random.random() < self.epsilon:
            action = random.randint(0, self.n_actions - 1)
            marginals = uniform
        else:
            # `update` always receives the agent's own action first, so rows of
            # joint_pi are this agent's actions for *both* players. Summing over
            # dim=0 (as was done for player_id == 1) yields the opponent's
            # marginal instead of the agent's own.
            marginals = self.joint_pi.sum(dim=1)
            if marginals.sum() == 0:
                marginals = uniform
            action = torch.distributions.Categorical(marginals).sample().item()
        return torch.tensor(action, device=out_device), None, marginals

    def update(self, prev_a, prev_opp_a, reward, opp_reward):
        """Re-solve the CE on the current tables, then TD-update both tables.

        Args:
            prev_a: this agent's last action.
            prev_opp_a: the opponent's last action.
            reward: this agent's reward for the joint action.
            opp_reward: the opponent's reward for the joint action.
        """
        self.joint_pi, v_own, v_opp = solve_ce(self.Q_own, self.Q_opp)

        target_own = reward + self.gamma * v_own
        target_opp = opp_reward + self.gamma * v_opp

        self.Q_own[prev_a, prev_opp_a] += self.lr * (target_own - self.Q_own[prev_a, prev_opp_a])
        self.Q_opp[prev_a, prev_opp_a] += self.lr * (target_opp - self.Q_opp[prev_a, prev_opp_a])

class NashFoFAgent:
    """Stateless Q-learner used for the "Nash-Q" and "FoF-Q" experiments.

    Both variants act on the maximin strategy of `Q_own` (as returned by
    `solve_minimax`). They differ only in `update`: "Nash-Q" also maintains
    `Q_opp`, an estimate of the opponent's payoffs, while "FoF-Q" leaves
    `Q_opp` untouched. Both tables are indexed `[own_action, opponent_action]`.
    """

    def __init__(self, n_actions, algo_type="Nash-Q", lr=0.1, gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        self.algo_type = algo_type
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q_own = torch.zeros((n_actions, n_actions), device=device)
        self.Q_opp = torch.zeros((n_actions, n_actions), device=device)

    def get_action(self, obs=None):
        """Return `(action, None, probs)`; `obs` is accepted but not used.

        With probability `epsilon` a uniformly random action is taken and the
        uniform distribution is reported as `probs`.
        """
        if random.random() < self.epsilon:
            action = random.randint(0, self.n_actions - 1)
            probs = torch.ones(self.n_actions, device=device) / self.n_actions
        else:
            # Both algorithm variants use the same maximin policy on Q_own.
            probs, _ = solve_minimax(self.Q_own)
            action = torch.distributions.Categorical(probs).sample().item()
        return torch.tensor(action, device=device), None, probs

    def update(self, prev_a, prev_opp_a, reward, opp_reward):
        """TD-update `Q_own` (and `Q_opp` for "Nash-Q") for one joint action.

        Args:
            prev_a: this agent's last action.
            prev_opp_a: the opponent's last action.
            reward: this agent's reward for the joint action.
            opp_reward: the opponent's reward for the joint action.
        """
        # Both bootstrap values are computed from the tables as they were
        # before this step's updates.
        _, next_val_own = solve_minimax(self.Q_own)
        track_opponent = self.algo_type == "Nash-Q"
        if track_opponent:
            # solve_minimax maximises for the ROW player. In Q_opp the
            # opponent's actions are the columns, so transpose to make the
            # opponent the row player before solving for its value.
            _, next_val_opp = solve_minimax(self.Q_opp.T)

        target_own = reward + self.gamma * next_val_own
        self.Q_own[prev_a, prev_opp_a] += self.lr * (target_own - self.Q_own[prev_a, prev_opp_a])

        if track_opponent:
            target_opp = opp_reward + self.gamma * next_val_opp
            self.Q_opp[prev_a, prev_opp_a] += self.lr * (target_opp - self.Q_opp[prev_a, prev_opp_a])

class PPO_ActorCritic(nn.Module):
    """Two small MLPs: a softmax policy head and a scalar value head.

    The actor and the critic take separately sized inputs so the critic can
    be fed a different (e.g. larger) observation than the actor.
    """

    def __init__(self, actor_obs_dim, critic_obs_dim, act_dim, hidden_dim=64):
        super().__init__()
        # Policy: observation -> action probabilities.
        self.actor = nn.Sequential(
            nn.Linear(actor_obs_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, act_dim),
            nn.Softmax(dim=-1),
        )
        # Value function: critic observation -> scalar estimate.
        self.critic = nn.Sequential(
            nn.Linear(critic_obs_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        )

    def get_action(self, x):
        """Sample an action; returns `(action, log_prob, action_probs)`."""
        action_probs = self.actor(x)
        policy = torch.distributions.Categorical(action_probs)
        sampled = policy.sample()
        return sampled, policy.log_prob(sampled), action_probs

    def get_value(self, x):
        """Return the critic's output for `x`."""
        return self.critic(x)

class PPO_Agent_Wrapper:
    """Bundles a `PPO_ActorCritic` with its Adam optimiser and a clipped update.

    `pareto_weight` blends the individual reward with the team reward;
    `is_mappo` is only stored here and is read by the training loops.
    """

    def __init__(self, actor_obs_dim, critic_obs_dim, act_dim, lr=0.002, is_mappo=False, pareto_weight=0.0):
        self.is_mappo = is_mappo
        self.pareto_weight = pareto_weight
        self.ac = PPO_ActorCritic(actor_obs_dim, critic_obs_dim, act_dim).to(device)
        self.opt = optim.Adam(self.ac.parameters(), lr=lr)

    def get_action(self, x):
        """Delegate to the network: `(action, log_prob, action_probs)`."""
        return self.ac.get_action(x)

    def update(self, rollouts):
        """Run one gradient step on a batch of transitions.

        Each rollout entry is a tuple
        `(actor_obs, critic_obs, action, individual_reward, team_reward, old_log_prob)`.
        """
        # Turn the list of per-step tuples into per-field columns.
        columns = list(zip(*rollouts))
        actor_obs = torch.stack(columns[0])
        critic_obs = torch.stack(columns[1])
        act = torch.tensor(list(columns[2]), device=device)
        ind_r = torch.tensor(list(columns[3]), device=device)
        team_r = torch.tensor(list(columns[4]), device=device)
        old_log_probs = torch.tensor(list(columns[5]), device=device)

        # Reward actually optimised: convex mix of individual and team reward.
        own_share = 1 - self.pareto_weight
        hybrid_reward = (own_share * ind_r + self.pareto_weight * team_r).view(-1)

        V = self.ac.get_value(critic_obs).view(-1)
        # One-step advantage: the immediate reward minus the critic's estimate.
        adv = hybrid_reward - V.detach()

        # Only the probabilities are needed; the sampled action is discarded.
        _, _, new_probs = self.ac.get_action(actor_obs)
        curr_log_prob = torch.distributions.Categorical(new_probs).log_prob(act)
        ratio = torch.exp(curr_log_prob - old_log_probs)

        unclipped = ratio * adv
        clipped = torch.clamp(ratio, 0.8, 1.2) * adv
        actor_loss = -torch.min(unclipped, clipped).mean()
        critic_loss = F.mse_loss(V, hybrid_reward)
        total_loss = actor_loss + 0.5 * critic_loss

        self.opt.zero_grad()
        total_loss.backward()
        self.opt.step()

class QMixer(nn.Module):
    """State-conditioned mixing network that combines per-agent Q-values.

    Hypernetworks map the state to the weights and biases of a two-layer
    mixer. The mixing weights pass through an absolute value, so they are
    never negative.
    """

    def __init__(self, n_agents, state_dim, embed_dim=32):
        super().__init__()
        self.n_agents = n_agents
        self.embed_dim = embed_dim
        # Hypernetworks producing the mixer's parameters from the state.
        self.hyper_w1 = nn.Sequential(nn.Linear(state_dim, embed_dim * n_agents), nn.ReLU())
        self.hyper_w2 = nn.Sequential(nn.Linear(state_dim, embed_dim), nn.ReLU())
        self.hyper_b1 = nn.Linear(state_dim, embed_dim)
        self.hyper_b2 = nn.Sequential(nn.Linear(state_dim, 1), nn.ReLU())

    def forward(self, q_values, state):
        """Mix `q_values` (batch, n_agents) given `state` (batch, state_dim).

        Returns a tensor with one row per batch element.
        """
        batch = q_values.size(0)
        agent_qs = q_values.view(batch, 1, self.n_agents)

        # First layer: (batch, 1, n_agents) x (batch, n_agents, embed_dim).
        layer1_w = torch.abs(self.hyper_w1(state)).view(batch, self.n_agents, self.embed_dim)
        layer1_b = self.hyper_b1(state).view(batch, 1, self.embed_dim)
        hidden = F.elu(torch.bmm(agent_qs, layer1_w) + layer1_b)

        # Second layer: (batch, 1, embed_dim) x (batch, embed_dim, 1).
        layer2_w = torch.abs(self.hyper_w2(state)).view(batch, self.embed_dim, 1)
        layer2_b = self.hyper_b2(state).view(batch, 1, 1)
        q_tot = torch.bmm(hidden, layer2_w) + layer2_b

        return q_tot.view(batch, -1)

class LolaAgent:
    """Softmax policy over actions trained with a one-step opponent look-ahead.

    `theta` holds the policy logits and is optimised with plain SGD.
    """

    def __init__(self, n_actions, lr=0.1):
        self.lr = lr
        self.theta = torch.zeros(n_actions, requires_grad=True, device=device)
        self.opt = optim.SGD([self.theta], lr=lr)

    def get_probs(self):
        """Current action probabilities (softmax of `theta`)."""
        return torch.softmax(self.theta, dim=0)

    def step_lola(self, opp_theta, payoff_matrix, opp_lr, is_player_1=True):
        """Take one gradient step that anticipates the opponent's next update.

        Args:
            opp_theta: the opponent's logits (must require grad).
            payoff_matrix: tensor indexed `[a1, a2, player]`.
            opp_lr: step size assumed for the opponent's own update.
            is_player_1: whether this agent is the first (row) player.
        """
        own_probs = torch.softmax(self.theta, dim=0)
        opp_probs = torch.softmax(opp_theta, dim=0)
        own_slot, opp_slot = (0, 1) if is_player_1 else (1, 0)

        def expected_payoff(own, opp, slot):
            # The einsum always takes the row player's probabilities first.
            row, col = (own, opp) if is_player_1 else (opp, own)
            return torch.einsum('i,j,ij->', row, col, payoff_matrix[:, :, slot])

        # Simulate one naive gradient-ascent step by the opponent, keeping the
        # graph so the result stays differentiable w.r.t. our own logits.
        opp_value = expected_payoff(own_probs, opp_probs, opp_slot)
        opp_grad = torch.autograd.grad(opp_value, opp_theta, create_graph=True)[0]
        opp_probs_next = torch.softmax(opp_theta + opp_lr * opp_grad, dim=0)

        # Our payoff against the opponent's anticipated policy.
        lookahead_value = expected_payoff(own_probs, opp_probs_next, own_slot)

        self.opt.zero_grad()
        (-lookahead_value).backward()
        self.opt.step()

class BotAgent:
    """Scripted, non-learning opponent.

    Strategies: 'Random' picks a uniform random action, 'Bully' always plays
    action 1, 'TitForTat' repeats the action found in its observation, and
    any other name always plays action 0.
    """

    def __init__(self, strategy, action_dim):
        self.strategy = strategy
        self.action_dim = action_dim

    def get_action(self, obs):
        """Return `(action, None, None)`, matching the learners' interface."""
        # The observation carries the other player's previous action.
        observed_action = int(obs.item())
        strategy = self.strategy

        if strategy == 'TitForTat':
            choice = observed_action
        elif strategy == 'Bully':
            choice = 1
        elif strategy == 'Random':
            choice = random.randint(0, self.action_dim - 1)
        else:
            choice = 0
        return torch.tensor(choice, device=device), None, None

    def update(self, *args):
        """No-op: scripted bots do not learn."""
        return None


In [5]:
class PSRO_Session:
    """Policy-space response oracle loop on a two-player matrix game.

    Each player keeps a population of mixed strategies (starting with the
    uniform one). Every epoch evaluates all population pairings, solves a
    maximin distribution over each population, and appends each player's
    pure best response to the opponent's resulting mixture.
    """

    def __init__(self, game_matrix, n_epochs=5, oracle_episodes=50):
        # game_matrix[a1, a2, player] is the payoff of `player` for (a1, a2).
        self.game_matrix = torch.tensor(game_matrix, dtype=torch.float32, device=device)
        self.n_actions = self.game_matrix.shape[0]
        self.pop_1 = [torch.ones(self.n_actions, device=device)/self.n_actions]
        self.pop_2 = [torch.ones(self.n_actions, device=device)/self.n_actions]
        self.n_epochs = n_epochs
        self.oracle_episodes = oracle_episodes  # stored but not used by this class
        self.meta_game_matrix = np.zeros((1, 1, 2))

    def evaluate_matchup(self, policy1, policy2):
        """Expected payoffs `(u1, u2)` as floats when the two mixed strategies meet."""
        u1 = torch.einsum('i,j,ij->', policy1, policy2, self.game_matrix[:,:,0])
        u2 = torch.einsum('i,j,ij->', policy1, policy2, self.game_matrix[:,:,1])
        return u1.item(), u2.item()

    def update_meta_game(self):
        """Rebuild `meta_game_matrix[i, j, player]` for the current populations."""
        r1_len = len(self.pop_1)
        r2_len = len(self.pop_2)
        meta = np.zeros((r1_len, r2_len, 2))
        for i in range(r1_len):
            for j in range(r2_len):
                meta[i, j, 0], meta[i, j, 1] = self.evaluate_matchup(self.pop_1[i], self.pop_2[j])
        self.meta_game_matrix = meta

    @staticmethod
    def _solve_maximin(payoff):
        """Maximin mixed strategy (NumPy array) for the row player of `payoff`.

        `payoff[i, j]` is the maximising player's payoff for own strategy `i`
        against opponent strategy `j`. Falls back to the uniform distribution
        when the LP raises or does not report success.
        """
        n_own, n_opp = payoff.shape
        uniform = np.ones(n_own) / n_own

        # Variables are [x_0 .. x_{n_own-1}, v]; maximise v by minimising -v.
        c = np.zeros(n_own + 1)
        c[-1] = -1
        # For every opponent strategy j:  v - sum_i x_i * payoff[i, j] <= 0.
        A_ub = np.hstack([-payoff.T, np.ones((n_opp, 1))])
        b_ub = np.zeros(n_opp)
        A_eq = np.ones((1, n_own + 1))
        A_eq[0, -1] = 0
        b_eq = np.array([1])
        bounds = [(0, 1)] * n_own + [(None, None)]

        try:
            res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')
        except Exception:
            # Best-effort solver; unlike a bare `except`, this does not
            # swallow KeyboardInterrupt/SystemExit.
            return uniform
        if not res.success or res.x is None:
            return uniform

        dist = np.maximum(res.x[:n_own], 0)  # clip tiny negative round-off
        total = dist.sum()
        return dist / total if total > 0 else uniform

    def solve_meta_nash(self):
        """Return each player's maximin distribution over its population.

        Each player maximises its own worst-case meta-game payoff; the result
        is a pair of float32 tensors on the game matrix's device.
        """
        r1 = self.meta_game_matrix[:, :, 0]
        r2 = self.meta_game_matrix[:, :, 1]
        out_device = self.game_matrix.device

        meta_dist_1 = self._solve_maximin(r1)
        # Player 2 picks columns, so transpose to make it the row player.
        meta_dist_2 = self._solve_maximin(r2.T)

        return (torch.tensor(meta_dist_1, dtype=torch.float32, device=out_device),
                torch.tensor(meta_dist_2, dtype=torch.float32, device=out_device))

    def train_oracle(self, opp_dist, is_p1):
        """Pure best response (one-hot tensor) to the opponent's meta-mixture.

        Args:
            opp_dist: weights over the opponent's population.
            is_p1: True to compute player 1's response, False for player 2's.
        """
        out_device = self.game_matrix.device
        opp_population = self.pop_2 if is_p1 else self.pop_1

        # Collapse the opponent's population into a single mixed strategy.
        avg_policy_opp = torch.zeros(self.n_actions, device=out_device)
        for prob, policy in zip(opp_dist, opp_population):
            avg_policy_opp += prob * policy

        best_r = -float('inf')
        best_pure = 0
        for a in range(self.n_actions):
            pure = torch.zeros(self.n_actions, device=out_device)
            pure[a] = 1.0
            if is_p1:
                r, _ = self.evaluate_matchup(pure, avg_policy_opp)
            else:
                _, r = self.evaluate_matchup(avg_policy_opp, pure)
            # Strict comparison: ties keep the lowest-indexed action.
            if r > best_r:
                best_r = r
                best_pure = a

        new_pol = torch.zeros(self.n_actions, device=out_device)
        new_pol[best_pure] = 1.0
        return new_pol

    def run(self):
        """Run `n_epochs` expansion rounds and return the last meta-distributions.

        The returned `(m1, m2)` were solved before the final best responses
        were appended, so each is one entry shorter than its population after
        a non-empty run. With `n_epochs <= 0` the meta-game of the initial
        populations is solved and returned.
        """
        if self.n_epochs <= 0:
            self.update_meta_game()
            return self.solve_meta_nash()

        for epoch in range(self.n_epochs):
            self.update_meta_game()
            m1, m2 = self.solve_meta_nash()

            new_pol_1 = self.train_oracle(m2, is_p1=True)
            new_pol_2 = self.train_oracle(m1, is_p1=False)

            self.pop_1.append(new_pol_1)
            self.pop_2.append(new_pol_2)

        return m1, m2


In [6]:
def calculate_metrics(rewards_n):
    """Summarise per-agent rewards as `(social_welfare, fairness_index)`.

    Args:
        rewards_n: sequence with one total reward per agent (any length).

    Returns:
        social_welfare: the sum of all rewards.
        fairness_index: `1 - (max - min) / (sum of |reward| + 1e-8)`. It is
            1.0 when all agents earn the same and approaches 0 when one agent
            takes everything. For two agents this equals
            `1 - |r0 - r1| / (|r0| + |r1| + 1e-8)`; for more agents every
            agent contributes rather than only the first two. Empty input
            yields `(0, 1.0)`.
    """
    rewards = list(rewards_n)
    social_welfare = sum(rewards)
    if not rewards:
        return social_welfare, 1.0

    spread = max(rewards) - min(rewards)
    total = sum(abs(r) for r in rewards)
    # The epsilon keeps the ratio defined when every reward is zero.
    fairness_index = 1 - (spread / (total + 1e-8))
    return social_welfare, fairness_index

def train_matrix_game_generic(algo_type, game_matrix, opponent_type="Self", n_episodes=200, is_one_shot=False, lr=0.01):
    """Train one algorithm on a two-player matrix game and record its history.

    Args:
        algo_type: "PSRO", "LOLA", "Minimax-Q", "CE-Q", "Nash-Q", "FoF-Q", or
            any other name, which is treated as a PPO variant ("MAPPO" gets a
            critic that sees both observations, "Pareto-AC" mixes individual
            and team reward with weight 0.5).
        game_matrix: payoff table indexed `[a1][a2] -> [r1, r2]`.
        opponent_type: "Self" for self-play, otherwise a `BotAgent` strategy.
        n_episodes: number of episodes (multiplied by 50 when `is_one_shot`).
        is_one_shot: play 1 step per episode instead of 50.
        lr: learning rate handed to the created agents.

    Returns:
        dict with lists under 'p1_probs', 'p2_probs', 'social_welfare' and
        'fairness'. PSRO returns a single final mixture per player and empty
        metric lists. LOLA self-play updates the policies analytically, so it
        records probabilities only and leaves the metric lists empty.
    """
    steps_per_ep = 1 if is_one_shot else 50
    if is_one_shot:
        n_episodes *= 50

    env = MatrixGame(game_matrix, max_steps=steps_per_ep)
    n_acts = env.n_actions

    if algo_type == "PSRO":
        psro = PSRO_Session(game_matrix, n_epochs=10)
        m1, m2 = psro.run()

        # Collapse each population into one mixed strategy over base actions.
        p1_atomic = torch.zeros(n_acts, device=device)
        for w, pol in zip(m1, psro.pop_1):
            p1_atomic += w * pol

        p2_atomic = torch.zeros(n_acts, device=device)
        for w, pol in zip(m2, psro.pop_2):
            p2_atomic += w * pol

        return {
            'p1_probs': [p1_atomic.cpu().numpy()],
            'p2_probs': [p2_atomic.cpu().numpy()],
            'social_welfare': [],
            'fairness': []
        }

    history = {'p1_probs': [], 'p2_probs': [], 'social_welfare': [], 'fairness': []}

    def create_agent(atype, pid=0):
        """Build the learner for `atype`; `pid` is the player index (0 or 1)."""
        if atype == "LOLA":
            return LolaAgent(n_acts, lr=lr)
        elif atype == "Minimax-Q":
            return MinimaxQAgent(n_acts, lr=lr)
        elif atype == "CE-Q":
            return CEQAgent(n_acts, pid, lr=lr)
        elif atype in ["Nash-Q", "FoF-Q"]:
            return NashFoFAgent(n_acts, algo_type=atype, lr=lr)
        else:
            is_mappo = (atype == "MAPPO")
            pareto_w = 0.5 if atype == "Pareto-AC" else 0.0
            critic_dim = 2 if is_mappo else 1
            return PPO_Agent_Wrapper(1, critic_dim, n_acts, lr=lr, is_mappo=is_mappo, pareto_weight=pareto_w)

    agent1 = create_agent(algo_type, 0)

    if opponent_type == "Self":
        agent2 = create_agent(algo_type, 1)
    else:
        agent2 = BotAgent(opponent_type, n_acts)

    for ep in range(n_episodes):
        obs1, obs2 = env.reset()

        if algo_type == "LOLA" and opponent_type == "Self":
            # Exact-gradient LOLA: no environment rollout is needed.
            history['p1_probs'].append(agent1.get_probs().detach().cpu().numpy())
            history['p2_probs'].append(agent2.get_probs().detach().cpu().numpy())
            agent1.step_lola(agent2.theta, env.payoffs, agent2.lr, True)
            agent2.step_lola(agent1.theta, env.payoffs, agent1.lr, False)
            continue

        ep_r1, ep_r2 = 0, 0
        rollout1, rollout2 = [], []

        for _ in range(steps_per_ep):
            with torch.no_grad():
                if algo_type == "LOLA":
                    a1 = torch.multinomial(agent1.get_probs(), 1)
                    lp1 = None
                else:
                    a1, lp1, _ = agent1.get_action(obs1)

                # Bots return None in place of a log-probability.
                a2, lp2, _ = agent2.get_action(obs2)

            (next_o1, next_o2), r1, r2, done = env.step(a1.item(), a2.item())
            ep_r1 += r1.item()
            ep_r2 += r2.item()

            if algo_type == "Minimax-Q":
                # Tabular learners update after every step, own action first.
                agent1.update(a1.item(), a2.item(), r1)
                if opponent_type == "Self":
                    agent2.update(a2.item(), a1.item(), r2)

            elif algo_type == "CE-Q":
                agent1.update(a1.item(), a2.item(), r1, r2)
                if opponent_type == "Self":
                    agent2.update(a2.item(), a1.item(), r2, r1)

            elif algo_type in ["Nash-Q", "FoF-Q"]:
                agent1.update(a1.item(), a2.item(), r1, r2)
                if opponent_type == "Self":
                    agent2.update(a2.item(), a1.item(), r2, r1)

            elif algo_type != "LOLA":
                # PPO variants: buffer the transition for an end-of-episode update.
                c_obs1 = obs1 if not agent1.is_mappo else torch.cat([obs1, obs2])
                rollout1.append((obs1, c_obs1, a1, r1, r1+r2, lp1.detach()))
                if opponent_type == "Self":
                    c_obs2 = obs2 if not agent2.is_mappo else torch.cat([obs2, obs1])
                    rollout2.append((obs2, c_obs2, a2, r2, r1+r2, lp2.detach()))

            obs1, obs2 = next_o1, next_o2

        if algo_type not in ["LOLA", "Nash-Q", "FoF-Q", "Minimax-Q", "CE-Q"]:
            agent1.update(rollout1)
            if opponent_type == "Self":
                agent2.update(rollout2)

        if algo_type != "LOLA":
            with torch.no_grad():
                # Snapshot the policy at the initial observation (action 0).
                _, _, p1 = agent1.get_action(torch.zeros(1, device=device))
                history['p1_probs'].append(p1.cpu().numpy())
                if opponent_type == "Self":
                    _, _, p2 = agent2.get_action(torch.zeros(1, device=device))
                    history['p2_probs'].append(p2.cpu().numpy())
                else:
                    # Bots expose no policy; store a uniform placeholder sized
                    # to the game instead of a hard-coded two-action vector.
                    history['p2_probs'].append([1.0 / n_acts] * n_acts)

        w, f = calculate_metrics([ep_r1, ep_r2])
        history['social_welfare'].append(w)
        history['fairness'].append(f)

    return history

def train_spread_generic(algo_name, n_episodes=200, noise_level=0.0):
    """Train one algorithm on PettingZoo's simple_spread with three agents.

    Args:
        algo_name: "QMIX" for per-agent Q-networks with a `QMixer`; any other
            name is treated as a PPO variant ("MAPPO" gives each critic the
            concatenated observations, "Pareto-AC" mixes individual and team
            reward with weight 0.5).
        n_episodes: number of episodes to run.
        noise_level: standard deviation of Gaussian noise added to every
            observation (0 disables it).

    Returns:
        dict with per-episode lists under 'rewards' (summed team reward),
        'social_welfare' and 'fairness'.
    """
    env = simple_spread_v3.parallel_env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False)
    env.reset(seed=SEED)
    n_agents = 3
    obs_dim = 18
    act_dim = 5
    history = {'rewards': [], 'social_welfare': [], 'fairness': []}

    if algo_name == "QMIX":
        q_nets = [nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_dim)).to(device) for _ in range(n_agents)]
        mixer = QMixer(n_agents, state_dim=obs_dim*n_agents).to(device)
        optimizer = optim.Adam(list(mixer.parameters()) + [p for net in q_nets for p in net.parameters()], lr=0.001)
    else:
        is_mappo = (algo_name == "MAPPO")
        pareto_w = 0.5 if algo_name == "Pareto-AC" else 0.0
        critic_dim = obs_dim * n_agents if is_mappo else obs_dim
        agents = [PPO_Agent_Wrapper(obs_dim, critic_dim, act_dim, lr=0.001, is_mappo=is_mappo, pareto_weight=pareto_w) for _ in range(n_agents)]

    for ep in range(n_episodes):
        obs_dict, _ = env.reset()
        total_reward = 0
        agent_rewards = {a: 0 for a in env.agents}
        episode_data_qmix = []
        rollouts_ppo = [[] for _ in range(n_agents)]

        while env.agents:
            agent_ids = env.agents
            obs_tensors = []
            for a in agent_ids:
                raw = torch.tensor(obs_dict[a], dtype=torch.float32, device=device)
                if noise_level > 0:
                    raw += torch.randn_like(raw) * noise_level
                obs_tensors.append(raw)
            # The "global state" is simply all agents' observations concatenated.
            global_state = torch.cat(obs_tensors) if obs_tensors else torch.zeros(1, device=device)

            actions = {}
            q_vals = []
            log_probs = {}

            if algo_name == "QMIX":
                for i, agent in enumerate(agent_ids):
                    q = q_nets[i](obs_tensors[i])
                    q_vals.append(q)
                    # Epsilon decays linearly over the first 150 episodes to 0.05.
                    if random.random() < max(0.05, 1 - ep/150):
                        act = env.action_space(agent).sample()
                    else:
                        act = torch.argmax(q).item()
                    actions[agent] = act
                episode_data_qmix.append((global_state, torch.stack(q_vals), actions, list(agent_ids)))
            else:
                for i, agent in enumerate(agent_ids):
                    a, lp, _ = agents[i].ac.get_action(obs_tensors[i])
                    actions[agent] = a.item()
                    # Keep the log-prob of the action actually taken; PPO's
                    # probability ratio must refer to this exact action.
                    log_probs[agent] = lp.detach()

            next_obs, rewards, terminations, truncations, _ = env.step(actions)
            team_r = sum(rewards.values())
            total_reward += team_r
            for a in agent_ids:
                agent_rewards[a] += rewards[a]

            if algo_name != "QMIX":
                for i, agent in enumerate(agent_ids):
                    if agent in next_obs:
                        c_obs = global_state if agents[i].is_mappo else obs_tensors[i]
                        rollouts_ppo[i].append((obs_tensors[i], c_obs, torch.tensor(actions[agent], device=device),
                                                torch.tensor(rewards[agent], device=device), torch.tensor(team_r, device=device),
                                                log_probs[agent]))
            obs_dict = next_obs

        if algo_name == "QMIX":
            # Monte-Carlo target: every step regresses onto the episode's total reward.
            R = torch.tensor(total_reward, device=device)
            loss = 0
            for (state, qs, acts, step_agents) in episode_data_qmix:
                chosen_qs = [qs[i][acts[agent]] for i, agent in enumerate(step_agents)]
                if chosen_qs:
                    q_tot = mixer(torch.stack(chosen_qs).unsqueeze(0), state.unsqueeze(0))
                    loss += (q_tot - R)**2
            # `loss` stays the int 0 if no step contributed; nothing to optimise then.
            if torch.is_tensor(loss):
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        else:
            for i in range(n_agents):
                if rollouts_ppo[i]:
                    agents[i].update(rollouts_ppo[i])

        history['rewards'].append(total_reward)
        w, f = calculate_metrics(list(agent_rewards.values()))
        history['social_welfare'].append(w)
        history['fairness'].append(f)

    return history


In [7]:
results_store = {}

# Short result-key tag -> algorithm name understood by the training functions.
_ALGO_BY_TAG = {
    'minimax': 'Minimax-Q', 'ceq': 'CE-Q', 'psro': 'PSRO', 'nash': 'Nash-Q',
    'ippo': 'IPPO', 'mappo': 'MAPPO', 'fof': 'FoF-Q', 'lola': 'LOLA',
    'pac': 'Pareto-AC', 'qmix': 'QMIX',
}


def _run_matrix_suite(prefix, payoff, tags, n_episodes, **extra):
    """Train each tagged algorithm in order; store it under '<prefix>_<tag>'.

    PSRO is always called with its defaults and LOLA always gets lr=0.1.
    """
    for tag in tags:
        algo = _ALGO_BY_TAG[tag]
        if algo == 'PSRO':
            kwargs = {}
        else:
            kwargs = dict(n_episodes=n_episodes, **extra)
            if algo == 'LOLA':
                kwargs['lr'] = 0.1
        results_store[f'{prefix}_{tag}'] = train_matrix_game_generic(algo, payoff, **kwargs)


print("--- EXP A: Zero-Sum Games ---")

print("Running MP: Minimax-Q, CE-Q, PSRO, Nash-Q, IPPO, FoF-Q, LOLA...")
_run_matrix_suite('mp', MP_PAYOFF,
                  ['minimax', 'ceq', 'psro', 'nash', 'ippo', 'fof', 'lola', 'mappo'],
                  n_episodes=150)

print("Running RPS: Minimax-Q, CE-Q, PSRO, Nash-Q, IPPO, MAPPO, FoF-Q, LOLA...")
_run_matrix_suite('rps', RPS_PAYOFF,
                  ['minimax', 'ceq', 'psro', 'nash', 'ippo', 'mappo', 'fof', 'lola'],
                  n_episodes=200)


print("\n--- EXP B: Mixed-Motive Games ---")

print("Running IPD: CE-Q, Minimax-Q, PSRO, Nash-Q, FoF-Q, IPPO, MAPPO, LOLA, Pareto-AC...")
_run_matrix_suite('ipd', IPD_PAYOFF,
                  ['ceq', 'minimax', 'psro', 'nash', 'fof', 'ippo', 'mappo', 'lola', 'pac'],
                  n_episodes=200)

# Learners against scripted opponents.
bot_list = ['TitForTat', 'Random', 'Bully']
for bot in bot_list:
    print(f"Running IPD vs {bot}...")
    _run_matrix_suite(f'ipd_vs_{bot}', IPD_PAYOFF,
                      ['ippo', 'nash', 'minimax', 'ceq', 'fof'],
                      n_episodes=150, opponent_type=bot)

print("Running BoS: CE-Q, Minimax-Q, PSRO, Nash-Q, IPPO, FoF-Q...")
_run_matrix_suite('bos', BOS_PAYOFF,
                  ['ceq', 'minimax', 'psro', 'nash', 'ippo', 'fof'],
                  n_episodes=200)

print("Running IPD (One-Shot): IPPO, Nash-Q, Minimax-Q, CE-Q, FoF-Q, LOLA...")

_run_matrix_suite('pd_oneshot', IPD_PAYOFF,
                  ['ippo', 'nash', 'minimax', 'ceq', 'fof', 'lola'],
                  n_episodes=200, is_one_shot=True)


print("\n--- EXP C: Simple Spread (Cooperative) ---")

_SPREAD_TAGS = ['ippo', 'mappo', 'qmix', 'pac']

print("Running Spread: IPPO, MAPPO, QMIX, Pareto-AC...")
for _tag in _SPREAD_TAGS:
    results_store[f'spread_{_tag}'] = train_spread_generic(_ALGO_BY_TAG[_tag], n_episodes=150)


print("Running Spread (Noisy - Robustness): MAPPO, QMIX...")
for _tag in _SPREAD_TAGS:
    results_store[f'spread_{_tag}_noisy'] = train_spread_generic(_ALGO_BY_TAG[_tag], n_episodes=150, noise_level=0.5)


--- EXP A: Zero-Sum Games ---
Running MP: Minimax-Q, CE-Q, PSRO, Nash-Q, IPPO, FoF-Q, LOLA...
Running RPS: Minimax-Q, CE-Q, PSRO, Nash-Q, IPPO, MAPPO, FoF-Q, LOLA...

--- EXP B: Mixed-Motive Games ---
Running IPD: CE-Q, Minimax-Q, PSRO, Nash-Q, FoF-Q, IPPO, MAPPO, LOLA, Pareto-AC...
Running IPD vs TitForTat...
Running IPD vs Random...
Running IPD vs Bully...
Running BoS: CE-Q, Minimax-Q, PSRO, Nash-Q, IPPO, FoF-Q...
Running IPD (One-Shot): IPPO, Nash-Q, Minimax-Q, CE-Q, FoF-Q, LOLA...

--- EXP C: Simple Spread (Cooperative) ---
Running Spread: IPPO, MAPPO, QMIX, Pareto-AC...
Running Spread (Noisy - Robustness): MAPPO, QMIX...


In [8]:
# Persist every experiment's history to Google Drive as a single .npz archive.
drive.mount('/content/drive')
save_path = '/content/drive/My Drive/MARL_Final_Project_Full_Revised'
os.makedirs(save_path, exist_ok=True)

_results_file = f'{save_path}/complete_results.npz'
np.savez(_results_file, **results_store)
print(f"Results saved to {_results_file}")

Mounted at /content/drive
Results saved to /content/drive/My Drive/MARL_Final_Project_Full_Revised/complete_results.npz


In [9]:
import numpy as np
import pandas as pd
import os
from google.colab import drive

# Reload the saved experiment results from Google Drive into results_store.
drive.mount('/content/drive')
load_path = '/content/drive/My Drive/MARL_Final_Project_Full_Revised/complete_results.npz'
save_path = '/content/drive/My Drive/MARL_Final_Project_Full_Revised/MARL_Metrics_Analysis.xlsx'

print(f"Loading results from: {load_path}")
try:
    # NOTE(review): allow_pickle=True unpickles the archive contents, which can
    # execute arbitrary code. Only load archives produced by this notebook.
    # The context manager closes the underlying file handle once every entry
    # has been materialised; entries are read lazily, so convert them inside it.
    with np.load(load_path, allow_pickle=True) as data:
        # .item() unwraps each stored 0-d array back into the Python object it holds
        # (presumably the per-experiment result dict -- verify against the training cell).
        results_store = {key: data[key].item() for key in data}
    print("Data successfully loaded and converted to dictionary format.")
except FileNotFoundError:
    print("ERROR: File not found! Please run the training code first.")
    results_store = {}

def _algorithm_label(key):
    """Build the 'Algorithm' column label for a results key.

    Keys containing 'vs' are returned unchanged. Otherwise the label is the
    upper-cased algorithm token of the key; a trailing 'noisy' token is skipped
    so that 'spread_ippo_noisy' becomes 'IPPO (Noisy)' instead of 'NOISY (Noisy)'.
    """
    if 'vs' in key:
        return key

    tokens = key.split('_')
    is_noisy = 'noisy' in key
    # Drop the '_noisy' suffix so the last token is the algorithm name again.
    if is_noisy and len(tokens) > 1 and tokens[-1] == 'noisy':
        tokens = tokens[:-1]

    label = tokens[-1].upper()
    if is_noisy:
        label += " (Noisy)"
    return label


def calculate_metrics(results, window=20):
    """Summarise the tail of each experiment's training history, grouped by game.

    Args:
        results: Mapping from experiment key (e.g. 'mp_ippo') to a dict-like
            object that may provide the sequences 'p1_probs', 'social_welfare',
            'fairness' and 'rewards'. Missing entries are treated as empty.
        window: Number of trailing entries of each sequence used for the
            statistics.

    Returns:
        dict mapping group name to a pandas DataFrame with one row per
        experiment key found in ``results``. Groups with no matching key are
        omitted. Each row has 'Algorithm', 'Avg Final Welfare' and
        'Avg Final Fairness'; 'Convergence Score (Lower is Better)' and
        'Dominant Action Prob' are set only when 'p1_probs' or 'rewards' is
        non-empty.
    """
    game_groups = {
        'Matching Pennies': ['mp_minimax', 'mp_ceq', 'mp_psro', 'mp_nash', 'mp_ippo', 'mp_fof', 'mp_lola', 'mp_mappo'],
        'RPS': ['rps_minimax', 'rps_ceq', 'rps_psro', 'rps_nash', 'rps_ippo', 'rps_mappo', 'rps_fof', 'rps_lola'],
        'Iterated PD': ['ipd_ceq', 'ipd_minimax', 'ipd_psro', 'ipd_nash', 'ipd_fof', 'ipd_ippo', 'ipd_mappo', 'ipd_lola', 'ipd_pac'],
        # Bot match-ups are discovered from the data rather than listed by hand.
        'IPD vs Bots': [k for k in results.keys() if 'ipd_vs_' in k],
        'BoS': ['bos_ceq', 'bos_minimax', 'bos_psro', 'bos_nash', 'bos_ippo', 'bos_fof'],
        'Spread (Coop)': ['spread_ippo', 'spread_mappo', 'spread_qmix', 'spread_pac',
                          'spread_ippo_noisy', 'spread_mappo_noisy', 'spread_qmix_noisy', 'spread_pac_noisy'],
        'One-Shot PD': ['pd_oneshot_ippo', 'pd_oneshot_nash', 'pd_oneshot_minimax', 'pd_oneshot_ceq', 'pd_oneshot_fof', 'pd_oneshot_lola']
    }

    final_dfs = {}

    for group_name, keys in game_groups.items():
        rows = []
        for key in keys:
            if key not in results:
                continue

            item = results[key]

            p1_probs = np.array(item.get('p1_probs', []))
            welfare = np.array(item.get('social_welfare', []))
            fairness = np.array(item.get('fairness', []))
            rewards = np.array(item.get('rewards', []))

            stats = {'Algorithm': _algorithm_label(key)}

            if len(p1_probs) > 0:
                if len(p1_probs) == 1:
                    # A single entry has no spread to measure.
                    stability = 0.0
                    final_prob_dist = p1_probs[0]
                else:
                    # Std / mean are taken along axis 0 of the last `window` entries
                    # (assumes axis 0 indexes training time -- TODO confirm).
                    last_chunk = p1_probs[-window:]
                    stability = np.mean(np.std(last_chunk, axis=0))
                    final_prob_dist = np.mean(last_chunk, axis=0)

                stats['Convergence Score (Lower is Better)'] = round(stability, 5)
                stats['Dominant Action Prob'] = round(np.max(final_prob_dist), 4)

            elif len(rewards) > 0:
                # Without policy probabilities, fall back to the spread of recent rewards.
                last_rewards = rewards[-window:]
                stats['Convergence Score (Lower is Better)'] = round(np.std(last_rewards), 4)
                stats['Dominant Action Prob'] = "N/A"

            if len(welfare) > 0:
                stats['Avg Final Welfare'] = round(np.mean(welfare[-window:]), 4)
            elif len(rewards) > 0:
                # Rewards stand in for welfare when no welfare series was recorded.
                stats['Avg Final Welfare'] = round(np.mean(rewards[-window:]), 4)
            else:
                stats['Avg Final Welfare'] = 0.0

            if len(fairness) > 0:
                stats['Avg Final Fairness'] = round(np.mean(fairness[-window:]), 4)
            else:
                stats['Avg Final Fairness'] = "N/A"

            rows.append(stats)

        if rows:
            final_dfs[group_name] = pd.DataFrame(rows)

    return final_dfs

# Compute the per-game summary tables and write them to one Excel workbook,
# one sheet per game group.
print("Calculating analysis...")
if not results_store:
    print("No data to process.")
else:
    dfs = calculate_metrics(results_store)

    with pd.ExcelWriter(save_path, engine='openpyxl') as writer:
        for sheet_name, sheet_df in dfs.items():
            sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Widen each column to fit its longest cell or its header, plus padding.
            worksheet = writer.sheets[sheet_name]
            for col_idx, column in enumerate(sheet_df.columns):
                longest_cell = sheet_df[column].astype(str).map(len).max()
                column_width = max(longest_cell, len(column))
                # chr(65 + i) maps 0 -> 'A', 1 -> 'B', ...; valid for the first 26 columns only.
                column_letter = chr(65 + col_idx)
                worksheet.column_dimensions[column_letter].width = column_width + 2

    print(f"\nSUCCESS: All analyses saved to Excel file:\n{save_path}")
    print("\nExcel File Content (Sheet Names):")
    for sheet in dfs:
        print(f"- {sheet}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading results from: /content/drive/My Drive/MARL_Final_Project_Full_Revised/complete_results.npz
Data successfully loaded and converted to dictionary format.
Calculating analysis...

SUCCESS: All analyses saved to Excel file:
/content/drive/My Drive/MARL_Final_Project_Full_Revised/MARL_Metrics_Analysis.xlsx

Excel File Content (Sheet Names):
- Matching Pennies
- RPS
- Iterated PD
- IPD vs Bots
- BoS
- Spread (Coop)
- One-Shot PD
