In [26]:
import numpy as np
from modules.utility import generate_player_beliefs

# Obtain one belief mapping per player from the project helper; this cell
# only prints them.
P1_beliefs, P2_beliefs = generate_player_beliefs()

# Payoff constants. The trailing digit is the player index; both players are
# given the same value for each payoff.
# NOTE(review): the p/s names look swapped relative to the usual prisoner's
# dilemma convention. In rewards_matrix below, p (0) is what the other player
# receives in the rows where one side collects the temptation payoff, and
# s (1) fills the final row -- confirm before relying on the names.
r1 = r2 = 3  # reward for mutual cooperation
t1 = t2 = 5  # temptation payoff
p1 = p2 = 0  # appears opposite the temptation payoff in rewards_matrix
s1 = s2 = 1  # appears in the last row of rewards_matrix

# NOTE(review): gamma is a tuple of five values, not a single discount rate;
# it is not used anywhere in this cell.
gamma = (0.9, 0.6, 0.3, 0.2, 0.1)

# Payoff table: one row per outcome, columns are (player 1, player 2).
# Rows presumably follow the 'CC', 'CB', 'BC', 'BB' key order of the belief
# dicts -- TODO confirm.
rewards_matrix = np.array([
    (r1, r2),  # both players receive the cooperation reward
    (t1, p2),  # player 1 receives t, player 2 receives p
    (p1, t2),  # player 1 receives p, player 2 receives t
    (s1, s2),  # both players receive s
])

print("Player 1 Beliefs:", P1_beliefs)
print("Player 2 Beliefs:", P2_beliefs)

Player 1 Beliefs: {'CC': 0.21, 'CB': 0.68, 'BC': 0.83, 'BB': 0.67}
Player 2 Beliefs: {'CC': 0.09, 'CB': 0.19, 'BC': 0.12, 'BB': 0.5}


In [27]:
from modules.Player import Player
from modules.PlayerEnv import PlayerEnv
from torch import optim

# Build one Player per side from its belief mapping and its own four payoffs.
# The two payoff lists differ only in that the middle entries (t and p) are
# swapped for player 2.
pl1 = Player(P1_beliefs, [r1, t1, p1, s1])
pl2 = Player(P2_beliefs, [r2, p2, t2, s2])

# Each player gets its own plain-SGD optimizer.
# Relies on Player exposing a parameters() method -- TODO confirm against
# modules/Player.py.
optimizer_p1 = optim.SGD(pl1.parameters(), lr=0.01)
optimizer_p2 = optim.SGD(pl2.parameters(), lr=0.01)

# NOTE(review): the meaning of the third PlayerEnv argument (0.2) is not
# visible from this notebook.
env = PlayerEnv(pl1, pl2, 0.2)

# Smoke test: show what env.play() returns before any training step.
print(env.play())
print("done testing")

# Alternating gradient ascent: in every epoch each player, in turn, plays a
# fresh round and steps its own optimizer on the negated sum of its column.
num_epochs = 100
for epoch in range(num_epochs):

    # Player 1's turn -- column 0 is treated as player 1's reward.
    reward = env.play()
    optimizer_p1.zero_grad()
    rp1 = reward[:, 0]
    loss_p1 = rp1.sum().neg()  # minimising the negative maximises the reward
    loss_p1.backward()
    optimizer_p1.step()

    # Player 2's turn -- replayed so the round reflects player 1's update;
    # column 1 is treated as player 2's reward.
    reward = env.play()
    optimizer_p2.zero_grad()
    rp2 = reward[:, 1]
    loss_p2 = rp2.sum().neg()
    loss_p2.backward()
    optimizer_p2.step()

    # Progress logging is disabled; rp1.sum().item() and rp2.sum().item()
    # hold the per-player totals if it is ever needed.

# Show the play result after training for comparison with the smoke test.
print(env.play())

[3, 5, 0, 1]
[3, 0, 5, 1]
tensor([[3.6560, 3.5983],
        [5.6826, 0.5472],
        [0.6826, 5.5472],
        [1.6560, 1.5983]], grad_fn=<MmBackward0>)
done testing
tensor([[3.6820, 3.6378],
        [5.7049, 0.5950],
        [0.7049, 5.5950],
        [1.6820, 1.6378]], grad_fn=<MmBackward0>)


In [28]:
from CGDs import ACGD
# Single step size shared by both sides of the competitive optimizer.
lr = 1e-4

# New Player objects and a new environment are created for this experiment,
# rebinding the pl1 / pl2 / env names used by the SGD cell.
pl1 = Player(P1_beliefs, [r1, t1, p1, s1])
pl2 = Player(P2_beliefs, [r2, p2, t2, s2])
env = PlayerEnv(pl1, pl2, 0.2)

# ACGD (adaptive CGD) updates both parameter sets from one scalar objective:
# max_params try to maximise it while min_params try to minimise it.
optimizer = ACGD(
    max_params=pl1.parameters(),
    min_params=pl2.parameters(),
    lr_max=lr,
    lr_min=lr,
)

for epoch in range(100):
    # NOTE(review): despite its name, `reward` is the NEGATED sum over every
    # entry of env.play() (both players' columns). Player 1 therefore
    # maximises the negative total and player 2 minimises it -- confirm this
    # is the intended objective.
    reward = env.play().sum().neg()
    optimizer.zero_grad()
    optimizer.step(loss=reward)

# Show the play result after the ACGD updates.
print(env.play())

[3, 5, 0, 1]
[3, 0, 5, 1]
tensor([[3.6531, 3.6011],
        [5.6797, 0.5501],
        [0.6797, 5.5501],
        [1.6531, 1.6011]], grad_fn=<MmBackward0>)
