In [1]:
%matplotlib inline
import numpy as np
import torch
import matplotlib.pyplot as plt

## variables in program:


1. empirical observed joint action probabilities. equiv. to computed joint action probabilities. should be of size (p1_action x p2_action x p3_action ...)

2. expected regret features per deviation function -- can be computed once for each joint action and deviation function and then averaged over. expectation under either empirical strategy or dual strategy.


3. dual variables -- one utility vector per deviation.

In [2]:
# let's try RPS, computing with external regret deviations only
K = 2 # dim of util features (going to be 0/1)
N = 2 # num players
actions_per_player = 3
num_external_deviations = N * actions_per_player


In [6]:
def rps_feats(action_tuple):
    # 0 is rock, 1 is paper, 2 is scissors
    p1, p2 = action_tuple
    # feat_vecs has shape N, K
    if p1 == 0:
        if p2 == 0:
            return torch.tensor([[0.0,0.0],[0.0,0.0]])
        if p2 == 1:
            return torch.tensor([[0.0,1.0],[0.0,1.0]])
        if p2 == 2:
            return torch.tensor([[1.0,0.0],[1.0,0.0]])
    elif p1 == 1:
        if p2 == 0:
            return torch.tensor([[1.0,0.0],[1.0,0.0]])
        if p2 == 1:
            return torch.tensor([[0.0,0.0],[0.0,0.0]])
        if p2 == 2:
            return torch.tensor([[0.0,1.0],[0.0,1.0]])
    elif p1 == 2:
        if p2 == 0:
            return torch.tensor([[0.0,1.0],[0.0,1.0]])
        if p2 == 1:
            return torch.tensor([[1.0,0.0],[1.0,0.0]])
        if p2 == 2:
            return torch.tensor([[0.0,0.0],[0.0,0.0]])

In [14]:
def compute_external_regrets_for_action( action_tens ):
    # these are the instantaneous regrets for all the specific deviations
    ext_regrets = torch.zeros(N, actions_per_player, K)
    # ext_regrets[i, j] is player i, deviation f_i = ->j
    for player in range(N):
        for action in range(actions_per_player):
            
            deviation_applied = torch.clone(action_tens)
            deviation_applied[player] = action
            
            ext_regrets[player, action] = rps_feats(deviation_applied)[player] - rps_feats(action_tens)[player]
    return ext_regrets
            

    

In [33]:
ext_regrets = compute_external_regrets_for_action(torch.tensor([0,0]))

In [277]:
compute_external_regrets_for_action(torch.tensor([0,1]))

tensor([[[ 0.,  0.],
         [ 0., -1.],
         [ 1., -1.]],

        [[ 0., -1.],
         [ 0.,  0.],
         [ 1., -1.]]])

In [21]:
def compute_expected_external_regret_feats( action_dist ):
    total_regret_feats = torch.zeros(N, actions_per_player, K)
    n = 0
    for p1_act in range(action_dist.shape[0]):
        for p2_act in range(action_dist.shape[1]):
            n += 1
            total_regret_feats += action_dist[p1_act, p2_act] * compute_external_regrets_for_action(torch.tensor([p1_act, p2_act]))
    return total_regret_feats / n

In [30]:
nash_eq_rps = torch.tensor([1/3 + 0.001,1/3,1/3 - 0.001]).view(-1,1) @ torch.tensor([1/3,1/3 - 0.01,1/3 + 0.01]).view(1,-1)

In [252]:
expected_regret_feats = compute_expected_external_regret_feats(nash_eq_rps)

In [38]:
def predicted_strategy(regret_feats, theta):
    # both regret_feats and theta are N x actions_per_player x K
    # output should be of size p1_actions x p2_actions
    unnormalized_dist = torch.zeros(actions_per_player, actions_per_player)
    # dot product of each regret feat with each theta
    for p1_action in range(actions_per_player):
        for p2_action in range(actions_per_player):
            joint_action = torch.tensor([p1_action, p2_action])
            action_regret_feats = compute_external_regrets_for_action(joint_action)
            action_regret_scalars = torch.sum(action_regret_feats * theta, dim=2)
            unnormalized_dist[p1_action, p2_action] = torch.exp(-torch.sum(action_regret_scalars))
    Z = torch.sum(unnormalized_dist)
    return unnormalized_dist / Z
    

In [43]:
predicted_strategy(expected_regret_feats, torch.ones_like(expected_regret_feats))

tensor([[0.0004, 0.1665, 0.1665],
        [0.1665, 0.0004, 0.1665],
        [0.1665, 0.1665, 0.0004]])

In [58]:
def maxent_ice_gradient(prediction, empirical, theta):
    gradients = torch.zeros(N, actions_per_player, K)
    predicted_external_regret_feats = compute_expected_external_regret_feats(prediction)
    expected_external_regret_feats = compute_expected_external_regret_feats(empirical)
    
    for player in range(N):
        for action in range(actions_per_player):
            # external deviation player, ->action
            this_deviation_theta = theta[player, action].view(1,1,-1) # unsqueeze to broadcast
            this_deviation_scalar_regrets = torch.sum(expected_external_regrets * this_deviation_theta, dim=2)
            f_star = torch.argmax()
    raise NotImplementedError

In [52]:
scalars = torch.sum(torch.rand(N, actions_per_player, K) * torch.rand(N, actions_per_player, K), dim=2)

In [57]:
torch.max(scalars, 1)

torch.return_types.max(
values=tensor([0.7183, 0.8648]),
indices=tensor([2, 1]))

In [54]:
scalars

tensor([[0.3877, 0.3270, 0.7183],
        [0.4011, 0.8648, 0.6257]])

In [None]:
## what about just explicitly computing the objective, and letting torch handle the gradients?

In [271]:
import ipdb

In [67]:
def maxent_dual_objective(observed, theta):
    
    bigZ = torch.tensor(0.0)
    
    # for each joint action in A
    for p1_action in range(actions_per_player):
        for p2_action in range(actions_per_player):
            
            little_r_a_feats = compute_external_regrets_for_action(torch.tensor([p1_action, p2_action]))
            # scalar features for all deviations f with their own theta_fs
            little_r_a_scalar = torch.sum(little_r_a_feats * theta, dim=2)
            # sum up, exp, add to Z
            bigZ += torch.exp( - torch.sum(little_r_a_scalar))
    obj = torch.log(bigZ)
    # computing expected big regret for theta_f is max over phi_f of r_f(predicted | theta_f)
    # phi_f here is just the whole phi
    expected_er_feats = compute_expected_external_regret_feats(observed)
    
    # for each deviation
    for player in range(N):
        for dev_action in range(actions_per_player):
            this_deviation_theta = theta[player, dev_action].view(1,1,-1) # unsqueeze to broadcast
            little_scalar_regrets = torch.sum(expected_er_feats * this_deviation_theta, dim=2)
            # little_scalar_regrets contains the regret for theta_f for all the different fs
            big_Regret = torch.max(little_scalar_regrets)
            obj += big_Regret
    return obj

In [68]:
maxent_dual_objective(nash_eq_rps, expected_regret_feats)

tensor(2.1972)

In [236]:
test_theta = torch.rand(expected_regret_feats.shape).requires_grad_(True)

In [237]:
test_theta

tensor([[[0.9399, 0.3180],
         [0.3251, 0.0824],
         [0.3891, 0.1169]],

        [[0.5737, 0.3316],
         [0.9650, 0.1151],
         [0.2646, 0.4102]]], requires_grad=True)

In [238]:
from torch import optim

In [274]:
def optimize(empirical_dist, theta, epochs=100):
    optimizer = optim.Adam([theta], lr=0.1)
    for i in range(epochs):
        optimizer.zero_grad()
        loss = maxent_dual_objective(empirical_dist, theta)
        loss.backward()
        optimizer.step()
    

In [275]:
theta = torch.rand(expected_regret_feats.shape).requires_grad_(True)
optimize(nash_eq_rps, theta)
print(theta)

Exception: 

In [255]:
predicted_strategy(expected_regret_feats, theta.detach())

tensor([[0.1113, 0.1116, 0.1110],
        [0.1110, 0.1108, 0.1103],
        [0.1116, 0.1117, 0.1108]])

In [256]:
exact_nash_eq = torch.tensor([1/3,1/3,1/3]).view(-1,1) @ torch.tensor([1/3,1/3,1/3]).view(1,-1)

In [257]:
expected_regret_feats_exact = compute_expected_external_regret_feats(exact_nash_eq)

In [262]:
theta = torch.rand(expected_regret_feats_exact.shape).requires_grad_(True)


In [263]:
predicted_strategy(expected_regret_feats_exact, theta.detach())

tensor([[0.0161, 0.2234, 0.0312],
        [0.0438, 0.0064, 0.1158],
        [0.4999, 0.0534, 0.0101]])

In [264]:
optimize(exact_nash_eq, theta)
print(theta)

tensor([[[-0.2228,  0.2000],
         [ 0.2579,  0.2672],
         [-0.1545, -0.2178]],

        [[-0.2853, -0.2037],
         [ 0.1581, -0.2872],
         [ 0.2470,  0.2444]]], requires_grad=True)


In [265]:
predicted_strategy(expected_regret_feats_exact, theta.detach())

tensor([[0.1111, 0.1109, 0.1107],
        [0.1112, 0.1106, 0.1111],
        [0.1121, 0.1111, 0.1113]])