In [1]:
import numpy as np
import torch
from torch import nn
import random
import matplotlib.pyplot as plt
from scipy.optimize import dual_annealing
from utils import GradientBandit

In [2]:
class ConstantReward:
    def __init__(self,arms):
        reward = np.random.rand(arms)
        reward /= np.sum(reward)
        self.reward = reward

    def get_reward(self,context):
        return self.reward

In [3]:
class StrategicAgents:
    def __init__(self, private_types, dim, delta_radius, reward):
        self.private_types = private_types
        self.delta_radius = delta_radius
        self.t = 0
        self.reward = reward
        self.dim = dim

    def generate_context(self, policy):
        priv = self.private_types[self.t]
        reward = self.reward.get_reward(priv)
        bounds = [(-self.delta_radius, self.delta_radius)] * (self.dim)
        def objective(delta):
            if np.linalg.norm(delta) > self.delta_radius:
                return np.inf
            x_prime = np.append((priv+delta),[1])
            return -reward[np.argmax(policy@x_prime)]

        opt_delt = dual_annealing(objective, bounds)
        x_prime = priv + opt_delt.x
        self.t += 1
        return x_prime, reward[np.argmax(policy@np.append(x_prime,[1]))]


In [4]:
class NonStrategicAgents:
    def __init__(self, private_types, dim, delta_radius, reward):
        self.private_types = private_types
        self.delta_radius = delta_radius
        self.t = 0
        self.reward = reward
        self.dim = dim

    def generate_context(self, policy):
        priv = self.private_types[self.t],[1]
        reward = self.reward.get_reward(priv)

        self.t += 1
        return priv, reward[np.argmax(policy@np.append(priv,[1]))]


In [134]:
T = 100
ARMS = 4
CONTEXT_DIM = 5
delta_radius = 0.5

private_types = np.random.rand(T,CONTEXT_DIM)
rewards = ConstantReward(ARMS)
strat_agents = StrategicAgents(private_types,CONTEXT_DIM,delta_radius,rewards)
nostrat_agents = NonStrategicAgents(private_types,CONTEXT_DIM,delta_radius,rewards)

In [135]:
policy = np.random.rand(CONTEXT_DIM,ARMS) - 0.5
print("policy: {} ".format(policy))
print("reward: {} ".format(rewards.get_reward(0)))

policy: [[ 0.31827152 -0.06689342  0.39410267  0.40178453]
 [-0.15655195 -0.23157348 -0.12652979 -0.29663049]
 [ 0.10962736 -0.48593396  0.10674391  0.21765811]
 [ 0.11098109 -0.25367018 -0.37081219 -0.33286856]
 [-0.28542119  0.35176982 -0.43220067  0.04631573]] 
reward: [0.2942194  0.02555277 0.36679221 0.31343562] 


Sanity check to make sure the strategic agents are performing better than non-strategic agents

In [136]:
total_strat_reward = 0
total_nostrat_reward = 0
for i in range(T):
    xp, strat_reward = strat_agents.generate_context(policy)
    x, nostrat_reward = nostrat_agents.generate_context(policy)
    total_strat_reward += strat_reward
    total_nostrat_reward += nostrat_reward

print("strategic reward: {}, non-strategic reward: {}".format(total_strat_reward,total_nostrat_reward))

  df = fun(x) - f0


strategic reward: 32.27858860198842, non-strategic reward: 29.226927653796203


In [5]:
class ModelReward:
    def __init__(self,arms,dim,noise=0.1):
        self.theta = torch.rand(arms,dim)
        self.noise_scale = noise

    def get_reward(self,action,context):
        true_reward = action @ self.theta @ torch.t(context)
        return true_reward + (torch.rand(1)-0.5) * 2*self.noise_scale

In [6]:
class StrategyAwareGradientModel:
    def __init__(self,T,est_reward,lr,dim,delta_radius,arms):
        self.delta_radius = delta_radius
        self.t = 0
        self.T = T
        self.est_agent_reward = est_reward
        self.dim = dim
        self.model = GradientBandit(arms,dim)
        self.reward_function = ModelReward(arms,dim)
        self.opt = torch.optim.SGD(self.model.parameters(), lr=lr, momentum=0.9)
        self.criterion = torch.nn.L1Loss()

    def get_policy(self):
        return self.model.get_hyperplanes()

    def observe_reward(self, context):
        self.opt.zero_grad()
        context = torch.tensor(context).reshape(1,-1).float()
        est_agent_rew = torch.tensor(self.est_agent_reward.get_reward(context)).float()
        y_hat = self.model.forward(context,est_agent_rew)
        print(y_hat)
        reward = self.reward_function.get_reward(y_hat,context)
        #loss = self.criterion(reward,torch.zeros(1))
        reward.backward()
        self.opt.step()
        return reward

In [7]:
torch.autograd.set_detect_anomaly(True)
T = 1
ARMS = 4
CONTEXT_DIM = 5
DELTA = 0.5
lr = 0.5

private_types = np.random.rand(T,CONTEXT_DIM)
agent_rewards = ConstantReward(ARMS)
strat_agents = StrategicAgents(private_types,CONTEXT_DIM,DELTA,agent_rewards)
nostrat_agents = NonStrategicAgents(private_types,CONTEXT_DIM,DELTA,agent_rewards)
model = StrategyAwareGradientModel(T,agent_rewards,lr,CONTEXT_DIM,DELTA,ARMS)
total_strat_reward = 0
total_nostrat_reward = 0
model_rewards = []
for i in range(T):
    policy = model.get_policy().detach().numpy() #.T[:-1]
    print(policy.shape)
    xp, strat_reward = strat_agents.generate_context(policy)
    total_strat_reward += strat_reward
    # x, nostrat_reward = nostrat_agents.generate_context(policy)
    # total_nostrat_reward += nostrat_reward
    r = model.observe_reward(xp)
    model_rewards.append(r)

(4, 6)


  df = fun(x) - f0


tensor([[0.2012, 0.2915, 0.1910, 0.3163]], grad_fn=<SoftmaxBackward0>)


  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
    self.io_loop.start()
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\asyncio\base_events.py", line 603, in run_forever
    self._run_once()
  File "c:\Users\DeanA\anaconda3\envs\pytorch\lib\asyncio\base_

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 6]], which is output 0 of SubBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!