In [115]:
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.optimize import dual_annealing

In [107]:
class ConstantReward:
    def __init__(self,arms):
        reward = np.random.rand(arms)
        reward /= np.sum(reward)
        self.reward = reward

    def get_reward(self,context):
        return self.reward

In [127]:
class StrategicAgents:
    def __init__(self, private_types, dim, delta_radius, reward):
        self.private_types = private_types
        self.delta_radius = delta_radius
        self.t = 0
        self.reward = reward
        self.dim = dim

    def generate_context(self, policy):
        priv = self.private_types[self.t]
        reward = self.reward.get_reward(priv)
        bounds = [(-self.delta_radius, self.delta_radius)] * (self.dim)
        def objective(delta):
            if np.linalg.norm(delta) > self.delta_radius:
                return np.inf
            x_prime = (priv+delta)
            return -reward[np.argmax(x_prime@policy)]

        # norm_constraint = NonlinearConstraint(lambda x : np.linalg.norm(x), 0, self.delta_radius)
        opt_delt = dual_annealing(objective, bounds)
        x_prime = priv + opt_delt.x
        self.t += 1
        return x_prime, reward[np.argmax(x_prime@policy)]


In [86]:
class NonStrategicAgents:
    def __init__(self, private_types, dim, delta_radius, reward):
        self.private_types = private_types
        self.delta_radius = delta_radius
        self.t = 0
        self.reward = reward
        self.dim = dim

    def generate_context(self, policy):
        priv = self.private_types[self.t]
        reward = self.reward.get_reward(priv)

        self.t += 1
        return priv, reward[np.argmax(priv@policy)]


In [134]:
T = 100
ARMS = 4
CONTEXT_DIM = 5
delta_radius = 0.5

private_types = np.random.rand(T,CONTEXT_DIM)
rewards = ConstantReward(ARMS)
strat_agents = StrategicAgents(private_types,CONTEXT_DIM,delta_radius,rewards)
nostrat_agents = NonStrategicAgents(private_types,CONTEXT_DIM,delta_radius,rewards)

In [135]:
policy = np.random.rand(CONTEXT_DIM,ARMS) - 0.5
print("policy: {} ".format(policy))
print("reward: {} ".format(rewards.get_reward(0)))

policy: [[ 0.31827152 -0.06689342  0.39410267  0.40178453]
 [-0.15655195 -0.23157348 -0.12652979 -0.29663049]
 [ 0.10962736 -0.48593396  0.10674391  0.21765811]
 [ 0.11098109 -0.25367018 -0.37081219 -0.33286856]
 [-0.28542119  0.35176982 -0.43220067  0.04631573]] 
reward: [0.2942194  0.02555277 0.36679221 0.31343562] 


Sanity check to make sure the strategic agents are performing better than non-strategic agents

In [136]:
total_strat_reward = 0
total_nostrat_reward = 0
for i in range(T):
    xp, strat_reward = strat_agents.generate_context(policy)
    x, nostrat_reward = nostrat_agents.generate_context(policy)
    total_strat_reward += strat_reward
    total_nostrat_reward += nostrat_reward

print("strategic reward: {}, non-strategic reward: {}".format(total_strat_reward,total_nostrat_reward))

  df = fun(x) - f0


strategic reward: 32.27858860198842, non-strategic reward: 29.226927653796203


In [None]:
class ModelReward:
    def __init__(self,arms,dim,noise):
        self.theta = np.random.rand(arms,dim)

    def get_reward(self,action,context):
        true_reward = self.theta[action]
        return true_reward + random.uniform(-self.noise,self.noise)

In [None]:
class StrategyAwareModel:
    