/
algorithms.py
147 lines (122 loc) · 4.38 KB
/
algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import numpy as np
from scipy.stats import norm,bernoulli
class ActionCenteredThompsonSampling(object):
"""
Implementation of Action Centered Thompson Sampling
"""
def __init__(self,d,sig,prior_v=1.):
# dimension of Z_t
self.d = d
# prior
self.sig = sig
self.vec = np.zeros(self.d)
self.V = np.identity(self.d)*prior_v
self.V_inv = np.linalg.inv(self.V)
self.delta = np.matmul(self.V_inv,self.vec)
def update(self,a,pi,r,z):
self.V += (np.matmul((a*z[np.newaxis,:]).T,a*z[np.newaxis,:])*pi*(1-pi))
self.vec+= (a*z*r*(a-pi))
self.V_inv = np.linalg.inv(self.V)
self.delta = np.matmul(self.V_inv,self.vec)
def get_pi(self,z0, z1):
diff = z1-z0
if isinstance(diff,float):
mu = np.squeeze(self.delta*diff)
var = np.squeeze(self.V_inv*self.sig**2)
else:
diff = z1 - z0
mu = np.dot(diff,self.delta)
var = np.matmul(diff,np.matmul(self.V_inv*self.sig**2,diff))
pi = 1-norm.cdf(0,loc=mu,scale=np.sqrt(var))
return pi
def get_action(self,z0, z1,misc=True):
pi = self.get_pi(z0,z1)
delta_s = np.random.multivariate_normal(self.delta,self.V_inv*self.sig**2)
a = np.argmax([np.squeeze(np.dot(z0,delta_s)),np.squeeze(np.dot(z1,delta_s))])
#a = bernoulli.rvs(pi)
return int(a),pi
class LinearUCB(object):
"""
Implementation of Linear Upper Confidence Bound
"""
def __init__(self,d,gamma=1.0,prior_v=1):
# dimension of Z_t
self.d = d
# hyperparameter
self.gamma = gamma
# prior
self.vec = np.zeros(self.d)
self.V = np.identity(self.d)*prior_v
self.V_inv = np.linalg.inv(self.V)
self.theta = np.matmul(self.V_inv,self.vec)
#clipped probability
#self.pi_min = pi_min
#self.pi_max = 1-self.pi_min
def update(self,a,b,r,z):
x = np.hstack([b,a*z])
self.V += np.matmul((x[np.newaxis,:]).T,x[np.newaxis,:])
self.vec+= (x*r)
self.V_inv = np.linalg.inv(self.V)
self.theta = np.matmul(self.V_inv,self.vec)
def get_pi(self,b,z0,z1):
x0 = np.hstack([b,z0])
x1 = np.hstack([b,z1])
diff = np.dot(x0-x1,self.theta)+self.gamma*(np.sqrt(np.matmul(x0,np.matmul(self.V_inv,x0)))\
-np.sqrt(np.sqrt(np.matmul(x1,np.matmul(self.V_inv,x1)))))
if diff<0:
return 1.0
#return self.pi_max
elif diff>0:
return 0.0
#return self.pi_min
else:
return 0.5
def get_action(self,b,z0, z1,misc=False):
pi = self.get_pi(b,z0,z1)
a = bernoulli.rvs(pi)
return int(a),pi
class BOSE(object):
"""
Implementation of BOSE (bandit semiparametric orthogonalized
estimator) algorithm. This algorithm only works if features are
available in the SemibanditSim object
The SemibanditSim must also expose B.D as an instance variable
"""
def __init__(self, d, T, gamma=None):
self.d = d
self.T = T
if gamma is not None:
self.reg = 1
self.gamma = gamma
else:
self.p = 0.1
self.reg = 4*self.d*np.log(9*self.T)+8*np.log(4*T/self.p)
self.gamma = np.sqrt(self.reg)+np.sqrt(27*self.d*np.log(1+2*T/d)+54*np.log(4*T/self.p))
# init hat_delta,cov
self.vec = np.zeros(self.d)
self.V = np.identity(d)*self.reg
self.V_inv = np.linalg.inv(self.V)
self.delta = np.matmul(self.V_inv,self.vec)
def update(self, a, pi,r,z):
mu = pi*z
self.V += np.matmul((z*a-mu)[:,np.newaxis],(z*a-mu)[np.newaxis,:])
self.vec += (r*(z*a-mu))
self.V_inv = np.linalg.inv(self.V)
self.delta = np.matmul(self.V_inv,self.vec)
def get_pi(self,z0, z1):
diff = z1-z0
ls = np.sqrt(np.matmul(diff,np.matmul(self.V_inv,diff)))*self.gamma
rs = np.sum(self.delta*diff)
if rs >= ls:
pi = 1.0
#pi = self.pi_max
elif -rs >= ls:
#pi = self.pi_min
pi = 0.0
else: pi = 0.5
return pi
def get_action(self, z0, z1,misc=False):
pi = self.get_pi(z0, z1)
a = bernoulli.rvs(pi)
if misc: return a,pi,rs,ls
else:return a,pi