# Koopman RL 4-14.py
import math
import observables
import numpy as np
import scipy as sp
import numba as nb
from scipy import integrate
from estimate_L import *
from cartpole_reward import cartpoleReward
# Brownian Updates
## Condition number
print("Condition number of L:", np.linalg.cond(L)) # inf
print("Condition number of Psi_X:", np.linalg.cond(Psi_X)) # 98 million+
## RRR
"""
Using the fact that OLS is essentially orthogonal projectionon the column space of X, we can rewrite L as
\begin{align}
L = ||Y - X\hat{B}_{\text{OLS}}||^2 + ||X\hat{B}_{\text{OLS}} - XB||^2
\end{align}
The first term does not depend on B and the second term can be minimized by the SVD/PCA of the fitted values $\hat{Y} = X\hat{B}_{\text{OLS}}$
Specifically, if $U_r$ are the first r prinicpal axes of \hat{Y}, then
\begin{align}
\hat{B}_{\text{RRR}} = \hat{B}_{\text{OLS}} U_r U_r^\top
\end{align}
MAYBE USEFUL TO INCLUDE MORE?
First, one can use it for regularization purposes. Similarly to ridge regression (RR), lasso, etc., RRR introduces some "shrinkage" penalty on B.
The optimal rank r can be found via cross-validation. In my experience, RRR easily outperforms OLS but tends to lose to RR.
However, RRR+RR can perform (slightly) better than RR alone.
"""
@nb.njit(fastmath=True)
def rrr(X, Y, rank=8):
    B_ols = ols(X, Y)
    U, S, V = np.linalg.svd(Y.T @ X @ B_ols)
    W = V[0:rank].T
    B_rr = B_ols @ W @ W.T
    L = B_rr #.T
    return L
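"""
The note above suggests choosing the rank by cross-validation. Below is a minimal,
self-contained (commented-out) sketch of doing that on a held-out split; the plain-NumPy
helpers _ols_np/_rrr_np and the synthetic data are illustrative assumptions, not part of
the original pipeline.
"""
# def _ols_np(X, Y):
#     return np.linalg.pinv(X) @ Y  # least-squares coefficient matrix
#
# def _rrr_np(X, Y, rank):
#     B_ols = _ols_np(X, Y)
#     _, _, V = np.linalg.svd(Y.T @ X @ B_ols)
#     W = V[:rank].T
#     return B_ols @ W @ W.T
#
# rng = np.random.default_rng(0)
# X_train, Y_train = rng.normal(size=(200, 20)), rng.normal(size=(200, 10))
# X_val, Y_val = rng.normal(size=(50, 20)), rng.normal(size=(50, 10))
# val_error = {r: np.linalg.norm(Y_val - X_val @ _rrr_np(X_train, Y_train, r)) for r in range(1, 11)}
# best_rank = min(val_error, key=val_error.get)  # rank with the smallest held-out error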
## CartPole Reward
"""
The default CartPole reward is always 1 until the episode terminates.
We needed a reward that varies with the state to make sense in the scope of our design, so we used the variable reward formulation below.
We define it here and plug it into the CartPole environment so that an agent can learn from it while we collect data.
"""
# Angle and position at which to fail the episode
theta_threshold_radians = 12 * 2 * math.pi / 360
x_threshold = 2.4

gravity = 9.8
masscart = 1.0
masspole = 0.1
total_mass = (masspole + masscart)
length = 0.5 # actually half the pole's length
polemass_length = (masspole * length)
force_mag = 10.0
tau = 0.02 # seconds between state updates
kinematics_integrator = 'euler'

# Angle limit set to 2 * theta_threshold_radians so failing observation
# is still within bounds.
high = np.array([x_threshold * 2,
                 np.finfo(np.float32).max,
                 theta_threshold_radians * 2,
                 np.finfo(np.float32).max],
                dtype=np.float32)
def cartpoleReward(state, action):
    x, x_dot, theta, theta_dot = state

    force = force_mag if action >= 0.5 else -force_mag
    costheta = math.cos(theta)
    sintheta = math.sin(theta)

    temp = (force + polemass_length * theta_dot ** 2 * sintheta) / total_mass
    thetaacc = (gravity * sintheta - costheta * temp) / (length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass))
    xacc = temp - polemass_length * thetaacc * costheta / total_mass

    if kinematics_integrator == 'euler':
        x = x + tau * x_dot
        x_dot = x_dot + tau * xacc
        theta = theta + tau * theta_dot
        theta_dot = theta_dot + tau * thetaacc
    else: # semi-implicit euler
        x_dot = x_dot + tau * xacc
        x = x + tau * x_dot
        theta_dot = theta_dot + tau * thetaacc
        theta = theta + tau * theta_dot

    # done = bool(
    #     x < -x_threshold
    #     or x > x_threshold
    #     or theta < -theta_threshold_radians
    #     or theta > theta_threshold_radians
    # )

    reward = (1 - (x ** 2) / 11.52 - (theta ** 2) / 288)
    return reward
"""
The important line from the above is the asignment
reward = (1 - (x ** 2) / 11.52 - (theta ** 2) / 288)
We can see that with an increase in the absolute values of x and θ, the reward decreases and reaches 0 when |x| = 2.4 and |θ| = 12.
"""
## Setup for Algorithms
"""
The following cells guide you through the setup of the variables necessary to run the three algorithms.
We used the Numba package to heavily reduce the runtime of various functions.
"""
@nb.njit(fastmath=True)
def ln(x):
    return np.log(x)

@nb.njit(fastmath=True) #parallel=True,
def nb_einsum(A, B):
    # Elementwise product followed by a full sum, i.e. np.einsum('ij,ij->', A, B),
    # written out explicitly so Numba can compile it
    assert A.shape == B.shape
    res = 0
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            res += A[i,j]*B[i,j]
    return res
#%%
X = (np.load('random-cartpole-states.npy'))[:5000].T # states
U = (np.load('random-cartpole-actions.npy'))[:5000].T # actions
X_tilde = np.append(X, [U], axis=0) # extended states
d = X_tilde.shape[0]
m = X_tilde.shape[1]
s = int(d*(d+1)/2) # number of second order poly terms
#%%
psi = observables.monomials(2) # monomial dictionary up to degree 2
Psi_X_tilde = psi(X_tilde)
Psi_X_tilde_T = Psi_X_tilde.T
k = Psi_X_tilde.shape[0] # number of dictionary functions
nablaPsi = psi.diff(X_tilde) # gradients of the dictionary functions
nabla2Psi = psi.ddiff(X_tilde) # second derivatives of the dictionary functions
#%%
@nb.njit(fastmath=True)
def dpsi(X, k, l, t=1):
    # Estimate of the generator applied to dictionary function k at snapshot l:
    # a first-order (gradient) term plus a second-order (Hessian) term,
    # both built from the difference between consecutive snapshots
    difference = X[:, l+1] - X[:, l]
    term_1 = (1/t) * (difference)
    term_2 = nablaPsi[k, :, l]
    term_3 = (1/(2*t)) * np.outer(difference, difference)
    term_4 = nabla2Psi[k, :, :, l]
    return np.dot(term_1, term_2) + nb_einsum(term_3, term_4)
#%% Construct \text{d}\Psi_X matrix
dPsi_X_tilde = np.zeros((k, m))
for row in range(k):
    for column in range(m-1):
        dPsi_X_tilde[row, column] = dpsi(X_tilde, row, column)
dPsi_X_tilde_T = dPsi_X_tilde.T
#%%
L = rrr(Psi_X_tilde_T, dPsi_X_tilde_T)
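"""
A quick (commented-out) check of the fit implied by the regression above, i.e. how well
dPsi_X_tilde_T is reproduced by Psi_X_tilde_T @ L (assuming ols regresses its second
argument on its first); whether this residual is small enough to be useful is left as an
empirical question.
"""
# residual = np.linalg.norm(dPsi_X_tilde_T - Psi_X_tilde_T @ L)
# print("RRR fit residual:", residual)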
#%%
@nb.njit
def psi_x_tilde_with_diff_u(l, u):
    # Copy the dictionary evaluation at snapshot l and overwrite its final entry
    # (treated as the action) with a different action value u
    result = Psi_X_tilde[:,l].copy()
    result[-1] = u
    return result
## Algorithm 1
"""
We implemented the learning algorithm as outlined in our Koopman RL write-up.
The optimal value function $V$ satisfies the (regularized) Hamilton-Jacobi-Bellman (HJB) equation
\begin{align}
    \rho v(x) &= \sup_{\pi_t \in \mathcal{P}(U)} \int_U \left( r(x,u) - \lambda \ln{\pi_t(u)} + (\mathcal{L}v)(x,u)\right) \pi_t(u) \text{d}u
\end{align}
where $\mathcal{P}(U) := \{ \pi_t: \int_U \pi_t(u) \text{d}u = 1 \text{ and } \pi_t(u) \geq 0 \text{ a.e. on } U \}$. Using \eqref{est_eigen} applied to the value function $v$, we plug in our estimated
eigenfunctions and also reduce the dimension by choosing a cutoff $c < n$ to give us an approximate characterization of optimality which we call the {\bf Koopman HJB}:
\begin{align}
    \rho v(x) &= \sup_{\pi_t \in \mathcal{P}(U)} \int_U \left( r(x,u) - \lambda \ln{\pi_t(u)}\right)\pi_t(u) \text{d}u + \int_U \sum_{\ell = 1}^c \langle \widehat{\varphi}_\ell, v \rangle \widehat{\varphi}_\ell(x,u) \pi_t(u) \text{d}u \label{inner_prod_approx}
    \\
    &= \sup_{\pi_t \in \mathcal{P}(U)} \int_U \left(r(x,u) - \lambda \ln{\pi_t(u)} \right)\pi_t(u)\text{d}u + \sum_{\ell = 1}^c m^v_\ell \lambda_\ell \int_U \widehat{\varphi}_\ell(x,u) \pi_t(u) \text{d}u \label{KoopmanHJB}
\end{align}
%If r(x,u) is also sufficiently smooth, and the state-actions which show up in the cost function are themselves eigenfunctions of the Koopman operator (i.e. they are martingales), we can reformulate the reward function as a quadratic in terms of the vector of eigenfunctions $\Phi$:
Solving this maximization problem, we get the feedback control:
\begin{align}
    \pi^*(u|x) &= \frac{\exp\left(\frac{1}{\lambda}(r(x,u) + (\mathcal{L}v)(x,u))\right)}{\int_U \exp\left(\frac{1}{\lambda}(r(x,u) + (\mathcal{L}v)(x,u))\right)du} \notag
    \\
    &\approx \frac{\exp\left(\frac{1}{\lambda}(r(x,u) + \sum_{\ell = 1}^c m^v_\ell\lambda_\ell \widehat{\varphi}_\ell(x,u)) \right)}{\int_U \exp\left(\frac{1}{\lambda}(r(x,u) + \sum_{\ell = 1}^c m^v_\ell\lambda_\ell \widehat{\varphi}_\ell(x,u)) \right)du} \label{approxOptPolicy}
    \\
    &=: \widehat{\pi}^*(u | x, v)
\end{align}
To run our algorithm, we initialize V either randomly or to 0. Then we find the OLS projection matrix $B^\top_v$ of $V^{\pi_0^*}$ onto the dictionary space by solving the following least-squares problem
\begin{align}
    \min_{B_g}\; \lVert G_{\tilde X} - B_g^\top \Psi_{\tilde X}\rVert_F \label{gProjPsi}
\end{align}
Next, we approximate $\mathcal{L}v$ via
\begin{align}
    (\mathcal{L}g)(\tilde{x}) \approx \sum_{\ell = 1}^n \lambda_\ell \widehat{\varphi}_\ell(\tilde{x}) m^g_\ell \label{est_eigen}
\end{align}
which we then plug into
\begin{align}
    \frac{\exp\left(\frac{1}{\lambda}(r(x,u) + (\mathcal{L}v)(x,u))\right)}{\int_U \exp\left(\frac{1}{\lambda}(r(x,u) + (\mathcal{L}v)(x,u))\right)du}
\end{align}
to get our estimated optimal policy $\widehat{\pi}^*(u | x, v)$.
Once we have our updated $\widehat{\pi}^*(u | x, v)$, we can plug that into
\begin{align}
    \sup_{\pi_t \in \mathcal{P}(U)} \int_U \left(r(x,u) - \lambda \ln{\pi_t(u)} \right)\pi_t(u)\text{d}u + \sum_{\ell = 1}^c m^v_\ell \lambda_\ell \int_U \widehat{\varphi}_\ell(x,u) \pi_t(u) \text{d}u
\end{align}
to get our updated $V^{\pi_j^*}$.
We repeat this process for the number of timesteps specified by the caller of the function.
"""
def learningAlgorithm(L, X, Psi_X_tilde, U, reward, timesteps=100, cutoff=8, lamb=0.05):
    # placeholder functions
    V = lambda x: x
    pi_hat_star = lambda x: x

    low = np.min(U)
    high = np.max(U)

    constant = 1/lamb

    eigenvalues, eigenvectors = sp.linalg.eig(L)

    @nb.njit(fastmath=True)
    def eigenfunctions(i, psi_x_tilde):
        return np.dot(np.real(eigenvectors[i]), psi_x_tilde) #Psi_X_tilde[:, l]

    eigenvectors_inverse_transpose = sp.linalg.inv(eigenvectors).T

    currentV = np.zeros(X.shape[1]) # V^{\pi*_0}
    lastV = currentV.copy()

    t = 0
    while t < timesteps:
        G_X_tilde = currentV.copy()
        B_v = ols(Psi_X_tilde_T, G_X_tilde)

        generatorModes = B_v.T @ eigenvectors_inverse_transpose

        @nb.jit(forceobj=True, fastmath=True)
        def Lv_hat(l, u):
            psi_x_tilde = psi_x_tilde_with_diff_u(l, u)
            summation = 0
            for ell in range(cutoff):
                summation += eigenvalues[ell] * eigenfunctions(ell, psi_x_tilde) * generatorModes[ell]
            return summation

        @nb.jit(forceobj=True, fastmath=True)
        def compute(u, l):
            inp = (constant * (reward(X[:,l], u) + Lv_hat(l, u))).astype('longdouble')
            return np.exp(inp)

        def pi_hat_star(u, l): # action given state
            numerator = compute(u, l)
            denominator = integrate.romberg(compute, low, high, args=(l,), divmax=30)
            return numerator / denominator

        def compute_2(u, l):
            eval_pi_hat_star = pi_hat_star(u, l)
            return (reward(X[:,l], u) - (lamb * ln(eval_pi_hat_star))) * eval_pi_hat_star

        def integral_summation(l):
            summation = 0
            for ell in range(cutoff):
                summation += generatorModes[ell] * eigenvalues[ell] * \
                    integrate.romberg(
                        lambda u, l: eigenfunctions(ell, Psi_X_tilde[:, l]) * pi_hat_star(u, l),
                        low, high, args=(l,), divmax=30
                    )
            return summation

        def V(l):
            return (integrate.romberg(compute_2, low, high, args=(l,), divmax=30) + \
                integral_summation(l))

        lastV = currentV
        for i in range(currentV.shape[0]):
            currentV[i] = V(i)

        t += 1
        print("Completed learning step", t, "\n")

    return currentV, pi_hat_star
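"""
A hedged (commented-out) example of how the learning loop above might be invoked on the
CartPole data loaded earlier; the timestep count and lambda value are illustrative
choices, not tuned settings from our experiments.
"""
# V_est, pi_est = learningAlgorithm(
#     L, X, Psi_X_tilde, np.array([0, 1]), cartpoleReward, timesteps=2, cutoff=8, lamb=1
# )
# pi_est(0, 0), pi_est(1, 0)  # policy densities for actions 0 and 1 at the first recorded state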
"""
Unfortunately the results are a little strange. For one, the algorithm far too computationally complex, resulting in roughly a few minutes of compute per timestep.
The other issue we were finding is that the optimal policy, regardless of the state, always has a slight preference to pick action 0 over action 1 (1.02 vs 0.98).
This of course is incorrect as we would expect that the policy would prefer action 1 in the case where action 0 would cause you to terminate the episode.
"""
## Algorithm 2
"""
Sinha et al. (https://arxiv.org/pdf/1909.12520.pdf) proposed an algorithm, termed recursive EDMD, for learning the Koopman operator in an online setting.
We propose an altered algorithm that allows us to retrieve the Koopman generator \mathcal{L} instead, by making use of gEDMD to get its eigenfunctions and then running the rest of the algorithm.
"""
def rgEDMD(
    x_tilde,
    X_tilde,
    Psi_X_tilde,
    dPsi_X_tilde,
    k,
    z_m=None,
    phi_m_inverse=None
):
    # Default arguments cannot reference the parameter k, so build them here
    if z_m is None:
        z_m = np.zeros((k, k))
    if phi_m_inverse is None:
        phi_m_inverse = np.identity(k)

    X_tilde = np.append(X_tilde, x_tilde.reshape(-1,1), axis=1)
    Psi_X_tilde = psi(X_tilde)
    for l in range(k):
        dPsi_X_tilde[l, -1] = dpsi(X_tilde, l, -2)
    dPsi_X_tilde = np.append(dPsi_X_tilde, np.zeros((k,1)), axis=1) #? should this really append 0s?

    Psi_X_tilde_m = Psi_X_tilde[:,-1].reshape(-1,1)
    Psi_X_tilde_m_T = Psi_X_tilde_m.T #? maybe pinv?

    # update z_m
    z_m = z_m + dPsi_X_tilde[:,-2].reshape(-1,1) @ Psi_X_tilde_m_T

    # update \phi_m^{-1}
    phi_m_inverse = phi_m_inverse - \
        ((phi_m_inverse @ Psi_X_tilde_m @ Psi_X_tilde_m_T @ phi_m_inverse) / \
        (1 + Psi_X_tilde_m_T @ phi_m_inverse @ Psi_X_tilde_m))

    L_m = z_m @ phi_m_inverse

    # updated dPsi_X_tilde, updated z_m, updated \phi_m^{-1}, and approximate generator
    return dPsi_X_tilde, z_m, phi_m_inverse, L_m
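"""
A minimal (commented-out) sketch of streaming snapshots through rgEDMD on its own, without
the policy learning step; the two-column warm start mirrors the builders used in
onlineKoopmanLearning below, and the expectation that L_m roughly tracks the batch
generator estimate is an assumption, not a verified result.
"""
# dPsi_running = dPsi_X_tilde[:, :2].copy()
# z_m_running = np.zeros((k, k))
# phi_m_inv_running = np.identity(k)
# for x_tilde_new in X_tilde[:, 2:].T:
#     dPsi_running, z_m_running, phi_m_inv_running, L_m = rgEDMD(
#         x_tilde_new, X_tilde[:, :2], Psi_X_tilde[:, :2], dPsi_running, k,
#         z_m_running, phi_m_inv_running
#     )
# print(L_m.shape)  # (k, k) online generator estimate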
## Algorithm 3
"""
Now that we can calculate the Koopman generator online by calling rgEDMD every time a new
observation is made, we can, by extension, also run the learning algorithm in an online setting.
The following is the combination of the two algorithms.
"""
def onlineKoopmanLearning(X_tilde, Psi_X_tilde, dPsi_X_tilde):
    X_tilde_builder = X_tilde[:,:2]
    Psi_X_tilde_builder = Psi_X_tilde[:,:2]
    dPsi_X_tilde_builder = dPsi_X_tilde[:,:2]
    k = dPsi_X_tilde_builder.shape[0]

    z_m = np.zeros((k,k))
    phi_m_inverse = np.identity(k)
    for x_tilde in X_tilde.T: # for each data point
        dPsi_X_tilde_builder, z_m, phi_m_inverse, L_m = rgEDMD(
            x_tilde, X_tilde_builder, Psi_X_tilde_builder, dPsi_X_tilde_builder, k, z_m, phi_m_inverse
        ) # add new data point
        _, pi = learningAlgorithm(L_m, X, Psi_X_tilde, np.array([0,1]), cartpoleReward, timesteps=2, lamb=1) # learn new optimal policy from the online generator estimate
    # _, pi = learningAlgorithm(L, X, Psi_X_tilde, np.array([0,1]), cartpoleReward, timesteps=2, lamb=1)

    return pi # estimated optimal policy
"""
The way the algorithm is written, it might be infeasible to test since it has to run the computationally expensive learning algorithm every time a new data point is added.
For testing purposes we will uncomment the line before "return pi" and comment out the looping learningAlgorithm call.
"""