In [1]:
import numpy as np
from TMDP import TMDP
from River_swim import River

from algorithms import *
from model_functions import *

#np.set_printoptions(precision=4)
import math
from utils import *

nS = 8
nA = 2
seed = 3231
gamma = .9
river = River(nS, gamma=gamma, small=5, large=1000, seed=seed)
tau = 0.9
xi = np.ones(nS) * 1/nS
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)


In [2]:
print(tmdp.s)
s_prime, r, flags, prob = tmdp.step(1)
print(s_prime, r, flags, prob)

[1]
[1] [0.] {'done': False, 'teleport': False} [0.1725]


In [3]:
r_s_a = compute_r_s_a(tmdp.P_mat_tau, tmdp.reward)
Q_star = bellman_optimal_q(tmdp.nS, tmdp.nA, tmdp.P_mat_tau, tmdp.reward, 1e-4, gamma)
pi = get_policy(Q_star[0])
pi_prime = np.array([[0., 1.] for i in range(nS)])

V_star = get_value_function(Q_star[0])
print(tmdp.P_mat.shape)


(8, 2, 8)


In [4]:
d = compute_d(tmdp.mu, tmdp.P_mat_tau, pi, tmdp.gamma)
delta = compute_delta(d, pi)
print(delta)

[[0.1373545  0.        ]
 [0.12491773 0.        ]
 [0.12408586 0.        ]
 [0.11484289 0.        ]
 [0.         0.121432  ]
 [0.         0.12496323]
 [0.         0.1318389 ]
 [0.         0.1205649 ]]


In [5]:
U_star = get_state_action_nextstate_value_function(tmdp.P_mat_tau, tmdp.reward, tmdp.gamma, Q_star[0])
q_reb = rebuild_Q_from_U(tmdp.P_mat_tau, U_star)


In [6]:
model_adv = get_model_advantage_function(tmdp.P_mat_tau, tmdp.reward, tmdp.gamma, Q_star[0])
rel_model_adv_hat = compute_relative_model_advantage_function_hat(tmdp.P_mat, tmdp.xi, U_star)
# With the fixed policy, evaluate the relative model advantage function in moving from P_mat_tau to P_mat, that is, moving from the simplified problem
# with tau != 0 to the original problem with tau = 0
rel_model_adv = compute_relative_model_advantage_function(tmdp.P_mat, model_adv)


print(compute_discounted_distribution_relative_model_advantage_function_from_delta_tau(rel_model_adv_hat, delta, tmdp.tau, 0.))
print(compute_discounted_distribution_relative_model_advantage_function(rel_model_adv, delta))

-5.469412689521509
-5.469390576833751


In [7]:
dif = get_expected_difference_transition_models(tmdp.P_mat_tau, tmdp.P_mat, delta)
print(dif)
print(tmdp.P_mat_tau.shape)

expected_value = np.sum(delta.T * np.abs(tmdp.P_mat - tmdp.P_mat_tau))

print(expected_value)
dif_tau = get_expected_difference_transition_models(tmdp.P_mat, tmdp.xi, delta)
print(dif_tau*tmdp.tau)


1.394688077659496
(8, 2, 8)
1.5092592543131351
1.394688077659496


In [8]:
sup = get_sup_difference_transition_models(tmdp.P_mat_tau, tmdp.P_mat)
print(sup)

print(np.max(np.abs(tmdp.P_mat_tau - tmdp.P_mat)))

Xi = np.tile(tmdp.xi, (nA, nS)).T
Xi = Xi.reshape((nS, nA, nS))

print(np.max(np.abs(tmdp.P_mat - Xi))*tmdp.tau)

0.7875000000000001
0.7875000000000001
0.7875


In [11]:
d = get_sup_difference_q(Q_star[0])
print(d)
print(np.max(np.abs(Q_star[0] - Q_star[0])))
print(Q_star[0].shape)

j = get_performance_improvement_lower_bound(tmdp.P_mat_tau, tmdp.P_mat, tmdp.reward, tmdp.gamma, tmdp.tau, 0., Q_star[0], tmdp.mu)
print(j)

tau_prime = get_optimal_tauprime(tmdp.P_mat_tau, tmdp.reward, tmdp.gamma, tmdp.tau, Q_star[0], tmdp.mu, tmdp.xi)
print(tau_prime)
tau = 0.9
while tau > 0:
    tmdp1 = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)
    Q = bellman_optimal_q(tmdp1.nS, tmdp1.nA, tmdp1.P_mat_tau, tmdp1.reward, 1e-4, gamma)
    tau_1 = get_optimal_tauprime(tmdp1.P_mat_tau, tmdp1.reward, tmdp1.gamma, tmdp1.tau, Q[0], tmdp1.mu, tmdp1.xi)
    pi = get_policy(Q[0])
    j = get_expected_avg_reward(tmdp1.P_mat_tau, pi, tmdp.reward, tmdp.gamma, tmdp.mu)
    print(j, tau_1)
    tau = tau_1 if tau_1 < tau else tau - 0.05
    

146.72542323850752
0.0
(8, 2)
-5341.252667545748
0.918855374688584
173.26437773473467 0.918855374688584
180.9329503694926 0.8620650585459549
188.33119691843842 0.8086420296885103
195.48462018966146 0.7565636378168032
202.42290310658385 0.7051555449066603
209.1807516962191 0.6541286257327648
215.79895958605113 0.6033382033193156
222.32575903922262 0.5527038123582615
228.81854281856621 0.5021771257412624
235.34606379815725 0.45172742243816877
242.07533376983304 0.40130062865048993
249.15623471475635 0.35088895958371163
256.7695705943151 0.3004952990298659
265.1905724442956 0.25010843876813155
274.80355630506017 0.19972201960575428
274.8612602014873 0.19944188452055314
274.9194652589659 0.19915957814042654
274.9781762957535 0.19887508355339303
275.03739822252015 0.1985883837146144
275.09713600658495 0.19829946144534724
275.15739467272425 0.19800829943188708
275.218179303994 0.19771488022450462
275.27949504256736 0.19741918623637422
275.3413470905863 0.19712119974249476
275.4037407110309 0