In [None]:
import numpy as np
from TMDP import TMDP
from RiverSwim import RiverSwim

from algorithms import *
from model_functions import *

import matplotlib.pyplot as plt


#np.set_printoptions(precision=4)
import math
from utils import *

# --- Experiment configuration ------------------------------------------------
nS = 50  # number of states passed to RiverSwim
nA = 2   # number of actions; sizes the (nS, nA) array built further down

# NOTE(review): the value returned by get_current_seed() is overwritten by the
# hard-coded seed on the next line. The call is kept only in case it has side
# effects -- confirm, and drop it if it has none.
seed = get_current_seed()
seed = 44697628841978080856580175700798794719
gamma = 0.9

# Uniform vector of length nS (every entry 1/nS), passed to RiverSwim as `mu`.
mu = np.full(nS, 1 / nS)
river = RiverSwim(nS, mu, small=5, large=1000, seed=seed)

tau = 1.0
# Separate uniform vector of length nS, passed to every TMDP as `xi`.
xi = np.full(nS, 1 / nS)

# Three TMDP wrappers around the same `river` instance; only tau differs.
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)
mdp = TMDP(river, xi, tau=0.0, gamma=gamma, seed=seed)
tmdp_1 = TMDP(river, xi, tau=0.5, gamma=gamma, seed=seed)


In [None]:

# Run bellman_optimal_q on `tmdp` (constructed with tau=1) and on `tmdp_1`
# (constructed with tau=0.5), then derive the policies, V, U and the two
# advantage functions from the results. Q, V, U, pi and pi_prime are reused by
# the cells below.
res = bellman_optimal_q(tmdp.P_mat_tau, tmdp.env.reward, tmdp.gamma)
Q = res["Q"]
res_1 = bellman_optimal_q(tmdp_1.P_mat_tau, tmdp_1.env.reward, tmdp_1.gamma)
Q_1 = res_1["Q"]
# Policy extracted from the Q of `tmdp`.
pi = get_policy(Q)
V = compute_V_from_Q(Q, pi)
U = compute_U_from_V(tmdp.env.reward, tmdp.gamma, V)
# Policy extracted from the Q of `tmdp_1`; serves as the comparison policy for
# the relative advantage and distance computations in later cells.
pi_prime = get_policy(Q_1)
# The two advantage functions below are only printed in the visible notebook.
pol_adv_func = compute_policy_advantage_function(Q, V)
print(pol_adv_func)
model_adv_func = compute_model_advantage_function(U, Q)
print(model_adv_func)

In [None]:
# Relative policy advantage, built from pi, pi_prime and Q.
rel_pol_adv = compute_relative_policy_advantage_function(pi, pi_prime, Q)
print(rel_pol_adv)

# Relative model advantage, built from the environment transition matrix,
# xi and U.
rel_model_adv = compute_relative_model_advantage_function(
    tmdp.env.P_mat, tmdp.xi, U
)
# NOTE(review): 0.4 is an unexplained constant -- presumably a candidate
# teleport rate compared against the current tmdp.tau. Confirm and move it to
# the configuration cell under a descriptive name.
tau_gap = tmdp.tau - 0.4
print(tau_gap * rel_model_adv)

In [None]:
# d is computed from mu, the environment transition matrix, xi and pi at the
# current gamma and tau of `tmdp`; delta is then derived from d and pi.
d = compute_d_from_tau(
    tmdp.env.mu,
    tmdp.env.P_mat,
    tmdp.xi,
    pi,
    tmdp.gamma,
    tmdp.tau,
)
delta = compute_delta(d, pi)

# One value per line, d first and then delta.
print(d, delta, sep="\n")

In [None]:
# Expected advantages: the relative advantages combined with d and delta.
pol_adv = compute_expected_policy_advantage(rel_pol_adv, d)
model_adv = compute_expected_model_advantage(rel_model_adv, delta)

# Sup-differences of Q and U.
delta_Q = get_sup_difference_Q(Q)
delta_U = get_sup_difference_U(U)

# Distance terms between pi and pi_prime, and between P_mat and xi.
d_inf_pol = get_d_inf_policy(pi, pi_prime)
d_inf_model = get_d_inf_model(tmdp.env.P_mat, tmdp.xi)
d_exp_pol = get_d_exp_policy(pi, pi_prime, d)
d_exp_model = get_d_exp_model(tmdp.env.P_mat, tmdp.xi, delta)

# Flag forwarded to the helpers that accept a `biased` keyword.
biased = True

# Labelled summary of the quantities computed above.
for _label, _value in (
    ("Policy advantage: ", pol_adv),
    ("Model advantage: ", model_adv),
    ("D_inf_pol: ", d_inf_pol),
    ("D_inf_model: ", d_inf_model),
    ("D_exp_pol: ", d_exp_pol),
    ("D_exp_model: ", d_exp_model),
):
    print(_label, _value)

# alpha values returned by the two alpha helpers.
alpha_tau = compute_alpha_tau(
    pol_adv, tmdp.gamma, delta_U, d_exp_pol, d_inf_pol
)
print("alpha_tau: ", alpha_tau)

alpha_0 = compute_alpha_0(
    pol_adv, tmdp.tau, tmdp.gamma, delta_U,
    d_exp_pol, d_inf_pol, d_exp_model, d_inf_model,
)
print("alpha_0: ", alpha_0)

# tau values returned by the two tau_prime helpers.
tau_0 = compute_tau_prime_0(
    tmdp.tau, model_adv, tmdp.gamma,
    d_exp_model, delta_U, d_inf_model,
    biased=biased,
)
print("tau_prime_0: ", tau_0)

tau_1 = compute_tau_prime_1(
    tmdp.tau, model_adv, tmdp.gamma,
    d_exp_model, delta_U, d_inf_model, d_inf_pol, d_exp_pol,
)
print("tau_prime_1: ", tau_1)



In [None]:
# Show the two sup-differences, delta_U first, one per line.
print(delta_U, delta_Q, sep="\n")


In [None]:
# Every compute_teleport_bound call below shares the same eight trailing
# positional arguments; only the first (alpha) and third (tau') argument vary.
_bound_stats = (
    pol_adv, model_adv, tmdp.gamma,
    d_inf_pol, d_inf_model,
    d_exp_pol, d_exp_model,
    delta_U,
)

# alpha = alpha_tau, tau' = current tau.
b_alpha_tau = compute_teleport_bound(
    alpha_tau, tmdp.tau, tmdp.tau, *_bound_stats, biased=biased
)
print("b_alpha_tau: ", b_alpha_tau)

# alpha = alpha_0, tau' = 0.
b_alpha_0 = compute_teleport_bound(
    alpha_0, tmdp.tau, 0, *_bound_stats, biased=biased
)
print("b_alpha_0: ", b_alpha_0)

# alpha = 0, tau' = tau_0.
b_0_tau = compute_teleport_bound(
    0, tmdp.tau, tau_0, *_bound_stats, biased=biased
)
print("b_0_tau: ", b_0_tau)

# alpha = 1, tau' = tau_1.
b_1_tau = compute_teleport_bound(
    1, tmdp.tau, tau_1, *_bound_stats, biased=biased
)
print("b_1_tau: ", b_1_tau)

In [None]:
# Dedicated teleport-bound helpers, one per (alpha, tau') case.
# The discount factor is read from `tmdp.gamma` rather than the module-level
# `gamma`, matching the compute_teleport_bound / compute_alpha_* calls elsewhere
# in the notebook. All bounds then use the discount the TMDP actually holds,
# even if the global `gamma` is edited or the cells are re-run out of order.
tb_alpha_tau = compute_teleport_bound_alpha_tau(
    tmdp.tau, pol_adv, tmdp.gamma,
    d_inf_pol, d_exp_pol, d_exp_model, delta_U,
    biased=biased,
)
print("tb_alpha_tau: ", tb_alpha_tau)

tb_alpha_0 = compute_teleport_bound_alpha_0(
    tmdp.tau, pol_adv, model_adv, tmdp.gamma,
    d_inf_pol, d_exp_pol, d_inf_model, d_exp_model, delta_U,
)
print("tb_alpha_0: ", tb_alpha_0)

tb_0_tau = compute_teleport_bound_0_tau(
    tmdp.tau, model_adv, tmdp.gamma,
    d_inf_model, d_exp_model, delta_U,
    biased=biased,
)
print("tb_0_tau: ", tb_0_tau)

tb_1_tau = compute_teleport_bound_1_tau(
    tmdp.tau, pol_adv, model_adv, tmdp.gamma,
    d_inf_model, d_inf_pol, d_exp_model, d_exp_pol, delta_U,
)
print("tb_1_tau: ", tb_1_tau)

In [None]:
# "_test" variants of two teleport-bound helpers.
# NOTE: these assignments overwrite the tb_alpha_0 / tb_1_tau values produced
# by the non-"_test" helpers in the previous cell.
# The discount factor is read from `tmdp.gamma` rather than the module-level
# `gamma`, matching the compute_teleport_bound / compute_alpha_* calls elsewhere
# in the notebook.
tb_alpha_0 = compute_teleport_bound_alpha_0_test(
    tmdp.tau, pol_adv, model_adv, tmdp.gamma,
    d_inf_pol, d_exp_pol, d_inf_model, d_exp_model, delta_U,
)
print("tb_alpha_0: ", tb_alpha_0)

tb_1_tau = compute_teleport_bound_1_tau_test(
    tmdp.tau, pol_adv, model_adv, tmdp.gamma,
    d_inf_model, d_inf_pol, d_exp_model, d_exp_pol, delta_U,
)
print("tb_1_tau: ", tb_1_tau)

In [None]:
# Zero-initialised (nS, nA) array handed to curriculum_MPI as its second argument.
q_c = np.zeros(shape=(nS, nA))

# NOTE: update_tau is called on the shared `tmdp` object, so every cell that
# reads `tmdp` after this point sees the state left by this call and by
# curriculum_MPI. Restart and run all cells to reproduce the earlier outputs.
tmdp.update_tau(0.5)

# Long-running cell: one million episodes.
cur_res = curriculum_MPI(
    tmdp,
    q_c,
    alpha=0.25,
    alpha_pol=0.2,
    episodes=1_000_000,
    batch_nS=1,
    temperature=15.0,
    biased=False,
    lam=0.0,
)


In [None]:
# Compare the policy obtained by bellman_optimal_q on `mdp` (built with tau=0)
# against the final policy produced by the curriculum run.
# NOTE: `res`, `Q` and `d` are reassigned here and no longer hold the values
# computed for `tmdp` in the earlier cells.
res = bellman_optimal_q(mdp.env.P_mat, mdp.env.reward, mdp.gamma)
Q = res["Q"]

# Each policy is computed once and reused for the distribution, the printed
# difference and the performance, so all three refer to the same object.
pi_opt = get_policy(Q)
pi_curr = get_policy(get_softmax_policy(cur_res["thetas"][-1]))

d = compute_d_from_tau(mdp.env.mu, mdp.env.P_mat, mdp.xi, pi_opt, mdp.gamma, 0.)
# NOTE(review): d_curr uses tmdp.P_mat_tau (the transition matrix at whatever
# tau `tmdp` holds after the curriculum) while d uses mdp.env.P_mat. Confirm
# that the curriculum ends at tau=0 so both are measured on the same model.
d_curr = compute_d_from_tau(tmdp.env.mu, tmdp.P_mat_tau, tmdp.xi, pi_curr, tmdp.gamma, 0.)

# Element-wise difference between the two policies.
print(pi_opt - pi_curr)

r_s_a = compute_r_s_a(mdp.env.P_mat, mdp.env.reward)

j_opt = compute_j(r_s_a, pi_opt, d, mdp.gamma)
# Fix: j_curr was previously evaluated with get_softmax_policy(cur_res["Qs"][-1]),
# a different policy from the one used to build d_curr. The same pi_curr is now
# used for both, mirroring how j_opt pairs pi_opt with d.
# NOTE(review): if the stochastic softmax policy was the intended object,
# build d_curr from that policy as well instead of the greedy one.
j_curr = compute_j(r_s_a, pi_curr, d_curr, tmdp.gamma)
print("optimal performance: ",j_opt, "curriculum performance: ",j_curr)