## Optimal Q_tau* test

junix --filepath ./Test.ipynb --output_dir ./results/intermediate_curves

In [None]:
import numpy as np
from TMDP import TMDP
from RiverSwimSwimSwim impRiverSwimiverSwim

from algorithms import *
from model_functions import *
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
from utils import *

In [None]:
# Baseline configuration for the tau experiments (reference run uses tau = 0.9).
nS, nA = 8, 2
seed = 3231
gamma = 0.9
tau = 0.9
status_step = 500

# NOTE(review): the import line for RiverSwimSwim in the first cell looks garbled;
# confirm the class is actually importable under this name before running.
# The meaning of the positional arguments 5 and 1000 is not visible here.
river = RiverSwimSwim(nS, gamma, 5, 1000)

# Uniform vector over the river.nS states: every entry equals 1 / river.nS.
xi = np.full(river.nS, 1 / river.nS)
tmdp = TMDP(river, xi, tau, gamma, seed)

# Coarse descending tau grid: 0.9 ... 0.1 in steps of 0.1, then 0.09 ... 0.01 in steps of 0.01.
taus = [k / 10 for k in range(9, 0, -1)] + [k / 100 for k in range(9, 0, -1)]

In [None]:
from gym import Env, spaces
from gym.utils import seeding
from model_functions import * 

# Replace `taus` with 50 evenly spaced values, ordered from 0.9 down to 0.
taus = np.linspace(0, 0.9, num=50)[::-1]

# Interactive type probes; they have no effect on the state used by later cells.
type(np.ones(7))
type(dict())
type(spaces.Discrete(nA))

In [None]:
results = []
iterations = {}
# Learning from scratch with different values of tau: for each tau, solve the
# tau-model, then compare the return of the resulting policy on that model and
# on tmdp.P_mat against the theoretical lower bound for moving to tau=0.
for tau in taus:
    tmdp = TMDP(river, xi, tau, gamma)

    # The solver returns the optimal Q and a second value stored per tau in
    # `iterations` (plotted in a later cell as iterations for convergence).
    Q_star_tau, iterations[tau] = bellman_optimal_q(tmdp.nS, tmdp.nA, tmdp.P_mat_tau, tmdp.reward, 1e-4, tmdp.gamma)
    # Derive the policy once so that every quantity below (and the stored
    # record) refers to the very same policy object.
    pi_star_tau = get_policy(Q_star_tau)

    # Discounted visitation quantities under pi_star_tau
    # (presumably d is over states and delta over state-action pairs;
    # confirm against model_functions).
    d = compute_d(tmdp.mu, tmdp.P_mat_tau, pi_star_tau, tmdp.gamma)
    delta = compute_delta(d, pi_star_tau)
    # Compute the state value function
    V_star_tau = get_value_function(Q_star_tau)
    # Compute the expected reward when picking action a in state s
    r_s_a = compute_r_s_a(nS, nA, tmdp.P_mat_tau, tmdp.reward)
    # Compute the state action next-state value function U_tau(s,a,s') = R(s,a) + \gamma*V_tau(s')
    U_star_tau = compute_state_action_nextstate_value_function(nS, nA, r_s_a, tmdp.gamma, V_star_tau)
    # Rebuild Q using U as Q_tau(s,a) = \sum{s' \in S}P_tau(s'|s,a)*U_tau(s,a,s')
    # (Q_t is not read again in this cell; kept so the name stays available for inspection)
    Q_t = compute_Q_from_U(tmdp.P_mat_tau, U_star_tau)

    # Compute the relative model advantage function hat \hat{A}_{tau, mu}(s,a)
    A_tau_hat = compute_relative_model_advantage_function_hat(tmdp.P_mat, tmdp.xi, U_star_tau)
    # Compute the discounted distribution relative model advantage function hat \hat{A}_{tau, mu}
    A_hat = compute_discounted_distribution_relative_model_advantage_function_hat(A_tau_hat, delta)
    # The dissimilarity term D = D_e * gamma * D_inf is upperbounded by 4*gamma+(tau - tau_1)
    # Compute Delta Q_tau as the superior among the difference of the L_1 norm of elements of Q_tau
    d_q = get_sup_difference_Q(Q_star_tau)
    tau_1 = compute_tau_prime(A_hat, tmdp.gamma, tmdp.tau, d_q)

    # Return of the same policy on the tau-model (P_mat_tau) and on P_mat
    J_star_tau = get_expected_avg_reward(tmdp.P_mat_tau, pi_star_tau, tmdp.reward, tmdp.gamma, tmdp.mu)
    J_0 = get_expected_avg_reward(tmdp.P_mat, pi_star_tau, tmdp.reward, tmdp.gamma, tmdp.mu)

    # Compute the performance improvement lower bound when moving to tau=0
    l_b = compute_performance_improvement_lower_bound(A_hat, tmdp.gamma, d_q, tmdp.tau, 0.0)
    # Advantage scaled by the current tau; printed and stored under "Adv"
    adv = tmdp.tau * A_hat

    print(f"Moving from {tmdp.tau} to {0.0}")
    print(f"Theoretical lb on performance improvement: {l_b}, Empirical one: {J_0 - J_star_tau}")
    print("Optimal tau': ", tau_1)
    print("Advantage: ", adv)
    print("\n")
    results.append({
        "Q": Q_star_tau,
        "tau": tmdp.tau,
        "J": J_star_tau,
        "J_0": J_0,
        "policy": pi_star_tau,
        "l_b": l_b,
        "Adv": adv,
    })

In [None]:
# Show the value recorded for each tau by the solver loop (the dict is keyed by the loop's tau).
print(iterations)

In [None]:
# Plot the per-tau iteration count stored in `iterations`, in the order given by `taus`.
x = np.array(["τ=" + str(tau) for tau in taus])
y = np.ravel([iterations[tau] for tau in taus])
x_map = np.arange(len(x))

plt.scatter(x, y, c='orange', label='#iterations for convergence')
# One tick per tau label, rotated so the long float labels do not overlap
plt.xticks(x_map, x, rotation=90)

plt.xlabel("Current value of τ")
# Axis label and title describe what is actually plotted (iteration counts, not returns)
plt.ylabel("#iterations")
plt.title("Iterations for convergence")

plt.legend(loc="lower right")



In [None]:
# Plot the return J stored for each tau, with a reference line at the J_0 of the last entry.
# The last entry of `results` is left out of the scatter; it only supplies the reference line.
shown = results[:-1]
x = np.array(["τ=" + str(res['tau']) for res in shown])
y = np.ravel([res['J'] for res in shown])
x_map = np.arange(len(x))

# The points are returns, so the legend entry names them as such
plt.scatter(x, y, c='orange', label='J on current τ')
plt.xticks(x_map, x, rotation=90)

plt.hlines(y=results[-1]['J_0'], xmin=0, xmax=len(x), colors='g', label='J on original problem')
plt.xlabel("Current value of τ")
plt.ylabel("E[J(τ)]")
plt.title("Expected discounted sum of rewards")

plt.legend(loc="lower right")




In [None]:
# Plot the stored 'Adv' value for every entry of `results` except the last, and
# record in `t_lim` the tau of the first entry whose 'Adv' is positive.
shown = results[:-1]
x = np.array(["τ=" + str(res['tau']) for res in shown])
y = np.ravel([res['J'] for res in shown])
z = np.ravel([res['J_0'] for res in shown])
k = np.ravel([res['Adv'] for res in shown])
# None when no entry has a positive 'Adv', so later cells can still read t_lim
t_lim = next((res['tau'] for res in shown if res['Adv'] > 0), None)
x_map = np.arange(len(x))

# Optional overlays of the two returns (y and z are kept for them):
#plt.scatter(x, y, c='orange', label='J on current τ')
#plt.scatter(x, z, label='J on original problem')
plt.scatter(x, k, label='Adv', c='r')
plt.xticks(x_map, x, rotation=90)

# Zero line: separates positive from negative 'Adv' values
plt.hlines(y=0, xmin=0, xmax=len(x), colors='g')
plt.xlabel("Current value of τ")
# Axis label and title describe what is actually plotted (the 'Adv' values, not returns)
plt.ylabel("Adv")
plt.title("Advantage for each τ")

plt.legend(loc="lower right")


In [None]:
# Report t_lim, which the advantage-plot cell sets to the tau of the first entry whose 'Adv' is positive.
print("The limit value of tau is:", t_lim)

In [None]:
# Compare, for every entry of `results` except the last, the empirical gain
# J_0 - J with the stored lower bound 'l_b'.
shown = results[:-1]
x = np.ravel([str(entry['tau']) for entry in shown])
y = np.ravel([entry['J_0'] - entry['J'] for entry in shown])
z = np.ravel([entry['l_b'] for entry in shown])
x_map = np.arange(len(x))

plt.scatter(x, y, label='J_0 - J_τ')
plt.scatter(x, z, label='Performance improvement lower bound', c='orange')

# Tick positions, labels and rotation set in a single call
plt.xticks(x_map, x, rotation=90)
plt.xlabel("Current value of τ")
plt.ylabel("E[J(τ)]")
plt.title("Performance improvement")

plt.legend(loc="lower right")


In [None]:
"""# Learning from scratch with different values of tau
for tau in taus:
    tmdp = TMDP(river, xi, tau, gamma)
    
    Q_star_tau = bellman_optimal_q(tmdp.nS, tmdp.nA, tmdp.P_mat_tau, tmdp.reward, 1e-4, tmdp.gamma)

    d = compute_d(tmdp.mu, tmdp.P_mat_tau, get_policy(Q_star_tau), tmdp.gamma)
    # Compute the gamma discounted state distribution
    delta = compute_delta(d, get_policy(Q_star_tau))
    # Compute the state value function
    V_star_tau = get_value_function(Q_star_tau) 
    # Compute the expected reward when picking action a in state s
    r_s_a = compute_r_s_a(nS, nA, tmdp.P_mat_tau, tmdp.reward)
    # Compute the state action next-state value function U_tau(s,a,s') = R(s,a) + \gamma*V_tau(s')
    U_star_tau = compute_state_action_nextstate_value_function(nS, nA, r_s_a, tmdp.gamma, V_star_tau)
    # Rebuild Q using U as Q_tau(s,a) = \sum{s' \in S}P_tau(s'|s,a)*U_tau(s,a,s')
    Q_t = compute_Q_from_U(tmdp.P_mat_tau, U_star_tau)


    # Compute the relative model advantage function hat \hat{A}_{tau, mu}(s,a)
    A_tau_hat = compute_relative_model_advantage_function_hat(tmdp.P_mat, tmdp.xi, U_star_tau)
    # Compute the discounted distribution relative model advantage function hat \hat{A}_{tau, mu}
    A_hat = compute_discounted_distribution_relative_model_advantage_function_hat(A_tau_hat, delta)
    # The dissimilarity term D = D_e * gamma * D_inf is upperbounded by 4*gamma+(tau - tau_1)
    # Compute Delta Q_tau as the superior among the difference of the L_1 norm of elements of Q_tau
    d_q = get_sup_difference_Q(Q_star_tau)
    
    # Compute the performance improvement lower bound when moving to tau=0
    l_b = compute_performance_improvement_lower_bound(A_hat, tmdp.gamma, d_q, tmdp.tau, 0.0)
    
    
    tau_1 = compute_tau_prime(A_hat, tmdp.gamma, tmdp.tau, d_q)
    tmdp_1 = tmdp = TMDP(river, xi, tau_1, gamma)
    print(tmdp.tau)
   
    print(tmdp_1.tau)
    print(tmdp.tau - A_hat*(1-gamma)/(4*gamma**2*d_q))

    J_star_tau = get_expected_avg_reward(tmdp.P_mat_tau, get_policy(Q_star_tau), tmdp.reward, tmdp.gamma, tmdp.mu)
    J_0 = get_expected_avg_reward(tmdp.P_mat, get_policy(Q_star_tau), tmdp.reward, tmdp.gamma, tmdp.mu)
    J_1 = get_expected_avg_reward(tmdp_1.P_mat_tau, get_policy(Q_star_tau), tmdp_1.reward, tmdp_1.gamma, tmdp_1.mu)

    print(tmdp.tau)
   

    print("Moving from {} to {}".format(tmdp.tau, 0.))
    print("Theoretical lb on performance improvement: {}, Empirical one: {}\n".format(l_b, J_0-J_star_tau))

    print("Moving from {} to {}".format(tmdp.tau, tmdp_1.tau))
    l_b_1 = compute_performance_improvement_lower_bound(A_hat, tmdp.gamma, d_q, tmdp.tau, tau_1)
    opt_lb = compute_optimal_lower_bound(A_hat, gamma, d_q)
    print("Theoretical lb on performance improvement: {}, Empirical one: {}".format(l_b_1, J_1-J_star_tau))
    print("\nThe lower bound for tau' is: {}".format(opt_lb))
    print("\n\n")
    
    results.append({"Q": Q_star_tau, "tau":tmdp.tau, "J":J_star_tau, "J_0":J_0, "policy":get_policy(Q_star_tau), "l_b":l_b})"""

In [None]:
results = []
bad = []
iterations = {}
# Learning from scratch with different values of tau: for each source tau, solve
# the tau-model and check the theoretical lower bound against the empirical
# return change for every smaller target tau in `taus`. Pairs where the
# empirical change falls below the bound are collected in `bad`.
for tau in taus:
    tmdp = TMDP(river, xi, tau, gamma)

    Q_star_tau, iterations[tau] = bellman_optimal_q(tmdp.nS, tmdp.nA, tmdp.P_mat_tau, tmdp.reward, 1e-4, tmdp.gamma)
    # Derive the policy once so that every evaluation below uses the same policy object
    pi_star_tau = get_policy(Q_star_tau)

    # Discounted visitation quantities under pi_star_tau
    # (presumably d is over states and delta over state-action pairs;
    # confirm against model_functions).
    d = compute_d(tmdp.mu, tmdp.P_mat_tau, pi_star_tau, tmdp.gamma)
    delta = compute_delta(d, pi_star_tau)
    # Compute the state value function
    V_star_tau = get_value_function(Q_star_tau)
    # Compute the expected reward when picking action a in state s
    r_s_a = compute_r_s_a(nS, nA, tmdp.P_mat_tau, tmdp.reward)
    # Compute the state action next-state value function U_tau(s,a,s') = R(s,a) + \gamma*V_tau(s')
    U_star_tau = compute_state_action_nextstate_value_function(nS, nA, r_s_a, tmdp.gamma, V_star_tau)
    # Rebuild Q using U as Q_tau(s,a) = \sum{s' \in S}P_tau(s'|s,a)*U_tau(s,a,s')
    # (Q_t is not read again in this cell; kept so the name stays available for inspection)
    Q_t = compute_Q_from_U(tmdp.P_mat_tau, U_star_tau)

    # Compute the relative model advantage function hat \hat{A}_{tau, mu}(s,a)
    A_tau_hat = compute_relative_model_advantage_function_hat(tmdp.P_mat, tmdp.xi, U_star_tau)
    # Compute the discounted distribution relative model advantage function hat \hat{A}_{tau, mu}
    A_hat = compute_discounted_distribution_relative_model_advantage_function_hat(A_tau_hat, delta)
    # The dissimilarity term D = D_e * gamma * D_inf is upperbounded by 4*gamma+(tau - tau_1)
    # Compute Delta Q_tau as the superior among the difference of the L_1 norm of elements of Q_tau
    d_q = get_sup_difference_Q(Q_star_tau)

    J_star_tau = get_expected_avg_reward(tmdp.P_mat_tau, pi_star_tau, tmdp.reward, tmdp.gamma, tmdp.mu)
    # Return on tmdp.P_mat and the bound for the move to tau=0, computed inside
    # this loop so the stored record never picks up values left over from
    # another iteration or another cell.
    J_0 = get_expected_avg_reward(tmdp.P_mat, pi_star_tau, tmdp.reward, tmdp.gamma, tmdp.mu)
    l_b_0 = compute_performance_improvement_lower_bound(A_hat, tmdp.gamma, d_q, tmdp.tau, 0.0)

    print("evaluating for tau: {}".format(tmdp.tau))
    for t in taus:
        # Only moves towards a strictly smaller tau are evaluated
        if t >= tau:
            continue
        tmdp_1 = TMDP(river, xi, t, gamma)
        J_1 = get_expected_avg_reward(tmdp_1.P_mat_tau, pi_star_tau, tmdp.reward, tmdp.gamma, tmdp.mu)

        # Compute the performance improvement lower bound when moving from tau to t
        l_b = compute_performance_improvement_lower_bound(A_hat, tmdp.gamma, d_q, tmdp.tau, tmdp_1.tau)
        gain = J_1 - J_star_tau
        print("Moving from {} to {}".format(tmdp.tau, tmdp_1.tau))
        print("Theoretical lb on performance improvement: {}, Empirical one: {}\n".format(l_b, gain))
        # The empirical change is below the bound: record the offending pair
        if gain < l_b:
            bad.append({'tau': tmdp.tau, 'tau_1': tmdp_1.tau, "J_star": J_star_tau, "J_1": J_1})
    print("\n")
    results.append({
        "Q": Q_star_tau,
        "tau": tmdp.tau,
        "J": J_star_tau,
        "J_0": J_0,
        "policy": pi_star_tau,
        "l_b": l_b_0,
    })

In [None]:
# List the (tau, tau_1) pairs recorded by the sweep because J_1 - J_star fell below the lower bound.
print(bad)