In [1]:
import numpy as np
from TMDP import TMDP
from River_swim import River

from algorithms import *
from model_functions import *

import matplotlib.pyplot as plt
import autograd.numpy as np
from autograd import grad, jacobian, elementwise_grad

#np.set_printoptions(precision=4)
import math
from utils import *

# --- Experiment configuration ------------------------------------------------
nS = 8            # length of the uniform vectors below; first argument to River
nA = 2            # not referenced by any of the cells shown here
seed = 2184109    # forwarded as `seed=` to River and to both TMDP objects
gamma = 0.9       # forwarded as `gamma=` to River and to both TMDP objects

# Uniform vector (every entry equal to 1/nS), passed to River as its second argument.
mu = np.ones(nS) / nS
river = River(nS, mu, gamma=gamma, small=5, large=1000, seed=seed)

# A second uniform vector, passed to TMDP alongside tau.
tau = 1.0
xi = np.ones(nS) / nS

# Two TMDP objects built on the same `river`; they differ only in tau (1.0 vs 0.0).
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)
mdp = TMDP(river, xi, tau=0.0, gamma=gamma, seed=seed)
# NOTE(review): leftover remark "argnum=5 corresponds to the position of 'tau'".
# Presumably it refers to autograd's grad(..., argnum=5) on a function whose
# sixth positional argument is tau; no such call appears in this cell -- confirm or remove.



Current seed for result reproducibility: 83037765519772580130275700911666628504


In [2]:
def numerical_gradient(func, mu, P_mat, xi, pi, gamma, tau, h=1e-5):
    """Central finite-difference estimate of the derivative of `func` w.r.t. tau.

    `func` is invoked twice as ``func(mu, P_mat, xi, pi, gamma, t)``: first with
    ``t = tau + h`` and then with ``t = tau - h``. All other arguments are
    forwarded unchanged.

    Returns ``(func(..., tau + h) - func(..., tau - h)) / (2 * h)``; the result
    has whatever type subtraction and division of `func`'s return values yield.
    """
    fixed_args = (mu, P_mat, xi, pi, gamma)
    value_plus = func(*fixed_args, tau + h)
    value_minus = func(*fixed_args, tau - h)
    return (value_plus - value_minus) / (2 * h)

In [3]:
def numerical_gradient_q(func, P_mat, xi, reward, gamma, tau, h=1e-5):
    """Central finite-difference estimate of d Q / d tau.

    `func` is invoked as ``func(P_mat, xi, reward, gamma, t)`` and must return a
    mapping with a ``"Q"`` entry. It is evaluated first at ``t = tau + h`` and
    then at ``t = tau - h``; the two ``"Q"`` entries are differenced and divided
    by ``2 * h``.

    Side effect: prints the value of `tau` before evaluating `func`.
    """
    print("This is:",tau)
    q_plus = func(P_mat, xi, reward, gamma, tau + h)["Q"]
    q_minus = func(P_mat, xi, reward, gamma, tau - h)["Q"]
    return (q_plus - q_minus) / (2 * h)

In [4]:
# Sweep tau over the grid 0.99, 0.98, ..., 0.0. At every grid point, rebuild the
# TMDP and print the analytical and finite-difference gradients side by side,
# both for the discounted state visit distribution and for the Q function.
step = 0.01
tau_start = 0.99

# Each tau is derived from an integer index rather than by repeatedly
# subtracting `step`. Repeated float subtraction accumulates rounding error, so
# grid values drift away from the intended hundredths, and an exact `tau == 0`
# exit test on the accumulated value can see a tiny non-zero residue and run one
# extra near-zero iteration.
n_steps = int(round(tau_start / step))
for tau_idx in range(n_steps + 1):
    # round(..., 10) snaps the value to the intended decimal grid point;
    # max(0.0, ...) guards against a "-0.0" / tiny negative last value.
    tau = max(0.0, round(tau_start - tau_idx * step, 10))
    print("tau", tau)
    tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)

    # Q-function gradient w.r.t. tau: finite-difference estimate vs. closed form.
    Q_star = bellman_optimal_q_tau(tmdp.P_mat, tmdp.xi, tmdp.reward, tmdp.gamma, tmdp.tau)["Q"]
    numerical_grad_q = numerical_gradient_q(bellman_optimal_q_tau, tmdp.P_mat, tmdp.xi, tmdp.reward, tmdp.gamma, tmdp.tau)
    analytical_grad_q = compute_gradient_q_tau(tmdp.P_mat, tmdp.xi, tmdp.reward, tmdp.mu, tmdp.gamma, tmdp.tau)

    # Policy obtained from Q_star; used for the state-distribution quantities below.
    pi = get_policy(Q_star)

    print("Discounted state visit distribution:")
    d = compute_d_from_tau(tmdp.mu, tmdp.P_mat, tmdp.xi, pi, tmdp.gamma, tmdp.tau)
    print(*d)

    grad_d = compute_grad_d(tmdp.P_mat, tmdp.P_mat_tau, tmdp.xi, tmdp.mu, pi, tmdp.gamma)
    print("Analytical gradient:")
    print(*grad_d)

    numerical_grad = numerical_gradient(compute_d_from_tau, tmdp.mu, tmdp.P_mat, tmdp.xi, pi, tmdp.gamma, tmdp.tau)
    print("Numerical gradient:")
    print(*numerical_grad)

    print("Gradient of Q function:")
    print("Analytical: ", analytical_grad_q)
    print("Numerical: ", numerical_grad_q)

    # Blank separator between grid points, but not after the last one (tau == 0).
    if tau_idx < n_steps:
        print("\n")
    


tau 0.99
This is: 0.99
Discounted state visit distribution:
0.12613521620753265 0.12499991796276269 0.12499088475140531 0.1239871946005924 0.12466066732488325 0.12499969036543723 0.1256758174296605 0.12455061135772592
Analytical gradient:
-0.11455237056542926 2.461365837565801e-05 0.0018233260711583486 0.10131124574349758 0.03411755859669498 6.252066567690325e-05 -0.06766423378312078 0.04487733961314676
Numerical gradient:
-0.11455237057672595 2.461366666040021e-05 0.0018233260699884732 0.10131124574333515 0.03411755859564214 6.252066434009329e-05 -0.0676642337840705 0.04487733961111572
Gradient of Q function:
Analytical:  [[-154.82699513 -150.44018263]
 [-150.45199513 -150.41493257]
 [-150.41262013 -150.40773032]
 [-150.40720326 -150.40715994]
 [-150.40715451 -150.40740837]
 [-150.40715635 -150.50081583]
 [-150.40799702 -151.15928927]
 [-150.71767328 -326.19025689]]
Numerical:  [[-140.96029265 -136.39291956]
 [-136.5852947  -136.00481465]
 [-135.94404623 -135.93393343]
 [-135.93285959