In [None]:
import numpy as np
from TMDP import TMDP
from River_swim import River

#from algorithms import *
from PG_algorithms import *
from model_functions import *
import gym
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.nn import functional as F

#np.set_printoptions(precision=4)
import math
from utils import *
from ActorCritic import *
from ReplayBuffer import ReplayBuffer

# --- Experiment configuration -------------------------------------------------
nS = 15  # size argument handed to River, ActorNet and QNet
nA = 2   # second size argument handed to ActorNet and QNet

# NOTE(review): the value returned by get_current_seed() is overwritten by the
# hard-coded seed right below; the call is kept in case it has side effects.
seed = get_current_seed()
seed = 44697628841978080856580175700798794719

gamma = 0.9
tau = 1.0

# Uniform vectors of length nS (every entry equals 1/nS).
mu = np.ones(nS) / nS
xi = np.ones(nS) / nS

river = River(nS, mu, small=5, large=1000, seed=seed)

# Three TMDP wrappers around the same River instance; they differ only in tau.
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)
mdp = TMDP(river, xi, tau=0.0, gamma=gamma, seed=seed)
tmdp_1 = TMDP(river, xi, tau=0.5, gamma=gamma, seed=seed)


In [None]:
# Replay buffer holding at most 1024 entries, seeded with the experiment seed.
rep_buffer = ReplayBuffer(
    max_size=1024,
    input_shape=(1,),
    seed=seed,
)

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two actor networks with identical architecture, built in this order.
ref_policy, policy_pi = (
    ActorNet(nS, nA, hidden_dim=256).to(device) for _ in range(2)
)

# Four Q-networks with identical architecture: the two online critics first,
# then the two target critics (same construction order as before).
q1_func, q2_func, q1_target, q2_target = (
    QNet(nS, nA, hidden_dim=256).to(device) for _ in range(4)
)

In [None]:
# One Adam optimizer (lr=1e-3) each for the reference policy and the two
# online Q-networks. policy_pi and the target Q-networks get no optimizer here.
ref_opt, q1_opt, q2_opt = (
    torch.optim.Adam(net.parameters(), lr=1e-3)
    for net in (ref_policy, q1_func, q2_func)
)


In [None]:

# Set tau to 0.5 on the TMDP and reset it before launching training.
tmdp.update_tau(0.5)
tmdp.reset()

# Run curriculum_SAC; its result is kept in cur_res, whose "reward_records"
# entry is read by the plotting cell further down.
cur_res = curriculum_SAC(
    tmdp,
    policy_pi,
    ref_policy,
    q1_target,
    q2_target,
    q1_func,
    q2_func,
    ref_opt,
    q1_opt,
    q2_opt,
    rep_buffer,
    alpha=0.2,
    alpha_u=0.2,
    beta=0.005,
    episodes=1500000,
    batch_size=256,
    sample_steps=8,
    update_rate=64,
    biased=False,
)

In [None]:
# Show the reference policy's probabilities, then the policy that get_policy
# derives from q1_func's values, one per line.
print(
    ref_policy.get_probabilities(),
    get_policy(q1_func.get_values()),
    sep="\n",
)

In [None]:
# --- Compare the learned reference policy against the Bellman-optimal one ----

# Q returned by bellman_optimal_q on the original environment's model.
res = bellman_optimal_q(tmdp.env.P_mat, tmdp.env.reward, tmdp.gamma)
Q = res["Q"]

# d: compute_d_from_tau on the environment's own transition matrix with the
# policy derived from Q.
d = compute_d_from_tau(
    tmdp.env.mu, tmdp.env.P_mat, tmdp.xi,
    get_policy(Q),
    tmdp.gamma, 0.0,
)
# d_curr: same call for the policy derived from ref_policy.
# NOTE(review): this one passes tmdp.P_mat_tau, whereas d uses tmdp.env.P_mat;
# confirm that the asymmetry is intended.
d_curr = compute_d_from_tau(
    tmdp.env.mu, tmdp.P_mat_tau, tmdp.xi,
    get_policy(ref_policy.get_probabilities()),
    tmdp.gamma, 0.0,
)

print(get_policy(Q))
print(get_policy(ref_policy.get_probabilities()))

r_s_a = compute_r_s_a(tmdp.env.P_mat, tmdp.env.reward)

# Performance values from compute_j for each policy.
j_opt = compute_j(r_s_a, get_policy(Q), d, tmdp.gamma)
j_curr = compute_j(
    r_s_a,
    get_policy(ref_policy.get_probabilities()),
    d_curr,
    tmdp.gamma,
)
print("optimal performance: ",j_opt, "curriculum performance: ",j_curr)

In [None]:
import matplotlib.pyplot as plt

# Trailing moving average of the rewards recorded by curriculum_SAC.
# For each index the window covers the most recent `window` records (fewer at
# the start, where not enough history exists yet).
window = 50
reward_records = cur_res["reward_records"]

average_reward = []
for idx in range(len(reward_records)):
    # Clamp the window start at 0 so the first `window - 1` points average
    # over whatever history is available.
    avg_list = reward_records[max(0, idx - window + 1):idx + 1]
    average_reward.append(np.average(avg_list))

# Plot the smoothed curve only; the raw records are not drawn.
plt.plot(average_reward)
plt.xlabel("Record index")
plt.ylabel(f"Average reward (last {window} records)")
plt.title("Moving average of reward_records")
