In [None]:
import numpy as np
from TMDP import TMDP
from model_functions import *
from policy_utils import *
import matplotlib.pyplot as plt
import gymnasium as gym
from algorithms import *
from FrozenLake import *
import pandas as pd
import constants
#np.set_printoptions(precision=4)
import math
from policy_utils import *
import torch
import mlflow
from CurriculumPMPO import CurriculumPMPO
import optuna
from experiment_result_utils import *
# --- Experiment configuration ------------------------------------------------
# Side length of the square map; nS is derived from it.
nrows = 20
nS = nrows ** 2
nA = 4  # presumably the number of actions — confirm against FrozenLakeEnv

# One seed from the shared constants module drives the policy utilities,
# the random map, the environment and the TMDP wrapper.
seed = constants.SEEDS[0]
set_policy_seed(seed)

# Values handed to TMDP below.
gamma = 0.999
tau = 0.6

# Reward options forwarded unchanged to FrozenLakeEnv.
num_bins = 10
dense_reward = True
shape_range = (-1, 0)
goal_reward = 1.0

# --- Environment and TMDP ----------------------------------------------------
env = FrozenLakeEnv(
    is_slippery=False,
    seed=seed,
    desc=generate_random_map(nrows, seed=seed),
    reward_shape=True,
    num_bins=num_bins,
    dense_reward=dense_reward,
    shape_range=shape_range,
    goal_reward=goal_reward,
)

# Uniform vector over the env.nS states (every entry equals 1 / env.nS).
xi = np.ones(env.nS) / env.nS
tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)

In [None]:
# Baseline: run bellman_optimal_q on the environment's P_mat / reward, derive V
# from the resulting Q and the policy get_policy(Q), and report the scalar
# compute_expected_j(V, mu) as the optimal performance.
res = bellman_optimal_q(
    tmdp.env.P_mat,
    tmdp.env.reward,
    tmdp.gamma,
)
Q = res["Q"]
V = compute_V_from_Q(Q, get_policy(Q))
j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)

In [None]:
# Disabled rendering snippet: the code below is wrapped in a bare string literal,
# so env.render_mode is not changed and env.render() is not called by this cell.
"""env.render_mode = "human"
env.render()"""

In [None]:
# Hyperparameters consumed by the curr_PMPO.train(...) call further down.
# NOTE(review): the six-decimal values look like tuned outputs (optuna is
# imported above) — confirm their provenance.
model_lr     = 0.373715
pol_lr       = 0.215723
temp         = 36.039501
entropy_coef = 0.4
eps_shift    = 0.002

epochs     = 3
batch_size = 20  # also used later to rescale the plotted rewards

debug = False

In [None]:
# Remaining arguments for curr_PMPO.train(...): run length, the final value
# paired with `temp`, and the lam / eps_ppo / param_decay settings.
episodes    = 1_900_000
final_temp  = 1
lam         = 1
eps_ppo     = 0.2
param_decay = True

In [None]:
# Build the trainer around the TMDP.
curr_PMPO = CurriculumPMPO(
    tmdp,
    checkpoint_step=500,
)

# Reset the teleport rate to the configured tau before training starts.
tmdp.update_tau(tau)

In [None]:
# Launch training with the hyperparameters defined in the cells above.
# max_length is the only value fixed inline here.
curr_PMPO.train(
    model_lr,
    pol_lr,
    batch_size=batch_size,
    lam=lam,
    temp=temp,
    final_temp=final_temp,
    episodes=episodes,
    eps_ppo=eps_ppo,
    eps_shift=eps_shift,
    param_decay=param_decay,
    epochs=epochs,
    max_length=1000,
    entropy_coef=entropy_coef,
    debug=debug,
)

In [None]:
# Compare the softmax policy built from curr_PMPO.theta (temperature 1e-4)
# with the optimal baseline.
# NOTE(review): V_curr is derived from the Q returned by bellman_optimal_q in
# the earlier cell, not from a Q evaluated under the learned policy — confirm
# this is the intended metric.
# A variant based on get_policy(curr_PMPO.Q) ("Q curriculum performance") was
# previously present here as commented-out code.
V_curr = compute_V_from_Q(
    Q,
    get_softmax_policy(curr_PMPO.theta, temperature=1e-4),
)

j_curr = compute_expected_j(V_curr, tmdp.env.mu)
j_opt = compute_expected_j(V, tmdp.env.mu)

print("optimal performance: ",j_opt)
print("curriculum performance: ", j_curr)


In [None]:
# Plot a trailing moving average of the rewards recorded during training.
# matplotlib.pyplot is already imported as `plt` in the first cell, so it is
# not re-imported here.
window = 50  # number of most recent records averaged at each point
rewards = curr_PMPO.reward_records

# Each point averages the last `window` records (fewer while less than
# `window` records exist) and is divided by batch_size, exactly as before.
# The clamped slice start replaces the previous if/else pair and the unused
# np.empty placeholder.
avg_rewards = [
    np.average(rewards[max(0, idx - window + 1):idx + 1]) / batch_size
    for idx in range(len(rewards))
]

fig, ax = plt.subplots()
ax.plot(avg_rewards)
ax.set_xlabel("Reward record index")
ax.set_ylabel("Average reward / batch_size")
ax.set_title(f"Training reward (trailing average over {window} records)")
plt.show()

In [None]:
# Policy used by the cells below: get_policy applied to the softmax policy of
# the learned parameters at temperature 1e-4.
# (Alternative kept for reference: pi = get_policy(Q).)
pi = get_policy(
    get_softmax_policy(curr_PMPO.theta, temperature=1e-4)
)

In [None]:
# Show, for index 0, the learned parameters followed by the extracted policy
# entry, one per line.
print(curr_PMPO.theta[0], pi[0], sep="\n")

In [None]:

# Test the parameter sets stored in curr_PMPO.thetas and plot the result.
# NOTE(review): 100 and 1e-5 are passed positionally; presumably an episode
# count and a temperature — confirm against test_policies' signature.
test_rewards = test_policies(tmdp, curr_PMPO.thetas, 100, 1e-5)
rewards_fig = plot_avg_test_return(
    [test_rewards],
    "PMPO Avg Return on runs",
)


In [None]:

# Roll out the extracted policy `pi` with tau set to 0, printing every
# transition, until the episode reports done or the step budget is exceeded.
tmdp.update_tau(0.)
tmdp.reset()
env.render_mode = "human"

done = False  # kept from the original cell; never updated below
step = 0
max_steps = min(100, nrows * 3)

# `step` counts completed non-terminal transitions; the loop stops once it
# exceeds max_steps, matching the previous post-increment check.
while step <= max_steps:
    s = tmdp.env.s
    a = select_action(pi[s])
    s_prime, reward, flags, prob = tmdp.step(a)
    print("state {} action {} next state {} reward {}".format(s, a, s_prime, reward))
    if flags["done"]:
        tmdp.reset()
        break
    step += 1
