In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from FrozenLake import *
from CurriculumMPI import CurriculumMPI

In [None]:
# --- Frozen Lake environment configuration ---

# Square grid: nrows cells per side, one state per cell.
nrows = 10
nS = nrows ** 2
nA = 4  # presumably the number of actions — confirm against FrozenLakeEnv

# gamma and tau are both handed to the TMDP constructor further down.
gamma = 0.99
tau = 0.6

# Uniform vector of length nS (every entry equals 1/nS), passed to TMDP as xi.
xi = np.ones(nS) / nS

# Environment options.
is_slippery = False
reward_shape = True
num_bins = 10

# Checkpoint / evaluation settings.
checkpoint_step = 500
test_episodes = 1000


In [None]:
# One seed (the first entry of SEEDS) is shared by the policy utilities, the
# random map, the environment and the TMDP wrapper.
# NOTE(review): `constants` must be reachable as a name here; the visible
# imports only do `from constants import *` — confirm that one of the star
# imports actually exposes it.
seed = constants.SEEDS[0]
map_seed = seed
set_policy_seed(seed)

# NOTE(review): the configuration cell sets `reward_shape = True`, yet this
# call has always passed reward_shape=False. The literal is kept on purpose so
# results do not silently change — confirm which value is intended.
env = FrozenLakeEnv(
    is_slippery=is_slippery,  # use the configured value, not a duplicated literal
    seed=seed,
    desc=generate_random_map(nrows, seed=map_seed),
    reward_shape=False,
    num_bins=num_bins,
)

# Environment independent configuration
tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)
# tau is applied once more explicitly after construction.
tmdp.update_tau(tau)

In [None]:
# Reference solution: solve for the optimal Q on the environment's transition
# matrix and reward, then derive V and the expected return under env.mu.
res = bellman_optimal_q(
    tmdp.env.P_mat,
    tmdp.env.reward,
    tmdp.gamma,
)
Q = res["Q"]

# Policy extracted from Q by get_policy (presumably greedy — confirm in policy_utils).
_pi_from_Q = get_policy(Q)
V = compute_V_from_Q(Q, _pi_from_Q)

j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)

In [None]:
# --- Training hyperparameters (consumed by the CurriculumMPI.train call) ---

# Number of training episodes.
episodes = 2_900_000

# Learning rates, passed positionally to train() as (model_lr, pol_lr).
model_lr = 0.25869281717397965
pol_lr = 0.243385176875263757

# Starting and final temperature values forwarded to train().
temp = 5
final_temp = 1e-5

lam = 1
batch_size = 5
epochs = 1  # NOTE(review): not forwarded to the visible train() call

# Boolean switches forwarded to train(). `param_decay` used to be assigned
# twice in this cell with the same value; it is now set exactly once.
check_convergence = False
param_decay = True
biased = False

In [None]:
# Build the curriculum trainer around the TMDP and run training with the
# hyperparameters defined in the configuration cell.
curr_MPI = CurriculumMPI(tmdp)

# Keyword arguments are collected first so the call itself stays short.
_train_kwargs = dict(
    batch_size=batch_size,
    lam=lam,
    temp=temp,
    final_temp=final_temp,
    episodes=episodes,
    check_convergence=check_convergence,
    param_decay=param_decay,
    biased=biased,
)
curr_MPI.train(model_lr, pol_lr, **_train_kwargs)

In [None]:
# Compare the learned curriculum policy against the optimal baseline.

# Softmax policy built from the learned parameters at a very low temperature.
_curr_policy = get_softmax_policy(curr_MPI.theta, temperature=1e-5)

# NOTE(review): V_curr is computed from `Q` of the Bellman-optimal solve, not
# from a Q estimated for the learned policy — confirm this is the intended
# evaluation. (A previously commented-out variant used get_policy(curr_MPI.Q)
# as the policy instead.)
V_curr = compute_V_from_Q(Q, _curr_policy)

_mu = tmdp.env.mu
j_curr = compute_expected_j(V_curr, _mu)
j_opt = compute_expected_j(V, _mu)

print("optimal performance: ",j_opt)
print("curriculum performance: ", j_curr)


In [None]:
# Trailing-window mean of the recorded training rewards, divided by batch_size.
# For the first _WINDOW - 1 records the window is simply everything seen so far.
_WINDOW = 50
rewards = curr_MPI.reward_records

avg_rewards = [
    np.average(rewards[max(0, idx - _WINDOW + 1):idx + 1]) / batch_size
    for idx in range(len(rewards))
]

# Plot the smoothed curve; labels make the figure readable on its own.
fig, ax = plt.subplots()
ax.plot(avg_rewards)
ax.set_xlabel("reward record index")
ax.set_ylabel(f"trailing {_WINDOW}-record mean reward / batch_size")
ax.set_title("CurriculumMPI training reward (moving average)")
plt.show()

In [None]:
# Sharpen the learned parameters into a low-temperature softmax policy, then
# run it through get_policy to obtain the final policy `pi`.
_sharp_policy = get_softmax_policy(curr_MPI.theta, temperature=1e-4)
pi = get_policy(_sharp_policy)
# Alternative kept for reference: derive the policy from the optimal Q instead,
# i.e. pi = get_policy(Q).

In [None]:
# Quick sanity check: show the learned parameters and the extracted policy
# entry at index 0, one per line.
for _entry in (curr_MPI.theta[0], pi[0]):
    print(_entry)