In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
from mlflow.tracking import MlflowClient
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from FrozenLake import *
from CurriculumQ import CurriculumQ

In [None]:
# --- FrozenLake environment / reward settings ---
is_slippery, reward_shape = False, False
dense_reward = True
num_bins = 3
shape_range = (-1, 0)
goal_reward = 1.0

# --- Run length and bookkeeping ---
episodes = 7_900_000
test_episodes = 1_000
checkpoint_step = 500
debug = True

In [None]:
# Frozen Lake environment and MDP parameters.
nrows = 20                   # side length handed to generate_random_map
nS = nrows * nrows           # one entry per grid cell
nA = 4
gamma = 0.999999999          # forwarded to TMDP and compute_eps_model
tau = 0.0                    # forwarded to TMDP / update_tau
exp_rate = 0.4
xi = np.full(nS, 1.0 / nS)   # uniform vector of length nS (entries sum to 1)

In [None]:

# Hyper-parameters that are later forwarded to CurriculumQ.train.
model_lr = 0.15869281717397965
batch_size = 25
lam = 1
param_decay = True

# Project helper; receives gamma, tau and one tenth of the episode budget.
eps_model = compute_eps_model(gamma, tau, episodes / 10)

# Empty result containers (no cell visible in this notebook appends to them).
tests_returns = []
experiment_results = []

In [None]:
# One seed drives everything in this cell: it is handed to set_policy_seed,
# the environment, the random map generator and the TMDP wrapper.
seed = map_seed = constants.SEEDS[0]
set_policy_seed(seed)

# Build the FrozenLake environment from the notebook configuration.
env = FrozenLakeEnv(
    # Honour the `is_slippery` config flag instead of a duplicated literal,
    # so changing the configuration cell actually affects the environment.
    is_slippery=is_slippery,
    seed=seed,
    desc=generate_random_map(nrows, seed=map_seed),
    # NOTE(review): this is forced to True although the configuration cell
    # sets `reward_shape = False`. Kept as True to preserve current behaviour;
    # confirm which value is intended and wire it to the config variable.
    reward_shape=True,
    num_bins=num_bins,
    dense_reward=dense_reward,
    shape_range=shape_range,
    goal_reward=goal_reward,
)

# Environment independent configuration
tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)
tmdp.update_tau(tau)

In [None]:
# Build the CurriculumQ learner around the TMDP created in the previous cell.
curr_Q = CurriculumQ(tmdp)
tmdp.update_tau(tau) # Reset the teleport rate        

In [None]:
# Show the learner's Q attribute before train() is called.
print(curr_Q.Q)

In [None]:
# Run training with the hyper-parameters defined in the earlier cells.
# NOTE(review): exp_rate is the literal 0.3 here, while the parameter cell
# defines `exp_rate = 0.4` — confirm which exploration rate is intended.
curr_Q.train(
    model_lr,
    batch_size=batch_size,
    lam=lam,
    exp_rate=0.3,
    episodes=episodes,
    eps_model=eps_model,
    param_decay=param_decay,
)

In [None]:
# Show the learner's Q attribute after train() has run.
print(curr_Q.Q)

In [None]:
"""
delta_pol = get_policy(Q)-get_policy(curr_Q.Q)
print(delta_pol)

r_s_a = compute_r_s_a(tmdp.env.P_mat, tmdp.env.reward)

V_Q = compute_V_from_Q(Q, get_policy(curr_Q.Q))

j_q_curr = compute_expected_j(V_Q, tmdp.env.mu)
j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)
print("Q curriculum performance: ",j_q_curr)
"""

In [None]:
# Derive a policy from the trained Q attribute via the project helper get_policy.
pi = get_policy(curr_Q.Q)

In [None]:

# Roll out the extracted policy `pi` once, with the teleport rate set to 0,
# printing every transition.
tmdp.update_tau(0.)
tmdp.reset()
env.render_mode = "human"

# Safety cap so a policy that never reaches a terminal state cannot loop forever.
max_steps = max(100, int(nrows*2.5))
step = 0
while True:
    s = tmdp.env.s
    # Look up the allowed actions of the *current* state. The previous code
    # always indexed entry 0, i.e. used state 0's action set for every state.
    # (Assumes allowed_actions is indexed by state — TODO confirm.)
    a = greedy(s, pi, tmdp.env.allowed_actions[int(s)])
    s_prime, reward, flags, prob = tmdp.step(a)
    print("state {} action {} next state {} reward {}".format(s, a, s_prime, reward))
    if flags["done"]:
        # Episode finished: leave the environment in a freshly reset state.
        tmdp.reset()
        break
    step += 1
    if step > max_steps:
        break


In [None]:
# Print the learner's Q attribute once more, after the rollout cell.
print(curr_Q.Q)

In [None]:
# Scratch check: matrix product of two 1-D arrays is their inner product.
# Note that this rebinds `a`, which the rollout cell also uses for the action.
a, b = np.array([1, 2]), np.array([4, 1])
a @ b