In [None]:
import numpy as np
from TMDP import TMDP
from model_functions import *
from policy_utils import *
import matplotlib.pyplot as plt
import gymnasium as gym
from algorithms import *
from FrozenLake import *
import pandas as pd
import constants
#np.set_printoptions(precision=4)
import math
import torch
import mlflow
from CurriculumPMPO import CurriculumPMPO
import optuna
from RiverSwim import RiverSwim

# --- Environment parameters -------------------------------------------------
nS = 200      # number of states
nA = 2        # number of actions
gamma = .99   # discount factor handed to the TMDP

# mu: uniform mass 1/(nS-2) on every state except the first and last, which
# are zeroed out (presumably the initial-state distribution -- verify against
# RiverSwim's constructor).
mu = np.full(nS, 1 / (nS - 2))
mu[0] = 0
mu[-1] = 0

# xi: uniform distribution over all nS states, passed as the TMDP's second
# argument (presumably the teleport distribution -- TODO confirm).
xi = np.full(nS, 1 / nS)

# Seed the policy utilities before the environment is built.
seed = constants.SEEDS[0]
set_policy_seed(seed)

river = RiverSwim(nS, mu, small=5, large=1000, seed=seed)

# tau is referred to elsewhere in this notebook as the teleport rate.
tau = .5
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)

In [None]:
# Solve for the optimal action values on the underlying environment, then
# derive V under the policy returned by get_policy(Q) and the resulting
# expected performance under the environment's mu.
res = bellman_optimal_q(tmdp.env.P_mat, tmdp.env.reward, tmdp.gamma)
Q = res["Q"]

greedy_policy = get_policy(Q)
V = compute_V_from_Q(Q, greedy_policy)

j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)

In [None]:
# Inspect the model epsilon for a fixed horizon argument of 50k.
# This value is only printed here; the hyperparameter cell below
# reassigns eps_model before training.
eps_model = compute_eps_model(gamma, tau, 50_000)
print("eps_model: {}".format(eps_model))


In [None]:
# --- Training hyperparameters -----------------------------------------------
episodes = 6_000_000

# Learning rates and initial temperature (high-precision literals; they look
# like tuned values -- provenance not recorded in this notebook).
model_lr = 0.2888499667800339
pol_lr = 0.11386852281441003
temp = 5.899275492134674

final_temp = 1e-4
param_decay = True
lam = 1
batch_size = 32
epochs = 1

eps_ppo = 0.2
# NOTE(review): episodes/10 is a float (600000.0), whereas the earlier call
# passes an int; confirm compute_eps_model accepts a float here.
eps_model = compute_eps_model(gamma, tau, episodes/10)

In [None]:
# Build the curriculum trainer on top of the teleporting MDP.
curr_PMPO = CurriculumPMPO(tmdp)

# Reset the teleport rate to its configured starting value.
tmdp.update_tau(tau)

In [None]:
# Run training with the hyperparameters defined in the configuration cell.
curr_PMPO.train(
    model_lr,
    pol_lr,
    batch_size=batch_size,
    lam=lam,
    temp=temp,
    final_temp=final_temp,
    episodes=episodes,
    eps_ppo=eps_ppo,
    eps_model=eps_model,
    param_decay=param_decay,
    epochs=epochs,
)

In [None]:

# Compare the learned softmax policy against the optimal performance.
# NOTE(review): V_curr is built from the *optimal* Q combined with the learned
# policy, not from the learned policy's own action values -- confirm this is
# the intended performance measure.
# NOTE(review): temperature=1 here differs from the 1e-4 used when the policy
# is printed in the next cell; confirm which temperature should be evaluated.
learned_policy = get_softmax_policy(curr_PMPO.theta, temperature=1)
V_curr = compute_V_from_Q(Q, learned_policy)

j_curr = compute_expected_j(V_curr, tmdp.env.mu)
j_opt = compute_expected_j(V, tmdp.env.mu)

print("optimal performance: ",j_opt)
print("curriculum performance: ", j_curr)

In [None]:
# Show the learned policy at a near-zero temperature next to the policy
# that get_policy derives from the optimal Q.
pi = get_softmax_policy(curr_PMPO.theta, temperature=1e-4)
print(pi)

optimal_policy = get_policy(Q)
print(optimal_policy)

In [None]:
# Trailing moving average of the recorded rewards, each divided by batch_size.
# matplotlib and numpy are already imported in the notebook's first cell.
reward_records = curr_PMPO.reward_records

# Number of most recent records averaged at each point; the first
# REWARD_WINDOW - 1 points average over everything seen so far.
REWARD_WINDOW = 500

# Raw records divided by batch_size (kept for the optional plot below).
scaled_reward = [record / batch_size for record in reward_records]

average_reward = []
for idx in range(len(reward_records)):
    start = max(0, idx - REWARD_WINDOW + 1)
    window = reward_records[start:idx + 1]
    average_reward.append(np.average(window) / batch_size)

# Plot
#plt.plot(scaled_reward)
# NOTE(review): the final point is excluded from the plot; confirm this is
# intentional.
plt.plot(average_reward[:-1])
plt.show()

In [None]:
# Trailing moving average of the expected performances recorded by the trainer.
# matplotlib and numpy are already imported in the notebook's first cell.
exp_performances = curr_PMPO.exp_performances

# Number of most recent records averaged at each point; the first
# PERF_WINDOW - 1 points average over everything seen so far.
PERF_WINDOW = 500

avg_performances = []
for idx in range(len(exp_performances)):
    start = max(0, idx - PERF_WINDOW + 1)
    avg_performances.append(np.average(exp_performances[start:idx + 1]))

# Plot
#plt.plot(exp_performances)
plt.plot(avg_performances)
plt.xlabel("episodes")
plt.ylabel("Expected Performance")
plt.title("Expected Performance")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

exp_performances = curr_PMPO.exp_performances

# For each index, summarise the trailing window of up to 500 records with its
# mean and the standard error of that mean (np.std with its default ddof,
# divided by sqrt of the window length).
avg_performances = []
std_errors = []
for idx in range(len(exp_performances)):
    window = exp_performances[max(0, idx - 499):idx + 1]
    avg_performances.append(np.mean(window))
    std_errors.append(np.std(window) / np.sqrt(len(window)))

# Band of +/- 1.96 standard errors around the moving average.
# NOTE(review): consecutive windows overlap, so treating this as a 95%
# confidence interval assumes independent samples -- confirm that is acceptable.
ci = 1.96
upper_bound = [mean + ci * err for mean, err in zip(avg_performances, std_errors)]
lower_bound = [mean - ci * err for mean, err in zip(avg_performances, std_errors)]

# Plot
plt.figure(figsize=(10, 5))
#plt.plot(exp_performances, label='Exp Performances')
plt.plot(avg_performances, label='Moving Average', color='r')
plt.fill_between(
    range(len(avg_performances)),
    lower_bound,
    upper_bound,
    color='r',
    alpha=0.2,
    label='95% Confidence Interval',
)
plt.legend()
plt.title('Experiment Performances with Moving Average and Confidence Interval')
plt.xlabel('Index')
plt.ylabel('Performance')
plt.show()
