In [None]:
import numpy as np
from TMDP import TMDP
from RiverSwim import RiverSwim

from algorithms import *
from model_functions import *

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import functional as F

from CurriculumMPI import CurriculumMPI

#np.set_printoptions(precision=4)
import math
from utils import *

# Problem dimensions. nS is passed to RiverSwim and sizes the two uniform
# vectors below; nA is not read by any cell visible here
# (presumably the number of actions — TODO confirm).
nS = 15
nA = 2
# NOTE(review): the value returned by get_current_seed() is overwritten on the
# very next line, so every run uses the fixed seed below. The call is dead
# unless get_current_seed() has side effects — confirm before removing it.
seed = get_current_seed()
seed = 47146882006333120128184314011326542902
gamma = .9
# Uniform vector of length nS (every entry equals 1/nS), handed to RiverSwim
# (presumably the initial-state distribution — TODO confirm).
mu = np.ones(nS) * 1/nS
river = RiverSwim(nS, mu, small=5, large=1000, seed=seed)
tau = .5
# Second uniform vector of length nS, handed to every TMDP wrapper below
# (presumably the teleport distribution — TODO confirm).
xi = np.ones(nS) * 1/nS
# Three wrappers around the same `river` object that differ only in tau:
#   tmdp   -> tau = tau (0.5); this is the one passed to CurriculumMPI
#   mdp    -> tau = 0;         used later to compute the reference Q
#   tmdp_1 -> tau = 0.5 hard-coded
# NOTE(review): tmdp_1 currently receives the same arguments as tmdp and is
# not used by any cell visible here.
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)
mdp = TMDP(river, xi, tau=0., gamma=gamma, seed=seed)
tmdp_1 = TMDP(river, xi, tau=.5, gamma=gamma, seed=seed)
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Hyperparameters consumed by the curr_MPI.train(...) call further down.

# Learning rates, passed to train() as its first two positional arguments.
model_lr, pol_lr = 0.026563586934043884, 0.051995695341482906

# Training length and batch size.
episodes = 1_000_000
batch_size = 11

# Values forwarded as temp= / final_temp= and lam= respectively.
temp, final_temp = 25.73110109982855, 0.02
lam = 0.0

In [None]:
# Alternative hyperparameter set kept for reference only. This is a bare dict
# expression: running the cell just displays it and does NOT change model_lr,
# pol_lr, batch_size, lam or temp as assigned in the configuration cell.
# NOTE(review): presumably the output of a hyperparameter search — confirm,
# and assign the values explicitly if they are meant to be used.
{'model_lr': 0.15869281717397965,
 'pol_lr': 0.13385176875263757,
 'batch_size': 2,
 'lam': 0.7701417865047747,
 'temp': 8.376693546196126}

In [None]:
# Instantiate the CurriculumMPI trainer for `tmdp` (the tau = 0.5 wrapper),
# passing the torch device selected in the setup cell.
curr_MPI = CurriculumMPI(tmdp, device=device)

In [None]:
# Run training with the hyperparameters from the configuration cell.
# `episodes` is taken from that cell instead of repeating the literal here, so
# changing the configuration actually changes the length of the run.
curr_MPI.train(
    model_lr,
    pol_lr,
    batch_size=batch_size,
    lam=lam,
    temp=temp,
    final_temp=final_temp,
    episodes=episodes,
    check_convergence=False,
    param_decay=True,
)

In [None]:
# Reference solution computed on `mdp` (the tau = 0 wrapper): Q comes from
# bellman_optimal_q (presumably the Bellman-optimal Q — confirm) and V is the
# value vector for the policy that get_policy derives from that Q.
res = bellman_optimal_q(mdp.env.P_mat, mdp.env.reward, mdp.gamma)
Q = res["Q"]
V = compute_V_from_Q(Q, get_policy(Q))
#print(Q)

# Element-wise difference between the policy derived from the reference Q and
# the policy derived from the learned softmax parameters; an entry of 0 means
# the two agree at that position.
delta_pol = get_policy(Q)-get_policy(get_softmax_policy(curr_MPI.theta))
print(delta_pol)

# NOTE(review): r_s_a is not used by any cell visible here.
r_s_a = compute_r_s_a(mdp.env.P_mat, mdp.env.reward)

# NOTE(review): every value vector below is built from the reference Q above,
# combined with a different policy each time (from the learned Q, from theta,
# from theta_ref). None of them uses the learned Q values themselves — confirm
# that this is the intended performance measure.
V_Q = compute_V_from_Q(Q, get_policy(curr_MPI.Q))

V_curr = compute_V_from_Q(Q, get_softmax_policy(curr_MPI.theta))
V_ref = compute_V_from_Q(Q, get_softmax_policy(curr_MPI.theta_ref))

# Collapse each value vector to a scalar with compute_expected_j and
# tmdp.env.mu (presumably an expectation under the initial-state
# distribution — TODO confirm).
j_curr = compute_expected_j(V_curr, tmdp.env.mu)
j_curr_ref = compute_expected_j(V_ref, tmdp.env.mu)
j_q_curr = compute_expected_j(V_Q, tmdp.env.mu)
j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)
print("curriculum performance: ", j_curr)
print("reference curriculum performance: ",j_curr_ref)
print("Q curriculum performance: ",j_q_curr)

#print(get_softmax_policy(curr_MPI.theta))

In [None]:
# Print the last element of curr_MPI.Qs
# (presumably the most recent Q estimate recorded during training — confirm).
print(curr_MPI.Qs[-1])

In [None]:
import matplotlib.pyplot as plt

# Smooth the recorded rewards with a trailing moving average so the trend is
# readable; the raw trace can be re-enabled with the commented-out plot below.
reward_records = curr_MPI.reward_records
reward_window = 500  # maximum number of most recent entries averaged per point

average_reward = []
for idx in range(len(reward_records)):
    # The window ends at idx (inclusive); it is shorter than reward_window
    # until that many entries are available.
    window_start = max(0, idx - reward_window + 1)
    average_reward.append(np.average(reward_records[window_start:idx + 1]))

# Plot
#plt.plot(reward_records)
plt.plot(average_reward)

In [None]:
import matplotlib.pyplot as plt

# Trailing mean of the performance trace over at most the 50 most recent
# entries; `end` is the exclusive end index of each window.
exp_performances = curr_MPI.exp_performances
n_points = len(exp_performances)
avg_performances = [
    np.average(exp_performances[max(0, end - 50):end])
    for end in range(1, n_points + 1)
]

# Overlay the raw trace with its smoothed version.
plt.plot(exp_performances)
plt.plot(avg_performances)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Trailing-window mean of the performance trace with a normal-approximation
# 95% band (mean +/- 1.96 * SEM) computed inside each window.
# NOTE(review): consecutive windows overlap, so treat the band as a
# descriptive spread indicator rather than a strict confidence interval.
exp_performances = curr_MPI.exp_performances
window_size = 50  # maximum number of most recent entries per window
ci = 1.96  # Z-score for 95% confidence

avg_performances = []
std_errors = []

for idx in range(len(exp_performances)):
    # The window ends at idx (inclusive); it is shorter than window_size
    # until that many entries are available.
    avg_list = exp_performances[max(0, idx - window_size + 1):idx + 1]
    n_samples = len(avg_list)

    # Standard error of the mean uses the sample standard deviation (ddof=1);
    # the population form (ddof=0) underestimates it. A single-sample window
    # has no spread estimate, so report 0 there instead of the NaN that
    # ddof=1 would produce.
    if n_samples > 1:
        sem = np.std(avg_list, ddof=1) / np.sqrt(n_samples)
    else:
        sem = 0.0

    avg_performances.append(np.average(avg_list))
    std_errors.append(sem)

# Upper and lower bounds of the 95% band around the moving average.
upper_bound = [avg + ci * sem for avg, sem in zip(avg_performances, std_errors)]
lower_bound = [avg - ci * sem for avg, sem in zip(avg_performances, std_errors)]

# Plot
plt.figure(figsize=(10, 5))
#plt.plot(exp_performances, label='Exp Performances')
plt.plot(avg_performances, label='Moving Average', color='r')
plt.fill_between(range(len(avg_performances)), lower_bound, upper_bound, color='r', alpha=0.2, label='95% Confidence Interval')
plt.legend()
plt.title('Experiment Performances with Moving Average and Confidence Interval')
plt.xlabel('Index')
plt.ylabel('Performance')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Trailing-window mean of the performance trace, shaded with the minimum and
# maximum observed inside the same window.
exp_performances = curr_MPI.exp_performances
window_size = 500  # maximum number of most recent entries per window

avg_performances = []
min_values = []
max_values = []

for idx in range(len(exp_performances)):
    # The window ends at idx (inclusive); it is shorter than window_size
    # until that many entries are available.
    window = exp_performances[max(0, idx - window_size + 1):idx + 1]

    avg_performances.append(np.average(window))
    min_values.append(np.min(window))
    max_values.append(np.max(window))

# Plot
plt.figure(figsize=(10, 5))
#plt.plot(exp_performances, label='Exp Performances')
plt.plot(avg_performances, label='Moving Average', color='r')
plt.fill_between(range(len(avg_performances)), min_values, max_values, color='r', alpha=0.2, label='Min-Max Range')
plt.legend()
plt.title('Experiment Performances with Moving Average and Min-Max Range')
plt.xlabel('Index')
plt.ylabel('Performance')
plt.show()

