In [None]:
import numpy as np
from TMDP import TMDP
from algorithms import *
from policy_utils import *
from model_functions import *

import matplotlib.pyplot as plt
import gymnasium as gym
from FrozenLake import *
from RiverSwim import RiverSwim
#np.set_printoptions(precision=4)
import math
from utils import *

import torch
import mlflow
from CurriculumQ import CurriculumQ
import optuna

# Parameters
nS = 100      # number of states handed to RiverSwim
nA = 2
gamma = .99   # discount factor the TMDP is built with

# Two independent uniform vectors over the nS states: `mu` goes to the
# environment, `xi` goes to the TMDP wrapper.
mu = np.full(nS, 1/nS)
xi = np.full(nS, 1/nS)

# One seed is shared by the environment and the TMDP.
seed = get_current_seed()

river = RiverSwim(nS, mu, small=5, large=1000, seed=seed)
tau = .5      # initial teleport rate (reset on the TMDP before training further down)
tmdp = TMDP(river, xi, tau=tau, gamma=gamma, seed=seed)

In [None]:
# Show the seed in use (the same value was passed to both RiverSwim and TMDP above).
print(seed)

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt

def compute_n(gamma, tau, eps_model):
    """Return ``ceil(2 * gamma * tau / ((1 - gamma) * eps_model))``.

    This is the inverse of ``compute_eps_model`` (up to the rounding): it gives
    the count ``n`` for which that function would yield ``eps_model``.

    Args:
        gamma: Discount factor; ``gamma == 1`` makes the denominator zero.
        tau: Current tau value.
        eps_model: Model threshold; must be non-zero.

    Returns:
        The rounded-up count as a NumPy float (``np.ceil`` does not return an int).
    """
    numerator = 2 * gamma * tau
    denominator = (1 - gamma) * eps_model
    return np.ceil(numerator / denominator)

def compute_eps_model(gamma, tau, n):
    """Return ``2 * gamma * tau / (n * (1 - gamma))``.

    Args:
        gamma: Discount factor; ``gamma == 1`` makes the denominator zero.
        tau: Current tau value.
        n: Count the threshold is spread over; must be non-zero.

    Returns:
        The model threshold associated with ``n``.
    """
    numerator = 2 * gamma * tau
    denominator = n * (1 - gamma)
    return numerator / denominator

def compute_tau_prime(tau, gamma, eps_model):
    """Return the next tau: ``tau - eps_model * (1 - gamma) / (2 * gamma)``, floored at 0.

    Args:
        tau: Current tau value.
        gamma: Discount factor; must be non-zero.
        eps_model: Model threshold that determines the size of the decrement.

    Returns:
        The decremented tau, or ``0`` when the decrement would make it negative.
    """
    decrement = eps_model * (1 - gamma) / (2 * gamma)
    # Clamp so tau never goes below zero.
    return max(0, tau - decrement)

def dynamic_update(gamma, initial_tau, max_episodes, trajectory_lengths):
    """Simulate the tau / eps_model schedule over a run of ``max_episodes``.

    Each pass of the loop performs one tau update and then advances the episode
    counter by one entry of ``trajectory_lengths`` (indexed cyclically by the
    current counter). The loop stops once the counter reaches ``max_episodes``
    or tau is no longer positive.

    Args:
        gamma: Discount factor forwarded to the ``compute_*`` helpers.
        initial_tau: Starting tau; also the first entry of the returned tau list.
        max_episodes: Episode budget for the simulated run.
        trajectory_lengths: Non-empty sequence of per-update counter increments.

    Returns:
        ``(tau_values, eps_values)``: tau after every update (with the initial
        value first) and the eps_model used for each update.
    """
    tau = initial_tau
    tau_history = [tau]
    eps_history = []
    episode = 0

    while episode < max_episodes and tau > 0:
        episodes_left = max_episodes - episode

        # A tiny eps_model yields a very large n, so in practice the clamp
        # below is what decides the effective update count.
        unclamped_n = compute_n(gamma, tau, eps_model=1e-10)
        effective_n = max(1, min(unclamped_n, episodes_left))
        step_eps = compute_eps_model(gamma, tau, effective_n)

        # Apply the update, then shrink by a further 0.9999 factor.
        tau = compute_tau_prime(tau, gamma, step_eps) * 0.9999

        tau_history.append(tau)
        eps_history.append(step_eps)

        # Advance by the length of the trajectory that was just "processed".
        episode += trajectory_lengths[episode % len(trajectory_lengths)]

    return tau_history, eps_history

# Generate random trajectory lengths
def generate_random_int_list(size, low, high):
    """Return ``size`` integers drawn with ``random.randint(low, high)`` (both ends inclusive).

    The draws come from the global ``random`` module state, so seeding that
    module beforehand makes the output repeatable.
    """
    samples = []
    for _ in range(size):
        samples.append(random.randint(low, high))
    return samples

# Example usage
# NOTE: this rebinds the notebook-level `gamma` (0.99 in the setup cell) to 0.9;
# any later cell that reads `gamma` sees 0.9 once this cell has run.
gamma = 0.9
initial_tau = .20
max_episodes = 1000000
# Random lengths between 1 and 10
trajectory_lengths = generate_random_int_list(max_episodes, 1, 10)

tau_values, eps_values = dynamic_update(gamma, initial_tau, max_episodes, trajectory_lengths)

# Side-by-side panels: tau on the left, eps_model on the right.
# The x axis of each panel is the index into the returned list (one point per update).
schedule_fig, (tau_ax, eps_ax) = plt.subplots(1, 2, figsize=(10, 5))

tau_ax.plot(tau_values, label='tau')
tau_ax.set_xlabel('Episode')
tau_ax.set_ylabel('tau')
tau_ax.set_title('Tau Convergence')
tau_ax.legend()

eps_ax.plot(eps_values, label='eps_model')
eps_ax.set_xlabel('Episode')
eps_ax.set_ylabel('eps_model')
eps_ax.set_title('Eps Model Convergence')
eps_ax.legend()

schedule_fig.tight_layout()
plt.show()
print(tau_values[-1], eps_values[-1])


In [None]:
# Final tau reached by the simulated schedule (last entry of tau_values).
print(tau_values[-1])

In [None]:
# Spot-check a single tau update with hard-coded tau and eps_model values.
# NOTE(review): `gamma` is whatever the notebook-level name currently holds
# (0.9 if the example cell ran last, 0.99 straight after setup) — confirm which is intended.
tau_prime = compute_tau_prime(0.3010648917309438 , gamma, 0.9031946751928317)
print(tau_prime)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def reward_shaping(n, num_bins, reward_range=(-1, 1)):
    """Build an ``n x n`` reward grid that steps up as cells get closer to the goal.

    The goal is the bottom-right cell ``(n-1, n-1)``. Every cell's Manhattan
    distance to the goal is bucketed against ``num_bins + 1`` evenly spaced
    edges running from the largest distance down to 0, and the bucket index
    picks one of ``num_bins + 1`` evenly spaced reward levels between
    ``reward_range[0]`` and ``reward_range[1]``. The goal cell is finally set
    to ``reward_range[1]`` explicitly.

    Args:
        n: Side length of the square grid.
        num_bins: Number of distance intervals.
        reward_range: ``(lowest, highest)`` reward levels.

    Returns:
        A float array of shape ``(n, n)`` holding the reward of each cell.
    """
    goal_row = goal_col = n - 1
    farthest = goal_row + goal_col  # distance from cell (0, 0) to the goal

    # Manhattan distance of every cell, built by broadcasting per-axis gaps.
    axis = np.arange(n)
    row_gap = np.abs(goal_row - axis)
    col_gap = np.abs(goal_col - axis)
    manhattan = (row_gap[:, np.newaxis] + col_gap[np.newaxis, :]).astype(float)

    # Edges are flipped to descending order so a smaller distance lands in a
    # later bucket, i.e. a higher reward level.
    descending_edges = np.flip(np.linspace(0, farthest, num_bins + 1))
    levels = np.linspace(reward_range[0], reward_range[1], num_bins + 1)

    # With descending edges np.digitize returns 0 for the farthest distance,
    # so the shifted index is clamped at 0.
    slot = np.maximum(np.digitize(manhattan, descending_edges, right=False) - 1, 0)
    grid = levels[slot]

    # The goal always receives the top reward.
    grid[goal_row, goal_col] = reward_range[1]
    return grid


# Demo: shape a 30x30 grid with 4 distance bins and rewards in [-1, 1].
n = 30
num_bins = 4
reward_range = (-1, 1)
rewards = reward_shaping(n, num_bins, reward_range)

# Tabular view of the grid.
reward_df = pd.DataFrame(rewards)
print(reward_df)

# Heat-map view of the same grid.
reward_fig, reward_ax = plt.subplots()
reward_image = reward_ax.imshow(rewards, cmap='coolwarm', interpolation='none')
reward_fig.colorbar(reward_image, ax=reward_ax, label='Reward')
reward_ax.set_title('Reward Grid')
plt.show()


In [None]:
# Lowest reward present in the shaped grid.
print(np.min(rewards))

In [None]:
# Reference solution: Q from bellman_optimal_q on the environment's transition
# matrix and reward, using the TMDP's discount factor.
res = bellman_optimal_q(tmdp.env.P_mat, tmdp.env.reward, tmdp.gamma)
Q = res["Q"]
# State values obtained from Q under the policy extracted from that same Q.
V = compute_V_from_Q(Q, get_policy(Q))
# Expected performance of V with respect to the environment's mu.
j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)

In [None]:
# Preview the eps_model implied by 50000 updates.
# Read the discount factor from the TMDP rather than from the notebook-level
# `gamma`: the dynamic_update example cell rebinds `gamma` to 0.9, which does
# not match the 0.99 the TMDP was constructed with.
eps_model = compute_eps_model(tmdp.gamma, tau, 50000)
print("eps_model: {}".format(eps_model))

In [None]:
# Training hyper-parameters.
episodes = 5000000
lam = 1
batch_size = 1
param_decay=True
# Re-sync the notebook-level discount with the TMDP before deriving eps_model:
# the dynamic_update example cell rebinds `gamma` to 0.9, while the TMDP that is
# trained below was constructed with 0.99. Later cells that read `gamma`
# (e.g. the compute_n call) then use the same value as eps_model.
gamma = tmdp.gamma
# eps_model implied by 500000 tau updates.
eps_model = compute_eps_model(gamma, tau, 500000)
model_lr = 0.10869281717397965
tuning_rate = 0.75

In [None]:
# eps_model that will be passed to training.
print(eps_model)

In [None]:
# Wrap the TMDP in a CurriculumQ learner.
curr_Q = CurriculumQ(tmdp)
tmdp.update_tau(tau) # Reset the teleport rate        

In [None]:
# Number of updates implied by eps_model (compute_n inverts compute_eps_model, rounding up).
n_updates = compute_n(gamma, tau, eps_model)   
print("n_updates: {}".format(n_updates))

In [None]:
# Episodes per update, truncated to an integer.
# NOTE(review): update_rate is not passed to the train call visible in this notebook — confirm it is still needed.
update_rate = int(episodes/n_updates)
print("update_rate: {}".format(update_rate))

In [None]:
# Run training with the hyper-parameters defined above.
# NOTE(review): exp_rate=0.5 is hard-coded here and `tuning_rate` from the
# hyper-parameter cell is not passed — confirm both are intentional.
curr_Q.train(model_lr, batch_size=batch_size, 
                lam=lam, exp_rate=0.5,
                episodes=episodes,
                eps_model=eps_model,
                param_decay=param_decay)

In [None]:

# Difference between the policy extracted from the reference Q and the one
# extracted from the learned Q.
delta_pol = get_policy(Q)-get_policy(curr_Q.Q)
print(delta_pol)

# NOTE(review): r_s_a is computed but not used anywhere in this cell.
r_s_a = compute_r_s_a(tmdp.env.P_mat, tmdp.env.reward)

# NOTE(review): this combines the reference Q with the learned policy rather
# than evaluating the learned policy from its own values — confirm this is the
# intended performance measure.
V_Q = compute_V_from_Q(Q, get_policy(curr_Q.Q))

j_q_curr = compute_expected_j(V_Q, tmdp.env.mu)
# Recomputed from the V of the reference-solution cell.
j_opt = compute_expected_j(V, tmdp.env.mu)
print("optimal performance: ",j_opt)
print("Q curriculum performance: ",j_q_curr)


In [None]:
import matplotlib.pyplot as plt
# Drop the first 75 recorded rewards before smoothing.
# NOTE(review): presumably a warm-up cut-off — confirm the choice of 75.
reward_records = curr_Q.reward_records[75:]

# Trailing moving average over (at most) the most recent `window` records,
# normalised by the batch size.
window = 500
average_reward = []
scaled_reward = []
for idx in range(len(reward_records)):
    scaled_reward.append(reward_records[idx]/batch_size)
    # Until `window` records exist, the average covers everything seen so far.
    window_start = max(0, idx - window + 1)
    avg_list = reward_records[window_start:idx+1]
    average_reward.append(np.average(avg_list)/batch_size)
# Plot
#plt.plot(scaled_reward)
plt.plot(average_reward[:-1])