In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
from mlflow.tracking import MlflowClient
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from RiverSwim import *
from CurriculumMPI import CurriculumMPI

In [None]:
# ---------------------------------------------------------------------------
# RiverSwim environment
# ---------------------------------------------------------------------------
nS, nA = 70, 2               # number of states / number of actions
uniform_restart = False      # True -> start from xi, False -> start from original_mu

small, large = 5e-3, 1.      # handed to RiverSwim as its `small` / `large` arguments
gamma = 0.99                 # handed to TMDP as `gamma`

# Start distribution with all of its mass on state index 1.
original_mu = np.zeros(nS)
original_mu[1] = 1.

# Equal mass on every state except the first and the last one, which get zero.
xi = np.ones(nS) / (nS - 2)
xi[[0, -1]] = 0

# Start distribution actually given to the environment.
mu = xi if uniform_restart else original_mu

# ---------------------------------------------------------------------------
# Experiment budget and switches
# ---------------------------------------------------------------------------
episodes = 7_000_000
n_trials = 30                # number of Optuna trials
checkpoint_step = 10_000
test_episodes = 10_000
discount_tau = True
param_decay = True
debug = False
lam = 1

# Empty result containers (not filled by any of the cells visible here).
experiment_results, tests_returns, tests_lens, exp_taus = [], [], [], []

In [None]:
# Names under which this notebook's runs are recorded in MLflow.
experiment_name = f"RiverSwim_{nS}_{uniform_restart}_optuna"
run_name = f"CurrMPI_{uniform_restart}"
label = run_name.partition("_")[0]   # text before the first underscore

# Select the tracking server first, then resolve and activate the experiment.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [None]:
# Training settings shared by every call to CurriculumMPI.train in this notebook.
tau = 0.2                    # teleport rate given to TMDP and restored before each trial
epochs = 6
max_length = nS * nS

# Module-level temperature settings; objective() shadows `temp` with a sampled value.
temp, final_temp = 1, 1e-5

check_convergence = True
biased = False

In [None]:
# NOTE(review): `seed` is not assigned in the visible cells; presumably it is
# provided by one of the star imports (e.g. `constants`) — confirm.
set_policy_seed(seed)

# Base environment; note that it is constructed with seed=None.
env = RiverSwim(nS, mu, seed=None, small=small, large=large)

# Environment independent configuration: wrap env in a TMDP built from xi and tau.
tmdp = TMDP(env, xi, tau=tau, gamma=gamma, discount_tau=discount_tau)
tmdp.update_tau(tau)


In [None]:
# Keep the result of compute_expected_hitting_time for the environment's P_mat.
# NOTE(review): the helper is not defined in the visible cells; presumably it
# comes from one of the star imports — confirm.
expected_hitting_times = compute_expected_hitting_time(env.P_mat)


In [None]:

def objective(trial):
    """Optuna objective: train CurriculumMPI with sampled hyperparameters.

    Each call opens a nested MLflow run, samples ``model_lr``, ``pol_lr``,
    ``batch_size`` and ``temp`` from ``trial``, trains a fresh
    ``CurriculumMPI`` on the shared module-level ``tmdp`` and evaluates the
    resulting policies with ``test_policies_len``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The mean of the last 30 entries of the test returns
        (``test_policies_len(...)[0]``).
    """
    with mlflow.start_run(nested=True):
        # Parameter space optimization (lam is not tuned; the module-level value is used).
        model_lr = trial.suggest_float("model_lr", 1e-4, 0.5, log=True)
        pol_lr = trial.suggest_float("pol_lr", 1e-4, 0.5, log=True)
        batch_size = trial.suggest_categorical('batch_size', [8, 16, 24, 32, 64])
        temp = trial.suggest_float("temp", 1, 30, log=False)

        # Record the sampled values before the (long) training so that a trial
        # which fails part-way through still leaves an identifiable MLflow run.
        mlflow.log_params({
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "batch_size": batch_size,
            "temp": temp,
        })

        tmdp.update_tau(tau)  # Reset the teleport rate

        curr_MPI = CurriculumMPI(tmdp, checkpoint_step=checkpoint_step)
        curr_MPI.train(model_lr, pol_lr, batch_size=batch_size,
                       lam=lam, temp=temp, final_temp=final_temp,
                       episodes=episodes, check_convergence=check_convergence,
                       param_decay=param_decay, biased=biased,
                       debug=debug, epochs=epochs, original_mu=original_mu,
                       max_length=max_length)

        # Mean of the last 30 training reward records, divided by the batch size.
        avg_return = np.average(curr_MPI.reward_records[-30:])/batch_size

        # Index 0 of the result is used as returns, index 1 as episode lengths.
        # NOTE(review): 1e-40 is presumably a near-zero evaluation temperature — confirm.
        test_policies = test_policies_len(tmdp, curr_MPI.thetas, test_episodes, 1e-40, mu=original_mu)
        test_policies_return = np.mean(test_policies[0][-30:])
        test_pol_len = np.mean(test_policies[1][-30:])

        mlflow.log_metric("Avg Return", avg_return)
        mlflow.log_metric("Test Return", test_policies_return)
        mlflow.log_metric("Test Policy Length", test_pol_len)

        # Tags are written after training because two of them are read back
        # from the trained CurriculumMPI instance.
        mlflow.set_tags(tags={
            "Algorithm": "CurriculumMPI",
            "Environment": tmdp.env.__class__.__name__,
            "nS": nS,
            "nA": nA,
            "gamma": gamma,
            "tau": tau,
            "episodes": curr_MPI.episodes,
            "checkpoint_step": curr_MPI.checkpoint_step,
            "epochs": epochs,
            "final_temp": final_temp,
            "discount_tau": discount_tau,
            "param_decay": param_decay,
        })

    return test_policies_return


In [None]:

# Parent MLflow run: run the Optuna study (one nested run per trial), then
# retrain once with the best hyperparameters and tag the parent run.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    mlflow.log_params(best_params)

    # Get the best parameters. `temp` is one of the tuned values, so the final
    # training must use the tuned temperature rather than the module-level one;
    # `lam` is not tuned and keeps its module-level value.
    model_lr = best_params["model_lr"]
    pol_lr = best_params["pol_lr"]
    batch_size = best_params["batch_size"]
    temp = best_params["temp"]

    # objective() restores the teleport rate before every trial; do the same
    # here so the final training does not start from whatever tau the last
    # trial left on the shared tmdp.
    tmdp.update_tau(tau)

    # Train the best model
    curr_MPI = CurriculumMPI(tmdp, checkpoint_step=checkpoint_step)
    curr_MPI.train(model_lr, pol_lr, batch_size=batch_size,
                   lam=lam, temp=temp, final_temp=final_temp,
                   episodes=episodes, check_convergence=check_convergence,
                   param_decay=param_decay, biased=biased,
                   debug=debug, epochs=epochs, original_mu=original_mu,
                   max_length=max_length)

    mlflow.set_tags(tags={
        "Algorithm": "CurriculumMPI",
        "Environment": tmdp.env.__class__.__name__,
        "nS": nS,
        "nA": nA,
        "gamma": gamma,
        "tau": tau,
        "episodes": curr_MPI.episodes,
        "checkpoint_step": curr_MPI.checkpoint_step,
        "epochs": epochs,
        "final_temp": final_temp,
        "discount_tau": discount_tau,
        "param_decay": param_decay,
        "model_lr": model_lr,
        "pol_lr": pol_lr,
        "batch_size": batch_size,
        "temp": temp,
    })


In [None]:
# Display the study's best_params (the hyperparameter dict indexed by the study cell).
study.best_params

In [None]:
import matplotlib.pyplot as plt

# Number of most recent records averaged for each point of the curve.
REWARD_WINDOW = 500

reward_records = curr_MPI.reward_records

# Every record divided by the module-level batch_size (set by the study cell).
scaled_reward = [record / batch_size for record in reward_records]

# Trailing mean over at most REWARD_WINDOW records (fewer at the start of the
# series, where less history exists), scaled by batch_size in the same way.
average_reward = [
    np.average(reward_records[max(0, idx - REWARD_WINDOW + 1):idx + 1]) / batch_size
    for idx in range(len(reward_records))
]

# Plot the smoothed curve; the last point is left out, as in the original cell.
#plt.plot(scaled_reward)
plt.plot(average_reward[:-1])
plt.xlabel("Index in reward_records")
plt.ylabel(f"Trailing-{REWARD_WINDOW} mean of reward / batch_size");

In [None]:
# Display the output of compute_expected_hitting_time for the environment's P_mat
# (the same call whose result an earlier cell stores in expected_hitting_times).
compute_expected_hitting_time(env.P_mat)