In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
from mlflow.tracking import MlflowClient
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from RiverSwim import *
from CurriculumPPO import CurriculumPPO

In [None]:
# ---------------------------------------------------------------------------
# River Swim environment settings
# ---------------------------------------------------------------------------
nS = 70                  # length of the state-indexed vectors below; first RiverSwim argument
nA = 2                   # recorded as the "nA" run tag
small = 5e-3             # forwarded to RiverSwim as `small`
large = 1.               # forwarded to RiverSwim as `large`
gamma = 0.99999999       # forwarded to TMDP and compute_eps_model
uniform_restart = True   # selects which start distribution `mu` points to

# Start distribution with all mass on state index 1.
original_mu = np.zeros(nS)
original_mu[1] = 1.

# Uniform distribution over the interior states: the first and last entries
# are zeroed, the remaining nS-2 entries each carry 1/(nS-2).
xi = np.full(nS, 1.0 / (nS - 2))
xi[[0, -1]] = 0

# NOTE: with uniform_restart=True, `mu` is the very same array object as `xi`.
mu = xi if uniform_restart else original_mu

# ---------------------------------------------------------------------------
# Experiment settings
# ---------------------------------------------------------------------------
episodes = 7_000_000
n_trials = 30            # number of Optuna trials
checkpoint_step = 10_000
discount_tau = True
param_decay = True
debug = False
max_length = nS**2
test_episodes = max_length
lam = 1

# Result accumulators (four independent lists).
experiment_results, tests_returns, tests_lens, exp_taus = [], [], [], []

In [None]:
# Names used for MLflow bookkeeping.
experiment_name = f"RiverSwim_{nS}_{uniform_restart}_optuna"
run_name = f"CurrPPO_{uniform_restart}_optuna"
label = run_name.partition("_")[0]  # text before the first underscore

# Point MLflow at the tracking server, then select the experiment.
# get_or_create_experiment is a project helper; presumably it returns the id
# of an existing experiment or creates one (TODO confirm).
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [None]:
# Fixed (non-tuned) training hyper-parameters shared by every trial.
tau = 0.2            # teleport rate handed to TMDP / tmdp.update_tau
eps_ppo = 0.2
epochs = 6
batch_size = 20

entropy_coef = 0.0
final_temp = 1e-3

# Same values as in the configuration cell, restated here.
lam = 1
param_decay = True
max_length = nS * nS

# compute_eps_model is a project helper; it receives a tenth of the episode budget.
eps_model = compute_eps_model(gamma, tau, episodes / 10)

In [None]:
# NOTE(review): `seed` is not defined in the visible cells; presumably it is
# provided by one of the star imports (TODO confirm).
set_policy_seed(seed)

# The environment itself is built with seed=None.
env = RiverSwim(
    nS,
    mu,
    small=small,
    large=large,
    seed=None,
)

# Environment independent configuration
tmdp = TMDP(
    env,
    xi,
    tau=tau,
    gamma=gamma,
    discount_tau=discount_tau,
)
tmdp.update_tau(tau)


In [None]:

def objective(trial):
    """Optuna objective: train CurriculumPPO once and score the resulting policies.

    Samples ``model_lr``, ``pol_lr``, ``lam`` and ``temp`` from the trial, trains
    a fresh ``CurriculumPPO`` on the shared module-level ``tmdp``, and logs
    parameters, tags and metrics to a nested MLflow run.

    Parameters and tags are logged *before* training starts so that a trial
    which crashes or is interrupted mid-training still leaves an identifiable
    MLflow run behind.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean of the last 30 entries of the first sequence returned by
        ``test_policies_len`` (logged as "Test Return"); the study maximizes it.
    """
    with mlflow.start_run(nested=True):
        # Parameter space optimization
        model_lr = trial.suggest_float("model_lr", 1e-4, 0.5, log=True)
        pol_lr = trial.suggest_float("pol_lr", 1e-4, 0.5, log=True)
        # batch_size is kept fixed; a categorical search was tried previously:
        #batch_size = trial.suggest_categorical('batch_size', [8, 16, 24, 32, 64])
        lam = trial.suggest_float("lam", 0., .5, log=False)  # local; shadows the global `lam`
        temp = trial.suggest_float("temp", 1, 30, log=False)

        # Record the configuration up front: training is long-running and the
        # run would otherwise carry no metadata if it fails before the end.
        mlflow.log_params({
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "batch_size": batch_size,
            "temp": temp,
            "lam": lam
        })

        mlflow.set_tags(tags={
            "Algorithm": "CurriculumPPO",
            "Environment": tmdp.env.__class__.__name__,
            "nS": nS,
            "nA": nA,
            "gamma": gamma,
            "tau": tau,
            "episodes": episodes,
            "checkpoint_step": checkpoint_step,
            "epochs": epochs,
            "final_temp": final_temp,
            "discount_tau": discount_tau,
            "param_decay": param_decay,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
            "entropy_coef": entropy_coef,
            "seed": seed,
        })

        # The tmdp object is shared across trials: reset the teleport rate
        # before handing it to a new trainer.
        tmdp.update_tau(tau)

        # Curriculum class initialization
        curr_PPO = CurriculumPPO(tmdp, checkpoint_step=checkpoint_step)

        curr_PPO.train(model_lr, pol_lr, batch_size=batch_size,
                lam=lam, temp=temp, final_temp=final_temp,
                episodes=episodes, eps_ppo=eps_ppo,
                eps_model=eps_model,
                param_decay=param_decay, epochs=epochs, max_length=max_length,
                entropy_coef=entropy_coef, debug=debug)

        # Training-time score: mean of the last 10 reward records, scaled by batch size.
        avg_return = np.mean(curr_PPO.reward_records[-10:])/batch_size

        # Evaluation is run with the original start distribution (mass on state 1).
        test_policies = test_policies_len(tmdp, curr_PPO.thetas, test_episodes, 1e-40, mu=original_mu)
        test_policies_return = np.mean(test_policies[0][-30:])
        test_pol_len = np.mean(test_policies[1][-30:])

        mlflow.log_metric("Avg Return", avg_return)
        mlflow.log_metric("Test Return", test_policies_return)
        mlflow.log_metric("Test Policy Length", test_pol_len)

    return test_policies_return


In [None]:
# Parent MLflow run: hyper-parameter search followed by a retraining with the
# best parameters found.
run_name = "CPPO_RiverSwim_{}".format(nS)
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Tag the parent run first so it is identifiable even if the (long) study
    # or the final training is interrupted.
    mlflow.set_tags(tags={
            "Algorithm": "CurriculumPPO",
            "Environment": tmdp.env.__class__.__name__,
            "nS": nS,
            "nA": nA,
            "gamma": gamma,
            "tau": tau,
            "episodes": episodes,
            "checkpoint_step": checkpoint_step,
            "epochs": epochs,
            "final_temp": final_temp,
            "discount_tau": discount_tau,
            "param_decay": param_decay,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
            "entropy_coef": entropy_coef,
            "seed": seed,
        })

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Get the best parameters
    best_params = study.best_params
    mlflow.log_params(best_params)

    model_lr = best_params["model_lr"]
    pol_lr = best_params["pol_lr"]
    #batch_size = best_params["batch_size"]
    lam = best_params["lam"]
    temp = best_params["temp"]

    # Retrain with the best parameters, set up exactly as in `objective`:
    # reset the teleport rate *before* building the trainer, and use the same
    # checkpoint_step that is recorded in the run tags.
    # NOTE(review): nothing in this cell persists the trained model.
    tmdp.update_tau(tau) # Reset the teleport rate
    curr_PPO = CurriculumPPO(tmdp, checkpoint_step=checkpoint_step)

    # Train the best model
    curr_PPO.train(model_lr, pol_lr, batch_size=batch_size,
                lam=lam, temp=temp, final_temp=final_temp,
                episodes=episodes, eps_ppo=eps_ppo,
                eps_model=eps_model,
                param_decay=param_decay, epochs=epochs, max_length=max_length,
                entropy_coef=entropy_coef, debug=debug)


In [None]:
# Display the best hyper-parameters found by the Optuna study.
study.best_params