In [None]:
import numpy as np
from TMDP import TMDP
from algorithms import *
from model_functions import *

import matplotlib.pyplot as plt
import gymnasium as gym
from FrozenLake import *

#np.set_printoptions(precision=4)
import math

import torch
import mlflow
from CurriculumPPO import CurriculumPPO
import optuna
from policy_utils import *
from constants import *
from experiment_utils import *

# --- Problem size ----------------------------------------------------------
nrows = 30                      # side length handed to generate_random_map
nS = nrows**2                   # logged as the "nS" tag
nA = 4                          # logged as the "nA" tag
map_seed = constants.SEEDS[6]   # seed used to generate the random map layout

# --- TMDP parameters -------------------------------------------------------
gamma = .999999999
tau = 0.                        # teleport rate given to the TMDP

# --- Environment flags -----------------------------------------------------
# These flags are forwarded to FrozenLakeEnv below, so changing them here
# actually changes the environment that gets built.
is_slippery = False
reward_shape = True
dense_reward = True
num_bins = 15
shape_range = (-1, 0)

# --- Experiment budget -----------------------------------------------------
checkpoint_step = 500
test_episodes = 1000
episodes = 5900000
num_runs = 30                   # number of Optuna trials

env = FrozenLakeEnv(is_slippery=is_slippery, seed=None,
                    desc=generate_random_map(nrows, seed=map_seed),
                    reward_shape=reward_shape,
                    num_bins=num_bins,
                    goal_reward=1,
                    shape_range=shape_range,
                    dense_reward=dense_reward,
                    )

# Uniform vector over the env.nS states, passed to the TMDP as `xi`.
xi = np.ones(env.nS) * 1/env.nS
tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=None)

In [None]:

def get_or_create_experiment(experiment_name):
    """Return the MLflow experiment id for ``experiment_name``.

    Looks the experiment up by name; when the lookup yields nothing, a new
    experiment with that name is created and its id is returned instead.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id


In [None]:
# Point MLflow at the tracking server, then select the experiment for this
# grid size / bin count (creating it first if it does not exist yet).
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_name = f"Frozen_Lake_{nrows}_{nrows}_{num_bins} OPTUNA"
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [None]:
# Training settings that are held fixed during the Optuna search.
batch_size, epochs = 32, 4
lam = 1
final_temp = 1
eps_ppo = 0.2

# Defaults for values the search tunes: `objective` samples its own local
# `param_decay` / `entropy_coef`, and the final-training cell overwrites
# these globals with the best trial's values.
param_decay = True
entropy_coef = 0.4

eps_model = compute_eps_model(gamma, tau, episodes / 10)

In [None]:
from experiment_result_utils import *

In [None]:

def objective(trial):
    """Optuna objective: train CurriculumPPO with sampled hyperparameters.

    Samples ``model_lr``, ``pol_lr``, ``entropy_coef``, ``temp`` and
    ``param_decay`` from ``trial``, trains a fresh ``CurriculumPPO`` on the
    module-level ``tmdp`` inside a nested MLflow run, logs parameters,
    metrics, tags and the model, and returns the score of the trained policy.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The value returned by ``test_policies`` for the trained policy
        (logged as "Cumulative Return").
    """
    with mlflow.start_run(nested=True):
        # Parameter space optimization
        model_lr = trial.suggest_float("model_lr", 1e-4, 0.5, log=False)
        pol_lr = trial.suggest_float("pol_lr", 1e-4, 0.5, log=False)
        entropy_coef = trial.suggest_float("entropy_coef", 1e-4, 1, log=False)
        temp = trial.suggest_float("temp", 1, 150, log=False)
        param_decay = trial.suggest_categorical("param_decay", [True, False])
        set_policy_seed(None)

        # Curriculum class initialization
        curr_PPO = CurriculumPPO(tmdp, checkpoint_step=checkpoint_step)
        tmdp.reset()
        tmdp.update_tau(tau) # Reset the teleport rate

        curr_PPO.train(model_lr, pol_lr, batch_size=batch_size,
                lam=lam, temp=temp, final_temp=final_temp,
                episodes=episodes, eps_ppo=eps_ppo,
                eps_model=eps_model,
                param_decay=param_decay, epochs=epochs, max_length=1000,
                entropy_coef=entropy_coef)

        # Last recorded reward entry, divided by the batch size.
        avg_reward = np.mean(curr_PPO.reward_records[-1:])/batch_size

        cum_return = test_policies(tmdp, [curr_PPO.theta], test_episodes, 1e-40)

        mlflow.log_params({
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "batch_size": batch_size,
            "lam": lam,
            "temp": temp,
            "final_temp": final_temp,
            "episodes": curr_PPO.episodes,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
            "entropy_coef": entropy_coef,
            "epochs": epochs,
            # param_decay is tuned by the search, so record it with the rest.
            "param_decay": param_decay,
        })

        mlflow.log_metric("Avg Reward", avg_reward)
        mlflow.log_metric("Cumulative Return", cum_return)
        mlflow.set_tags(tags={
            "Project": "Curriculum Learning",
            "Algorithm": "CurriculumPPO",
            "Environment": tmdp.env.__class__.__name__,
            "nS": nS,
            "nA": nA,
            "gamma": gamma,
            "mu": tmdp.env.mu,
            "xi": tmdp.xi,
            # No `seed` variable is defined in this notebook (env and TMDP
            # are built with seed=None); the map seed is the only seed in use.
            "seed": map_seed,
            "tau": tau,
            "episodes": curr_PPO.episodes,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
        })
        curr_PPO.save_to_mlflow() # Save the model to MLflow

    return cum_return


In [None]:
run_name = "PPO"
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # NOTE(review): the objective returns `cum_return`; minimizing it whenever
    # num_bins != 0 looks surprising -- confirm what test_policies returns in
    # the binned-reward setting before trusting the selected "best" trial.
    if num_bins == 0:
        study = optuna.create_study(direction="maximize")
    else:
        study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=num_runs)

    # Train and save the best model, starting from the same environment state
    # that each trial in `objective` starts from.
    curr_PPO = CurriculumPPO(tmdp,  checkpoint_step=checkpoint_step)
    tmdp.reset()
    tmdp.update_tau(tau) # Reset the teleport rate

    # Get the best parameters
    best_params = study.best_params
    model_lr = best_params["model_lr"]
    pol_lr = best_params["pol_lr"]
    temp = best_params["temp"]
    entropy_coef = best_params["entropy_coef"]
    param_decay = best_params["param_decay"]

    # Train the best model
    curr_PPO.train(model_lr, pol_lr, batch_size=batch_size,
                lam=lam, temp=temp, final_temp=final_temp,
                episodes=episodes, eps_ppo=eps_ppo,
                eps_model=eps_model,
                param_decay=param_decay, epochs=epochs, max_length=1000,
                entropy_coef=entropy_coef,)

    mlflow.log_params({
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "batch_size": batch_size,
            "lam": lam,
            "temp": temp,
            "final_temp": final_temp,
            "episodes": curr_PPO.episodes,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
            "entropy_coef": entropy_coef,
            "epochs": epochs,
            # param_decay is tuned by the search, so record it with the rest.
            "param_decay": param_decay,
        })

    mlflow.set_tags(tags={
            "Project": "Curriculum Learning",
            "Algorithm": "CurriculumPPO",
            "Environment": tmdp.env.__class__.__name__,
            "nS": nS,
            "nA": nA,
            "gamma": gamma,
            "mu": tmdp.env.mu,
            "xi": tmdp.xi,
            # No `seed` variable is defined in this notebook (env and TMDP
            # are built with seed=None); the map seed is the only seed in use.
            "seed": map_seed,
            "tau": tau,
            "episodes": curr_PPO.episodes,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
        })

    curr_PPO.save_to_mlflow() # Save the best model to MLflow

In [None]:
# Compare the trained curriculum policy against the reference solution.
# NOTE(review): `Q` and `V` are not defined in any cell visible here --
# presumably they come from an earlier solver run; confirm this cell works
# after Restart & Run All.
softmax_policy = get_softmax_policy(curr_PPO.theta)

# Difference between the policy derived from Q and the learned policy.
delta_pol = get_policy(Q) - get_policy(softmax_policy)

# Value of the learned softmax policy evaluated through Q.
V_curr = compute_V_from_Q(Q, softmax_policy)

j_opt = compute_expected_j(V, tmdp.env.mu)
j_curr = compute_expected_j(V_curr, tmdp.env.mu)

print("optimal performance: ",j_opt)
print("curriculum performance: ", j_curr)

In [None]:
import matplotlib.pyplot as plt
reward_records = curr_PPO.reward_records

# Trailing moving average: each point averages the current record together
# with up to `window - 1` records before it (fewer at the start).
window = 50
scaled_reward = list(reward_records)
average_reward = [
    np.average(reward_records[max(0, idx - window + 1):idx + 1])
    for idx in range(len(reward_records))
]

# Plot (the final point is left out, as before).
plt.plot(average_reward[:-1])
plt.xlabel("Reward record index")
plt.ylabel("Average reward (last {} records)".format(window))
plt.title("CurriculumPPO training reward");

In [None]:
# Show the TMDP's string representation.
print(tmdp)

In [None]:
# Roll out the policy obtained from the trained parameters and print every
# transition, restarting the environment whenever an episode finishes.
pi = get_policy(get_softmax_policy(curr_PPO.theta))
tmdp.update_tau(0.)   # teleport rate set to zero for the rollout
tmdp.reset()
done = False          # kept from the original cell; not read below
env.render_mode = "human"

max_steps = max(100, nrows*2)
step = 0
while step <= max_steps:
    s = tmdp.env.s
    a = greedy(s, pi, tmdp.env.allowed_actions[int(s)])
    s_prime, reward, flags, prob = tmdp.step(a)
    print("state {} action {} next state {} reward {}".format(s, a, s_prime, reward))
    if flags["done"]:
        tmdp.reset()
    step += 1
