In [None]:
import math
import os
import time

import gymnasium as gym
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna

# Bound explicitly because the notebook references `constants.SEEDS`; kept ahead of
# the star imports so any name they provide still takes precedence as before.
import constants
from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from RiverSwim import *
from CurriculumPPO import CurriculumPPO

In [None]:
# --- River Swim environment configuration ---
nS = 200      # number of states
nA = 2        # number of actions
gamma = 0.99
tau = 0.6

# Distribution handed to RiverSwim: uniform over the interior states,
# with zero mass on the first and the last state.
mu = np.full(nS, 1 / (nS - 2))
mu[[0, -1]] = 0

# Distribution handed to TMDP: uniform over all states.
xi = np.full(nS, 1 / nS)

# Checkpointing / evaluation settings shared by every run.
checkpoint_step = 500
test_episodes = 1000

In [None]:
# Point MLflow at the local tracking server.
mlflow.set_tracking_uri("http://localhost:5000")

# One experiment per environment size; created on first use by the project helper.
experiment_name = "RiverSwim_" + str(nS)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

# Local directory used when saving results to MLflow fails.
save_path = "/".join(("results", experiment_name, "CPPO"))

In [None]:
# Training budget (number of episodes per run).
episodes = 800_000

# Learning rates and starting temperature
# (presumably the outcome of a hyperparameter search -- TODO confirm).
model_lr, pol_lr, temp = 0.3888499667800339, 0.11386852281441003, 2.899275492134674

# Remaining arguments forwarded to CurriculumPPO.train.
final_temp = 1
param_decay = True
lam = 1
batch_size, epochs = 32, 1

eps_ppo = 0.2
# Project helper; evaluated with a tenth of the episode budget.
eps_model = compute_eps_model(gamma, tau, episodes / 10)

In [None]:
# Module-level accumulators filled by run_experiment (one entry per run).
experiment_results, tests_returns = [], []

In [None]:
def run_experiment(index, seed, run_name):
    """Train and evaluate one seeded CurriculumPPO run inside a nested MLflow run.

    Builds a RiverSwim environment wrapped in a TMDP, trains CurriculumPPO with the
    notebook-level hyperparameters, logs a metric, the parameters and tags to MLflow,
    evaluates the stored policies with ``test_policies`` and appends the outcome to
    the module-level ``tests_returns`` and ``experiment_results`` lists.

    Args:
        index: Position of this run within the experiment; used in the nested run
            name, the MLflow tags and the result dictionary.
        seed: Seed passed to ``set_policy_seed``, ``RiverSwim`` and ``TMDP``.
        run_name: Name of the parent run; the nested run is named
            ``f"{run_name}_{index}"``.

    Returns:
        None. Results are exposed through MLflow and the module-level lists.
    """
    sub_run_name = f"{run_name}_{index}"

    with mlflow.start_run(nested=True, run_name=sub_run_name):
        # Environment specific configuration
        set_policy_seed(seed)
        env = RiverSwim(nS, mu, small=5, large=1000, seed=seed)

        # Environment independent configuration
        tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)
        tmdp.update_tau(tau)
        curr_PPO = CurriculumPPO(tmdp, checkpoint_step=checkpoint_step)

        curr_PPO.train(model_lr, pol_lr, batch_size=batch_size,
                       lam=lam, temp=temp, final_temp=final_temp,
                       episodes=episodes, eps_ppo=eps_ppo,
                       eps_model=eps_model,
                       param_decay=param_decay, epochs=epochs)

        # Summary metric: mean of the last ten recorded rewards.
        avg_reward = np.average(curr_PPO.reward_records[-10:])
        mlflow.log_metric("Avg Reward", avg_reward)

        run_dict = {
            "episodes": curr_PPO.episodes,
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "temp": temp,
            "final_temp": final_temp,
            "batch_size": batch_size,
            "epochs": epochs,
            "lam": lam,
            "eps_ppo": eps_ppo,
            "eps_model": eps_model,
        }
        mlflow.log_params(run_dict)
        mlflow.set_tags(tags={
            "run_name": run_name,
            "seed": seed,
            # Tag the configured tau; logging nS here mislabelled every run.
            "tau": tau,
            "gamma": gamma,
            "checkpoint_step": checkpoint_step,
            "test_episodes": test_episodes,
            "index": index,
        })

        test_policies_return = test_policies(tmdp, curr_PPO.thetas, test_episodes, final_temp)

        result_dict = {
            "thetas" : curr_PPO.thetas,
            "taus" : curr_PPO.taus,
            "reward_records" : curr_PPO.reward_records,
            "test_policies_return" : test_policies_return,
            "index" : index,
        }

        tests_returns.append(test_policies_return)
        experiment_results.append(result_dict)

        # Save artifact to MLFlow; on any failure fall back to a local save.
        try:
            save_to_mlflow(result_dict, run_name)
        except Exception as e:
            print(f"Error saving to MLFlow: {e}")
            print("Saving locally instead.")
            # NOTE(review): the reason for this pause is not evident from the notebook.
            time.sleep(5)
            path = save_path+f"/{sub_run_name}"
            save(path, result_dict)



In [None]:
def plot_avg_test_return(rewards, title, figsize=(10, 8)):
    """Plot the across-run mean test return with a 95% confidence band.

    Args:
        rewards: Array-like of shape ``(n_runs, n_points)`` holding one sequence of
            test returns per run. A single 1-D sequence is treated as one run.
        title: Title of the figure.
        figsize: Figure size forwarded to ``plt.figure``.

    Returns:
        The matplotlib figure that was created.
    """
    returns = np.atleast_2d(np.asarray(rewards, dtype=float))
    n_runs = returns.shape[0]

    avg_rewards = returns.mean(axis=0)

    # The band must reflect run-to-run variability at each point: standard error of
    # the mean across runs (axis 0), not the spread of the averaged curve itself.
    # Sample std (ddof=1) is undefined for a single run, so fall back to ddof=0 there.
    ddof = 1 if n_runs > 1 else 0
    std_err = returns.std(axis=0, ddof=ddof) / np.sqrt(n_runs)

    ci = 1.96  # normal-approximation z-value for a 95% interval
    upper_bound = avg_rewards + ci * std_err
    lower_bound = avg_rewards - ci * std_err

    fig = plt.figure(figsize=figsize)
    plt.plot(avg_rewards, label='Average Rewards', color='r')
    plt.fill_between(range(len(avg_rewards)), lower_bound, upper_bound, color='r', alpha=0.2, label='95% Confidence Interval')

    plt.legend()
    plt.title(title)
    plt.xlabel('Episode')
    plt.ylabel('Avg Reward')
    plt.show()
    return fig

In [None]:
def run_experiments(change_map=False, num_runs=10):
    """Run ``num_runs`` seeded CPPO experiments under one parent MLflow run.

    Each run uses the seed ``constants.SEEDS[i]`` and is executed by
    ``run_experiment``. Afterwards the collected results are saved (to MLflow, or
    locally if that fails) and the averaged test-return figure is logged.

    Args:
        change_map: Not used by the training itself; only recorded in the saved
            experiment dictionary.
        num_runs: Number of runs to execute; must not exceed the number of
            available seeds.

    Returns:
        None.
    """
    run_name = "CPPO"

    # Reset the shared accumulators in place so re-running this cell does not mix
    # results of a previous call into the saved dictionary and the averaged plot.
    experiment_results.clear()
    tests_returns.clear()

    with mlflow.start_run(run_name=run_name):

        for i in range(num_runs):
            seed = constants.SEEDS[i]
            run_experiment(i, seed, run_name)

        experiment_dict = {
            "experiment_results": experiment_results,
            "tests_returns": tests_returns,
            "num_runs": num_runs,
            "change_map": change_map,
        }
        try:
            save_to_mlflow(experiment_dict, run_name)
        except Exception as e:
            print(e)
            print("Something went wrong saving the experiment results to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            save(save_path, experiment_dict)

        rewards_fig = plot_avg_test_return(tests_returns, f"CPPO Avg Rewards on {num_runs} runs")
        try:
            mlflow.log_figure(figure=rewards_fig, artifact_file="reward_image.png")
        except Exception as e:
            print(e)
            print("Something went wrong saving the figure to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            # savefig does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            rewards_fig.savefig(save_path+"/reward_image.png")

In [None]:
# Launch the full experiment: ten seeded runs, change_map left off.
run_experiments(num_runs=10, change_map=False)