In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from FrozenLake import *
from CurriculumPMPO import CurriculumPMPO

In [2]:
# --- Environment options -------------------------------------------------
# num_bins, dense_reward, shape_range and goal_reward are forwarded to the
# FrozenLakeEnv constructor by run_experiment().
is_slippery = False
reward_shape = True
dense_reward = True
shape_range = (-1, 0)
goal_reward = 1.0
num_bins = 10

# --- Experiment budget ---------------------------------------------------
num_runs = 10            # number of seeded runs launched by run_experiments()
episodes = 2_900_000     # passed to CurriculumPMPO.train(episodes=...)
checkpoint_step = 500    # passed to CurriculumPMPO(checkpoint_step=...)
test_episodes = 1000     # passed to test_policies() after training

In [3]:
# Frozen Lake environment dimensions and TMDP constants.
nrows = 20                    # side length handed to generate_random_map()
nS = nrows * nrows            # number of entries in xi (nrows squared)
nA = 4
gamma = 0.999                 # passed to TMDP(gamma=...)
tau = 0.6                     # passed to TMDP(tau=...) and tmdp.update_tau()

# Uniform vector over the nS entries: every component equals 1/nS.
xi = np.full(nS, 1.0 / nS)


In [4]:
# Names derived from the configuration above.
experiment_name = f"FrozenLake_{nrows}x{nrows}_{num_bins}"
run_name = f"CurrPMPO_{num_bins}"
label = run_name.partition("_")[0]   # text before the first underscore
save_path = f"results/{experiment_name}/run_{run_name}"  # local fallback location

# Point MLflow at the tracking server and select the experiment.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [5]:

# --- Values handed to CurriculumPMPO.train() by run_experiment() ----------
# model_lr and pol_lr are the first two positional arguments of train().
model_lr = 0.373715
pol_lr = 0.215723

temp = 36.039501
final_temp = 1
param_decay = True

batch_size = 20
epochs = 3
lam = 1
eps_ppo = 0.2
eps_shift = 2e-3
entropy_coef = 0.4
debug = False

# --- Accumulators filled by run_experiment(), one entry per finished run --
experiment_results = []
tests_rewards = []

In [6]:
def run_experiment(index, seed, run_name, change_map=False):
    """Train and test one CurriculumPMPO run inside a nested MLflow run.

    Builds a FrozenLakeEnv and a TMDP from the notebook-level configuration,
    trains a CurriculumPMPO instance, logs the metric, parameters and tags to
    MLflow, runs test_policies() on the resulting thetas, and appends the
    outcome to the notebook-level ``tests_rewards`` and ``experiment_results``
    lists.

    Args:
        index: Position of this run within the experiment; used in the nested
            run name, in the tags and in the stored result.
        seed: Seed passed to set_policy_seed(), FrozenLakeEnv and TMDP.
        run_name: Name of the parent experiment run; the nested run is named
            ``f"{run_name}_{index}"``.
        change_map: When True the map is generated from ``seed``; otherwise
            every run uses ``constants.SEEDS[0]`` for map generation.

    Returns:
        None. Results are exposed through the module-level lists and through
        MLflow (or a local save when the MLflow upload fails).
    """
    # `time` is only needed by the fallback branch and is not imported
    # explicitly in the notebook's import cell, so bring it in here to make
    # sure the error handler itself cannot fail with a NameError.
    import time

    sub_run_name = f"{run_name}_{index}"

    with mlflow.start_run(nested=True, run_name=sub_run_name):
        # Environment specific configuration
        map_seed = seed if change_map else constants.SEEDS[0]
        set_policy_seed(seed)
        env = FrozenLakeEnv(
            # Use the notebook-level settings instead of hard-coded literals
            # so that editing the config cell actually changes the environment.
            is_slippery=is_slippery,
            seed=seed,
            desc=generate_random_map(nrows, seed=map_seed),
            reward_shape=reward_shape,
            num_bins=num_bins,
            dense_reward=dense_reward,
            shape_range=shape_range,
            goal_reward=goal_reward,
        )

        # Environment independent configuration
        tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)
        tmdp.update_tau(tau)
        curr_PMPO = CurriculumPMPO(tmdp, checkpoint_step=checkpoint_step)

        curr_PMPO.train(
            model_lr, pol_lr, batch_size=batch_size,
            lam=lam, temp=temp, final_temp=final_temp,
            episodes=episodes, eps_ppo=eps_ppo,
            eps_shift=eps_shift,
            param_decay=param_decay, epochs=epochs, max_length=1000,
            entropy_coef=entropy_coef, debug=debug,
        )

        # Mean of the last 10 recorded rewards, divided by the batch size.
        avg_reward = np.average(curr_PMPO.reward_records[-10:]) / batch_size
        mlflow.log_metric("Avg Reward", avg_reward)

        run_dict = {
            "episodes": curr_PMPO.episodes,
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "temp": temp,
            "final_temp": final_temp,
            "batch_size": batch_size,
            "epochs": epochs,
            "lam": lam,
            "eps_ppo": eps_ppo,
            "eps_shift": eps_shift,
        }
        mlflow.log_params(run_dict)
        mlflow.set_tags(tags={
            "run_name": run_name,
            "change_map": change_map,
            "seed": seed,
            # Previously logged nS (the state count) under this key.
            "tau": tau,
            "gamma": gamma,
            "checkpoint_step": checkpoint_step,
            "test_episodes": test_episodes,
            "index": index,
            "dense_reward": dense_reward,
            "shape_range": shape_range,
            "goal_reward": goal_reward,
        })

        # NOTE(review): the meaning of the trailing 1e-5 argument is defined
        # by test_policies(), which is not visible here -- confirm.
        test_rewards = test_policies(tmdp, curr_PMPO.thetas, test_episodes, 1e-5)

        result_dict = {
            "thetas": curr_PMPO.thetas,
            "taus": curr_PMPO.taus,
            "reward_records": curr_PMPO.reward_records,
            "test_rewards": test_rewards,
            "index": index,
        }

        tests_rewards.append(test_rewards)
        experiment_results.append(result_dict)

        # Save artifact to MLFlow; on any failure fall back to a local save
        # so a finished training run is not lost.
        try:
            save_to_mlflow(result_dict, run_name)
        except Exception as e:
            print(f"Error saving to MLFlow: {e}")
            print("Saving locally instead.")
            time.sleep(5)
            path = save_path + f"/{sub_run_name}"
            save(path, result_dict)

In [7]:
def run_experiments(change_map=False, num_runs=10):
    """Run ``num_runs`` seeded experiments under one parent MLflow run.

    Each run i uses ``constants.SEEDS[i]`` and is executed through
    run_experiment(). Afterwards the test rewards of the runs executed by
    this call are saved (MLflow first, local save as fallback) and plotted
    with plot_avg_test_return().

    Args:
        change_map: Forwarded to run_experiment().
        num_runs: Number of runs to execute; must not exceed the number of
            available seeds.

    Raises:
        ValueError: If ``num_runs`` is larger than ``len(constants.SEEDS)``.
    """
    # `time` is only needed by the fallback branches and is not imported
    # explicitly in the notebook's import cell.
    import time

    # Fail before any training starts; otherwise the missing seed would only
    # surface as an IndexError after every available seed had been trained.
    available_seeds = len(constants.SEEDS)
    if num_runs > available_seeds:
        raise ValueError(
            f"num_runs={num_runs} exceeds the {available_seeds} available seeds"
        )

    with mlflow.start_run(run_name=run_name):
        # tests_rewards is a notebook-level list that keeps growing across
        # calls. Remember where this call starts so that the saved artifact
        # and the plot only contain the runs executed here and therefore
        # agree with the reported num_runs.
        first_new = len(tests_rewards)

        for i in range(num_runs):
            seed = constants.SEEDS[i]
            run_experiment(i, seed, run_name, change_map)

        current_tests_rewards = tests_rewards[first_new:]

        experiment_dict = {
            "tests_rewards": current_tests_rewards,
            "num_runs": num_runs,
            "change_map": change_map,
            "num_bins": num_bins,
            "label": label,
        }
        try:
            save_to_mlflow(experiment_dict, run_name)
        except Exception as e:
            print(e)
            print("Something went wrong saving the experiment results to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            save(save_path, experiment_dict)

        rewards_fig = plot_avg_test_return(
            current_tests_rewards, f"CPMPO Avg Rewards on {num_runs} runs"
        )
        try:
            mlflow.log_figure(figure=rewards_fig, artifact_file="reward_image.png")
        except Exception as e:
            print(e)
            print("Something went wrong saving the figure to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            rewards_fig.savefig(save_path + "/reward_image.png")
In [8]:
# Launch the full experiment: num_runs training runs under one parent MLflow
# run. With change_map=False every run generates its map from
# constants.SEEDS[0]; only the per-run seed differs.
run_experiments(change_map=False, num_runs=num_runs)

Current seed for result reproducibility: 2999
Episode: 5000 reward: -1237.4999999999998 length: 22 tau 0.6
Episode: 10000 reward: -837.5000000000002 length: 57 tau 0.6
Episode: 15000 reward: -571.0999999999999 length: 108 tau 0.6
Episode: 20000 reward: -1264.8000000000002 length: 71 tau 0.6
Episode: 25000 reward: -1459.6999999999998 length: 76 tau 0.6
Episode: 30000 reward: -788.7000000000003 length: 75 tau 0.6
Episode: 35000 reward: -2745.2000000000007 length: 99 tau 0.6
Episode: 40000 reward: -873.7 length: 67 tau 0.6
Episode: 45000 reward: -1134.3000000000002 length: 2 tau 0.6
Episode: 50000 reward: -1584.2999999999997 length: 4 tau 0.6
Episode: 55000 reward: -1758.2000000000007 length: 16 tau 0.6
Episode: 60000 reward: -1511.0000000000002 length: 27 tau 0.6
Episode: 65000 reward: -1466.2000000000005 length: 170 tau 0.6
Episode: 70000 reward: -1280.1999999999996 length: 19 tau 0.6
Episode: 75000 reward: -1453.8000000000002 length: 31 tau 0.6
Episode: 80000 reward: -880.1999999999999

KeyboardInterrupt: 

In [None]:
# Size of the test-reward record of the first completed run.
# tests_rewards only receives an entry once a run has finished training and
# testing, so this raises IndexError if no run completed (e.g. after an
# interrupted launch cell).
print(len(tests_rewards[0]))

In [None]:
# Hand-picked subset of completed runs to plot.
num = 8  # NOTE(review): not used by any visible cell -- confirm before removing

# The earlier selection [0, 1, 2, 3, 4, 5] was immediately overwritten, so
# only the effective one is kept as code.
indexes = [6, 9]

# Requires tests_rewards to hold an entry for every selected index.
rew = [tests_rewards[i] for i in indexes]
len(rew)

In [None]:
# Plot only the hand-picked runs in `rew`; the title reports how many runs
# are actually shown rather than the configured num_runs.
rewards_fig = plot_avg_test_return(rew, f"CPMPO Avg Rewards on {len(rew)} runs")