In [1]:
import math
import os
import time

import gymnasium as gym
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna
from mlflow.tracking import MlflowClient

import constants
from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from FrozenLake import *
from CurriculumQ import CurriculumQ

In [2]:
# --- Environment layout and reward options (forwarded to FrozenLakeEnv) ---
nrows = 30                # grid side length; also used to size the state space
num_bins = 15             # also embedded in the run / experiment names
is_slippery = False
reward_shape = True
dense_reward = True
shape_range = (-1, 0)
goal_reward = 1.0

# --- Experiment size ---
num_runs = 10             # number of seeded repetitions
episodes = 4_900_000      # training episodes handed to CurriculumQ.train
checkpoint_step = 500
test_episodes = 1_000     # evaluation episodes handed to test_Q_policies

# --- Training switches ---
lam = 1
param_decay = True
debug = False

# --- Accumulators filled in by run_experiment (one entry per run) ---
experiment_results = []
tests_returns = []

In [3]:
# Frozen Lake environment constants
nS = nrows * nrows            # one entry per cell of the nrows x nrows grid
nA = 4                        # presumably the number of actions -- not used in this section
gamma = 0.999                 # forwarded to TMDP and compute_eps_model
tau = 0.                      # forwarded to TMDP and compute_eps_model
xi = np.full(nS, 1 / nS)      # uniform vector of length nS, passed to TMDP as `xi`

In [4]:
# Names used for MLflow runs and for the local fallback directory.
run_name = f"Q_{num_bins}"
label = run_name.partition("_")[0]   # text before the first underscore
experiment_name = f"FrozenLake_{nrows}x{nrows}_{num_bins}"
save_path = f"results/{experiment_name}/run_{run_name}"

# Point MLflow at the tracking server and select (creating if needed) the experiment.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [5]:
# Hyperparameters forwarded to CurriculumQ.train
exp_rate = 0.4
batch_size = 20
model_lr = 0.15869281717397965

# Derived from gamma, tau and a tenth of the training episodes.
eps_model = compute_eps_model(gamma, tau, episodes / 10)

In [6]:
def run_experiment(index, seed, run_name, change_map=False):
    """Train and evaluate one seeded CurriculumQ run inside a nested MLflow run.

    The function builds a FrozenLake environment wrapped in a TMDP, trains a
    CurriculumQ learner with the notebook-level hyperparameters, logs the
    metric / params / tags to MLflow, evaluates the stored Q tables with
    ``test_Q_policies`` and records the outcome in the module-level lists
    ``tests_returns`` and ``experiment_results``.

    Args:
        index: Position of this run in the experiment; appended to the
            MLflow sub-run name and stored with the results.
        seed: Seed passed to ``set_policy_seed``, ``FrozenLakeEnv`` and ``TMDP``.
        run_name: Name of the parent run; the sub-run is ``"<run_name>_<index>"``.
        change_map: When true the map is generated from ``seed``; otherwise
            every run uses the map generated from ``constants.SEEDS[0]``.

    Returns:
        None. Results are appended to the module-level accumulators and
        saved to MLflow (or locally under ``save_path`` if that fails).
    """
    sub_run_name = f"{run_name}_{index}"

    with mlflow.start_run(nested=True, run_name=sub_run_name):
        # ---- Environment specific configuration ----
        if change_map:
            map_seed = seed
        else:
            map_seed = constants.SEEDS[0]
        set_policy_seed(seed)
        lake_map = generate_random_map(nrows, seed=map_seed)
        env = FrozenLakeEnv(
            is_slippery=is_slippery,
            seed=seed,
            desc=lake_map,
            reward_shape=reward_shape,
            num_bins=num_bins,
            dense_reward=dense_reward,
            shape_range=shape_range,
            goal_reward=goal_reward,
        )

        # ---- Environment independent configuration ----
        tmdp = TMDP(env, xi, tau=tau, gamma=gamma, seed=seed)
        tmdp.update_tau(tau)
        learner = CurriculumQ(tmdp, checkpoint_step=checkpoint_step)

        learner.train(
            model_lr,
            batch_size=batch_size,
            lam=lam,
            exp_rate=exp_rate,
            episodes=episodes,
            eps_model=eps_model,
            param_decay=param_decay,
            debug=debug,
        )

        # Mean of the last ten reward records, divided by the batch size.
        recent_rewards = learner.reward_records[-10:]
        avg_return = np.average(recent_rewards) / batch_size
        mlflow.log_metric("Avg Return", avg_return)

        logged_params = {
            "episodes": learner.episodes,
            "model_lr": model_lr,
            "batch_size": batch_size,
            "lam": lam,
            "eps_model": eps_model,
            "exp_rate": exp_rate,
        }
        mlflow.log_params(logged_params)

        run_tags = {
            "run_name": run_name,
            "change_map": change_map,
            "seed": seed,
            "tau": tau,
            "gamma": gamma,
            "checkpoint_step": checkpoint_step,
            "test_episodes": test_episodes,
            "index": index,
            "dense_reward": dense_reward,
            "shape_range": shape_range,
            "goal_reward": goal_reward,
            "reward_shape": reward_shape,
        }
        mlflow.set_tags(tags=run_tags)

        # ---- Evaluation of the stored Q tables ----
        test_policies_return = test_Q_policies(tmdp, learner.Qs, test_episodes)

        run_results = {
            "Qs" : learner.Qs,
            "taus" : learner.taus,
            "reward_records" : learner.reward_records,
            "test_policies_return" : test_policies_return,
            "index" : index,
        }

        tests_returns.append(test_policies_return)
        experiment_results.append(run_results)

        # Prefer MLflow for the artifact; fall back to a local file on any failure.
        try:
            save_to_mlflow(run_results)
        except Exception as err:
            print(f"Error saving to MLFlow: {err}")
            print("Saving locally instead.")
            time.sleep(5)
            local_path = f"{save_path}/{sub_run_name}"
            save(local_path, run_results)

In [7]:
def run_experiments(change_map=False, num_runs=10):
    """Run ``num_runs`` seeded experiments under one parent MLflow run and plot them.

    Each run ``i`` uses ``constants.SEEDS[i]`` and is executed by
    ``run_experiment``. Afterwards the test returns gathered during *this*
    call are saved as an artifact and plotted with ``plot_avg_test_return``.
    Both the artifact and the figure go to MLflow, with a local fallback
    under ``save_path`` when MLflow logging fails.

    Args:
        change_map: Forwarded to ``run_experiment``; when false every run
            shares the same map.
        num_runs: Number of seeded runs to execute.

    Raises:
        ValueError: If ``num_runs`` exceeds the number of available seeds.
            Checked up front so the failure does not surface only after
            earlier runs have already finished training.
    """
    available_seeds = len(constants.SEEDS)
    if num_runs > available_seeds:
        raise ValueError(
            f"num_runs={num_runs} exceeds the {available_seeds} seeds in constants.SEEDS"
        )

    with mlflow.start_run(run_name=run_name):
        # run_experiment appends to the module-level `tests_returns`; remember
        # where this call starts so results left over from a previous execution
        # of the cell are not mixed into this experiment's artifact and plot.
        first_new = len(tests_returns)

        for i in range(num_runs):
            seed = constants.SEEDS[i]
            run_experiment(i, seed, run_name, change_map)

        run_returns = tests_returns[first_new:]

        experiment_dict = {
            "tests_returns": run_returns,
            "num_runs": num_runs,
            "change_map": change_map,
            "num_bins": num_bins,
            "label": label,
        }
        try:
            save_to_mlflow(experiment_dict)
        except Exception as e:
            print(e)
            print("Something went wrong saving the experiment results to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            save(save_path, experiment_dict)

        # `label` is the run-name prefix; slicing `run_name[:-3]` was only
        # correct when the numeric suffix had exactly two digits.
        rewards_fig = plot_avg_test_return(run_returns, f"{label} Avg Rewards on {num_runs} runs")
        try:
            mlflow.log_figure(figure=rewards_fig, artifact_file="reward_image.png")
        except Exception as e:
            print(e)
            print("Something went wrong saving the figure to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            # savefig does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            rewards_fig.savefig(save_path+"/reward_image.png")

In [8]:
# Launch the full experiment: `num_runs` seeded runs that all share the same map
# (change_map=False). Long-running: every run is trained with `episodes` episodes.
run_experiments(change_map=False, num_runs=num_runs)

Current seed for result reproducibility: 2999
Episode: 5000 reward: -17057.933333333258 length: 652
Episode: 10000 reward: -15624.199999999968 length: 23
Episode: 15000 reward: -15855.13333333327 length: 176
Episode: 20000 reward: -16128.39999999992 length: 300
Episode: 25000 reward: -15362.599999999971 length: 159
Episode: 30000 reward: -14657.799999999947 length: 34
Episode: 35000 reward: -15292.933333333256 length: 354
Episode: 40000 reward: -15043.333333333296 length: 231
Episode: 45000 reward: -14624.533333333295 length: 298
Episode: 50000 reward: -14743.133333333295 length: 441
Episode: 55000 reward: -15213.999999999936 length: 215
Episode: 60000 reward: -14817.866666666652 length: 514
Episode: 65000 reward: -15040.466666666616 length: 198
Episode: 70000 reward: -15576.533333333296 length: 30
Episode: 75000 reward: -15676.066666666617 length: 87
Episode: 80000 reward: -13883.39999999996 length: 341
Episode: 85000 reward: -14295.799999999927 length: 125
Episode: 90000 reward: -141