In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import math
import mlflow
import optuna
import os

from TMDP import TMDP
from algorithms import *
from model_functions import *
from policy_utils import *
from experiment_result_utils import *
from constants import *

from FrozenLake import *
from CurriculumPMPO import CurriculumPMPO

In [2]:
# --- Environment layout and reward configuration ---------------------------
nrows = 30
num_bins = 0
is_slippery = False
# NOTE: reward_shape and dense_reward are re-assigned from num_bins by the
# `if num_bins == 0` cell that follows, so the values here are only defaults.
reward_shape = False
dense_reward = True
shape_range = (-1, 0)
goal_reward = 1.0

# --- Experiment budget -----------------------------------------------------
num_runs = 10
episodes = 4_900_000
checkpoint_step = 500
test_episodes = 1000

# --- Training switches -----------------------------------------------------
debug = False
param_decay = True
lam = 1

# --- Accumulators appended to by run_experiment (one entry per run) --------
experiment_results, exp_taus, tests_returns, tests_lens = [], [], [], []

In [3]:
# Reward shaping and dense rewards are enabled together, and only when
# num_bins is non-zero; tau discounting is enabled in the opposite case.
reward_shape = dense_reward = bool(num_bins != 0)
discount_tau = not reward_shape

In [4]:
# Frozen Lake environment constants
nA = 4                      # number of actions
nS = nrows**2               # one state per cell of the nrows x nrows grid
tau = 0.2                   # initial tau handed to TMDP
gamma = 0.99999999          # passed to TMDP as `gamma`
# Uniform vector over the nS states (every entry equals 1/nS).
xi = np.full(nS, 1/nS)

In [5]:
# --- Learning rates --------------------------------------------------------
model_lr = 0.07600227185743516
pol_lr = 0.3153456231864957

# --- Temperature schedule and entropy bonus --------------------------------
temp = 1.152622296227941
final_temp = 1e-4
entropy_coef = 0.0010124439030950623

# --- Batch settings --------------------------------------------------------
batch_size = 32
epochs = 4

# --- Thresholds forwarded to CurriculumPMPO.train --------------------------
eps_ppo = 0.2
eps_shift = 4e-3  # 1.5 best
max_eps_model = 0.5*1.5e-3
check_convergence = True

In [6]:
# Names identifying this experiment, its parent run and the local fallback
# directory used when MLflow logging fails.
experiment_name = f"FrozenLake_{nrows}x{nrows}_{num_bins}"
run_name = f"CurrPMPO_{num_bins}"
label = run_name.partition("_")[0]  # text before the first underscore
save_path = f"results/{experiment_name}/run_{run_name}"

# Point MLflow at the tracking server and select the experiment.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment_id = get_or_create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [7]:
def run_experiment(index, seed, run_name, change_map=False):
    """Train and evaluate one CurriculumPMPO run inside a nested MLflow run.

    Hyperparameters and environment settings are read from the notebook's
    module-level configuration. The outcome is appended to the module-level
    lists ``tests_returns``, ``experiment_results``, ``tests_lens`` and
    ``exp_taus``; nothing is returned.

    Args:
        index: Position of this run in the sweep; used in the nested run name
            and stored in the result dictionary.
        seed: Seed handed to the policy, the environment and the TMDP.
        run_name: Name of the parent run; the nested run is called
            ``f"{run_name}_{index}"``.
        change_map: When truthy the map is generated from ``seed``; otherwise
            every run uses the map generated from ``constants.SEEDS[0]``.
    """
    nested_name = f"{run_name}_{index}"

    with mlflow.start_run(nested=True, run_name=nested_name):
        # Environment specific configuration
        if change_map:
            map_seed = seed
        else:
            map_seed = constants.SEEDS[0]
        set_policy_seed(seed)
        lake_map = generate_random_map(nrows, seed=map_seed)
        env = FrozenLakeEnv(
            is_slippery=is_slippery,
            seed=seed,
            desc=lake_map,
            reward_shape=reward_shape,
            num_bins=num_bins,
            dense_reward=dense_reward,
            shape_range=shape_range,
            goal_reward=goal_reward,
        )

        # Environment independent configuration
        tmdp = TMDP(
            env,
            xi,
            tau=tau,
            gamma=gamma,
            discount_tau=discount_tau,
            seed=seed,
        )
        tmdp.update_tau(tau)
        curr_PMPO = CurriculumPMPO(tmdp, checkpoint_step=checkpoint_step)

        train_kwargs = dict(
            batch_size=batch_size,
            lam=lam,
            temp=temp,
            final_temp=final_temp,
            episodes=episodes,
            eps_ppo=eps_ppo,
            eps_shift=eps_shift,
            param_decay=param_decay,
            epochs=epochs,
            max_length=1000,
            entropy_coef=entropy_coef,
            debug=debug,
            max_eps_model=max_eps_model,
            check_convergence=check_convergence,
        )
        curr_PMPO.train(model_lr, pol_lr, **train_kwargs)

        # Mean of the last ten reward records, scaled down by the batch size.
        last_records = curr_PMPO.reward_records[-10:]
        avg_return = np.average(last_records)/batch_size
        mlflow.log_metric("Avg Return", avg_return)

        # Evaluate the stored parameter sets; element 0 holds the returns and
        # element 1 the episode lengths.
        evaluation = test_policies_len(tmdp, curr_PMPO.thetas, test_episodes, 1e-5)
        eval_returns = evaluation[0]
        eval_lens = evaluation[1]

        result_dict = {
            "thetas" : curr_PMPO.thetas,
            "taus" : curr_PMPO.taus,
            "reward_records" : curr_PMPO.reward_records,
            "test_policies_return" : eval_returns,
            "index" : index,
            "test_pol_len" : eval_lens,
        }

        # Publish the outcome to the notebook-level accumulators.
        tests_returns.append(eval_returns)
        experiment_results.append(result_dict)
        tests_lens.append(eval_lens)
        exp_taus.append(curr_PMPO.taus)

In [8]:
def run_experiments(change_map=False, num_runs=10):
    """Run ``num_runs`` seeded experiments under one parent MLflow run.

    Each run is delegated to ``run_experiment`` with the seed
    ``constants.SEEDS[i]``. The per-run results are then padded to a common
    length, tagged, and saved to MLflow together with a plot of the test
    returns. If an MLflow upload fails, the artifact is written under
    ``save_path`` instead.

    The module-level accumulators (``tests_returns``, ``tests_lens``,
    ``exp_taus``, ``experiment_results``) are emptied first, so re-running the
    cell does not mix results from an earlier or interrupted invocation into
    the new aggregate.

    Args:
        change_map: Forwarded to ``run_experiment``; when falsy every run uses
            the same map.
        num_runs: Number of runs to execute; must be at least 1 and no larger
            than the number of entries in ``constants.SEEDS``.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
        IndexError: If ``num_runs`` exceeds the available seeds. Raised before
            any training starts.
    """
    # Local import: the notebook's import cell never imports `time` explicitly,
    # so the fallback paths below must not rely on a star import leaking it.
    import time

    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Resolve every seed up front so an out-of-range num_runs fails now rather
    # than after the earlier (long) runs have already been trained.
    seeds = [constants.SEEDS[i] for i in range(num_runs)]

    # Start from empty accumulators so the aggregate matches `num_runs`.
    for accumulator in (tests_returns, tests_lens, exp_taus, experiment_results):
        accumulator.clear()

    with mlflow.start_run(run_name=run_name):

        for i, seed in enumerate(seeds):
            run_experiment(i, seed, run_name, change_map)

        pad_returns = pad_to_same_length(tests_returns)
        pad_lens = pad_to_same_length(tests_lens)
        pad_taus = pad_to_same_length(exp_taus)
        experiment_dict = {
            "tests_returns": pad_returns,
            "taus": pad_taus,
            "tests_lens": pad_lens,
            "num_runs": num_runs,
            "change_map": change_map,
            "num_bins": num_bins,
            "label": label,
            "checkpoint_step": checkpoint_step,
        }

        mlflow.set_tags(tags={
            "change_map": change_map,
            # Seed of the last sub-run only (the loop variable after the sweep).
            "seed": seed,
            "tau": tau,
            "gamma": gamma,
            "checkpoint_step": checkpoint_step,
            "test_episodes": test_episodes,
            "dense_reward": dense_reward,
            "shape_range": shape_range,
            "goal_reward": goal_reward,
            "reward_shape": reward_shape,
            "episodes": episodes,
            "model_lr": model_lr,
            "pol_lr": pol_lr,
            "temp": temp,
            "final_temp": final_temp,
            "batch_size": batch_size,
            "epochs": epochs,
            "lam": lam,
            "eps_ppo": eps_ppo,
            # NOTE(review): eps_shift is logged under the key "eps_model";
            # kept as-is so existing MLflow queries keep working.
            "eps_model": eps_shift,
            "entropy_coef": entropy_coef,
            "max_eps_model": max_eps_model,
            "param_decay": param_decay,
        })

        # Best-effort upload of the aggregated results, with a local fallback.
        try:
            save_to_mlflow(experiment_dict, run_name)
        except Exception as e:
            print(e)
            print("Something went wrong saving the experiment results to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            save(save_path, experiment_dict)

        rewards_fig = plot_avg_test_return(tests_returns, f"CPMPO Avg Rewards on {num_runs} runs")
        try:
            mlflow.log_figure(figure=rewards_fig, artifact_file="reward_image.png")
        except Exception as e:
            print(e)
            print("Something went wrong saving the figure to MLFlow.")
            print("Saving locally instead.")
            time.sleep(5)
            # savefig does not create missing directories; make sure the
            # target exists even if the results were uploaded successfully.
            os.makedirs(save_path, exist_ok=True)
            rewards_fig.savefig(save_path+"/reward_image.png")

In [9]:
# Number of runs currently stored in tests_returns (0 until run_experiment
# has appended something).
print(len(tests_returns))

0


In [10]:
# Launch the sweep: num_runs sequential runs, all on the same map
# (change_map=False keeps the map seed fixed at constants.SEEDS[0]).
run_experiments(change_map=False, num_runs=num_runs)

Current seed for result reproducibility: 2999
Episode: 5000 reward: 0.0 tau 0.1958524236679078 d_inf_pol 0.0033589303493499756
Episode: 10000 reward: 0.0 tau 0.19218433141708394 d_inf_pol 0.0032148361206054688
Episode: 15000 reward: 0.0 tau 0.18874710911512402 d_inf_pol 0.0028461962938308716
Episode: 20000 reward: 0.0 tau 0.18782789909839656 d_inf_pol 0.0035572797060012817
Episode: 25000 reward: 0.0 tau 0.1866909002661708 d_inf_pol 0.005071684718132019
Episode: 30000 reward: 0.0 tau 0.18557876205444368 d_inf_pol 0.003821432590484619
Episode: 35000 reward: 0.0 tau 0.1825787620544438 d_inf_pol 0.18380434811115265
Episode: 40000 reward: 1.6401508079767217 tau 0.1799245960116391 d_inf_pol 0.20138096809387207
Episode: 45000 reward: 0.8207874709963794 tau 0.1792125290036206 d_inf_pol 0.25259414315223694
Episode: 50000 reward: 0.0 tau 0.17659492689371162 d_inf_pol 0.004947870969772339
Episode: 55000 reward: 0.0 tau 0.17441158121824324 d_inf_pol 0.14851616322994232
Episode: 60000 reward: 0.0 t

KeyboardInterrupt: 

In [None]:
# Inspect the stored results: the keys of the first run's result dictionary,
# then the last entry of "thetas" for runs 0 and 1.
print(experiment_results[0].keys())
print(experiment_results[0]["thetas"][-1])
print(experiment_results[1]["thetas"][-1])

dict_keys(['thetas', 'taus', 'reward_records', 'test_policies_return', 'index', 'test_pol_len'])
[[-0.82809897  3.94998805 -1.87924164 -1.24264744]
 [ 1.32444795  0.13188454 -0.50298187 -0.95335063]
 [ 2.77125782 -0.99827071 -1.22295777 -0.55002934]
 ...
 [-2.06517896  0.503848    3.43545147 -1.87412051]
 [-1.00139364  0.02189496  2.05607031 -1.07657162]
 [ 0.          0.          0.          0.        ]]
[[-0.61608145  3.70679961 -1.51529603 -1.57542213]
 [-0.74679782  2.66977515 -1.12808598 -0.79489136]
 [ 2.13662224 -0.6923199  -1.2292258  -0.21507654]
 ...
 [-1.06147321 -1.66283592  4.58061616 -1.85630704]
 [-0.6601209  -0.17127968  1.96823254 -1.13683195]
 [ 0.          0.          0.          0.        ]]


In [None]:
# Rebuild the environment of the first training run (seed and map seed are
# both SEEDS[0]) so a learned policy can be replayed on it.
map_seed = seed = SEEDS[0]
set_policy_seed(seed)
env = FrozenLakeEnv(is_slippery=is_slippery, seed=seed, 
                desc=generate_random_map(nrows, seed=map_seed), 
                reward_shape=reward_shape,
                num_bins=num_bins,
                dense_reward=dense_reward,
                shape_range=shape_range,
                goal_reward=goal_reward,
                )

# Environment independent configuration
# discount_tau is forwarded so this TMDP is configured exactly like the one
# built in run_experiment; previously it silently fell back to TMDP's default.
tmdp = TMDP(env, xi, tau=tau, 
            gamma=gamma, discount_tau=discount_tau,
            seed=seed)
tmdp.update_tau(tau)

Current seed for result reproducibility: 2999


In [None]:
# Build a policy from the last stored parameter set of run 0.
# NOTE(review): the second argument (1e-3) is presumably the softmax
# temperature — confirm against get_softmax_policy.
pi = get_softmax_policy(experiment_results[0]["thetas"][-1], 1e-3)


In [None]:

# Replay the policy `pi` for a single episode with tau set to 0, printing
# every transition.
tmdp.update_tau(0.)
tmdp.reset()
done = False
step = 0
env.render_mode = "human"
# Stop once the step counter exceeds min(100, nrows*3), or earlier when the
# environment reports the episode as done.
while step <= min(100,nrows*3):
    s = tmdp.env.s
    a = select_action(pi[s])
    s_prime, reward, flags, prob = tmdp.step(a)
    print("state {} action {} next state {} reward {}".format(s, a, s_prime, reward))
    if flags["done"]:
        # Leave the environment reset before ending the replay.
        tmdp.reset()
        break
    step += 1


state 0 action 1 next state 30 reward 0.0
state 30 action 2 next state 31 reward 0.0
state 31 action 1 next state 61 reward 0.0
state 61 action 1 next state 91 reward 0.0
state 91 action 1 next state 121 reward 0.0
state 121 action 2 next state 122 reward 0.0
state 122 action 1 next state 152 reward 0.0
state 152 action 1 next state 182 reward 0.0
state 182 action 1 next state 212 reward 0.0
state 212 action 0 next state 211 reward 0.0
state 211 action 1 next state 241 reward 0.0
state 241 action 1 next state 271 reward 0.0
state 271 action 2 next state 272 reward 0.0
state 272 action 1 next state 302 reward 0.0
state 302 action 2 next state 303 reward 0.0
state 303 action 2 next state 304 reward 0.0
state 304 action 1 next state 334 reward 0.0
state 334 action 2 next state 335 reward 0.0
state 335 action 2 next state 336 reward 0.0
state 336 action 2 next state 337 reward 0.0
state 337 action 1 next state 367 reward 0.0
state 367 action 2 next state 368 reward 0.0
state 368 action 2 n