In [None]:
import numpy as np
from model_functions import *
from policy_utils import *
import matplotlib.pyplot as plt
import gymnasium as gym

from algorithms import *
from FrozenLake import *
import pandas as pd
import constants
import math
from policy_utils import *
import torch
import mlflow
from TeleportPPO import TeleportPPO
import optuna
from experiment_result_utils import *
from env_register import *
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
from TMDP_v1 import *
from TeleportRolloutBuffer import *

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Map geometry: a square grid with `nrows` cells per side.
# NOTE(review): nS / nA are not referenced again in the cells shown here
# (later code reads env.nS instead) — presumably state/action counts.
nrows = 20
nS = nrows * nrows
nA = 4

# Seeding: the first entry of constants.SEEDS is reused for the policy seed
# helper, the random map generator and the environment itself.
seed = constants.SEEDS[0]
set_policy_seed(seed)

# Learning-problem parameters, forwarded to the TMDP / PPO cells below.
gamma = 0.9999999
tau = 0.2

# Reward / dynamics options forwarded verbatim to FrozenLakeEnv.
num_bins = 0
dense_reward = False
reward_shape = False
shape_range = (-1, 0)
goal_reward = 1.0
is_slippery = False

env = FrozenLakeEnv(
    is_slippery=is_slippery,
    seed=seed,
    desc=generate_random_map(nrows, seed=seed),
    reward_shape=reward_shape,
    num_bins=num_bins,
    dense_reward=dense_reward,
    shape_range=shape_range,
    goal_reward=goal_reward,
)

# Uniform vector with one entry per environment state (each entry 1 / env.nS).
xi = np.full(env.nS, 1.0 / env.nS)


In [None]:
def linear_schedule(initial_value):
    """
    Build a learning-rate schedule that shrinks linearly with remaining progress.

    :param initial_value: (float) Learning rate used when progress_remaining is 1.
    :return: (function) Callable mapping the remaining progress to a learning rate.
    """
    def schedule(progress_remaining):
        """
        Scale the initial rate by the fraction of training that is still left.

        :param progress_remaining: (float) Goes from 1 (beginning) down to 0.
        :return: (float) Learning rate for the given progress.
        """
        current_rate = progress_remaining * initial_value
        return current_rate

    return schedule

In [None]:
# Wrap the base environment in a TMDP, forwarding the uniform vector `xi`,
# the coefficient `tau` and the discount `gamma` from the configuration cell.
# NOTE(review): xi / tau are presumably the teleport distribution and rate —
# confirm against TMDP_v1.
env_kwargs = {
    "env": env,
    "xi": xi,
    "tau": tau,
    "gamma": gamma,
}
tmdp = TMDP(**env_kwargs)

In [None]:
# Vectorized wrapper holding a single environment: the TMDP instance itself.
vec_env = DummyVecEnv([lambda: tmdp])

# Constructor arguments for TeleportPPO. The learning rate follows the linear
# schedule defined above, starting at 0.15 and scaled by remaining progress.
model_kwargs = dict(
    rollout_buffer_class=TeleportRolloutBuffer,
    verbose=0,
    learning_rate=linear_schedule(0.15),
    gamma=gamma,
    n_steps=2048,
    batch_size=256,
    ent_coef=0.01,
    gae_lambda=0.95,
    clip_range=0.2,
    n_epochs=5,
    tensorboard_log='./ppo_frozenlake_tensorboard/',
)

# Arguments for TeleportPPO.learn. total_timesteps equals 6000 * n_steps.
# NOTE(review): eps_shift / max_eps_model / check_convergence /
# static_curriculum are TeleportPPO-specific options — see TeleportPPO for
# their meaning.
learn_kwargs = dict(
    total_timesteps=12_288_000,
    callback=TeleportFilterCallback(),
    eps_shift=0.003,
    max_eps_model=0.0001,
    check_convergence=False,
    static_curriculum=True,
)

model = TeleportPPO("MlpPolicy", vec_env, **model_kwargs)
model.learn(**learn_kwargs)

In [None]:
# Roll out the trained policy greedily (deterministic=True) with rendering,
# stopping when the episode reports done or after the step cap is exceeded.
obs = vec_env.reset()
tmdp.env.render_mode = 'human'
tmdp.update_tau(0.)  # set tau to zero on the TMDP for this rollout

done, truncated, step = False, False, 0
while not (done or truncated):
    action, _states = model.predict(obs, deterministic=True)
    print(action)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    step += 1
    # Manual cap: the flag flips once more than 100 steps have been taken.
    truncated = step > 100
    print(step, reward, done, info)
#vec_env.close()

In [None]:
# Persist the trained model to the relative path "frozenlake_ppo".
model.save("frozenlake_ppo")