# Setup

In [2]:
import os
# Configure JAX through environment variables before any JAX-based import below.
os.environ['JAX_PLATFORMS'] = 'cpu'  # Run on CPU: usually faster than GPU for MCMC in NumPyro (which uses JAX)
os.environ['JAX_ENABLE_X64'] = "True" # Double precision is needed to avoid NaNs in posterior sampling

import random
from experiments.utils import save_configurations

from constants import (
    MIN_SEED, 
    MAX_SEED, 
    SARSOP_POMDPSOL_PATH,
    CONFIG_DIR
)

# RiverSwim

In [None]:
# Environment size and per-episode horizon forwarded to the env config below.
RIVER_LENGTH = 6
HORIZON = 40
ENV_PARAMS = dict(
    transition_params=[0.6, 0.35, 0.05],  # P(no move | right), P(went right | right), P(went left | right)
    observation_params=[0.6, 0.2, 0.2],   # P(no obs error), P(Left obs error), P(Right obs error)
    reward_params=[5 / 1000, 1],          # left-most state reward, right-most state reward
    init_state_probs=[1.0] + [0] * (RIVER_LENGTH - 1),  # all initial mass on the first state
)

exp_name = 'river_swim_generic_transitions'
n_runs = 20

# The special experiment name 'test' gets a shorter episode budget.
if exp_name == 'test':
    n_episodes = 50
else:
    n_episodes = 200

# Output locations for this experiment; created up front so later stages can write into them.
exp_dir = f'experiments/{exp_name}'
result_dir = f'{exp_dir}/results'
tmp_dir = f'{exp_dir}/tmp'
for out_dir in (result_dir, tmp_dir):
    os.makedirs(out_dir, exist_ok=True)

config_path = f'{CONFIG_DIR}/{exp_name}.json'

# Seeding with the experiment name makes every seed drawn below reproducible per experiment.
random.seed(exp_name)

# Experiments whose name contains 'discounted' share one discount between planner and env;
# otherwise the env is undiscounted and the planner uses a discount just below 1.
if 'discounted' in exp_name:
    planner_discount = 0.95
    env_discount = planner_discount
else:
    planner_discount = 0.9999
    env_discount = 1.0

env_config = {
    'id': "pomdp_envs:RiverSwim-v0",
    'discount': env_discount,
    'horizon': HORIZON,
    'river_length': RIVER_LENGTH,
    'params': ENV_PARAMS,
}

infer_config = {
    'prior_name': 'river_swim_pomdp_prior_generic_transitions',
    'model_name': 'river_swim_pomdp_model_generic_transitions',
    'mcmc_args': {
        'num_warmup': 1000,
        'num_samples': 2000,
        'num_chains': 4,
    },
}

# Eval is run after all runs of the online experiment are done, so a single seed is enough.
# It is drawn before any per-run seed so the random stream is consumed in a fixed order.
eval_seed = random.randint(MIN_SEED, MAX_SEED)

global_config = {
    'exp_name': exp_name,
    'tmp_dir': tmp_dir,
    'result_dir': result_dir,
    'n_runs': n_runs,
    'n_episodes': n_episodes,
    'env_config': env_config,
    'infer_config': infer_config,
    'known_params': ['reward_params', 'init_state_probs', 'n_states', 'n_actions', 'n_obs'],
    'eval_seed': eval_seed,
    # Eval result stores the policy value (i.e., expected return) for each run and each episode.
    'eval_result_path': f'{result_dir}/eval_results.csv',
}

# One entry under 'global', plus one entry per run keyed by the integer run id.
configs = {'global': global_config}

for run_id in range(n_runs):
    # Draw order matters for reproducibility: np_seed first, then jr_seed.
    np_seed = random.randint(MIN_SEED, MAX_SEED)
    jr_seed = random.randint(MIN_SEED, MAX_SEED)
    run_tag = f'run{run_id}'

    sarsop_args = {
        'pomdpsol_path': SARSOP_POMDPSOL_PATH,
        'timeout': 60,
        'memory': 1024,
        'precision': 0.05,
        'policy_path': f'{tmp_dir}/{run_tag}_sarsop.policy',
        'logfile': f'{tmp_dir}/{run_tag}_sarsop.log',
    }

    configs[run_id] = {
        'run_id': run_id,
        'np_seed': np_seed,
        'jr_seed': jr_seed,
        'fig_path': f'{result_dir}/{run_tag}_eval_plot.jpg',
        'log_path': f'{exp_dir}/{run_tag}.log',
        'result_path': f'{result_dir}/{run_tag}_trajectories.pkl',
        'planner_config': {
            'pomdp_path': f'{tmp_dir}/{run_tag}.pomdp',
            'discount': planner_discount,   # Planner's discount could be different from env
            'sarsop_args': sarsop_args,
        },
    }

save_configurations(configs, config_path)

# Random POMDP

In [None]:
import numpyro as npyro
from agents.inference import generic_transition_kernel_prior, generic_observation_kernel_prior
import jax.random as jr

def sample_transition_kernel(key, n_states, n_actions):
    """Draw one transition kernel from `generic_transition_kernel_prior`.

    The prior is wrapped in NumPyro's `seed` handler with `key`, then called
    with `(n_states, n_actions)`; whatever the prior returns is returned as-is.
    """
    seeded_prior = npyro.handlers.seed(generic_transition_kernel_prior, key)
    return seeded_prior(n_states, n_actions)

def sample_observation_kernel(key, n_states, n_obs):
    """Draw one observation kernel from `generic_observation_kernel_prior`.

    The prior is wrapped in NumPyro's `seed` handler with `key`, then called
    with `(n_states, n_obs)`; whatever the prior returns is returned as-is.
    """
    seeded_prior = npyro.handlers.seed(generic_observation_kernel_prior, key)
    return seeded_prior(n_states, n_obs)


# Problem size shared by every sampled POMDP in this cell.
N_STATES = 10
N_OBS = N_STATES
N_ACTIONS = 4


HORIZON = 20
KEY = jr.key(2024)
repeats = 5

# Generate `repeats` independent experiments, each with its own sampled transition kernel
# and its own configuration file.
for pomdp_idx in range(repeats):
    # Carry KEY forward and split off fresh sub-keys so every POMDP gets independent randomness.
    KEY, transit_key, obs_key = jr.split(KEY, 3)
    transition_kernel = sample_transition_kernel(transit_key, N_STATES, N_ACTIONS)
    # Arguments follow the helper's signature (key, n_states, n_obs). They were previously
    # passed swapped, which only went unnoticed because N_OBS == N_STATES.
    observation_kernel = sample_observation_kernel(obs_key, N_STATES, N_OBS)
    # NOTE(review): observation_kernel is sampled but never stored in ENV_PARAMS; the env is
    # configured with the fixed 'observation_params' below instead. Confirm this is intended.
    ENV_PARAMS = {
        'transition_kernel': transition_kernel.tolist(),  # sampled kernel, converted to nested lists for serialization
        # NOTE(review): the two entries below are carried over from the RiverSwim cell; confirm
        # that SparseRewardPOMDP-v0 interprets them the same way.
        'observation_params': [0.6, 0.2, 0.2], # P(no obs error), P(Left obs error), P(Right obs error)  
        'reward_params': [5/1000, 1], # left-most state reward, right-most state reward
        'init_state_probs': [1.0] + [0] * (N_STATES - 1),
    }

    exp_name = f'randomSparseRewardPOMDP_{pomdp_idx}'
    n_runs = 20
    n_episodes = 50 if exp_name == 'test' else 500

    exp_dir = f'experiments/{exp_name}'
    result_dir = f'{exp_dir}/results'
    tmp_dir = f'{exp_dir}/tmp'

    # Creating configurations
    os.makedirs(result_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    config_path = f'{CONFIG_DIR}/{exp_name}.json'

    # Seeding with the experiment name makes the seeds drawn below reproducible per experiment.
    random.seed(exp_name)

    planner_discount = 0.95 if 'discounted' in exp_name else 0.9999
    env_discount = planner_discount if 'discounted' in exp_name else 1.0

    global_config = {
        'exp_name': exp_name,
        'tmp_dir': tmp_dir,
        'result_dir': result_dir,
        'n_runs': n_runs,
        'n_episodes': n_episodes,
        'env_config': {
            'id': "pomdp_envs:SparseRewardPOMDP-v0",
            'discount': env_discount,
            'horizon': HORIZON,
            'n_states': N_STATES,
            'n_actions': N_ACTIONS,
            'n_obs': N_OBS,
            'params': ENV_PARAMS,
        },
        'infer_config': {
            'prior_name': 'generic_pomdp_prior_known_init_state', # 'randomT_pomdp_prior',
            'model_name': 'generic_pomdp_model_known_init_state', # 'randomT_pomdp_model',
            'mcmc_args': {
                'num_warmup': 1000,
                'num_samples': 2000, 
                'num_chains': 4,
            }
        },
        'known_params': ['reward_params', 'init_state_probs', 'n_states', 'n_actions', 'n_obs'],
        # Eval is to be run after all runs of the online experiment are done, so only one seed is needed.
        'eval_seed': random.randint(MIN_SEED, MAX_SEED),
        # Eval result stores policy value (i.e., expected return) for each run and each episode.
        'eval_result_path': f'{result_dir}/eval_results.csv',
    }

    configs = {'global': global_config}

    # A distinct loop variable keeps the per-run index from shadowing the outer POMDP index.
    for run_id in range(n_runs):
        configs[run_id] = {
            'run_id': run_id,
            'np_seed': random.randint(MIN_SEED, MAX_SEED),
            'jr_seed': random.randint(MIN_SEED, MAX_SEED),
            'fig_path' : f'{result_dir}/run{run_id}_eval_plot.jpg',
            'log_path':  f'{exp_dir}/run{run_id}.log',
            'result_path' : f'{result_dir}/run{run_id}_trajectories.pkl',
            'planner_config': {
                'pomdp_path': f'{tmp_dir}/run{run_id}.pomdp',
                'discount': planner_discount,   # Planner's discount could be different from env
                'sarsop_args': {
                    'pomdpsol_path': SARSOP_POMDPSOL_PATH,
                    'timeout': 60,
                    'memory': 1024,
                    'precision': 0.05,
                    'policy_path': f'{tmp_dir}/run{run_id}_sarsop.policy',
                    'logfile': f'{tmp_dir}/run{run_id}_sarsop.log',
                }
            },
        }

    save_configurations(configs, config_path)

# Tiger

In [None]:
# Tiger experiment settings; theta is forwarded to the env config and encoded in the experiment name.
theta = 0.2
exp_name = f'new_tiger_theta{int(theta * 10)}_discounted'
n_runs = 20
horizon = 20

# The special experiment name 'test' gets a shorter episode budget.
if exp_name == 'test':
    n_episodes = 20
else:
    n_episodes = 500

# All Tiger experiments share one root directory; results and temp files are split per experiment.
exp_dir = './experiments/tiger'
config_dir = f'{exp_dir}/configs'
result_dir = f'{exp_dir}/{exp_name}/results'
tmp_dir = f'{exp_dir}/tmp/{exp_name}'

for out_dir in (config_dir, result_dir, tmp_dir):
    os.makedirs(out_dir, exist_ok=True)

config_path = f'{config_dir}/{exp_name}.json'

# Seeding with the experiment name makes the per-run seeds drawn below reproducible.
random.seed(exp_name)

# Experiments whose name contains 'discounted' share one discount between planner and env;
# otherwise the env is undiscounted and the planner uses a discount just below 1.
if 'discounted' in exp_name:
    planner_discount = 0.99
    env_discount = planner_discount
else:
    planner_discount = 0.999
    env_discount = 1.0

env_config = {
    'id': "pomdp_envs:Tiger-v0",
    'discount': env_discount,
    'horizon': horizon,
    'theta': theta,
    'listen_cost': -1,
    'treasure_reward': 10,
    'tiger_penalty': -100,
}

mcmc_config = {
    'num_warmup': 5000,
    'num_samples': 5000,
    'num_chains': 5,
}

global_config = {
    'exp_name': exp_name,
    'exp_dir': exp_dir,
    'tmp_dir': tmp_dir,
    'result_dir': result_dir,
    'n_runs': n_runs,
    'n_episodes': n_episodes,
    'env_config': env_config,
    # Planner's discount could be different from env (e.g., when discount = 1, which is not a valid input to SARSOP)
    'planner_discount': planner_discount,
    'mcmc_config': mcmc_config,
}

# One entry under 'global', plus one entry per run keyed by the integer run id.
configs = {'global': global_config}

for run_id in range(n_runs):
    run_tag = f'run{run_id}'

    # Solver settings used while running the experiment.
    sarsop_config = {
        'pomdpsol_path': SARSOP_POMDPSOL_PATH,
        'timeout': 60,
        'memory': 1024,
        'precision': 0.05,
        'policy_path': f'{tmp_dir}/{run_tag}_sarsop.policy',
        'logfile': f'{tmp_dir}/{run_tag}_sarsop.log',
    }

    # Solver settings for the optimal-policy solve: longer timeout, more memory, finer precision.
    sarsop_optimal_policy_config = {
        'pomdpsol_path': SARSOP_POMDPSOL_PATH,
        'timeout': 120,
        'memory': 2048,
        'precision': 0.01,
        'policy_path': f'{result_dir}/{run_tag}_optimal_sarsop.policy',
        'logfile': f'{result_dir}/{run_tag}_optimal_policy_sarsop.log',
    }

    configs[run_id] = {
        'run_id': run_id,
        # seed for main experiment to generate policies
        'main_seed': random.randint(MIN_SEED, MAX_SEED),
        # seed for policy_evaluation (currently disabled)
        # 'policy_eval_seed': random.randint(MIN_SEED, MAX_SEED),
        'fig_path': f'{result_dir}/{run_tag}_eval_plot.jpg',
        'result_path': f'{result_dir}/{run_tag}_results.pkl',
        'pomdp_path': f'{result_dir}/{run_tag}_tiger.pomdp',
        'sarsop_config': sarsop_config,
        'sarsop_optimal_policy_config': sarsop_optimal_policy_config,
    }

save_configurations(configs, config_path)