In [1]:
import gym
import inventory
import torch
import numpy as np
import pickle

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from inventory.envs.inventory_env import Inventory

In [2]:
# train and save models for each set of problem parameters


# these are parameters I used just to test code, please run parameters that are not commented out

# train_length = 500 # episode length when training
# n_train_episodes = 8 # number of episodes to simulate in parellel during training
# eval_length = 500 # episode length when evaluating
# training_steps = 1000 # how many total time steps to use for training
# n_eval_episodes = 10          #how many episodes to use when evaluating policy 
# eval_frequency = 100  # how many time steps between checkpoints in training
# problems = [[1/4,1],[1,1]]


train_length = 5000 # episode length when training
n_train_episodes = 8 # number of episodes to simulate in parellel during training
eval_length = 50000 # episode length when evaluating
training_steps = 5000000 # how many total time steps to use for training
n_eval_episodes = 10          #how many episodes to use when evaluating policy 
eval_frequency = 10000  # how many time steps between checkpoints in training
problems = [[1/4,1],[1,1],[4,1],[9,1],[39,1],[99,1],
            [1/4,4],[1,4],[4,4],[9,4],[39,4],[99,4],
            [1/4,10],[1,10],[4,10],[9,10],[39,10],[99,10],
            [1/4,20],[1,20],[4,20],[9,20],[39,20],[99,20],
            [1/4,30],[1,30],[4,30],[9,30],[39,30],[99,30],
            [1/4,50],[1,50],[4,50],[9,50],[39,50],[99,50],
            [1/4,70],[1,70],[4,70],[9,70],[39,70],[99,70],
            [1/4,100],[1,100],[4,100],[9,100],[39,100],[99,100]]

for p, L in problems:
    train_parameters = {"p": p, "L": L, "length": train_length}
    eval_parameters = train_parameters.copy()
    eval_parameters["length"] = eval_length
    train_env = make_vec_env(Inventory, n_envs=n_train_episodes, env_kwargs = train_parameters)
    eval_env = make_vec_env(Inventory, n_envs=1, env_kwargs = eval_parameters)
    save_path =  './logs/' + str(p) + "_" + str(L)
    eval_callback = EvalCallback(eval_env, best_model_save_path = save_path,
                             log_path = save_path, eval_freq = eval_frequency,
                             render = False, n_eval_episodes = n_eval_episodes)

    model = PPO(MlpPolicy, eval_env, verbose=False, gamma = 1)
    default = model.policy.state_dict() # get model parameters
    mine = default.copy() # make a copy to edit
    #mine["log_std"] = torch.tensor([-1], device=mine["log_std"].device) # set initial log std of actions taken by policy
    mine["action_net.weight"][0] = torch.zeros(len(mine["action_net.weight"][0]), device=mine["action_net.weight"].device) # set weights of last layer of policy network to 0 so we can implement approximately constant order policy
    mine["action_net.bias"] = torch.tensor([train_env.envs[0].env.opt_const], device=mine["action_net.bias"].device) # approximately setting mean action to optimal constant order amount
    model.policy.load_state_dict(mine) # initialize with our custom parameters 
    model.learn(training_steps, callback=eval_callback)

Eval num_timesteps=100, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200, episode_reward=-0.22 +/- 0.01
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=300, episode_reward=-0.22 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=400, episode_reward=-0.22 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=500, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=600, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=700, episode_reward=-0.22 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=800, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=900, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1000, episode_reward=-0.23 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1100, episode_reward=-0.22 +/- 0.01
Episode length: 500.00 +/- 0.00
Eval num

In [9]:
# perform evaluation of the best models selected in the previous step for each problem and save results

# evaluate 
ratios = []
ratio_95_conf_ints = []
for p, L in problems:
    const_order_val = (2*p + 1)**(1/2)-1
    load_path =  './logs/' + str(p) + "_" + str(L)
    model = PPO.load(load_path + "/best_model.zip")
    env = make_vec_env(Inventory, n_envs=1, env_kwargs = eval_parameters)
    rew_mean, rew_std = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
    rew_mean = -rew_mean
    ratios.append(const_order_val/(rew_mean))
    half_width = 1.96*rew_std/np.sqrt(n_eval_episodes)
    ratio_95_conf_ints.append([const_order_val/(rew_mean+half_width), const_order_val/(rew_mean-half_width)])

# save
with open('objs.pkl', 'wb') as f:
    pickle.dump([problems, ratios, ratio_95_conf_ints], f)
    f.close()

# if we need to load those objects
#with open('objs.pkl', 'rb') as f:
#    problems, ratios, ratio_95_conf_ints = pickle.load(f)
#    f.close()

In [13]:
# ratios are pretty bad because I didn't train for long at all
print(ratios)
print(ratio_95_conf_ints)

[0.2663916652432184, 1.0207682750072293]
[[0.2575173236370867, 0.275899476881756], [0.9806028861757383, 1.064364531843326]]
