In [None]:
### Import stuff and make environments

import gym
import inventory
import torch
import numpy as np
import matplotlib.pyplot as plt


from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy


# Two handles on the same registered environment id:
#   env      -> vectorized stack of 8 copies, passed to PPO for training
#   test_env -> one plain gym instance, passed to evaluate_policy / EvalCallback
env_id = 'inventory-v1'
env = make_vec_env(env_id, n_envs=8)
test_env = gym.make(env_id)


#closed-form evaluation of the best constant order policy
# NOTE(review): h_, p_, lamb_ are presumably the holding cost, the penalty
# cost and the demand rate of the inventory model -- confirm against the env.
h_, p_, lamb_ = 1, 99, 1

inv_rate = 1/lamb_
two_p_plus_h = 2*p_ + h_          # shared by both formulas below

tau_ph = (h_/two_p_plus_h)**0.5
r = inv_rate*(1 - tau_ph)         #value for constant order policy
r_performance = inv_rate*((h_*two_p_plus_h)**0.5 - h_)   #reference cost level used by the plot further down

print(r, r_performance)

In [None]:
### Train a policy and test its performance

n_timesteps = 100_000     # total learning steps handed to PPO
n_episodes = 100          # evaluation episodes used for the cost estimate

# gamma = 1 means rewards are not discounted.
model = PPO(policy=MlpPolicy, env=env, verbose=1, n_steps=512, gamma=1)

# Optional warm start (disabled): overwrite the freshly initialised policy so
# that it starts out as an approximately constant order policy.
#default = model.policy.state_dict() # get model parameters
#mines = default.copy() # make a copy to edit
#mines["log_std"] = torch.tensor([-3], device=mines["log_std"].device) # set initial log std of actions taken by policy
#mines["action_net.weight"][0] = torch.zeros(len(mines["action_net.weight"][0]), device=mines["action_net.weight"].device) # set weights of last layer of policy network to 0 so we can implement approximately constant order policy
#mines["action_net.bias"] = torch.tensor([.929], device=mines["action_net.bias"].device) # approximately setting mean action to optimal constant order amount
#model.policy.load_state_dict(mines) # initialize with our custom parameters

model.learn(total_timesteps=n_timesteps)

# The sign of the mean reward is flipped before reporting it; the second
# figure is a 95% normal-approximation half-width for that mean.
res_mean, res_std = evaluate_policy(model, test_env, n_eval_episodes=n_episodes)
ci_half_width = 1.96*res_std/np.sqrt(n_episodes)
print(-res_mean,'+/-',ci_half_width)

In [None]:
### Train several policies of different lengths and plot their performances
# (can be parallelized)
# Every iteration trains a brand-new PPO model from scratch; only the training
# budget differs between iterations.

n_episodes = 20      # evaluation episodes per trained policy
n_learning = 5       # number of policies to train
n_timesteps = 10000  # increment in learning steps from one policy to the next

# Budgets n_timesteps, 2*n_timesteps, ..., n_learning*n_timesteps.
learning_steps = range(n_timesteps, (n_learning+1)*n_timesteps, n_timesteps)
model_performance = np.zeros(n_learning)

for i, total_steps in enumerate(learning_steps):
    model = PPO(MlpPolicy, env, n_steps=512, gamma = 1)
    model.learn(total_steps)
    res_mean, res_std = evaluate_policy(model, test_env, n_eval_episodes=n_episodes)
    print(total_steps)
    print(-res_mean,'+/-',1.96*res_std/np.sqrt(n_episodes))
    model_performance[i] = -res_mean   # negated mean reward is what gets plotted

# Estimated cost of each policy against its training budget; the dashed red
# line marks the constant order benchmark r_performance.
plt.plot(learning_steps, model_performance)
plt.title("Number of Learning Steps vs Policy Performance")
plt.xlabel("Number of Learning Steps")
plt.ylabel("Approximate Long Run Average Cost")
plt.axhline(y=r_performance, color = 'r', linestyle = "--")



In [None]:
### Train a single policy, evaluate performance at several points, save best policy as you go
# (cannot be parallelized)

from stable_baselines3.common.callbacks import EvalCallback

n_episodes = 100        #how many episodes to use when evaluating policy?
n_timesteps = 50000     #how many learning steps?
eval_frequency = 10000  #how many learning steps between policy evaluations?
# TODO: still unclear how to increase the episode length used during evaluation.
# NOTE(review): episode length is presumably fixed by the environment itself
# rather than by EvalCallback -- confirm in the inventory package.

# Train on a dedicated environment instance. The callback resets and steps its
# evaluation environment every `eval_frequency` steps; if that were the same
# object the model is collecting rollouts from, each evaluation would disturb
# the rollout in progress and training would resume from a stale observation.
train_env = gym.make('inventory-v1')

# Evaluates every `eval_frequency` training steps over `n_episodes` episodes on
# test_env, writing evaluation logs and the best model so far under ./logs/.
eval_callback = EvalCallback(test_env, best_model_save_path = './logs/',
                             log_path = './logs/', eval_freq = eval_frequency,
                             render = False, n_eval_episodes = n_episodes)

model = PPO(MlpPolicy, train_env, verbose=1, n_steps=512, gamma = 1)
# Use the knob declared above instead of a duplicated literal, so changing
# n_timesteps actually changes the training budget.
model.learn(n_timesteps, callback=eval_callback)