# Using `rllib` to solve the inventory management custom environment

<img src="images/inv_sim.png" width="750"/>

In [3]:
import gym
from gym.spaces import Box
import numpy as np
from numpy.random import default_rng


class InventoryEnv(gym.Env):
    """
    Single-product inventory management environment.

    Observation (1-D array of length ``lead_time + 4``):
        * indices ``0 .. lead_time - 1``: the inventory pipeline. Index 0 is the stock that can be
          sold today; an order placed today enters at index ``lead_time - 1`` and moves one slot
          towards index 0 on every step.
        * index ``lead_time``: mean daily demand (the Poisson rate used in ``step``)
        * index ``lead_time + 1``: unit selling price
        * index ``lead_time + 2``: unit buying price
        * index ``lead_time + 3``: daily holding cost per unit

    Action: array of shape (1,) holding the number of units to order today.

    The shop parameters (demand, prices, holding cost) are re-sampled on every ``reset()``, so each
    episode simulates a different shop.
    """

    def __init__(self, config=None):
        """
        Must define self.observation_space and self.action_space here

        Args:
            config: Optional dict-like object (e.g. the `env_config` handed over by rllib). Every key is
                optional and falls back to the default shown:
                "max_capacity" (4000), "lead_time" (5), "max_mean_daily_demand" (200),
                "max_unit_selling_price" (100), "max_daily_holding_cost_per_unit" (5),
                "episode_length_in_days" (90), "seed" (None, i.e. non-deterministic).

        Raises:
            ValueError: if "lead_time" is smaller than 2 (step() reads pipeline slots 0 and 1).
        """
        config = config or {}

        # Define action space: bounds, space type, shape

        # Bound: Shelf space is limited
        self.max_capacity = config.get("max_capacity", 4000)

        # Space type: Better to use Box than Discrete, since Discrete will lead to too many output nodes in the NN
        # Shape: rllib cannot handle scalar actions, so turn it into a numpy array with shape (1,)
        # The bounds are created as float32 so that Box does not have to down-cast them
        self.action_space = Box(low=np.array([0], dtype=np.float32),
                                high=np.array([self.max_capacity], dtype=np.float32),
                                dtype=np.float32
                                )

        # Define observation space: bounds, space type, shape

        # Shape: The lead time controls the shape of observation space
        self.lead_time = config.get("lead_time", 5)
        if self.lead_time < 2:
            raise ValueError("lead_time must be at least 2, got {}".format(self.lead_time))
        self.obs_dim = self.lead_time + 4

        # Bounds: Define high of the remaining observation space elements
        self.max_mean_daily_demand = config.get("max_mean_daily_demand", 200)
        self.max_unit_selling_price = config.get("max_unit_selling_price", 100)
        self.max_daily_holding_cost_per_unit = config.get("max_daily_holding_cost_per_unit", 5)

        obs_low = np.zeros((self.obs_dim,), dtype=np.float32)
        # The buying price is bounded by the selling price, hence max_unit_selling_price appears twice
        obs_high = np.array([self.max_capacity] * self.lead_time +
                            [self.max_mean_daily_demand, self.max_unit_selling_price,
                             self.max_unit_selling_price, self.max_daily_holding_cost_per_unit
                             ],
                            dtype=np.float32
                            )
        self.observation_space = Box(low=obs_low, high=obs_high, dtype=np.float32)

        # The random number generator that will be used throughout the environment
        self.rng = default_rng(config.get("seed"))

        # All instance variables are defined in the __init__() method
        self.current_obs = None    # float64 state vector; see the class docstring for the layout
        self.episode_length_in_days = config.get("episode_length_in_days", 90)
        self.day_num = None

    def _get_obs(self):
        """
        Returns: a copy of the current state, cast to the dtype declared by self.observation_space
        The internal state stays in float64, so rewards are computed with full precision
        """
        return self.current_obs.astype(self.observation_space.dtype)

    def reset(self):
        """
        Returns: the observation of the initial state
        Reset the environment to initial state so that a new episode (independent of previous ones) may start
        """
        # Sample parameter values from the parameter space

        # Set mean daily demand (lambda)
        mean_daily_demand = self.rng.uniform() * self.max_mean_daily_demand

        # Set selling price
        selling_price = self.rng.uniform() * self.max_unit_selling_price

        # Set buying price: buying price cannot be higher than selling price
        buying_price = self.rng.uniform() * selling_price

        # Set daily holding cost per unit: holding cost cannot be higher than buying_price
        daily_holding_cost_per_unit = self.rng.uniform() * min(buying_price,
                                                               self.max_daily_holding_cost_per_unit
                                                               )

        # The episode starts with an empty pipeline: no stock on the shelf and no orders in transit
        self.current_obs = np.array([0.0] * self.lead_time +
                                    [mean_daily_demand, selling_price, buying_price,
                                     daily_holding_cost_per_unit,
                                     ]
                                    )
        self.day_num = 0
        return self._get_obs()

    def step(self, action):
        """
        Returns: Given current obs and action, returns the next observation, the reward, done and optionally additional info

        Args:
            action: The order quantity, e.g. np.array([20.0]). Only the first element is used. The order is
                limited to the range [0, free capacity], where free capacity is max_capacity minus everything
                already in the pipeline.
        """
        # Action looks like np.array([20.0]). We convert that to float 20.0 for easier calculation
        requested = float(np.asarray(action).reshape(-1)[0])
        free_capacity = self.max_capacity - np.sum(self.current_obs[:self.lead_time])
        # A negative order makes no sense: it would create negative stock and a negative purchase cost
        buys = max(0.0, min(requested, free_capacity))

        # Compute next obs: the pipeline moves one slot forward and today's order joins at the end
        demand = self.rng.poisson(self.current_obs[self.lead_time])
        next_obs = np.concatenate((self.current_obs[1: self.lead_time],
                                   np.array([buys]),
                                   self.current_obs[self.lead_time:]
                                   )
                                  )
        # Whatever was not sold today stays on the shelf next to tomorrow's delivery
        next_obs[0] += max(0, self.current_obs[0] - demand)

        selling_price = self.current_obs[self.lead_time + 1]
        buying_price = self.current_obs[self.lead_time + 2]
        holding_cost = self.current_obs[self.lead_time + 3]

        # next_obs[0] - current_obs[1] is today's unsold stock, so the units sold are the rest of current_obs[0]
        unsold = next_obs[0] - self.current_obs[1]
        sold = self.current_obs[0] - unsold

        # Compute reward: revenue minus purchase cost minus holding cost of the unsold stock
        reward = selling_price * sold - buying_price * buys - holding_cost * unsold

        # Compute done
        self.day_num += 1
        done = self.day_num >= self.episode_length_in_days

        self.current_obs = next_obs

        # info must be a dict
        return self._get_obs(), reward, done, {}

    def render(self, mode="human"):
        """
        Returns: None
        Show the current environment state e.g. the graphical window in `CartPole-v1`
        This method must be implemented, but it is OK to have an empty implementation if rendering is not
        important
        """
        pass

    def close(self):
        """
        Returns: None
        This method is optional. Used to cleanup all resources (threads, graphical windows) etc.
        """
        pass

    def seed(self, seed=None):
        """
        Returns: List of seeds
        This method is optional. Used to set seeds for the environment's random number generator for
        obtaining deterministic behavior

        Replaces self.rng with a generator created from `seed`. With seed=None the new generator is
        seeded from OS entropy, i.e. it is not deterministic.
        """
        self.rng = default_rng(seed)
        return [seed]

In [None]:
from ray import tune

# Train PPO on the custom environment; results and checkpoints are written below `local_dir`
tune.run(
    "PPO",
    config=dict(
        # Instead of strings e.g. "CartPole-v1", we pass the custom env class
        env=InventoryEnv,
        evaluation_interval=1000,
        # Each episode uses different shop params. Need lots of samples to gauge agent's performance
        evaluation_num_episodes=10000,
    ),
    checkpoint_freq=1000,
    local_dir="experiment_vanilla",
)

[2m[36m(PPOTrainer pid=27788)[0m 2022-10-13 15:14:09,521	INFO trainer.py:2140 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPOTrainer pid=27788)[0m 2022-10-13 15:14:09,522	INFO ppo.py:249 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=27788)[0m 2022-10-13 15:14:09,522	INFO trainer.py:779 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(RolloutWorker pid=27801)[0m   logger.warn(
[2m[36m(RolloutWorker pid=27792)[0m   logger.warn(
[2m[36m(PPOTrainer pid=27788)[0m   logger.warn(


Trial name,status,loc
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788


Trial name,status,loc
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788




Trial name,status,loc
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2022-10-13_15-14-20
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 636241.2473832322
  episode_reward_mean: -451034.21460738644
  episode_reward_min: -1438715.5781107056
  episodes_this_iter: 44
  episodes_total: 44
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.4288417100906372
          entropy_coeff: 0.0
          kl: 0.0028741948772221804
          model: {}
          policy_loss: -0.0014492435147985816
          total_loss: 52274954240.0
          vf_explained_var: -6.647520081060065e-07
          vf_loss: 52274954240.0
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_steps_sampled: 4000
    num_steps_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,1,6.98752,4000,-451034,636241,-1438720.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-10-13_15-14-27
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 636241.2473832322
  episode_reward_mean: -454837.1052217662
  episode_reward_min: -1535093.629592125
  episodes_this_iter: 44
  episodes_total: 88
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.4052077531814575
          entropy_coeff: 0.0
          kl: 0.006940712686628103
          model: {}
          policy_loss: -0.008040248416364193
          total_loss: 69405171712.0
          vf_explained_var: -3.22634178928638e-07
          vf_loss: 69405171712.0
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_steps_sampled: 8000
    num_steps_train



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,2,13.6406,8000,-454837,636241,-1535090.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2022-10-13_15-14-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 520561.9104830079
  episode_reward_mean: -507707.9893344553
  episode_reward_min: -1535093.629592125
  episodes_this_iter: 44
  episodes_total: 132
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.3370603322982788
          entropy_coeff: 0.0
          kl: 0.008609519340097904
          model: {}
          policy_loss: -0.012667425908148289
          total_loss: 69572132864.0
          vf_explained_var: -8.466422940500706e-08
          vf_loss: 69572132864.0
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12000
    num_steps



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,3,20.4116,12000,-507708,520562,-1535090.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-10-13_15-14-40
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 739623.9444485448
  episode_reward_mean: -505462.6560601601
  episode_reward_min: -1722478.8408258236
  episodes_this_iter: 44
  episodes_total: 176
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.330762267112732
          entropy_coeff: 0.0
          kl: 0.005340712610632181
          model: {}
          policy_loss: -0.002432489302009344
          total_loss: 63003963392.0
          vf_explained_var: -2.7366864330247154e-08
          vf_loss: 63003963392.0
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 16000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,4,26.9812,16000,-505463,739624,-1722480.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2022-10-13_15-14-47
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 739623.9444485448
  episode_reward_mean: -540412.8526597274
  episode_reward_min: -1787960.033802799
  episodes_this_iter: 46
  episodes_total: 222
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2272062301635742
          entropy_coeff: 0.0
          kl: 0.010175843723118305
          model: {}
          policy_loss: -0.0010648787720128894
          total_loss: 93502005248.0
          vf_explained_var: -2.5059586405973278e-08
          vf_loss: 93502005248.0
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled: 20000
    num_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,5,33.6826,20000,-540413,739624,-1787960.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,5,33.6826,20000,-540413,739624,-1787960.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-10-13_15-14-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 723253.3458310947
  episode_reward_mean: -530472.738902019
  episode_reward_min: -1787960.033802799
  episodes_this_iter: 44
  episodes_total: 266
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.1995543241500854
          entropy_coeff: 0.0
          kl: 0.01192067377269268
          model: {}
          policy_loss: -0.011487104929983616
          total_loss: 54479822848.0
          vf_explained_var: -1.0318653487217944e-08
          vf_loss: 54479822848.0
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_steps_sampled: 24000
    num_steps_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,6,40.4042,24000,-530473,723253,-1787960.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2022-10-13_15-15-00
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 723253.3458310947
  episode_reward_mean: -506115.1150541123
  episode_reward_min: -1599703.4826916477
  episodes_this_iter: 44
  episodes_total: 310
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2147294282913208
          entropy_coeff: 0.0
          kl: 0.010887835174798965
          model: {}
          policy_loss: -0.006326260045170784
          total_loss: 70321250304.0
          vf_explained_var: -2.204730975563507e-08
          vf_loss: 70321250304.0
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_steps_sampled: 28000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,7,47.0303,28000,-506115,723253,-1599700.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-10-13_15-15-07
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 992515.6231428592
  episode_reward_mean: -484195.20645574294
  episode_reward_min: -1646359.783836148
  episodes_this_iter: 44
  episodes_total: 354
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2212870121002197
          entropy_coeff: 0.0
          kl: 0.016632264479994774
          model: {}
          policy_loss: -0.012561638839542866
          total_loss: 74371817472.0
          vf_explained_var: -1.1087745832583096e-08
          vf_loss: 74371817472.0
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_steps_sampled: 32000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,8,53.7643,32000,-484195,992516,-1646360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2022-10-13_15-15-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 992515.6231428592
  episode_reward_mean: -506491.8513896533
  episode_reward_min: -1646359.783836148
  episodes_this_iter: 46
  episodes_total: 400
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.1226975917816162
          entropy_coeff: 0.0
          kl: 0.01144337560981512
          model: {}
          policy_loss: -0.0030552197713404894
          total_loss: 66072473600.0
          vf_explained_var: -2.0957761748263692e-08
          vf_loss: 66072473600.0
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_steps_sampled: 36000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,9,60.4352,36000,-506492,992516,-1646360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-10-13_15-15-20
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1048078.7521710768
  episode_reward_mean: -471272.3734028735
  episode_reward_min: -1720230.431572242
  episodes_this_iter: 44
  episodes_total: 444
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.091694712638855
          entropy_coeff: 0.0
          kl: 0.01276425551623106
          model: {}
          policy_loss: -0.008744750171899796
          total_loss: 74299408384.0
          vf_explained_var: -3.140459670092355e-09
          vf_loss: 74299408384.0
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40000
    num_steps_sampled: 40000
    num_steps_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,10,67.0353,40000,-471272,1048080.0,-1720230.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,10,67.0353,40000,-471272,1048080.0,-1720230.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2022-10-13_15-15-27
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1048078.7521710768
  episode_reward_mean: -474339.4308785737
  episode_reward_min: -1720230.431572242
  episodes_this_iter: 44
  episodes_total: 488
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.1692538261413574
          entropy_coeff: 0.0
          kl: 0.012427011504769325
          model: {}
          policy_loss: -0.007687926758080721
          total_loss: 57269559296.0
          vf_explained_var: -2.6213225368110216e-08
          vf_loss: 57269559296.0
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_steps_sampled: 44000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,11,73.6018,44000,-474339,1048080.0,-1720230.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-10-13_15-15-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 593284.9619669828
  episode_reward_mean: -476320.4543224036
  episode_reward_min: -1548356.8467520196
  episodes_this_iter: 44
  episodes_total: 532
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2163108587265015
          entropy_coeff: 0.0
          kl: 0.007108486723154783
          model: {}
          policy_loss: -0.00836988165974617
          total_loss: 68720615424.0
          vf_explained_var: -5.127281266226191e-09
          vf_loss: 68720615424.0
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 48000
    num_steps_sampled: 48000
    num_steps

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,12,80.1556,48000,-476320,593285,-1548360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 52000
  custom_metrics: {}
  date: 2022-10-13_15-15-40
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 708009.2586920443
  episode_reward_mean: -481099.7744528146
  episode_reward_min: -1548356.8467520196
  episodes_this_iter: 44
  episodes_total: 576
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.229792594909668
          entropy_coeff: 0.0
          kl: 0.010520200245082378
          model: {}
          policy_loss: -0.012441177852451801
          total_loss: 63553712128.0
          vf_explained_var: 2.4354585015373686e-09
          vf_loss: 63553712128.0
    num_agent_steps_sampled: 52000
    num_agent_steps_trained: 52000
    num_steps_sampled: 52000
    num_steps



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,13,86.6996,52000,-481100,708009,-1548360.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-10-13_15-15-47
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 708009.2586920443
  episode_reward_mean: -510137.397576981
  episode_reward_min: -1877339.6774211489
  episodes_this_iter: 46
  episodes_total: 622
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.1002966165542603
          entropy_coeff: 0.0
          kl: 0.013408933766186237
          model: {}
          policy_loss: -0.004241900518536568
          total_loss: 86448431104.0
          vf_explained_var: -4.403052855650458e-08
          vf_loss: 86448431104.0
    num_agent_steps_sampled: 56000
    num_agent_steps_trained: 56000
    num_steps_sampled: 56000
    num_steps

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,14,93.3245,56000,-510137,708009,-1877340.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 60000
  custom_metrics: {}
  date: 2022-10-13_15-15-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1188892.441021654
  episode_reward_mean: -559905.745712357
  episode_reward_min: -1877339.6774211489
  episodes_this_iter: 44
  episodes_total: 666
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.131700873374939
          entropy_coeff: 0.0
          kl: 0.015114150941371918
          model: {}
          policy_loss: -0.008312167599797249
          total_loss: 87597039616.0
          vf_explained_var: 1.2369565816072736e-08
          vf_loss: 87597039616.0
    num_agent_steps_sampled: 60000
    num_agent_steps_trained: 60000
    num_steps_sampled: 60000
    num_steps_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,15,99.8939,60000,-559906,1188890.0,-1877340.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,15,99.8939,60000,-559906,1188890.0,-1877340.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-10-13_15-16-00
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1188892.441021654
  episode_reward_mean: -616016.7736558075
  episode_reward_min: -1563228.2229759796
  episodes_this_iter: 44
  episodes_total: 710
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.0605506896972656
          entropy_coeff: 0.0
          kl: 0.014779823832213879
          model: {}
          policy_loss: -0.006850025150924921
          total_loss: 83409895424.0
          vf_explained_var: 2.3072765031884046e-08
          vf_loss: 83409895424.0
    num_agent_steps_sampled: 64000
    num_agent_steps_trained: 64000
    num_steps_sampled: 64000
    num_step

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,16,106.56,64000,-616017,1188890.0,-1563230.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 68000
  custom_metrics: {}
  date: 2022-10-13_15-16-07
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1188892.441021654
  episode_reward_mean: -603863.791011416
  episode_reward_min: -1696006.074940232
  episodes_this_iter: 44
  episodes_total: 754
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.299355149269104
          entropy_coeff: 0.0
          kl: 0.019126752391457558
          model: {}
          policy_loss: -0.01123115699738264
          total_loss: 58862710784.0
          vf_explained_var: -1.4356387190161968e-08
          vf_loss: 58862710784.0
    num_agent_steps_sampled: 68000
    num_agent_steps_trained: 68000
    num_steps_sampled: 68000
    num_steps_t

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,17,113.285,68000,-603864,1188890.0,-1696010.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-10-13_15-16-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 300079.0116544068
  episode_reward_mean: -560558.749164979
  episode_reward_min: -1796236.4037087257
  episodes_this_iter: 46
  episodes_total: 800
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.316943883895874
          entropy_coeff: 0.0
          kl: 0.011346153914928436
          model: {}
          policy_loss: -0.01066114567220211
          total_loss: 77113106432.0
          vf_explained_var: -3.653187974350658e-09
          vf_loss: 77113106432.0
    num_agent_steps_sampled: 72000
    num_agent_steps_trained: 72000
    num_steps_sampled: 72000
    num_steps_t



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,18,120.03,72000,-560559,300079,-1796240.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 76000
  custom_metrics: {}
  date: 2022-10-13_15-16-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 446694.00154273777
  episode_reward_mean: -511077.95114459743
  episode_reward_min: -1796236.4037087257
  episodes_this_iter: 44
  episodes_total: 844
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2095117568969727
          entropy_coeff: 0.0
          kl: 0.01664673537015915
          model: {}
          policy_loss: -0.005160840228199959
          total_loss: 51412041728.0
          vf_explained_var: -2.6725953006234704e-08
          vf_loss: 51412041728.0
    num_agent_steps_sampled: 76000
    num_agent_steps_trained: 76000
    num_steps_sampled: 76000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,19,126.703,76000,-511078,446694,-1796240.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-10-13_15-16-27
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 446694.00154273777
  episode_reward_mean: -525108.9732178384
  episode_reward_min: -1655104.5042627323
  episodes_this_iter: 44
  episodes_total: 888
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.2610549926757812
          entropy_coeff: 0.0
          kl: 0.015034890733659267
          model: {}
          policy_loss: -0.0064577022567391396
          total_loss: 80400367616.0
          vf_explained_var: -3.332732756433643e-09
          vf_loss: 80400367616.0
    num_agent_steps_sampled: 80000
    num_agent_steps_trained: 80000
    num_steps_sampled: 80000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,20,133.331,80000,-525109,446694,-1655100.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,20,133.331,80000,-525109,446694,-1655100.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 84000
  custom_metrics: {}
  date: 2022-10-13_15-16-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 467802.3196714013
  episode_reward_mean: -491050.65240206785
  episode_reward_min: -1788541.9934427454
  episodes_this_iter: 44
  episodes_total: 932
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.1664206981658936
          entropy_coeff: 0.0
          kl: 0.019246038049459457
          model: {}
          policy_loss: -0.00854575727134943
          total_loss: 70133579776.0
          vf_explained_var: 1.2081156341992028e-07
          vf_loss: 70133579776.0
    num_agent_steps_sampled: 84000
    num_agent_steps_trained: 84000
    num_steps_sampled: 84000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,21,139.854,84000,-491051,467802,-1788540.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-10-13_15-16-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 467802.3196714013
  episode_reward_mean: -534878.2457910257
  episode_reward_min: -1842684.6800004775
  episodes_this_iter: 44
  episodes_total: 976
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 0.9233801960945129
          entropy_coeff: 0.0
          kl: 0.01564963534474373
          model: {}
          policy_loss: -0.0014740020269528031
          total_loss: 69072527360.0
          vf_explained_var: 7.665285295388458e-08
          vf_loss: 69072527360.0
    num_agent_steps_sampled: 88000
    num_agent_steps_trained: 88000
    num_steps_sampled: 88000
    num_steps



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,22,146.53,88000,-534878,467802,-1842680.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 92000
  custom_metrics: {}
  date: 2022-10-13_15-16-47
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1226365.6760692454
  episode_reward_mean: -461744.8513760022
  episode_reward_min: -1842684.6800004775
  episodes_this_iter: 46
  episodes_total: 1022
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.10000000149011612
          cur_lr: 4.999999873689376e-05
          entropy: 1.0373921394348145
          entropy_coeff: 0.0
          kl: 0.02319004386663437
          model: {}
          policy_loss: -0.00530231324955821
          total_loss: 83230638080.0
          vf_explained_var: 6.666106742159172e-07
          vf_loss: 83230638080.0
    num_agent_steps_sampled: 92000
    num_agent_steps_trained: 92000
    num_steps_sampled: 92000
    num_steps



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,23,153.255,92000,-461745,1226370.0,-1842680.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-10-13_15-16-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1226365.6760692454
  episode_reward_mean: -540349.7576396645
  episode_reward_min: -1676693.1961876422
  episodes_this_iter: 44
  episodes_total: 1066
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 1.2289685010910034
          entropy_coeff: 0.0
          kl: 0.011981930583715439
          model: {}
          policy_loss: -0.005931259598582983
          total_loss: 91160756224.0
          vf_explained_var: -1.7176391864381912e-08
          vf_loss: 91160756224.0
    num_agent_steps_sampled: 96000
    num_agent_steps_trained: 96000
    num_steps_sampled: 96000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,24,160.116,96000,-540350,1226370.0,-1676690.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 100000
  custom_metrics: {}
  date: 2022-10-13_15-17-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1418308.3046520546
  episode_reward_mean: -565245.9410748373
  episode_reward_min: -1676693.1961876422
  episodes_this_iter: 44
  episodes_total: 1110
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.7925849556922913
          entropy_coeff: 0.0
          kl: 0.018444053828716278
          model: {}
          policy_loss: -0.010590787045657635
          total_loss: 93325328384.0
          vf_explained_var: 7.721685619799246e-07
          vf_loss: 93325328384.0
    num_agent_steps_sampled: 100000
    num_agent_steps_trained: 100000
    num_steps_sampled: 100000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,25,166.709,100000,-565246,1418310.0,-1676690.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,25,166.709,100000,-565246,1418310.0,-1676690.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-10-13_15-17-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1418308.3046520546
  episode_reward_mean: -544537.8429153509
  episode_reward_min: -1500657.906274878
  episodes_this_iter: 44
  episodes_total: 1154
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.8530815243721008
          entropy_coeff: 0.0
          kl: 0.013803554698824883
          model: {}
          policy_loss: -0.010995196178555489
          total_loss: 73660809216.0
          vf_explained_var: 2.436740373923385e-07
          vf_loss: 73660809216.0
    num_agent_steps_sampled: 104000
    num_agent_steps_trained: 104000
    num_steps_sampled: 104000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,26,173.502,104000,-544538,1418310.0,-1500660.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 108000
  custom_metrics: {}
  date: 2022-10-13_15-17-15
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 796858.5177988058
  episode_reward_mean: -523142.794917044
  episode_reward_min: -1467447.3600760477
  episodes_this_iter: 46
  episodes_total: 1200
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.9279830455780029
          entropy_coeff: 0.0
          kl: 0.015081815421581268
          model: {}
          policy_loss: -0.007046897429972887
          total_loss: 58025521152.0
          vf_explained_var: 2.8924276307407126e-07
          vf_loss: 58025521152.0
    num_agent_steps_sampled: 108000
    num_agent_steps_trained: 108000
    num_steps_sampled: 108000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,27,180.238,108000,-523143,796859,-1467450.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-10-13_15-17-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 734215.4955729733
  episode_reward_mean: -545115.6196264599
  episode_reward_min: -1689973.8045784873
  episodes_this_iter: 44
  episodes_total: 1244
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.9313879609107971
          entropy_coeff: 0.0
          kl: 0.01690497249364853
          model: {}
          policy_loss: -0.004634142387658358
          total_loss: 73055068160.0
          vf_explained_var: 3.387210085747938e-07
          vf_loss: 73055068160.0
    num_agent_steps_sampled: 112000
    num_agent_steps_trained: 112000
    num_steps_sampled: 112000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,28,186.844,112000,-545116,734215,-1689970.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 116000
  custom_metrics: {}
  date: 2022-10-13_15-17-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 734215.4955729733
  episode_reward_mean: -567289.0242905404
  episode_reward_min: -1689973.8045784873
  episodes_this_iter: 44
  episodes_total: 1288
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.9556253552436829
          entropy_coeff: 0.0
          kl: 0.016288822516798973
          model: {}
          policy_loss: -0.008649468421936035
          total_loss: 72932597760.0
          vf_explained_var: 4.338320991337241e-07
          vf_loss: 72932597760.0
    num_agent_steps_sampled: 116000
    num_agent_steps_trained: 116000
    num_steps_sampled: 116000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,29,193.347,116000,-567289,734215,-1689970.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-10-13_15-17-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1050550.285849809
  episode_reward_mean: -542694.1779796224
  episode_reward_min: -1540854.3877303253
  episodes_this_iter: 44
  episodes_total: 1332
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.9944259524345398
          entropy_coeff: 0.0
          kl: 0.0139922508969903
          model: {}
          policy_loss: -0.003104197094216943
          total_loss: 87390265344.0
          vf_explained_var: 4.733762466457847e-07
          vf_loss: 87390265344.0
    num_agent_steps_sampled: 120000
    num_agent_steps_trained: 120000
    num_steps_sampled: 120000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,30,199.925,120000,-542694,1050550.0,-1540850.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,30,199.925,120000,-542694,1050550.0,-1540850.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 124000
  custom_metrics: {}
  date: 2022-10-13_15-17-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1050550.285849809
  episode_reward_mean: -543105.0704736608
  episode_reward_min: -1540854.3877303253
  episodes_this_iter: 44
  episodes_total: 1376
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.7603514194488525
          entropy_coeff: 0.0
          kl: 0.01589028909802437
          model: {}
          policy_loss: -0.010823415592312813
          total_loss: 61033439232.0
          vf_explained_var: -1.4424964547288255e-06
          vf_loss: 61033439232.0
    num_agent_steps_sampled: 124000
    num_agent_steps_trained: 124000
    num_steps_sampled: 124000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,31,206.516,124000,-543105,1050550.0,-1540850.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-10-13_15-17-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 417773.5048729749
  episode_reward_mean: -597563.2949581489
  episode_reward_min: -1729356.8193638474
  episodes_this_iter: 46
  episodes_total: 1422
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.9844431281089783
          entropy_coeff: 0.0
          kl: 0.019719691947102547
          model: {}
          policy_loss: -0.007763024419546127
          total_loss: 89834635264.0
          vf_explained_var: 8.16519545310257e-08
          vf_loss: 89834635264.0
    num_agent_steps_sampled: 128000
    num_agent_steps_trained: 128000
    num_steps_sampled: 128000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,32,213.27,128000,-597563,417774,-1729360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 132000
  custom_metrics: {}
  date: 2022-10-13_15-17-55
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 417773.5048729749
  episode_reward_mean: -623869.6735708886
  episode_reward_min: -1729356.8193638474
  episodes_this_iter: 44
  episodes_total: 1466
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 1.167571783065796
          entropy_coeff: 0.0
          kl: 0.017205605283379555
          model: {}
          policy_loss: -0.013142027892172337
          total_loss: 79281127424.0
          vf_explained_var: 2.548899828980211e-07
          vf_loss: 79281127424.0
    num_agent_steps_sampled: 132000
    num_agent_steps_trained: 132000
    num_steps_sampled: 132000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,33,220.006,132000,-623870,417774,-1729360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-10-13_15-18-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 590832.8549305144
  episode_reward_mean: -593020.6814112059
  episode_reward_min: -1631772.7760187835
  episodes_this_iter: 44
  episodes_total: 1510
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.15000000596046448
          cur_lr: 4.999999873689376e-05
          entropy: 0.8403895497322083
          entropy_coeff: 0.0
          kl: 0.024707404896616936
          model: {}
          policy_loss: -0.006673905067145824
          total_loss: 72888811520.0
          vf_explained_var: 2.530697884139954e-06
          vf_loss: 72888811520.0
    num_agent_steps_sampled: 136000
    num_agent_steps_trained: 136000
    num_steps_sampled: 136000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,34,226.584,136000,-593021,590833,-1631770.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 140000
  custom_metrics: {}
  date: 2022-10-13_15-18-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1028207.0261846854
  episode_reward_mean: -474385.49343460525
  episode_reward_min: -1611213.5873367097
  episodes_this_iter: 44
  episodes_total: 1554
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.792876124382019
          entropy_coeff: 0.0
          kl: 0.013953977264463902
          model: {}
          policy_loss: -0.010120890103280544
          total_loss: 68254724096.0
          vf_explained_var: 2.425075763312634e-06
          vf_loss: 68254724096.0
    num_agent_steps_sampled: 140000
    num_agent_steps_trained: 140000
    num_steps_sampled: 140000
    num

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,35,233.288,140000,-474385,1028210.0,-1611210.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,35,233.288,140000,-474385,1028210.0,-1611210.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-10-13_15-18-15
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1028207.0261846854
  episode_reward_mean: -505980.5721527807
  episode_reward_min: -1490066.3214502528
  episodes_this_iter: 46
  episodes_total: 1600
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 1.3889199495315552
          entropy_coeff: 0.0
          kl: 0.01107168197631836
          model: {}
          policy_loss: -0.007677171379327774
          total_loss: 73780125696.0
          vf_explained_var: 5.65218670089962e-07
          vf_loss: 73780125696.0
    num_agent_steps_sampled: 144000
    num_agent_steps_trained: 144000
    num_steps_sampled: 144000
    num_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,36,239.912,144000,-505981,1028210.0,-1490070.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 148000
  custom_metrics: {}
  date: 2022-10-13_15-18-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 576404.2576262727
  episode_reward_mean: -524512.8649079911
  episode_reward_min: -1490066.3214502528
  episodes_this_iter: 44
  episodes_total: 1644
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 1.2192542552947998
          entropy_coeff: 0.0
          kl: 0.013070781715214252
          model: {}
          policy_loss: -0.009513742290437222
          total_loss: 59804704768.0
          vf_explained_var: 1.3455909311232972e-06
          vf_loss: 59804704768.0
    num_agent_steps_sampled: 148000
    num_agent_steps_trained: 148000
    num_steps_sampled: 148000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,37,246.532,148000,-524513,576404,-1490070.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-10-13_15-18-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 610916.7172744976
  episode_reward_mean: -540761.7518901916
  episode_reward_min: -1657679.5771603037
  episodes_this_iter: 44
  episodes_total: 1688
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.971340537071228
          entropy_coeff: 0.0
          kl: 0.01464129239320755
          model: {}
          policy_loss: -0.003106914460659027
          total_loss: 79388295168.0
          vf_explained_var: 1.5169702010098263e-06
          vf_loss: 79388295168.0
    num_agent_steps_sampled: 152000
    num_agent_steps_trained: 152000
    num_steps_sampled: 152000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,38,253.283,152000,-540762,610917,-1657680.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 156000
  custom_metrics: {}
  date: 2022-10-13_15-18-35
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 686490.8777731272
  episode_reward_mean: -533176.056646848
  episode_reward_min: -1924091.5839179684
  episodes_this_iter: 44
  episodes_total: 1732
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 1.4315719604492188
          entropy_coeff: 0.0
          kl: 0.020326396450400352
          model: {}
          policy_loss: -0.013157771900296211
          total_loss: 88121860096.0
          vf_explained_var: 4.521172741078772e-06
          vf_loss: 88121860096.0
    num_agent_steps_sampled: 156000
    num_agent_steps_trained: 156000
    num_steps_sampled: 156000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,39,259.805,156000,-533176,686491,-1924090.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-10-13_15-18-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 840074.3753991356
  episode_reward_mean: -553065.464105945
  episode_reward_min: -1924091.5839179684
  episodes_this_iter: 44
  episodes_total: 1776
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.3192152976989746
          entropy_coeff: 0.0
          kl: 0.014033311046659946
          model: {}
          policy_loss: -0.005511116702109575
          total_loss: 81956790272.0
          vf_explained_var: 1.3907109632782522e-06
          vf_loss: 81956790272.0
    num_agent_steps_sampled: 160000
    num_agent_steps_trained: 160000
    num_steps_sampled: 160000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,40,266.463,160000,-553065,840074,-1924090.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,40,266.463,160000,-553065,840074,-1924090.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 164000
  custom_metrics: {}
  date: 2022-10-13_15-18-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 840074.3753991356
  episode_reward_mean: -580228.1608491829
  episode_reward_min: -1477675.6102951977
  episodes_this_iter: 46
  episodes_total: 1822
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.0773252248764038
          entropy_coeff: 0.0
          kl: 0.016354158520698547
          model: {}
          policy_loss: -0.008646323345601559
          total_loss: 79362351104.0
          vf_explained_var: 3.6303074466559337e-06
          vf_loss: 79362351104.0
    num_agent_steps_sampled: 164000
    num_agent_steps_trained: 164000
    num_steps_sampled: 164000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,41,272.954,164000,-580228,840074,-1477680.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 168000
  custom_metrics: {}
  date: 2022-10-13_15-18-55
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 840074.3753991356
  episode_reward_mean: -566495.1301646404
  episode_reward_min: -1413060.5377259837
  episodes_this_iter: 44
  episodes_total: 1866
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 0.9683565497398376
          entropy_coeff: 0.0
          kl: 0.014610319398343563
          model: {}
          policy_loss: -0.019647376611828804
          total_loss: 54797955072.0
          vf_explained_var: 5.4376100706576835e-06
          vf_loss: 54797955072.0
    num_agent_steps_sampled: 168000
    num_agent_steps_trained: 168000
    num_steps_sampled: 168000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,42,279.556,168000,-566495,840074,-1413060.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 172000
  custom_metrics: {}
  date: 2022-10-13_15-19-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 344701.0840516987
  episode_reward_mean: -533837.2797913863
  episode_reward_min: -1491739.3491349455
  episodes_this_iter: 44
  episodes_total: 1910
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.2763068675994873
          entropy_coeff: 0.0
          kl: 0.012524193152785301
          model: {}
          policy_loss: -0.009090246632695198
          total_loss: 77856874496.0
          vf_explained_var: 3.117835603916319e-06
          vf_loss: 77856874496.0
    num_agent_steps_sampled: 172000
    num_agent_steps_trained: 172000
    num_steps_sampled: 172000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,43,285.997,172000,-533837,344701,-1491740.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 176000
  custom_metrics: {}
  date: 2022-10-13_15-19-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 344701.0840516987
  episode_reward_mean: -534007.6672725537
  episode_reward_min: -1667528.0376012407
  episodes_this_iter: 44
  episodes_total: 1954
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.2731634378433228
          entropy_coeff: 0.0
          kl: 0.01067464891821146
          model: {}
          policy_loss: -0.00925455242395401
          total_loss: 61618692096.0
          vf_explained_var: 2.486410949131823e-06
          vf_loss: 61618692096.0
    num_agent_steps_sampled: 176000
    num_agent_steps_trained: 176000
    num_steps_sampled: 176000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,44,292.698,176000,-534008,344701,-1667530.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 180000
  custom_metrics: {}
  date: 2022-10-13_15-19-15
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 665037.4452635654
  episode_reward_mean: -515238.9234274429
  episode_reward_min: -1667528.0376012407
  episodes_this_iter: 46
  episodes_total: 2000
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.2655081748962402
          entropy_coeff: 0.0
          kl: 0.013752976432442665
          model: {}
          policy_loss: -0.0056557063944637775
          total_loss: 71140556800.0
          vf_explained_var: 1.7282142152907909e-06
          vf_loss: 71140556800.0
    num_agent_steps_sampled: 180000
    num_agent_steps_trained: 180000
    num_steps_sampled: 180000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,45,299.466,180000,-515239,665037,-1667530.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,45,299.466,180000,-515239,665037,-1667530.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 184000
  custom_metrics: {}
  date: 2022-10-13_15-19-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 665037.4452635654
  episode_reward_mean: -558330.478673671
  episode_reward_min: -1681611.6888419595
  episodes_this_iter: 44
  episodes_total: 2044
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.1682051420211792
          entropy_coeff: 0.0
          kl: 0.014523990452289581
          model: {}
          policy_loss: -0.007954519242048264
          total_loss: 68958658560.0
          vf_explained_var: 4.237697908138216e-07
          vf_loss: 68958658560.0
    num_agent_steps_sampled: 184000
    num_agent_steps_trained: 184000
    num_steps_sampled: 184000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,46,306.085,184000,-558330,665037,-1681610.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 188000
  custom_metrics: {}
  date: 2022-10-13_15-19-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 290641.8287574219
  episode_reward_mean: -617451.6142805059
  episode_reward_min: -1755979.0436835508
  episodes_this_iter: 44
  episodes_total: 2088
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.043968677520752
          entropy_coeff: 0.0
          kl: 0.017597584053874016
          model: {}
          policy_loss: -0.005321233067661524
          total_loss: 81426644992.0
          vf_explained_var: 1.6085306924651377e-05
          vf_loss: 81426644992.0
    num_agent_steps_sampled: 188000
    num_agent_steps_trained: 188000
    num_steps_sampled: 188000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,47,312.614,188000,-617452,290642,-1755980.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 192000
  custom_metrics: {}
  date: 2022-10-13_15-19-35
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 649527.8106908791
  episode_reward_mean: -602715.4797247009
  episode_reward_min: -1755979.0436835508
  episodes_this_iter: 44
  episodes_total: 2132
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.1911612749099731
          entropy_coeff: 0.0
          kl: 0.01987648755311966
          model: {}
          policy_loss: -0.006016653496772051
          total_loss: 95620063232.0
          vf_explained_var: 4.0915065255830996e-06
          vf_loss: 95620063232.0
    num_agent_steps_sampled: 192000
    num_agent_steps_trained: 192000
    num_steps_sampled: 192000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,48,319.258,192000,-602715,649528,-1755980.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 196000
  custom_metrics: {}
  date: 2022-10-13_15-19-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 649527.8106908791
  episode_reward_mean: -569948.20671121
  episode_reward_min: -1683402.220587551
  episodes_this_iter: 44
  episodes_total: 2176
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.0197174549102783
          entropy_coeff: 0.0
          kl: 0.013437775894999504
          model: {}
          policy_loss: -0.002057990524917841
          total_loss: 74447978496.0
          vf_explained_var: -2.297983428434236e-06
          vf_loss: 74447978496.0
    num_agent_steps_sampled: 196000
    num_agent_steps_trained: 196000
    num_steps_sampled: 196000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,49,325.878,196000,-569948,649528,-1683400.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 200000
  custom_metrics: {}
  date: 2022-10-13_15-19-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 928936.5279323326
  episode_reward_mean: -435436.87515079154
  episode_reward_min: -1683402.220587551
  episodes_this_iter: 46
  episodes_total: 2222
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 0.7162348031997681
          entropy_coeff: 0.0
          kl: 0.016495879739522934
          model: {}
          policy_loss: -0.003664983669295907
          total_loss: 56388587520.0
          vf_explained_var: -2.0509125064904765e-08
          vf_loss: 56388587520.0
    num_agent_steps_sampled: 200000
    num_agent_steps_trained: 200000
    num_steps_sampled: 200000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,50,332.385,200000,-435437,928937,-1683400.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,50,332.385,200000,-435437,928937,-1683400.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 204000
  custom_metrics: {}
  date: 2022-10-13_15-19-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1336841.7913551782
  episode_reward_mean: -408877.78242654674
  episode_reward_min: -1683402.220587551
  episodes_this_iter: 44
  episodes_total: 2266
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.172021746635437
          entropy_coeff: 0.0
          kl: 0.015045561827719212
          model: {}
          policy_loss: -0.00521119823679328
          total_loss: 68561772544.0
          vf_explained_var: 1.264387606170203e-06
          vf_loss: 68561772544.0
    num_agent_steps_sampled: 204000
    num_agent_steps_trained: 204000
    num_steps_sampled: 204000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,51,338.994,204000,-408878,1336840.0,-1683400.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 208000
  custom_metrics: {}
  date: 2022-10-13_15-20-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1336841.7913551782
  episode_reward_mean: -434701.939660407
  episode_reward_min: -1472353.5285271378
  episodes_this_iter: 44
  episodes_total: 2310
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.007601261138916
          entropy_coeff: 0.0
          kl: 0.013740590773522854
          model: {}
          policy_loss: -0.009714437648653984
          total_loss: 75265990656.0
          vf_explained_var: -2.773263076960575e-05
          vf_loss: 75265990656.0
    num_agent_steps_sampled: 208000
    num_agent_steps_trained: 208000
    num_steps_sampled: 208000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,52,345.638,208000,-434702,1336840.0,-1472350.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 212000
  custom_metrics: {}
  date: 2022-10-13_15-20-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 966497.5810854285
  episode_reward_mean: -485146.8951711455
  episode_reward_min: -1694634.422840863
  episodes_this_iter: 44
  episodes_total: 2354
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.0367943048477173
          entropy_coeff: 0.0
          kl: 0.015681827440857887
          model: {}
          policy_loss: -0.00246697966940701
          total_loss: 83647324160.0
          vf_explained_var: 3.3133774195448495e-06
          vf_loss: 83647324160.0
    num_agent_steps_sampled: 212000
    num_agent_steps_trained: 212000
    num_steps_sampled: 212000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,53,352.203,212000,-485147,966498,-1694630.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 216000
  custom_metrics: {}
  date: 2022-10-13_15-20-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 966497.5810854285
  episode_reward_mean: -487513.5868914073
  episode_reward_min: -1694634.422840863
  episodes_this_iter: 46
  episodes_total: 2400
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 1.117628812789917
          entropy_coeff: 0.0
          kl: 0.02153829112648964
          model: {}
          policy_loss: -0.0034334331285208464
          total_loss: 60082700288.0
          vf_explained_var: 5.651417723129271e-06
          vf_loss: 60082700288.0
    num_agent_steps_sampled: 216000
    num_agent_steps_trained: 216000
    num_steps_sampled: 216000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,54,358.891,216000,-487514,966498,-1694630.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 220000
  custom_metrics: {}
  date: 2022-10-13_15-20-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 825150.7369870476
  episode_reward_mean: -536121.9349967369
  episode_reward_min: -1868621.5748707454
  episodes_this_iter: 44
  episodes_total: 2444
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.1251341104507446
          entropy_coeff: 0.0
          kl: 0.009856851771473885
          model: {}
          policy_loss: -0.007936220616102219
          total_loss: 87351238656.0
          vf_explained_var: 4.799519956577569e-06
          vf_loss: 87351238656.0
    num_agent_steps_sampled: 220000
    num_agent_steps_trained: 220000
    num_steps_sampled: 220000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,55,365.529,220000,-536122,825151,-1868620.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,55,365.529,220000,-536122,825151,-1868620.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 224000
  custom_metrics: {}
  date: 2022-10-13_15-20-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 825150.7369870476
  episode_reward_mean: -580971.9582685574
  episode_reward_min: -1868621.5748707454
  episodes_this_iter: 44
  episodes_total: 2488
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.1614933013916016
          entropy_coeff: 0.0
          kl: 0.01739305816590786
          model: {}
          policy_loss: -0.005228159949183464
          total_loss: 75723915264.0
          vf_explained_var: 6.247079454624327e-06
          vf_loss: 75723915264.0
    num_agent_steps_sampled: 224000
    num_agent_steps_trained: 224000
    num_steps_sampled: 224000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,56,372.128,224000,-580972,825151,-1868620.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 228000
  custom_metrics: {}
  date: 2022-10-13_15-20-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1012822.0925124971
  episode_reward_mean: -544896.3577069455
  episode_reward_min: -1868621.5748707454
  episodes_this_iter: 44
  episodes_total: 2532
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.1643227338790894
          entropy_coeff: 0.0
          kl: 0.013965759426355362
          model: {}
          policy_loss: -0.012562901712954044
          total_loss: 68884545536.0
          vf_explained_var: 2.043926542683039e-06
          vf_loss: 68884545536.0
    num_agent_steps_sampled: 228000
    num_agent_steps_trained: 228000
    num_steps_sampled: 228000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,57,378.614,228000,-544896,1012820.0,-1868620.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 232000
  custom_metrics: {}
  date: 2022-10-13_15-20-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1012822.0925124971
  episode_reward_mean: -531809.222576232
  episode_reward_min: -1524743.9411177675
  episodes_this_iter: 44
  episodes_total: 2576
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.6075135469436646
          entropy_coeff: 0.0
          kl: 0.009767907671630383
          model: {}
          policy_loss: -0.005033382214605808
          total_loss: 71998849024.0
          vf_explained_var: -2.4609028059785487e-06
          vf_loss: 71998849024.0
    num_agent_steps_sampled: 232000
    num_agent_steps_trained: 232000
    num_steps_sampled: 232000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,58,385.337,232000,-531809,1012820.0,-1524740.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 236000
  custom_metrics: {}
  date: 2022-10-13_15-20-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1012822.0925124971
  episode_reward_mean: -595622.8281939421
  episode_reward_min: -1540511.692929216
  episodes_this_iter: 46
  episodes_total: 2622
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.207789659500122
          entropy_coeff: 0.0
          kl: 0.01757282018661499
          model: {}
          policy_loss: -0.0010405597276985645
          total_loss: 81656709120.0
          vf_explained_var: 2.020853798967437e-06
          vf_loss: 81656709120.0
    num_agent_steps_sampled: 236000
    num_agent_steps_trained: 236000
    num_steps_sampled: 236000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,59,391.947,236000,-595623,1012820.0,-1540510.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 240000
  custom_metrics: {}
  date: 2022-10-13_15-20-55
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 830196.8056895175
  episode_reward_mean: -593357.1203556771
  episode_reward_min: -1820524.01880311
  episodes_this_iter: 44
  episodes_total: 2666
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.2890551090240479
          entropy_coeff: 0.0
          kl: 0.01434052549302578
          model: {}
          policy_loss: -0.004734880290925503
          total_loss: 74834083840.0
          vf_explained_var: 2.175377176172333e-06
          vf_loss: 74834083840.0
    num_agent_steps_sampled: 240000
    num_agent_steps_trained: 240000
    num_steps_sampled: 240000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,60,398.68,240000,-593357,830197,-1820520.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,60,398.68,240000,-593357,830197,-1820520.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 244000
  custom_metrics: {}
  date: 2022-10-13_15-21-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 984103.7476243136
  episode_reward_mean: -531689.6133265916
  episode_reward_min: -1820524.01880311
  episodes_this_iter: 44
  episodes_total: 2710
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.2059966325759888
          entropy_coeff: 0.0
          kl: 0.013287731446325779
          model: {}
          policy_loss: -0.006351038347929716
          total_loss: 82200051712.0
          vf_explained_var: 1.056130258803023e-05
          vf_loss: 82200051712.0
    num_agent_steps_sampled: 244000
    num_agent_steps_trained: 244000
    num_steps_sampled: 244000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,61,405.276,244000,-531690,984104,-1820520.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 248000
  custom_metrics: {}
  date: 2022-10-13_15-21-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 984103.7476243136
  episode_reward_mean: -579996.8359643905
  episode_reward_min: -1785550.2481373013
  episodes_this_iter: 44
  episodes_total: 2754
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 1.1066198348999023
          entropy_coeff: 0.0
          kl: 0.016030043363571167
          model: {}
          policy_loss: -0.008479689247906208
          total_loss: 99155574784.0
          vf_explained_var: 5.318849161994876e-06
          vf_loss: 99155574784.0
    num_agent_steps_sampled: 248000
    num_agent_steps_trained: 248000
    num_steps_sampled: 248000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,62,411.844,248000,-579997,984104,-1785550.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 252000
  custom_metrics: {}
  date: 2022-10-13_15-21-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 984103.7476243136
  episode_reward_mean: -555604.9349826688
  episode_reward_min: -1785550.2481373013
  episodes_this_iter: 46
  episodes_total: 2800
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 0.8851742148399353
          entropy_coeff: 0.0
          kl: 0.017166415229439735
          model: {}
          policy_loss: -0.00907523836940527
          total_loss: 64883343360.0
          vf_explained_var: 3.2321740945917554e-06
          vf_loss: 64883343360.0
    num_agent_steps_sampled: 252000
    num_agent_steps_trained: 252000
    num_steps_sampled: 252000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,63,418.325,252000,-555605,984104,-1785550.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 256000
  custom_metrics: {}
  date: 2022-10-13_15-21-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1105894.0825544058
  episode_reward_mean: -516362.17931166984
  episode_reward_min: -1785550.2481373013
  episodes_this_iter: 44
  episodes_total: 2844
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.5062500238418579
          cur_lr: 4.999999873689376e-05
          entropy: 0.9176168441772461
          entropy_coeff: 0.0
          kl: 0.025124115869402885
          model: {}
          policy_loss: -0.003649882273748517
          total_loss: 89672343552.0
          vf_explained_var: 5.31500381839578e-06
          vf_loss: 89672343552.0
    num_agent_steps_sampled: 256000
    num_agent_steps_trained: 256000
    num_steps_sampled: 256000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,64,425.08,256000,-516362,1105890.0,-1785550.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 260000
  custom_metrics: {}
  date: 2022-10-13_15-21-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1105894.0825544058
  episode_reward_mean: -477865.6850947815
  episode_reward_min: -1540724.123408879
  episodes_this_iter: 44
  episodes_total: 2888
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.7593749761581421
          cur_lr: 4.999999873689376e-05
          entropy: 1.0381224155426025
          entropy_coeff: 0.0
          kl: 0.010573449544608593
          model: {}
          policy_loss: -0.010063407011330128
          total_loss: 67597963264.0
          vf_explained_var: 5.589056854660157e-06
          vf_loss: 67597963264.0
    num_agent_steps_sampled: 260000
    num_agent_steps_trained: 260000
    num_steps_sampled: 260000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,65,431.736,260000,-477866,1105890.0,-1540720.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,65,431.736,260000,-477866,1105890.0,-1540720.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 264000
  custom_metrics: {}
  date: 2022-10-13_15-21-35
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1115896.316441658
  episode_reward_mean: -535332.5325715758
  episode_reward_min: -1625265.2839778448
  episodes_this_iter: 44
  episodes_total: 2932
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.7593749761581421
          cur_lr: 4.999999873689376e-05
          entropy: 1.2997500896453857
          entropy_coeff: 0.0
          kl: 0.02243298850953579
          model: {}
          policy_loss: -0.004753562621772289
          total_loss: 90247921664.0
          vf_explained_var: 1.4096113773121033e-05
          vf_loss: 90247921664.0
    num_agent_steps_sampled: 264000
    num_agent_steps_trained: 264000
    num_steps_sampled: 264000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,66,438.425,264000,-535333,1115900.0,-1625270.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 268000
  custom_metrics: {}
  date: 2022-10-13_15-21-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1115896.316441658
  episode_reward_mean: -474703.15350136446
  episode_reward_min: -1633693.3198181696
  episodes_this_iter: 44
  episodes_total: 2976
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.7796475887298584
          entropy_coeff: 0.0
          kl: 0.01428838912397623
          model: {}
          policy_loss: -0.008831292390823364
          total_loss: 59910815744.0
          vf_explained_var: 5.031080490880413e-06
          vf_loss: 59910815744.0
    num_agent_steps_sampled: 268000
    num_agent_steps_trained: 268000
    num_steps_sampled: 268000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,67,445.001,268000,-474703,1115900.0,-1633690.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 272000
  custom_metrics: {}
  date: 2022-10-13_15-21-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 672967.2513324069
  episode_reward_mean: -437014.3625570698
  episode_reward_min: -1633693.3198181696
  episodes_this_iter: 46
  episodes_total: 3022
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.4376715421676636
          entropy_coeff: 0.0
          kl: 0.014110737480223179
          model: {}
          policy_loss: -0.006686493754386902
          total_loss: 44533686272.0
          vf_explained_var: 1.2170755326224025e-05
          vf_loss: 44533686272.0
    num_agent_steps_sampled: 272000
    num_agent_steps_trained: 272000
    num_steps_sampled: 272000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,68,451.612,272000,-437014,672967,-1633690.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 276000
  custom_metrics: {}
  date: 2022-10-13_15-21-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 672967.2513324069
  episode_reward_mean: -472529.90775922616
  episode_reward_min: -1507362.2932199666
  episodes_this_iter: 44
  episodes_total: 3066
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.915173351764679
          entropy_coeff: 0.0
          kl: 0.008448977023363113
          model: {}
          policy_loss: -0.006682473234832287
          total_loss: 62446825472.0
          vf_explained_var: -2.8833906071668025e-06
          vf_loss: 62446825472.0
    num_agent_steps_sampled: 276000
    num_agent_steps_trained: 276000
    num_steps_sampled: 276000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,69,458.187,276000,-472530,672967,-1507360.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 280000
  custom_metrics: {}
  date: 2022-10-13_15-22-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 672967.2513324069
  episode_reward_mean: -504490.126090258
  episode_reward_min: -1670113.9996936051
  episodes_this_iter: 44
  episodes_total: 3110
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.7134535908699036
          entropy_coeff: 0.0
          kl: 0.0109794232994318
          model: {}
          policy_loss: 0.0016146654961630702
          total_loss: 66326097920.0
          vf_explained_var: 8.6325471784221e-06
          vf_loss: 66326097920.0
    num_agent_steps_sampled: 280000
    num_agent_steps_trained: 280000
    num_steps_sampled: 280000
    num_steps_t



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,70,464.675,280000,-504490,672967,-1670110.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,70,464.675,280000,-504490,672967,-1670110.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 284000
  custom_metrics: {}
  date: 2022-10-13_15-22-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 946908.7084426604
  episode_reward_mean: -449775.8131674831
  episode_reward_min: -1704895.680342316
  episodes_this_iter: 44
  episodes_total: 3154
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.47033238410949707
          entropy_coeff: 0.0
          kl: 0.015048080123960972
          model: {}
          policy_loss: -0.004600726533681154
          total_loss: 75323441152.0
          vf_explained_var: 1.7961956473300233e-05
          vf_loss: 75323441152.0
    num_agent_steps_sampled: 284000
    num_agent_steps_trained: 284000
    num_steps_sampled: 284000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,71,471.419,284000,-449776,946909,-1704900.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 288000
  custom_metrics: {}
  date: 2022-10-13_15-22-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 946908.7084426604
  episode_reward_mean: -514111.80455832893
  episode_reward_min: -1704895.680342316
  episodes_this_iter: 46
  episodes_total: 3200
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.529456377029419
          entropy_coeff: 0.0
          kl: 0.013243738561868668
          model: {}
          policy_loss: -0.004769091960042715
          total_loss: 80617676800.0
          vf_explained_var: -6.681123340968043e-05
          vf_loss: 80617676800.0
    num_agent_steps_sampled: 288000
    num_agent_steps_trained: 288000
    num_steps_sampled: 288000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,72,477.932,288000,-514112,946909,-1704900.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 292000
  custom_metrics: {}
  date: 2022-10-13_15-22-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1239766.4706025822
  episode_reward_mean: -552961.5956841876
  episode_reward_min: -1638227.0197801918
  episodes_this_iter: 44
  episodes_total: 3244
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 1.1020840406417847
          entropy_coeff: 0.0
          kl: 0.0178117286413908
          model: {}
          policy_loss: 0.005124968476593494
          total_loss: 90030522368.0
          vf_explained_var: 3.865265171043575e-06
          vf_loss: 90030522368.0
    num_agent_steps_sampled: 292000
    num_agent_steps_trained: 292000
    num_steps_sampled: 292000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,73,484.346,292000,-552962,1239770.0,-1638230.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 296000
  custom_metrics: {}
  date: 2022-10-13_15-22-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 1239766.4706025822
  episode_reward_mean: -505353.7029097623
  episode_reward_min: -1718532.1357202767
  episodes_this_iter: 44
  episodes_total: 3288
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.9015757441520691
          entropy_coeff: 0.0
          kl: 0.0059952642768621445
          model: {}
          policy_loss: -0.001772459945641458
          total_loss: 69502296064.0
          vf_explained_var: 3.743428169400431e-05
          vf_loss: 69502296064.0
    num_agent_steps_sampled: 296000
    num_agent_steps_trained: 296000
    num_steps_sampled: 296000
    num_



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,74,491.012,296000,-505354,1239770.0,-1718530.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 300000
  custom_metrics: {}
  date: 2022-10-13_15-22-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 934669.9519595274
  episode_reward_mean: -524838.6267912296
  episode_reward_min: -1814776.5134072916
  episodes_this_iter: 44
  episodes_total: 3332
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 0.9053293466567993
          entropy_coeff: 0.0
          kl: 0.013315346091985703
          model: {}
          policy_loss: -0.0020958257373422384
          total_loss: 88953421824.0
          vf_explained_var: 1.705237627902534e-05
          vf_loss: 88953421824.0
    num_agent_steps_sampled: 300000
    num_agent_steps_trained: 300000
    num_steps_sampled: 300000
    num_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,75,497.712,300000,-524839,934670,-1814780.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,75,497.712,300000,-524839,934670,-1814780.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 304000
  custom_metrics: {}
  date: 2022-10-13_15-22-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 934669.9519595274
  episode_reward_mean: -557881.7441946544
  episode_reward_min: -1860988.1373872138
  episodes_this_iter: 44
  episodes_total: 3376
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 1.0301275253295898
          entropy_coeff: 0.0
          kl: 0.0099044069647789
          model: {}
          policy_loss: -0.010783226229250431
          total_loss: 57529217024.0
          vf_explained_var: -1.7415451338820276e-06
          vf_loss: 57529217024.0
    num_agent_steps_sampled: 304000
    num_agent_steps_trained: 304000
    num_steps_sampled: 304000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,76,504.335,304000,-557882,934670,-1860990.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 308000
  custom_metrics: {}
  date: 2022-10-13_15-22-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 728354.7453711343
  episode_reward_mean: -581060.2407732703
  episode_reward_min: -1860988.1373872138
  episodes_this_iter: 46
  episodes_total: 3422
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.139062523841858
          cur_lr: 4.999999873689376e-05
          entropy: 1.110644817352295
          entropy_coeff: 0.0
          kl: 0.02087615244090557
          model: {}
          policy_loss: 0.0009313324699178338
          total_loss: 87423410176.0
          vf_explained_var: 1.0713005394791253e-05
          vf_loss: 87423410176.0
    num_agent_steps_sampled: 308000
    num_agent_steps_trained: 308000
    num_steps_sampled: 308000
    num_ste



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,77,510.848,308000,-581060,728355,-1860990.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 312000
  custom_metrics: {}
  date: 2022-10-13_15-22-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 748857.619654732
  episode_reward_mean: -579042.0731557115
  episode_reward_min: -1640692.1948541172
  episodes_this_iter: 44
  episodes_total: 3466
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.991172194480896
          entropy_coeff: 0.0
          kl: 0.010824271477758884
          model: {}
          policy_loss: -0.0005893716006539762
          total_loss: 83521560576.0
          vf_explained_var: 9.5621871878393e-06
          vf_loss: 83521560576.0
    num_agent_steps_sampled: 312000
    num_agent_steps_trained: 312000
    num_steps_sampled: 312000
    num_steps



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,78,517.466,312000,-579042,748858,-1640690.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 316000
  custom_metrics: {}
  date: 2022-10-13_15-23-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 748857.619654732
  episode_reward_mean: -601808.8622326199
  episode_reward_min: -1639369.6353479472
  episodes_this_iter: 44
  episodes_total: 3510
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 1.1159876585006714
          entropy_coeff: 0.0
          kl: 0.005754163023084402
          model: {}
          policy_loss: -0.0038214719388633966
          total_loss: 93680508928.0
          vf_explained_var: 1.5639490129615297e-06
          vf_loss: 93680508928.0
    num_agent_steps_sampled: 316000
    num_agent_steps_trained: 316000
    num_steps_sampled: 316000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,79,524.222,316000,-601809,748858,-1639370.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 320000
  custom_metrics: {}
  date: 2022-10-13_15-23-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 748857.619654732
  episode_reward_mean: -583345.9136153266
  episode_reward_min: -1493137.0568961382
  episodes_this_iter: 44
  episodes_total: 3554
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.31187310814857483
          entropy_coeff: 0.0
          kl: 0.007033149246126413
          model: {}
          policy_loss: -0.005243247840553522
          total_loss: 83349602304.0
          vf_explained_var: 1.8228125782115967e-06
          vf_loss: 83349602304.0
    num_agent_steps_sampled: 320000
    num_agent_steps_trained: 320000
    num_steps_sampled: 320000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,80,530.941,320000,-583346,748858,-1493140.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,80,530.941,320000,-583346,748858,-1493140.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 324000
  custom_metrics: {}
  date: 2022-10-13_15-23-14
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 605984.0037994388
  episode_reward_mean: -497460.1121261639
  episode_reward_min: -1554589.0907236598
  episodes_this_iter: 46
  episodes_total: 3600
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.7631836533546448
          entropy_coeff: 0.0
          kl: 0.012398798018693924
          model: {}
          policy_loss: 0.0030641104094684124
          total_loss: 50658566144.0
          vf_explained_var: -2.588463030406274e-05
          vf_loss: 50658566144.0
    num_agent_steps_sampled: 324000
    num_agent_steps_trained: 324000
    num_steps_sampled: 324000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,81,537.538,324000,-497460,605984,-1554590.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 328000
  custom_metrics: {}
  date: 2022-10-13_15-23-21
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 532704.2007866233
  episode_reward_mean: -544562.2151679571
  episode_reward_min: -1763869.0381827208
  episodes_this_iter: 44
  episodes_total: 3644
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.8151063919067383
          entropy_coeff: 0.0
          kl: 0.0052625141106545925
          model: {}
          policy_loss: -0.002598509658128023
          total_loss: 90735026176.0
          vf_explained_var: 5.607643288385589e-06
          vf_loss: 90735026176.0
    num_agent_steps_sampled: 328000
    num_agent_steps_trained: 328000
    num_steps_sampled: 328000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,82,544.11,328000,-544562,532704,-1763870.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 332000
  custom_metrics: {}
  date: 2022-10-13_15-23-28
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 303262.3029853933
  episode_reward_mean: -580304.1721132899
  episode_reward_min: -1763869.0381827208
  episodes_this_iter: 44
  episodes_total: 3688
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.9184696078300476
          entropy_coeff: 0.0
          kl: 0.006842876318842173
          model: {}
          policy_loss: -0.002246872056275606
          total_loss: 62170796032.0
          vf_explained_var: 3.7024738048785366e-06
          vf_loss: 62170796032.0
    num_agent_steps_sampled: 332000
    num_agent_steps_trained: 332000
    num_steps_sampled: 332000
    num_s



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,83,550.694,332000,-580304,303262,-1763870.0,90




Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 336000
  custom_metrics: {}
  date: 2022-10-13_15-23-34
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 734401.7952542793
  episode_reward_mean: -525258.7017775074
  episode_reward_min: -1620026.4066235588
  episodes_this_iter: 44
  episodes_total: 3732
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.8484278321266174
          entropy_coeff: 0.0
          kl: 0.007743925787508488
          model: {}
          policy_loss: -0.0013584318803623319
          total_loss: 60952186880.0
          vf_explained_var: 2.8683421987807378e-05
          vf_loss: 60952186880.0
    num_agent_steps_sampled: 336000
    num_agent_steps_trained: 336000
    num_steps_sampled: 336000
    num_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,84,557.221,336000,-525259,734402,-1620030.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 340000
  custom_metrics: {}
  date: 2022-10-13_15-23-41
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 941140.5578775547
  episode_reward_mean: -452823.9872967374
  episode_reward_min: -1625557.7217381361
  episodes_this_iter: 44
  episodes_total: 3776
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.7847670912742615
          entropy_coeff: 0.0
          kl: 0.007242908235639334
          model: {}
          policy_loss: -0.008410214446485043
          total_loss: 60087242752.0
          vf_explained_var: 6.769357241864782e-06
          vf_loss: 60087242752.0
    num_agent_steps_sampled: 340000
    num_agent_steps_trained: 340000
    num_steps_sampled: 340000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,85,563.838,340000,-452824,941141,-1625560.0,90


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,85,563.838,340000,-452824,941141,-1625560.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 344000
  custom_metrics: {}
  date: 2022-10-13_15-23-48
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 941140.5578775547
  episode_reward_mean: -456785.1846327894
  episode_reward_min: -1625557.7217381361
  episodes_this_iter: 46
  episodes_total: 3822
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.7839593291282654
          entropy_coeff: 0.0
          kl: 0.006309488322585821
          model: {}
          policy_loss: -0.004098461475223303
          total_loss: 69421989888.0
          vf_explained_var: 3.209190981579013e-05
          vf_loss: 69421989888.0
    num_agent_steps_sampled: 344000
    num_agent_steps_trained: 344000
    num_steps_sampled: 344000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,86,570.515,344000,-456785,941141,-1625560.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 348000
  custom_metrics: {}
  date: 2022-10-13_15-23-54
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 543533.3175955022
  episode_reward_mean: -463563.59846403025
  episode_reward_min: -1522018.8341546098
  episodes_this_iter: 44
  episodes_total: 3866
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 0.7072375416755676
          entropy_coeff: 0.0
          kl: 0.0073422943241894245
          model: {}
          policy_loss: -0.0004699081473518163
          total_loss: 49518637056.0
          vf_explained_var: 8.041755791055039e-06
          vf_loss: 49518637056.0
    num_agent_steps_sampled: 348000
    num_agent_steps_trained: 348000
    num_steps_sampled: 348000
    num



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,87,577.217,348000,-463564,543533,-1522020.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 352000
  custom_metrics: {}
  date: 2022-10-13_15-24-01
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 543533.3175955022
  episode_reward_mean: -468519.070406697
  episode_reward_min: -1415769.264770692
  episodes_this_iter: 44
  episodes_total: 3910
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 1.0874221324920654
          entropy_coeff: 0.0
          kl: 0.006509528029710054
          model: {}
          policy_loss: -0.008173041045665741
          total_loss: 67527688192.0
          vf_explained_var: 4.246991011314094e-06
          vf_loss: 67527688192.0
    num_agent_steps_sampled: 352000
    num_agent_steps_trained: 352000
    num_steps_sampled: 352000
    num_step



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,88,583.94,352000,-468519,543533,-1415770.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 356000
  custom_metrics: {}
  date: 2022-10-13_15-24-08
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 364402.5564114539
  episode_reward_mean: -583798.362350118
  episode_reward_min: -1790534.6667692396
  episodes_this_iter: 44
  episodes_total: 3954
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 1.2729345560073853
          entropy_coeff: 0.0
          kl: 0.006628402508795261
          model: {}
          policy_loss: -0.004022764042019844
          total_loss: 81067769856.0
          vf_explained_var: 1.2243434866832104e-05
          vf_loss: 81067769856.0
    num_agent_steps_sampled: 356000
    num_agent_steps_trained: 356000
    num_steps_sampled: 356000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,89,590.785,356000,-583798,364403,-1790530.0,90


Result for PPO_InventoryEnv_ea493_00000:
  agent_timesteps_total: 360000
  custom_metrics: {}
  date: 2022-10-13_15-24-15
  done: false
  episode_len_mean: 90.0
  episode_media: {}
  episode_reward_max: 953603.4691257151
  episode_reward_mean: -597572.9155973935
  episode_reward_min: -1790534.6667692396
  episodes_this_iter: 46
  episodes_total: 4000
  experiment_id: 16c92e0843fe45b6a7fa3194faada972
  hostname: devbox-x299
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.708593726158142
          cur_lr: 4.999999873689376e-05
          entropy: 1.2467707395553589
          entropy_coeff: 0.0
          kl: 0.007653938140720129
          model: {}
          policy_loss: -0.002141451695933938
          total_loss: 79705309184.0
          vf_explained_var: 7.646443009434734e-06
          vf_loss: 79705309184.0
    num_agent_steps_sampled: 360000
    num_agent_steps_trained: 360000
    num_steps_sampled: 360000
    num_st



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_InventoryEnv_ea493_00000,RUNNING,192.168.0.178:27788,90,597.344,360000,-597573,953603,-1790530.0,90


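# My agent does not learn anything!

Over iterations 70–90 shown above, `episode_reward_mean` just fluctuates between roughly -450,000 and -600,000 with no upward trend, `vf_explained_var` stays at essentially 0, and `vf_loss` is on the order of 10^10: the value function cannot fit the unscaled rewards, so the policy does not improve.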

<img src="images/no_learning.png" width="500" />

## In the next videos of this course, we will learn several tricks that improve performance across a wide range of custom environments:

1. Observation Normalization
2. Action Normalization
3. Reward Scaling
4. Simple Hyperparameter Tuning
5. Advanced Hyperparameter Tuning, e.g., Population Based Training