In [1]:
import gym
from gym.spaces import Box
import numpy as np
from numpy.random import default_rng


class InventoryEnv(gym.Env):
    """
    Single-product inventory management environment.

    Observation layout (length ``lead_time + 4``):
        [0]               units on hand at the start of the day
        [1 : lead_time]   units in transit, arriving in 1 .. lead_time - 1 days
        [lead_time]       mean daily demand (rate of the Poisson demand)
        [lead_time + 1]   unit selling price
        [lead_time + 2]   unit buying price
        [lead_time + 3]   daily holding cost per unit

    The last four entries are drawn once per episode in `reset` and stay
    constant for the whole episode.

    Action: a length-1 array holding the number of units to order today. The
    order enters the end of the pipeline and becomes on-hand stock after
    ``lead_time`` days.
    """

    def __init__(self):
        """
        Must define self.observation_space and self.action_space here
        """
        # Upper bound on total inventory (on hand + in transit).
        self.max_capacity = 4000

        self.action_space = Box(low=np.array([0]), high=np.array([self.max_capacity]))

        self.lead_time = 5
        # lead_time inventory slots + 4 per-episode parameters.
        self.obs_dim = self.lead_time + 4

        self.max_mean_daily_demand = 200
        self.max_unit_selling_price = 100
        self.max_daily_holding_cost_per_unit = 5

        obs_low = np.zeros((self.obs_dim,))
        # The buying price never exceeds the selling price (see `reset`), so
        # both price entries share the same upper bound.
        obs_high = np.array([self.max_capacity for _ in range(self.lead_time)] +
                            [self.max_mean_daily_demand, self.max_unit_selling_price,
                             self.max_unit_selling_price, self.max_daily_holding_cost_per_unit
                             ]
                            )
        self.observation_space = Box(low=obs_low, high=obs_high)

        # Unseeded by default; call `seed` to obtain reproducible episodes.
        self.rng = default_rng()

        self.current_obs = None
        self.episode_length_in_days = 90
        self.day_num = None

    def reset(self):
        """
        Returns: the observation of the initial state
        Reset the environment to initial state so that a new episode (independent of previous ones) may start

        A fresh set of episode parameters (mean demand, prices, holding cost)
        is sampled and all inventory slots start at zero.
        """
        mean_daily_demand = self.rng.uniform() * self.max_mean_daily_demand
        selling_price = self.rng.uniform() * self.max_unit_selling_price
        # Buying price is sampled below the selling price.
        buying_price = self.rng.uniform() * selling_price
        # Holding cost is bounded by both the buying price and the global cap.
        daily_holding_cost_per_unit = self.rng.uniform() * min(buying_price,
                                                               self.max_daily_holding_cost_per_unit
                                                               )
        self.current_obs = np.array([0 for _ in range(self.lead_time)] +
                                    [mean_daily_demand, selling_price, buying_price,
                                     daily_holding_cost_per_unit,
                                     ]
                                    )
        self.day_num = 0
        return self.current_obs

    def step(self, action):
        """
        Returns: Given current obs and action, returns the next observation, the reward, done and optionally additional info

        The requested order ``action[0]`` is clamped to the range
        [0, remaining capacity], where remaining capacity is ``max_capacity``
        minus everything on hand or in transit. ``info["demand"]`` carries the
        demand sampled for this day.
        """
        remaining_capacity = self.max_capacity - np.sum(self.current_obs[:self.lead_time])
        # Clamp from below as well as above: a negative order would otherwise
        # remove stock from the pipeline and turn the purchase cost into income.
        buys = max(0, min(action[0], remaining_capacity))

        demand = self.rng.poisson(self.current_obs[self.lead_time])
        # Shift the pipeline one day forward: tomorrow's arrivals move to slot 0,
        # today's order enters the last slot, episode parameters are carried over.
        next_obs = np.concatenate((self.current_obs[1: self.lead_time],
                                   np.array([buys]),
                                   self.current_obs[self.lead_time:]
                                   )
                                  )
        # Stock left over after serving today's demand stays on hand.
        next_obs[0] += max(0, self.current_obs[0] - demand)

        selling_price = self.current_obs[self.lead_time + 1]
        buying_price = self.current_obs[self.lead_time + 2]
        holding_cost_per_unit = self.current_obs[self.lead_time + 3]
        # next_obs[0] = arrivals + leftover, hence:
        #   units sold  = on hand + arrivals - next_obs[0] = min(on hand, demand)
        #   units held  = next_obs[0] - arrivals           = leftover stock
        units_sold = self.current_obs[0] + self.current_obs[1] - next_obs[0]
        units_held = next_obs[0] - self.current_obs[1]

        reward = (selling_price * units_sold -
                  buying_price * buys -
                  holding_cost_per_unit * units_held
                  )

        self.day_num += 1
        done = self.day_num >= self.episode_length_in_days

        self.current_obs = next_obs

        return self.current_obs, reward, done, {"demand": demand}

    def render(self, mode="human"):
        """
        Returns: None
        Show the current environment state e.g. the graphical window in `CartPole-v1`
        This method must be implemented, but it is OK to have an empty implementation if rendering is not
        important
        """
        pass

    def close(self):
        """
        Returns: None
        This method is optional. Used to cleanup all resources (threads, graphical windows) etc.
        """
        pass

    def seed(self, seed=None):
        """
        Returns: List of seeds
        This method is optional. Used to set seeds for the environment's random number generator for
        obtaining deterministic behavior

        Replaces ``self.rng`` with a generator created from ``seed``. With
        ``seed=None`` the generator is created without an explicit seed, and
        the returned list is ``[None]``.
        """
        self.rng = default_rng(seed)
        return [seed]

Here's a possible implementation of the `ModifyReward` wrapper.

In [25]:
class ModifyReward(gym.Wrapper):
    """Reward wrapper that charges a goodwill penalty for demand that could not be served.

    Expects the wrapped environment to expose ``current_obs`` (on-hand stock at
    index 0), to report the day's demand under ``info["demand"]``, and to place
    the goodwill penalty per unit in the last element of the observation. The
    latter holds when wrapping ``ModifyObservation(inventory_env)``.
    """

    def step(self, action):
        """Step the wrapped env and return its result with the penalty added to the reward."""
        # Snapshot the stock *before* stepping: the step replaces current_obs
        # with the next day's values.
        stock_before_step = self.current_obs[0]

        next_obs, base_reward, done, info = self.env.step(action)

        # Demand that exceeded the stock available at the start of the day.
        unmet_demand = max(0, info["demand"] - stock_before_step)
        penalty_per_unit = next_obs[-1]
        goodwill_penalty = - penalty_per_unit * unmet_demand

        return next_obs, base_reward + goodwill_penalty, done, info

In [16]:
from gym import ObservationWrapper


class ModifyObservation(ObservationWrapper):
    """Observation wrapper that appends a goodwill penalty per unit to every observation.

    The penalty rate is re-sampled on each `reset` using the wrapped
    environment's ``rng`` and is bounded by ``max_goodwill_penalty_per_unit``.
    """

    def __init__(self, env):
        """Wrap `env` and widen its observation space by one extra dimension."""
        super().__init__(env)
        self.max_goodwill_penalty_per_unit = 10
        base_space = self.env.observation_space
        # The extra dimension ranges over [0, max_goodwill_penalty_per_unit].
        self.observation_space = Box(
            low=np.append(base_space.low, 0),
            high=np.append(base_space.high, self.max_goodwill_penalty_per_unit),
        )

    def reset(self):
        """Sample a new penalty rate, then reset the wrapped env and return the extended observation."""
        # Sampled before the wrapped env resets so that the first observation
        # already carries this episode's rate.
        penalty_cap = self.max_goodwill_penalty_per_unit
        self.goodwill_penalty_per_unit = self.env.rng.uniform() * penalty_cap
        return super().reset()

    def observation(self, obs):
        """Return `obs` with the current goodwill penalty per unit appended as its last element."""
        return np.append(obs, self.goodwill_penalty_per_unit)

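In the test below, notice how we get negative rewards even though we never order any inventory. With nothing in stock, nothing is sold, bought, or held, so the base reward is zero; every unit of demand goes unmet, and each step's reward is just the goodwill penalty (penalty per unit × that day's demand).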

In [27]:
inventory_env = InventoryEnv()
# The order of wrappers matters: ModifyReward reads the goodwill penalty from the
# last element of the observation, which is appended by ModifyObservation.
inventory_env_hard = ModifyReward(ModifyObservation(inventory_env))
inventory_env_hard.reset()

# Run one full episode without ever ordering inventory and print each reward.
done = False
while not done:
    obs, r, done, info = inventory_env_hard.step(np.array([0]))
    print(r)

-593.6272992865224
-533.053085073612
-508.8233993884478
-609.7804230766319
-617.8569849716866
-561.3210517063035
-605.7421421291045
-512.8616803359752
-537.0913660211394
-585.5507373914677
-561.3210517063035
-484.59371370328364
-609.7804230766319
-484.59371370328364
-557.2827707587762
-581.5124564439403
-625.9335468667414
-533.053085073612
-456.32574707059206
-589.5890183389951
-613.8187040241593
-508.8233993884478
-496.7085565458657
-524.9765231785573
-524.9765231785573
-500.74683749339306
-545.1679279161941
-480.55543275575627
-533.053085073612
-573.4358945488856
-545.1679279161941
-569.3976136013582
-537.0913660211394
-529.0148041260846
-541.1296469686667
-464.4023089656468
-573.4358945488856
-456.32574707059206
-496.7085565458657
-609.7804230766319
-573.4358945488856
-496.7085565458657
-529.0148041260846
-520.9382422310299
-597.6655802340498
-625.9335468667414
-484.59371370328364
-480.55543275575627
-541.1296469686667
-545.1679279161941
-577.474175496413
-516.8999612835025
-480.555