In [2]:
import gym
from gym.spaces import Box
import numpy as np
from numpy.random import default_rng


class InventoryEnv(gym.Env):
    """
    Single-product inventory management environment.

    Observation layout (a 1-D array of length ``lead_time + 4``):
        * indices ``0 .. lead_time - 1``: inventory quantities. Index 0 is the stock
          that demand is served from in the current step; on every step the remaining
          entries shift one position towards index 0 and the newly ordered quantity is
          placed at index ``lead_time - 1``.
        * index ``lead_time``: mean of the Poisson daily demand
        * index ``lead_time + 1``: unit selling price
        * index ``lead_time + 2``: unit buying price
        * index ``lead_time + 3``: daily holding cost per unit

    Action: a length-1 array holding the quantity to order in this step.
    """

    def __init__(self):
        """
        Must define self.observation_space and self.action_space here
        """
        self.max_capacity = 4000

        self.action_space = Box(low=np.array([0]), high=np.array([self.max_capacity]))

        self.lead_time = 5
        self.obs_dim = self.lead_time + 4

        self.max_mean_daily_demand = 200
        self.max_unit_selling_price = 100
        self.max_daily_holding_cost_per_unit = 5

        obs_low = np.zeros((self.obs_dim,))
        # The buying price shares its upper bound with the selling price because
        # reset() always draws it as a fraction of the selling price.
        obs_high = np.array([self.max_capacity for _ in range(self.lead_time)] +
                            [self.max_mean_daily_demand, self.max_unit_selling_price,
                             self.max_unit_selling_price, self.max_daily_holding_cost_per_unit
                             ]
                            )
        self.observation_space = Box(low=obs_low, high=obs_high)

        # Unseeded by default; call seed() to get reproducible episodes.
        self.rng = default_rng()

        self.current_obs = None
        self.episode_length_in_days = 90
        self.day_num = None

    def reset(self):
        """
        Returns: the observation of the initial state
        Reset the environment to initial state so that a new episode (independent of previous ones) may start

        Every episode starts with zero inventory and freshly sampled demand/price
        parameters: the buying price never exceeds the selling price, and the holding
        cost never exceeds either the buying price or its configured maximum.
        """
        mean_daily_demand = self.rng.uniform() * self.max_mean_daily_demand
        selling_price = self.rng.uniform() * self.max_unit_selling_price
        buying_price = self.rng.uniform() * selling_price
        daily_holding_cost_per_unit = self.rng.uniform() * min(buying_price,
                                                               self.max_daily_holding_cost_per_unit
                                                               )
        self.current_obs = np.array([0 for _ in range(self.lead_time)] +
                                    [mean_daily_demand, selling_price, buying_price,
                                     daily_holding_cost_per_unit,
                                     ]
                                    )
        self.day_num = 0
        return self.current_obs

    def step(self, action):
        """
        Returns: Given current obs and action, returns the next observation, the reward, done and optionally additional info

        ``action[0]`` is the requested order quantity. It is clamped to the range
        ``[0, remaining capacity]`` so that neither a negative order nor an order that
        would push total inventory above ``max_capacity`` can be placed.

        The reward is sales revenue minus purchase cost minus holding cost on the
        stock left over after demand. ``info["demand"]`` carries the sampled demand.
        """
        lead = self.lead_time
        on_hand = self.current_obs[0]
        mean_demand, selling_price, buying_price, holding_cost = self.current_obs[lead:]

        free_capacity = self.max_capacity - np.sum(self.current_obs[:lead])
        # Lower clamp: a negative order would remove stock from the pipeline and
        # generate a negative purchase cost (i.e. free reward).
        buys = max(0.0, min(action[0], free_capacity))

        demand = self.rng.poisson(mean_demand)
        units_sold = min(on_hand, demand)
        leftover = max(0, on_hand - demand)

        # Shift the inventory entries one day forward and append the new order;
        # the demand/price parameters are carried over unchanged.
        next_obs = np.concatenate((self.current_obs[1: lead],
                                   np.array([buys]),
                                   self.current_obs[lead:]
                                   )
                                  )
        # Unsold stock stays available and is merged with the quantity moving into slot 0.
        next_obs[0] += leftover

        reward = (selling_price * units_sold -
                  buying_price * buys -
                  holding_cost * leftover
                  )

        self.day_num += 1
        done = self.day_num >= self.episode_length_in_days

        self.current_obs = next_obs

        return self.current_obs, reward, done, {"demand": demand}

    def render(self, mode="human"):
        """
        Returns: None
        Show the current environment state e.g. the graphical window in `CartPole-v1`
        This method must be implemented, but it is OK to have an empty implementation if rendering is not
        important
        """
        pass

    def close(self):
        """
        Returns: None
        This method is optional. Used to cleanup all resources (threads, graphical windows) etc.
        """
        pass

    def seed(self, seed=None):
        """
        Returns: List of seeds
        This method is optional. Used to set seeds for the environment's random number generator for
        obtaining deterministic behavior

        Replaces ``self.rng`` with a generator built from ``seed``. When ``seed`` is
        None, fresh entropy is drawn and returned, so the run can still be reproduced
        by passing the returned value back in.
        """
        seed_sequence = np.random.SeedSequence(seed)
        self.rng = default_rng(seed_sequence)
        return [seed_sequence.entropy]

Here's a possible implementation of the `MakeHard` wrapper that returns the correct reward.

In [9]:
class MakeHard(gym.Wrapper):
    """
    Wrapper that makes the inventory problem harder by charging a goodwill penalty
    for every unit of demand that cannot be served from on-hand inventory.

    The per-unit penalty is sampled once per episode and appended to the wrapped
    environment's observation as one extra trailing entry.
    """

    def __init__(self, env):
        """Wrap `env` and extend its observation space by one goodwill-penalty entry."""
        super().__init__(env)
        self.max_goodwill_penalty_per_unit = 10
        base_space = self.env.observation_space
        self.observation_space = Box(
            low=np.append(base_space.low, 0),
            high=np.append(base_space.high, self.max_goodwill_penalty_per_unit),
        )
        self.obs_dim = self.env.obs_dim + 1

    def reset(self):
        """
        Reset the wrapped environment, then draw this episode's goodwill penalty.

        Returns: the wrapped environment's initial observation with the penalty appended
        """
        base_obs = self.env.reset()
        # Drawn from the wrapped env's generator, after its own reset draws.
        penalty_fraction = self.env.rng.uniform()
        self.goodwill_penalty_per_unit = penalty_fraction * self.max_goodwill_penalty_per_unit
        return np.append(base_obs, self.goodwill_penalty_per_unit)

    def step(self, action):
        """
        Step the wrapped environment and subtract the goodwill penalty from its reward.

        Returns: extended observation, penalised reward, done flag and the wrapped env's info
        """
        # Read the on-hand inventory first: stepping the wrapped env overwrites
        # its current observation with the next day's values.
        stock_before_step = self.env.current_obs[0]
        base_obs, base_reward, done, info = self.env.step(action)

        # Only demand in excess of the stock that was on hand is penalised.
        unmet_demand = max(0, info["demand"] - stock_before_step)
        goodwill_penalty = - self.goodwill_penalty_per_unit * unmet_demand

        hard_obs = np.append(base_obs, self.goodwill_penalty_per_unit)
        return hard_obs, base_reward + goodwill_penalty, done, info

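In the test below, we never order any inventory, so the on-hand stock stays at zero: there is no revenue, no purchase cost and no holding cost, and every unit of demand goes unmet. Notice that the rewards are nevertheless negative. That's because each unit of unmet demand incurs the goodwill penalty.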

In [10]:
# Roll out one full episode of the hard environment while never ordering
# anything, printing the reward received on each day.
inventory_env = InventoryEnv()
inventory_env_hard = MakeHard(inventory_env)
inventory_env_hard.reset()

done = False
while not done:
    # We do not order any inventory
    obs, r, done, info = inventory_env_hard.step(np.array([0]))
    print(r)

-621.5914918381816
-726.7135823696387
-557.6041323842511
-726.7135823696387
-726.7135823696387
-703.8609539932351
-735.8546337202002
-763.2777877718847
-763.2777877718847
-749.5662107460424
-717.5725310190772
-662.7262229157083
-713.0020053437966
-681.0083256168313
-713.0020053437966
-630.7325431887431
-612.4504404876201
-685.5788512921121
-671.8672742662698
-617.0209661629008
-585.0272864359356
-744.9956850707617
-621.5914918381816
-676.4377999415506
-754.1367364213232
-713.0020053437966
-644.4441202145853
-667.2967485909891
-630.7325431887431
-690.1493769673928
-731.2841080449195
-658.1556972404276
-607.8799148123393
-758.7072620966039
-735.8546337202002
-703.8609539932351
-767.8483134471654
-690.1493769673928
-548.4630810336896
-772.4188391224462
-639.8735945393046
-690.1493769673928
-749.5662107460424
-548.4630810336896
-676.4377999415506
-722.143056694358
-776.9893647977269
-749.5662107460424
-676.4377999415506
-758.7072620966039
-694.7199026426736
-690.1493769673928
-735.85463372