In [2]:
import gym
from gym.spaces import Box
import numpy as np
from numpy.random import default_rng


class InventoryEnv(gym.Env):
    """
    Single-product inventory environment with a fixed delivery lead time.

    Observation layout (length ``lead_time + 4``):
        obs[0]              stock that the current day's demand is served from
        obs[1:lead_time]    earlier orders, shifting one slot toward index 0 on every step
        obs[lead_time]      mean daily demand (used as the Poisson mean in `step`)
        obs[lead_time + 1]  unit selling price
        obs[lead_time + 2]  unit buying price
        obs[lead_time + 3]  daily holding cost per unit

    Action: a one-element array with the quantity to order today.
    """

    def __init__(self):
        """
        Must define self.observation_space and self.action_space here
        """
        self.max_capacity = 4000

        self.action_space = Box(low=np.array([0]), high=np.array([self.max_capacity]))

        self.lead_time = 5
        self.obs_dim = self.lead_time + 4

        self.max_mean_daily_demand = 200
        self.max_unit_selling_price = 100
        self.max_daily_holding_cost_per_unit = 5

        obs_low = np.zeros((self.obs_dim,))
        # The buying price is drawn below the selling price in `reset`, so it shares
        # the selling price's upper bound.
        obs_high = np.array([self.max_capacity] * self.lead_time +
                            [self.max_mean_daily_demand, self.max_unit_selling_price,
                             self.max_unit_selling_price, self.max_daily_holding_cost_per_unit
                             ]
                            )
        self.observation_space = Box(low=obs_low, high=obs_high)

        # Unseeded by default; call `seed()` to make episodes reproducible.
        self.rng = default_rng()

        self.current_obs = None
        self.episode_length_in_days = 90
        self.day_num = None

    def reset(self):
        """
        Returns: the observation of the initial state
        Reset the environment to initial state so that a new episode (independent of previous ones) may start
        """
        # Each draw is bounded by the previous one: buying price <= selling price,
        # holding cost <= min(buying price, max holding cost).
        mean_daily_demand = self.rng.uniform() * self.max_mean_daily_demand
        selling_price = self.rng.uniform() * self.max_unit_selling_price
        buying_price = self.rng.uniform() * selling_price
        daily_holding_cost_per_unit = self.rng.uniform() * min(buying_price,
                                                               self.max_daily_holding_cost_per_unit
                                                               )
        # Every episode starts with no stock on hand and nothing on order.
        self.current_obs = np.array([0] * self.lead_time +
                                    [mean_daily_demand, selling_price, buying_price,
                                     daily_holding_cost_per_unit,
                                     ]
                                    )
        self.day_num = 0
        return self.current_obs

    def step(self, action):
        """
        Returns: Given current obs and action, returns the next observation, the reward, done and optionally additional info
        `action` is a one-element array; the order is capped so that the total of the
        first `lead_time` observation entries never exceeds `max_capacity`.
        """
        buys = min(action[0], self.max_capacity - np.sum(self.current_obs[:self.lead_time]))

        demand = self.rng.poisson(self.current_obs[self.lead_time])
        # Shift the pipeline one slot toward index 0, append today's order at the end
        # and carry the episode parameters over unchanged.
        next_obs = np.concatenate((self.current_obs[1: self.lead_time],
                                   np.array([buys]),
                                   self.current_obs[self.lead_time:]
                                   )
                                  )
        # Stock left after serving demand (never negative) is added to the new slot 0.
        next_obs[0] += max(0, self.current_obs[0] - demand)

        # current_obs[0] + current_obs[1] - next_obs[0] is the number of units sold;
        # next_obs[0] - current_obs[1] is the unsold stock that incurs holding cost.
        reward = (self.current_obs[self.lead_time + 1] * (self.current_obs[0] + self.current_obs[1] - next_obs[0]) -
                  self.current_obs[self.lead_time + 2] * buys -
                  self.current_obs[self.lead_time + 3] * (next_obs[0] - self.current_obs[1])
                  )

        self.day_num += 1
        done = self.day_num >= self.episode_length_in_days

        self.current_obs = next_obs

        return self.current_obs, reward, done, {}

    def render(self, mode="human"):
        """
        Returns: None
        Show the current environment state e.g. the graphical window in `CartPole-v1`
        This method must be implemented, but it is OK to have an empty implementation if rendering is not
        important
        """
        pass

    def close(self):
        """
        Returns: None
        This method is optional. Used to cleanup all resources (threads, graphical windows) etc.
        """
        pass

    def seed(self, seed=None):
        """
        Returns: List of seeds
        Replaces the environment's random number generator with one created from `seed`,
        so that subsequent `reset`/`step` calls are deterministic for a given seed.
        With `seed=None` the generator is re-created from fresh OS entropy.
        """
        self.rng = default_rng(seed)
        return [seed]

In [25]:
class MakeHard(gym.Wrapper):
    """
    Wrapper that extends every observation of the wrapped env by one trailing
    element, `goodwill_penalty_per_unit`, which is re-drawn on every `reset`.
    """

    def __init__(self, env):
        super().__init__(env)
        self.max_goodwill_penalty_per_unit = 10
        # Extend the wrapped env's bounds by one slot covering
        # [0, max_goodwill_penalty_per_unit] for the extra observation element.
        base_space = self.env.observation_space
        extended_low = np.append(base_space.low, 0)
        extended_high = np.append(base_space.high, self.max_goodwill_penalty_per_unit)
        self.observation_space = Box(low=extended_low, high=extended_high)
        self.obs_dim = self.env.obs_dim + 1

    def _with_penalty(self, base_obs):
        # Append the current episode's goodwill penalty as the last observation element.
        return np.append(base_obs, self.goodwill_penalty_per_unit)

    def reset(self):
        base_obs = self.env.reset()
        # Draw a fresh penalty for the new episode and keep it on the instance so
        # that `step` can attach the same value for the rest of the episode.
        penalty_fraction = self.env.rng.uniform()
        self.goodwill_penalty_per_unit = penalty_fraction * self.max_goodwill_penalty_per_unit
        return self._with_penalty(base_obs)

    def step(self, action):
        base_obs, reward, done, info = self.env.step(action)
        # Reward, done and info pass through untouched; only the observation is extended.
        return self._with_penalty(base_obs), reward, done, info

In [26]:
# Build the base environment and wrap it; `wrapped` shares this `env` instance
# (and therefore its state and random number generator).
env = InventoryEnv()
wrapped = MakeHard(env)

Notice how the state of the wrapped environment has one more element (the `goodwill_penalty_per_unit`, in the last position) than the state of the original env.

In [27]:
wrapped.reset()    # wrapped env: base observation plus goodwill_penalty_per_unit as the last element

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       13.95544291, 77.01593137, 71.21781838,  3.27505571,  8.66245348])

In [28]:
env.reset()    # original env: no goodwill element (this is the same instance that `wrapped` wraps)

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        , 103.11751729,  26.2935599 ,   3.91117194,
         2.33109171])

We get different values of `goodwill_penalty_per_unit` for each episode, as expected

In [29]:
wrapped.reset()    # new episode: the last element (goodwill_penalty_per_unit) has a new random value

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        , 194.79944821,  72.0614243 ,  63.9324476 ,
         0.79374726,   7.63863415])

The next state preserves the value of `goodwill_penalty_per_unit`, as expected

In [30]:
# Take one step with a randomly sampled action; only the observation is kept
# (reward, done flag and info are discarded).
obs, _, _, _ = wrapped.step(wrapped.action_space.sample())
print(obs)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.02853955e+03 1.94799448e+02 7.20614243e+01 6.39324476e+01
 7.93747261e-01 7.63863415e+00]


Notice how the observation space of the wrapped environment has one additional dimension compared to the original environment. If you don't define this correctly in the wrapped environment, `rllib` will complain since it checks whether the observations belong to the observation space.

In [31]:
# Observation space of the wrapper: the base env's bounds plus one extra slot
# bounded by [0, max_goodwill_penalty_per_unit].
wrapped.observation_space

Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], [4000. 4000. 4000. 4000. 4000.  200.  100.  100.    5.   10.], (10,), float32)

In [32]:
# Observation space of the original env, for comparison (one dimension fewer).
env.observation_space

Box([0. 0. 0. 0. 0. 0. 0. 0. 0.], [4000. 4000. 4000. 4000. 4000.  200.  100.  100.    5.], (9,), float32)