In [2]:
import gym
import numpy as np


class MyScaleReward(gym.RewardWrapper):
    """Rescale raw environment rewards to a bounded, roughly unit-sized range.

    A "typical" high reward (average revenue) and a "typical" low reward
    (average buying + holding + goodwill costs, negated) are estimated from
    the wrapped environment's ``max_*`` attributes.  The raw reward is then
    mapped linearly so that the typical low lands on -1 and the typical high
    on +1, and finally squashed with ``arctan`` so that outliers stay inside
    the open interval (-2, 2).

    The wrapped environment must expose ``max_unit_selling_price``,
    ``max_mean_daily_demand``, ``max_daily_holding_cost_per_unit`` and
    ``max_goodwill_penalty_per_unit``.
    """

    def _average_reward_bounds(self):
        """Return ``(avg_low_scale, avg_high_scale)`` estimated from the env limits.

        Each "average" is a rough heuristic derived from the corresponding
        maximum exposed by the wrapped environment.
        """
        env = self.env

        # Revenue side: average price times average units sold per day.
        avg_unit_selling_price = env.max_unit_selling_price / 2
        avg_num_items_sold_per_day = env.max_mean_daily_demand / 2
        avg_high_scale = avg_unit_selling_price * avg_num_items_sold_per_day

        # Purchasing cost: units bought per day are taken equal to units sold.
        # NOTE(review): the buying price is estimated as a quarter of the max
        # *selling* price - presumably because it is bounded above by the
        # selling price; confirm against the environment definition.
        avg_num_items_bought_per_day = avg_num_items_sold_per_day
        avg_unit_buying_price = env.max_unit_selling_price / 4

        # Holding cost: average per-unit cost times average units held.
        avg_daily_holding_cost_per_unit = env.max_daily_holding_cost_per_unit / 2
        avg_num_items_held_per_day = env.max_mean_daily_demand / 2

        # goodwill_penalty_per_unit is chosen randomly from a uniform
        # distribution over (0, max value), so its average is half the max.
        avg_goodwill_penalty_per_unit = env.max_goodwill_penalty_per_unit / 2

        # Unmet demand is max(0, demand - on-hand inventory).  Since demand and
        # on-hand inventory both average about max demand / 2, a rough
        # approximation for the average unmet demand is max demand / 4.
        avg_unmet_demand = env.max_mean_daily_demand / 4

        # The low scale is the negated sum of all average daily costs,
        # including the goodwill penalty for unmet demand.
        avg_low_scale = -(
            avg_unit_buying_price * avg_num_items_bought_per_day
            + avg_daily_holding_cost_per_unit * avg_num_items_held_per_day
            + avg_goodwill_penalty_per_unit * avg_unmet_demand
        )
        return avg_low_scale, avg_high_scale

    def reward(self, reward):
        """Map a raw reward to a scaled reward in the open interval (-2, 2).

        Args:
            reward: Raw reward produced by the wrapped environment.

        Returns:
            ``arctan(x) / arctan(1)``, where ``x`` is the reward mapped
            linearly so that the typical low is -1 and the typical high is +1.
        """
        avg_low_scale, avg_high_scale = self._average_reward_bounds()
        mid = (avg_high_scale + avg_low_scale) / 2
        # Linear map: avg_low_scale -> -1, avg_high_scale -> +1.
        linearly_mapped_reward = 2 * (reward - mid) / (avg_high_scale - avg_low_scale)
        # arctan lies in (-pi/2, pi/2); dividing by arctan(1) == pi/4 keeps
        # +/-1 fixed and bounds the result to (-2, 2).
        return np.arctan(linearly_mapped_reward) / np.arctan(1)