In [1]:
import torch
import torch.distributions as d
import torch.nn.functional as F
from retail import retail
import numpy as np


In this experiment, we define our CVaR at the 5% level: given that we are in the 5% worst cases, how much do we expect to waste per item on average? To do so, we weight a linear utility so that it only cares about waste, not sales or availability.

In [None]:
# --- Experiment configuration ---
# Total number of customers per store; split across `n_buckets` buckets when a store is sampled.
n_customers = 2500
n_buckets = 4
# Number of independent trajectories (one sampled store each) in the Monte Carlo estimate.
monte_carlo_size = 100
# Distribution from which the relative bucket weights are drawn.
bucketDist = d.uniform.Uniform(0, 1)

# Keyword arguments forwarded to the store environment constructor.
# (A single definition: an earlier duplicate of this dict was dead code, since it
# was overwritten before ever being used.)
store_args = {
    'assortment_size': 1000,
    'max_stock': 1000,
    'bucket_cov': torch.eye(n_buckets) / 100,
    'seed': 1066,
    'utility_function': 'linear',
    # We give a null weight to availability and sales
    'utility_weights': {
        'alpha': 0.,
        'beta': 1.,
        'gamma': 0.,
    },
    'forecastVariance': 0.,
    'horizon': 100,
    'lead_time': 1,
}
# We define our quantile for the CVaR
cvar_level = 0.05


We simply define a computation loop where we operate on our environment and store waste results. We compute the CVaR over trajectories of length 100, over 100 stores.

In [None]:
# Average per-step reward obtained by the ordering policy, one entry per sampled store.
summed_rewards_policy = []
for trajectory_seed in range(monte_carlo_size):
    # Seed each trajectory so that the sampled customer repartition is reproducible.
    torch.manual_seed(trajectory_seed)
    # Draw random bucket weights and rescale them to `n_customers` customers overall.
    bucket_weights = bucketDist.sample((n_buckets,))
    sample_bucket_customers = (n_customers * bucket_weights / bucket_weights.sum()).round()
    store = retail.StoreEnv(**store_args, bucket_customers=sample_bucket_customers)

    step_rewards = []
    while True:
        # Policy: order whatever is needed to raise the inventory position to
        # (forecast * customers) plus three times `spread`, never a negative amount.
        # The customer count used is the one of the largest bucket.
        peak_customers = sample_bucket_customers.max()
        forecast = store.forecast.squeeze()
        # NOTE(review): a binomial standard deviation would be sqrt(n*p*(1-p));
        # this expression is sqrt(n*p + (1-p)) -- confirm it is intended.
        spread = torch.sqrt(peak_customers * forecast + (1 - forecast))
        target_level = 3 * spread + store.forecast.squeeze() * peak_customers
        order = F.relu(target_level - store.get_full_inventory_position()).round()

        # Advance the environment by one step with the chosen order.
        transition = store.step(order.numpy())
        # Element 1 of the step result is recorded as the reward, element 2 is the end flag.
        step_rewards.append(transition[1])
        if transition[2]:
            break

    # Keep the mean reward of this trajectory.
    summed_rewards_policy.append(torch.stack(step_rewards).mean())

Having stored the results, we simply need to compute an estimate of the 5% quantile (the VaR) and the expectation of the rewards below it (our CVaR).

In [3]:
# Gather the per-trajectory average rewards into a single tensor.
rewards = torch.stack(summed_rewards_policy)
# We first obtain the Value-at-Risk: the empirical `cvar_level` quantile of the rewards.
var = np.quantile(rewards, cvar_level)
# We retrieve the elements at or below the VaR. The comparison is inclusive so that
# a sample equal to the VaR belongs to the tail; with a strict `<` the tail would be
# empty (and its mean NaN) whenever the quantile coincides with the smallest sample.
bad_cases = rewards[rewards <= var]
# Finally, we compute the CVaR: the mean reward over that tail.
bad_cases.mean()


tensor(-0.4768)

Thus, in the worst 5% of cases, we can expect to waste about 0.48 monetary units (€0.48 here, i.e. 48 cents) per item in the assortment per day, over our defined item distribution, for 100-day-long trajectories.