# Define Running Mode

Set `full_dataset = True` to use the full data set. If `full_dataset = False`, a subset is used instead that contains only one year of the hobbies sales in store TX_2.

In [18]:
# Running-mode switch: True selects the 'full_dataset' input file and enables the
# validation/saving cells; False selects the smaller 'subset' input file.
full_dataset = False

# Import Packages

In [19]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [20]:
# choose the file prefix that matches the selected running mode
prefix = 'full_dataset' if full_dataset else 'subset'

# read the merged table from the feature_engineering data directory
merged_csv_path = f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged.csv'
df_merged = pd.read_csv(merged_csv_path)

# Compute Weights

In [21]:
# drop days in future
df_merged = df_merged.loc[df_merged['data_type'] == 'train']
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_001_TX_2_validation,0,0,0,0,0,181,1.0,2015-10-22,11538,...,0.7701321,1.662342,1.33946,1.0,1,0.0,0.0,0.0,0,22
1,HOBBIES_1_002_TX_2_validation,1,0,0,0,0,181,0.0,2015-10-22,11538,...,0.6396838,3.169402,2.249556,2.0,1,0.0,0.0,0.0,0,22
2,HOBBIES_1_003_TX_2_validation,2,0,0,0,0,181,0.0,2015-10-22,11538,...,0.3457459,5.613785,2.272519,0.0,1,0.0,0.0,0.0,0,22
3,HOBBIES_1_004_TX_2_validation,3,0,0,0,0,181,0.0,2015-10-22,11538,...,0.0,,,0.0,1,0.0,0.0,0.0,0,22
4,HOBBIES_1_005_TX_2_validation,4,0,0,0,0,181,0.0,2015-10-22,11538,...,0.980265,0.566582,1.288937,0.0,1,0.0,0.0,0.0,0,22
5,HOBBIES_1_006_TX_2_validation,5,0,0,0,0,181,0.0,2015-10-22,11538,...,3.548984e-08,,,0.0,1,0.0,0.0,0.0,0,22
6,HOBBIES_1_007_TX_2_validation,6,0,0,0,0,181,0.0,2015-10-22,11538,...,0.0,,,0.0,1,0.0,0.0,0.0,0,22
7,HOBBIES_1_008_TX_2_validation,7,0,0,0,0,181,2.0,2015-10-22,11538,...,14.95879,1.513989,1.686714,4.0,1,0.0,0.0,0.0,0,22
8,HOBBIES_1_009_TX_2_validation,8,0,0,0,0,181,0.0,2015-10-22,11538,...,0.6789106,0.267606,1.320249,0.0,1,0.0,0.0,0.0,0,22
9,HOBBIES_1_010_TX_2_validation,9,0,0,0,0,181,0.0,2015-10-22,11538,...,1.09387,-0.006054,0.806169,1.0,1,0.0,0.0,0.0,0,22


In [22]:
# drop products that were not up for sale
df_merged = df_merged.loc[df_merged['sell_price'].notna()]

In [23]:
# take 28 last observations for each product
df_merged = df_merged.groupby(['id']).tail(28)
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
89270,HOBBIES_1_001_TX_2_validation,0,0,0,0,0,339,0.0,2016-03-28,11609,...,1.166585,19.222143,3.823490,0.0,1,0.0,0.0,0.0,0,28
89271,HOBBIES_1_002_TX_2_validation,1,0,0,0,0,339,0.0,2016-03-28,11609,...,0.402578,17.401083,4.280921,0.0,1,0.0,0.0,0.0,0,28
89272,HOBBIES_1_003_TX_2_validation,2,0,0,0,0,339,0.0,2016-03-28,11609,...,0.253708,11.183432,3.659999,0.0,1,0.0,0.0,0.0,0,28
89273,HOBBIES_1_004_TX_2_validation,3,0,0,0,0,339,0.0,2016-03-28,11609,...,1.633345,1.346979,1.479220,0.0,1,0.0,0.0,0.0,0,28
89274,HOBBIES_1_005_TX_2_validation,4,0,0,0,0,339,0.0,2016-03-28,11609,...,0.621456,0.542877,1.329992,0.0,1,0.0,0.0,0.0,0,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105085,HOBBIES_2_145_TX_2_validation,560,1,0,0,0,366,0.0,2016-04-24,11613,...,0.668675,2.301931,1.638289,0.0,1,0.0,0.0,0.0,1,24
105086,HOBBIES_2_146_TX_2_validation,561,1,0,0,0,366,2.0,2016-04-24,11613,...,0.556053,0.889431,1.215798,0.0,1,0.0,0.0,0.0,1,24
105087,HOBBIES_2_147_TX_2_validation,562,1,0,0,0,366,0.0,2016-04-24,11613,...,0.932183,6.228641,2.014144,0.0,1,0.0,0.0,0.0,1,24
105088,HOBBIES_2_148_TX_2_validation,563,1,0,0,0,366,0.0,2016-04-24,11613,...,0.402578,17.401083,4.280921,0.0,1,0.0,0.0,0.0,1,24


In [24]:
# compute turnover
df_merged.loc[:,'turnover'] = df_merged.loc[:,'sale'] * df_merged.loc[:,'sell_price']
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day,turnover
89270,HOBBIES_1_001_TX_2_validation,0,0,0,0,0,339,0.0,2016-03-28,11609,...,19.222143,3.823490,0.0,1,0.0,0.0,0.0,0,28,0.00
89271,HOBBIES_1_002_TX_2_validation,1,0,0,0,0,339,0.0,2016-03-28,11609,...,17.401083,4.280921,0.0,1,0.0,0.0,0.0,0,28,0.00
89272,HOBBIES_1_003_TX_2_validation,2,0,0,0,0,339,0.0,2016-03-28,11609,...,11.183432,3.659999,0.0,1,0.0,0.0,0.0,0,28,0.00
89273,HOBBIES_1_004_TX_2_validation,3,0,0,0,0,339,0.0,2016-03-28,11609,...,1.346979,1.479220,0.0,1,0.0,0.0,0.0,0,28,0.00
89274,HOBBIES_1_005_TX_2_validation,4,0,0,0,0,339,0.0,2016-03-28,11609,...,0.542877,1.329992,0.0,1,0.0,0.0,0.0,0,28,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105085,HOBBIES_2_145_TX_2_validation,560,1,0,0,0,366,0.0,2016-04-24,11613,...,2.301931,1.638289,0.0,1,0.0,0.0,0.0,1,24,0.00
105086,HOBBIES_2_146_TX_2_validation,561,1,0,0,0,366,2.0,2016-04-24,11613,...,0.889431,1.215798,0.0,1,0.0,0.0,0.0,1,24,3.94
105087,HOBBIES_2_147_TX_2_validation,562,1,0,0,0,366,0.0,2016-04-24,11613,...,6.228641,2.014144,0.0,1,0.0,0.0,0.0,1,24,0.00
105088,HOBBIES_2_148_TX_2_validation,563,1,0,0,0,366,0.0,2016-04-24,11613,...,17.401083,4.280921,0.0,1,0.0,0.0,0.0,1,24,0.00


In [25]:
# compute total turnover of that product during last 28d
turnover_last_28d = df_merged[['id', 'item_id','turnover']].groupby(['id']).sum()
turnover_last_28d = turnover_last_28d.reset_index()
turnover_last_28d['weights'] = turnover_last_28d.turnover / turnover_last_28d.turnover.sum()

turnover_last_28d

Unnamed: 0,id,item_id,turnover,weights
0,HOBBIES_1_001_TX_2_validation,0,66.08,0.001216
1,HOBBIES_1_002_TX_2_validation,28,23.82,0.000438
2,HOBBIES_1_003_TX_2_validation,56,8.91,0.000164
3,HOBBIES_1_004_TX_2_validation,84,162.40,0.002989
4,HOBBIES_1_005_TX_2_validation,112,24.57,0.000452
...,...,...,...,...
560,HOBBIES_2_145_TX_2_validation,15680,11.52,0.000212
561,HOBBIES_2_146_TX_2_validation,15708,15.76,0.000290
562,HOBBIES_2_147_TX_2_validation,15736,19.40,0.000357
563,HOBBIES_2_148_TX_2_validation,15764,0.00,0.000000


# Validate Results

- The correct weights are provided by the competition organisers, which allows us to test our method.
- The validation weights provided by the organisers can be downloaded here: https://github.com/Mcompetitions/M5-methods/blob/master/validation/weights_validation.csv

In [26]:
# import validation weights provided by competition organisator
val_weights = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/input/weights_validation.csv')
val_weights

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
0,Level1,Total,X,1.000000
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332
4,Level3,CA_1,X,0.110888
...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,0.000013
42836,Level12,HOUSEHOLD_2_516,TX_3,0.000008
42837,Level12,HOUSEHOLD_2_516,WI_1,0.000002
42838,Level12,HOUSEHOLD_2_516,WI_2,0.000002


In [27]:
# take only weights on bottom level (no aggregation)
val_weights = val_weights.loc[val_weights.Level_id == 'Level12']
val_weights

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
12350,Level12,FOODS_1_001,CA_1,1.970000e-05
12351,Level12,FOODS_1_001,CA_2,1.850000e-05
12352,Level12,FOODS_1_001,CA_3,1.430000e-05
12353,Level12,FOODS_1_001,CA_4,5.380000e-06
12354,Level12,FOODS_1_001,TX_1,5.980000e-07
...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,1.270000e-05
42836,Level12,HOUSEHOLD_2_516,TX_3,7.920000e-06
42837,Level12,HOUSEHOLD_2_516,WI_1,1.580000e-06
42838,Level12,HOUSEHOLD_2_516,WI_2,1.580000e-06


In [28]:
# compute difference between computed and validation weights
if full_dataset:
    assert len(turnover_last_28d.weights.values) == len(val_weights.Weight.values)
    
    # check if max difference isn't greater than rounding error
    assert np.max(diff) < 1e-5

    diff = turnover_last_28d.weights.values - val_weights.Weight.values
    diff

# Save Weights

In [29]:
# persist the weights only when they were computed on the full data set
if full_dataset:
    weights_out = turnover_last_28d.drop(columns=['turnover'])
    weights_out.to_csv(
        f'{utils.get_m5_root_dir()}/data/weights/weights_for_wrmsse.csv',
        index=False,
    )