# Define Running Mode

'full_dataset = True' to use the full data set. If 'full_dataset = True', then a data set containing only data one year of the hobbies sales in TX2.

In [1]:
full_dataset = True

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
if full_dataset:
    prefix = 'full_dataset'
else:
    prefix = 'subset'

df_merged = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged.csv')

# Compute Weights

In [4]:
# drop days in future
df_merged = df_merged.loc[df_merged['data_type'] == 'train']
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,912,0.0,2011-07-29,11126,...,0.345746,2.859231,2.272519,1.0,1,0.0,0.0,0.0,0,29
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,912,1.0,2011-07-29,11126,...,1.691425,2.492573,1.822482,3.0,1,0.0,0.0,0.0,0,29
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,912,0.0,2011-07-29,11126,...,1.597412,5.926539,2.229717,0.0,1,0.0,0.0,0.0,0,29
5,HOBBIES_1_006_CA_1_validation,1442,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
6,HOBBIES_1_007_CA_1_validation,1443,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
7,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,912,0.0,2011-07-29,11126,...,8.177907,-0.281568,1.213215,0.0,1,0.0,0.0,8.986714e-09,0,29
8,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,912,1.0,2011-07-29,11126,...,2.155186,0.783767,1.156818,0.0,1,0.0,0.0,1.927735e-08,0,29
9,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,912,0.0,2011-07-29,11126,...,0.73968,-1.014088,0.4804,2.0,1,0.0,0.0,2.65265e-08,0,29


In [5]:
# drop products that were not up for sale
df_merged = df_merged.loc[df_merged['sell_price'].notna()]

In [6]:
# take 28 last observations for each product
df_merged = df_merged.groupby(['id']).tail(28)
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
51954960,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,985,1.0,2016-03-28,11609,...,1.231764e+00,1.165137,1.423363,0.0,1,0.0,0.0,8.622286e-08,0,28
51954961,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,985,1.0,2016-03-28,11609,...,3.790490e-01,2.859231,1.884415,0.0,1,0.0,0.0,0.000000e+00,0,28
51954962,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,985,0.0,2016-03-28,11609,...,4.660916e-01,-1.455547,0.919500,0.0,1,0.0,0.0,0.000000e+00,0,28
51954963,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,985,0.0,2016-03-28,11609,...,1.954658e+00,-0.455635,0.679141,0.0,1,0.0,0.0,0.000000e+00,0,28
51954964,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,985,1.0,2016-03-28,11609,...,1.306043e+00,1.330443,1.329091,1.0,1,0.0,0.0,4.434852e-08,0,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52808675,FOODS_3_823_WI_3_validation,1432,2,0,9,2,1016,1.0,2016-04-24,11613,...,5.832923e-01,4.439427,2.147972,0.0,1,0.0,0.0,7.810822e-08,1,24
52808676,FOODS_3_824_WI_3_validation,1433,2,0,9,2,1016,0.0,2016-04-24,11613,...,4.427322e-08,,,0.0,1,0.0,0.0,1.872101e-01,1,24
52808677,FOODS_3_825_WI_3_validation,1434,2,0,9,2,1016,0.0,2016-04-24,11613,...,9.352607e-01,-0.928225,0.038939,1.0,1,0.0,0.0,0.000000e+00,1,24
52808678,FOODS_3_826_WI_3_validation,1435,2,0,9,2,1016,3.0,2016-04-24,11613,...,1.085431e+00,0.021799,0.686957,4.0,1,0.0,0.0,0.000000e+00,1,24


In [7]:
# compute turnover
df_merged.loc[:,'turnover'] = df_merged.loc[:,'sale'] * df_merged.loc[:,'sell_price']
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day,turnover
51954960,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,985,1.0,2016-03-28,11609,...,1.165137,1.423363,0.0,1,0.0,0.0,8.622286e-08,0,28,8.26
51954961,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,985,1.0,2016-03-28,11609,...,2.859231,1.884415,0.0,1,0.0,0.0,0.000000e+00,0,28,3.97
51954962,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,985,0.0,2016-03-28,11609,...,-1.455547,0.919500,0.0,1,0.0,0.0,0.000000e+00,0,28,0.00
51954963,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,985,0.0,2016-03-28,11609,...,-0.455635,0.679141,0.0,1,0.0,0.0,0.000000e+00,0,28,0.00
51954964,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,985,1.0,2016-03-28,11609,...,1.330443,1.329091,1.0,1,0.0,0.0,4.434852e-08,0,28,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52808675,FOODS_3_823_WI_3_validation,1432,2,0,9,2,1016,1.0,2016-04-24,11613,...,4.439427,2.147972,0.0,1,0.0,0.0,7.810822e-08,1,24,2.98
52808676,FOODS_3_824_WI_3_validation,1433,2,0,9,2,1016,0.0,2016-04-24,11613,...,,,0.0,1,0.0,0.0,1.872101e-01,1,24,0.00
52808677,FOODS_3_825_WI_3_validation,1434,2,0,9,2,1016,0.0,2016-04-24,11613,...,-0.928225,0.038939,1.0,1,0.0,0.0,0.000000e+00,1,24,0.00
52808678,FOODS_3_826_WI_3_validation,1435,2,0,9,2,1016,3.0,2016-04-24,11613,...,0.021799,0.686957,4.0,1,0.0,0.0,0.000000e+00,1,24,3.84


In [8]:
# compute total turnover of that product during last 28d
turnover_last_28d = df_merged[['id','turnover']].groupby(['id']).sum()
turnover_last_28d = turnover_last_28d.reset_index()
turnover_last_28d['weights'] = turnover_last_28d.turnover / turnover_last_28d.turnover.sum()

turnover_last_28d = turnover_last_28d.drop(['turnover'], axis=1)

turnover_last_28d = df_merged[['id','item_id']].drop_duplicates().merge(turnover_last_28d)


turnover_last_28d

Unnamed: 0,id,item_id,weights
0,HOBBIES_1_001_CA_1_validation,1437,0.000060
1,HOBBIES_1_002_CA_1_validation,1438,0.000002
2,HOBBIES_1_003_CA_1_validation,1439,0.000013
3,HOBBIES_1_004_CA_1_validation,1440,0.000063
4,HOBBIES_1_005_CA_1_validation,1441,0.000029
...,...,...,...
30485,FOODS_3_823_WI_3_validation,1432,0.000005
30486,FOODS_3_824_WI_3_validation,1433,0.000006
30487,FOODS_3_825_WI_3_validation,1434,0.000027
30488,FOODS_3_826_WI_3_validation,1435,0.000009


# Validate Results

- The currents weights are provided by the competition organisation in order to test our method.
- The validation_weights provided by the organisator can be downloaded here: https://github.com/Mcompetitions/M5-methods/blob/master/validation/weights_validation.csv

In [9]:
# import validation weights provided by competition organisator
val_weights = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/input/weights_validation.csv')
val_weights

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
0,Level1,Total,X,1.000000
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332
4,Level3,CA_1,X,0.110888
...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,0.000013
42836,Level12,HOUSEHOLD_2_516,TX_3,0.000008
42837,Level12,HOUSEHOLD_2_516,WI_1,0.000002
42838,Level12,HOUSEHOLD_2_516,WI_2,0.000002


In [10]:
# take only weights on bottom level (no aggregation) and merge AggLevels
val_weights = val_weights.loc[val_weights.Level_id == 'Level12']
val_weights['id'] = val_weights['Agg_Level_1']+'_'+val_weights['Agg_Level_2']+'_validation'
val_weights = val_weights.drop(['Agg_Level_1', 'Agg_Level_2'], axis=1)
val_weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Level_id,Weight,id
12350,Level12,1.970000e-05,FOODS_1_001_CA_1_validation
12351,Level12,1.850000e-05,FOODS_1_001_CA_2_validation
12352,Level12,1.430000e-05,FOODS_1_001_CA_3_validation
12353,Level12,5.380000e-06,FOODS_1_001_CA_4_validation
12354,Level12,5.980000e-07,FOODS_1_001_TX_1_validation
...,...,...,...
42835,Level12,1.270000e-05,HOUSEHOLD_2_516_TX_2_validation
42836,Level12,7.920000e-06,HOUSEHOLD_2_516_TX_3_validation
42837,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_1_validation
42838,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_2_validation


In [11]:
# compute difference between computed and validation weights
if full_dataset:
    
    df = val_weights.copy()
    df = df.merge(turnover_last_28d, on='id')

    assert df.shape[0] == 30490 
    
    # check if max difference isn't greater than rounding error
    assert np.max(df.Weight - df.weights) < 1e-5


# Save Weights

In [12]:
if full_dataset:
    turnover_last_28d.to_csv(
        f'{utils.get_m5_root_dir()}/data/weights/weights_for_wrmsse.csv', index=False)