# Define Running Mode

- 'full_dataset = True' to use the full data set. If 'full_dataset = False', a subset containing only one year of the hobbies sales in TX2 is used instead.
- 'sales_type' = 'evaluation' if we want to predict for the final M5 leaderboard, else 'validation' 

In [1]:
# Running-mode switches read by the cells below.
# True loads the 'full_dataset' CSV, False loads the 'subset' CSV.
full_dataset = True
# 'evaluation' or 'validation'; selects the input file, which rows are kept,
# and the name of the saved weights file.
sales_type = 'evaluation'

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
# Pick the file prefix that matches the running mode, then load the merged
# feature table for the selected sales type.
prefix = 'full_dataset' if full_dataset else 'subset'

merged_csv_path = (
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged_{sales_type}.csv')
df_merged = pd.read_csv(merged_csv_path)

# Compute Weights

In [4]:
# Drop days in the future: keep only the rows whose `data_type` matches the
# running mode ('validation' rows for the evaluation run, 'train' rows otherwise).
kept_data_type = 'validation' if sales_type == 'evaluation' else 'train'
df_merged = df_merged.loc[df_merged['data_type'] == kept_data_type]
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
45244213,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.033333,0.927858,0.246603,0.763245,1.0,0.0,0.0,0.05968164,0,25
45244214,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.333333,0.479463,-1.731692,0.744881,1.0,0.0,0.0,0.0,0,25
45244215,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.5,1.167077,16.58596,3.904594,0.0,0.0,0.0,0.0,0,25
45244216,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.966667,2.07586,-0.529831,0.790924,0.0,0.0,0.0,0.0,0,25
45244217,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1017,1.0,2016-04-25,11613,...,1.1,0.959526,-0.717982,0.542001,1.0,0.0,0.0,4.434852e-08,0,25
45244218,HOBBIES_1_006_CA_1_evaluation,1442,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.033333,1.376736,1.006636,1.380472,0.0,0.0,0.0,0.0,0,25
45244219,HOBBIES_1_007_CA_1_evaluation,1443,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.433333,0.626062,0.542877,1.171699,1.0,0.0,0.0,0.0,0,25
45244220,HOBBIES_1_008_CA_1_evaluation,1444,3,1,0,0,1017,19.0,2016-04-25,11613,...,4.733333,8.144951,2.202501,1.884402,0.0,0.0,0.0,6.377117e-09,0,25
45244221,HOBBIES_1_009_CA_1_evaluation,1445,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.3,1.149213,20.982733,4.618247,0.0,0.0,0.0,1.927735e-08,0,25
45244222,HOBBIES_1_010_CA_1_evaluation,1446,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.366667,0.718395,5.913438,2.297693,0.0,0.0,0.0,2.65265e-08,0,25


In [5]:
# Keep the last 28 rows of every product id.
# NOTE(review): this relies on the rows already being in chronological order
# within each id — confirm against the feature-engineering output.
rows_per_product = df_merged.groupby('id')
df_merged = rows_per_product.tail(28)
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
45244213,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.033333,0.927858,0.246603,0.763245,1.0,0.0,0.0,5.968164e-02,0,25
45244214,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.333333,0.479463,-1.731692,0.744881,1.0,0.0,0.0,0.000000e+00,0,25
45244215,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.500000,1.167077,16.585960,3.904594,0.0,0.0,0.0,0.000000e+00,0,25
45244216,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.966667,2.075860,-0.529831,0.790924,0.0,0.0,0.0,0.000000e+00,0,25
45244217,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1017,1.0,2016-04-25,11613,...,1.100000,0.959526,-0.717982,0.542001,1.0,0.0,0.0,4.434852e-08,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46097928,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2,1047,1.0,2016-05-22,11617,...,0.200000,0.550861,6.029183,2.758372,1.0,0.0,0.0,7.810822e-08,1,22
46097929,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2,1047,0.0,2016-05-22,11617,...,0.300000,0.534983,1.580669,1.621490,0.0,0.0,0.0,4.964984e-08,1,22
46097930,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2,1047,2.0,2016-05-22,11617,...,0.866667,1.136642,1.720479,1.485839,0.0,0.0,0.0,0.000000e+00,1,22
46097931,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2,1047,0.0,2016-05-22,11617,...,1.066667,1.172481,-0.140881,0.963120,3.0,0.0,0.0,0.000000e+00,1,22


In [6]:
# Turnover of each row: product of the `sale` and `sell_price` columns.
df_merged = df_merged.assign(turnover=df_merged['sale'] * df_merged['sell_price'])
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day,turnover
45244213,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.927858,0.246603,0.763245,1.0,0.0,0.0,5.968164e-02,0,25,0.00
45244214,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.479463,-1.731692,0.744881,1.0,0.0,0.0,0.000000e+00,0,25,0.00
45244215,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.167077,16.585960,3.904594,0.0,0.0,0.0,0.000000e+00,0,25,0.00
45244216,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1017,0.0,2016-04-25,11613,...,2.075860,-0.529831,0.790924,0.0,0.0,0.0,0.000000e+00,0,25,0.00
45244217,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1017,1.0,2016-04-25,11613,...,0.959526,-0.717982,0.542001,1.0,0.0,0.0,4.434852e-08,0,25,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46097928,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2,1047,1.0,2016-05-22,11617,...,0.550861,6.029183,2.758372,1.0,0.0,0.0,7.810822e-08,1,22,2.98
46097929,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2,1047,0.0,2016-05-22,11617,...,0.534983,1.580669,1.621490,0.0,0.0,0.0,4.964984e-08,1,22,0.00
46097930,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2,1047,2.0,2016-05-22,11617,...,1.136642,1.720479,1.485839,0.0,0.0,0.0,0.000000e+00,1,22,7.96
46097931,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2,1047,0.0,2016-05-22,11617,...,1.172481,-0.140881,0.963120,3.0,0.0,0.0,0.000000e+00,1,22,0.00


In [7]:
# Total turnover of each product over the retained rows, expressed as a
# share of the overall turnover; only `id` and `weights` are kept.
turnover_last_28d = (
    df_merged[['id', 'turnover']]
    .groupby(['id'])
    .sum()
    .reset_index()
    .assign(weights=lambda totals: totals.turnover / totals.turnover.sum())
    .drop(['turnover'], axis=1)
)

turnover_last_28d

Unnamed: 0,id,weights
0,FOODS_1_001_CA_1_evaluation,0.000013
1,FOODS_1_001_CA_2_evaluation,0.000010
2,FOODS_1_001_CA_3_evaluation,0.000016
3,FOODS_1_001_CA_4_evaluation,0.000003
4,FOODS_1_001_TX_1_evaluation,0.000015
...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,0.000006
30486,HOUSEHOLD_2_516_TX_3_evaluation,0.000012
30487,HOUSEHOLD_2_516_WI_1_evaluation,0.000008
30488,HOUSEHOLD_2_516_WI_2_evaluation,0.000003


# Validate Results

- The current weights are provided by the competition organizers so that we can test our method.
- The validation weights provided by the organizers can be downloaded here: https://github.com/Mcompetitions/M5-methods/blob/master/validation/weights_validation.csv

In [8]:
# Load the validation weights published by the competition organizers.
val_weights_path = f'{utils.get_m5_root_dir()}/data/input/weights_validation.csv'
val_weights = pd.read_csv(val_weights_path)
val_weights

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
0,Level1,Total,X,1.000000
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332
4,Level3,CA_1,X,0.110888
...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,0.000013
42836,Level12,HOUSEHOLD_2_516,TX_3,0.000008
42837,Level12,HOUSEHOLD_2_516,WI_1,0.000002
42838,Level12,HOUSEHOLD_2_516,WI_2,0.000002


In [9]:
# Keep only the bottom-level weights (Level12, no aggregation) and build the
# product id from the two aggregation-level columns.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below does not raise SettingWithCopyWarning.
val_weights = val_weights.loc[val_weights.Level_id == 'Level12'].copy()
val_weights['id'] = val_weights['Agg_Level_1'] + '_' + val_weights['Agg_Level_2'] + '_validation'
val_weights = val_weights.drop(['Agg_Level_1', 'Agg_Level_2'], axis=1)
val_weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Level_id,Weight,id
12350,Level12,1.970000e-05,FOODS_1_001_CA_1_validation
12351,Level12,1.850000e-05,FOODS_1_001_CA_2_validation
12352,Level12,1.430000e-05,FOODS_1_001_CA_3_validation
12353,Level12,5.380000e-06,FOODS_1_001_CA_4_validation
12354,Level12,5.980000e-07,FOODS_1_001_TX_1_validation
...,...,...,...
42835,Level12,1.270000e-05,HOUSEHOLD_2_516_TX_2_validation
42836,Level12,7.920000e-06,HOUSEHOLD_2_516_TX_3_validation
42837,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_1_validation
42838,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_2_validation


In [10]:
# Compare the computed weights with the organizers' validation weights.
# The check only runs for the full data set in 'validation' mode: the
# reference ids built above carry the '_validation' suffix.
if full_dataset and sales_type == 'validation':

    # inner merge on `id`; `merge` returns a new frame, so no copy is needed
    df = val_weights.merge(turnover_last_28d, on='id')

    # the merge must match exactly 30490 ids
    assert df.shape[0] == 30490, (
        f'expected 30490 matched ids, got {df.shape[0]}')

    # check that the largest deviation in EITHER direction stays within
    # rounding error; without abs() a computed weight that is far larger
    # than the reference (negative difference) would pass unnoticed
    max_abs_diff = np.max(np.abs(df.Weight - df.weights))
    assert max_abs_diff < 1e-5, (
        f'max absolute weight difference {max_abs_diff} exceeds 1e-5')


# Save Weights

In [11]:
# Persist the computed weights, but only for the full-data-set run.
if full_dataset:
    weights_csv_path = (
        f'{utils.get_m5_root_dir()}/data/weights/weights_for_wrmsse_{sales_type}.csv')
    turnover_last_28d.to_csv(weights_csv_path, index=False)