# Define Running Mode

- 'full_dataset = True' to use the full data set. If 'full_dataset = True', then a data set containing only data one year of the hobbies sales in TX2.
- 'sales_type' = 'evaluation' if we want to predict for the final M5 leaderboard, else 'validation' 

In [1]:
full_dataset = True
sales_type = 'evaluation'

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
if full_dataset:
    prefix = 'full_dataset'
else:
    prefix = 'subset'

df_merged = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged_{sales_type}.csv')

# Compute Weights

In [4]:
# drop days in future
df_merged = df_merged.loc[df_merged['data_type'] == 'train']
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.0,0.0,,,0.0,0.0,,,0,31
1,HOBBIES_1_008_CA_1_evaluation,1444,3,1,0,0,1520,0.0,2011-03-31,11109,...,3.333333,4.685512,4.28914,2.011404,0.0,0.0,0.0,0.0,0,31
2,HOBBIES_1_009_CA_1_evaluation,1445,3,1,0,0,1520,4.0,2011-03-31,11109,...,1.9,2.354013,2.22726,1.589597,0.0,0.0,0.0,0.1066537,0,31
3,HOBBIES_1_010_CA_1_evaluation,1446,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.166667,0.530669,8.178714,3.159005,0.0,0.0,0.0,0.0,0,31
4,HOBBIES_1_012_CA_1_evaluation,1448,3,1,0,0,1520,1.0,2011-03-31,11109,...,0.6,0.932183,0.869891,1.466822,0.0,0.0,0.0,0.1334127,0,31
5,HOBBIES_1_015_CA_1_evaluation,1451,3,1,0,0,1520,7.0,2011-03-31,11109,...,5.666666,7.284056,0.141198,1.189591,1.0,0.0,0.0,2.181385e-09,0,31
6,HOBBIES_1_016_CA_1_evaluation,1452,3,1,0,0,1520,0.0,2011-03-31,11109,...,7.033333,10.962265,5.062504,1.886527,6.0,0.0,0.0,2.181385e-09,0,31
7,HOBBIES_1_020_CA_1_evaluation,1456,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.266667,0.583292,3.233676,2.147972,0.0,0.0,0.0,0.05858439,0,31
8,HOBBIES_1_021_CA_1_evaluation,1457,3,1,0,0,1520,2.0,2011-03-31,11109,...,0.0,0.0,,,0.0,0.0,,,0,31
9,HOBBIES_1_022_CA_1_evaluation,1458,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.833333,0.833908,-0.117911,0.714973,0.0,0.0,0.0,0.05520519,0,31


In [5]:
# drop products that were not up for sale
df_merged = df_merged.loc[df_merged['sell_price'].notna()]

In [6]:
# take 28 last observations for each product
df_merged = df_merged.groupby(['id']).tail(28)
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
44390493,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,985,1.0,2016-03-28,11609,...,1.000000,1.231764e+00,1.165137,1.423363,0.0,0.0,0.0,8.622286e-08,0,28
44390494,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,985,1.0,2016-03-28,11609,...,0.166667,3.790490e-01,2.859231,1.884415,0.0,0.0,0.0,0.000000e+00,0,28
44390495,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,985,0.0,2016-03-28,11609,...,0.300000,4.660916e-01,-1.455547,0.919500,0.0,0.0,0.0,0.000000e+00,0,28
44390496,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,985,0.0,2016-03-28,11609,...,2.200000,1.954658e+00,-0.455635,0.679141,0.0,0.0,0.0,0.000000e+00,0,28
44390497,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,985,1.0,2016-03-28,11609,...,1.133333,1.306043e+00,1.330443,1.329091,1.0,0.0,0.0,4.434852e-08,0,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45244208,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2,1016,1.0,2016-04-24,11613,...,0.266667,5.832923e-01,4.439427,2.147972,0.0,0.0,0.0,7.810822e-08,1,24
45244209,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2,1016,0.0,2016-04-24,11613,...,0.000000,4.427322e-08,,,0.0,0.0,0.0,1.872101e-01,1,24
45244210,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2,1016,0.0,2016-04-24,11613,...,1.233333,9.352607e-01,-0.928225,0.038939,1.0,0.0,0.0,0.000000e+00,1,24
45244211,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2,1016,3.0,2016-04-24,11613,...,1.166667,1.085431e+00,0.021799,0.686957,4.0,0.0,0.0,0.000000e+00,1,24


In [7]:
# compute turnover
df_merged.loc[:,'turnover'] = df_merged.loc[:,'sale'] * df_merged.loc[:,'sell_price']
df_merged

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day,turnover
44390493,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,985,1.0,2016-03-28,11609,...,1.231764e+00,1.165137,1.423363,0.0,0.0,0.0,8.622286e-08,0,28,8.26
44390494,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,985,1.0,2016-03-28,11609,...,3.790490e-01,2.859231,1.884415,0.0,0.0,0.0,0.000000e+00,0,28,3.97
44390495,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,985,0.0,2016-03-28,11609,...,4.660916e-01,-1.455547,0.919500,0.0,0.0,0.0,0.000000e+00,0,28,0.00
44390496,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,985,0.0,2016-03-28,11609,...,1.954658e+00,-0.455635,0.679141,0.0,0.0,0.0,0.000000e+00,0,28,0.00
44390497,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,985,1.0,2016-03-28,11609,...,1.306043e+00,1.330443,1.329091,1.0,0.0,0.0,4.434852e-08,0,28,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45244208,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2,1016,1.0,2016-04-24,11613,...,5.832923e-01,4.439427,2.147972,0.0,0.0,0.0,7.810822e-08,1,24,2.98
45244209,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2,1016,0.0,2016-04-24,11613,...,4.427322e-08,,,0.0,0.0,0.0,1.872101e-01,1,24,0.00
45244210,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2,1016,0.0,2016-04-24,11613,...,9.352607e-01,-0.928225,0.038939,1.0,0.0,0.0,0.000000e+00,1,24,0.00
45244211,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2,1016,3.0,2016-04-24,11613,...,1.085431e+00,0.021799,0.686957,4.0,0.0,0.0,0.000000e+00,1,24,3.84


In [8]:
# compute total turnover of that product during last 28d
turnover_last_28d = df_merged[['id','turnover']].groupby(['id']).sum()
turnover_last_28d = turnover_last_28d.reset_index()
turnover_last_28d['weights'] = turnover_last_28d.turnover / turnover_last_28d.turnover.sum()

turnover_last_28d = turnover_last_28d.drop(['turnover'], axis=1)

turnover_last_28d

Unnamed: 0,id,weights
0,FOODS_1_001_CA_1_evaluation,1.972157e-05
1,FOODS_1_001_CA_2_evaluation,1.852632e-05
2,FOODS_1_001_CA_3_evaluation,1.434296e-05
3,FOODS_1_001_CA_4_evaluation,5.378610e-06
4,FOODS_1_001_TX_1_evaluation,5.976234e-07
...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,1.267815e-05
30486,HOUSEHOLD_2_516_TX_3_evaluation,7.923846e-06
30487,HOUSEHOLD_2_516_WI_1_evaluation,1.584769e-06
30488,HOUSEHOLD_2_516_WI_2_evaluation,1.584769e-06


# Validate Results

- The currents weights are provided by the competition organisation in order to test our method.
- The validation_weights provided by the organisator can be downloaded here: https://github.com/Mcompetitions/M5-methods/blob/master/validation/weights_validation.csv

In [9]:
# import validation weights provided by competition organisator
val_weights = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/input/weights_validation.csv')
val_weights

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
0,Level1,Total,X,1.000000
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332
4,Level3,CA_1,X,0.110888
...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,0.000013
42836,Level12,HOUSEHOLD_2_516,TX_3,0.000008
42837,Level12,HOUSEHOLD_2_516,WI_1,0.000002
42838,Level12,HOUSEHOLD_2_516,WI_2,0.000002


In [10]:
# take only weights on bottom level (no aggregation) and merge AggLevels
val_weights = val_weights.loc[val_weights.Level_id == 'Level12']
val_weights['id'] = val_weights['Agg_Level_1']+'_'+val_weights['Agg_Level_2']+'_validation'
val_weights = val_weights.drop(['Agg_Level_1', 'Agg_Level_2'], axis=1)
val_weights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Level_id,Weight,id
12350,Level12,1.970000e-05,FOODS_1_001_CA_1_validation
12351,Level12,1.850000e-05,FOODS_1_001_CA_2_validation
12352,Level12,1.430000e-05,FOODS_1_001_CA_3_validation
12353,Level12,5.380000e-06,FOODS_1_001_CA_4_validation
12354,Level12,5.980000e-07,FOODS_1_001_TX_1_validation
...,...,...,...
42835,Level12,1.270000e-05,HOUSEHOLD_2_516_TX_2_validation
42836,Level12,7.920000e-06,HOUSEHOLD_2_516_TX_3_validation
42837,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_1_validation
42838,Level12,1.580000e-06,HOUSEHOLD_2_516_WI_2_validation


In [11]:
# compute difference between computed and validation weights
if full_dataset and sales_type =='validation':
    
    df = val_weights.copy()
    df = df.merge(turnover_last_28d, on='id')

    assert df.shape[0] == 30490 
    
    # check if max difference isn't greater than rounding error
    assert np.max(df.Weight - df.weights) < 1e-5


# Save Weights

In [12]:
if full_dataset:
    turnover_last_28d.to_csv(
        f'{utils.get_m5_root_dir()}/data/weights/weights_for_wrmsse_{sales_type}.csv', index=False)