# Define Running Mode

- 'full_dataset = True' to use the full data set. If 'full_dataset = True', then a data set containing only data one year of the hobbies sales in TX2.
- 'sales_type' = 'evaluation' if we want to predict for the final M5 leaderboard, else 'validation' 

In [1]:
full_dataset = True
sales_type = 'evaluation'

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
if full_dataset:
    prefix = 'full_dataset'
else:
    prefix = 'subset'

df_merged = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged_{sales_type}.csv')

In [4]:
df_merged = utils.reduce_mem_usage(df_merged)

Mem. usage of decreased to 4176.43 Mb (64.0% reduction)


# Compute Weights

In [5]:
# drop days in future
if sales_type == 'evaluation':
    df_merged = df_merged.loc[df_merged['data_type'] == 'validation']

else:
    df_merged = df_merged.loc[df_merged['data_type'] == 'train']
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
45244213,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.033333,0.927858,0.246603,0.763245,1.0,0.0,0.0,0.05968164,0,25
45244214,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.333333,0.479463,-1.731692,0.74488,1.0,0.0,0.0,0.0,0,25
45244215,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.5,1.167077,16.58596,3.904594,0.0,0.0,0.0,0.0,0,25
45244216,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.966667,2.07586,-0.529831,0.790924,0.0,0.0,0.0,0.0,0,25
45244217,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1017,1.0,2016-04-25,11613,...,1.1,0.959526,-0.717982,0.542001,1.0,0.0,0.0,4.434852e-08,0,25
45244218,HOBBIES_1_006_CA_1_evaluation,1442,3,1,0,0,1017,0.0,2016-04-25,11613,...,1.033333,1.376736,1.006636,1.380472,0.0,0.0,0.0,0.0,0,25
45244219,HOBBIES_1_007_CA_1_evaluation,1443,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.433333,0.626062,0.542877,1.171699,1.0,0.0,0.0,0.0,0,25
45244220,HOBBIES_1_008_CA_1_evaluation,1444,3,1,0,0,1017,19.0,2016-04-25,11613,...,4.733333,8.144951,2.202501,1.884402,0.0,0.0,0.0,6.377117e-09,0,25
45244221,HOBBIES_1_009_CA_1_evaluation,1445,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.3,1.149213,20.982733,4.618247,0.0,0.0,0.0,1.927735e-08,0,25
45244222,HOBBIES_1_010_CA_1_evaluation,1446,3,1,0,0,1017,0.0,2016-04-25,11613,...,0.366667,0.718395,5.913438,2.297693,0.0,0.0,0.0,2.65265e-08,0,25


In [6]:
# auxiliary variable: sale from previous day
df_merged['sale_lag_1'] = df_merged.groupby(
    ['id'])['sale'].transform(lambda x: x.shift(1))

# compute (y_t - t_{t-1})^2
df_merged['sale_squared_delta'] = (df_merged['sale'] -
                                   df_merged['sale_lag_1'])**2

# drop day 1 as there is no value
df_merged = df_merged.loc[df_merged['sale_squared_delta'].notna()]

df_merged[[
    'id','date', 'sell_price', 'sale', 'sale_lag_1',
    'sale_squared_delta'
]].loc[df_merged.id =='HOBBIES_1_001_TX_2_validation']

Unnamed: 0,id,date,sell_price,sale,sale_lag_1,sale_squared_delta


In [7]:
scaling_factors = df_merged[['id','sale_squared_delta']].groupby(['id']).sum()
scaling_factors = scaling_factors.reset_index()

scaling_factors['days_in_sale_count'] = df_merged[['id', 'sale_squared_delta']].groupby(['id']).size().values

scaling_factors['scaling_factor'] = scaling_factors['sale_squared_delta'] / scaling_factors['days_in_sale_count']

scaling_factors = scaling_factors.drop(['days_in_sale_count', 'sale_squared_delta'], axis=1)

scaling_factors.head(10)

Unnamed: 0,id,scaling_factor
0,FOODS_1_001_CA_1_evaluation,3.925926
1,FOODS_1_001_CA_2_evaluation,2.148148
2,FOODS_1_001_CA_3_evaluation,5.148148
3,FOODS_1_001_CA_4_evaluation,0.37037
4,FOODS_1_001_TX_1_evaluation,2.518519
5,FOODS_1_001_TX_2_evaluation,0.481481
6,FOODS_1_001_TX_3_evaluation,1.703704
7,FOODS_1_001_WI_1_evaluation,0.333333
8,FOODS_1_001_WI_2_evaluation,3.074074
9,FOODS_1_001_WI_3_evaluation,0.407407


In [8]:
scaling_factors = df_merged[['id','sale_squared_delta']].groupby(['id']).sum()
scaling_factors = scaling_factors.reset_index()

scaling_factors['days_in_sale_count'] = df_merged[['id', 'sale_squared_delta']].groupby(['id']).size().values

scaling_factors['scaling_factor'] = scaling_factors['sale_squared_delta'] / scaling_factors['days_in_sale_count']

scaling_factors = scaling_factors.drop(['days_in_sale_count', 'sale_squared_delta'], axis=1)

scaling_factors.head(10)

Unnamed: 0,id,scaling_factor
0,FOODS_1_001_CA_1_evaluation,3.925926
1,FOODS_1_001_CA_2_evaluation,2.148148
2,FOODS_1_001_CA_3_evaluation,5.148148
3,FOODS_1_001_CA_4_evaluation,0.37037
4,FOODS_1_001_TX_1_evaluation,2.518519
5,FOODS_1_001_TX_2_evaluation,0.481481
6,FOODS_1_001_TX_3_evaluation,1.703704
7,FOODS_1_001_WI_1_evaluation,0.333333
8,FOODS_1_001_WI_2_evaluation,3.074074
9,FOODS_1_001_WI_3_evaluation,0.407407


# Save Weights

In [9]:
if full_dataset:
    scaling_factors.to_csv(
        f'{utils.get_m5_root_dir()}/data/weights/scaling_factors_rmsse_{sales_type}.csv', index=False)