# Define Running Mode

Set 'full_dataset = True' to use the full data set. If 'full_dataset = False', a subset is used instead, containing only one year of the hobbies sales in TX2.

In [1]:
# Run-mode switch: True selects the 'full_dataset' input file and enables
# saving the scaling factors at the end; False selects the 'subset' input file.
full_dataset = True

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
# Pick the file prefix that matches the configured running mode.
prefix = 'full_dataset' if full_dataset else 'subset'

# Read the merged table from the feature_engineering data folder.
merged_csv_path = (
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged.csv'
)
df_merged = pd.read_csv(merged_csv_path)

In [4]:
# Shrink the in-memory footprint of the frame with the project helper
# (presumably by downcasting column dtypes - confirm in utils.reduce_mem_usage).
df_merged = utils.reduce_mem_usage(df_merged)

Mem. usage of decreased to 4912.94 Mb (64.7% reduction)


# Compute Weights

In [5]:
# Discard the future days: only rows flagged as 'train' are kept.
is_train_row = df_merged['data_type'].eq('train')
df_merged = df_merged.loc[is_train_row]
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,912,0.0,2011-07-29,11126,...,0.345746,2.859231,2.272519,1.0,1,0.0,0.0,0.0,0,29
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,912,1.0,2011-07-29,11126,...,1.691425,2.492573,1.822482,3.0,1,0.0,0.0,0.0,0,29
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,912,0.0,2011-07-29,11126,...,1.597412,5.926538,2.229717,0.0,1,0.0,0.0,0.0,0,29
5,HOBBIES_1_006_CA_1_validation,1442,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
6,HOBBIES_1_007_CA_1_validation,1443,3,1,0,0,912,0.0,2011-07-29,11126,...,0.0,,,0.0,0,,,,0,29
7,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,912,0.0,2011-07-29,11126,...,8.177907,-0.281568,1.213215,0.0,1,0.0,0.0,8.986714e-09,0,29
8,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,912,1.0,2011-07-29,11126,...,2.155186,0.783767,1.156819,0.0,1,0.0,0.0,1.927735e-08,0,29
9,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,912,0.0,2011-07-29,11126,...,0.73968,-1.014088,0.4804,2.0,1,0.0,0.0,2.65265e-08,0,29


In [6]:
# Remove rows without a sell price, i.e. products that were not up for sale.
has_sell_price = df_merged['sell_price'].notna()
df_merged = df_merged.loc[has_sell_price]

preview_columns = ['id', 'date', 'sale', 'sell_price']
df_merged[preview_columns].head(10)

Unnamed: 0,id,date,sale,sell_price
1,HOBBIES_1_002_CA_1_validation,2011-07-29,0.0,3.97
3,HOBBIES_1_004_CA_1_validation,2011-07-29,1.0,4.34
4,HOBBIES_1_005_CA_1_validation,2011-07-29,0.0,2.98
7,HOBBIES_1_008_CA_1_validation,2011-07-29,0.0,0.5
8,HOBBIES_1_009_CA_1_validation,2011-07-29,1.0,1.77
9,HOBBIES_1_010_CA_1_validation,2011-07-29,0.0,2.97
11,HOBBIES_1_012_CA_1_validation,2011-07-29,1.0,6.27
14,HOBBIES_1_015_CA_1_validation,2011-07-29,1.0,0.68
15,HOBBIES_1_016_CA_1_validation,2011-07-29,10.0,0.68
19,HOBBIES_1_020_CA_1_validation,2011-07-29,0.0,10.98


In [7]:
# Auxiliary variable: sale of the previous remaining row of the same id.
# GroupBy.shift is the vectorised equivalent of transform(lambda x: x.shift(1))
# and avoids one Python-level call per id.
df_merged['sale_lag_1'] = df_merged.groupby('id')['sale'].shift(1)

# Compute (y_t - y_{t-1})^2.
# The difference is widened to float64 before squaring: if `sale` was stored in
# a narrow float type earlier (e.g. float16 - TODO confirm what
# utils.reduce_mem_usage does), squaring a large day-to-day jump could
# overflow to inf.
sale_delta = (df_merged['sale'] - df_merged['sale_lag_1']).astype('float64')
df_merged['sale_squared_delta'] = sale_delta ** 2

# Drop the first row of every id: it has no previous value, so its delta is NaN.
df_merged = df_merged.loc[df_merged['sale_squared_delta'].notna()]

# Spot-check a single series.
df_merged.loc[
    df_merged['id'] == 'HOBBIES_1_001_TX_2_validation',
    ['id', 'date', 'sell_price', 'sale', 'sale_lag_1', 'sale_squared_delta'],
]

Unnamed: 0,id,date,sell_price,sale,sale_lag_1,sale_squared_delta
22272945,HOBBIES_1_001_TX_2_validation,2013-07-28,9.58,0.0,1.0,1.0
22303435,HOBBIES_1_001_TX_2_validation,2013-07-29,9.58,2.0,0.0,4.0
22333925,HOBBIES_1_001_TX_2_validation,2013-07-30,9.58,0.0,2.0,4.0
22364415,HOBBIES_1_001_TX_2_validation,2013-07-31,9.58,0.0,0.0,0.0
22394905,HOBBIES_1_001_TX_2_validation,2013-08-01,9.58,0.0,0.0,0.0
...,...,...,...,...,...,...
52671475,HOBBIES_1_001_TX_2_validation,2016-04-20,8.26,1.0,0.0,1.0
52701965,HOBBIES_1_001_TX_2_validation,2016-04-21,8.26,0.0,1.0,1.0
52732455,HOBBIES_1_001_TX_2_validation,2016-04-22,8.26,0.0,0.0,0.0
52762945,HOBBIES_1_001_TX_2_validation,2016-04-23,8.26,0.0,0.0,0.0


In [8]:
# Aggregate per id in a single pass: the sum of squared day-to-day deltas and
# the number of rows that contributed to it. Taking both from the same groupby
# keeps them aligned by id label instead of relying on row position.
scaling_factors = (
    df_merged.groupby('id')['sale_squared_delta']
    .agg(['sum', 'size'])
    .rename(columns={'sum': 'sale_squared_delta', 'size': 'days_in_sale_count'})
    .reset_index()
)

# Scaling factor = mean squared day-to-day change in sales of the id.
scaling_factors['scaling_factor'] = (
    scaling_factors['sale_squared_delta'] / scaling_factors['days_in_sale_count']
)

scaling_factors.head(10)

Unnamed: 0,id,sale_squared_delta,days_in_sale_count,scaling_factor
0,FOODS_1_001_CA_1_validation,4786.0,1731,2.764876
1,FOODS_1_001_CA_2_validation,9154.0,1731,5.288273
2,FOODS_1_001_CA_3_validation,17453.0,1731,10.082611
3,FOODS_1_001_CA_4_validation,1691.0,1731,0.976892
4,FOODS_1_001_TX_1_validation,6284.0,1731,3.630272
5,FOODS_1_001_TX_2_validation,3896.0,1731,2.250722
6,FOODS_1_001_TX_3_validation,2170.0,1731,1.253611
7,FOODS_1_001_WI_1_validation,2504.0,1731,1.446563
8,FOODS_1_001_WI_2_validation,2275.0,1731,1.314269
9,FOODS_1_001_WI_3_validation,1090.0,1731,0.629694


# Save Weights

In [9]:
# Write the scaling factors to disk (full-data run only), leaving out the
# intermediate columns that were only needed to compute them.
if full_dataset:
    weights_csv_path = f'{utils.get_m5_root_dir()}/data/weights/scaling_factors_rmsse.csv'
    helper_columns = ['days_in_sale_count', 'sale_squared_delta']
    scaling_factors.drop(columns=helper_columns).to_csv(weights_csv_path, index=False)