# Define Running Mode

- 'full_dataset = True' to use the full data set. If 'full_dataset = False', a subset containing only one year of the hobbies sales in TX2 is used instead.

In [10]:
# Run-mode switch: True selects the 'full_dataset' input file and enables saving
# the scaling factors at the end of the notebook; False selects the 'subset' file.
full_dataset: bool = True

# Import Packages

In [11]:
import sys
import time
from pathlib import Path

sys.path.append('..')  # must run before `import utils` so the project module is found

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

import utils

# Import Data Set

In [12]:
# Choose the feature-engineering file that matches the selected running mode.
prefix = 'full_dataset' if full_dataset else 'subset'

merged_csv_path = (
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged.csv'
)
df_merged = pd.read_csv(merged_csv_path)

In [13]:
# Rebind df_merged to the frame returned by the project helper; the recorded
# output below reports a 64% reduction in memory usage.
# NOTE(review): presumably this downcasts column dtypes — confirm in utils.
df_merged = utils.reduce_mem_usage(df_merged)

Mem. usage of decreased to 4176.43 Mb (64.0% reduction)


# Compute Weights

In [14]:
# Keep only the rows flagged as training data, i.e. drop days in the future.
df_merged = df_merged[df_merged['data_type'].eq('train')]
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_mean_t28,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.0,0.0,,,0.0,0.0,,,0,31
1,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,1520,0.0,2011-03-31,11109,...,3.333333,4.685512,4.28914,2.011404,0.0,0.0,0.0,0.0,0,31
2,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,1520,4.0,2011-03-31,11109,...,1.9,2.354013,2.22726,1.589597,0.0,0.0,0.0,0.1066537,0,31
3,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.166667,0.530669,8.178714,3.159005,0.0,0.0,0.0,0.0,0,31
4,HOBBIES_1_012_CA_1_validation,1448,3,1,0,0,1520,1.0,2011-03-31,11109,...,0.6,0.932183,0.869891,1.466822,0.0,0.0,0.0,0.1334127,0,31
5,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,1520,7.0,2011-03-31,11109,...,5.666667,7.284056,0.141198,1.189591,1.0,0.0,0.0,2.181386e-09,0,31
6,HOBBIES_1_016_CA_1_validation,1452,3,1,0,0,1520,0.0,2011-03-31,11109,...,7.033333,10.962265,5.062504,1.886527,6.0,0.0,0.0,2.181386e-09,0,31
7,HOBBIES_1_020_CA_1_validation,1456,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.266667,0.583292,3.233676,2.147972,0.0,0.0,0.0,0.05858439,0,31
8,HOBBIES_1_021_CA_1_validation,1457,3,1,0,0,1520,2.0,2011-03-31,11109,...,0.0,0.0,,,0.0,0.0,,,0,31
9,HOBBIES_1_022_CA_1_validation,1458,3,1,0,0,1520,0.0,2011-03-31,11109,...,0.833333,0.833908,-0.117911,0.714973,0.0,0.0,0.0,0.05520519,0,31


In [15]:
# Auxiliary variable: sale from the previous row of the same id.
# GroupBy.shift is the built-in equivalent of transform(lambda x: x.shift(1))
# and avoids calling a Python lambda once per id.
# NOTE(review): assumes the rows of each id are in chronological order — confirm upstream.
df_merged['sale_lag_1'] = df_merged.groupby('id')['sale'].shift(1)

# compute (y_t - y_{t-1})^2
df_merged['sale_squared_delta'] = (df_merged['sale'] - df_merged['sale_lag_1'])**2

# Drop rows without a squared delta (the first row of each id has no previous value).
df_merged = df_merged.loc[df_merged['sale_squared_delta'].notna()]

# Spot-check a single series.
df_merged.loc[
    df_merged['id'] == 'HOBBIES_1_001_TX_2_validation',
    ['id', 'date', 'sell_price', 'sale', 'sale_lag_1', 'sale_squared_delta'],
]

Unnamed: 0,id,date,sell_price,sale,sale_lag_1,sale_squared_delta
16437130,HOBBIES_1_001_TX_2_validation,2013-07-28,9.58,0.0,1.0,1.0
16461979,HOBBIES_1_001_TX_2_validation,2013-07-29,9.58,2.0,0.0,4.0
16486828,HOBBIES_1_001_TX_2_validation,2013-07-30,9.58,0.0,2.0,4.0
16511677,HOBBIES_1_001_TX_2_validation,2013-07-31,9.58,0.0,0.0,0.0
16536526,HOBBIES_1_001_TX_2_validation,2013-08-01,9.58,0.0,0.0,0.0
...,...,...,...,...,...,...
45107008,HOBBIES_1_001_TX_2_validation,2016-04-20,8.26,1.0,0.0,1.0
45137498,HOBBIES_1_001_TX_2_validation,2016-04-21,8.26,0.0,1.0,1.0
45167988,HOBBIES_1_001_TX_2_validation,2016-04-22,8.26,0.0,0.0,0.0
45198478,HOBBIES_1_001_TX_2_validation,2016-04-23,8.26,0.0,0.0,0.0


In [16]:
# Per-id sum of the squared day-to-day deltas together with the row count per id.
delta_stats = df_merged.groupby('id')['sale_squared_delta'].agg(['sum', 'size'])

# Scaling factor = summed squared deltas divided by the number of rows of the id.
scaling_factors = (
    (delta_stats['sum'] / delta_stats['size'])
    .rename('scaling_factor')
    .reset_index()
)

scaling_factors.head(10)

Unnamed: 0,id,scaling_factor
0,FOODS_1_001_CA_1_validation,2.882766
1,FOODS_1_001_CA_2_validation,5.563479
2,FOODS_1_001_CA_3_validation,10.384657
3,FOODS_1_001_CA_4_validation,1.011885
4,FOODS_1_001_TX_1_validation,3.491086
5,FOODS_1_001_TX_2_validation,2.284711
6,FOODS_1_001_TX_3_validation,1.300378
7,FOODS_1_001_WI_1_validation,1.473258
8,FOODS_1_001_WI_2_validation,1.470016
9,FOODS_1_001_WI_3_validation,0.731496


In [17]:
# scaling_factors was already computed from df_merged in the previous cell, so
# recomputing it here would only repeat the same groupby work. Validate the
# result instead before it is written to disk.
assert scaling_factors['id'].is_unique, 'expected exactly one scaling factor per id'
assert scaling_factors['scaling_factor'].ge(0).all(), \
    'scaling factors are means of squared deltas and must be non-negative'

scaling_factors.head(10)

Unnamed: 0,id,scaling_factor
0,FOODS_1_001_CA_1_validation,2.882766
1,FOODS_1_001_CA_2_validation,5.563479
2,FOODS_1_001_CA_3_validation,10.384657
3,FOODS_1_001_CA_4_validation,1.011885
4,FOODS_1_001_TX_1_validation,3.491086
5,FOODS_1_001_TX_2_validation,2.284711
6,FOODS_1_001_TX_3_validation,1.300378
7,FOODS_1_001_WI_1_validation,1.473258
8,FOODS_1_001_WI_2_validation,1.470016
9,FOODS_1_001_WI_3_validation,0.731496


# Save Weights

In [18]:
# Persist the weights only for a full-data run; subset runs write nothing.
if full_dataset:
    weights_dir = Path(f'{utils.get_m5_root_dir()}/data/weights')
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists instead of failing after the expensive computation.
    weights_dir.mkdir(parents=True, exist_ok=True)
    scaling_factors.to_csv(weights_dir / 'scaling_factors_rmsse.csv', index=False)