# Define Running Mode

'full_dataset = True' to use the full data set. If 'full_dataset = True', then a data set containing only data one year of the hobbies sales in TX2.

In [1]:
full_dataset = False

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Import Data Set

In [3]:
if full_dataset:
    prefix = 'full_dataset'
else:
    prefix = 'subset'

df_merged = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged.csv')

In [4]:
df_merged = utils.reduce_mem_usage(df_merged)

Mem. usage of decreased to 11.07 Mb (64.7% reduction)


# Compute Weights

In [5]:
# drop days in future
df_merged = df_merged.loc[df_merged['data_type'] == 'train']
df_merged.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,rolling_std_t28,rolling_kurt_t28,rolling_skew_t28,lag_t28,up_for_sale,price_change_t1,price_change_t30,rolling_price_std_t28,is_weekend,day
0,HOBBIES_1_001_TX_2_validation,0,0,0,0,0,181,1.0,2015-10-22,11538,...,0.7701321,1.662343,1.33946,1.0,1,0.0,0.0,0.0,0,22
1,HOBBIES_1_002_TX_2_validation,1,0,0,0,0,181,0.0,2015-10-22,11538,...,0.6396838,3.169402,2.249556,2.0,1,0.0,0.0,0.0,0,22
2,HOBBIES_1_003_TX_2_validation,2,0,0,0,0,181,0.0,2015-10-22,11538,...,0.3457459,5.613785,2.272519,0.0,1,0.0,0.0,0.0,0,22
3,HOBBIES_1_004_TX_2_validation,3,0,0,0,0,181,0.0,2015-10-22,11538,...,0.0,,,0.0,1,0.0,0.0,0.0,0,22
4,HOBBIES_1_005_TX_2_validation,4,0,0,0,0,181,0.0,2015-10-22,11538,...,0.980265,0.566582,1.288937,0.0,1,0.0,0.0,0.0,0,22
5,HOBBIES_1_006_TX_2_validation,5,0,0,0,0,181,0.0,2015-10-22,11538,...,3.548984e-08,,,0.0,1,0.0,0.0,0.0,0,22
6,HOBBIES_1_007_TX_2_validation,6,0,0,0,0,181,0.0,2015-10-22,11538,...,0.0,,,0.0,1,0.0,0.0,0.0,0,22
7,HOBBIES_1_008_TX_2_validation,7,0,0,0,0,181,2.0,2015-10-22,11538,...,14.95879,1.513989,1.686714,4.0,1,0.0,0.0,0.0,0,22
8,HOBBIES_1_009_TX_2_validation,8,0,0,0,0,181,0.0,2015-10-22,11538,...,0.6789106,0.267606,1.320248,0.0,1,0.0,0.0,0.0,0,22
9,HOBBIES_1_010_TX_2_validation,9,0,0,0,0,181,0.0,2015-10-22,11538,...,1.09387,-0.006054,0.806169,1.0,1,0.0,0.0,0.0,0,22


In [6]:
# drop products that were not up for sale
df_merged = df_merged.loc[df_merged['sell_price'].notna()]
df_merged[['id','item_id', 'date', 'sale','sell_price']].head(10)

Unnamed: 0,id,item_id,date,sale,sell_price
0,HOBBIES_1_001_TX_2_validation,0,2015-10-22,1.0,8.26
1,HOBBIES_1_002_TX_2_validation,1,2015-10-22,0.0,3.97
2,HOBBIES_1_003_TX_2_validation,2,2015-10-22,0.0,2.97
3,HOBBIES_1_004_TX_2_validation,3,2015-10-22,0.0,4.64
4,HOBBIES_1_005_TX_2_validation,4,2015-10-22,0.0,2.73
5,HOBBIES_1_006_TX_2_validation,5,2015-10-22,0.0,0.96
6,HOBBIES_1_007_TX_2_validation,6,2015-10-22,0.0,7.88
7,HOBBIES_1_008_TX_2_validation,7,2015-10-22,2.0,0.48
8,HOBBIES_1_009_TX_2_validation,8,2015-10-22,0.0,1.63
9,HOBBIES_1_010_TX_2_validation,9,2015-10-22,0.0,2.97


In [11]:
# auxiliary variable: sale from previous day
df_merged['sale_lag_1'] = df_merged.groupby(
    ['id'])['sale'].transform(lambda x: x.shift(1))

# compute (y_t - t_{t-1})^2
df_merged['sale_squared_delta'] = (df_merged['sale'] -
                                   df_merged['sale_lag_1'])**2

# drop day 1 as there is no value
df_merged = df_merged.loc[df_merged['sale_squared_delta'].notna()]

df_merged[[
    'id', 'item_id','date', 'sell_price', 'sale', 'sale_lag_1',
    'sale_squared_delta'
]].loc[df_merged.id =='HOBBIES_1_001_TX_2_validation']

Unnamed: 0,id,item_id,date,sell_price,sale,sale_lag_1,sale_squared_delta
1695,HOBBIES_1_001_TX_2_validation,0,2015-10-25,8.26,0.0,0.0,0.0
2260,HOBBIES_1_001_TX_2_validation,0,2015-10-26,8.26,0.0,0.0,0.0
2825,HOBBIES_1_001_TX_2_validation,0,2015-10-27,8.26,0.0,0.0,0.0
3390,HOBBIES_1_001_TX_2_validation,0,2015-10-28,8.26,2.0,0.0,4.0
3955,HOBBIES_1_001_TX_2_validation,0,2015-10-29,8.26,1.0,2.0,1.0
...,...,...,...,...,...,...,...
102265,HOBBIES_1_001_TX_2_validation,0,2016-04-20,8.26,1.0,0.0,1.0
102830,HOBBIES_1_001_TX_2_validation,0,2016-04-21,8.26,0.0,1.0,1.0
103395,HOBBIES_1_001_TX_2_validation,0,2016-04-22,8.26,0.0,0.0,0.0
103960,HOBBIES_1_001_TX_2_validation,0,2016-04-23,8.26,0.0,0.0,0.0


In [15]:
scaling_factors = df_merged[['id', 'item_id','sale_squared_delta']].groupby(['id']).sum()
scaling_factors = scaling_factors.reset_index()
scaling_factors['item_id']

scaling_factors['days_in_sale_count'] = df_merged[['id', 'sale_squared_delta']].groupby(['id']).size().values

scaling_factors['scaling_factor'] = scaling_factors['sale_squared_delta'] / scaling_factors['days_in_sale_count']


scaling_factors.head(10)

Unnamed: 0,id,item_id,sale_squared_delta,days_in_sale_count,scaling_factor
0,HOBBIES_1_001_TX_2_validation,0.0,236.0,183,1.289617
1,HOBBIES_1_002_TX_2_validation,183.0,57.0,183,0.311475
2,HOBBIES_1_003_TX_2_validation,366.0,36.0,183,0.196721
3,HOBBIES_1_004_TX_2_validation,549.0,441.0,183,2.409836
4,HOBBIES_1_005_TX_2_validation,732.0,338.0,183,1.846995
5,HOBBIES_1_006_TX_2_validation,915.0,320.0,183,1.748634
6,HOBBIES_1_007_TX_2_validation,1098.0,56.0,183,0.306011
7,HOBBIES_1_008_TX_2_validation,1281.0,14366.0,183,78.502732
8,HOBBIES_1_009_TX_2_validation,1464.0,391.0,183,2.136612
9,HOBBIES_1_010_TX_2_validation,1647.0,345.0,183,1.885246


# Save Weights

In [16]:
if full_dataset:
    scaling_factors.drop(['days_in_sale_count', 'sale_squared_delta'], axis=1).to_csv(
        f'{utils.get_m5_root_dir()}/data/weights/scaling_factors_rmsse.csv', index=False)