# Define Running Mode

- 'full_dataset = True' to use the full data set. If 'full_dataset = False', a reduced data set containing only one year of the hobbies sales in TX2 is used instead. 
- 'save_results = True' to save the dataframe in m5_challenge\data\feature_engineering\
- 'sales_type' = 'evaluation' if we want to predict for the final M5 leaderboard, else 'validation' 


In [1]:
# True: read the full preprocessed file; False: read the 'tx2_hobbies_1year' file
full_dataset = True
# True: write the engineered frame to CSV in the last cell of the notebook
save_results = True

# Used as a suffix in the input and output file names
sales_type = 'evaluation'

# Import Packages

In [2]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Data Import and Preparation

In [3]:
# Import data: the sample submission and the preprocessed sales frame
submission = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/input/sample_submission.csv')

# Pick the input file: the complete preprocessed data when full_dataset is
# set, otherwise the 'tx2_hobbies_1year' file.
if full_dataset:
    preprocessed_file = f'preprocessed_input_data_{sales_type}.csv'
else:
    preprocessed_file = f'tx2_hobbies_1year_{sales_type}.csv'

df_merged = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/preprocessed/{preprocessed_file}')

# keep only the rows whose data_type is not 'evaluation'
df_merged = df_merged.loc[df_merged['data_type'] != 'evaluation']

# print top and bottom lines
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
pd.concat([df_merged.head(5), df_merged.tail(5)])

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,data_type
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1.0,2016-05-22,11617,...,2016,,,,,0,0,0,2.98,validation
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0.0,2016-05-22,11617,...,2016,,,,,0,0,0,2.48,validation
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2.0,2016-05-22,11617,...,2016,,,,,0,0,0,3.98,validation
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0.0,2016-05-22,11617,...,2016,,,,,0,0,0,1.28,validation
59181089,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1941,1.0,2016-05-22,11617,...,2016,,,,,0,0,0,1.0,validation


In [4]:
# downcast numerical values to reduce mem usage
# (project helper from utils; its return value replaces df_merged)
df_merged = utils.reduce_mem_usage(df_merged)

Mem. usage of decreased to 7732.21 Mb (28.6% reduction)


In [5]:
# Categorical features are converted to integers, as the categorical values
# cause problems when using large datasets.
categorical_columns = [
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    'd',
]
df_merged = utils.encode_categorical(df_merged, categorical_columns)

df_merged.head(5)

Mem. usage of decreased to 3894.32 Mb (28.9% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,data_type
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,0,0.0,2011-01-29,11101,...,2011,13,1,3,1,0,0,0,,train
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,0,0.0,2011-01-29,11101,...,2011,13,1,3,1,0,0,0,,train
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,0,0.0,2011-01-29,11101,...,2011,13,1,3,1,0,0,0,,train
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,0,0.0,2011-01-29,11101,...,2011,13,1,3,1,0,0,0,,train
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,0,0.0,2011-01-29,11101,...,2011,13,1,3,1,0,0,0,,train


# Feature Engineering

In [6]:
# sales data

# Every feature below is computed per series (grouped by 'id') on the sales
# shifted by 28 rows, so each row only uses sales from at least 28 rows earlier.
sales_by_id = df_merged.groupby(['id'])['sale']

# rolling mean and rolling std over a 30-row window of the shifted sales
df_merged['rolling_mean_t28'] = sales_by_id.transform(
    lambda sales: sales.shift(28).rolling(30).mean())

df_merged['rolling_std_t28'] = sales_by_id.transform(
    lambda sales: sales.shift(28).rolling(30).std())

# NOTE(review): kurtosis uses a 28-row window while the other rolling
# statistics use 30 rows -- confirm this difference is intended.
df_merged['rolling_kurt_t28'] = sales_by_id.transform(
    lambda sales: sales.shift(28).rolling(28).kurt())

df_merged['rolling_skew_t28'] = sales_by_id.transform(
    lambda sales: sales.shift(28).rolling(30).skew())

# plain 28-row lag of the sales
df_merged['lag_t28'] = sales_by_id.transform(lambda sales: sales.shift(28))


In [7]:
# price data
price_by_id = df_merged.groupby(['id'])['sell_price']

# Helper series that are only needed to derive the price-change features;
# they are kept as locals rather than stored as columns of df_merged.
previous_price = price_by_id.transform(lambda price: price.shift(1))
max_price_last_30 = price_by_id.transform(
    lambda price: price.shift(1).rolling(30).max())

# relative change of the current price versus the previous row's price
df_merged['price_change_t1'] = (
    (previous_price - df_merged['sell_price']) / previous_price)

# relative change of the current price versus the maximum of the 30 preceding rows
df_merged['price_change_t30'] = (
    (max_price_last_30 - df_merged['sell_price']) / max_price_last_30)

# rolling standard deviation of the (unshifted) price over 28 rows
df_merged['rolling_price_std_t28'] = price_by_id.transform(
    lambda price: price.rolling(28).std())

In [8]:
# date data

# Saturday: wday = 1, Sunday: wday = 2
df_merged["is_weekend"] = df_merged["wday"].isin([1, 2]).astype(np.int8)

# Replace the whole column instead of writing through `.loc[:, 'date']`.
# Since pandas 2.0 a `.loc[:, col] = values` assignment writes into the
# existing column and keeps its dtype, so the column would stay `object`
# and the `.dt` accessor below would raise.
df_merged['date'] = pd.to_datetime(df_merged['date'])

# day of the month
df_merged['day'] = df_merged['date'].dt.day.astype(np.int8)

In [9]:
# The first 60 days are dropped since they have missing values caused by the
# feature engineering. The cutoff is the earliest date plus 60 days, kept as
# a 'YYYY-MM-DD' string.
first_date = df_merged['date'].min()
date_after_60_training_days = str(
    (first_date + pd.Timedelta(days=60)).date())

is_after_cutoff = df_merged['date'] > date_after_60_training_days
df_merged = df_merged[is_after_cutoff]

In [10]:
# Drop rows where the product was not up for sale. A product counts as up for
# sale when its sell_price is present (not NaN).
# A boolean mask is used instead of dropping by index label: it does not
# materialise the rows to be removed first, and it stays correct even if
# index labels are not unique.
df_merged = df_merged[df_merged['sell_price'].notna()]

# Save Datasets

In [11]:
if save_results:
    # The file name prefix records which input the features were built from.
    prefix = 'full_dataset' if full_dataset else 'subset'

    output_path = f'{utils.get_m5_root_dir()}/data/feature_engineering/{prefix}_df_merged_{sales_type}.csv'

    # index=False: the DataFrame index is not written to the file
    df_merged.to_csv(output_path, index=False)

 