# Import Packages

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import itertools
sns.set(style="ticks")
%config IPCompleter.greedy = True
from sklearn.preprocessing import LabelEncoder

import utils

# Full Dataset

## Import Data

In [2]:
# Here we predict data for the final leaderboard (M5 - Evaluation Phase)
sales_type='evaluation'

In [3]:
  df_calendar, df_sales, df_prices= utils.import_m5_data(reduce_memory=False, sales_type=sales_type)

## Transform Dataframe

In [4]:
df_sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [5]:
# Add placeholders as timeframe is different for df_sales
if sales_type == 'validation':
    for i in range(1914, 1970):
        df_sales[f'd_{i}'] = 'NA'
else:
    for i in range(1942, 1970):
        df_sales[f'd_{i}'] = 'NA'


In [6]:
# transform Dataframe
df = utils.transform_dataframe(df_sales, df_calendar, df_prices)

In [7]:
# Add column with type of data (training, validation & evaluation)

# 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913 ---> training
# 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public) ---> validation
# 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private) ---> evaluation


conditions = [
    (df['date'] <= '2016-04-24'),
    (df['date'] >= '2016-04-25') & (df['date'] <= '2016-05-22'),
    (df['date'] >= '2016-05-23')]
choices = ['train', 'validation', 'evaluation']
df['data_type'] = np.select(conditions, choices)



In [8]:
# reduce memory usage
df = utils.reduce_mem_usage(df)

Mem. usage of decreased to 8072.77 Mb (26.6% reduction)


In [9]:
df.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,data_type
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train


## Save DataFrame as CSV

In [10]:
df.to_csv(f'{utils.get_m5_root_dir()}/data/preprocessed/preprocessed_input_data_{sales_type}.csv', index=False)

# Create Small Subset

Load full dataset if needed

In [11]:
# df = pd.read_csv(f'{utils.get_m5_root_dir()}/data/preprocessed/preprocessed_input_data{sales_type}.csv')

Create Subset

In [12]:
# Subset including 1 year training date and validation/evaluation data of store Texas 2, category Hobbies 
store = 'TX_2'
cat = 'HOBBIES'

if sales_type == 'validation':
    df_subset = df.loc[df['store_id'] == store].loc[
        df['cat_id'] == cat].loc[df['date'] >= '2015-04-24'].loc[df['date'] <= '2016-05-22'] 
else:
    df_subset = df.loc[df['store_id'] == store].loc[
        df['cat_id'] == cat].loc[df['date'] >= '2015-04-24'] 
    
df_subset.reset_index(drop=True)
df_subset = utils.reduce_mem_usage(df_subset)
df_subset.to_csv(f'{utils.get_m5_root_dir()}/data/preprocessed/tx2_hobbies_1year_{sales_type}.csv', index=False)
df_subset


Mem. usage of decreased to 32.14 Mb (0.0% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,data_type
47152785,HOBBIES_1_001_TX_2_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_2,TX,d_1547,0,2015-04-24,11512,...,2015,,,,,0,0,0,8.26,train
47152786,HOBBIES_1_002_TX_2_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,TX_2,TX,d_1547,1,2015-04-24,11512,...,2015,,,,,0,0,0,3.97,train
47152787,HOBBIES_1_003_TX_2_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,TX_2,TX,d_1547,0,2015-04-24,11512,...,2015,,,,,0,0,0,2.97,train
47152788,HOBBIES_1_004_TX_2_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,TX_2,TX,d_1547,0,2015-04-24,11512,...,2015,,,,,0,0,0,4.64,train
47152789,HOBBIES_1_005_TX_2_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,TX_2,TX,d_1547,0,2015-04-24,11512,...,2015,,,,,0,0,0,2.73,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60020125,HOBBIES_2_145_TX_2_evaluation,HOBBIES_2_145,HOBBIES_2,HOBBIES,TX_2,TX,d_1969,,2016-06-19,11621,...,2016,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,2.88,evaluation
60020126,HOBBIES_2_146_TX_2_evaluation,HOBBIES_2_146,HOBBIES_2,HOBBIES,TX_2,TX,d_1969,,2016-06-19,11621,...,2016,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,1.97,evaluation
60020127,HOBBIES_2_147_TX_2_evaluation,HOBBIES_2_147,HOBBIES_2,HOBBIES,TX_2,TX,d_1969,,2016-06-19,11621,...,2016,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,0.97,evaluation
60020128,HOBBIES_2_148_TX_2_evaluation,HOBBIES_2_148,HOBBIES_2,HOBBIES,TX_2,TX,d_1969,,2016-06-19,11621,...,2016,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,0.88,evaluation
