# Bakery Dataset

In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

from utilities import add_lag_features, day_to_string, month_to_string

In [9]:
# Load the raw bakery CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("raw/Bakery_dataset.csv")

In [10]:
# Preview the raw frame (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0.1,Unnamed: 0,date_short,shop_no,product_no,demand,is_schoolholiday,is_holiday,is_holiday_next2days,rain_mm,temp_avg_celsius,temp_max,temp_min,promotion_currentweek,promotion_lastweek,weekday,month
0,0,2016-01-02,2,101,254.0,True,False,False,11.9,2.1,2.8,1.4,0,0,5,1
1,1,2016-01-03,2,101,538.0,True,False,False,4.1,2.6,4.8,1.5,0,0,6,1
2,2,2016-01-04,2,101,132.0,True,False,True,7.9,3.2,7.0,0.3,0,0,0,1
3,3,2016-01-05,2,101,154.0,True,False,True,3.5,3.1,5.8,-0.1,0,0,1,1
4,4,2016-01-06,2,101,331.0,True,True,False,0.1,4.1,7.1,1.5,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127570,382720,2019-04-26,71,110,40.0,True,False,False,4.9,8.0,9.9,4.5,0,0,4,4
127571,382721,2019-04-27,71,110,59.0,True,False,False,6.1,7.8,12.7,4.3,0,0,5,4
127572,382722,2019-04-28,71,110,80.0,False,False,False,1.0,6.5,12.1,0.7,0,0,6,4
127573,382723,2019-04-29,71,110,65.0,False,False,True,9.1,6.5,12.0,-0.4,0,0,0,4


In [11]:
# Discard the "Unnamed: 0" column (presumably a row index saved with the CSV --
# verify) and the min/max temperature readings; temp_avg_celsius is kept.
unused_columns = ["Unnamed: 0", "temp_min", "temp_max"]
data = data.drop(columns=unused_columns)

In [12]:
# Run the numeric month / weekday codes through the project helpers
# (presumably they return human-readable names -- confirm in utilities).
data = data.assign(
    month=data["month"].apply(month_to_string),
    weekday=data["weekday"].apply(day_to_string),
)

In [13]:
# Switch to shorter, generic column names for the rest of the notebook.
data = data.rename(
    columns={
        "date_short": "date",
        "shop_no": "store",
        "product_no": "item",
        "temp_avg_celsius": "temperature",
        "rain_mm": "rain",
    }
)

In [15]:
# Target: the demand column as a single-column frame.
y = data["demand"].to_frame()
# Features: every remaining column (date is still present at this point).
X = data.drop(columns="demand")

## Add lag features

In [22]:
# Start from tsfresh's minimal feature settings and remove the 'length' entry
# (presumably because it adds nothing for fixed-size windows -- confirm).
fc_parameters = MinimalFCParameters()
fc_parameters.pop("length")
# Display the remaining feature calculators.
fc_parameters

{'sum_values': None, 'median': None, 'mean': None, 'standard_deviation': None, 'variance': None, 'root_mean_square': None, 'maximum': None, 'absolute_maximum': None, 'minimum': None}

In [23]:
# Build lag features through the project helper, identifying each series by
# (item, store) and ordering it by date, with the tsfresh settings chosen above.
# NOTE(review): the exact meaning of each (7, 7) / (14, 14) / (28, 28) tuple is
# defined in utilities.add_lag_features -- confirm there. Both X and y are
# rebound to whatever the helper returns.
X, y = add_lag_features(
    X,
    y,
    column_id=["item", "store"],
    column_sort="date",
    feature_dict=fc_parameters,
    time_windows=[(7, 7), (14, 14), (28, 28)],
)

Rolling: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]
Feature Extraction: 100%|██████████| 20/20 [00:21<00:00,  1.06s/it]
Rolling: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Feature Extraction: 100%|██████████| 20/20 [00:21<00:00,  1.06s/it]
Rolling: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Feature Extraction: 100%|██████████| 20/20 [00:21<00:00,  1.09s/it]


In [29]:
# date was passed as the sort key when building the lag features; it is not
# kept in the saved feature matrix.
X = X.drop(columns="date")

## Save final data

In [33]:
# Write the feature matrix and the target as zipped CSVs under final/.
output_dir = Path("final")
# A fresh checkout may not contain final/ yet; create it so to_csv does not fail.
output_dir.mkdir(parents=True, exist_ok=True)

# Name the archive member explicitly so the file inside each zip is a plain
# .csv; without archive_name some pandas versions name the member after the
# .zip path itself.
X.to_csv(
    output_dir / "bakery_data.csv.zip",
    index=False,
    compression={"method": "zip", "archive_name": "bakery_data.csv"},
)
y.to_csv(
    output_dir / "bakery_target.csv.zip",
    index=False,
    compression={"method": "zip", "archive_name": "bakery_target.csv"},
)

In [31]:
# List the final feature columns, including the generated demand__* lag aggregates.
X.columns

Index(['store', 'item', 'is_schoolholiday', 'is_holiday',
       'is_holiday_next2days', 'rain', 'temperature', 'promotion_currentweek',
       'promotion_lastweek', 'weekday', 'month', 'demand__sum_values_7',
       'demand__median_7', 'demand__mean_7', 'demand__standard_deviation_7',
       'demand__variance_7', 'demand__root_mean_square_7', 'demand__maximum_7',
       'demand__absolute_maximum_7', 'demand__minimum_7',
       'demand__sum_values_14', 'demand__median_14', 'demand__mean_14',
       'demand__standard_deviation_14', 'demand__variance_14',
       'demand__root_mean_square_14', 'demand__maximum_14',
       'demand__absolute_maximum_14', 'demand__minimum_14',
       'demand__sum_values_28', 'demand__median_28', 'demand__mean_28',
       'demand__standard_deviation_28', 'demand__variance_28',
       'demand__root_mean_square_28', 'demand__maximum_28',
       'demand__absolute_maximum_28', 'demand__minimum_28'],
      dtype='object')