# Bakery Dataset

In [1]:
import pandas as pd
import numpy as np
from utilities import add_lag_features, day_to_string, month_to_string
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

  from pandas import Int64Index as NumericIndex


In [2]:
# Load the raw bakery table from the zipped CSV under raw/ into `data`;
# every later cell works on this frame.
data = pd.read_csv("raw/Bakery_dataset.csv.zip")

## Preprocessing

In [3]:
# Remove three columns that the rest of the notebook does not use.
unused_columns = ["Unnamed: 0", "temp_min", "temp_max"]
data = data.drop(columns=unused_columns)

In [4]:
# Map the raw column names to the shorter names used from here on.
column_names = {
    "date_short": "date",
    "shop_no": "store",
    "product_no": "item",
    "temp_avg_celsius": "temperature",
    "rain_mm": "rain",
}
data = data.rename(columns=column_names)

## Check for intermittent demand

Since we want to avoid intermittent demand, we select only the time series with less than 20 percent zero sales.

    data_grouped = data.groupby(["store", "item"])
    groups = list(data_grouped.groups.keys())

In [5]:
# Group the rows per (store, item) pair and keep the list of pairs that occur.
data_grouped = data.groupby(by=["store", "item"])
groups = [*data_grouped.groups]

In [6]:
# Flag every (store, item) series in which at least 20 percent of the
# observations have zero demand.
more_than_20_p_zero = []
for key in groups:
    demand = data_grouped.get_group(key)["demand"]
    zero_count = int((demand == 0).sum())
    # Every row is either zero or non-zero, so the series length is the total.
    if zero_count / len(demand) >= 0.2:
        more_than_20_p_zero.append(key)

In [7]:
# Remove every row that belongs to a store/item series flagged in
# `more_than_20_p_zero`.
# The row labels are gathered first and dropped in a single call, so the frame
# is copied once instead of once per flagged series.
sparse_labels = [
    label
    for group in more_than_20_p_zero
    for label in data_grouped.get_group(group).index
]
# `data_grouped` was built from the frame as it was before this cell ran, so on
# a re-run the labels are already gone; errors="ignore" turns that into a no-op
# instead of a KeyError.
data = data.drop(index=sparse_labels, errors="ignore")

## Add and format calendar features

In [8]:
# Parse the ISO-formatted date strings into datetime values.
parsed_dates = pd.to_datetime(data['date'], format='%Y-%m-%d')
data = data.assign(date=parsed_dates)

In [9]:
# Add the year taken from the parsed date, and pass the existing month and
# weekday columns through the helper functions imported from `utilities`.
data = data.assign(
    year=data['date'].dt.year,
    month=data['month'].apply(month_to_string),
    weekday=data['weekday'].apply(day_to_string),
)

In [10]:
# Split the table: `y` holds the demand column as a one-column frame,
# `X` holds every other column.
y = data['demand'].to_frame()
X = data.drop(columns="demand")

## Add lag features

In [11]:
# Start from tsfresh's MinimalFCParameters and leave out the 'length' entry;
# the last line displays the remaining feature settings.
fc_parameters = MinimalFCParameters()
fc_parameters.pop('length')
fc_parameters

{'sum_values': None, 'median': None, 'mean': None, 'standard_deviation': None, 'variance': None, 'root_mean_square': None, 'maximum': None, 'absolute_maximum': None, 'minimum': None}

In [12]:
# Extend X and y with lag features per item/store series, sorted by date.
# NOTE(review): `add_lag_features` lives in the local `utilities` module; the
# meaning of each (7, 7)-style window tuple is defined there - confirm.
lag_windows = [(7, 7), (14, 14), (28, 28)]
X, y = add_lag_features(
    X,
    y,
    column_id=['item', "store"],
    column_sort='date',
    feature_dict=fc_parameters,
    time_windows=lag_windows,
)

Rolling: 100%|██████████| 20/20 [00:12<00:00,  1.54it/s]
Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.29s/it]
Rolling: 100%|██████████| 20/20 [00:13<00:00,  1.47it/s]
Feature Extraction: 100%|██████████| 20/20 [00:24<00:00,  1.24s/it]
Rolling: 100%|██████████| 20/20 [00:15<00:00,  1.29it/s]
Feature Extraction: 100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


In [13]:
# The date column is not part of the saved feature table, so drop it here.
X = X.drop(columns=["date"])

## Save final data

In [14]:
# Write the feature table and the target table as zipped CSV files, without
# the row index.
outputs = (
    (X, "final/bakery_data.csv.zip"),
    (y, "final/bakery_target.csv.zip"),
)
for frame, path in outputs:
    frame.to_csv(path, index=False, compression="zip")