# YAZ Dataset

In [1]:
import pandas as pd
from utilities import add_lag_features
from tsfresh.feature_extraction import MinimalFCParameters

  from pandas import Int64Index as NumericIndex


In [16]:
# Load the raw YAZ table from the zipped CSV. The cells below treat the first
# 11 columns as features and every remaining column as one item's demand.
data = pd.read_csv("raw/YAZ_dataset.csv.zip")

In [3]:
# Split the wide table by position: the leading columns are the features
# shared by all items, every remaining column holds the demand of one item.
# Explicit copies make X and y independent frames, so adding columns to them
# later neither writes into `data` nor raises SettingWithCopyWarning.
N_FEATURE_COLUMNS = 11
X = data.iloc[:, :N_FEATURE_COLUMNS].copy()
y = data.iloc[:, N_FEATURE_COLUMNS:].copy()

In [4]:
# Reshape from wide (one demand column per item) to long (one row per original
# row and item, with the value in a single "demand" column).
#
# `DataFrame.assign` returns a new frame for every item, so `X` itself is never
# modified (the previous `X_temp = X` alias added "item"/"store" to X), and the
# pieces are concatenated once instead of growing `data` inside the loop.
long_parts = [
    X.assign(item=col, store=1, demand=y[col])  # store id is the constant 1
    for col in y.columns
]
data = (
    pd.concat(long_parts)
    .reset_index()  # expose the original row label as an explicit "index" column
    .sort_values(by=['index', 'item'])  # group the items of each original row together
    .reset_index(drop=True)  # fresh 0..n-1 row labels after sorting
)

In [5]:
# Preview the reshaped long-format frame (features, item, store, demand).
data

Unnamed: 0,index,weekday,month,year,is_holiday,is_closed,weekend,wind,clouds,rain,sunshine,temperature,item,store,demand
0,0,FRI,OCT,2013,0,0,0,1.9,7.7,0.1,150,15.9,calamari,1,6
1,0,FRI,OCT,2013,0,0,0,1.9,7.7,0.1,150,15.9,chicken,1,40
2,0,FRI,OCT,2013,0,0,0,1.9,7.7,0.1,150,15.9,fish,1,6
3,0,FRI,OCT,2013,0,0,0,1.9,7.7,0.1,150,15.9,koefte,1,23
4,0,FRI,OCT,2013,0,0,0,1.9,7.7,0.1,150,15.9,lamb,1,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5350,764,SAT,NOV,2015,0,0,1,1.9,5.6,0.0,46,17.3,fish,1,2
5351,764,SAT,NOV,2015,0,0,1,1.9,5.6,0.0,46,17.3,koefte,1,25
5352,764,SAT,NOV,2015,0,0,1,1.9,5.6,0.0,46,17.3,lamb,1,6
5353,764,SAT,NOV,2015,0,0,1,1.9,5.6,0.0,46,17.3,shrimp,1,2


## Add lag features

In [6]:
# Separate predictors from the target: X keeps every column except the demand,
# y is a single-column frame holding the demand.
X = data.drop(columns=['demand'])
y = data['demand'].to_frame()

In [7]:
# Start from tsfresh's MinimalFCParameters feature set; this dict is passed to
# add_lag_features below as the set of aggregations to compute.
fc_parameters = MinimalFCParameters()

In [8]:
# Remove the "length" feature from the feature dict. `pop` with a default
# (instead of `del`) keeps this cell re-runnable: once the key is gone, a
# second run is a no-op rather than a KeyError.
fc_parameters.pop('length', None)

In [9]:
# Show which aggregations remain in the feature dict after removing "length".
print("Lag features:", fc_parameters)

Lag features: {'sum_values': None, 'median': None, 'mean': None, 'standard_deviation': None, 'variance': None, 'root_mean_square': None, 'maximum': None, 'absolute_maximum': None, 'minimum': None}


In [10]:
# Compute the lag features per (item, store) series, ordered by the original
# row position stored in "index".
# NOTE(review): the meaning of each tuple is defined by
# utilities.add_lag_features; the resulting columns carry the suffixes
# _7, _14 and _28 -- presumably window bounds, confirm in the helper.
series_keys = ['item', "store"]
lag_windows = [(7, 7), (14, 14), (28, 28)]
X, y = add_lag_features(
    X=X,
    y=y,
    column_id=series_keys,
    column_sort='index',
    feature_dict=fc_parameters,
    time_windows=lag_windows,
)

Rolling: 100%|██████████| 20/20 [00:02<00:00,  9.01it/s]
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00,  8.10it/s]
Rolling: 100%|██████████| 20/20 [00:02<00:00,  9.03it/s]
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00,  8.21it/s]
Rolling: 100%|██████████| 20/20 [00:02<00:00,  9.04it/s]
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00,  8.31it/s]


In [11]:
# "index" served as the sort key for the lag computation and is not kept as a
# feature. Rebinding X (instead of inplace=True) together with errors="ignore"
# makes the cell safe to re-run: a second run no longer raises KeyError.
X = X.drop(columns=["index"], errors="ignore")

## Save final data

In [12]:
# Write features and target as zip-compressed CSVs without the row index;
# rows of the two files correspond by position.
for frame, target_path in (
    (X, "final/yaz_data.csv.zip"),
    (y, "final/yaz_target.csv.zip"),
):
    frame.to_csv(target_path, index=False, compression="zip")

In [13]:
# List the final feature columns (base features plus the generated lag features).
X.columns

Index(['weekday', 'month', 'year', 'is_holiday', 'is_closed', 'weekend',
       'wind', 'clouds', 'rain', 'sunshine', 'temperature', 'item', 'store',
       'demand__sum_values_7', 'demand__median_7', 'demand__mean_7',
       'demand__standard_deviation_7', 'demand__variance_7',
       'demand__root_mean_square_7', 'demand__maximum_7',
       'demand__absolute_maximum_7', 'demand__minimum_7',
       'demand__sum_values_14', 'demand__median_14', 'demand__mean_14',
       'demand__standard_deviation_14', 'demand__variance_14',
       'demand__root_mean_square_14', 'demand__maximum_14',
       'demand__absolute_maximum_14', 'demand__minimum_14',
       'demand__sum_values_28', 'demand__median_28', 'demand__mean_28',
       'demand__standard_deviation_28', 'demand__variance_28',
       'demand__root_mean_square_28', 'demand__maximum_28',
       'demand__absolute_maximum_28', 'demand__minimum_28'],
      dtype='object')