In [65]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [31]:
# Read the training sales table and convert the `date` strings to timestamps.
data = pd.read_csv('train.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], dayfirst=False)
)

In [32]:
# Preview the loaded table (the rendered output elides the middle rows).
data

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [48]:
# Every distinct (store, item) pair present in the data: one row per pair,
# returned by np.unique in sorted order.
store_item_combis = np.unique(data.loc[:, ['store', 'item']].to_numpy(), axis=0)

In [67]:
# Build a supervised-learning table for every (store, item) series: each row
# gets its own past sales (lag columns) and upcoming sales (lead columns).
n_lookback = 4 * 7 # Roughly one month
n_future = 7 # One week
# NOTE(review): range(1, n) stops at n - 1, so the loops below create 27 lag
# columns (sales-1 .. sales-27) and 6 lead columns (sales+1 .. sales+6).
# Confirm that 27/6 rather than 28/7 is the intended window.
all_sub_data = []
# groupby splits the frame once and yields the groups in sorted (store, item)
# order, instead of re-scanning every row with a boolean mask for each
# combination. The resulting list of frames is the same as before.
for combi, sub_data in tqdm(data.groupby(['store', 'item'], sort=True)):
    # shift() is positional, so the series must be in chronological order.
    sub_data = sub_data.sort_values('date', ascending=True)
    # Past observation
    for i in range(1, n_lookback):
        sub_data['sales-{}'.format(i)] = sub_data['sales'].shift(i)
    # Unseen observation
    for i in range(1, n_future):
        sub_data['sales+{}'.format(i)] = sub_data['sales'].shift(-i)
    all_sub_data.append(sub_data)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [74]:
# Names of the shifted columns: "sales+k" holds the value k rows ahead (the
# targets), "sales-k" the value k rows back (the inputs).
# NOTE(review): the current-day `sales` column is in neither list, and
# range(1, n) gives n - 1 names (6 targets, 27 features) - confirm intended.
target_col_names = [f'sales+{step}' for step in range(1, n_future)]
feature_col_names = [f'sales-{lag}' for lag in range(1, n_lookback)]

In [80]:
# Stack the per-series frames into a single table and discard every row that
# contains a NaN; this removes at least the rows at the edges of each series,
# where the shifted columns are undefined.
data = pd.concat(all_sub_data).dropna()
data.head()

Unnamed: 0,date,store,item,sales,sales-1,sales-2,sales-3,sales-4,sales-5,sales-6,...,sales-24,sales-25,sales-26,sales-27,sales+1,sales+2,sales+3,sales+4,sales+5,sales+6
27,2013-01-28,1,1,11,12.0,12.0,14.0,8.0,9.0,7.0,...,13.0,14.0,11.0,13.0,6.0,9.0,13.0,11.0,21.0,15.0
28,2013-01-29,1,1,6,11.0,12.0,12.0,14.0,8.0,9.0,...,10.0,13.0,14.0,11.0,9.0,13.0,11.0,21.0,15.0,14.0
29,2013-01-30,1,1,9,6.0,11.0,12.0,12.0,14.0,8.0,...,12.0,10.0,13.0,14.0,13.0,11.0,21.0,15.0,14.0,9.0
30,2013-01-31,1,1,13,9.0,6.0,11.0,12.0,12.0,14.0,...,10.0,12.0,10.0,13.0,11.0,21.0,15.0,14.0,9.0,10.0
31,2013-02-01,1,1,11,13.0,9.0,6.0,11.0,12.0,12.0,...,9.0,10.0,12.0,10.0,21.0,15.0,14.0,9.0,10.0,13.0


In [83]:
# Extract the model inputs (lag columns) and targets (lead columns) as arrays
# and report their shapes.
x = data[feature_col_names].to_numpy()
y = data[target_col_names].to_numpy()
print(x.shape, y.shape)

(896500, 27) (896500, 6)


In [86]:
# Write both arrays to disk in NumPy's .npy format.
np.save(file='x.npy', arr=x)
np.save(file='y.npy', arr=y)

In [24]:
# Break the timestamp into separate year / month / day-of-month columns,
# then discard the original `date` column.
data = data.assign(
    year=data['date'].dt.year,
    month=data['date'].dt.month,
    day=data['date'].dt.day,
).drop(columns=['date'])

In [None]:
# TODO: add cyclical date features
# TODO: add normalization