# Loading Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
import time, datetime
import pickle 

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import timeseries_dataset_from_array

import warnings
warnings.filterwarnings('ignore')

# Preprocessing data for building the ML model

### Loading the data

In [2]:
# Read each raw CSV table into its own "og_" DataFrame; these originals are
# kept as loaded and copied before any processing.
(
    og_train_df,
    og_test_df,
    og_oil_df,
    og_stores_df,
    og_transactions_df,
    og_holidays_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'og_data/train.csv',
        'og_data/test.csv',
        'og_data/oil.csv',
        'og_data/stores.csv',
        'og_data/transactions.csv',
        'og_data/holidays_events.csv',
    )
)

In [3]:
# Work on deep copies so the og_* frames stay exactly as read from disk.
train_df = og_train_df.copy(deep=True)
test_df = og_test_df.copy(deep=True)
oil_df = og_oil_df.copy(deep=True)
stores_df = og_stores_df.copy(deep=True)
transactions_df = og_transactions_df.copy(deep=True)

In [4]:
# Overall date span: earliest training date through latest test date.
# The date columns are still "YYYY-MM-DD" strings at this point, so min/max
# compare them as text.
date_min = train_df['date'].min()
date_max = test_df['date'].max()

### Summary of original data

In [4]:
# Preview the first rows of the raw training table.
og_train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [5]:
# Preview the first rows of the raw oil-price table.
og_oil_df.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [6]:
# Preview the first rows of the raw store metadata table.
og_stores_df.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [7]:
# Preview the first rows of the raw per-store transactions table.
og_transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


### Fill the NaN values in the oil prices by interpolation

In [5]:
# Build one row per calendar day between date_min and date_max, attach the
# known oil prices to it, and linearly interpolate the days without a quote.
calendar_days = pd.date_range(date_min, date_max).strftime('%Y-%m-%d').tolist()
new_df = pd.DataFrame({'date': calendar_days})
new_df = new_df.merge(oil_df, how="outer", on='date')
new_df['dcoilwtico'] = new_df['dcoilwtico'].interpolate(method="linear")
# From here on oil_df refers to the day-by-day, interpolated table.
oil_df = new_df

In [6]:
def nan_helper(y):
    """Locate the NaNs in a 1-D array and provide a mask-to-position converter.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, boolean mask that is True wherever y is NaN
        - index, a function taking a boolean mask and returning the integer
          positions at which that mask is True
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """

    def index(logical_indices):
        # nonzero() returns one array per dimension; y is 1-D so take the first.
        return logical_indices.nonzero()[0]

    nan_mask = np.isnan(y)
    return nan_mask, index

In [7]:
# Pull the oil-price column out as a standalone, writable NumPy array.
# An explicit copy is required because this array is assigned into in place
# further down: np.asarray() on a Series may hand back a view of the frame's
# own data (silently mutating oil_df) or, with pandas Copy-on-Write enabled,
# a read-only array that rejects the assignment.
oil_prices = oil_df['dcoilwtico'].to_numpy(copy=True)

In [8]:
# Fill whatever NaNs are still left in the price array: nans is the boolean
# NaN mask and x turns a mask into integer positions, so np.interp evaluates
# the known (position, price) points at the missing positions. Positions
# outside the known range receive the nearest known price (np.interp default).
nans, x= nan_helper(oil_prices)
oil_prices[nans]= np.interp(x(nans), x(~nans), oil_prices[~nans])

In [9]:
# Write the fully filled price array back into the oil table.
oil_df['dcoilwtico'] = oil_prices

### Converting dates to normalized Unix time for the ML model

In [10]:
# Convert the "YYYY-MM-DD" string bounds into Unix timestamps (seconds, local
# time via time.mktime). The same names are reused for the numeric values, so
# each conversion is guarded: once a bound is already numeric, re-running this
# cell leaves it untouched instead of failing inside strptime.
if isinstance(date_min, str):
    date_min = time.mktime(datetime.datetime.strptime(date_min, "%Y-%m-%d").timetuple())
if isinstance(date_max, str):
    date_max = time.mktime(datetime.datetime.strptime(date_max, "%Y-%m-%d").timetuple())

In [11]:
# Display the raw test table (dates are still strings here).
og_test_df 

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [12]:
def GiveUnixTimeListNormalized(vector, date_format="%Y-%m-%d"):
    """Convert date strings to Unix time rescaled by the global date bounds.

    Each entry of ``vector`` is parsed with ``date_format`` and turned into a
    Unix timestamp with ``time.mktime`` (local time). The timestamps are then
    shifted and scaled with the module-level ``date_min`` / ``date_max``
    timestamps, so ``date_min`` maps to 0.0 and ``date_max`` maps to 1.0.

    Input:
        - vector, sequence of date strings
        - date_format, strptime format describing those strings
    Output:
        - 1d float numpy array with one normalized value per input date
    """
    stamps = np.array(
        [
            time.mktime(datetime.datetime.strptime(day, date_format).timetuple())
            for day in vector
        ],
        dtype=float,
    )
    return (stamps - date_min) / (date_max - date_min)

In [13]:
# Replace the oil table's date strings with their normalized Unix-time values.
# NOTE(review): this reads and overwrites oil_df['date'], so running the cell
# a second time would hand floats to strptime and fail.
new_dates = GiveUnixTimeListNormalized(oil_df['date'], date_format="%Y-%m-%d")
oil_df['date'] = new_dates

In [14]:
# Normalize the training dates; the strings are read from the untouched
# og_train_df and the result is stored on the working copy train_df.
new_dates = GiveUnixTimeListNormalized(og_train_df['date'], date_format="%Y-%m-%d")
train_df['date'] = new_dates

In [15]:
# Normalize the test dates; the strings are read from the untouched
# og_test_df and the result is stored on the working copy test_df.
new_dates = GiveUnixTimeListNormalized(og_test_df['date'], date_format="%Y-%m-%d")
test_df['date'] = new_dates

In [16]:
# Display the training table after date normalization.
train_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,0.000000,1,AUTOMOTIVE,0.000,0
1,1,0.000000,1,BABY CARE,0.000,0
2,2,0.000000,1,BEAUTY,0.000,0
3,3,0.000000,1,BEVERAGES,0.000,0
4,4,0.000000,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,0.990605,9,POULTRY,438.133,0
3000884,3000884,0.990605,9,PREPARED FOODS,154.553,1
3000885,3000885,0.990605,9,PRODUCE,2419.729,148
3000886,3000886,0.990605,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [17]:
# Display the oil table after interpolation and date normalization.
oil_df

Unnamed: 0,date,dcoilwtico
0,0.000000,93.140000
1,0.000587,93.140000
2,0.001174,92.970000
3,0.001762,93.120000
4,0.002349,93.146667
...,...,...
1699,0.997651,46.816667
1700,0.998238,46.400000
1701,0.998826,46.460000
1702,0.999413,45.960000


In [18]:
# Display the test table after date normalization.
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,0.991192,1,AUTOMOTIVE,0
1,3000889,0.991192,1,BABY CARE,0
2,3000890,0.991192,1,BEAUTY,2
3,3000891,0.991192,1,BEVERAGES,20
4,3000892,0.991192,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,1.000000,9,POULTRY,1
28508,3029396,1.000000,9,PREPARED FOODS,0
28509,3029397,1.000000,9,PRODUCE,1
28510,3029398,1.000000,9,SCHOOL AND OFFICE SUPPLIES,9


### Building the processed dictionaries to train and test the ML model

In [24]:
# Build the nested training lookup:
#   store number -> product family -> {'date', 'sales', 'onpromotion', 'oil_prices'}
# where every leaf is a plain list for that (store, family) series.

# Index the oil prices by normalized date once, instead of rebuilding and
# scanning a Python list for every single date. setdefault keeps the first row
# seen for a date, which is the row list.index() would have picked.
oil_price_by_date = {}
for oil_date, oil_price in zip(oil_df['date'].tolist(), oil_df['dcoilwtico'].to_numpy()):
    oil_price_by_date.setdefault(oil_date, oil_price)

train_families = train_df['family'].unique()

stores_transactions_dict = {}
for store in train_df['store_nbr'].unique():
    print(store)
    # Filter the full frame once per store; the per-family filters below then
    # only scan this much smaller slice.
    store_rows = train_df[train_df['store_nbr'] == store]
    stores_transactions_dict[store] = {}
    for fam in train_families:
        fam_rows = store_rows[store_rows['family'] == fam]
        series = {}
        for column in ('date', 'sales', 'onpromotion'):
            # astype(float) keeps the stored values as floats, which is what the
            # earlier DataFrame.where()-based selection produced (its NaN
            # masking promotes integer columns to float).
            series[column] = fam_rows[column].dropna().astype(float).tolist()
        # Dates with no oil entry are skipped, so 'oil_prices' can only be
        # shorter than 'date' if the oil table is missing a day.
        series['oil_prices'] = [
            oil_price_by_date[day] for day in series['date'] if day in oil_price_by_date
        ]
        stores_transactions_dict[store][fam] = series

1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
6
7
8
9


In [25]:
# Cache the processed training dictionary on disk.
with open('Data/train_dictionary.pkl', 'wb') as train_pkl:
    pickle.dump(stores_transactions_dict, train_pkl)

In [19]:
# Build the nested test lookup:
#   store number -> product family -> {'id', 'date', 'onpromotion', 'oil_prices'}
# where every leaf is a plain list for that (store, family) series.

# Index the oil prices by normalized date once, instead of rebuilding and
# scanning a Python list for every single date. setdefault keeps the first row
# seen for a date, which is the row list.index() would have picked.
oil_price_by_date = {}
for oil_date, oil_price in zip(oil_df['date'].tolist(), oil_df['dcoilwtico'].to_numpy()):
    oil_price_by_date.setdefault(oil_date, oil_price)

test_families = test_df['family'].unique()

stores_transactions_test_dict = {}
for store in test_df['store_nbr'].unique():
    print(store)
    # Filter the full frame once per store; the per-family filters below then
    # only scan this much smaller slice.
    store_rows = test_df[test_df['store_nbr'] == store]
    stores_transactions_test_dict[store] = {}
    for fam in test_families:
        fam_rows = store_rows[store_rows['family'] == fam]
        series = {}
        for column in ('id', 'date', 'onpromotion'):
            # astype(float) keeps the stored values as floats, which is what the
            # earlier DataFrame.where()-based selection produced (its NaN
            # masking promotes integer columns to float).
            series[column] = fam_rows[column].dropna().astype(float).tolist()
        # Dates with no oil entry are skipped, so 'oil_prices' can only be
        # shorter than 'date' if the oil table is missing a day.
        series['oil_prices'] = [
            oil_price_by_date[day] for day in series['date'] if day in oil_price_by_date
        ]
        stores_transactions_test_dict[store][fam] = series

1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
6
7
8
9


In [20]:
# Cache the processed test dictionary on disk.
with open('Data/test_dictionary.pkl', 'wb') as test_pkl:
    pickle.dump(stores_transactions_test_dict, test_pkl)

# From here on we work with the dictionary-processed data

In [26]:
# Reload both processed dictionaries from their pickle caches.
# pickle.load can execute arbitrary code, so only load files this notebook
# wrote itself.
with open('Data/train_dictionary.pkl', 'rb') as train_pkl:
    stores_transactions_dict = pickle.load(train_pkl)

with open('Data/test_dictionary.pkl', 'rb') as test_pkl:
    stores_transactions_test_dict = pickle.load(test_pkl)

In [88]:
# List the top-level keys (store numbers) of the loaded training dictionary.
for key in stores_transactions_dict:
    print(key)

1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
6
7
8
9


In [71]:
# Window length: the number of oil-price entries stored for one test series
# (store 1, AUTOMOTIVE).
n_timesteps = len(stores_transactions_test_dict[1]['AUTOMOTIVE']['oil_prices'])
# Number of windows per batch for the dataset built from these settings.
batch_size = 32

In [72]:
# Cut the store-1 AUTOMOTIVE training dates into windows of n_timesteps
# entries. The stride equals the window length, so windows do not overlap;
# no targets are attached.
dataset = timeseries_dataset_from_array(
    data=np.asarray(stores_transactions_dict[1]['AUTOMOTIVE']['date']),
    targets=None,
    sequence_length=n_timesteps,
    sequence_stride=n_timesteps,
    batch_size=batch_size,
)

In [None]:
def split_series(series, n_past, n_future):
    """Slice a 2-D series into sliding (past, future) window pairs.

    A window starts at every row index for which both the n_past input rows
    and the n_future rows right after them still fit inside ``series``.

    Input:
        - series, 2d numpy array indexed as [time step, feature]
        - n_past, number of rows in each input window
        - n_future, number of rows in each target window
    Output:
        - X, array of input windows (one per valid start index)
        - y, array of the target windows that directly follow each input window
    """
    total_rows = len(series)
    # Starts are limited both by the series length itself and by the need to
    # fit n_past + n_future rows; a non-positive count yields no windows.
    n_windows = min(total_rows, total_rows - n_past - n_future + 1)
    starts = range(n_windows)
    past_windows = [series[s:s + n_past, :] for s in starts]
    future_windows = [series[s + n_past:s + n_past + n_future, :] for s in starts]
    return np.array(past_windows), np.array(future_windows)

# Window lengths: the model reads n_past time steps and is trained against the
# n_future steps that follow them.
n_past = 16
n_future = 16
# NOTE(review): num_stores_train, num_families_train, scaled_train_samples and
# scaled_validation_samples are not defined anywhere in the visible notebook;
# they have to be created before this cell can run on a fresh kernel.
# Presumably this is one feature per (store, family) pair — confirm.
n_features = num_stores_train * num_families_train

X_train, y_train = split_series(scaled_train_samples, n_past, n_future)
X_val, y_val = split_series(scaled_validation_samples, n_past, n_future)

# Stacked LSTM that emits one n_features-wide prediction per input time step.
# Because every LSTM returns its full sequence, the output has n_past steps
# while the targets have n_future steps: the shapes only line up as long as
# n_past == n_future.
model = Sequential()
# Declare the input shape with an explicit Input layer rather than the
# input_shape= layer argument.
model.add(layers.Input(shape=(n_past, n_features)))
model.add(layers.LSTM(units=200, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(units=300, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
model.add(layers.TimeDistributed(layers.Dense(n_features)))

# Mean squared logarithmic error is used both as the loss and as the tracked metric.
model.compile(loss='msle', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['msle'])

model.summary()

# Stop once validation MSLE has not improved by at least min_delta for
# `patience` epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_msle', min_delta=0.0001, patience=100, restore_best_weights=True)

EPOCHS = 1000
model_history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, callbacks=[early_stopping], batch_size=512, shuffle=True)