# Loading Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from copy import deepcopy
import time, datetime
import pickle 

import IPython
import IPython.display
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import timeseries_dataset_from_array

import warnings
warnings.filterwarnings('ignore')

# Preprocessing data for building the ML model

### Loading the data

In [2]:
# Read every raw competition table once; the og_* frames are kept untouched
# so later cells can always go back to the data exactly as it is on disk.
(
    og_train_df,
    og_test_df,
    og_oil_df,
    og_stores_df,
    og_transactions_df,
    og_holidays_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'og_data/train.csv',
        'og_data/test.csv',
        'og_data/oil.csv',
        'og_data/stores.csv',
        'og_data/transactions.csv',
        'og_data/holidays_events.csv',
    )
)

In [3]:
# Independent working copies of the raw frames; the og_* originals stay as loaded.
# (The holidays table is not copied here.)
train_df = og_train_df.copy(deep=True)
test_df = og_test_df.copy(deep=True)
oil_df = og_oil_df.copy(deep=True)
stores_df = og_stores_df.copy(deep=True)
transactions_df = og_transactions_df.copy(deep=True)

In [4]:
# Overall date span: earliest training date up to the latest test date
# (both are still 'YYYY-MM-DD' strings at this point).
date_min = train_df['date'].min()
date_max = test_df['date'].max()

### Summary of original data

In [4]:
# Preview the first rows of the raw training table (read from og_data/train.csv).
og_train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [5]:
# Preview the first rows of the raw oil-price table (read from og_data/oil.csv).
og_oil_df.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [6]:
# Preview the first rows of the raw store table (read from og_data/stores.csv).
og_stores_df.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [7]:
# Preview the first rows of the raw transactions table (read from og_data/transactions.csv).
og_transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


### Fill the missing (NaN) values in the oil prices

In [5]:
# Build one row per calendar day between date_min and date_max, attach the
# known oil prices to it, and fill the gaps by linear interpolation.
# Leading NaNs are not filled by this step.
full_calendar = pd.date_range(date_min, date_max).strftime('%Y-%m-%d')
new_df = pd.DataFrame({'date': list(full_calendar)}).merge(oil_df, how="outer", on='date')
new_df['dcoilwtico'] = new_df['dcoilwtico'].interpolate(method="linear")
oil_df = new_df

In [6]:
def nan_helper(y):
    """Locate the NaNs of a 1-D array and provide a mask-to-position converter.

    Args:
        y: 1-D numpy array that may contain NaNs.

    Returns:
        A pair ``(nan_mask, to_positions)`` where ``nan_mask`` is a boolean
        array marking the NaN entries of ``y`` and ``to_positions`` is a
        callable turning any boolean mask into the integer positions at
        which it is True.

    Typical use, linear interpolation over the NaNs::

        nan_mask, to_positions = nan_helper(y)
        y[nan_mask] = np.interp(to_positions(nan_mask),
                                to_positions(~nan_mask),
                                y[~nan_mask])
    """

    def to_positions(mask):
        return mask.nonzero()[0]

    nan_mask = np.isnan(y)
    return nan_mask, to_positions

In [7]:
# Take an explicit, writable float copy of the price column.
# np.asarray(Series) may hand back a view of the DataFrame's own buffer: with
# pandas Copy-on-Write that view is read-only (the in-place NaN fill that
# follows would raise), and without it the fill would silently mutate oil_df.
oil_prices = oil_df['dcoilwtico'].to_numpy(dtype=float, copy=True)

In [8]:
# Fill the remaining NaNs (e.g. a missing first value) in place:
# np.interp interpolates inside the known range and repeats the nearest known
# value outside of it.
missing_mask, to_positions = nan_helper(oil_prices)
known_mask = ~missing_mask
oil_prices[missing_mask] = np.interp(
    to_positions(missing_mask),
    to_positions(known_mask),
    oil_prices[known_mask],
)

In [9]:
# Write the gap-free price array back into the oil frame.
oil_df['dcoilwtico'] = oil_prices

### Converting dates to normalized Unix time for the ML model

In [10]:
# Convert the 'YYYY-MM-DD' bounds to Unix timestamps (seconds, local time zone).
# The isinstance guards make the cell safe to re-run: once the bounds are
# floats, calling strptime on them again would raise a TypeError.
if isinstance(date_min, str):
    date_min = time.mktime(datetime.datetime.strptime(date_min, "%Y-%m-%d").timetuple())
if isinstance(date_max, str):
    date_max = time.mktime(datetime.datetime.strptime(date_max, "%Y-%m-%d").timetuple())

In [11]:
# Display the raw test table (read from og_data/test.csv).
og_test_df 

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [11]:
def GiveUnixTimeListNormalized(vector, date_format="%Y-%m-%d"):
    """Convert date strings to Unix time rescaled by the global date bounds.

    Each string is parsed with ``date_format``, converted to a local-time Unix
    timestamp, and mapped linearly so that the module-level ``date_min``
    becomes 0.0 and ``date_max`` becomes 1.0 (both must already be Unix
    timestamps when this is called).

    Args:
        vector: sized iterable of date strings (e.g. a pandas Series).
        date_format: ``strptime`` format of the strings.

    Returns:
        1-D float numpy array with one normalized value per input element.
    """
    # The input usually repeats the same day many times (one row per
    # store/family), so each distinct string is parsed only once.
    parsed = {}
    new_dates = np.zeros(len(vector), dtype=float)
    for i, s in enumerate(vector):
        if s not in parsed:
            parsed[s] = time.mktime(datetime.datetime.strptime(s, date_format).timetuple())
        new_dates[i] = parsed[s]
    new_dates -= date_min
    new_dates /= (date_max - date_min)
    return new_dates

In [12]:
# Replace the oil frame's date strings by their normalized Unix time.
# "%Y-%m-%d" is the function's default format, so it is not repeated here.
new_dates = GiveUnixTimeListNormalized(oil_df['date'])
oil_df['date'] = new_dates

In [13]:
# Normalized dates are computed from the raw training frame and stored in the
# working copy. "%Y-%m-%d" is the function's default format.
new_dates = GiveUnixTimeListNormalized(og_train_df['date'])
train_df['date'] = new_dates

In [14]:
# Normalized dates are computed from the raw test frame and stored in the
# working copy. "%Y-%m-%d" is the function's default format.
new_dates = GiveUnixTimeListNormalized(og_test_df['date'])
test_df['date'] = new_dates

In [16]:
# Display the training frame after its dates were replaced by normalized values.
train_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,0.000000,1,AUTOMOTIVE,0.000,0
1,1,0.000000,1,BABY CARE,0.000,0
2,2,0.000000,1,BEAUTY,0.000,0
3,3,0.000000,1,BEVERAGES,0.000,0
4,4,0.000000,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,0.990605,9,POULTRY,438.133,0
3000884,3000884,0.990605,9,PREPARED FOODS,154.553,1
3000885,3000885,0.990605,9,PRODUCE,2419.729,148
3000886,3000886,0.990605,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [17]:
# Display the oil frame after gap filling and date normalization.
oil_df

Unnamed: 0,date,dcoilwtico
0,0.000000,93.140000
1,0.000587,93.140000
2,0.001174,92.970000
3,0.001762,93.120000
4,0.002349,93.146667
...,...,...
1699,0.997651,46.816667
1700,0.998238,46.400000
1701,0.998826,46.460000
1702,0.999413,45.960000


In [18]:
# Display the test frame after its dates were replaced by normalized values.
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,0.991192,1,AUTOMOTIVE,0
1,3000889,0.991192,1,BABY CARE,0
2,3000890,0.991192,1,BEAUTY,2
3,3000891,0.991192,1,BEVERAGES,20
4,3000892,0.991192,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,1.000000,9,POULTRY,1
28508,3029396,1.000000,9,PREPARED FOODS,0
28509,3029397,1.000000,9,PRODUCE,1
28510,3029398,1.000000,9,SCHOOL AND OFFICE SUPPLIES,9


### Building the processed dictionaries to train and test the ML model

In [30]:
# Build, per store, a dict of equally indexed lists:
#   'date', 'oil_prices', then 'sales_<family>' and 'onpromotion_<family>'
#   for every product family (in order of first appearance in train_df).
#
# Date -> oil price lookup, built once. setdefault keeps the first price seen
# for a date, matching a "first matching row" lookup.
oil_price_by_date = {}
for oil_date, oil_price in zip(oil_df['date'].tolist(), oil_df['dcoilwtico'].tolist()):
    oil_price_by_date.setdefault(oil_date, oil_price)

families = train_df['family'].unique()

stores_transactions_dict = {}
for store in train_df['store_nbr'].unique():
    print(store)
    # Filter the big frame once per store; the family filters below then only
    # scan this store's rows.
    store_rows = train_df[train_df['store_nbr'] == store]
    store_dict = {}
    for position, fam in enumerate(families):
        fam_rows = store_rows[store_rows['family'] == fam]
        if position == 0:
            # The store's date axis is taken from its first family.
            store_dict['date'] = fam_rows['date'].dropna().values.tolist()
            # Dates without an oil price are skipped (not padded), so this list
            # can only be shorter than 'date' if the oil calendar has gaps.
            store_dict['oil_prices'] = [
                oil_price_by_date[day]
                for day in store_dict['date']
                if day in oil_price_by_date
            ]

        store_dict['sales_'+fam] = fam_rows['sales'].dropna().values.tolist()
        # Stored as floats so every list in the dict has the same numeric type.
        store_dict['onpromotion_'+fam] = fam_rows['onpromotion'].dropna().astype(float).values.tolist()
    stores_transactions_dict[store] = store_dict

1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
6
7
8
9


In [31]:
# Persist the per-store training dictionary so later sections can start from it.
with open('Data/train_dictionary.pkl', 'wb') as train_pickle_file:
    pickle.dump(stores_transactions_dict, train_pickle_file)

In [24]:
# Build, per store, a dict of equally indexed lists for the test period:
#   'date', 'oil_prices', then 'onpromotion_<family>' for every product family
#   (in order of first appearance in test_df). There is no sales column here.
#
# Date -> oil price lookup, built once. setdefault keeps the first price seen
# for a date, matching a "first matching row" lookup.
oil_price_by_date = {}
for oil_date, oil_price in zip(oil_df['date'].tolist(), oil_df['dcoilwtico'].tolist()):
    oil_price_by_date.setdefault(oil_date, oil_price)

test_families = test_df['family'].unique()

stores_transactions_test_dict = {}
for store in test_df['store_nbr'].unique():
    print(store)
    # Filter the frame once per store; the family filters below then only scan
    # this store's rows.
    store_rows = test_df[test_df['store_nbr'] == store]
    store_dict = {}
    for position, fam in enumerate(test_families):
        fam_rows = store_rows[store_rows['family'] == fam]
        if position == 0:
            # The store's date axis is taken from its first family.
            store_dict['date'] = fam_rows['date'].dropna().values.tolist()
            # Dates without an oil price are skipped (not padded).
            store_dict['oil_prices'] = [
                oil_price_by_date[day]
                for day in store_dict['date']
                if day in oil_price_by_date
            ]

        # Stored as floats so every list in the dict has the same numeric type.
        store_dict['onpromotion_'+fam] = fam_rows['onpromotion'].dropna().astype(float).values.tolist()
    stores_transactions_test_dict[store] = store_dict

1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
6
7
8
9


In [29]:
# Persist the per-store test dictionary so later sections can start from it.
with open('Data/test_dictionary.pkl', 'wb') as test_pickle_file:
    pickle.dump(stores_transactions_test_dict, test_pickle_file)

# From here on we work with the dictionary-processed data

In [2]:
# Re-read the raw test table so this section can be run without the
# preprocessing cells above.
og_test_df = pd.read_csv('og_data/test.csv')

In [3]:
# Restore the per-store dictionaries written by the preprocessing section.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook.
with open('Data/train_dictionary.pkl', 'rb') as train_pickle_file, \
        open('Data/test_dictionary.pkl', 'rb') as test_pickle_file:
    stores_transactions_dict = pickle.load(train_pickle_file)
    stores_transactions_test_dict = pickle.load(test_pickle_file)

In [60]:
class WindowGenerator():
  """Describes how a time series is cut into (input, label) windows.

  A window spans ``input_width + shift`` consecutive time steps. The first
  ``input_width`` steps are the model inputs; the last ``label_width`` steps
  are the labels.
  """

  def __init__(self, input_width, label_width, shift, train_df=None, val_df=None, test_df=None, label_columns=None):
    """Store the data frames and pre-compute the window index layout.

    Args:
      input_width: number of time steps fed to the model.
      label_width: number of time steps to predict.
      shift: offset between the end of the inputs and the end of the labels.
      train_df, val_df, test_df: data frames for the three splits. When left
        as ``None`` the module-level variable of the same name is used, looked
        up when the window is created.
      label_columns: names of the columns to predict, or ``None`` to keep
        every column as a label.

    Raises:
      NameError: if a frame is omitted and no module-level variable with that
        name exists.
    """
    # Defaults are resolved here, at call time. Writing `train_df=train_df` in
    # the signature would freeze whatever object existed when the class was
    # defined (or fail if the name did not exist yet).
    frames = {'train_df': train_df, 'val_df': val_df, 'test_df': test_df}
    for frame_name, frame in frames.items():
      if frame is None:
        try:
          frames[frame_name] = globals()[frame_name]
        except KeyError:
          raise NameError(
              f"WindowGenerator needs `{frame_name}`: pass it explicitly or "
              f"define a module-level `{frame_name}` first") from None

    # Store the raw data.
    self.train_df = frames['train_df']
    self.val_df = frames['val_df']
    self.test_df = frames['test_df']

    # Work out the label column indices.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
    self.column_indices = {name: i for i, name in enumerate(self.train_df.columns)}

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift

    self.total_window_size = input_width + shift

    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

  def __repr__(self):
    """Multi-line summary of the window size, index layout and label columns."""
    return '\n'.join([f'Total window size: {self.total_window_size}', f'Input indices: {self.input_indices}', f'Label indices: {self.label_indices}', 
                      f'Label column name(s): {self.label_columns}'])
  
def split_window(self, features):
  """Split a batch of full windows into model inputs and labels.

  Args:
    features: batch of windows indexed as (batch, time, features).

  Returns:
    ``(inputs, labels)``: the time steps selected by ``self.input_slice`` and
    ``self.labels_slice``. When ``self.label_columns`` is set, labels keep
    only those columns, in that order.
  """
  input_part = features[:, self.input_slice, :]
  label_part = features[:, self.labels_slice, :]

  wanted_columns = self.label_columns
  if wanted_columns is not None:
    selected = []
    for column_name in wanted_columns:
      selected.append(label_part[:, :, self.column_indices[column_name]])
    label_part = tf.stack(selected, axis=-1)

  # Slicing loses the static shape, so the known time dimension is restored
  # by hand; batch and feature dimensions stay unspecified.
  input_part.set_shape([None, self.input_width, None])
  label_part.set_shape([None, self.label_width, None])

  return input_part, label_part

# Attach the function above to the class so it is available as a method.
WindowGenerator.split_window = split_window

def plot(self, model=None, plot_col='sales_AUTOMOTIVE', max_subplots=3):
  """Plot inputs, labels and (optionally) predictions for the example batch.

  One subplot is drawn per window, for at most ``max_subplots`` windows of
  ``self.example``.

  Args:
    model: optional callable mapping the input batch to predictions indexed
      like the labels. When ``None`` only inputs and labels are drawn.
    plot_col: name of the column to plot; must be a key of
      ``self.column_indices``.
    max_subplots: upper bound on the number of windows shown.
  """
  inputs, labels = self.example
  plt.figure(figsize=(12, 8))
  plot_col_index = self.column_indices[plot_col]
  max_n = min(max_subplots, len(inputs))

  # Position of plot_col among the labels; None when plot_col is not a label
  # column. Identical for every subplot, so it is resolved once.
  if self.label_columns:
    label_col_index = self.label_columns_indices.get(plot_col, None)
  else:
    label_col_index = plot_col_index

  # Run the model once for the whole batch instead of once per subplot, and
  # only if its output will actually be drawn.
  predictions = None
  if model is not None and label_col_index is not None and max_n > 0:
    predictions = model(inputs)

  for n in range(max_n):
    plt.subplot(max_n, 1, n+1)
    plt.ylabel(f'{plot_col} [normed]')
    plt.plot(self.input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)

    if label_col_index is None:
      # plot_col is not predicted: show the inputs only.
      continue

    plt.scatter(self.label_indices, labels[n, :, label_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
    if predictions is not None:
      plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)

    if n == 0:
      plt.legend()

  plt.xlabel('Time (days)')

# Attach the function above to the class so it is available as a method.
WindowGenerator.plot = plot

def make_dataset(self, data):
  """Turn a table of time steps into a dataset of (inputs, labels) batches.

  Args:
    data: array-like of time steps; converted to a float32 array.

  Returns:
    A dataset of shuffled batches of 32 windows, each
    ``self.total_window_size`` steps long and starting one step apart, already
    split by ``self.split_window``.
  """
  windows = tf.keras.utils.timeseries_dataset_from_array(
      data=np.array(data, dtype=np.float32),
      targets=None,
      sequence_length=self.total_window_size,
      sequence_stride=1,
      shuffle=True,
      batch_size=32,
  )
  return windows.map(self.split_window)

# Attach the function above to the class so it is available as a method.
WindowGenerator.make_dataset = make_dataset

@property
def train(self):
  """Windowed dataset built from ``self.train_df`` (rebuilt on every access)."""
  frame = self.train_df
  return self.make_dataset(frame)

@property
def val(self):
  """Windowed dataset built from ``self.val_df`` (rebuilt on every access)."""
  frame = self.val_df
  return self.make_dataset(frame)

@property
def test(self):
  """Windowed dataset built from ``self.test_df`` (rebuilt on every access)."""
  frame = self.test_df
  return self.make_dataset(frame)

@property
def example(self):
  """Return one `inputs, labels` batch for plotting, cached after first use."""
  cached = getattr(self, '_example', None)
  if cached is not None:
    return cached
  # Nothing cached yet: take the first batch of the `.train` dataset and
  # remember it for subsequent calls.
  first_batch = next(iter(self.train))
  self._example = first_batch
  return first_batch

# Attach the property objects defined above to the class.
WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

class MultiStepLastBaseline(tf.keras.Model):
  """Baseline that repeats the last input time step OUT_STEPS times.

  Relies on the module-level ``OUT_STEPS`` being defined when called.
  """

  def call(self, inputs):
    last_step = inputs[:, -1:, :]
    repeats = [1, OUT_STEPS, 1]
    return tf.tile(last_step, repeats)
  
class RepeatBaseline(tf.keras.Model):
  """Baseline that returns its inputs unchanged as the prediction."""
  def call(self, inputs):
    return inputs
  
class FeedBack(tf.keras.Model):
  """Autoregressive LSTM model that feeds each prediction back as next input.

  The output layer width comes from the module-level ``num_features``, which
  must be defined before an instance is created.
  """

  def __init__(self, units, out_steps):
    """Create the LSTM cell, its RNN wrapper and the output projection.

    Args:
      units: size of the LSTM state.
      out_steps: number of time steps to predict.
    """
    super().__init__()
    self.units = units
    self.out_steps = out_steps

    # The same cell is used step by step during prediction and, wrapped in an
    # RNN layer, to consume the whole input window during warm-up.
    cell = tf.keras.layers.LSTMCell(units)
    self.lstm_cell = cell
    self.lstm_rnn = tf.keras.layers.RNN(cell, return_state=True)
    self.dense = tf.keras.layers.Dense(num_features)

def warmup(self, inputs):
  """Run the input window through the LSTM and make the first prediction.

  Args:
    inputs: batch of input windows, presumably (batch, time, features).

  Returns:
    ``(prediction, state)``: the dense projection of the RNN output and the
    list of remaining values returned by ``self.lstm_rnn`` (its state).
  """
  # The RNN layer returns its output followed by the state tensors.
  last_output, *state = self.lstm_rnn(inputs)
  first_prediction = self.dense(last_output)
  return first_prediction, state

# Attach the function above to the class so it is available as a method.
FeedBack.warmup = warmup

def call(self, inputs, training=None):
  """Predict ``self.out_steps`` time steps autoregressively.

  The first prediction comes from ``self.warmup``; each following step feeds
  the previous prediction through the LSTM cell and the dense layer.

  Args:
    inputs: batch of input windows.
    training: forwarded to the LSTM cell.

  Returns:
    The stacked predictions, transposed from (time, batch, features) to
    (batch, time, features).
  """
  # Warm up on the input window; this also yields the first prediction.
  prediction, state = self.warmup(inputs)
  predictions = [prediction]

  # One LSTM step per remaining output step, fed with the last prediction.
  for _ in range(self.out_steps - 1):
    x, state = self.lstm_cell(prediction, states=state,
                              training=training)
    prediction = self.dense(x)
    predictions.append(prediction)

  # Stacked as (time, batch, features); reorder to (batch, time, features).
  stacked = tf.stack(predictions)
  return tf.transpose(stacked, [1, 0, 2])

# Attach the function above to the class so Keras uses it as the forward pass.
FeedBack.call = call

In [88]:
# Upper bound on training epochs used by compile_and_fit.
MAX_EPOCHS = 10

def compile_and_fit(model, window, patience=30):
  """Compile ``model`` and train it on ``window.train``.

  The model is compiled with mean squared logarithmic error as loss, the Adam
  optimizer, and mean absolute error as metric. It is trained for at most
  MAX_EPOCHS epochs, validating on ``window.val``, with early stopping on
  ``val_loss`` that restores the best weights.

  Args:
    model: Keras model to train.
    window: object exposing ``train`` and ``val`` datasets.
    patience: number of epochs without improvement tolerated by early stopping.

  Returns:
    The history object returned by ``model.fit``.
  """
  stopper = tf.keras.callbacks.EarlyStopping(
      monitor='val_loss',
      min_delta=0.0001,
      patience=patience,
      restore_best_weights=True,
  )

  model.compile(
      loss=tf.keras.losses.MeanSquaredLogarithmicError(),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=[tf.keras.metrics.MeanAbsoluteError()],
  )

  return model.fit(
      window.train,
      epochs=MAX_EPOCHS,
      validation_data=window.val,
      callbacks=[stopper],
  )

In [104]:
# Prepare the data of store 1 and (optionally) train models on it.
#
# Result dictionaries for the optional training branches; existing ones are
# reused so results recorded elsewhere in the session are not lost.
multi_val_performance = globals().get('multi_val_performance', {})
multi_performance = globals().get('multi_performance', {})

for key in stores_transactions_dict.keys():
    # Only store 1 is modelled for now.
    if key != 1:
        continue

    df = pd.DataFrame.from_dict(stores_transactions_dict[key])

    column_indices = {name: i for i, name in enumerate(df.columns)}

    n = len(df)
    num_features = df.shape[1]

    # Chronological 70% / 20% / 10% split of the store's training history.
    train_df = df[0:int(n*0.7)]
    val_df = df[int(n*0.7):int(n*0.9)]
    test_df = df[int(n*0.9):]

    # Standardize every split with the statistics of the training split only.
    # Constant columns (std == 0) are divided by 1 to avoid dividing by zero.
    train_mean = train_df.mean()
    train_std = train_df.std()
    train_std = train_std.replace(0, 1.0)

    train_df = (train_df - train_mean) / train_std
    val_df = (val_df - train_mean) / train_std
    test_df = (test_df - train_mean) / train_std

    # Flags get descriptive names so they do not overwrite the module-level
    # `plot` function and `train` property defined for WindowGenerator.
    show_distribution_plot = False
    if show_distribution_plot:
        df_std = (df - train_mean) / train_std
        df_std = df_std.melt(var_name='Column', value_name='Normalized')
        plt.figure(figsize=(30, 20))
        ax = sns.violinplot(x='Column', y='Normalized', data=df_std)
        _ = ax.set_xticklabels(df.keys(), rotation=90)

    # The frames are passed explicitly: the window must use the splits
    # normalized just above, not whatever the class defaults refer to.
    single_step_window = WindowGenerator(input_width=16, label_width=1, shift=1,
                                         train_df=train_df, val_df=val_df, test_df=test_df,
                                         label_columns=['sales_AUTOMOTIVE'])
    print(single_step_window)

    train_dense = False
    train_feedback = False

    if train_dense or train_feedback:
        # Forecast horizon = number of rows in this store's test-period data.
        OUT_STEPS = pd.DataFrame.from_dict(stores_transactions_test_dict[key]).shape[0]

        multi_window = WindowGenerator(input_width=OUT_STEPS,
                                       label_width=OUT_STEPS,
                                       shift=OUT_STEPS,
                                       train_df=train_df, val_df=val_df, test_df=test_df)

    if train_dense:
        multi_dense_model = tf.keras.Sequential([
            # Take the last time step.
            # Shape [batch, time, features] => [batch, 1, features]
            tf.keras.layers.Lambda(lambda x: x[:, -1:, :]),
            # Shape => [batch, 1, dense_units]
            tf.keras.layers.Dense(512, activation='relu'),
            # Shape => [batch, out_steps*features]
            tf.keras.layers.Dense(OUT_STEPS*num_features,
                                  kernel_initializer=tf.initializers.zeros()),
            # Shape => [batch, out_steps, features]
            tf.keras.layers.Reshape([OUT_STEPS, num_features])
        ])

        history = compile_and_fit(multi_dense_model, multi_window)

        IPython.display.clear_output()
        multi_val_performance['Dense'] = multi_dense_model.evaluate(multi_window.val, return_dict=True)
        multi_performance['Dense'] = multi_dense_model.evaluate(multi_window.test, verbose=0, return_dict=True)
        multi_window.plot(multi_dense_model)

    if train_feedback:
        feedback_model = FeedBack(units=16, out_steps=OUT_STEPS)

        history = compile_and_fit(feedback_model, multi_window)

        multi_val_performance['AR LSTM'] = feedback_model.evaluate(multi_window.val, return_dict=True)
        multi_performance['AR LSTM'] = feedback_model.evaluate(multi_window.test, verbose=0, return_dict=True)
        multi_window.plot(feedback_model)
        

Total window size: 17
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Label indices: [16]
Label column name(s): ['sales_AUTOMOTIVE']


In [35]:
# Window length = number of oil-price entries in store 1's test dictionary.
n_timesteps = len(stores_transactions_test_dict[1]['oil_prices'])
# Number of windows per batch.
batch_size = 32

In [72]:
# Cut store 1's date series into consecutive, non-overlapping windows of
# n_timesteps steps (stride == window length), without targets.
store_1_dates = np.asarray(stores_transactions_dict[1]['date'])
dataset = timeseries_dataset_from_array(
    data=store_1_dates,
    targets=None,
    sequence_length=n_timesteps,
    sequence_stride=n_timesteps,
    batch_size=batch_size,
)

In [None]:
def split_series(series, n_past, n_future):
    """Cut a 2-D series into sliding (past, future) window pairs.

    Args:
        series: 2-D array indexed as (time, features).
        n_past: number of time steps in each input window.
        n_future: number of time steps in each target window, starting right
            after the input window.

    Returns:
        ``(X, y)`` numpy arrays stacking the input windows and the target
        windows. One pair is produced per start position for which the target
        window still fits inside ``series``.
    """
    total_steps = len(series)
    span = n_past + n_future
    # Start positions whose combined past+future window ends within the series.
    starts = [start for start in range(total_steps) if start + span <= total_steps]

    past_windows = [series[start:start + n_past, :] for start in starts]
    future_windows = [series[start + n_past:start + span, :] for start in starts]
    return np.array(past_windows), np.array(future_windows)

# Stacked-LSTM experiment: predict the next n_future steps of every series
# from the previous n_past steps.
# NOTE(review): num_stores_train, num_families_train, scaled_train_samples and
# scaled_validation_samples are not defined in the visible part of this
# notebook, so this cell cannot run as-is on a fresh kernel; confirm where
# they come from.
n_past = 16
n_future = 16
n_features = num_stores_train * num_families_train

# Sliding (past, future) windows for training and validation.
X_train, y_train = split_series(scaled_train_samples, n_past, n_future)
X_val, y_val = split_series(scaled_validation_samples, n_past, n_future)

# Two LSTM layers that return the full sequence, each followed by batch
# normalization and dropout.
model = Sequential()
model.add(layers.LSTM(units=200, return_sequences=True, input_shape=(n_past, n_features)))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(units=300, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
# The string literal below is a disabled block of extra LSTM layers; as a bare
# string it has no effect when the cell runs.
'''
model.add(layers.LSTM(units=300, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.LSTM(units=300, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(units=300, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
'''
# Same dense projection applied at every time step, back to n_features outputs.
model.add(layers.TimeDistributed(layers.Dense(n_features)))

# Mean squared logarithmic error is used both as the loss and as the metric.
model.compile(loss='msle', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['msle'])

model.summary()

# Stop after 100 epochs without a 'val_msle' improvement of at least 1e-4 and
# restore the best weights.
early_stopping = EarlyStopping(monitor='val_msle', min_delta=0.0001, patience=100, restore_best_weights=True)

EPOCHS = 1000
model_history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, callbacks=[early_stopping], batch_size=512, shuffle=True)