In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from keras import optimizers, regularizers
from keras.callbacks import EarlyStopping
import keras.backend as KerasBackend

In [None]:
# Directory holding train.csv / test.csv. Defaults to the Colab Drive mount the
# notebook was written against; set the DATA_DIR environment variable to run it
# elsewhere without editing the code.
DATA_DIR = os.environ.get('DATA_DIR', '/content/drive/MyDrive/datasets')
train_ds = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_ds = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
def extract_timedata(timedata, y, base_year=2013):
  """Turn a frame with 'date', 'store' and 'item' columns into model arrays.

  Args:
    timedata: DataFrame containing the columns 'date' (formatted '%Y-%m-%d'),
      'store', 'item' and the column named by `y`.
    y: name of the column returned as the second array.
    base_year: year subtracted from the calendar year so the 'year' feature
      starts near zero.

  Returns:
    (X, Y): X is a 2-D array whose columns are, in order, year offset, month,
    day, weekday, store, item; Y is a 1-D array of `timedata[y]`.

  Raises:
    ValueError: if a date does not match '%Y-%m-%d'.
  """
  # Parse the whole column in one vectorised call. Mapping pd.to_datetime over
  # every element is far slower, and errors='ignore' would hand unparsable
  # strings back unchanged, only to fail later on the `.dt` accessor.
  timedata_dt = pd.to_datetime(timedata['date'], format='%Y-%m-%d')
  X = pd.DataFrame({'year': timedata_dt.dt.year - base_year,
                    'month': timedata_dt.dt.month,
                    'day': timedata_dt.dt.day,
                    'weekday': timedata_dt.dt.weekday,
                    # Bracket access: attribute access breaks if a column name
                    # ever collides with a DataFrame attribute.
                    'store': timedata['store'],
                    'item': timedata['item']},
                   columns=['year', 'month', 'day', 'weekday', 'store', 'item'])
  X = np.array(X)
  Y = np.array(timedata[y])

  print(X.shape, Y.shape)
  return X, Y

In [None]:
# Build the feature matrices. For the test frame the second return value holds
# the row ids rather than a target.
X_train, Y_train = extract_timedata(timedata=train_ds, y='sales')
X_test, id_test = extract_timedata(timedata=test_ds, y='id')

(913000, 6) (913000,)
(45000, 6) (45000,)


In [None]:
# Count the distinct values of every feature column over train and test
# combined; these cardinalities size the embedding layers.
X_stacked = np.concatenate((X_train, X_test), axis=0)  # rows of train followed by rows of test
column_labels = ("years:", "months:", "days:", "weekdays:", "stores:", "items:")
for column_index, label in enumerate(column_labels):
  print(label, len(np.unique(X_stacked[:, column_index])))

years: 6
months: 12
days: 31
weekdays: 7
stores: 10
items: 50


In [None]:
# Chronological hold-out for the time series: the validation set is the last
# six months of 2017 (roughly 10% of the rows).
# Column 0 is the year offset from 2013 (4 -> 2017); column 1 is the month,
# so month > 6 means July onwards.
is_val = (X_train[:, 0] == 4) & (X_train[:, 1] > 6)
is_train = ~is_val
X_val, Y_val = X_train[is_val], Y_train[is_val]
X_train_split, Y_train_split = X_train[is_train], Y_train[is_train]

print(X_train_split.shape, Y_train_split.shape)
print(X_val.shape, Y_val.shape)

(821000, 6) (821000,)
(92000, 6) (92000,)


In [None]:
# NOTE: this rebinds X_train_split / X_val / Y_train_split / Y_val, so any split
# assigned to those names earlier is replaced by a shuffled random 90/10 split.
X_train_split, X_val, Y_train_split, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=(1-0.9),
    random_state=0,
    shuffle=True,
)
for features_part, target_part in ((X_train_split, Y_train_split), (X_val, Y_val)):
  print(features_part.shape, target_part.shape)

(821700, 6) (821700,)
(91300, 6) (91300,)


In [None]:
# Disabled experiment: the code below sits inside a string literal, so none of
# it is executed and X_train_sample / Y_train_sample are never defined.
'''
# test run with smaller sample size
sample_size = 5000
ind = np.random.randint(X_train_split.shape[0], size=sample_size)
X_train_sample, Y_train_sample = X_train_split[ind,:], Y_train_split[ind]
'''

In [None]:
def NNmodel(epochs=10, batch_size=128, prediction_scale=0.981):
  """Train one entity-embedding network and return its scaled test predictions.

  The year column is fed to the network as a plain numeric input; month, day,
  weekday, store and item each go through their own Embedding layer. The
  concatenated features pass through Dense layers of 500, 100 and 10 units
  (ReLU) and a single linear output unit. The model is compiled with the Adam
  optimizer and a SMAPE loss.

  Reads the notebook-level arrays X_train_split, Y_train_split, X_val, Y_val
  and X_test; their columns must be ordered year, month, day, weekday, store,
  item (the order of the model inputs).

  Args:
    epochs: number of training epochs.
    batch_size: mini-batch size used for training.
    prediction_scale: factor applied to the raw test predictions. The default
      0.981 is the constant this function has always used; its origin is not
      documented here (presumably a hand-tuned correction - TODO confirm).

  Returns:
    1-D array of scaled predictions, one per row of X_test.
  """
  # This function is called repeatedly to build an ensemble. Clearing Keras'
  # global state first keeps graphs/layers of earlier models from piling up
  # in memory across calls.
  KerasBackend.clear_session()

  # year not embedded, rest embed
  input_year = Input(shape=(1,), name="year")
  inputs_model = [input_year]
  outputs_embedded = [input_year]
  # Feature name -> number of categories.
  features = {'month': 12, 'day': 31, 'weekday': 7, 'stores': 10, 'items': 50}
  for key, n_categories in features.items():
    embedding_dim = n_categories//2 + 1
    # Named `feature_input` so the builtin `input` is not shadowed.
    feature_input = Input(shape=(1,))
    # n_categories+1 rows so that 1-based codes (e.g. month 12) stay in range.
    embedded = Embedding(n_categories+1, embedding_dim, name=key+'_embedding')(feature_input)
    embedded = Reshape(target_shape=(embedding_dim,))(embedded)
    inputs_model.append(feature_input)
    outputs_embedded.append(embedded)

  output_model = Concatenate()(outputs_embedded)
  output_model = Dense(500)(output_model)
  output_model = Activation('relu')(output_model)
  output_model = Dense(100)(output_model)
  output_model = Activation('relu')(output_model)
  output_model = Dense(10)(output_model)
  output_model = Activation('relu')(output_model)
  output_model = Dense(1)(output_model)

  # loss function
  def smape(x, y):
    """Symmetric mean absolute percentage error (in percent) of two tensors."""
    # Cast with the backend instead of Python's float(), which only works on
    # tensors when AutoGraph happens to rewrite it.
    x = KerasBackend.cast(x, KerasBackend.floatx())
    y = KerasBackend.cast(y, KerasBackend.floatx())
    # Floor the denominator so a zero target with a zero prediction yields 0
    # instead of 0/0 = NaN, which would poison the batch mean.
    denominator = KerasBackend.maximum(KerasBackend.abs(x)+KerasBackend.abs(y),
                                       KerasBackend.epsilon())
    return 100.*KerasBackend.mean(2*KerasBackend.abs(x-y)/denominator)

  def split_features(X):
    """Split a 2-D feature array into one 1-D array per model input."""
    return [X[:, i] for i in range(len(inputs_model))]

  model = KerasModel(inputs=inputs_model, outputs=output_model)
  model.compile(optimizer='Adam', loss=smape)
  model.fit(split_features(X_train_split), Y_train_split,
            validation_data=(split_features(X_val), Y_val),
            epochs=epochs, batch_size=batch_size,
            #callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
            )

  # Validation SMAPE is already reported by fit() as val_loss after every
  # epoch, so no separate validation prediction is computed here.
  prediction = model.predict(split_features(X_test)).flatten()*prediction_scale
  return prediction


In [None]:

# take avg
# Each NNmodel() call builds and trains a fresh model; the final prediction is
# the element-wise mean over all runs.
N_RUNS = 25
predictions = []
for run_no in range(1, N_RUNS + 1):
  print('Iteration No. ', run_no)
  predictions.append(NNmodel())
prediction = np.mean(np.array(predictions), axis=0)


Iteration No.  1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  6
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  7
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iteration No.  8
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [None]:
# Single-model alternative to averaging several NNmodel() runs (disabled):
#prediction = NNmodel()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Build the submission table: scale the predictions by 1.006 (constant taken
# as-is from the notebook), round to whole units and pair them with the test ids.
scaled_sales = prediction * 1.006
results = pd.DataFrame({'id': id_test, 'sales': np.round(scaled_sales).astype(int)})
results.head()

In [None]:
# Write the submission file without the DataFrame index column.
results.to_csv(path_or_buf='ML_results.csv', index=False)