### Reference Dates

Num Training Weeks = 4  
Num Test/Val Days = 15

- Train: 5/30/2017 (Tues), 6/6/2017,6/13/2017,6/20/2017  
- Val: 7/11/2017 (Tues) - 7/25/2017 (Wed)  
- Test: 8/1/2017 (Tues)  - 8/15/2017 (Wed)  

In [1]:
from tqdm import tqdm
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM
from keras import callbacks
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import gc

Using TensorFlow backend.


In [2]:
#Import sales data from 2017-01-01 and beyond

df_2017 = pd.read_csv(
    '../input/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 101688780))


# import items and stores data
items = pd.read_csv("../input/items.csv").set_index("item_nbr")
stores = pd.read_csv("../input/stores.csv").set_index("store_nbr")

# Create promotion dataset
promo_2017 = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017.columns = promo_2017.columns.get_level_values(1)

# Transform sales training data
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

df_2017_index = df_2017.index

# transform items dataset
items['class'] = items['class'].astype('category')
items = pd.get_dummies(items)
items = items.reindex(df_2017.index.get_level_values(1))

# transform 
stores['cluster'] = stores.cluster.astype('category')
stores = pd.get_dummies(stores)
stores = stores.reindex(df_2017.index.get_level_values(0))

In [3]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    X = {
        
        # Number of promotion in the last x days 
        "promo_7_2017": get_timespan(promo_df, t2017, 7, 7).sum(axis=1).values,
        "promo_14_2017": get_timespan(promo_df, t2017, 14, 14).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_df, t2017, 30, 30).sum(axis=1).values,
        
        # Number of promotion in the next x days of reference date
        "promo_3_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=15), 14, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=15), 14, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=15), 14, 14).sum(axis=1).values,
    }
    
# Removed due to the presence of nan values 
#     for i in [3, 7, 14, 30]:
#         tmp1 = get_timespan(df, t2017, i, i)
#         tmp2 = (get_timespan(promo_df, t2017, i, i) > 0) * 1

#         X['has_promo_mean_%s' % i] = (tmp1 * tmp2.replace(0, np.nan)).mean(axis=1).values
#         X['no_promo_mean_%s' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan)).mean(axis=1).values
    
    for i in [3, 7, 14, 30]:
        tmp = get_timespan(df, t2017, i, i)
        # mean daily difference in sales in the last x days
        X[f'diff_{i}_mean'] = tmp.diff(axis=1).mean(axis=1).values
        # mean sales in the last x days
        X[f'mean_{i}'] = tmp.mean(axis=1).values
        # median sales in the last x days
        X[f'median_{i}'] = tmp.median(axis=1).values
        # min sales in the last x days
        X[f'min_{i}'] = tmp.min(axis=1).values
        # max sales in the last x days
        X[f'max_{i}'] = tmp.max(axis=1).values
        # std dev sales in the last x days
        X[f'std_{i}'] = tmp.std(axis=1).values

    for i in [7, 14, 30]:
        tmp = get_timespan(df, t2017, i, i)
        # number of days with sales in the last x days
        X[f'has_sales_days_in_last_{i}'] = (tmp > 0).sum(axis=1).values
        # last day with sales in the last x days
        X[f'last_has_sales_day_in_last_{i}'] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        # first of days with sales in the last x days
        X[f'first_has_sales_day_in_last_{i}'] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

        tmp = get_timespan(promo_df, t2017, i, i)
        # number of days with promotions in the last x days
        X[f'has_promo_days_in_last_{i}'] = (tmp > 0).sum(axis=1).values
        # last day has promotion in the last x days
        X[f'last_has_promo_day_in_last_{i}'] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        # first day has promotion in the last x days
        X[f'first_has_promo_day_in_last_{i}'] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    tmp = get_timespan(promo_df, t2017 + timedelta(days=15), 14, 14)
    # last day that has promotion in the next 14 days (8/2 to 8/15)
    # Count backwards: if last day is 8/15, then value = 1
    # if last day is 8/2, then value = 14
    X['last_has_promo_day_in_after_14_days'] = 14 - ((tmp > 0) * np.arange(14)).max(axis=1).values
    # first day that has promotion in the next 14 days (8/2 to 8/15)
    X['first_has_promo_day_in_after_14_days'] = ((tmp > 0) * np.arange(14, 0, -1)).max(axis=1).values

    # sale on day x days from reference date 
    for i in range(1, 15):
        X[f'day_{i}_2017'] = get_timespan(df, t2017, i, 1).values.ravel()
    
    # average sales on day of the week for the last 4 or 20 weeks
    for i in range(7):
        X[f'mean_4_dow{i}_2017'] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X[f'mean_20_dow{i}_2017'] = get_timespan(df, t2017, 140-i, 20, freq='7D').mean(axis=1).values        
    
    # promotion status of each day 14 days before and 14 days after the reference date
    
    for i in range(-14, 15):
        X[f'promo_{i}'] = promo_df[t2017 + timedelta(days=i)].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if name_prefix is not None:
        X.columns = [f'{name_prefix}_{c}' for c in X.columns]
    
    if is_train:
        y = df[pd.date_range(t2017, periods=15)].values
        return X, y
    
    return X


In [4]:
#create training data

t2017 = date(2017, 5, 30)
num_days = 4
X_l, y_l = [], []
for i in tqdm(range(num_days)):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(df_2017, promo_2017, t2017 + delta)

    X_tmp = pd.concat([X_tmp, items.reset_index(drop=True), stores.reset_index(drop=True)], axis=1)
    X_l.append(X_tmp)
    y_l.append(y_tmp)

X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

del X_l, y_l
gc.collect()

100%|██████████| 4/4 [00:09<00:00,  2.36s/it]


22

In [5]:
#create training data

X_val, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 11))

X_val = pd.concat([X_val, items.reset_index(drop=True), stores.reset_index(drop=True)], axis=1)

X_test, y_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 1))

X_test = pd.concat([X_test, items.reset_index(drop=True), stores.reset_index(drop=True)], axis=1)

In [6]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(670060, 538) (670060, 15)
(167515, 538) (167515, 15)
(167515, 538) (167515, 15)


In [7]:
del df_2017, promo_2017
gc.collect()

0

In [8]:
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val, X_test]))
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_test[:] = scaler.transform(X_test)

In [9]:
X_train = X_train.values
X_test = X_test.values
X_val = X_val.values

## MLP Model 3

256 x 128 x 64 x 32 x 16 x 1

In [10]:
def build_model():
    model = Sequential()

    model.add(Dense(256, input_dim=X_train.shape[1]))
    model.add(PReLU())
    model.add(BatchNormalization())

    model.add(Dense(128, input_dim=X_train.shape[1]))
    model.add(PReLU())
    model.add(BatchNormalization())
    
    model.add(Dense(64))
    model.add(PReLU())
    model.add(BatchNormalization())

    model.add(Dense(32))
    model.add(PReLU())
    model.add(BatchNormalization())

    model.add(Dense(16))
    model.add(PReLU())
    model.add(BatchNormalization())

    model.add(Dense(1))
    return model

In [11]:
N_EPOCHS = 2000

val_pred = []
test_pred = []
sample_weights=np.array( pd.concat([items["perishable"]] * num_days) * 0.25 + 1 )
for i in range(15):
    print("=" * 50)
    print(f'Step {i+1}')
    print("=" * 50)
    y = y_train[:, i]
    y_mean = y.mean()
    xv = X_val
    yv = y_val[:, i]
    model = build_model()

    opt = optimizers.Adam(lr=0.001)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')
        ]
    
    #smaller batch size runs faster
    #batch_size = 65536
    batch_size = 8192

    model.fit(X_train, y - y_mean, batch_size = batch_size, epochs = N_EPOCHS, verbose=2,
               sample_weight=sample_weights, validation_data=(xv,yv-y_mean), callbacks=callbacks )
    val_pred.append(model.predict(X_val) + y_mean)
    test_pred.append(model.predict(X_test) + y_mean)



Step 1
Train on 670060 samples, validate on 167515 samples
Epoch 1/2000
 - 10s - loss: 0.5186 - mse: 0.4869 - val_loss: 0.4748 - val_mse: 0.4748
Epoch 2/2000
 - 9s - loss: 0.3538 - mse: 0.3337 - val_loss: 0.3442 - val_mse: 0.3442
Epoch 3/2000
 - 9s - loss: 0.3383 - mse: 0.3195 - val_loss: 0.3352 - val_mse: 0.3352
Epoch 4/2000
 - 9s - loss: 0.3302 - mse: 0.3121 - val_loss: 0.3259 - val_mse: 0.3259
Epoch 5/2000
 - 9s - loss: 0.3245 - mse: 0.3069 - val_loss: 0.3236 - val_mse: 0.3236
Epoch 6/2000
 - 9s - loss: 0.3207 - mse: 0.3034 - val_loss: 0.3212 - val_mse: 0.3212
Epoch 7/2000
 - 9s - loss: 0.3174 - mse: 0.3003 - val_loss: 0.3206 - val_mse: 0.3206
Epoch 8/2000
 - 9s - loss: 0.3148 - mse: 0.2979 - val_loss: 0.3194 - val_mse: 0.3194
Epoch 9/2000
 - 9s - loss: 0.3121 - mse: 0.2954 - val_loss: 0.3180 - val_mse: 0.3180
Epoch 10/2000
 - 9s - loss: 0.3097 - mse: 0.2932 - val_loss: 0.3181 - val_mse: 0.3181
Epoch 11/2000
 - 9s - loss: 0.3079 - mse: 0.2915 - val_loss: 0.3210 - val_mse: 0.3210
Epo

In [12]:
weight = items["perishable"] * 0.25 + 1
val_err = (y_val - np.array(val_pred).squeeze(axis=2).transpose())**2
val_err = val_err.sum(axis=1) * weight
#change to 15 days
val_err = np.sqrt(val_err.sum() / weight.sum() / 15)
print(f'validation nwrmsle = {val_err}')

validation nwrmsle = 0.6098121112836306


In [13]:
test_err = (y_test - np.array(test_pred).squeeze(axis=2).transpose())**2
test_err = test_err.sum(axis=1) * weight
#change to 15 days
test_err = np.sqrt(test_err.sum() / weight.sum() / 15)
print(f'test nwrmsle = {test_err}')

test nwrmsle = 0.622227862622076


In [14]:
df_pred_test = pd.DataFrame(np.array(test_pred).squeeze(axis=2).transpose(), 
                            index = df_2017_index, columns = pd.date_range('2017-08-01',periods=15))

In [15]:
out_path = '../model_results/2020-01-08/'

In [16]:
df_pred_test.to_csv(out_path + 'nn_test_pred_model_3.csv')