In [1]:
import pandas as pd
import numpy as np
import datetime
from pandas_summary import DataFrameSummary

In [2]:
df = pd.read_feather('train_normalized_data.fth')
df_test = pd.read_feather('test_normalized_data.fth')

In [3]:
df_train = df[df.Date < datetime.datetime(2015, 7, 1)]  
df_val = df[df.Date >= datetime.datetime(2015, 7, 1)]

In [4]:
final_train = False

In [5]:
def get_metric(sales, sales_):
    return (((sales - sales_)/sales)**2).sum()/len(sales)

$$
\textrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$

In [42]:
df_train.columns

Index(['index', 'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open',
       'Promo', 'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Week',
       'Day', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
       'CompetitionOpenSince', 'Promo2Since', 'State', 'file', 'week', 'trend',
       'Date_y', 'Month_y', 'Day_y', 'file_DE', 'week_DE', 'trend_DE',
       'Date_DE', 'State_DE', 'Month_DE', 'Day_DE', 'file_y',
       'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
       'Dew_PointC', 'MeanDew_PointC', 'Min_DewpointC', 'Max_Humidity',
       'Mean_Humidity', 'Min_Humidity', 'Max_Sea_Level_PressurehPa',
       'Mean_Sea_Level_PressurehPa', 'Min_Sea_Level_PressurehPa',
       'Max_VisibilityKm', 'Mean_VisibilityKm', 'Min_VisibilitykM',
       'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'Max_Gust_SpeedKm_h',
       'Precipitatio

In [71]:
df_train['Events']

30188     20
30189     20
30190      0
30191     20
30192     20
          ..
844333    10
844334    10
844335    10
844336     1
844337    10
Name: Events, Length: 814150, dtype: int64

In [6]:
max_sales = df_train['Sales'].max()
df.loc[:, 'Sales_norm'] = df['Sales'].values/max_sales

In [7]:
df_train.loc[:, 'Sales_norm'] = df_train['Sales'].values/max_sales
df_val.loc[:, 'Sales_norm'] = df_val['Sales'].values/max_sales

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [8]:
print('Train:')
print(get_metric(df_train['Sales_norm'], df_train['Sales_norm'].mean()))
print('Val:')
get_metric(df_val['Sales_norm'], df_train['Sales_norm'].mean())

Train:
0.4059077987085825
Val:


0.30213969475255925

In [46]:
def get_mean_by_column(column, sales_str):
    stores_mean = {}
    stores_mean_list = []
    for store, g_df in df_train.groupby(column):
        store_m =  g_df[g_df[sales_str] > 0][sales_str].mean()
        stores_mean[store] = store_m
        stores_mean_list.append(store_m)
    print('Train:', get_metric(df_train[sales_str], df_train[column].apply(stores_mean.get)))
    print('Val:', get_metric(df_val[sales_str], df_val[column].apply(stores_mean.get)))
    return stores_mean, stores_mean_list

In [47]:
_ = get_mean_by_column('Store', 'Sales_norm')

Train: 0.15688835922756375
Val: 0.09435624012794681


In [48]:
_ = get_mean_by_column('DayOfWeek', 'Sales_norm')

Train: 0.36106577837306253
Val: 0.2561769086693393


In [50]:
_ = get_mean_by_column('Month', 'Sales_norm')

Train: 0.4002726878411708
Val: 0.2964421798587632


In [51]:
_ = get_mean_by_column('Week', 'Sales_norm')

Train: 0.3693486112676764
Val: 0.2674408728623329


In [80]:
_ = get_mean_by_column('StateHoliday', 'Sales_norm')

Train: 0.4070020265579082
Val: 0.30186165673507453


In [81]:
_ = get_mean_by_column('SchoolHoliday', 'Sales_norm')

Train: 0.4034957752536508
Val: 0.3050227258443403


In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate, Dense, BatchNormalization, Activation, LeakyReLU, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras import optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical

In [13]:
def rmspe(y_true, y_pred):
    return K.mean(K.square((y_true - y_pred)/y_true))

In [63]:
embed_outs_dict = {'Store': 50, 'DayOfWeek': 2, 'Week': 10, 'Month': 4}
X_columns = list(embed_outs_dict.keys())
if final_train:
    X_train = np.hsplit(df[X_columns].values, len(X_columns))
    y_train = df['Sales_norm']
else:
    X_train = np.hsplit(df_train[X_columns].values, len(X_columns))
    y_train = df_train['Sales_norm']
    
X_val = np.hsplit(df_val[X_columns].values, len(X_columns))
X_test = np.hsplit(df_test[X_columns].values, len(X_columns))

y_val = df_val['Sales_norm']

In [64]:
#X_train.shape, X_val.shape, len(np.unique(X_train))

In [82]:
embed_outs = []
inputs = []
hidden_units = 20
activation = 'relu'
for i, col in enumerate(X_columns):
    inp = Input(shape=(1,), name=f"{col}_input")
    inputs.append(inp)
    embed_out = Embedding(len(np.unique(X_train[i])), embed_outs_dict[col], name=f"{col}_embedding", mask_zero=False)(inp)
    out = Flatten(name=f"{col}_flatten")(embed_out)
    embed_outs.append(out)
if len(X_columns)>1:
    concat_out = Concatenate()(embed_outs)
    dense_out = Dense(hidden_units, activation=activation)(concat_out)
else:
    dense_out = Dense(hidden_units, activation=activation)(out)
out = Dense(1)(dense_out)
model = Model(inputs, out)
model.compile(optimizers.Adam(lr=0.0001), loss='mse', metrics=[rmspe, 'mse'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Store_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
DayOfWeek_input (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
Week_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
Month_input (InputLayer)        [(None, 1)]          0                                            
____________________________________________________________________________________________

In [83]:
# weights = model.get_weights()
# weights[0] = np.array(np.array(stores_mean_list).reshape(-1, 1))
# model.set_weights(weights)

In [84]:
model.evaluate(X_val, y_val)



[0.02367066778242588, 0.6465886235237122, 0.02367066778242588]

In [85]:
epochs = 20
if final_train:
    model.fit(X_train, y_train, epochs=epochs)
else:
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
model.evaluate(X_train, y_train)



[0.0008353318553417921, 0.061621442437171936, 0.0008353318553417921]

In [70]:
model.evaluate(X_val, y_val)



[0.0006676019984297454, 0.02359057404100895, 0.0006676019984297454]

In [59]:
train_predictions = model.predict(X_train)*max_sales
get_metric(df_train['Sales'].values, train_predictions.reshape(-1))

0.06149980579432537

In [60]:
test_predictions = model.predict(X_test)*max_sales
test_predictions[df_test['Open'] == 0] = 0

In [61]:
sample_csv = pd.read_csv('dataset/rossmann/sample_submission.csv')
sample_csv['Sales'] = test_predictions
sample_csv.head()

sample_csv.to_csv(f'submision_baseline_{"-".join(X_columns)}.csv', index=False)
