> **Problem overview**

This competition is provided as a way to explore different time series techniques on a relatively simple and clean dataset. You are given 5 years of store-item sales data, and asked to predict 3 months of sales for 50 different items at 10 different stores.

What's the best way to deal with seasonality? Should stores be modeled separately, or can you pool them together? Does deep learning work better than ARIMA? Can either beat xgboost?

In [None]:
# import data manipulation library
import numpy as np
import pandas as pd

# import data visualization library
import matplotlib.pyplot as plt
import seaborn as sns

# import scientific computing library
import statsmodels.api as sm

# import xgboost model class
import xgboost as xgb

# import sklearn model selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# import sklearn model evaluation regression metrics
from sklearn.metrics import mean_squared_error

> **Acquiring training and testing data**

We start by acquiring the training and testing datasets into Pandas DataFrames.

In [None]:
# load the training and testing CSV files; the 'date' column is parsed and used as the index
df_train, df_test = (
    pd.read_csv(csv_path, parse_dates=['date'], index_col='date')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# preview the first five rows of the training data
df_train.head(5)

In [None]:
# preview the last five rows of the testing data
df_test.tail(5)

In [None]:
# tag each frame with its origin, give both the same columns, then stack them into one frame
df_train['datatype'] = 'training'
df_test['datatype'] = 'testing'
# the training frame receives a placeholder id of 0 as its first column
df_train.insert(0, 'id', 0)
# the testing frame receives an empty sales column just before its last column
df_test.insert(len(df_test.columns) - 1, 'sales', np.nan)
# the date index of both frames is kept as-is
df_data = pd.concat([df_train, df_test])

> **Feature exploration, engineering and cleansing**

Here we generate descriptive statistics that summarize the central tendency, dispersion and shape of the dataset's distribution, and explore some of the data.

In [None]:
# countplot function plot - categorical variable (x-axis) vs. categorical variable (y-axis)
def countplot(x=None, y=None, data=None, ncols=5, nrows=3):
    """Draw one seaborn count plot per entry of ``x`` on a subplot grid.

    Args:
        x: iterable of column names; each is passed as ``x=`` to ``sns.countplot``.
        y: passed as ``hue=`` to every plot.
        data: passed as ``data=`` to every plot.
        ncols: number of grid columns.
        nrows: number of grid rows.

    Raises:
        IndexError: if ``x`` has more entries than the grid has cells.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    for i, column in enumerate(x):
        sns.countplot(x=column, hue=y, data=data, ax=axes[i])

In [None]:
# boxplot function plot - categorical variable (x-axis) vs. numerical variable (y-axis)
def boxplot(cat=None, num=None, data=None, ncols=5, nrows=3):
    """Draw seaborn box plots on a subplot grid.

    When ``cat`` is a list, one plot is drawn per entry of ``cat`` (x-axis)
    against the single column ``num`` (y-axis). Otherwise one plot is drawn
    per entry of ``num`` (y-axis) against the single column ``cat`` (x-axis).

    Args:
        cat: a column name, or a list of column names, for the x-axis.
        num: a column name, or an iterable of column names, for the y-axis.
        data: passed as ``data=`` to every plot.
        ncols: number of grid columns.
        nrows: number of grid rows.

    Raises:
        IndexError: if more plots are requested than the grid has cells.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    if isinstance(cat, list):
        for i, column in enumerate(cat):
            sns.boxplot(x=column, y=num, data=data, ax=axes[i])
    else:
        for i, column in enumerate(num):
            sns.boxplot(x=cat, y=column, data=data, ax=axes[i])

In [None]:
# boxplot function sorted plot - categorical variable (x-axis) vs. numerical variable (y-axis)
def boxplotsort(cat=None, num=None, data=None, ncols=5, nrows=3, orderby='median'):
    """Draw seaborn box plots whose categories are sorted by an aggregate.

    When ``cat`` is a list, one plot is drawn per entry of ``cat`` (x-axis)
    against the single column ``num`` (y-axis). Otherwise one plot is drawn
    per entry of ``num`` (y-axis) against the single column ``cat`` (x-axis).
    In both cases the x-axis categories are ordered ascending by the
    ``orderby`` aggregate of the plotted numeric column.

    Args:
        cat: a column name, or a list of column names, for the x-axis.
        num: a column name, or an iterable of column names, for the y-axis.
        data: DataFrame holding the columns; also used to compute the order.
        ncols: number of grid columns.
        nrows: number of grid rows.
        orderby: aggregation handed to ``DataFrame.agg`` (default ``'median'``).

    Raises:
        IndexError: if more plots are requested than the grid has cells.
    """
    def ordered_levels(group_col, value_col):
        # category levels sorted ascending by the chosen aggregate of the numeric column
        return data.groupby([group_col], as_index=True).agg({value_col: orderby}).sort_values(value_col).index

    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    if isinstance(cat, list):
        for i, column in enumerate(cat):
            sns.boxplot(x=column, y=num, data=data, ax=axes[i], order=ordered_levels(column, num))
    else:
        for i, column in enumerate(num):
            sns.boxplot(x=cat, y=column, data=data, ax=axes[i], order=ordered_levels(cat, column))

In [None]:
# swarmplot function plot - categorical variable (x-axis) vs. numerical variable (y-axis)
def swarmplot(cat=None, num=None, data=None, ncols=5, nrows=3):
    """Draw seaborn swarm plots on a subplot grid.

    When ``cat`` is a list, one plot is drawn per entry of ``cat`` (x-axis)
    against the single column ``num`` (y-axis). Otherwise one plot is drawn
    per entry of ``num`` (y-axis) against the single column ``cat`` (x-axis).

    Args:
        cat: a column name, or a list of column names, for the x-axis.
        num: a column name, or an iterable of column names, for the y-axis.
        data: passed as ``data=`` to every plot.
        ncols: number of grid columns.
        nrows: number of grid rows.

    Raises:
        IndexError: if more plots are requested than the grid has cells.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    if isinstance(cat, list):
        for i, column in enumerate(cat):
            sns.swarmplot(x=column, y=num, data=data, ax=axes[i])
    else:
        for i, column in enumerate(num):
            sns.swarmplot(x=cat, y=column, data=data, ax=axes[i])

In [None]:
# violinplot function plot - categorical variable (x-axis) vs. numerical variable (y-axis)
def violinplot(cat=None, num=None, data=None, ncols=5, nrows=3):
    """Draw seaborn violin plots on a subplot grid.

    When ``cat`` is a list, one plot is drawn per entry of ``cat`` (x-axis)
    against the single column ``num`` (y-axis). Otherwise one plot is drawn
    per entry of ``num`` (y-axis) against the single column ``cat`` (x-axis).

    Args:
        cat: a column name, or a list of column names, for the x-axis.
        num: a column name, or an iterable of column names, for the y-axis.
        data: passed as ``data=`` to every plot.
        ncols: number of grid columns.
        nrows: number of grid rows.

    Raises:
        IndexError: if more plots are requested than the grid has cells.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    if isinstance(cat, list):
        for i, column in enumerate(cat):
            sns.violinplot(x=column, y=num, data=data, ax=axes[i])
    else:
        for i, column in enumerate(num):
            sns.violinplot(x=cat, y=column, data=data, ax=axes[i])

In [None]:
# violinplot function sorted plot - categorical variable (x-axis) vs. numerical variable (y-axis)
def violinplotsort(cat=None, num=None, data=None, ncols=5, nrows=3, orderby='median'):
    """Draw seaborn violin plots whose categories are sorted by an aggregate.

    When ``cat`` is a list, one plot is drawn per entry of ``cat`` (x-axis)
    against the single column ``num`` (y-axis). Otherwise one plot is drawn
    per entry of ``num`` (y-axis) against the single column ``cat`` (x-axis).
    In both cases the x-axis categories are ordered ascending by the
    ``orderby`` aggregate of the plotted numeric column.

    Args:
        cat: a column name, or a list of column names, for the x-axis.
        num: a column name, or an iterable of column names, for the y-axis.
        data: DataFrame holding the columns; also used to compute the order.
        ncols: number of grid columns.
        nrows: number of grid rows.
        orderby: aggregation handed to ``DataFrame.agg`` (default ``'median'``).

    Raises:
        IndexError: if more plots are requested than the grid has cells.
    """
    def ordered_levels(group_col, value_col):
        # category levels sorted ascending by the chosen aggregate of the numeric column
        return data.groupby([group_col], as_index=True).agg({value_col: orderby}).sort_values(value_col).index

    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    if isinstance(cat, list):
        for i, column in enumerate(cat):
            sns.violinplot(x=column, y=num, data=data, ax=axes[i], order=ordered_levels(column, num))
    else:
        for i, column in enumerate(num):
            sns.violinplot(x=cat, y=column, data=data, ax=axes[i], order=ordered_levels(cat, column))

In [None]:
# scatterplot function plot - numerical variable (x-axis) vs. numerical variable (y-axis)
def scatterplot(x=None, y=None, data=None, ncols=5, nrows=3):
    """Draw one seaborn scatter plot per entry of ``x`` on a subplot grid.

    Args:
        x: iterable of column names; each is passed as ``x=`` to ``sns.scatterplot``.
        y: column passed as ``y=`` to every plot.
        data: passed as ``data=`` to every plot.
        ncols: number of grid columns.
        nrows: number of grid rows.

    Raises:
        IndexError: if ``x`` has more entries than the grid has cells.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so flatten() always works
    _, axes = plt.subplots(figsize=(4 * ncols, 3 * nrows), ncols=ncols, nrows=nrows, squeeze=False)
    axes = axes.flatten()
    for i, column in enumerate(x):
        sns.scatterplot(x=column, y=y, data=data, ax=axes[i])

In [None]:
# describe training and testing data
# summary statistics of the combined frame; include='all' requests every column, not only numeric ones
df_data.describe(include='all')

In [None]:
# feature exploration: histogram of all numeric features
# 20 bins per histogram; the return value is assigned to _ so the notebook does not echo it
_ = df_data.hist(bins=20, figsize=(10, 6))

In [None]:
# feature exploration: training sales over time of item 1, one figure per store (1 to 10)
is_item1_training = (df_data['item'] == 1) & (df_data['datatype'] == 'training')
for store_id in range(1, 11):
    fig, axes = plt.subplots(figsize=(20, 3))
    store_sales = df_data.loc[(df_data['store'] == store_id) & is_item1_training, 'sales']
    _ = store_sales.plot()
    axes.set_title('store %d, item %d' % (store_id, 1))

In [None]:
# feature exploration: seasonal decomposition of the training sales for store 5, item 1
is_store5_item1_training = (df_data['store'] == 5) & (df_data['item'] == 1) & (df_data['datatype'] == 'training')
decomposition = sm.tsa.seasonal_decompose(df_data.loc[is_store5_item1_training, 'sales'])
# the object returned by .plot() is resized below before being shown
seasonal = decomposition.plot()
seasonal.set_figwidth(20)
seasonal.set_figheight(15)
plt.tight_layout()
plt.show()

In [None]:
# feature extraction: calendar features derived from the datetime index
df_data['date'] = df_data.index
df_data['year'] = df_data['date'].dt.year - 2000  # year expressed as an offset from 2000
df_data['quarter'] = df_data['date'].dt.quarter
df_data['month'] = df_data['date'].dt.month
# Series.dt.weekofyear is not available in recent pandas releases, so prefer
# isocalendar() when it exists and fall back to the legacy accessor otherwise.
# The cast keeps a plain int64 column instead of the nullable type isocalendar() yields.
if hasattr(df_data['date'].dt, 'isocalendar'):
    df_data['weekofyear'] = df_data['date'].dt.isocalendar().week.astype('int64')
else:
    df_data['weekofyear'] = df_data['date'].dt.weekofyear
df_data['dayofweek'] = df_data['date'].dt.dayofweek

In [None]:
# feature extraction: mean sales per quarter, grouped by item, by store, and by store + item
# each group mean is broadcast back to every row of its group
quarter_mean_keys = {
    'item_quarter_mean': ['quarter', 'item'],
    'store_quarter_mean': ['quarter', 'store'],
    'store_item_quarter_mean': ['quarter', 'store', 'item'],
}
for column, keys in quarter_mean_keys.items():
    df_data[column] = df_data.groupby(keys)['sales'].transform('mean')

In [None]:
# feature extraction: mean sales per month, grouped by item, by store, and by store + item
# each group mean is broadcast back to every row of its group
month_mean_keys = {
    'item_month_mean': ['month', 'item'],
    'store_month_mean': ['month', 'store'],
    'store_item_month_mean': ['month', 'store', 'item'],
}
for column, keys in month_mean_keys.items():
    df_data[column] = df_data.groupby(keys)['sales'].transform('mean')

In [None]:
# feature extraction: mean sales per week of year, grouped by item, by store, and by store + item
# each group mean is broadcast back to every row of its group
weekofyear_mean_keys = {
    'item_weekofyear_mean': ['weekofyear', 'item'],
    'store_weekofyear_mean': ['weekofyear', 'store'],
    'store_item_weekofyear_mean': ['weekofyear', 'store', 'item'],
}
for column, keys in weekofyear_mean_keys.items():
    df_data[column] = df_data.groupby(keys)['sales'].transform('mean')

In [None]:
# feature extraction: mean sales per day of week, grouped by item, by store, and by store + item
# each group mean is broadcast back to every row of its group
dayofweek_mean_keys = {
    'item_dayofweek_mean': ['dayofweek', 'item'],
    'store_dayofweek_mean': ['dayofweek', 'store'],
    'store_item_dayofweek_mean': ['dayofweek', 'store', 'item'],
}
for column, keys in dayofweek_mean_keys.items():
    df_data[column] = df_data.groupby(keys)['sales'].transform('mean')

In [None]:
# feature extraction: shifted features for store, item and weekofyear shift 90 days
# Lag the sales series by 90 rows within each (store, item) group.
# NOTE(review): shift() moves by row position, so this is a 90-day lag only if every
# group holds exactly one row per day in date order - confirm against the input files.
df_data['store_item_shift90'] = df_data.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(90))
# For each (weekofyear, item) / (weekofyear, store) group: shift the group's sales by 13 rows,
# take the mean of the shifted values, and broadcast that single value to every row of the group.
# NOTE(review): these groups pool several years (and stores or items), so 13 rows is
# presumably not the same as 13 weeks - verify this matches the intended feature.
df_data['item_weekofyear_shift90_mean'] = df_data.groupby(['weekofyear', 'item'])['sales'].transform(lambda x: x.shift(13).mean())
df_data['store_weekofyear_shift90_mean'] = df_data.groupby(['weekofyear', 'store'])['sales'].transform(lambda x: x.shift(13).mean())

In [None]:
# feature extraction: shifted features for store, item and weekofyear shift 180 days
# Lag the sales series by 180 rows within each (store, item) group.
# NOTE(review): shift() moves by row position, so this is a 180-day lag only if every
# group holds exactly one row per day in date order - confirm against the input files.
df_data['store_item_shift180'] = df_data.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(180))
# For each (weekofyear, item) / (weekofyear, store) group: shift the group's sales by 26 rows,
# take the mean of the shifted values, and broadcast that single value to every row of the group.
# NOTE(review): these groups pool several years (and stores or items), so 26 rows is
# presumably not the same as 26 weeks - verify this matches the intended feature.
df_data['item_weekofyear_shift180_mean'] = df_data.groupby(['weekofyear', 'item'])['sales'].transform(lambda x: x.shift(26).mean())
df_data['store_weekofyear_shift180_mean'] = df_data.groupby(['weekofyear', 'store'])['sales'].transform(lambda x: x.shift(26).mean())

In [None]:
# feature extraction: shifted features for store, item and weekofyear shift 270 days
# Lag the sales series by 270 rows within each (store, item) group.
# NOTE(review): shift() moves by row position, so this is a 270-day lag only if every
# group holds exactly one row per day in date order - confirm against the input files.
df_data['store_item_shift270'] = df_data.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(270))
# For each (weekofyear, item) / (weekofyear, store) group: shift the group's sales by 39 rows,
# take the mean of the shifted values, and broadcast that single value to every row of the group.
# NOTE(review): these groups pool several years (and stores or items), so 39 rows is
# presumably not the same as 39 weeks - verify this matches the intended feature.
df_data['item_weekofyear_shift270_mean'] = df_data.groupby(['weekofyear', 'item'])['sales'].transform(lambda x: x.shift(39).mean())
df_data['store_weekofyear_shift270_mean'] = df_data.groupby(['weekofyear', 'store'])['sales'].transform(lambda x: x.shift(39).mean())

In [None]:
# feature extraction: shifted features for store, item and weekofyear shift 365 days
# Lag the sales series by 365 rows within each (store, item) group.
# NOTE(review): shift() moves by row position, so this is a 365-day lag only if every
# group holds exactly one row per day in date order - confirm against the input files.
df_data['store_item_shift365'] = df_data.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(365))
# For each (weekofyear, item) / (weekofyear, store) group: shift the group's sales by 52 rows,
# take the mean of the shifted values, and broadcast that single value to every row of the group.
# NOTE(review): these groups pool several years (and stores or items), so 52 rows is
# presumably not the same as 52 weeks - verify this matches the intended feature.
df_data['item_weekofyear_shift365_mean'] = df_data.groupby(['weekofyear', 'item'])['sales'].transform(lambda x: x.shift(52).mean())
df_data['store_weekofyear_shift365_mean'] = df_data.groupby(['weekofyear', 'store'])['sales'].transform(lambda x: x.shift(52).mean())

In [None]:
# feature extraction: rows without enough history have no lagged value; store 0 there instead of NaN
col_fillnas = [
    'store_item_shift90',
    'store_item_shift180',
    'store_item_shift270',
    'store_item_shift365',
]
lag_values = df_data[col_fillnas]
df_data[col_fillnas] = lag_values.fillna(value=0)

In [None]:
# feature exploration: scatter every numeric column except 'id' against sales, training rows only
col_number = df_data.select_dtypes(include=['number']).columns.drop(['id']).tolist()
training_rows = df_data[df_data['datatype'] == 'training']
# number of grid rows needed to hold every column at five plots per row
grid_rows = (len(col_number) - 1) // 5 + 1
scatterplot(x=col_number, y='sales', data=training_rows, nrows=grid_rows)

After extracting all features, we need to convert the categorical features to numeric features, a format suitable for feeding into our Machine Learning models.

In [None]:
# feature extraction: fillna with 0
# the testing rows were given NaN sales when the frames were combined; replace those with 0
df_data['sales'] = df_data['sales'].fillna(0)

In [None]:
# convert category codes for data dataframe
# one-hot encode the non-numeric columns (columns=None leaves the choice to pandas) and drop
# the first level of each; later cells rely on the resulting 'datatype_training' indicator column
df_data = pd.get_dummies(df_data, columns=None, drop_first=True)

In [None]:
# describe data dataframe
# summary statistics of the encoded frame; include='all' requests every column, not only numeric ones
df_data.describe(include='all')

In [None]:
# verify dtypes object
# print the column dtypes and non-null counts to check that no object-typed column is left
df_data.info()

> **Analyze and identify patterns by visualizations**

Let us generate some correlation plots of the features to see how related one feature is to the next. To do so, we will utilize the Seaborn plotting package which allows us to plot very conveniently as follows.

The Pearson correlation plot shows how strongly the features are correlated with one another. If no features are strongly correlated with each other, there isn't much redundant or superfluous data in our training data. This plot is also useful for determining which features are correlated with the observed value.

Pair plots are also useful for observing the distribution of the training data from one feature to another.

The pivot table is also another useful method to observe the impact between features.

In [None]:
# compute pairwise correlation of columns, excluding NA/null values and present through heat map
# Restrict the training rows to numeric/boolean columns before calling corr(): the frame also
# holds the datetime 'date' column.
# NOTE(review): recent pandas releases reportedly no longer drop non-numeric columns
# inside corr() implicitly - the explicit selection keeps this cell working either way.
training_numeric = df_data[df_data['datatype_training'] == 1].select_dtypes(include=['number', 'bool'])
corr = training_numeric.corr()
fig, axes = plt.subplots(figsize=(200, 150))
# draw on the axes created above instead of relying on the implicit current axes
heatmap = sns.heatmap(corr, annot=True, cmap=plt.cm.RdBu, fmt='.1f', square=True, vmin=-0.8, vmax=0.8, ax=axes)

> **Model, predict and solve the problem**

Now, it is time to feed the features to Machine Learning models.

In [None]:
# symmetric mean absolute percentage error (mape) function
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Each pair contributes ``2 * |pred - true| / (|pred| + |true|)``; a pair
    whose actual and predicted values are both zero contributes 0. The result
    is the mean over all pairs multiplied by 100, so it lies in ``[0, 200]``.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The SMAPE as a float, or NaN when the inputs are empty.
    """
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    if y_true.size == 0:
        # nothing to average; return NaN explicitly instead of triggering a numpy warning
        return float('nan')
    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_pred) + np.abs(y_true)
    # Only pairs with a non-zero denominator have a defined ratio. The remaining
    # pairs (both values zero) add 0 to the sum but still count in the mean, so
    # zero-sales rows and non-positive predictions are scored rather than dropped.
    defined = denominator > 0
    return (numerator[defined] / denominator[defined]).sum() / y_true.size * 100

In [None]:
# symmetric mean absolute percentage error (mape) scoring function
def symmetric_mean_absolute_percentage_error_scoring(model, x, y):
    """Predict ``x`` with ``model`` and score the predictions against ``y``.

    Args:
        model: object exposing a ``predict(x)`` method.
        x: features handed to ``model.predict``.
        y: observed values the predictions are compared with.

    Returns:
        The value of ``symmetric_mean_absolute_percentage_error(y, model.predict(x))``.
    """
    return symmetric_mean_absolute_percentage_error(y, model.predict(x))

In [None]:
# select all features: every column of the training rows except identifiers, the target and the date
is_training = df_data['datatype_training'] == 1
non_feature_columns = ['id', 'sales', 'date', 'datatype_training']
x = df_data[is_training].drop(non_feature_columns, axis=1)
y = df_data.loc[is_training, 'sales']

In [None]:
# perform train-test (validate) split
# hold out 25% of the training rows for validation; random_state fixes the split
# NOTE(review): the split is random rather than chronological, so validation rows can be
# older than training rows - confirm this is acceptable for a forecasting task.
x_train, x_validate, y_train, y_validate = train_test_split(x, y, random_state=58, test_size=0.25)

In [None]:
# xgboost regression model setup
# tree booster with up to 1000 estimators; the fit call below may stop earlier
# NOTE(review): 'reg:linear' is reportedly deprecated in newer xgboost releases in favour of
# 'reg:squarederror' - confirm against the installed version before changing it.
model_xgbreg = xgb.XGBRegressor(booster='gbtree', learning_rate=0.1, n_estimators=1000, objective='reg:linear', random_state=58,
                                colsample_bytree=0.9, max_depth=5, reg_alpha=0.1, reg_lambda=0.9, subsample=0.9)

# xgboost regression model fit
# the validation split is used as the early-stopping evaluation set (50 rounds of patience);
# the callback is configured with period=50 while the built-in verbose output is switched off
# NOTE(review): xgb.callback.print_evaluation and passing early_stopping_rounds to fit() are
# presumably unavailable in recent xgboost releases - verify before upgrading.
model_xgbreg.fit(x_train, y_train, early_stopping_rounds=50, eval_set=[(x_validate, y_validate)], verbose=False,
                 callbacks=[xgb.callback.print_evaluation(period=50)])

# xgboost regression model prediction
model_xgbreg_ypredict = model_xgbreg.predict(x_validate)

# xgboost regression model metrics
# NOTE(review): the score is computed on the same split that drove early stopping, so it is
# likely optimistic.
model_xgbreg_mape = symmetric_mean_absolute_percentage_error(y_validate, model_xgbreg_ypredict)
print('xgboost regression\n  symmetric mean absolute percentaged error: %0.4f' %model_xgbreg_mape)

> **Supply or submit the results**

Our submission to the competition site Kaggle is ready. Any suggestions to improve our score are welcome.

In [None]:
# model selection
final_model = model_xgbreg

# prepare the testing features (same dropped columns as for training) and predict their sales
is_testing = df_data['datatype_training'] == 0
x_test = df_data[is_testing].drop(['id', 'sales', 'date', 'datatype_training'], axis=1)
test_predictions = final_model.predict(x_test)
# index the predictions by the 'id' values of the testing rows
y_test = pd.DataFrame(test_predictions, columns=['sales'], index=df_data.loc[is_testing, 'id'])

In [None]:
# submit the results
# build the two-column (id, sales) submission frame and write it to CSV without the index
out = pd.DataFrame({'id': y_test.index, 'sales': y_test['sales']})
out.to_csv('submission.csv', index=False)