# Import Packages

In [8]:
import sys
sys.path.append('..')
import utils
from itertools import cycle
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
sns.set(style="ticks")
%config IPCompleter.greedy = True


# Import Data

In [9]:
# Load the M5 data frames through the project helper.
# NOTE(review): judging by the target names these are the calendar, sales and price tables;
# confirm the order against utils.import_m5_data.
df_calendar, df_sales, df_prices = utils.import_m5_data()

In [10]:
# Preview the sales table (pandas truncates the display): id/metadata columns followed by
# one column per day, d_1 ... d_1913.
df_sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


# Compute Forecasts

In [11]:
def forecast_snaive(df_sales, fh=28, season_length=7):
    """
    Compute forecasts using the seasonal naive approach.

    Every forecast day repeats the value observed at the same position within the last
    ``season_length`` columns of ``df_sales``: F1 copies the column ``season_length``
    positions from the end, F2 the following one, and so on. After the last column has
    been used the pattern wraps around and starts again.

    :param df_sales: pandas.DataFrame containing sales data; needs an 'id' column, and its
        last ``season_length`` columns are used as the most recent season
    :param fh: forecast horizon (default: 28); must be a positive whole number
    :param season_length: number of trailing columns forming one season (default: 7)
    :return: pandas.DataFrame containing the forecasts, with column 'id' followed by
        'F1' ... 'F<fh>'
    :raises ValueError: if ``fh`` or ``season_length`` is not a whole number >= 1
    """
    # Fail loudly on horizons that cannot be reached by counting up from day 1
    # (zero, negative or fractional) instead of looping forever.
    if int(fh) != fh or fh < 1:
        raise ValueError(f'fh must be a positive whole number, got {fh!r}')
    if int(season_length) != season_length or season_length < 1:
        raise ValueError(
            f'season_length must be a positive whole number, got {season_length!r}')
    fh = int(fh)
    season_length = int(season_length)

    fc_dict = {'id': df_sales['id']}
    for day in range(1, fh + 1):
        # Distance from the end of the frame: season_length, ..., 1, then wrap around.
        offset = season_length - (day - 1) % season_length
        fc_dict[f'F{day}'] = df_sales.iloc[:, -offset]
    return pd.DataFrame(fc_dict)

In [12]:
# Seasonal-naive forecasts for the default 28-day horizon, built from the most recent
# days in df_sales; these are the values written to the submission file below.
forecasts = forecast_snaive(df_sales)

# Preview the forecast table.
forecasts

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1,1,1,3,0,1,1,1,1,...,0,1,1,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,1,1,1,0,1,1,1,1,1,...,1,1,1,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,4,1,0,1,3,7,2,4,1,...,3,7,2,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,0,1,1,2,2,2,4,0,1,...,2,2,4,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,2,0,1,0,0,1,0,2,0,...,0,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,0,0,1,0,3,1,3,0,0,...,3,1,3,0,0,1,0,3,1,3


# Submit Predictions

In [13]:
# The submission stacks the forecasts twice: once under the original '..._validation' ids
# and once under the same ids renamed to '..._evaluation'. Only the id column is rewritten;
# the forecast values are reused unchanged.
forecasts_evaluation = forecasts.assign(
    id=forecasts['id'].str.replace('validation', 'evaluation', regex=False))
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# The original row labels are kept (as append did); they are dropped on export anyway.
df_submission = pd.concat([forecasts, forecasts_evaluation])
filename = utils.get_m5_root_dir() + '/data/submissions/snaive.csv'
df_submission.to_csv(filename, index=False)
df_submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1,1,1,3,0,1,1,1,1,...,0,1,1,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,1,1,1,0,1,1,1,1,1,...,1,1,1,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,4,1,0,1,3,7,2,4,1,...,3,7,2,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,0,1,1,2,2,2,4,0,1,...,2,2,4,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_evaluation,2,0,1,0,0,1,0,2,0,...,0,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_evaluation,0,0,1,0,3,1,3,0,0,...,3,1,3,0,0,1,0,3,1,3


In [14]:
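# Submit via the Kaggle API (uncomment the command below) or upload the file on the website:
# https://www.kaggle.com/c/m5-forecasting-accuracy/submissions
# NOTE: the path in the command is relative to the M5 project root, where the cell above wrote the file.
# !kaggle competitions submit -c m5-forecasting-accuracy -f data/submissions/snaive.csv -m "sNaive"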

# Compute MSE for Benchmarking 

## On Full Data Set (Last 28 training days)

In [15]:
# validate performance on last 28 days
df_sales_train = df_sales.iloc[:,:-28]
df_sales_validation = df_sales.iloc[:,-28:]
forecasts = forecast_snaive(df_sales_train)

In [16]:
# convert dataframes to arrays
y_pred_flat = forecasts.drop('id', axis=1).values.flatten()
y_true_flat = df_sales_validation.values.flatten()

In [17]:
# sklearn's signature is mean_squared_error(y_true, y_pred): pass the actuals first.
# The value is unchanged for plain MSE, but the order matters for asymmetric metrics/options.
print('MSE of predicted values: %0.3f' % mean_squared_error(y_true_flat, y_pred_flat))

MSE of predicted values: 7.615


## On Small Data Set (TX2_HOBBIES approx. 1 Year)

In [18]:
# Create training and validation data sets with sales figures.
# Scope (items & dates) equivalent to data/preprocessed/tx2_hobbies_1year.csv

df_tx2_hobbies = pd.read_csv(
    f'{utils.get_m5_root_dir()}/data/preprocessed/tx2_hobbies_1year.csv')

relevant_ids = df_tx2_hobbies['id'].drop_duplicates()

# Filter once so the ids and both windows share exactly the same rows in the same order.
sub_sales = df_sales[df_sales['id'].isin(relevant_ids)]

# Training window: the 339 day columns preceding the final 28 days. The ids come from the
# filtered rows themselves rather than from the CSV, whose row order (or id set) may differ
# from df_sales and would then mislabel rows. 'id' must stay the first column because
# forecast_snaive reads the trailing columns as the most recent days.
sub_train = pd.concat([sub_sales[['id']], sub_sales.iloc[:,-367:-28]], axis=1)
# Validation window: the final 28 days.
sub_validation = sub_sales.iloc[:,-28:]

In [19]:
# Seasonal-naive forecasts for the default 28-day horizon, computed from the subset's
# training window only (the last 28 days are held out in sub_validation).
sub_forecasts = forecast_snaive(sub_train)

In [20]:
# Flatten actuals and predictions into 1-D arrays for the metric computation
sub_y_true_flat = sub_validation.to_numpy().flatten()
sub_pred_values = sub_forecasts.drop(columns='id')
sub_y_pred_flat = sub_pred_values.to_numpy().flatten()

In [21]:
# sklearn's signature is mean_squared_error(y_true, y_pred): pass the actuals first.
# The value is unchanged for plain MSE, but the order matters for asymmetric metrics/options.
print('MSE of predicted values: %0.3f' % mean_squared_error(sub_y_true_flat, sub_y_pred_flat))

MSE of predicted values: 3.565
