In [1]:
# Standard Imports
import os
import pandas as pd
import sys

# Modelling
from forecast_tools.baseline import Naive1, SNaive, Average, Drift, EnsembleNaive
import statsmodels.api as sm
from pmdarima import auto_arima, ARIMA


# Visualisation
import plotly.express as px
import plotly.graph_objects as go

# Src modules
sys.path.append(os.path.dirname(os.getcwd())) # Add the parent directory to the Python path so we can import src modules
from src.data_setup import get_data, get_daily_sales
from src.model_evaluation import model_eval_pipeline

In [6]:
# Fetch the raw frames from the project's data loader.
train, test, stores, transactions = get_data()

# Derive the daily sales table used by the EDA plots. reset_index() moves
# whatever index get_daily_sales() returns into ordinary columns
# (presumably so `date` can be referenced by name below -- confirm against
# src.data_setup).
daily_sales = get_daily_sales(train)
daily_sales = daily_sales.reset_index()

# Sanity check on the size of the loaded training frame.
print(f'train: {train.shape[0]:,d} rows')

train: 3,000,888 rows


## EDA
---

In [7]:
# Line chart of daily sales over time.
# Days with sales of 100,000 or less are left out of the chart only;
# `daily_sales` itself is not modified.
sales_for_chart = daily_sales.query('sales>100000')
sales_for_chart = sales_for_chart.sort_values(["date"])
px.line(
    sales_for_chart,
    x='date',
    y='sales',
    title="Daily Sales -- Jan 2013 - Aug 2017",
)

In [8]:
# Box plot of the daily sales distribution: years along the x-axis,
# boxes coloured by month.
px.box(
    daily_sales,
    x="year",
    y="sales",
    color="month",
    title="Daily Sales per Month",
)

In [9]:
# Mean sales for every (year, day_of_week) pair, with the group keys
# turned back into columns so they can be used as plot dimensions.
weekday_groups = daily_sales.groupby(["year", "day_of_week"])
a = weekday_groups["sales"].mean().reset_index()

# One line per year across the days of the week.
px.line(
    a,
    x="day_of_week",
    y="sales",
    color="year",
    title="Average Sales per Day of Week",
)

## Modelling
---

In [10]:
# Split training and validation data
# train keeps rows with year < SPLIT_YEAR; validation keeps rows with
# year >= SPLIT_YEAR (2017 itself and anything later).
# NOTE: `train` is rebound to the filtered frame below, so `val` must be
# derived first, and re-running this cell without reloading the data
# would leave `val` empty.
SPLIT_YEAR = 2017
val = train.query(f'year >= {SPLIT_YEAR}')
train = train.query(f'year < {SPLIT_YEAR}')
print(f'train: {train.shape[0]:,d} rows')
print(f'val: {val.shape[0]:,d} rows')

train: 2,596,374 rows
val: 404,514 rows


In [14]:
# Run each split through get_daily_sales (the same helper used for the EDA
# table). The calls happen in the order listed: validation first, then train.
val_daily_sales, train_daily_sales = (
    get_daily_sales(split) for split in (val, train)
)

In [13]:
# Baseline forecasters to benchmark, kept as a list of (label, estimator)
# pairs so the fitting loop can unpack them directly.
# SNaive is given 30 -- presumably its seasonal period in observations;
# note that the ARIMA cell uses a weekly period (m=7). TODO confirm the
# intended seasonality.
models = list({
    'naive': Naive1(),
    'snaive': SNaive(30),
    'drift': Drift(),
    'average': Average(),
}.items())

# Per-model results, keyed by label; each entry holds the model's
# predictions and its evaluation metrics.
performance = dict()

In [15]:
# Fit every baseline on the training series, forecast one step per
# validation row, and record both the forecast and its metrics.
#
# NOTE(review): the forecast is passed to model_eval_pipeline first and the
# observed values second. The recorded results table shows r2 == 0.0 for
# both constant forecasts (naive, average), which looks like r2 is being
# computed with the forecast in the ground-truth role -- confirm the
# parameter order of model_eval_pipeline.
horizon = len(val_daily_sales['sales'])
actual_values = val_daily_sales['sales'].values

for name, model in models:
    model.fit(train_daily_sales['sales'])
    preds = model.predict(horizon=horizon)
    performance[name] = {
        'preds': preds,
        'metrics': model_eval_pipeline(preds, actual_values),
    }
    
    

In [16]:
# Automatic seasonal ARIMA order search on the training series, using a
# seasonal period of 7 observations. Warnings are suppressed and failed
# candidate fits are ignored during the search.
model_arima_original = auto_arima(
    train_daily_sales['sales'],
    seasonal=True,
    m=7,
    suppress_warnings=True,
    error_action='ignore',
)
print('fit arima model')

# Forecast one step per validation row and store the result alongside the
# baselines, in the same {'preds', 'metrics'} layout.
preds_arima = model_arima_original.predict(n_periods=len(val_daily_sales))
performance['arima'] = dict(
    preds=preds_arima,
    metrics=model_eval_pipeline(preds_arima, val_daily_sales['sales'].values),
)

fit arima model



No supported index is available. Prediction results will be given with an integer index beginning at `start`.



In [17]:
# Overlay every model's forecast on the observed validation series.
actual_trace = go.Scatter(
    x=val_daily_sales.index,
    y=val_daily_sales['sales'],
    mode='lines',
    name='Actual',
)
forecast_traces = [
    go.Scatter(x=val_daily_sales.index, y=entry['preds'], mode='lines', name=label)
    for label, entry in performance.items()
]
# Observed series first, then one trace per model in insertion order.
traces = [actual_trace, *forecast_traces]

layout = go.Layout(
    title='Predictions vs Actual',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
figure = go.Figure(data=traces, layout=layout)
figure.show()

In [18]:
# Same comparison chart as before, but leaving the ARIMA forecast out so
# the remaining models can be compared on their own scale.
traces = [
    go.Scatter(
        x=val_daily_sales.index,
        y=val_daily_sales['sales'],
        mode='lines',
        name='Actual',
    )
]
traces.extend(
    go.Scatter(x=val_daily_sales.index, y=entry['preds'], mode='lines', name=label)
    for label, entry in performance.items()
    if label != 'arima'
)

layout = go.Layout(
    title='Predictions vs Actual',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
figure = go.Figure(data=traces, layout=layout)
figure.show()

In [19]:
# Summarise model performance as a table: one row per model, one column
# per metric. from_dict is a classmethod, so it is called on the class
# directly, and orient='index' builds the rows without a transpose
# (which also keeps each metric column's own dtype).
res = pd.DataFrame.from_dict(
    {model: mdict['metrics'] for model, mdict in performance.items()},
    orient='index',
)

# Display the table with the lowest-RMSE model first.
res.sort_values('rmse')

Unnamed: 0,mae,mse,rmse,rmsle,r2
snaive,221228.1,76766570000.0,277067.8,0.40531,-1.622443
average,257443.8,95807580000.0,309528.0,0.462902,0.0
naive,273791.6,96534050000.0,310699.3,0.446778,0.0
drift,349868.0,150468000000.0,387902.1,0.498026,-59.673791
arima,1489355.0,2892953000000.0,1700868.0,1.084887,-3.436333


## Conclusion
---
Ultimately, the seasonal naive model (`snaive`) was the best-performing baseline, with an RMSLE of about 0.41 on the validation data; it also had the lowest MAE and RMSE of the models compared.