# Modeling notebook

In [1]:
import pandas as pd
import numpy as np

# Set to True to run the models on three store-item series only
# (the subset is applied in the "subset for testing" cell below).
testing_models = False

# Number of consecutive daily steps to forecast for each series.
forecast_horizon = 547

from beepy import beep

# set benchmark with fourier series.

In [2]:
%run setup.ipynb

# tune models on a subset of categories

In [3]:

# subset for testing

# In testing mode keep only three store-item series (presumably to keep the
# model fits short); otherwise run on everything with warnings silenced.
# Note that this rebinds `train`/`test`, so re-running the cell is harmless
# but the unfiltered frames are only recoverable by re-running the setup cell.
if testing_models:
    train = train[train.store_item.isin(['1-1', '1-2', '1-3'])]
    test = test[test.store_item.isin(['1-1', '1-2', '1-3'])]
else: 
    from warnings import filterwarnings
    # NOTE(review): this hides every warning category, including any emitted
    # by the model fits below — confirm that is intended.
    filterwarnings('ignore')
    

# Create df to populate with predictions

In [4]:
from datetime import timedelta

# Forecast dates: the `forecast_horizon` consecutive days that follow the last
# index value of `train`, plus the first and last of them for later use.
date_list = [
    train.index[-1] + timedelta(days=offset)
    for offset in range(1, forecast_horizon + 1)
]
fcast_begin, horizon_end_date = date_list[0], date_list[-1]

In [5]:
# For every store-item series collect one block of repeated ids and one copy of
# the forecast dates; the next cell flattens both to label the prediction rows.
store_item, dates = [], []
# NOTE: the loop variable `series` stays bound at notebook scope after this
# loop finishes; `strict_inputs` further down reads that global name.
for series in train.store_item.unique():
    store_item.append(np.repeat(series, len(date_list)))
    dates.append(date_list)

In [6]:
# Frame that collects every model's forecasts: one row per (series, forecast
# date), indexed by date, with the `sales` values of `test` alongside.
predictions = pd.DataFrame(
    {
        'store_item': [x for sub in store_item for x in sub],
        # NOTE(review): `test.sales` is presumably a Series; if so pandas aligns
        # it to the index below by label, so `test` must hold exactly these
        # dates in this series order — confirm.
        'sales': test.sales
    }, index=[x for sub in dates for x in sub]
    )

# Fourier series (benchmark)

In [7]:

# https://notebook.community/statsmodels/statsmodels.github.io/devel/examples/notebooks/generated/deterministics
# from statsmodels.tsa.deterministic import Fourier, Seasonality, TimeTrend
# from statsmodels.tsa.deterministic import DeterministicProcess

# index = temp.index
# tt = TimeTrend(constant=True)
# four = Fourier(period=365.25, order=2)
# seas = Seasonality(period=7)
# det_proc = DeterministicProcess(index, additional_terms=[tt, seas, four])
# det_proc.in_sample().head(28)

In [8]:

from statsmodels.tsa.deterministic import Fourier

# Fourier terms of order 2; the first positional argument (11) is presumably
# the period.
# NOTE(review): the commented-out sketch in the previous cell uses
# period=365.25 — confirm that 11 is intended here.
fourier_gen = Fourier(11, order=2)

# Evaluate the terms on the test dates of series "1-1" and sum them per date.
# This is the cell's last expression, so the result is displayed.
temp = test[test.store_item == "1-1"]
fourier_gen.in_sample(temp.index).sum(axis=1)
# fourier_gen.out_of_sample(365, index=temp.index)


date
2016-07-03    2.000000
2016-07-04    2.706941
2016-07-05    1.425936
2016-07-06   -0.393719
2016-07-07   -1.031247
                ...   
2017-12-27   -0.393719
2017-12-28   -1.031247
2017-12-29   -0.377148
2017-12-30    0.140669
2017-12-31   -0.563104
Length: 547, dtype: float64

# Autoregression (univariate AutoReg, one model per series)

In [9]:
from statsmodels.tsa.ar_model import AutoReg
from tqdm import tqdm # progressbar

In [10]:
def fit_autoreg(df, fcast, horizion_end):
    """Forecast every store-item series with its own seasonal AutoReg model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``store_item`` and ``sales`` columns. The index of each
        per-series slice is assigned a daily frequency before fitting.
    fcast, horizion_end
        Passed to ``predict`` as ``start`` and ``end`` of the forecast window.

    Returns
    -------
    list
        Forecast values of all series, concatenated in the order in which the
        series first appear in ``df.store_item``. A series whose fit raises
        ``numpy.linalg.LinAlgError`` is reported on stdout and contributes NaN
        values (as many as a successful forecast has) instead of being
        dropped, so the list stays aligned with a frame that holds one row per
        series and forecast date. If no series can be fitted the list is empty.
    """
    forecasts = []  # one entry per series; None marks a failed fit
    for series in tqdm(df.store_item.unique()):

        temp = df[df.store_item == series]
        # Uppercase alias: the lowercase "d" spelling is deprecated in recent pandas.
        temp.index.freq = "D"

        try:
            yhat = AutoReg(temp.sales, lags=5, old_names=False, seasonal=True, period=365)\
                .fit()\
                .predict(start=fcast, end=horizion_end)
            forecasts.append(yhat)
        except np.linalg.LinAlgError:
            forecasts.append(None)
            print(f'series {series} error')

    # Every successful forecast covers the same window, so the first one
    # tells us how many NaN values a failed series has to contribute.
    horizon_len = next((len(f) for f in forecasts if f is not None), 0)
    flat = []
    for f in forecasts:
        flat.extend([np.nan] * horizon_len if f is None else f)
    return flat

In [11]:
# Add the AutoReg forecasts as a column; the flattened list has to supply
# exactly one value per row of `predictions`.
predictions["autoreg"] = fit_autoreg(df=train, fcast=fcast_begin, horizion_end=horizon_end_date)

100%|██████████| 498/498 [01:03<00:00,  7.80it/s]


Unnamed: 0,store_item,sales,autoreg
2016-07-03,1-1,35,33.967322
2016-07-04,1-1,18,27.360797
2016-07-05,1-1,31,26.177283
2016-07-06,1-1,22,27.303443
2016-07-07,1-1,33,23.602249
...,...,...,...
2017-12-27,10-50,63,54.000000
2017-12-28,10-50,59,48.333333
2017-12-29,10-50,74,41.333333
2017-12-30,10-50,62,43.666667


# exp smoothing

In [12]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [13]:
def exp_smooth_predictor(df, seas, fcast=fcast_begin, horizion_end=horizon_end_date):
    """Forecast every store-item series with Holt-Winters exponential smoothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``store_item`` and ``sales`` columns. The index of each
        per-series slice is assigned a daily frequency before fitting.
    seas : str
        Forwarded as the ``seasonal`` argument of ``ExponentialSmoothing``
        (the notebook uses ``"add"``).
    fcast, horizion_end
        Passed to ``predict`` as ``start`` and ``end`` of the forecast window.

    Returns
    -------
    list
        Forecast values of all series, concatenated in the order in which the
        series first appear in ``df.store_item``. A series whose fit raises
        ``ValueError`` is printed and contributes NaN values (as many as a
        successful forecast has) instead of being dropped, so the list stays
        aligned with a frame that holds one row per series and forecast date.
        If no series can be fitted the list is empty.
    """
    forecasts = []  # one entry per series; None marks a failed fit
    for series in tqdm(df.store_item.unique()):
        # Slice the frame that was passed in, not the notebook-global `train`.
        temp = df[df.store_item == series]
        # Uppercase alias: the lowercase "d" spelling is deprecated in recent pandas.
        temp.index.freq = "D"
        try:
            preds = ExponentialSmoothing(temp.sales,
                seasonal_periods=365,
                trend="add",
                seasonal=seas,
                use_boxcox=True,
                initialization_method="estimated")\
            .fit()\
            .predict(start=fcast, end=horizion_end)
            forecasts.append(preds)
        except ValueError:
            forecasts.append(None)
            print(series)

    # Every successful forecast covers the same window, so the first one
    # tells us how many NaN values a failed series has to contribute.
    horizon_len = next((len(f) for f in forecasts if f is not None), 0)
    flat = []
    for f in forecasts:
        flat.extend([np.nan] * horizon_len if f is None else f)
    return flat

In [14]:
# Holt-Winters forecasts with additive seasonality for every series.
predictions["exp_smooth"] = exp_smooth_predictor(df=train, seas="add")
# predictions["exp_smooth_multi"] = exp_smooth_predictor(df=train, seas="multiplicative") 
# The multiplicative variant above was tried and dropped: slow and inaccurate.

100%|██████████| 498/498 [13:14<00:00,  1.59s/it]


Unnamed: 0,store_item,sales,autoreg,exp_smooth
2016-07-03,1-1,35,33.967322,34.968360
2016-07-04,1-1,18,27.360797,27.868874
2016-07-05,1-1,31,26.177283,26.861003
2016-07-06,1-1,22,27.303443,28.344893
2016-07-07,1-1,33,23.602249,25.971329
...,...,...,...,...
2017-12-27,10-50,63,54.000000,70.629780
2017-12-28,10-50,59,48.333333,64.050097
2017-12-29,10-50,74,41.333333,56.527002
2017-12-30,10-50,62,43.666667,59.522268


# autoregressive distributed lag ARDL

In [15]:
from statsmodels.tsa.api import ARDL

In [16]:
def ardl_predictor(df, fcast=fcast_begin, horizion_end=horizon_end_date):
    """Fit one ARDL model per store-item series and return all forecasts.

    Each series in ``df.store_item`` (in order of first appearance) gets a
    daily index frequency, is fitted on its ``sales`` column, and is predicted
    from ``fcast`` through ``horizion_end``. The per-series forecasts are
    returned as a single flat list.
    """
    flat_forecasts = []
    for item_id in df.store_item.unique():
        item_sales = df[df.store_item == item_id]
        item_sales.index.freq = "d"

        fitted = ARDL(item_sales.sales, 365, period=365, trend="t").fit()
        flat_forecasts.extend(fitted.predict(start=fcast, end=horizion_end))
    return flat_forecasts

In [17]:
# ARDL forecasts for every series, using the function's default forecast window.
predictions["ardl"] = ardl_predictor(train)

Unnamed: 0,store_item,sales,autoreg,exp_smooth,ardl
2016-07-03,1-1,35,33.967322,34.968360,28.270421
2016-07-04,1-1,18,27.360797,27.868874,21.079731
2016-07-05,1-1,31,26.177283,26.861003,21.245674
2016-07-06,1-1,22,27.303443,28.344893,27.543148
2016-07-07,1-1,33,23.602249,25.971329,29.099083
...,...,...,...,...,...
2017-12-27,10-50,63,54.000000,70.629780,52.203080
2017-12-28,10-50,59,48.333333,64.050097,60.144470
2017-12-29,10-50,74,41.333333,56.527002,60.437527
2017-12-30,10-50,62,43.666667,59.522268,69.928363


# xgboost


In [18]:
from xgboost import XGBRegressor
# pip install xgboost==0.80
# the latest version kept crashing on me

In [19]:
from helper import create_features

In [20]:
def xgb_predictor(df):
    """Fit an XGBoost regressor per store-item series on date features.

    Features for both the training dates of each series and the notebook-level
    ``date_list`` (the forecast dates) come from ``create_features``. The same
    regressor object is re-fitted for every series in ``df.store_item`` (in
    order of first appearance) and its predictions for the forecast dates are
    returned as one flat list.
    """
    future_features = create_features(pd.DataFrame(date_list, columns=["date"]))
    regressor = XGBRegressor(n_estimators=1000)

    flat_preds = []
    for item_id in tqdm(df.store_item.unique()):
        item_rows = df[df.store_item == item_id]
        train_features = create_features(pd.DataFrame(item_rows.index, columns=["date"]))
        regressor.fit(train_features, item_rows.sales)
        flat_preds.extend(regressor.predict(future_features))
    return flat_preds

In [21]:
# XGBoost forecasts for every series.
predictions["xgb_preds"] = xgb_predictor(train)

100%|██████████| 498/498 [03:12<00:00,  2.58it/s]


Unnamed: 0,store_item,sales,autoreg,exp_smooth,ardl,xgb_preds
2016-07-03,1-1,35,33.967322,34.968360,28.270421,26.026503
2016-07-04,1-1,18,27.360797,27.868874,21.079731,11.948491
2016-07-05,1-1,31,26.177283,26.861003,21.245674,9.154426
2016-07-06,1-1,22,27.303443,28.344893,27.543148,10.446870
2016-07-07,1-1,33,23.602249,25.971329,29.099083,11.971621
...,...,...,...,...,...,...
2017-12-27,10-50,63,54.000000,70.629780,52.203080,53.286358
2017-12-28,10-50,59,48.333333,64.050097,60.144470,61.416859
2017-12-29,10-50,74,41.333333,56.527002,60.437527,62.223686
2017-12-30,10-50,62,43.666667,59.522268,69.928363,67.933411


# prophet model

In [22]:
from prophet import Prophet
# https://www.youtube.com/watch?v=pOYAXv15r3A

Importing plotly failed. Interactive plots will not work.


In [23]:
def strict_inputs(df):
    """Reshape a sales frame into the ``ds`` / ``y`` / ``unique_id`` layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``store_item`` and ``sales`` columns whose index is named
        ``date``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` (the former index), ``y`` (the former ``sales``) and
        ``unique_id``, in that order, on a fresh integer index.

    ``unique_id`` is taken from each row's own ``store_item`` value. The
    earlier version read a notebook-global ``series`` variable instead, which
    tagged every frame with whatever id an unrelated loop had left behind (or
    raised ``NameError`` when no such global existed).
    """
    renamed = df.reset_index().rename(
        columns={"store_item": "unique_id", "sales": "y", "date": "ds"}
    )
    return renamed[["ds", "y", "unique_id"]]

In [24]:
def prophet_predictor(df, date_list=date_list):
    """Fit a fresh Prophet model per store-item series and collect forecasts.

    For every series in ``df.store_item`` (in order of first appearance) the
    rows are converted with ``strict_inputs``, a Prophet model with daily
    seasonality enabled is fitted, and ``len(date_list)`` future periods
    (history excluded) are predicted. The ``yhat`` values of all series are
    returned as one flat list.
    """
    flat_yhat = []
    for item_id in tqdm(df.store_item.unique()):
        history = strict_inputs(df[df.store_item == item_id])
        forecaster = Prophet(daily_seasonality=True)
        forecaster.fit(history)
        future = forecaster.make_future_dataframe(
            periods=len(date_list), include_history=False
        )
        flat_yhat.extend(forecaster.predict(future).yhat)
    return flat_yhat

In [None]:
# Prophet forecasts for every series (one model is fitted per series).
predictions["prophet"] = prophet_predictor(df=train)


# Neural prophet

In [26]:
from neuralprophet import NeuralProphet
# https://neuralprophet.com/html/model/README.html

In [27]:
# Kept so the notebook-level name `m` still exists; the function below no
# longer fits this instance.
m = NeuralProphet(n_forecasts=1)
def neural_prophet(df, date_list=date_list):
    """Fit a NeuralProphet model per store-item series and collect forecasts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``store_item`` and ``sales`` columns, accepted by
        ``strict_inputs``.
    date_list : sequence
        Only its length is used: the number of future periods to predict.

    Returns
    -------
    list
        The ``yhat1`` values of every series, concatenated in the order in
        which the series first appear in ``df.store_item``.

    A new model is created for each series. Re-fitting one shared, already
    fitted instance (as was done before) makes each fit start from the state
    left by the previous series, so the forecasts were not independent.
    """
    m_preds = []
    for series in tqdm(df.store_item.unique()):
        temp = strict_inputs(df[df.store_item == series])[["y", "ds"]]
        # The frequency is given to `fit` directly; the frame carries a plain
        # integer index after `strict_inputs`, so no index frequency is set.
        model = NeuralProphet(n_forecasts=1)
        model.fit(temp, freq="D")
        future = model.make_future_dataframe(temp, periods=len(date_list))
        forecast = model.predict(future)
        m_preds.extend(forecast.yhat1)
    return m_preds

In [None]:
from neuralprophet import set_random_seed 
# Fix the seed before fitting (presumably so the NeuralProphet results are
# repeatable between runs).
set_random_seed(0)

In [None]:
# NeuralProphet forecasts for every series.
predictions["neural_prophet"] = neural_prophet(train)

In [None]:
# Stack the training history on top of the forecast frame; this combined frame
# is what `fcastplotter` draws from. Rows coming from `train` presumably end up
# with NaN in the forecast columns, since `train` does not carry them.
store_sales = pd.concat([train, predictions])

In [None]:
from datetime import date

# Run date, used to stamp the name of the predictions file written below.
today = date.today()

In [None]:
# Save all forecasts; the file name carries only the month and day of the run.
# NOTE(review): without the year, runs on the same calendar day of different
# years write to the same path — confirm that is acceptable.
predictions.to_csv(f"../data/predictions/predictions-{today.month}-{today.day}.csv")

# plot results

In [None]:
import matplotlib.pyplot as plt

In [None]:

def fcastplotter(x):
    """Plot actual sales against forecast column ``x``, one figure per store-item.

    Reads the notebook-level ``store_sales`` frame. For each distinct
    ``store_item`` value a new figure is opened showing ``sales`` in blue and
    column ``x`` in red, with the y-axis starting at zero.
    """
    for store in store_sales.store_item.unique():
        per_store = store_sales[store_sales.store_item == store]

        fig, ax = plt.subplots()
        ax.plot(per_store.sales, "-b", label="sales")
        ax.plot(per_store[x], "-r", label=f"Forecast {x} ")
        ax.legend(loc="upper left")
        ax.set_ylim([0, None])
        ax.set_title(f"store {store}")
        plt.show()

In [None]:
# Default figure size and resolution for the plots in this notebook.
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})

# Plotting every forecast column for every store is left disabled; it opens
# one figure per store-item and forecast column.
# for forecast in store_sales.columns[2:]:
#     fcastplotter(forecast)


In [None]:
from beepy import beep
# Signal that execution reached the end of the notebook (beepy presumably
# plays a sound).
beep()