# Modeling notebook

In [1]:
import pandas as pd
import numpy as np

# When True, a later cell restricts train/test to three store-item series for fast iteration.
testing_models = True

# Number of consecutive days to forecast beyond the last date in the training index.
forecast_horizon = 547

from beepy import beep
# TODO:
# - Add forecasting models that work.
# - Determine when a forecasting model works, given a time series' features.

In [2]:
# Runs setup.ipynb and makes the names it defines available in this notebook.
# Later cells use `train` and `test`, which are never defined here — presumably
# they are created by setup.ipynb (TODO confirm).
%run setup.ipynb

# tune models on a subset of categories

In [3]:

# subset for testing

if not testing_models:
    # Full run over every series: silence all warnings.
    from warnings import filterwarnings
    filterwarnings('ignore')
else:
    # Quick run: keep only three store-item series in both splits.
    train, test = (
        split[split.store_item.isin(['1-1', '1-2', '1-3'])]
        for split in (train, test)
    )
    

# Create df to populate with predictions

In [4]:
from datetime import timedelta

# Forecast dates: the forecast_horizon consecutive days that follow the last
# value of train's index.
date_list = [
    train.index[-1] + timedelta(days=offset)
    for offset in range(1, forecast_horizon + 1)
]
fcast_begin, horizon_end_date = date_list[0], date_list[-1]

In [5]:
# For every store-item series, collect a block of repeated labels and a matching
# block of forecast dates; both lists of blocks are flattened by a later cell.
store_item, dates = [], []
# NOTE: `series` is a module-level loop variable, so after this cell it stays
# bound to the last label returned by unique().
for series in train.store_item.unique():
    store_item.append(np.repeat(series, len(date_list)))
    # The same date_list object is appended each time (no copy is made).
    dates.append(date_list)

In [6]:
# One row per (series, forecast date); later cells add one column per model.
# 'sales' is passed as a Series, so pandas aligns it to the index given here.
predictions = pd.DataFrame(
    data={
        'store_item': [label for block in store_item for label in block],
        'sales': test.sales,
    },
    index=[stamp for block in dates for stamp in block],
)

# autoregression (AutoReg, fit per series)

In [7]:
from statsmodels.tsa.ar_model import AutoReg

In [8]:
def fit_autoreg(df, fcast, horizion_end):
    """Fit a seasonal AutoReg model to each store-item series and forecast it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``store_item`` column identifying each series and a
        ``sales`` column with the values to model. A daily frequency is
        assigned to each series' index before fitting.
    fcast
        Start of the forecast window; passed unchanged to ``predict(start=...)``.
    horizion_end
        End of the forecast window; passed unchanged to ``predict(end=...)``.

    Returns
    -------
    list
        Forecast values of all series, concatenated in the order given by
        ``df.store_item.unique()``. A series whose fit raises
        ``numpy.linalg.LinAlgError`` contributes a block of NaN with the same
        length as a successful forecast, so every series keeps its position and
        the result can still be assigned to a frame built in the same order.
        If no series could be fitted, the list is empty.
    """
    # One entry per series, in order: the forecast Series, or None on failure.
    per_series = []
    for series in df.store_item.unique():
        temp = df[df.store_item == series]
        # "D" is the canonical pandas alias for a calendar-day frequency.
        temp.index.freq = "D"

        try:
            yhat = AutoReg(temp.sales, lags=5, old_names=False, seasonal=True, period=365)\
                .fit()\
                .predict(start=fcast, end=horizion_end)
        except np.linalg.LinAlgError:
            print(f'series {series} error')
            yhat = None
        per_series.append(yhat)

    # All successful forecasts cover the same window, so any one of them gives
    # the number of steps needed to pad the failed series.
    horizon_len = next((len(p) for p in per_series if p is not None), 0)

    flat = []
    for yhat in per_series:
        flat.extend(yhat if yhat is not None else [np.nan] * horizon_len)
    return flat

In [9]:
# AutoReg forecasts for every series over the full forecast window.
predictions["autoreg"] = fit_autoreg(
    df=train,
    fcast=fcast_begin,
    horizion_end=horizon_end_date,
)

# exp smoothing

In [10]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tqdm import tqdm # progressbar

In [11]:
def exp_smooth_predictor(df, seas, fcast=fcast_begin, horizion_end=horizon_end_date, progress_bar=False):
    """Fit Holt-Winters exponential smoothing per store-item series and forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``store_item`` column identifying each series and a
        ``sales`` column with the values to model. A daily frequency is
        assigned to each series' index before fitting.
    seas : str
        Seasonal component type, forwarded as ``seasonal=`` to
        ``ExponentialSmoothing`` (the notebook uses "add" and "multiplicative").
    fcast, horizion_end
        Start and end of the forecast window; passed unchanged to ``predict``.
        The defaults are the module-level values at the time this function is
        defined.
    progress_bar : bool
        Show a tqdm progress bar over the series when True.

    Returns
    -------
    list
        Forecast values of all series, concatenated in the order given by
        ``df.store_item.unique()``. A series whose fit or forecast raises
        ``ValueError`` has its label printed and contributes a block of NaN
        with the same length as a successful forecast, so every series keeps
        its position. If no series could be fitted, the list is empty.
    """
    # One entry per series, in order: the forecast Series, or None on failure.
    per_series = []
    for series in tqdm(df.store_item.unique(), disable= not progress_bar):
        # Select from the frame that was passed in, not from the global `train`.
        temp = df[df.store_item == series]
        # "D" is the canonical pandas alias for a calendar-day frequency.
        temp.index.freq = "D"
        try:
            preds = ExponentialSmoothing(temp.sales,
                seasonal_periods=365,
                trend="add",
                seasonal=seas,
                use_boxcox=True,
                initialization_method="estimated")\
            .fit()\
            .predict(start=fcast, end=horizion_end)
        except ValueError:
            print(series)
            preds = None
        per_series.append(preds)

    # All successful forecasts cover the same window, so any one of them gives
    # the number of steps needed to pad the failed series.
    horizon_len = next((len(p) for p in per_series if p is not None), 0)

    flat = []
    for preds in per_series:
        flat.extend(preds if preds is not None else [np.nan] * horizon_len)
    return flat

In [12]:
# Holt-Winters forecasts with additive and multiplicative seasonality, respectively.
predictions["exp_smooth"] = exp_smooth_predictor(train, "add")
predictions["exp_smooth_multi"] = exp_smooth_predictor(train, "multiplicative")



# ardl autoregressive distributed lag

In [13]:
from statsmodels.tsa.api import ARDL

In [14]:
def ardl_predictor(df, fcast=fcast_begin, horizion_end=horizon_end_date):
    """Fit an ARDL model to each store-item series and forecast it.

    Each series in ``df`` (identified by its ``store_item`` column) is given a
    daily index frequency, modelled on its ``sales`` column with 365 lags and a
    time trend, and forecast from ``fcast`` through ``horizion_end``.

    Returns a flat list with the forecasts of all series, concatenated in the
    order given by ``df.store_item.unique()``.
    """
    forecasts = []
    for label in df.store_item.unique():
        one_series = df[df.store_item == label]
        one_series.index.freq = "d"

        model = ARDL(one_series.sales, 365, period=365, trend="t")
        fitted = model.fit()
        forecasts.extend(fitted.predict(start=fcast, end=horizion_end))
    return forecasts

In [15]:
# ARDL forecasts over the default forecast window.
predictions["ardl"] = ardl_predictor(df=train)

# prophet model

In [16]:
from prophet import Prophet
# https://www.youtube.com/watch?v=pOYAXv15r3A

Importing plotly failed. Interactive plots will not work.


In [17]:
def strict_inputs(df):
    """Reshape a sales frame into the ``ds`` / ``y`` / ``unique_id`` layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is named ``date`` and which has ``sales`` and
        ``store_item`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index and exactly three columns:
        ``ds`` (the former ``date`` index), ``y`` (the former ``sales``) and
        ``unique_id`` (the former ``store_item``). The input is not modified.
    """
    # The series identifier is taken from the frame's own store_item column,
    # so the function does not depend on any variable outside its arguments.
    renamed = df.reset_index().rename(
        columns={"sales": "y", "date": "ds", "store_item": "unique_id"}
    )
    return renamed[["ds", "y", "unique_id"]]

In [18]:
def prophet_predictor(df, date_list=date_list):
    """Fit a Prophet model to each store-item series and forecast it.

    Each series in ``df`` (identified by its ``store_item`` column) is reshaped
    with ``strict_inputs`` and fitted with daily seasonality enabled. Prophet
    is then asked for ``len(date_list)`` future periods beyond the fitted
    history; only the length of ``date_list`` is used, not its dates.

    Returns a flat list with the ``yhat`` values of all series, concatenated in
    the order given by ``df.store_item.unique()``.
    """
    n_future = len(date_list)
    forecasts = []
    for label in df.store_item.unique():
        history = strict_inputs(df[df.store_item == label])
        model = Prophet(daily_seasonality=True)
        model.fit(history)
        future = model.make_future_dataframe(periods=n_future, include_history=False)
        forecasts.extend(model.predict(future).yhat)
    return forecasts

In [19]:
# Prophet forecasts, one model per series.
predictions["prophet"] = prophet_predictor(train)

Initial log joint probability = -23.8441
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      94       2304.99    0.00059656       212.127    4.31e-06       0.001      145  LS failed, Hessian reset 
      99       2305.17   0.000825136       89.4793      0.9746      0.9746      150   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     193        2305.9    0.00037808       161.031   2.206e-06       0.001      316  LS failed, Hessian reset 
     199       2306.19    0.00069264       73.7016      0.3052           1      323   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     243       2306.29   4.25779e-05       86.2633   5.088e-07       0.001      415  LS failed, Hessian reset 
     271       2306.29   1.51602e-07       79.5501           1           1      452   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is

# xgboost

In [20]:
from xgboost import XGBRegressor
# pip install xgboost==0.80
# the latest version kept crashing on me

In [21]:
from helper import create_features

In [45]:
def xgb_predictor(df):
    """Fit an XGBoost regressor on date features per series and forecast.

    Features for the forecast window are built once from the module-level
    ``date_list`` via ``create_features``. For each series in ``df``
    (identified by its ``store_item`` column), features are built from the
    series' index, the shared regressor is fitted against ``sales``, and
    predictions are made for the forecast-window features.

    Returns a flat list with the predictions of all series, concatenated in the
    order given by ``df.store_item.unique()``.
    """
    future_features = create_features(pd.DataFrame(date_list, columns=["date"]))
    regressor = XGBRegressor(n_estimators=1000)

    forecasts = []
    for label in tqdm(df.store_item.unique()):
        one_series = df[df.store_item == label]
        train_features = create_features(pd.DataFrame(one_series.index, columns=["date"]))

        regressor.fit(train_features, one_series.sales)
        forecasts.extend(regressor.predict(future_features))
    return forecasts

In [46]:
# XGBoost forecasts built from date-derived features.
predictions["xgb_preds"] = xgb_predictor(df=train)

100%|██████████| 3/3 [00:01<00:00,  2.73it/s]


In [47]:
from neuralprophet import NeuralProphet

In [None]:
# Stack the training history and the forecast frame row-wise; the plotting
# function defined later in this notebook reads both from this combined frame.
store_sales = pd.concat([train, predictions])

In [None]:
from datetime import date

# Run date; its month and day are used to name the exported predictions file.
today = date.today()

In [None]:
# Persist all forecasts as CSV.
# NOTE(review): the file name carries only month and day, so runs on the same
# calendar day (in any year) write to the same path.
predictions.to_csv(f"../data/predictions/predictions-{today.month}-{today.day}.csv")

# plot results

In [None]:
import matplotlib.pyplot as plt

In [None]:

def fcastplotter(x):
    """Plot actual sales against forecast column ``x``, one figure per series.

    Reads the module-level ``store_sales`` frame. For every distinct
    ``store_item`` value it opens a new figure, draws ``sales`` in blue and
    column ``x`` in red, pins the y-axis lower bound at 0, and shows the figure.
    """
    forecast_label = f"Forecast {x} "
    for store in store_sales.store_item.unique():
        rows = store_sales[store_sales.store_item == store]

        plt.figure()
        plt.plot(rows.sales, "-b", label="sales")
        plt.plot(rows[x], "-r", label=forecast_label)
        plt.legend(loc="upper left")
        plt.ylim([0, None])
        plt.title(f"store {store}")
        plt.show()

In [None]:
# Default canvas for figures created from here on: 9x3 inches at 120 dpi.
plt.rcParams['figure.figsize'] = (9, 3)
plt.rcParams['figure.dpi'] = 120

# for forecast in store_sales.columns[2:]:
#     fcastplotter(forecast)


In [None]:
from beepy import beep
# Play a sound via beepy — presumably to signal that the notebook has finished running.
beep()