# Modeling notebook

In [2]:
import pandas as pd
import numpy as np

# Toggle read by the subset cell further down: when True, train/test are cut
# down to three store-item series for quick experiments.
testing_models = False

from beepy import beep
# TODO:
# - Add forecasting models that work.
# - Determine when a forecasting model works, given a time series' features.

In [3]:
%run setup.ipynb

# tune models on a subset of categories

In [4]:

# Quick-iteration switch: a testing run keeps only three store-item series,
# while a full run keeps every series and silences library warnings instead.

if not testing_models:
    from warnings import filterwarnings
    filterwarnings('ignore')
else:
    train = train[train.store_item.isin(['1-1', '1-2', '1-3'])]
    test = test[test.store_item.isin(['1-1', '1-2', '1-3'])]
    

# autoregression (univariate AutoReg, one model per store-item series)

In [5]:
from statsmodels.tsa.ar_model import AutoReg

In [7]:
# Fit one seasonal AutoReg per store-item series and forecast the test window.
# A series whose fit raises LinAlgError is reported and left out of both lists.
# (Still to be generalized.)
preds_autoreg = []
store_item = []
for series in train.store_item.unique():
    temp = train[train.store_item == series]
    temp.index.freq = "d"

    model = AutoReg(temp.sales, lags=10, old_names=False, seasonal=True, period=365)

    try:
        yhat = model.fit().predict(start=test.index[0], end=test.index[-1])
    except np.linalg.LinAlgError:
        print(f'series {series}')
        continue

    preds_autoreg.append(yhat)
    # One label per forecast value, so the flattened lists stay equally long.
    store_item.append(np.repeat(series, len(yhat)))

In [8]:
# Flatten the forecast timestamps of every fitted series into a single list.
dates = [stamp for yhat in preds_autoreg for stamp in yhat.index]

In [9]:
# Assemble the results table: one row per forecast timestamp, holding the
# series label, the `test.sales` values and the AutoReg forecast.
flat_store_items = [label for labels in store_item for label in labels]
flat_autoreg = [value for yhat in preds_autoreg for value in yhat]

predictions = pd.DataFrame(
    {
        'store_item': flat_store_items,
        'sales': test.sales,
        'autoreg': flat_autoreg,
    },
    index=dates,
)

# exp smoothing

In [10]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [11]:

from tqdm import tqdm # progressbar

In [12]:
def exp_smooth_predictor(seas="multiplicative"):
    """Forecast every store-item series with Holt-Winters exponential smoothing.

    Reads the notebook-level ``train`` and ``test`` frames. One model is fitted
    per unique value of ``train.store_item`` and asked to predict from the first
    to the last index entry of ``test``.

    Parameters
    ----------
    seas : str, default "multiplicative"
        Forwarded as the ``seasonal`` argument of ``ExponentialSmoothing``.

    Returns
    -------
    list
        Flat list of forecasts, series after series, in the order of
        ``train.store_item.unique()``. A series whose fit or prediction raises
        ``ValueError`` is printed and contributes NaN placeholders, one per day
        of the forecast window, so the result keeps lining up row-for-row with
        the forecasts of the other models.
    """
    # The forecast window is the same for every series; look it up once.
    start, end = test.index[0], test.index[-1]
    # Number of daily steps in the window, used to pad series that fail.
    horizon = len(pd.date_range(start, end, freq="D"))

    exp_smooth_preds = []

    for item in tqdm(train.store_item.unique()):
        temp = train[train.store_item == item]
        # statsmodels needs an explicit frequency on the index to predict by date.
        temp.index.freq = "D"
        try:
            fitted = ExponentialSmoothing(
                temp.sales,
                seasonal_periods=365,
                trend="add",
                seasonal=seas,
                use_boxcox=True,
                initialization_method="estimated",
            ).fit()
            preds = fitted.predict(start=start, end=end)
        except ValueError:
            print(item)
            # Dropping the series would shift every later series onto the
            # wrong rows; keep its slot and mark the values as missing.
            preds = np.full(horizon, np.nan)
        exp_smooth_preds.append(preds)

    return [x for sub in exp_smooth_preds for x in sub]

In [13]:
# predictions["exp_smooth"] = exp_smooth_predictor(seas="add")
# predictions["exp_smooth_multi"] = exp_smooth_predictor()
# exp_smooth_preds = predictions[["exp_smooth", "exp_smooth_multi"]]
# exp_smooth_preds.to_csv("../data/predictions/exp_smooth_preds.csv")

100%|██████████| 498/498 [13:35<00:00,  1.64s/it]
100%|██████████| 498/498 [1:05:39<00:00,  7.91s/it]


In [6]:
# Reload the cached exponential-smoothing forecasts instead of refitting
# (the two fits together took well over an hour).
# Requires `predictions` from the AutoReg section to exist already.
exp_smooth_preds = pd.read_csv("../data/predictions/exp_smooth_preds.csv")

# The CSV comes back with a fresh 0..n-1 index, while `predictions` is indexed
# by forecast date. Assigning the frame directly would align on index labels
# and fill both columns with NaN, so copy the values positionally instead.
for column in ("exp_smooth", "exp_smooth_multi"):
    predictions[column] = exp_smooth_preds[column].to_numpy()

NameError: name 'predictions' is not defined

# ardl autoregressive distributed lag

In [22]:
from statsmodels.tsa.api import ARDL

In [24]:
# Fit an ARDL model per store-item series and forecast the test window.
ardl_preds = []

for item in train.store_item.unique():
    temp = train[train.store_item == item]
    temp.index.freq = "d"

    fitted = ARDL(temp.sales, 365, period=365, trend="t").fit()
    ardl_preds.append(fitted.predict(start=test.index[0], end=test.index[-1]))

# Flatten the per-series forecasts into one column of the results table.
predictions["ardl"] = [value for yhat in ardl_preds for value in yhat]

100%|██████████| 498/498 [02:35<00:00,  3.21it/s]


# prophet model

In [25]:
from prophet import Prophet

Importing plotly failed. Interactive plots will not work.


In [26]:
# https://www.youtube.com/watch?v=pOYAXv15r3A
# Prophet instance with default settings. NOTE(review): the per-series loop
# further down rebinds `model` before any fit, so this instance looks unused.
model = Prophet()


In [27]:
# Time span between the first and last index entries of `test`; the Prophet
# loop forecasts `forecast_h.days + 1` periods from it.
forecast_h = test.index[-1] - test.index[0]

In [28]:
# Fit a separate Prophet model per store-item series and keep its `yhat` column.
prophet_preds = []
for item in train.store_item.unique():
    # Build the Prophet input frame: index back to a column, series label in
    # `unique_id`, and sales/date renamed to `y`/`ds`.
    tester = (
        train[train.store_item == item]
        .reset_index()
        .assign(unique_id=item)
        .drop("store_item", axis=1)
        .rename(columns={"sales": "y", "date": "ds"})
    )

    model = Prophet(daily_seasonality=True)
    model.fit(tester)
    future = model.make_future_dataframe(periods=forecast_h.days + 1, include_history=False)
    prophet_preds.append(model.predict(future).yhat)


Initial log joint probability = -23.8441
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      94       2304.99    0.00059656       212.127    4.31e-06       0.001      145  LS failed, Hessian reset 
      99       2305.17   0.000825136       89.4793      0.9746      0.9746      150   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     193        2305.9    0.00037808       161.031   2.206e-06       0.001      316  LS failed, Hessian reset 
     199       2306.19    0.00069264       73.7016      0.3052           1      323   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     243       2306.29   4.25779e-05       86.2633   5.088e-07       0.001      415  LS failed, Hessian reset 
     271       2306.29   1.51602e-07       79.5501           1           1      452   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is

In [29]:
# Flatten the per-series Prophet forecasts into one column of the results table.
predictions["prophet"] = [value for yhat in prophet_preds for value in yhat]

# xgboost

In [31]:
from xgboost import XGBRegressor
# pip install xgboost==0.80
# the latest version kept crashing on me

In [32]:
def create_features(df):
    """Build calendar features from a frame's datetime index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetimes. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Frame on the same index with the integer columns ``dayofweek``,
        ``quarter``, ``month``, ``year``, ``dayofyear``, ``dayofmonth`` and
        ``weekofyear`` (ISO week number), in that order.
    """
    # Work on a copy: callers pass filtered slices of train/test, and adding
    # columns to those in place raises SettingWithCopyWarning and changes the
    # caller's frame as a side effect.
    df = df.copy()
    df['date'] = df.index
    stamps = df['date'].dt

    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week is the same ISO week number; cast its nullable UInt32
    # to a plain integer dtype like the other feature columns.
    df['weekofyear'] = stamps.isocalendar().week.astype('int64')

    X = df[['dayofweek', 'quarter', 'month', 'year',
            'dayofyear', 'dayofmonth', 'weekofyear']]

    return X

In [33]:
# A single regressor instance is reused; `fit` is called again for every series.
reg = XGBRegressor(n_estimators=1000)

xgb_preds = []
for item in tqdm(train.store_item.unique()):
    tmp = train[train.store_item == item]
    # Select this series' test rows. The earlier `test[test == item]` compared
    # every cell of the frame with the label, which keeps all rows of `test`
    # (every series) and blanks the non-matching cells instead of filtering.
    # Using the test dates is not leakage: calendar features are known ahead.
    test_df = test[test.store_item == item]

    feat_frame = create_features(tmp)
    tester = create_features(test_df)

    reg.fit(feat_frame, tmp.sales)
    # `test_df` now holds only this series, so every prediction belongs to it
    # and the former tail slice over the forecast horizon is not needed.
    xgb_preds.append(reg.predict(tester))

100%|██████████| 498/498 [49:49<00:00,  6.00s/it]


In [34]:
# Flatten the per-series XGBoost forecasts into one column of the results table.
predictions["xgb_preds"] = [value for series_preds in xgb_preds for value in series_preds]

In [36]:
# Stack the training rows and the forecast table row-wise; the plotting helper
# further down reads `sales` and the forecast columns from this combined frame.
store_sales = pd.concat([train, predictions])

In [8]:
from datetime import date

# Run date; its month and day are used to stamp the predictions file name.
today = date.today()

In [9]:
# Persist all model forecasts. The file name carries only month and day, so a
# later run on the same calendar day (in any year) overwrites the earlier file.
predictions.to_csv(f"../data/predictions/predictions-{today.month}-{today.day}.csv")

# plot results

In [12]:
import matplotlib.pyplot as plt

In [13]:

def fcastplotter(x):
    """Draw one figure per store-item series comparing sales with a forecast.

    Reads the notebook-level ``store_sales`` frame.

    Parameters
    ----------
    x
        Label of the forecast column in ``store_sales`` to plot against ``sales``.
    """
    for store in store_sales.store_item.unique():
        rows = store_sales[store_sales.store_item == store]
        _, ax = plt.subplots()
        ax.plot(rows.sales, "-b", label="sales")
        ax.plot(rows[x], "-r", label=f"Forecast {x} ")
        ax.legend(loc="upper left")
        # Anchor the y-axis at zero; the upper limit stays automatic.
        ax.set_ylim([0, None])
        ax.set_title(f"store {store}")
        plt.show()

In [14]:
# Default figure size (inches) and resolution for the plots in this notebook.
plt.rcParams['figure.figsize'] = (9, 3)
plt.rcParams['figure.dpi'] = 120

# for forecast in store_sales.columns[2:]:
#     fcastplotter(forecast)


In [None]:
from beepy import beep
# Last cell: signal that the run reached the end.
# NOTE(review): presumably an audible notification; confirm against beepy's docs.
beep()