# Lecture 23           
                                            
## Forecasting a time-series object         
   - Data munging with time-series (ts)
   - Descriptive graphs for ts         
     - analyzing different periods     
       to understand seasonality       
   - Sample splitting with ts          
   - Simple time-series models with:   
     - deterministic trend/seasonality 
   - Cross-validation with time-series 
   - prophet package                   
   - Forecasting                       
     - comparing model based on        
       forecasting performance (RMSE)  
     - graphical representation             

Case-studies:

   - CH18A Forecasting daily ticket sales for a swimming pool   
                                             
Data used:

    swim-transactions                  

___

In [None]:
import pandas as pd
import numpy as np
import warnings
import sys
import os

import pandas_market_calendars as mcal
from datetime import datetime
from plotnine import *
from mizani.formatters import date_format
from patsy import dmatrices
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# NOTE(review): this silences every warning category for the rest of the notebook,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

### Read data

In [None]:
# Download the swim-transactions table; "date" is parsed to datetime so that the
# `.dt` accessor can be used on it in the following cells.
daily_agg = pd.read_csv("https://osf.io/5qyfv/download", parse_dates=["date"])

In [None]:
# Preview the first rows to check the columns and the parsed dates.
daily_agg.head()

Add multiple time variables:
  - year, quarter, month and day
  - add weekdays and weekend

In [None]:
# Calendar components derived from the parsed "date" column.
date_parts = daily_agg["date"].dt

daily_agg["year"] = date_parts.year
daily_agg["quarter"] = date_parts.quarter
daily_agg["month"] = date_parts.month
daily_agg["day"] = date_parts.day

# pandas numbers Monday as 0; shift so that Monday = 1, ..., Sunday = 7.
daily_agg["dow"] = date_parts.dayofweek + 1

# Saturday (6) and Sunday (7) form the weekend.
daily_agg["weekend"] = daily_agg["dow"].isin([6, 7])

School-off days: defined by specific date ranges (domain knowledge). School calendars differ across US states, so the ranges below must match the state the pool is located in.

In [None]:
# School-off windows, each expressed as a (month, day-of-month) condition.
month_num = daily_agg["month"]
day_num = daily_agg["day"]

late_may = (month_num == 5) & day_num.between(16, 30)  # May 16 - May 30
june_july = month_num.isin([6, 7])  # all of June and July
early_august = (month_num == 8) & (day_num <= 14)  # August 1 - August 14
late_december = (month_num == 12) & (day_num >= 21)  # December 21 - December 31

daily_agg["school_off"] = late_may | june_july | early_august | late_december

Add a trend variable (1 to number of observations)

In [None]:
# Linear time trend 1..N built from the row index. This relies on daily_agg still having
# the default 0-based index from read_csv; presumably rows are in chronological
# order — TODO confirm.
daily_agg["trend"] = daily_agg.index + 1

Get holiday calendar

In [None]:
# NYSE trading calendar from pandas_market_calendars.
# NOTE(review): exchange holidays are used here as a stand-in for public holidays.
nyse = mcal.get_calendar("NYSE")

# Dates on which the exchange is closed.
holidays = nyse.holidays().holidays

# True for every observation whose date appears in the holiday list.
daily_agg["isHoliday"] = daily_agg["date"].isin(holidays)

In [None]:
# Summary statistics of the columns created so far.
daily_agg.describe()

Alternative summary with skimpy

In [None]:
from skimpy import skim

# Richer per-column summary of the same frame, rendered by skimpy.
skim(daily_agg)

### Define vars for analysis

Add a monthly average quantity sold

In [None]:
# Mean QUANTITY per calendar month (pooled over all years), broadcast back to every row
# of that month via transform.
daily_agg["q_month"] = daily_agg.groupby("month")["QUANTITY"].transform("mean")

Create log quantity: values below 1 are first set to 1, so that the logarithm is defined for every day

In [None]:
# Floor daily sales at 1 so that the logarithm below is always defined.
daily_agg["QUANTITY2"] = daily_agg["QUANTITY"].clip(lower=1)

# Natural log of the floored quantity.
daily_agg["q_ln"] = np.log(daily_agg["QUANTITY2"])

Create the tickets variable: the average quantity sold for each month × day-of-the-week combination (in levels and in logs)

In [None]:
# Cell means over every (month, day-of-week) combination, broadcast back to the rows.
month_dow_keys = ["month", "dow"]

# Average sales in levels ...
daily_agg["tickets"] = daily_agg.groupby(month_dow_keys)["QUANTITY"].transform("mean")

# ... and the average of log sales.
daily_agg["tickets_ln"] = daily_agg.groupby(month_dow_keys)["q_ln"].transform("mean")

Named date vars for graphs

In [None]:
# Three-letter labels (e.g. "Mon", "Jan") used on the plot axes.
date_accessor = daily_agg["date"].dt

daily_agg["dow_abb"] = date_accessor.day_name().str.slice(0, 3)

daily_agg["month_abb"] = date_accessor.month_name().str.slice(0, 3)

## Descriptive graphs

Check:
1. within year pattern
2. Across years pattern
3. Across months
4. Across days
5. Heatmap to have an idea across month and daily pattern

Daily ticket sales 2015

In [None]:
(
    ggplot(
        # observations of 2015 only
        daily_agg[daily_agg["year"] == 2015].reset_index(drop=True),
        aes(x="date", y="QUANTITY"),
    )
    + geom_line(size=0.4)
    + scale_x_date(
        # major ticks at the start of each quarter of 2015 plus the first day of 2016
        breaks=[f"2015-{month:02d}-01" for month in (1, 4, 7, 10)] + ["2016-01-01"],
        labels=date_format("%d%b%Y"),
        date_minor_breaks="1 month",
    )
    + labs(x="Date (day)", y="Daily ticket sales")
    + theme_bw()
)

 Daily ticket sales 2010 - 2014

In [None]:
(
    ggplot(
        # observations from 2010 through 2014 (both ends included)
        daily_agg[daily_agg["year"].between(2010, 2014)],
        aes(x="date", y="QUANTITY"),
    )
    + geom_line(size=0.2)
    + scale_x_date(
        # one major tick at the start of each year, 2010 ... 2015
        breaks=[f"{year}-01-01" for year in range(2010, 2016)],
        labels=date_format("%d%b%Y"),
        date_minor_breaks="3 months",
    )
    + labs(x="Date (day)", y="Daily ticket sales")
    + theme_bw()
)

 Monthly box-plots for ticket sales

In [None]:
(
    # month labels are ordered by the numeric month rather than alphabetically
    ggplot(daily_agg, aes("reorder(month_abb,month)", "QUANTITY"))
    + geom_boxplot(size=0.8, outlier_color="yellow", outlier_alpha=0.6, outlier_stroke=0.4)
    + labs(x="Date (month)", y="Daily ticket sales")
    + theme_bw()
)

 Daily box-plots for ticket sales

In [None]:
(
    # weekday labels are ordered by the numeric day of week (Monday first)
    ggplot(daily_agg, aes("reorder(dow_abb,dow)", "QUANTITY"))
    + geom_boxplot(size=0.8, outlier_color="yellow", outlier_alpha=0.6, outlier_stroke=0.4)
    + labs(x="Day of the week", y="Daily ticket sales")
    + theme_bw()
)

To check for interactions between months and days of the week, look at the heatmap of average ticket sales

In [None]:
# Heatmap of average ticket sales: one tile per (day of week, month) cell.
swim_heatmap = (
    ggplot(
        daily_agg,
        aes("reorder(dow_abb,dow)", "reorder(month_abb,month)", fill="tickets"),
    )
    + geom_tile(colour="white")
    # reversed colour scale
    + scale_fill_cmap(trans="reverse")
    + theme_bw()
    + theme(
        legend_position="right",
        legend_title=element_text(size=10),
        legend_text=element_text(size=10),
    )
    + labs(x="Day of the week", y="Month")
)
swim_heatmap

Same but with log sales

In [None]:
# Same heatmap, filled with the average of log ticket sales.
swim_heatmap_log = (
    ggplot(
        daily_agg,
        aes("reorder(dow_abb,dow)", "reorder(month_abb,month)", fill="tickets_ln"),
    )
    + geom_tile(colour="white")
    # reversed colour scale
    + scale_fill_cmap(trans="reverse")
    + theme_bw()
    + theme(
        legend_position="right",
        legend_title=element_text(size=10),
        legend_text=element_text(size=10),
    )
    + labs(x="Day of the week", y="Month")
)
swim_heatmap_log

## Prediction

### Create train/holdout data

Create factor variables

In [None]:
# Columns that enter the regressions as categorical (dummy-coded) variables.
factor_cols = ["month", "dow", "isHoliday", "school_off"]
daily_agg = daily_agg.astype({col: "category" for col in factor_cols})


 Last year of data

In [None]:
# Hold-out sample: all observations of 2016. The explicit copy makes this an independent
# frame, so the prediction column added to it later is not written into a slice of daily_agg.
data_holdout = daily_agg.loc[daily_agg["year"] == 2016, :].copy()

 Rest of data for training

In [None]:
# Training sample: every observation before 2016.
data_train = daily_agg[daily_agg["year"] < 2016]

In [None]:
# Check the last rows of the training sample (they should all be dated before 2016).
data_train.tail()

We do cross-validation within the training sample: in each CV fold, the algorithm leaves out one full year of data during training and evaluates on that year.

In [None]:
# Fold label of each training row: its calendar year.
groups = data_train["year"].to_numpy()

# Splitter that holds out one group (here: one year) per fold.
logo = LeaveOneGroupOut()

In [None]:
# Distinct years in the training sample = the held-out folds of the cross-validation.
data_train.loc[:,'year'].unique()

Use OLS

In [None]:
# No intercept is fitted by scikit-learn: every model formula below contains an explicit "1",
# so the patsy design matrix already carries the intercept column.
lin_reg = LinearRegression(fit_intercept=False)

In [None]:
def fit_cv_model_get_rmse(y, X, groups):
    """
    Cross-validated RMSE of the linear model for time-series data.

    Uses the notebook-level splitter ``logo`` and estimator ``lin_reg``: for every
    split produced by ``logo.split(X, y, groups)`` the model is re-fitted on the
    training rows and evaluated on the held-out rows.

    Parameters
    ----------
    y : array-like
        Target values, indexable with integer index arrays.
    X : array-like
        Design matrix with one row per observation, indexable with integer index arrays.
    groups : array-like
        Group label of every observation; passed to ``logo.split``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    rmse_folds = []
    for train_index, test_index in logo.split(X, y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        lin_reg.fit(X_train, y_train)
        y_hat = lin_reg.predict(X_test)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
        rmse_folds.append(np.sqrt(mean_squared_error(y_test, y_hat)))

    return np.mean(rmse_folds)

### Fit models: here simple OLS which is applicable to TS data

Model 1 linear trend + monthly seasonality

In [None]:
# Target vector and design matrix for Model 1, built from the training sample.
y, X = dmatrices("QUANTITY ~ 1+ trend + month", data=data_train)

# Cross-validated RMSE with one held-out year per fold.
rmse_reg1 = fit_cv_model_get_rmse(y=y, X=X, groups=groups)

Model 2 linear trend + monthly seasonality + days of week seasonality 

In [None]:
# Target vector and design matrix for Model 2, built from the training sample.
y, X = dmatrices("QUANTITY ~ 1+ trend + month + dow", data=data_train)

# Cross-validated RMSE with one held-out year per fold.
rmse_reg2 = fit_cv_model_get_rmse(y=y, X=X, groups=groups)

Model 3 linear trend + monthly seasonality + days of week  seasonality + holidays 

In [None]:
# Target vector and design matrix for Model 3, built from the training sample.
y, X = dmatrices("QUANTITY ~ 1 + trend + month + dow + isHoliday", data=data_train)

# Cross-validated RMSE with one held-out year per fold.
rmse_reg3 = fit_cv_model_get_rmse(y=y, X=X, groups=groups)

Model 4 linear trend + monthly seasonality + days of week  seasonality + holidays + sch*dow

In [None]:
# Target vector and design matrix for Model 4 (adds the school_off x dow interaction).
y, X = dmatrices(
    "QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow",
    data=data_train,
)

# Cross-validated RMSE with one held-out year per fold.
rmse_reg4 = fit_cv_model_get_rmse(y=y, X=X, groups=groups)

Model 5 linear trend + monthly seasonality + days of week  seasonality + holidays + interactions

In [None]:
# Target vector and design matrix for Model 5 (adds the weekend x month interaction).
y, X = dmatrices(
    "QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow+ weekend*month",
    data=data_train,
)

# Cross-validated RMSE with one held-out year per fold.
rmse_reg5 = fit_cv_model_get_rmse(y=y, X=X, groups=groups)

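Model 6: log quantity on linear trend + monthly seasonality + days of week seasonality + sch*dow

For the log model, predictions have to be transformed back to levels with the adjustment term, $\hat{y} = \exp(\widehat{\ln y} + \hat{\sigma}^2/2)$, before computing the RMSE, so that it is comparable with the RMSE of the level models.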

In [None]:
# Model 6: log ticket sales. Predictions are converted back to levels before the RMSE is
# computed, so the result is comparable with the level models above.
y, X = dmatrices(
    "q_ln ~ 1 + trend + month + dow +school_off*dow", data_train
)

rmse_folds = []
for train_index, test_index in logo.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    lin_reg.fit(X_train, y_train)

    # Log-correction term: mean squared residual of the fitted model. It is estimated on the
    # training rows of the fold only, so that no held-out observation leaks into the
    # prediction that is evaluated on the held-out year.
    corrb = mean_squared_error(y_train, lin_reg.predict(X_train))

    # Back-transform the log prediction to levels, including the correction term.
    y_hat = np.exp(lin_reg.predict(X_test) + corrb / 2)

    # Explicit square root: the `squared=False` argument of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn.
    rmse_folds.append(np.sqrt(mean_squared_error(np.exp(y_test), y_hat)))

rmse_reg6 = np.mean(rmse_folds)
rmse_reg6

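Use Prophet for prediction.

Cross-validation is also added for Prophet; it can be done with Prophet's own diagnostics tools: https://facebook.github.io/prophet/docs/diagnostics.html

Note that this is a different cross-validation from the one used for the other models, because Prophet's version must be time-series like (the model is trained on data up to a cutoff and evaluated on the horizon after it).

Prophet with the multiplicative seasonality option was also tried, but it produced much worse results (RMSE of about 34).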


In [None]:
from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

In [None]:
# Additive Prophet model with a linear trend; US public holidays are registered
# as holiday effects on the same model object.
model_prophet = Prophet(
    growth="linear",
    seasonality_mode="additive",
    yearly_seasonality="auto",
    weekly_seasonality="auto",
    daily_seasonality=True,
).add_country_holidays(country_name="US")

In [None]:
# Prophet expects the date column to be called "ds" and the target "y".
model_prophet = model_prophet.fit(
    df=data_train[["date", "QUANTITY"]].rename(columns={"date": "ds", "QUANTITY": "y"})
)

In [None]:
# Prophet's time-series cross-validation: 365 days of initial training data,
# cutoffs spaced 365 days apart, and a 365-day forecast horizon after each cutoff.
cv_pred = cross_validation(
    model_prophet, initial="365 days", period="365 days", horizon="365 days"
)

In [None]:
# RMSE of the Prophet CV predictions; the first (with rolling_window=1 presumably the
# only — TODO confirm) row of the metrics table is taken.
rmse_prophet_cv = performance_metrics(cv_pred,rolling_window = 1)["rmse"].values[0]

In [None]:
# Note: M6 log model rmse is different from book
# Cross-validated RMSE of all seven models, one row per model.
pd.DataFrame(
    {
        "RMSE": [
            rmse_reg1,
            rmse_reg2,
            rmse_reg3,
            rmse_reg4,
            rmse_reg5,
            rmse_reg6,
            rmse_prophet_cv,
        ]
    },
    index=[f"M{i}" for i in range(1, 6)] + ["M6 (log)", "M7 (Prophet)"],
).round(2)

## Evaluate best model on holdout set

In [None]:
# Re-fit the best model (Model 5) on the whole training sample and predict the hold-out year.
lin_reg = LinearRegression(fit_intercept=False)

# Single definition of the formula, so training and hold-out matrices cannot drift apart.
formula_best = (
    "QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow+ weekend*month"
)

y, X = dmatrices(formula_best, data_train)

lin_reg.fit(X, y)

_, X_holdout = dmatrices(formula_best, data_holdout)

# The hold-out design matrix is built independently of the training one; the coefficients
# only line up if both matrices have exactly the same columns in the same order.
assert (
    X_holdout.design_info.column_names == X.design_info.column_names
), "hold-out design matrix columns differ from the training design matrix"

# `assign` returns a new frame instead of writing a column into a slice of daily_agg;
# ravel flattens the (n, 1) prediction array to one value per row.
data_holdout = data_holdout.assign(y_hat_5=np.ravel(lin_reg.predict(X_holdout)))

In [None]:
# Hold-out RMSE of the best model. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and no longer accepted
# by recent scikit-learn.
rmse_holdout_best = np.sqrt(
    mean_squared_error(data_holdout["QUANTITY"], data_holdout["y_hat_5"])
)
rmse_holdout_best

### Plot best predictions

Graph relative RMSE (on holdout) per month

In [None]:
# Hold-out observations in date order, grouped by calendar month.
group = data_holdout.sort_values(by=["date"]).groupby("month")

# RMSE of the Model 5 predictions within each month. It is computed once and reused below;
# the explicit square root replaces the `squared=False` argument of mean_squared_error,
# which is deprecated and no longer accepted by recent scikit-learn.
rmse_by_month = group.apply(
    lambda x: np.sqrt(mean_squared_error(x["QUANTITY"], x["y_hat_5"]))
)

rmse_monthly = pd.DataFrame(
    {
        # first date of each month, used as the x position in the bar chart
        "date": group["date"].first(),
        "RMSE": rmse_by_month,
        # RMSE relative to the average daily sales of the same month
        "RMSE_norm": rmse_by_month / group["QUANTITY"].mean(),
    }
).reset_index()
rmse_monthly

### Figure 18.7 b)

In [None]:
(
    # one bar per month: RMSE relative to the month's average sales
    ggplot(rmse_monthly, aes("date", "RMSE_norm"))
    + geom_col(fill="red", color="red")
    + scale_x_date(breaks="1 month", labels=date_format("%b"), expand=(0.02, 0.02))
    + labs(x="Date (month)", y="RMSE (normalized by monthly sales)")
    + theme_bw()
)

Create a long dataframe from the holdout predictions for nice plotting with plotnine

In [None]:
# Long format: one row per (date, series), where "variable" names the series
# (QUANTITY or y_hat_5) and "value" holds its number.
holdout_long = data_holdout[["date", "month", "QUANTITY", "y_hat_5"]].melt(
    id_vars=["date", "month"]
)

# Attach the actual and the predicted value of the same date to every row; they serve as
# the lower/upper bounds of the ribbon in the plots below.
plotdata = (
    holdout_long.merge(data_holdout[["date", "QUANTITY"]], on="date")
    .merge(data_holdout[["date", "y_hat_5"]], on="date")
    .rename(columns={"QUANTITY": "ymin", "y_hat_5": "ymax"})
)

### Figure 18.6 – Actual vs. predicted daily ticket sales on the hold-out sample (2016)

In [None]:
(
    # actual and predicted daily sales over the whole hold-out year
    ggplot(plotdata, aes(x="date", y="value", color="variable", linetype="variable"))
    + geom_line(size=0.8)
    + scale_y_continuous(expand=(0, 0))
    + scale_x_date(
        expand=(0, 0),
        breaks=[
            "2016-01-01",
            "2016-03-01",
            "2016-05-01",
            "2016-07-01",
            "2016-09-01",
            "2016-11-01",
            "2017-01-01",
        ],
        labels=date_format("%d%b%Y"),
        date_minor_breaks="1 month",
    )
    + labs(x="Date (day)", y="Daily ticket sales")
    + scale_color_manual(
        name=" ", values=("red","blue"), labels=("Actual", "Predicted")
    )
    # Only one linetype scale is added: a second scale for the same aesthetic would
    # simply replace the first one.
    + scale_linetype_manual(
        name=" ", values=("-", "--"), labels=("Actual", "Predicted")
    )
    + theme_bw()
    + theme(legend_position=(0.74, 0.77), legend_direction="horizontal")
)

### Figure 18.7 a) – Prediction on hold-out sample

In [None]:
(
    ggplot(
        # August of the hold-out year only
        plotdata[plotdata["month"] == 8].reset_index(),
        aes(x="date", y="value", color="variable", linetype="variable"),
    )
    + geom_line(size=1)
    # shaded band between the actual and the predicted value of each day
    + geom_ribbon(
        aes(ymin="ymin", ymax="ymax"),
        fill="yellow",
        color=None,
        alpha=0.2,
        show_legend=False,
    )
    + scale_y_continuous(limits=(0, 150), expand=(0.01, 0.01))
    + scale_x_date(
        # weekly ticks: 1, 8, 15, 22 and 29 August 2016
        breaks=[f"2016-08-{day:02d}" for day in range(1, 30, 7)],
        labels=date_format("%d%b"),
        expand=(0.01, 0.01),
    )
    + scale_color_manual(name=" ", values=("red", "blue"), labels=("Actual", "Predicted"))
    + scale_linetype_manual(name=" ", values=("-", "--"), labels=("Actual", "Predicted"))
    + labs(x="Date (day)", y="Daily ticket sales")
    + theme_bw()
    + theme(legend_position=(0.74, 0.77), legend_direction="horizontal")
)