# Methods validation: Autoregression with seasonality

Using synthetic data with known properties, we verify how the parameters of various `statsmodels.tsa` and `statsmodels.formula.api` (`smf.ols`) methods behave.

In [None]:
from pathlib import Path
import datetime as dt
from typing import Union, Tuple, List
from pprint import pprint
from math import ceil
import pandas as pd
import numpy as np
import altair as alt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tools import eval_measures
from jinja2 import Template

In [None]:
parent_path = Path('.').resolve().parent  # parent of the current working directory (one level above Path.cwd())

## Plotting functions

In [None]:
def altair_axis_encoding(source: pd.DataFrame,
                         z_: str) -> str:
    """
    Provides Altair with the proper encoding for an axis variable in shorthand.
    Consider refactoring using the long-form alt.X('name', type='quantitative'), etc.

    :param source: tabular data set containing a column to be plotted
    :param z_: column to be plotted
    :returns: ``'<column>:Q'`` for integer or float columns, ``'<column>:T'``
        for datetime or period columns
    :raises TypeError: if the column is neither numeric nor datetime-like
    """
    col_dtype = source[z_].dtype

    # Any integer/float width counts as a continuous real-valued quantity.
    if (pd.api.types.is_integer_dtype(col_dtype)
            or pd.api.types.is_float_dtype(col_dtype)):
        return z_ + ':Q'

    # Accept every datetime64 resolution (pandas 2 keeps second resolution when
    # a frame is built from datetime64[D]) as well as period columns.
    if (pd.api.types.is_datetime64_any_dtype(col_dtype)
            or isinstance(col_dtype, pd.PeriodDtype)):
        return z_ + ':T'

    # Fail loudly instead of printing and then hitting an unbound local.
    if col_dtype == 'object':
        raise TypeError(
            f'Column {z_} is currently of dtype object; please convert to either int, float, or datetime.')
    raise TypeError(
        f'Column {z_} is not correctly formatted; please convert to either int, float, or datetime.')


def altair_ts_scatter(source: pd.DataFrame,
                      x_: str,
                      y_: str,
                      tooltip_fld: str,
                      categorical_colors: bool = False,
                      _cat: str = 'darkblue',
                      x_title: str = '',
                      y_title: str = '',
                      _zero: bool = False,
                      _title: str = '',
                      h_w: Tuple[int, int] = (200, 400)):
    """
    Plots one column against another as semi-transparent filled circles.

    :param source: tabular data set containing columns to be plotted
    :param x_: column name containing datetimes or integer denoting day of week or year, week or month, etc.;
        its encoding type is chosen by altair_axis_encoding
    :param y_: transaction amount, volume, or other metric; always encoded as quantitative
    :param tooltip_fld: column shown in the tooltip; its encoding type is chosen by altair_axis_encoding
    :param categorical_colors: if True, ``_cat`` names a column that is encoded as nominal to color-code the
        symbols; if False, ``_cat`` is a single color applied to every symbol
    :param _cat: categorical field used to color-code symbols, or a default single color
    :param x_title: horizontal axis title
    :param y_title: vertical axis title
    :param _zero: whether to scale the vertical axis from zero (True) or on the basis of the range of values (False)
    :param _title: chart title
    :param h_w: chart (height, width)
    :returns: altair chart object. With ``categorical_colors=True`` the chart also carries a view
        configuration (``strokeWidth=0``); NOTE(review): Altair is expected to reject configured charts
        inside layered or concatenated charts, so that variant is for stand-alone display.
    """
    # Channels shared by both color modes.
    x_channel = alt.X(altair_axis_encoding(source, x_), title=x_title,
                      axis=alt.Axis(grid=False,
                                    ticks=True,
                                    )
                      )
    y_channel = alt.Y(y_ + ':Q', title=y_title, scale=alt.Scale(zero=_zero),
                      axis=alt.Axis(grid=True,
                                    ticks=True,
                                    domain=False  # axis line
                                    )
                      )
    tooltip_channel = altair_axis_encoding(source, tooltip_fld)

    if categorical_colors:
        # Color comes from a nominal field in the data.
        chart = alt.Chart(source).mark_circle(opacity=0.6).encode(
            x=x_channel,
            y=y_channel,
            color=_cat + ':N',
            tooltip=tooltip_channel,
        )
    else:
        # One fixed color for every symbol.
        chart = alt.Chart(source).mark_circle(opacity=0.6, color=_cat).encode(
            x=x_channel,
            y=y_channel,
            tooltip=tooltip_channel,
        )

    chart = chart.properties(
        title=_title,
        height=h_w[0],
        width=h_w[1],
    )
    if categorical_colors:
        chart = chart.configure_view(strokeWidth=0)
    return chart


def altair_ts_line(source: pd.DataFrame,
                   x_: str,
                   y_: str,
                   _color: str = 'palevioletred',
                   x_title: str = '',
                   y_title: str = '',
                   _zero: bool = False,
                   _title: str = '',
                   h_w: Tuple[int, int] = (200, 400)):
    """
    Plots one column against another as a single-color line.

    :param source: tabular data set containing columns to be plotted
    :param x_: column name containing datetimes or integer denoting day of week or year, week or month, etc.;
        its encoding type is chosen by altair_axis_encoding
    :param y_: transaction amount, volume, or other metric; always encoded as quantitative
    :param _color: line color (note: this is the fourth positional parameter, not a tooltip field)
    :param x_title: horizontal axis title
    :param y_title: vertical axis title
    :param _zero: whether to scale the vertical axis from zero (True) or on the basis of the range of values (False)
    :param _title: chart title
    :param h_w: chart (height, width)
    :returns: altair chart object
    """
    x_channel = alt.X(altair_axis_encoding(source, x_), title=x_title,
                      axis=alt.Axis(grid=False,
                                    ticks=True,
                                    )
                      )
    y_channel = alt.Y(y_ + ':Q', title=y_title, scale=alt.Scale(zero=_zero),
                      axis=alt.Axis(grid=True,
                                    ticks=True,
                                    domain=False  # axis line
                                    )
                      )
    return (
        alt.Chart(source)
        .mark_line(color=_color)
        .encode(x=x_channel, y=y_channel)
        .properties(title=_title, height=h_w[0], width=h_w[1])
    )

## Create artificial data with known patterns

Create one year of daily timestamps and initialize the observations with random numbers $\in [0, 1)$.

In [None]:
def artifice(day_one: str = '2022-01-01',
             day_after_end: str = '2023-01-01',
            ) -> pd.DataFrame:
    """Build a synthetic daily series of seeded random numbers.

    One row is produced for every calendar day in the half-open interval
    [day_one, day_after_end), and a one-line summary of the span is printed.

    :param day_one: first day to generate (inclusive)
    :param day_after_end: day following the last one generated (exclusive)
    :returns: DataFrame with columns 't' (the days) and 'y' (random floats in
        [0, 1) drawn from a RandomState seeded with 1216, so every call
        with the same span repeats the same values)
    """
    calendar = np.arange(day_one, day_after_end, dtype='datetime64[D]')
    n_days = len(calendar)
    first_day, last_day = calendar.min(), calendar.max()
    print(f'There are {n_days} days in the period starting {first_day} and ending {last_day}.')

    # Fixed seed keeps the notebook reproducible.
    draws = np.random.RandomState(1216).random(n_days)
    return pd.DataFrame({'t': calendar, 'y': draws})


def week_of_month(stamp: dt.date):
    """
    Return the 1-based week of the month in which a date falls.

    Weeks are counted as Monday-to-Sunday rows of a calendar page, so the
    week containing the 1st is week 1 even when it is a partial week.

    Parameters
    ----------
    stamp : date-like object providing ``.day``, ``.replace(day=...)`` and
        ``.weekday()``
    """
    # Weekday (Monday=0) of the 1st tells how many cells of the first
    # calendar row are empty before the month starts.
    leading_blanks = stamp.replace(day=1).weekday()
    calendar_cell = stamp.day + leading_blanks

    # Ceiling division by 7 using integer arithmetic.
    return -(-calendar_cell // 7)


def seq_ordinal(data: pd.DataFrame,
                date_col: 'str' = 't',
               ):
    """
    Tag every row with ordinal calendar features derived from a datetime column.

    Two columns are set on ``data`` itself (the frame is modified in place):
    'dow', the weekday number from the pandas ``.dt`` accessor, and
    'wom', the value of ``week_of_month`` for each date.

    :param data: frame holding the datetime column
    :param date_col: name of the datetime column
    :returns: the same ``data`` object, now carrying 'dow' and 'wom'
    """
    stamps = data[date_col]
    weekday_numbers = stamps.dt.dayofweek
    data.loc[:, 'dow'] = weekday_numbers

    month_weeks = stamps.apply(week_of_month)
    data.loc[:, 'wom'] = month_weeks
    return data


def signal_boost(data,
                 level1_to_boost: int = 4,  # Friday
                 # level2_to_boost: Tuple = (2, 4),  # last __ of the month
                 boost: float = 2.,
                 every_other_week: bool = False,
                ):
    """
    Add a constant to the 'y' column on a chosen day of the week.

    ``data`` is modified in place and must contain 'y' and 'dow' columns
    (plus 'wom' when ``every_other_week`` is True).

    :param data: frame to modify
    :param level1_to_boost: value of 'dow' whose rows receive the boost
    :param boost: amount added to 'y' on the selected rows
    :param every_other_week: if True, only rows whose 'wom' is even are boosted
    :returns: the same ``data`` object with 'y' updated
    """
    selected = data.dow == level1_to_boost
    if every_other_week:
        # Narrow the selection to even-numbered weeks of the month.
        selected = selected & (data.wom % 2 == 0)

    increment = np.where(selected, boost, 0)
    data.y = data.y + increment
    return data

In [None]:
ts = artifice()
print(ts[0:4])

In [None]:
def run_autoreg(data: pd.DataFrame,
                y: str,
                lags_: int,
                seasonal_: bool,
                period_: int = 0,
                plot_fit: bool = True,
                exhibit_no: str = '',
                ):
    """
    Fit a statsmodels ``AutoReg`` model (``trend='t'``) to one column, compute the
    in-sample mean absolute error and optionally overlay the fit on the observations.

    The input frame is left untouched: the function works on a copy, to which a
    'y_hat' column with the in-sample predictions is added. Each chart therefore
    keeps its own snapshot of the predictions instead of sharing one frame whose
    'y_hat' is overwritten by the next call.

    :param data: frame containing column ``y`` and, when ``plot_fit`` is True, a 't' column
        used as the horizontal axis and tooltip
    :param y: name of the column to model
    :param lags_: passed to ``AutoReg`` as ``lags``
    :param seasonal_: passed to ``AutoReg`` as ``seasonal``
    :param period_: passed to ``AutoReg`` as ``period``
    :param plot_fit: whether to build and return the overlay chart
    :param exhibit_no: prefix for the chart title, e.g. 'Figure 1. '
    :returns: ``(fitted results, parameter table as markdown, frame with 'y_hat')`` and, only when
        ``plot_fit`` is True, a fourth element: the layered Altair chart (scatter of the
        observations plus line of the predictions) with a scale-bound interval selection
    """
    data = data.copy()

    fitted = AutoReg(data[y],
                     lags=lags_,
                     trend='t',
                     seasonal=seasonal_,
                     period=period_
                     ).fit()
    model_params_md = fitted.params.to_markdown()
    data.loc[:, 'y_hat'] = fitted.predict()

    # Score only rows where both the observation and the prediction exist;
    # missing values in unrelated columns must not shrink the sample.
    scored = data.dropna(subset=[y, 'y_hat'])
    mae = eval_measures.meanabs(scored[y], scored['y_hat'])

    if not plot_fit:
        return fitted, model_params_md, data

    pan_zoom = alt.selection_interval(bind='scales')
    observed = altair_ts_scatter(data, 't', y, 't')
    # altair_ts_line takes no tooltip field: its fourth positional parameter is
    # the line color, so only the two column names are passed positionally.
    predicted = altair_ts_line(data, 't', 'y_hat',
                               _title=f"{exhibit_no}{lags_} lags, period {period_}: MAE {mae:.3f}")
    return fitted, model_params_md, data, (observed + predicted).add_selection(pan_zoom)

### Optional: Plot raw data

In [None]:
stage0 = altair_ts_scatter(ts, 't', 'y', 't', _title='Figure 0. Random scatter.')

In [None]:
stage0

### Encode a time characteristic, such as day of the week (dow) and boost the signal on certain days (or weeks, etc.).

In [None]:
ts = seq_ordinal(ts)
ts = signal_boost(ts,every_other_week=False)
s1data = ts[0:19].to_markdown(index=False)
print(ts[0:29])

### Option to remove some points

In [None]:
def abscond(data,
            points: int = 10,
           ):
    """
    Randomly drop rows from ``data`` to simulate missing observations.

    Row positions are drawn with replacement from a RandomState seeded with
    1216, so the same positions are chosen on every call for a given length and
    a position may be drawn more than once. The number of rows actually removed
    can therefore be smaller than ``points``; the printed summary reports the
    true count.

    :param data: frame to thin out (not modified; a filtered frame is returned)
    :param points: number of positions to draw
    :returns: ``(filtered frame, array of drawn positions)``; the array may
        contain repeats and is empty when nothing could be drawn
    """
    if points <= 0 or len(data) == 0:
        print(f'0 points removed, leaving {len(data)} observations.')
        return data, np.empty(0, dtype=int)

    rng = np.random.RandomState(1216)
    idx_mask = rng.randint(0, len(data), points)

    # The draws are positions, so filter by position rather than by index
    # label; the two only coincide for a default RangeIndex.
    keep = np.ones(len(data), dtype=bool)
    keep[idx_mask] = False
    n_before = len(data)
    data = data[keep]
    n_removed = n_before - len(data)
    print(f'{n_removed} points randomly removed at positions {idx_mask}, leaving {len(data)} observations.')

    # Inspect that a row was removed (clamp the start so a draw near the
    # beginning does not turn into a negative, wrap-around slice).
    window_start = max(idx_mask[0] - 5, 0)
    print(data.iloc[window_start : idx_mask[0] + 2])
    return data, idx_mask

### MOVE downstream?  Option to reformat the index
When the column is populated with datetimes, `.set_index()` produces a `DatetimeIndex` with `freq=None`.

`.asfreq('d')` conforms the data to a daily schedule by adding any missing days and putting `nan` in associated columns of those rows. `ts.index = ts.index.to_period('D')` is an alternative that will _not_ do so.  We currently opt to have all days present, assuming that any missingness implies no transactions; hence we replace `nan` with zero.  These measures accommodate the seasonality feature of `statsmodels.tsa.ar_model.AutoReg`, which relies on regularly spaced sequences.

In [None]:
# Move downstream
# ts.set_index('t', inplace=True)
# ts = ts.asfreq('d')
# ts.fillna(0, inplace=True)
# print(ts.index)

## Fit autoregressive model

With `seasonal=False`, we obtain an upwardly trending oscillation.

In [None]:
s1model, s1params, _, stage1 = run_autoreg(ts, 'y',
                                           lags_=2,
                                           seasonal_=False,
                                           exhibit_no='Figure 1. '
                                          )
print(s1model.params)
stage1

Enable `seasonal`, and we get the expected level baseline with weekly peaks:

In [None]:
_, s2params, _, stage2 = run_autoreg(ts, 'y',
                                     lags_=4,
                                     seasonal_=True,
                                     period_=7,
                                     exhibit_no='Figure 2. '
                                    )
stage2

If we increase `lags` to 7 to include the weekly effect, we get almost as good a model as with seasonal terms:

In [None]:
_, s3params, _, stage3 = run_autoreg(ts, 'y',
                                     lags_=7,
                                     seasonal_=False,
                                     exhibit_no='Figure 3. '
                                    )
stage3

Look at the coefficients through the `.params` attribute of the fitted model object; e.g., with `lags=2`, `seasonal=True`, and `period=7`:

In [None]:
s4model, s4params, _, stage4 = run_autoreg(ts, 'y',
                                           lags_=2,
                                           seasonal_=True,
                                           period_=7,
                                           exhibit_no='Figure 4. '
                                          )
print(s4model.params)

If we remove a small number of points at random such that there are gaps in the index, the model falls apart (not shown).  Can we make the seasonal regression algorithm aware that observations are made on calendar days?

In [None]:
ts, idx_mask = abscond(ts, points=5)

Before and after applying `.asfreq('d')`:

In [None]:
s5_before_asfreq = ts[280:292].to_markdown(index=False)
ts[280:292]

In [None]:
ts = ts.set_index('t').asfreq('d')

In [None]:
# NOTE(review): 't' was moved into the index by set_index('t') before this
# cell, so index=False leaves the dates out of this table - confirm intended.
s5_after_asfreq = ts[280:292].to_markdown(index=False)
ts[280:292]

Try a `PeriodIndex`, a subclass of `Index` that is regularly spaced. We still have gaps in the time series, but no `nan` have been inserted:

In [None]:
ts2 = artifice()
ts2 = seq_ordinal(ts2)
ts2 = signal_boost(ts2, every_other_week=False)
ts2, idx_mask = abscond(ts2, points=5)
ts2.set_index('t', inplace=True)

In [None]:
ts2.index

In [None]:
ts2.index = pd.DatetimeIndex(ts2.index).to_period('D')

In [None]:
ts2.index

In [None]:
# NOTE(review): the periods live in the index of ts2, so index=False leaves
# them out of this table - confirm intended.
s5_periodIndex = ts2[280:292].to_markdown(index=False)  # or part of sequence with gaps
ts2[280:292]

It is unnecessary to include `missing='drop'` in `AutoReg()`, since the data has no missing values (it is probably good practice to include `missing='raise'` as a check).  `.fit()` runs and `.predict()` returns values without altering the input data structure: it still has a `PeriodIndex`:

In [None]:
_, s5params, ts5, stage5 = run_autoreg(ts2.reset_index(), 'y',
                                       lags_=7,
                                       seasonal_=False,
                                      )

In [None]:
ts5md = ts5[0:9].to_markdown(index=False)
ts5.head(10)

In [None]:
ts2.index = ts2.index.to_timestamp()

In [None]:
_, _, _, stage5 = run_autoreg(ts2.reset_index(), 'y',
                                       lags_=7,
                                       seasonal_=False,
                                       exhibit_no='Figure 5. '
                                      )

In [None]:
stage5

Using `.asfreq` to conform a daily time series of 365 points minus 10 removed at
random to a `DatetimeIndex`-ed dataframe with `freq='D'`, we get a good fit.

In [None]:
# Rebuild the boosted series from scratch, then knock out random days.
ts3 = artifice()
ts3 = seq_ordinal(ts3)
ts3 = signal_boost(ts3, every_other_week=False)
ts3, _ = abscond(ts3)  # default: 10 positions drawn
# asfreq('d') re-inserts the missing days as all-NaN rows; fillna(0) then
# zero-fills every column, so those days get y=0 and also dow=0, wom=0.
ts3 = ts3.set_index('t').asfreq('d')
ts3.fillna(0, inplace=True)
# reset_index() turns 't' back into a column, which run_autoreg plots against.
_, s6params, _, stage6 = run_autoreg(ts3.reset_index(), 'y',
                                     lags_=2,
                                     seasonal_=True,
                                     period_=7,
                                     exhibit_no='Figure 6. '
                                    )

In [None]:
stage6

What if the signal boost occurred _every other_ Friday?

In [None]:
ts4 = artifice()
ts4 = seq_ordinal(ts4)
# NOTE(review): the two signal_boost calls stack - every Friday gets +2 here,
# and Fridays in even-numbered weeks of the month get a further +2 below.
# If only every other Friday should be boosted, drop this first call.
ts4 = signal_boost(ts4, every_other_week=False)
ts4 = signal_boost(ts4,
                  level1_to_boost=4,  # Friday
                  boost=2.,
                  every_other_week=True,
                 )
_, s7params, _, stage7 = run_autoreg(ts4, 'y',
                                     lags_=2,
                                     seasonal_=True,
                                     period_=7,
                                     exhibit_no='Figure 7. '
                                    )

In [None]:
# configure_view is applied to the whole concatenation: a method call binds
# tighter than `&`, so attaching it to the last row alone would place a
# configured chart inside the concatenation.
# Pan/zoom: the charts returned by run_autoreg already carry their own
# scale-bound selection (and that function's `pan_zoom` is a local name, not
# available here), so only stage0 still needs to be made interactive.
compound = (
    (stage0.interactive() | stage1) & (stage2 | stage3) & (stage5 | stage7)
).configure_view(strokeWidth=0)

In [None]:
compound

# Put content into template

In [None]:
# Render the post: read the Jinja template and fill in tables and figures.
p = parent_path/'content/posts/practical_ts01.md'
template = Template(p.read_text())
# NOTE(review): `o` is the same file as `p`, so the rendered text overwrites
# the template and its placeholders are gone after the first run - confirm
# this is intended or point `o` at a separate output file.
o = parent_path/'content/posts/practical_ts01.md'
# Each entry ends with its comma before the trailing comment; with the comma
# inside the comment the dict literal does not parse.
o.write_text(template.render({'alt': alt,
                              'stage0': 'Figure 0 appended.',  # stage0.to_html(),
                              's1data': s1data,
                              'stage1': 'Figure 1 appended.',  # stage1.to_html(),
                              'stage2': 'Figure 2 appended.',  # stage2.to_html(),
                              'stage3': 'Figure 3 appended.',  # stage3.to_html(),
                              's4params': s4params,
                              's5_before_asfreq': s5_before_asfreq,
                              's5_after_asfreq': s5_after_asfreq,
                              's5_periodIndex': s5_periodIndex,
                              'ts5md': ts5md,
                              'stage5': 'Figure 5 appended.',  # stage5.to_html(),
                              's5params': s5params,
                              's6params': s6params,
                              'stage7': compound.to_json(),
                             }
                            )
            )

Troubleshoot jinja, hugo issues

In [None]:
# parent_path

In [None]:
# p = parent_path/'content/posts/aa1chart.md'
# template = Template(p.read_text())
# o = parent_path/'content/posts/aa1chart.md'
# o.write_text(template.render({
#                               'stage0': stage0.to_json(),
# #                               'stage7': stage7.to_html(),
#                              }
#                             )
#             )

In [None]:
# stage0

In [None]:
# Path.cwd()