# 4 | _Greykite, Silver_: Autoregress
* [01 API Data Requests](01_API_pulls.ipynb)
* [02 Initial EDA](02_EDA.ipynb)
* [03 Prophet](03_prophet.ipynb)
* _[04 Greykite: Silverkite Fuel](04_greykite.ipynb)_
---
  

In [120]:
import pandas as pd
from prophet import Prophet

In [121]:
from collections import defaultdict
import pandas as pd
import plotly

from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

In [122]:
import warnings
# Suppress all warnings for the remainder of the kernel session: no category,
# module or message filter is given, so nothing is let through.
warnings.filterwarnings("ignore")

In [123]:
# function to ensure date-time is proper format and index
def date_index(df):
    """Return a copy of ``df`` indexed by the parsed ``ds`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``ds`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame whose index is named ``date`` and holds the parsed
        timestamps. The ``ds`` column itself is kept unchanged, and the
        input frame is not modified.
    """
    # `assign` builds a new frame, so the caller's object is never written to.
    # The previous in-place column assignment added a `date` column to the
    # argument and could trigger SettingWithCopyWarning on a selection.
    return df.assign(date=pd.to_datetime(df['ds'])).set_index('date')

In [124]:
# Read the BART csv, keep only the `ridership` and `ds` columns, and index the
# frame by the parsed `ds` timestamps.
file = '../data/processed/bart.csv'
bart_df = date_index(pd.read_csv(file)[['ridership', 'ds']])

In [128]:
def plot_traces(df, y, title):
    """Display one column of ``df`` against its index as an interactive line plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the x-axis values.
    y : str
        Name of the column in ``df`` to draw.
    title : str
        Title shown above the plot.

    Returns
    -------
    None
        The figure is shown as a side effect and ``done`` is printed.
    """
    # Imported locally: the notebook's import cells only bring in the top-level
    # `plotly` package, so the `go` / `iplot` names this function used before
    # were never defined and calling it raised NameError.
    import plotly.graph_objects as go
    import plotly.io as pio

    y_trace = go.Scatter(
        x=df.index,
        y=df[y],
        name=y + 'trace',
        line=dict(color='blue'),
        opacity=0.4,
    )
    fig = go.Figure(data=[y_trace], layout=dict(title=title))
    # Same display call the rest of the notebook uses for its figures.
    pio.show(fig)
    print('done')

In [160]:
# Restrict the series to 2010-01-01 .. 2022-01-01 and relabel the columns by
# name rather than by position, so a change in column order cannot silently
# swap the value (`y`) and time (`ts`) columns.
df = (
    bart_df
    .loc['2010-01-01':'2022-01-01']
    .rename(columns={'ridership': 'y', 'ds': 'ts'})
)

In [162]:
# Describe the input frame to Greykite: `ts` is the time column and `y` is the
# value column to forecast.
# `freq` is intentionally left unset here.
# NOTE(review): the timestamps printed later in this notebook fall on the first
# day of each month, which would correspond to the pandas "MS" (month start)
# alias rather than "M" (month end) -- confirm before setting it explicitly.
metadata = MetadataParam(time_col='ts', value_col='y')

In [163]:
 # Fit the SILVERKITE template and forecast 10 steps past the end of `df`,
 # with 95% prediction intervals.
 silverkite_config = ForecastConfig(
     metadata_param=metadata,
     model_template=ModelTemplateEnum.SILVERKITE.name,
     forecast_horizon=10,
     coverage=0.95,
 )
 forecaster = Forecaster()
 # The returned result is also stored as `forecaster.forecast_result`.
 result = forecaster.run_forecast_config(df=df, config=silverkite_config)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [164]:
# Plot the time series object stored on the forecast result.
ts = result.timeseries
fig = ts.plot()
plotly.io.show(fig)

### GREYKITE EVALUATION 
* creates a holdout (test) set by default 
* cross-validation is run on the remaining (training) data 

In [165]:
 # Summarise the cross-validation grid search into a table.
 grid_search = result.grid_search
 cv_results = summarize_grid_search_results(
     grid_search=grid_search,
     decimals=2,
     # The two arguments below collapse the printed output; remove them to show
     # all available metrics and columns.
     cv_report_metrics=None,
     column_order=[
         "rank", "mean_test", "split_test", "mean_train", "split_train",
         "mean_fit_time", "mean_score_time", "params",
     ],
 )
 # Index the table by the stringified params, then transpose to save space in
 # the printed output.
 cv_results = (
     cv_results
     .assign(params=cv_results["params"].astype(str))
     .set_index("params", drop=True)
 )
 cv_results.transpose()

params,[]
rank_test_MAPE,1
mean_test_MAPE,335.17
split_test_MAPE,"(4.06, 322.77, 678.66)"
mean_train_MAPE,11.93
split_train_MAPE,"(2.22, 4.32, 29.24)"
mean_fit_time,7.96
mean_score_time,1.66


In [166]:
 # Backtest: plot the historical forecast against the holdout test set.
 # The figure is interactive, so any period can be zoomed into.
 backtest = result.backtest
 fig = backtest.plot()
 plotly.io.show(fig)

In [167]:
 # Gather the backtest metrics as {metric: [train value, test value]} and show
 # them as a table with one row per metric and a train / test column each.
 backtest_eval = defaultdict(
     list,
     {
         metric: [train_value, backtest.test_evaluation[metric]]
         for metric, train_value in backtest.train_evaluation.items()
     },
 )
 metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T
 metrics

Unnamed: 0,train,test
CORR,0.281402,-0.688016
R2,0.002199,-193.110463
MSE,177889550589.81827,1110633739830.198
RMSE,421769.546779,1053866.091982
MAE,267175.903651,1051141.75962
MedAE,179241.4016,1043104.803638
MAPE,73.777434,288.631924
MedAPE,11.891825,265.044382
sMAPE,12.011315,57.880942
Q80,133587.951825,210228.351924


ID      | MODEL   | DATA      | RMSE        | MSE       | MAE       | CV        | MAPE      | MASE      | AIC 
---     | ---     | ---        | ---       | ---       | ---       | ---       | ---       | ---       | ---  
A       | PROPHET | < 2020   | 446 152   | 199 052 198 567| 375 686   |          
B       | PROPHET | All BART   | 1 243 269   | 5 457 200 928 927| 1 181 450   |          
C       | Greykite| All BART    | 911 443 | 830 729 769 011   |909 234 | | 205
D       | Greykite| < 2020    | 1 053 866| 1 110 633 739 830  |1 051 141| | 5.25

In [168]:
# Plot the forecast object stored on the result.
forecast = result.forecast
fig = forecast.plot()
plotly.io.show(fig)

In [169]:
# The forecasted values are available in `forecast.df` (not the input `df`);
# show the first rows rounded to two decimals.

forecast.df.head().round(2)

Unnamed: 0,ts,actual,forecast,forecast_lower,forecast_upper
0,2010-01-01,1329472.0,1374892.67,419276.51,2330508.83
1,2010-02-01,1318752.0,1374805.75,419189.59,2330421.91
2,2010-03-01,1414724.0,1374682.56,419066.4,2330298.72
3,2010-04-01,1433632.0,1374610.43,418994.27,2330226.59
4,2010-05-01,1381416.0,1374511.46,418895.29,2330127.62


#### Model Diagnostics

The component plot shows how your dataset’s trend, seasonality, and event / holiday patterns are handled in the model:

In [170]:
 # Component plot of the fitted forecast.
 fig = forecast.plot_components()
 plotly.io.show(fig)     # fig.show() if you are using "PROPHET" template

> Model summary allows inspection of individual model terms. Check parameter estimates and their significance for insights on how the model works and what can be further improved.

In [171]:
 # Print the estimator's summary.
 summary = result.model[-1].summary()  # -1 retrieves the estimator from the pipeline
 print(summary)


Number of observations: 145,   Number of features: 52
Method: Ridge regression
Number of nonzero features: 21
Regularization parameter: 1.000e+05

Residuals:
         Min           1Q       Median           3Q          Max
  -1.258e+06   -1.833e+04    1.656e+05    3.136e+05    4.645e+05

            Pred_col  Estimate  Std. Err Pr(>)_boot sig. code                  95%CI
           Intercept 1.375e+06 3.913e+04     <2e-16       *** (1.297e+06, 1.452e+06)
 events_C...New Year     2.284     2.315      0.170                      (0., 7.885)
 events_C...w Year-1     2.284     2.315      0.170                      (0., 7.885)
 events_C...w Year-2   -0.1833    0.5922      0.812                 (-1.602, 0.7487)
 events_C...w Year+1     2.284     2.315      0.170                      (0., 7.885)
 events_C...w Year+2     3.426      3.51      0.546                      (0., 11.89)
events_Christmas Day        0.        0.      1.000                         (0., 0.)
 events_C...as Day-1        0.

#### Apply the model

The trained model is available as a fitted `sklearn.pipeline.Pipeline`

In [172]:
 # Keep a handle on the fitted pipeline; the bare name on the last line makes
 # the notebook display its repr.
 model = result.model
 model

Pipeline(steps=[('input',
                 PandasFeatureUnion(transformer_list=[('date',
                                                       Pipeline(steps=[('select_date',
                                                                        ColumnSelector(column_names=['ts']))])),
                                                      ('response',
                                                       Pipeline(steps=[('select_val',
                                                                        ColumnSelector(column_names=['y'])),
                                                                       ('outlier',
                                                                        ZscoreOutlierTransformer()),
                                                                       ('null',
                                                                        NullTransformer(impute_algorithm='interpolate',
                                                                 

In [173]:
 # Build a frame for 4 future periods; `include_history=False` requests only
 # the future rows, without the historical ones.
 future_df = result.timeseries.make_future_dataframe(
     periods=4,
     include_history=False)
 future_df

Unnamed: 0,ts,y
2022-02-01,2022-02-01,
2022-03-01,2022-03-01,
2022-04-01,2022-04-01,
2022-05-01,2022-05-01,


> Call .predict() to compute predictions

In [174]:
 # Run the fitted pipeline on the future frame; the returned frame is displayed
 # as the cell output.
 model.predict(future_df)

Unnamed: 0,ts,forecast,forecast_lower,forecast_upper,y_quantile_summary,err_std
0,2022-02-01,1361715.0,406098.358861,2317331.0,"(406098.35886123264, 2317330.679128503)",487568.224555
1,2022-03-01,1361626.0,406010.257891,2317243.0,"(406010.2578910806, 2317242.5781583507)",487568.224555
2,2022-04-01,1361534.0,405917.572031,2317150.0,"(405917.57203099015, 2317149.8922982607)",487568.224555
3,2022-05-01,1361424.0,405807.44119,2317040.0,"(405807.44118990866, 2317039.7614571787)",487568.224555
