# 4 | _Greykite, Silver_: Autoregress, Daily, Pre-Covid
* [01 API Data Requests](01_API_pulls.ipynb)
* [02 Initial EDA](02_EDA.ipynb)
* [03 Prophet](03_prophet.ipynb)
* [04 Greykite: Silverkite Fuel](04_greykite.ipynb)
* _[04.1 Greykite: Silverkite Fuel](04_greykite_pre.ipynb)_
---

In [1]:
import pandas as pd
from prophet import Prophet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import defaultdict
import pandas as pd
import plotly

from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  # for plots to render in jupyter notebook

In [3]:
import warnings
# No category or module filter is given, so this hides every warning raised
# from here on, including deprecation notices from the imported libraries.
warnings.filterwarnings("ignore")

In [7]:
def date_index(df):
    """Return a new frame indexed by date, with ``ridership`` renamed to ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ds`` column that ``pd.to_datetime`` can parse. A
        ``ridership`` column, if present, is renamed to ``y``.

    Returns
    -------
    pandas.DataFrame
        New frame whose index is named ``date`` and holds the parsed ``ds``
        values. The ``ds`` column itself is kept as it was.

    Notes
    -----
    The input frame is left untouched (no ``date`` column is written into it),
    so the calling cell can be re-run safely.
    """
    return (
        df.assign(date=pd.to_datetime(df['ds']))
          .set_index('date')
          .rename(columns={'ridership': 'y'})
    )

In [8]:
# Read the processed daily BART ridership CSV and index it by date
filename = 'bart_daily.csv'
file = f'../data/processed/{filename}'
bart_df = date_index(pd.read_csv(file))

bart_df.head()

Unnamed: 0_level_0,y,ds
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-01,124162.0,2011-01-01
2011-01-02,93666.0,2011-01-02
2011-01-03,285891.0,2011-01-03
2011-01-04,322306.0,2011-01-04
2011-01-05,327006.0,2011-01-05


In [9]:
def plot_traces(df, y, title):
    """Render one column of ``df`` against its index as a Plotly line chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; its index supplies the x-axis values.
    y : str
        Name of the column in ``df`` to plot on the y-axis.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is shown with ``iplot`` and 'done' is printed; no trace
        or figure object is returned.
    """
    # Imported locally: the notebook's import cells never bind `go`, so the
    # original call to `go.Scatter` raised NameError.
    import plotly.graph_objects as go

    y_trace = go.Scatter(
        x=df.index,
        y=df[y],
        name=y + 'trace',
        line=dict(color='blue'),
        opacity=0.4,
    )
    fig = dict(data=[y_trace], layout=dict(title=title))
    iplot(fig)
    print('done')

In [26]:
# Pre-Covid training window: all rows up to the end of February 2020.
# `.loc` slices rows by the date index; the time column is renamed by name
# rather than by assigning `df.columns` positionally, so the labels cannot be
# silently swapped if the column order of the CSV ever changes.
df = bart_df.loc['2010-01-01':'2020-02-28'].rename(columns={'ds': 'ts'})
df.head()

Unnamed: 0_level_0,y,ts
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-01,124162.0,2011-01-01
2011-01-02,93666.0,2011-01-02
2011-01-03,285891.0,2011-01-03
2011-01-04,322306.0,2011-01-04
2011-01-05,327006.0,2011-01-05


In [27]:
# Describe the dataset for Greykite: which column of `df` holds the timestamps
# and which holds the observed values.
# `freq` is not passed (an explicit freq='d' was tried and then commented out).
# It accepts any alias valid for `pandas.date_range`, e.g. "H" hourly, "D" daily,
# "W" weekly.
# NOTE(review): the original author reported errors with 'w' / 'm' and a possible
# upper/lower-case sensitivity; cause not established — confirm before setting freq.
metadata = MetadataParam(time_col='ts', value_col='y')

In [28]:
 # Fit the Silverkite template on `df`. The result is returned and is also
 # stored on the forecaster as `forecaster.forecast_result`.
 forecast_config = ForecastConfig(
     model_template=ModelTemplateEnum.SILVERKITE.name,
     forecast_horizon=10,   # forecasts 10 steps ahead
     coverage=0.95,         # 95% prediction intervals
     metadata_param=metadata,
 )
 forecaster = Forecaster()
 result = forecaster.run_forecast_config(df=df, config=forecast_config)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [29]:
# Plot the time series object held on the forecast result.
ts = result.timeseries
fig = ts.plot()
plotly.io.show(fig)

### GREYKITE EVALUATION
* Greykite creates a holdout (test) set by default
* Cross-validation is run on the remaining (non-holdout) data

In [30]:
 # Summarise the cross-validation scores for each parameter set in the grid search.
 grid_search = result.grid_search
 report_columns = ["rank", "mean_test", "split_test", "mean_train", "split_train", "mean_fit_time", "mean_score_time", "params"]
 cv_results = summarize_grid_search_results(
     grid_search=grid_search,
     decimals=2,
     # The two arguments below collapse the printed output: remove or comment
     # them out to show all available metrics and columns.
     cv_report_metrics=None,
     column_order=report_columns,
 )
 # Stringify the params so they can serve as the index, then transpose to
 # save space in the printed output.
 cv_results = (
     cv_results
     .assign(params=cv_results["params"].astype(str))
     .set_index("params", drop=True)
 )
 cv_results.transpose()

params,[]
rank_test_MAPE,1
mean_test_MAPE,18.59
split_test_MAPE,"(20.2, 13.04, 22.52)"
mean_train_MAPE,14.29
split_train_MAPE,"(6.38, 18.26, 18.22)"
mean_fit_time,10.82
mean_score_time,0.95


In [31]:
 # Backtest: plot the historical forecast on the holdout test set.
 # The figure is interactive, so you can zoom in on any period to see how the model performed.
 backtest = result.backtest
 fig = backtest.plot()
 plotly.io.show(fig)

In [18]:
 # Tabulate the backtest evaluation metrics: one row per metric, with the
 # training-period value next to the held-out test-period value.
 backtest_eval = defaultdict(list)
 for metric, train_value in backtest.train_evaluation.items():
     backtest_eval[metric].extend([train_value, backtest.test_evaluation[metric]])
 metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T
 metrics

Unnamed: 0,train,test
CORR,0.619283,0.793099
R2,0.30067,-13.656625
MSE,15823927549.061258,17931527301.264828
RMSE,125793.193572,133908.652824
MAE,111566.374776,122878.104656
MedAE,112500.866252,144273.983688
MAPE,115.360438,125.299114
MedAPE,30.452578,105.42045
sMAPE,24.694105,35.744095
Q80,55908.223653,24575.620931


| ID | MODEL | DATA | RMSE | MSE | MAE | CV | MAPE | MASE | AIC |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| A | PROPHET | < 2020 | 446 152 | 199 052 198 567 | 375 686 | | | | |
| B | PROPHET | All BART | 1 243 269 | 5 457 200 928 927 | 1 181 450 | | | | |
| C | Greykite | All BART | 911 443 | 830 729 769 011 | 909 234 | | 205 | | |
| D | Greykite | < 2020 | 1 053 866 | 110 633 739 830 | 1 051 141 | | 5.25 | | |

In [32]:
# Plot the forecast object stored on the result.
forecast = result.forecast
fig = forecast.plot()
plotly.io.show(fig)

In [20]:
# The forecasted values are available in `forecast.df`; show the first rows rounded to 2 decimals.

forecast.df.head().round(2)

Unnamed: 0,ts,actual,forecast,forecast_lower,forecast_upper
0,2011-01-01,124162.0,314965.65,221262.02,408669.28
1,2011-01-02,93666.0,314020.66,222018.28,406023.03
2,2011-01-03,285891.0,317715.8,61248.95,574182.65
3,2011-01-04,322306.0,318165.03,53772.13,582557.92
4,2011-01-05,327006.0,319212.74,55686.26,582739.23


#### Model Diagnostics

The component plot shows how your dataset’s trend, seasonality, and event / holiday patterns are handled in the model:

In [21]:
 # Build the component plot for the fitted forecast and display it.
 fig = forecast.plot_components()
 plotly.io.show(fig)     # use fig.show() instead if you are using the "PROPHET" template

> Model summary allows inspection of individual model terms. Check parameter estimates and their significance for insights on how the model works and what can be further improved.

In [22]:
 # Print the summary of the final pipeline step (the fitted estimator).
 summary = result.model[-1].summary()  # -1 retrieves the estimator from the pipeline
 print(summary)


Number of observations: 4169,   Number of features: 117
Method: Ridge regression
Number of nonzero features: 117
Regularization parameter: 1.000e+05

Residuals:
         Min           1Q       Median           3Q          Max
  -3.053e+05   -1.008e+05    4.081e+04    1.167e+05    2.524e+05

            Pred_col  Estimate Std. Err Pr(>)_boot sig. code                  95%CI
           Intercept 3.191e+05   1870.0     <2e-16       *** (3.157e+05, 3.226e+05)
 events_C...New Year   -0.3036    6.644      0.962                  (-13.73, 11.89)
 events_C...w Year-1   -0.7097     7.18      0.912                  (-15.79, 12.62)
 events_C...w Year-2     2.792    7.427      0.718                  (-11.16, 17.26)
 events_C...w Year+1    -1.037    6.798      0.876                  (-15.43, 12.18)
 events_C...w Year+2     4.136    7.407      0.564                  (-10.89, 17.77)
events_Christmas Day    -23.92    7.287     <2e-16       ***       (-39.05, -11.28)
 events_C...as Day-1    -14.22    4

#### Apply the model

The trained model is available as a fitted `sklearn.pipeline.Pipeline`

In [23]:
 # Keep a handle on the fitted pipeline and display its repr.
 model = result.model
 model

Pipeline(steps=[('input',
                 PandasFeatureUnion(transformer_list=[('date',
                                                       Pipeline(steps=[('select_date',
                                                                        ColumnSelector(column_names=['ts']))])),
                                                      ('response',
                                                       Pipeline(steps=[('select_val',
                                                                        ColumnSelector(column_names=['y'])),
                                                                       ('outlier',
                                                                        ZscoreOutlierTransformer()),
                                                                       ('null',
                                                                        NullTransformer(impute_algorithm='interpolate',
                                                                 

In [24]:
 # Build a frame of 4 future periods; include_history=False leaves out the historical rows.
 future_df = result.timeseries.make_future_dataframe(
     periods=4,
     include_history=False)
 future_df

Unnamed: 0,ts,y
2022-06-01,2022-06-01,
2022-06-02,2022-06-02,
2022-06-03,2022-06-03,
2022-06-04,2022-06-04,


> Call .predict() to compute predictions

In [25]:
 # Predict for the future periods built in `future_df`.
 model.predict(future_df)

Unnamed: 0,ts,forecast,forecast_lower,forecast_upper,y_quantile_summary,err_std
0,2022-06-01,281160.893745,17634.40994,544687.377551,"(17634.409939811914, 544687.3775509914)",134454.758294
1,2022-06-02,287164.339701,15280.815673,559047.863729,"(15280.815673142613, 559047.8637288993)",138718.632675
2,2022-06-03,270578.404905,21773.670767,519383.139044,"(21773.6707665695, 519383.1390440488)",126943.523504
3,2022-06-04,127706.627153,34003.000372,221410.253934,"(34003.000372431285, 221410.25393375003)",47808.851346
