# 4 | _Greykite, Silver_: Autoregress
* [01 API Data Requests](01_API_pulls.ipynb)
* [02 Initial EDA](02_EDA.ipynb)
* [03 Prophet](03_prophet.ipynb)
* _[04 Greykite: Silverkite Fuel](04_greykite.ipynb)_
---
  

In [1]:
import pandas as pd
from prophet import Prophet

In [3]:
from collections import defaultdict
import pandas as pd
import plotly

from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  # for plots to render in jupyter notebook

In [4]:
import warnings
# NOTE(review): no category/module/message filter is given, so this hides every
# warning for the rest of the session; consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

In [5]:
def date_index(df):
    """Return a copy of ``df`` indexed by its parsed ``ds`` timestamps.

    The ``ds`` column is parsed with ``pd.to_datetime`` and used as an index
    named ``date``; ``ds`` itself is kept as a regular column. The caller's
    frame is left untouched, so re-running the calling cell is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``ds`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df`` and a ``date`` index.

    Raises
    ------
    KeyError
        If ``df`` has no ``ds`` column.
    """
    # assign() builds a new frame, so no 'date' column leaks into the input.
    return df.assign(date=pd.to_datetime(df['ds'])).set_index('date')

In [8]:
# Read the processed BART file, keep only the ridership and 'ds' columns,
# and index the result by date via date_index().
file = '../data/processed/bart.csv'
bart_df = date_index(pd.read_csv(file)[['ridership', 'ds']])

In [9]:
def plot_traces(df, y, title):
    """Render one column of ``df`` as a line chart against the frame's index.

    The figure is displayed with ``iplot`` and 'done' is printed afterwards;
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the x values.
    y : str
        Name of the column to plot; also used to build the trace name.
    title : str
        Plot title.

    Returns
    -------
    None
    """
    # `go` is not imported at notebook level, so bring it into scope here;
    # without this the function raises NameError on its first line.
    import plotly.graph_objects as go

    y_trace = go.Scatter(
        x=df.index,
        y=df[y],
        name=y + 'trace',
        line=dict(color='blue'),
        opacity=0.4,
    )

    fig = dict(data=[y_trace], layout=dict(title=title))
    iplot(fig)
    print('done')

In [10]:
# Keep the 2010-01-01 .. 2022-01-01 window and relabel the two columns
# positionally to 'y' and 'ts', the names the forecast metadata refers to.
df = bart_df.loc['2010-01-01':'2022-01-01'].set_axis(['y', 'ts'], axis=1)

In [11]:
# Dataset description for greykite: which columns of `df` hold the timestamps
# ('ts') and the observed values ('y').
# `freq` is intentionally not passed: an earlier attempt with weekly/monthly aliases
# ('w', 'm') errored for reasons not yet understood. Presumably greykite infers the
# frequency when it is omitted — TODO confirm.
metadata = MetadataParam(time_col='ts', value_col='y')

In [12]:
 forecaster = Forecaster()  # Creates forecasts and stores the result
 result = forecaster.run_forecast_config(  # result is also stored as `forecaster.forecast_result`.
     df=df,
     config=ForecastConfig(
         model_template=ModelTemplateEnum.SILVERKITE.name,
         forecast_horizon=10,  # forecasts 10 periods ahead, at the frequency of `df`
         coverage=0.95,         # 95% prediction intervals
         metadata_param=metadata
     )
 )

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [13]:
# Plot the time series object stored on the forecast result.
ts = result.timeseries
fig = ts.plot()
plotly.io.show(fig)

### GREYKITE EVALUATION 
* Greykite holds out a test set by default (the backtest below is evaluated on it) 
* cross-validation is run on the remaining training data 

In [14]:
 # Summarise the cross-validation run attached to the forecast result.
 grid_search = result.grid_search
 cv_results = (
     summarize_grid_search_results(
         grid_search=grid_search,
         decimals=2,
         # The two arguments below trim the printed output: remove/comment them out
         # to show all available metrics and columns.
         cv_report_metrics=None,
         column_order=[
             "rank", "mean_test", "split_test", "mean_train", "split_train",
             "mean_fit_time", "mean_score_time", "params",
         ],
     )
     # Stringify the params so they can be used as the index label.
     .assign(params=lambda frame: frame["params"].astype(str))
     .set_index("params", drop=True)
 )
 # Transposed only to save space in the printed output.
 cv_results.transpose()

params,[]
rank_test_MAPE,1
mean_test_MAPE,385.09
split_test_MAPE,"(4.06, 364.78, 786.42)"
mean_train_MAPE,13.97
split_train_MAPE,"(2.22, 6.63, 33.07)"
mean_fit_time,5.0
mean_score_time,0.82


In [15]:
 # Backtest: plot the historical forecast on the holdout test set. You can zoom in to see how it performed in any given period.
 # `backtest` is reused by the metrics cell that follows.
 backtest = result.backtest
 fig = backtest.plot()
 plotly.io.show(fig)

In [16]:
 # Historical evaluation metrics: pair every training metric with its
 # test-set counterpart as {metric: [train, test]}.
 backtest_eval = defaultdict(
     list,
     {
         name: [train_score, backtest.test_evaluation[name]]
         for name, train_score in backtest.train_evaluation.items()
     },
 )
 # One row per metric, one column per evaluation set.
 metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T
 metrics

Unnamed: 0,train,test
CORR,0.38036,-0.70522
R2,0.004025,-272.138765
MSE,178559700089.11884,1148183980536.4492
RMSE,422563.249809,1071533.471496
MAE,266124.335232,1069562.915839
MedAE,173394.878165,1062694.418533
MAPE,84.00904,341.79134
MedAPE,11.301793,314.97873
sMAPE,12.33733,61.959929
Q80,133062.167616,213912.583168


ID      | MODEL   | DATA      | RMSE        | MSE       | MAE       | CV        | MAPE      | MASE      | AIC 
---     | ---     | ---        | ---       | ---       | ---       | ---       | ---       | ---       | ---  
A       | PROPHET | < 2020   | 446 152   | 199 052 198 567| 375 686   |          
B       | PROPHET | All BART   | 1 243 269   | 1 545 720 092 893| 1 181 450   |          
C       | Greykite| All BART    | 911 443 | 830 729 769 011   |909 234 | | 205
D       | Greykite| < 2020    | 1 053 866| 1 110 633 739 830  |1 051 141| | 5.25

In [17]:
# Plot the forecast stored on the result; `forecast` is reused by the cells below.
forecast = result.forecast
fig = forecast.plot()
plotly.io.show(fig)

In [18]:
# The forecasted values are available in `forecast.df`; show the first rows, rounded to 2 decimals.

forecast.df.head().round(2)

Unnamed: 0,ts,actual,forecast,forecast_lower,forecast_upper
0,2010-01-01,1329472.0,1399308.36,471956.07,2326660.65
1,2010-02-01,1318752.0,1398397.81,471045.52,2325750.1
2,2010-03-01,1414724.0,1397142.79,469790.5,2324495.08
3,2010-04-01,1433632.0,1396389.08,469036.79,2323741.37
4,2010-05-01,1381416.0,1395395.45,468043.16,2322747.74


#### Model Diagnostics

The component plot shows how your dataset’s trend, seasonality, and event / holiday patterns are handled in the model:

In [19]:
 # Build and display the component plot for the forecast.
 fig = forecast.plot_components()
 plotly.io.show(fig)     # fig.show() if you are using "PROPHET" template

> Model summary allows inspection of individual model terms. Check parameter estimates and their significance for insights on how the model works and what can be further improved.

In [20]:
 # Print the summary of the last step of the fitted pipeline.
 summary = result.model[-1].summary()  # -1 retrieves the estimator from the pipeline
 print(summary)


Number of observations: 145,   Number of features: 52
Method: Ridge regression
Number of nonzero features: 21
Regularization parameter: 9770.0

Residuals:
         Min           1Q       Median           3Q          Max
  -1.193e+06   -4.522e+04    1.360e+05    3.172e+05    4.619e+05

            Pred_col   Estimate  Std. Err Pr(>)_boot sig. code                  95%CI
           Intercept  1.399e+06 2.810e+04     <2e-16       *** (1.340e+06, 1.449e+06)
 events_C...New Year      25.15     25.38      0.158                      (0., 88.36)
 events_C...w Year-1      25.15     25.38      0.158                      (0., 88.36)
 events_C...w Year-2     -3.242     5.279      0.306                  (-17.15, 3.892)
 events_C...w Year+1      25.15     25.38      0.158                      (0., 88.36)
 events_C...w Year+2      39.99     38.16      0.164                      (0., 132.7)
events_Christmas Day         0.        0.      1.000                         (0., 0.)
 events_C...as Day-1     

#### Apply the model

The trained model is available as a fitted `sklearn.pipeline.Pipeline`

In [21]:
 # Keep a handle on the trained model; the bare name displays it as the cell output.
 model = result.model
 model

In [22]:
 # Build a frame covering the 4 periods after the end of the data (history excluded)
 # to feed to the model for prediction.
 future_df = result.timeseries.make_future_dataframe(
     periods=4,
     include_history=False)
 future_df

Unnamed: 0,ts,y
2022-02-01,2022-02-01,
2022-03-01,2022-03-01,
2022-04-01,2022-04-01,
2022-05-01,2022-05-01,


> Call .predict() to compute predictions

In [23]:
 # Predict for the future frame; the result is displayed as the cell output.
 model.predict(future_df)

Unnamed: 0,ts,forecast,forecast_lower,forecast_upper,y_quantile_summary,err_std
0,2022-02-01,1262411.0,335058.775494,2189763.0,"(335058.7754936146, 2189763.3547908794)",473147.617489
1,2022-03-01,1261494.0,334142.094707,2188847.0,"(334142.09470715816, 2188846.6740044234)",473147.617489
2,2022-04-01,1260532.0,333179.298381,2187884.0,"(333179.29838129086, 2187883.877678556)",473147.617489
3,2022-05-01,1259444.0,332091.679331,2186796.0,"(332091.67933075526, 2186796.25862802)",473147.617489
