In [None]:
# Notebook parameters: plain string constants referenced by the cells below.
# GCP project passed to the BigQuery read and write helpers.
project_id = 'elife-data-pipeline'
# Dataset name substituted into the source SQL template.
source_dataset = 'de_dev'
# Dataset that receives the forecast table.
output_dataset = 'de_dev'
# The output table id is built as '<output_dataset>.<output_table_prefix><output_table_name>'.
output_table_prefix = 'data_science_'
output_table_name = 'Forecast_Initial_Submission'
# SQL template loaded via get_sql; its result is asserted to have exactly two columns
# (a datetime column followed by an integer column).
sql_filename = 'initial_submission_count_by_date.sql'

In [None]:
from functools import partial

import pandas as pd

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import, wrong-import-order

from fbprophet import Prophet

from IPython.display import display

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)
from data_science_pipeline.utils.timeseries import (
    to_date_isoformat,
    to_timestamp_isoformat,
    get_month_start_date,
    get_quarter_start_date,
    get_year_start_date,
    get_quarter_week_date,
    filter_by_month,
    filter_by_quarter,
    filter_by_year
)

import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import, ungrouped-imports

In [None]:
# Destination table id in the form '<dataset>.<prefix><name>'.
_TABLE_NAME_TEMPLATE = '{output_dataset}.{prefix}{suffix}'
forecast_output_table_name = _TABLE_NAME_TEMPLATE.format(
    suffix=output_table_name,
    prefix=output_table_prefix,
    output_dataset=output_dataset
)

In [None]:
# Bind the project id once so later read_big_query calls only need to pass the query.
read_big_query = partial(_read_big_query, project_id=project_id)

In [None]:
# Placeholder values substituted into the SQL template via str.format.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [None]:
# Render the SQL template with the project/dataset placeholders, then run the query.
_timeseries_sql = get_sql(sql_filename).format(**default_query_props)
raw_timeseries_df = read_big_query(_timeseries_sql)
raw_timeseries_df.head()

In [None]:
# Show the column dtypes; the assertions that follow check them.
raw_timeseries_df.dtypes

In [None]:
# Exactly two columns are expected; they are later renamed positionally to ['ds', 'y'].
assert len(raw_timeseries_df.columns) == 2

In [None]:
# The first column (later renamed to 'ds') must be a datetime64 column.
# .iloc is used for positional access because integer keys passed to [] on a
# label-indexed Series are deprecated in recent pandas releases.
assert pd.api.types.is_datetime64_dtype(raw_timeseries_df.dtypes.iloc[0]), (
    'expected first column to be datetime64, got: %s' % raw_timeseries_df.dtypes.iloc[0]
)

In [None]:
# The second column (later renamed to 'y') must hold integer counts.
# .iloc is used for positional access because integer keys passed to [] on a
# label-indexed Series are deprecated in recent pandas releases.
assert pd.api.types.is_integer_dtype(raw_timeseries_df.dtypes.iloc[1]), (
    'expected second column to be an integer dtype, got: %s' % raw_timeseries_df.dtypes.iloc[1]
)

In [None]:
# Work on a copy so raw_timeseries_df keeps its original column names.
timeseries_df = raw_timeseries_df.copy()
# Positional rename to 'ds' (date) and 'y' (value), the column names Prophet's fit() expects.
timeseries_df.columns = ['ds', 'y']
timeseries_df.head()

In [None]:
# Fit a Prophet model on the historical series, with daily seasonality explicitly enabled.
model = Prophet(daily_seasonality=True)
model.fit(timeseries_df)
# End the cell on None so the notebook does not render the return value of fit().
None

In [None]:
# Build a future-only frame (history excluded) spanning 365 + 31 periods, then predict on it.
_future_df = model.make_future_dataframe(periods=365 + 31, include_history=False)
forecast_df = model.predict(_future_df)

In [None]:
# Preview the first 30 forecast rows: the point estimate (yhat) with its lower/upper bounds.
forecast_df[['ds', 'yhat_lower', 'yhat', 'yhat_upper']].head(30)

In [None]:
# Anchor dates for the forecast: the first predicted date, plus the month / quarter /
# year start dates derived from it by the get_*_start_date helpers.
prediction_start_date = forecast_df['ds'][0] + pd.offsets.DateOffset(days=0)
prediction_month_start_date = get_month_start_date(prediction_start_date)
prediction_quarter_start_date = get_quarter_start_date(prediction_start_date)
prediction_year_start_date = get_year_start_date(prediction_start_date)

# Report the derived dates in the same order as they were computed.
print('prediction_start_date:', prediction_start_date)
print('prediction_month_start_date:', prediction_month_start_date)
print('prediction_quarter_start_date:', prediction_quarter_start_date)
print('prediction_year_start_date:', prediction_year_start_date)

In [None]:
# Sanity check: row count and first/last dates of the forecast rows selected by
# filter_by_month for the prediction month.
_df = filter_by_month(forecast_df, prediction_month_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

In [None]:
# Sanity check: row count and first/last dates of the forecast rows selected by
# filter_by_quarter for the prediction quarter.
_df = filter_by_quarter(forecast_df, prediction_quarter_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

In [None]:
# Sanity check: row count and first/last dates of the forecast rows selected by
# filter_by_year for the prediction year.
_df = filter_by_year(forecast_df, prediction_year_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

In [None]:
def _monthly_totals(month_date):
    """Return the summed actual and forecast values selected by filter_by_month for month_date."""
    return {
        'ds': month_date,
        'y_actual': filter_by_month(timeseries_df, month_date)['y'].sum(),
        'y_forecast': filter_by_month(forecast_df, month_date)['yhat'].sum()
    }


# Month offsets -12..+12 relative to the prediction month (25 entries).
_month_dates = [
    prediction_month_start_date + pd.offsets.MonthBegin(offset)
    for offset in range(-12, 13)
]
predicted_month_count_incl_history_df = pd.DataFrame(
    [_monthly_totals(month_start) for month_start in _month_dates]
)
# Combined total per month: recorded actuals plus forecast values.
predicted_month_count_incl_history_df['y'] = (
    predicted_month_count_incl_history_df['y_actual']
    + predicted_month_count_incl_history_df['y_forecast']
)
predicted_month_count_incl_history_df.head(25)

In [None]:
def add_initial(ser, initial_value):
    """Return a copy of the series values with initial_value added to the first element.

    Args:
        ser: pandas Series whose values are copied; the series itself is not modified.
        initial_value: amount added to the first element only. The result is stored
            in the array's existing dtype.

    Returns:
        A new array of the same length as ser. An empty series yields an empty
        array instead of raising IndexError.
    """
    _values = ser.values.copy()
    # Guard against empty input: there is no first element to offset.
    if len(_values) > 0:
        _values[0] += initial_value
    return _values


# Select the forecast columns and give them the output-table column names.
forecast_result_df = (
    forecast_df[['ds', 'yhat_lower', 'yhat', 'yhat_upper']]
    .rename(columns={
        'ds': 'Forecast_Date',
        'yhat': 'Forecast_Value',
        'yhat_lower': 'Forecast_Lower_Bound_Value',
        'yhat_upper': 'Forecast_Upper_Bound_Value'
    })
    .copy()
)

# Round the point forecast and its bounds to whole numbers and store them as integers.
for column_name in ['Forecast_Value', 'Forecast_Lower_Bound_Value', 'Forecast_Upper_Bound_Value']:
    forecast_result_df[column_name] = forecast_result_df[column_name].round().astype(int)

# Offset columns: the forecast values with the actuals already recorded for the
# prediction month / quarter / year added to the first forecast row only.
# Each offset column is assigned exactly once. A former second assignment to
# 'Forecast_Month_Offset_Value' that used filter_by_quarter with the month start
# date overwrote the month value with a quarter-based total and has been removed.
forecast_result_df['Forecast_Month_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_month(timeseries_df, prediction_month_start_date)['y'].sum()
)
forecast_result_df['Forecast_Quarter_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_quarter(timeseries_df, prediction_quarter_start_date)['y'].sum()
)
forecast_result_df['Forecast_Year_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_year(timeseries_df, prediction_year_start_date)['y'].sum()
)
# Convert the forecast date with to_date_isoformat; 'Date' mirrors it and is the
# key used when merging with the actual values.
forecast_result_df['Forecast_Date'] = forecast_result_df['Forecast_Date'].apply(to_date_isoformat)
forecast_result_df['Date'] = forecast_result_df['Forecast_Date']
forecast_result_df['Model'] = 'fbprophet'
print(len(forecast_result_df))
forecast_result_df.head()

In [None]:
# Actual values keyed by the same 'Date' representation as the forecast rows.
actual_result_df = (
    timeseries_df
    .rename(columns={'ds': 'Date', 'y': 'Actual_Value'})
    .assign(Date=lambda df: df['Date'].apply(to_date_isoformat))
)

# Outer merge keeps history-only and forecast-only dates alike.
forecast_result_with_actual_df = forecast_result_df.merge(
    actual_result_df,
    on='Date',
    how='outer'
).sort_values('Date')

# Prefer the actual value where one exists, otherwise fall back to the forecast.
forecast_result_with_actual_df['Actual_Or_Forecast_Value'] = (
    forecast_result_with_actual_df['Actual_Value'].combine_first(
        forecast_result_with_actual_df['Forecast_Value']
    )
)

# Parse the 'Date' column once and derive every date-based column from it.
_parsed_dates = pd.to_datetime(forecast_result_with_actual_df['Date'])

forecast_result_with_actual_df['Date_Timestamp'] = _parsed_dates.apply(to_timestamp_isoformat)
for _date_column, _to_period_date in [
        ('Month_Start_Date', get_month_start_date),
        ('Quarter_Start_Date', get_quarter_start_date),
        ('Year_Start_Date', get_year_start_date),
        ('Quarter_Week_Date', get_quarter_week_date)]:
    forecast_result_with_actual_df[_date_column] = (
        _parsed_dates.apply(_to_period_date).apply(to_date_isoformat)
    )
# Flag rows whose quarter start equals the prediction quarter start.
forecast_result_with_actual_df['Is_Current_Quarter'] = (
    _parsed_dates.apply(get_quarter_start_date) == prediction_quarter_start_date
)
print(len(forecast_result_with_actual_df))
forecast_result_with_actual_df.sort_values('Date').head()

In [None]:
# Running totals of the actual values within each month, quarter and year group.
for _cumsum_column, _group_column in [
        ('Actual_Month_Cumsum_Value', 'Month_Start_Date'),
        ('Actual_Quarter_Cumsum_Value', 'Quarter_Start_Date'),
        ('Actual_Year_Cumsum_Value', 'Year_Start_Date')]:
    forecast_result_with_actual_df[_cumsum_column] = (
        forecast_result_with_actual_df.groupby(_group_column)['Actual_Value'].cumsum()
    )
forecast_result_with_actual_df.head()

In [None]:
# Write the combined actual + forecast frame to the configured BigQuery table.
print('writing to:', forecast_output_table_name)
to_gbq(
    forecast_result_with_actual_df,
    forecast_output_table_name,
    project_id=project_id,
    # NOTE(review): presumably overwrites any existing table, as pandas' to_gbq does;
    # confirm against data_science_pipeline.utils.bq.to_gbq.
    if_exists='replace'
)
print('done')