In [1]:
# Notebook parameters.
# project_id: passed to `read_big_query` and `to_gbq` below.
project_id = 'elife-data-pipeline'
# source_dataset: substituted (with project_id) into the SQL template when reading.
source_dataset = 'de_dev'
# output_dataset / output_table_prefix / output_table_name are combined into
# `forecast_output_table_name` as '<output_dataset>.<prefix><name>'.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
output_table_name = 'forecast_initial_submission'
# sql_filename: name handed to `get_sql` to load the query template.
sql_filename = 'initial_submission_count_by_date.sql'

In [2]:
from functools import partial

import pandas as pd

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import, wrong-import-order

from fbprophet import Prophet

from IPython.display import display

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import, ungrouped-imports

In [3]:
# Destination table in '<dataset>.<prefix><name>' form; used by the final `to_gbq` call.
forecast_output_table_name = '{output_dataset}.{prefix}{suffix}'.format(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix=output_table_name
)

In [4]:
# Bind the project id once so later calls only need to pass the query text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [5]:
# Values substituted into the SQL template via `str.format` in the load cell.
default_query_props = dict(project=project_id, dataset=source_dataset)

In [6]:
# Load the SQL template, fill in project/dataset, and run it.
# Per the rendered query and output below, the result has one row per
# initial_submission_date with a manuscript_count, newest date first.
raw_timeseries_df = read_big_query(
    get_sql(sql_filename).format(
        **default_query_props
    )
)
raw_timeseries_df.head()

> ```sql
> SELECT
>   DATE(QC_Complete_Timestamp) AS initial_submission_date,
>   COUNT(DISTINCT Manuscript_ID) AS manuscript_count
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Manuscript_Version`
> WHERE Overall_Stage = 'Initial Submission'
>   AND Position_In_Overall_Stage = 1
>   AND DATE(QC_Complete_Timestamp) < DATE(CURRENT_TIMESTAMP())
> GROUP BY initial_submission_date
> ORDER BY initial_submission_date DESC
> ```

Unnamed: 0,initial_submission_date,manuscript_count
0,2020-09-30,36
1,2020-09-29,32
2,2020-09-28,36
3,2020-09-27,16
4,2020-09-26,8


In [7]:
# Show the column dtypes; the following cells assert on them.
raw_timeseries_df.dtypes

initial_submission_date    datetime64[ns]
manuscript_count                    int64
dtype: object

In [8]:
# Exactly two columns are required: they are renamed positionally to ['ds', 'y'] below.
assert len(raw_timeseries_df.columns) == 2

In [9]:
# The first column becomes 'ds' and must be a datetime column.
# Use .iloc for positional access: `dtypes` is indexed by column name, and
# integer keys on a label-indexed Series are deprecated as positional lookups.
assert pd.api.types.is_datetime64_dtype(raw_timeseries_df.dtypes.iloc[0]), (
    'expected first column to be datetime64, got %s' % raw_timeseries_df.dtypes.iloc[0]
)

In [10]:
# The second column becomes 'y' and must hold integer counts.
# Use .iloc for positional access: `dtypes` is indexed by column name, and
# integer keys on a label-indexed Series are deprecated as positional lookups.
assert pd.api.types.is_integer_dtype(raw_timeseries_df.dtypes.iloc[1]), (
    'expected second column to be an integer dtype, got %s' % raw_timeseries_df.dtypes.iloc[1]
)

In [11]:
# Work on a copy so the raw query result stays untouched, and rename the two
# columns positionally to 'ds' (date) and 'y' (value), the names the model
# fit and the filter helpers below read.
timeseries_df = raw_timeseries_df.copy()
timeseries_df.columns = ['ds', 'y']
timeseries_df.head()

Unnamed: 0,ds,y
0,2020-09-30,36
1,2020-09-29,32
2,2020-09-28,36
3,2020-09-27,16
4,2020-09-26,8


In [12]:
# Fit the forecasting model on the full history.
# NOTE(review): the input has one row per day (see the query output above);
# confirm that daily_seasonality=True is intended for daily-granularity data.
model = Prophet(daily_seasonality=True)
model.fit(timeseries_df)
# Trailing `None` keeps the cell from echoing the return value of `fit`.
None

In [13]:
# Forecast 365 + 31 = 396 future periods; history rows are excluded, so
# forecast_df starts at the first predicted date.
forecast_df = model.predict(
    model.make_future_dataframe(periods=365 + 31, include_history=False)
)

In [14]:
# Preview the point forecast (yhat) with its lower/upper bounds.
forecast_df[['ds', 'yhat_lower', 'yhat', 'yhat_upper']].head(30)

Unnamed: 0,ds,yhat_lower,yhat,yhat_upper
0,2020-10-01,24.82949,33.698625,43.191054
1,2020-10-02,23.975671,32.702327,42.007543
2,2020-10-03,12.516366,21.695877,31.234941
3,2020-10-04,11.378244,20.225463,29.308168
4,2020-10-05,21.339802,30.273191,39.509846
5,2020-10-06,24.848674,33.899554,43.219248
6,2020-10-07,24.912196,34.081482,42.717996
7,2020-10-08,24.02141,33.582865,42.510825
8,2020-10-09,22.798691,32.54239,40.9745
9,2020-10-10,12.92222,21.497123,30.774038


In [15]:
def get_month_start_date(d):
    """Return the first day of the month that contains `d`.

    One day is added before rolling back to the month begin so that a date
    which already is the first of a month maps to itself rather than to the
    previous month.
    """
    shifted = d + pd.offsets.DateOffset(days=1)
    month_begin = pd.offsets.MonthBegin(1)
    return shifted - month_begin


def get_quarter_start_date(d):
    """Return the first day of the calendar quarter that contains `d`.

    Quarters are anchored on January (`startingMonth=1`). One day is added
    before rolling back so that a date which already is a quarter start maps
    to itself.
    """
    shifted = d + pd.offsets.DateOffset(days=1)
    quarter_begin = pd.offsets.QuarterBegin(1, startingMonth=1)
    return shifted - quarter_begin


def get_year_start_date(d):
    """Return the first of January of the year that contains `d`.

    One day is added before rolling back so that a date which already is the
    first of January maps to itself.
    """
    shifted = d + pd.offsets.DateOffset(days=1)
    year_begin = pd.offsets.YearBegin(1)
    return shifted - year_begin


# The first forecast row is the first predicted day (history was excluded).
# NOTE(review): `['ds'][0]` looks up label 0, which assumes forecast_df has a
# default 0-based index, and adding DateOffset(days=0) appears to be a no-op.
# Confirm both before simplifying.
prediction_start_date = forecast_df['ds'][0] + pd.offsets.DateOffset(days=0)
print('prediction_start_date:', prediction_start_date)
# Start of the month / quarter / year that contains the first predicted day.
prediction_month_start_date = get_month_start_date(prediction_start_date)
print('prediction_month_start_date:', prediction_month_start_date)
prediction_quarter_start_date = get_quarter_start_date(prediction_start_date)
print('prediction_quarter_start_date:', prediction_quarter_start_date)
prediction_year_start_date = get_year_start_date(prediction_start_date)
print('prediction_year_start_date:', prediction_year_start_date)

prediction_start_date: 2020-10-01 00:00:00
prediction_month_start_date: 2020-10-01 00:00:00
prediction_quarter_start_date: 2020-10-01 00:00:00
prediction_year_start_date: 2020-01-01 00:00:00


In [16]:
def filter_date_between(df: pd.DataFrame, start_date, excl_end_date, date_column='ds') -> pd.DataFrame:
    """Return the rows of `df` whose date lies in [start_date, excl_end_date).

    Args:
        df: frame containing the column named by `date_column`.
        start_date: inclusive lower bound.
        excl_end_date: exclusive upper bound.
        date_column: name of the column to compare (default 'ds').
    """
    dates = df[date_column]
    on_or_after_start = dates >= start_date
    before_end = dates < excl_end_date
    return df[on_or_after_start & before_end]


def filter_by_month(df: pd.DataFrame, month_date, **kwargs) -> pd.DataFrame:
    """Return the rows of `df` that fall in the month containing `month_date`.

    Extra keyword arguments (e.g. `date_column`) are forwarded to
    `filter_date_between`.
    """
    start = get_month_start_date(month_date)
    # The first day of the following month is the exclusive upper bound.
    excl_end = start + pd.offsets.MonthBegin(1)
    return filter_date_between(df, start, excl_end, **kwargs)


# Sanity check: forecast rows in the prediction month, with the first and
# last dates shown.
_df = filter_by_month(forecast_df, prediction_month_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

31


0   2020-10-01
1   2020-10-02
2   2020-10-03
3   2020-10-04
4   2020-10-05
Name: ds, dtype: datetime64[ns]

26   2020-10-27
27   2020-10-28
28   2020-10-29
29   2020-10-30
30   2020-10-31
Name: ds, dtype: datetime64[ns]

In [17]:
def filter_by_quarter(df: pd.DataFrame, quarter_date, **kwargs) -> pd.DataFrame:
    """Return the rows of `df` that fall in the quarter containing `quarter_date`.

    Extra keyword arguments (e.g. `date_column`) are forwarded to
    `filter_date_between`.
    """
    start = get_quarter_start_date(quarter_date)
    # Three month-begins after the quarter start is the exclusive upper bound.
    excl_end = start + pd.offsets.MonthBegin(3)
    return filter_date_between(df, start, excl_end, **kwargs)


# Sanity check: forecast rows in the prediction quarter, with the first and
# last dates shown.
_df = filter_by_quarter(forecast_df, prediction_quarter_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

92


0   2020-10-01
1   2020-10-02
2   2020-10-03
3   2020-10-04
4   2020-10-05
Name: ds, dtype: datetime64[ns]

87   2020-12-27
88   2020-12-28
89   2020-12-29
90   2020-12-30
91   2020-12-31
Name: ds, dtype: datetime64[ns]

In [18]:
def filter_by_year(df: pd.DataFrame, year_date, **kwargs) -> pd.DataFrame:
    """Return the rows of `df` that fall in the year containing `year_date`.

    Extra keyword arguments (e.g. `date_column`) are forwarded to
    `filter_date_between`.
    """
    start = get_year_start_date(year_date)
    # The first of January of the following year is the exclusive upper bound.
    excl_end = start + pd.offsets.YearBegin(1)
    return filter_date_between(df, start, excl_end, **kwargs)


# Sanity check: forecast rows in the prediction year. Only the part of the
# year from the first predicted day onwards is in forecast_df.
_df = filter_by_year(forecast_df, prediction_year_start_date)
print(len(_df))
display(_df['ds'].head(), _df['ds'].tail())

92


0   2020-10-01
1   2020-10-02
2   2020-10-03
3   2020-10-04
4   2020-10-05
Name: ds, dtype: datetime64[ns]

87   2020-12-27
88   2020-12-28
89   2020-12-29
90   2020-12-30
91   2020-12-31
Name: ds, dtype: datetime64[ns]

In [19]:
# Month starts from 12 months before to 12 months after the prediction month
# (25 months in total).
_month_dates = [
    prediction_month_start_date + pd.offsets.MonthBegin(month_offset)
    for month_offset in range(-12, 13)
]
# Per month: the sum of actual counts, the sum of forecast values, and their
# total in 'y'.
predicted_month_count_incl_history_df = pd.DataFrame({
    'ds': _month_dates,
    'y_actual': [
        filter_by_month(timeseries_df, month_date)['y'].sum()
        for month_date in _month_dates
    ],
    'y_forecast': [
        filter_by_month(forecast_df, month_date)['yhat'].sum()
        for month_date in _month_dates
    ]
}).assign(
    y=lambda monthly_df: monthly_df['y_actual'] + monthly_df['y_forecast']
)
predicted_month_count_incl_history_df.head(25)

Unnamed: 0,ds,y_actual,y_forecast,y
0,2019-10-01,825,0.0,825.0
1,2019-11-01,768,0.0,768.0
2,2019-12-01,684,0.0,684.0
3,2020-01-01,758,0.0,758.0
4,2020-02-01,759,0.0,759.0
5,2020-03-01,843,0.0,843.0
6,2020-04-01,891,0.0,891.0
7,2020-05-01,951,0.0,951.0
8,2020-06-01,942,0.0,942.0
9,2020-07-01,958,0.0,958.0


In [20]:
def add_initial(ser, initial_value):
    """Return a copy of the values of `ser` with `initial_value` added to the first one.

    The input series is not modified. An empty series yields an empty array
    instead of raising IndexError.

    Args:
        ser: pandas Series of numeric values.
        initial_value: amount added to the first element only.

    Returns:
        Array of the same length as `ser`.
    """
    _values = ser.values.copy()
    if len(_values) > 0:
        _values[0] += initial_value
    return _values


def to_isodate(ser) -> pd.Series:
    return ser.apply(
        lambda dt: dt.strftime('%Y-%m-%d')
    )


# Select the forecast columns and give them their output-table names.
forecast_result_df = (
    forecast_df[['ds', 'yhat_lower', 'yhat', 'yhat_upper']]
    .rename(columns={
        'ds': 'Forecast_Date',
        'yhat': 'Forecast_Value',
        'yhat_lower': 'Forecast_Lower_Bound_Value',
        'yhat_upper': 'Forecast_Upper_Bound_Value'
    })
    .copy()
)

# Round the forecast and its bounds to whole numbers.
for c in ['Forecast_Value', 'Forecast_Lower_Bound_Value', 'Forecast_Upper_Bound_Value']:
    forecast_result_df[c] = forecast_result_df[c].round().astype(int)

# The "*_Offset_Value" columns equal Forecast_Value, except that the first
# forecast row additionally carries the actual count already observed in the
# prediction month / quarter / year.
# Fix: 'Forecast_Month_Offset_Value' used to be assigned a second time with
# `filter_by_quarter`, which overwrote the month-to-date offset with the
# quarter-to-date one. The duplicate assignment is removed.
forecast_result_df['Forecast_Month_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_month(timeseries_df, prediction_month_start_date)['y'].sum()
)
forecast_result_df['Forecast_Quarter_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_quarter(timeseries_df, prediction_quarter_start_date)['y'].sum()
)
forecast_result_df['Forecast_Year_Offset_Value'] = add_initial(
    forecast_result_df['Forecast_Value'],
    filter_by_year(timeseries_df, prediction_year_start_date)['y'].sum()
)
# Dates are stored as 'YYYY-MM-DD' strings; 'Date' duplicates 'Forecast_Date'
# and is the key used to merge with the actual values later on.
forecast_result_df['Forecast_Date'] = to_isodate(forecast_result_df['Forecast_Date'])
forecast_result_df['Date'] = forecast_result_df['Forecast_Date']
forecast_result_df['Model'] = 'fbprophet'
print(len(forecast_result_df))
forecast_result_df.head()

396


Unnamed: 0,Forecast_Date,Forecast_Lower_Bound_Value,Forecast_Value,Forecast_Upper_Bound_Value,Forecast_Month_Offset_Value,Forecast_Quarter_Offset_Value,Forecast_Year_Offset_Value,Date,Model
0,2020-10-01,25,34,43,34,34,7855,2020-10-01,fbprophet
1,2020-10-02,24,33,42,33,33,33,2020-10-02,fbprophet
2,2020-10-03,13,22,31,22,22,22,2020-10-03,fbprophet
3,2020-10-04,11,20,29,20,20,20,2020-10-04,fbprophet
4,2020-10-05,21,30,40,30,30,30,2020-10-05,fbprophet


In [21]:
def to_iso_timestamp(ser) -> pd.Series:
    return ser.apply(
        lambda dt: dt.strftime('%Y-%m-%dT00:00:00Z')
    )


# Actual values keyed by the same 'YYYY-MM-DD' string 'Date' as the forecast.
actual_result_df = timeseries_df.rename(columns={'ds': 'Date', 'y': 'Actual_Value'}).copy()
actual_result_df['Date'] = to_isodate(actual_result_df['Date'])

# Outer merge keeps dates that only have actuals as well as dates that only
# have a forecast; rows are ordered by date.
forecast_result_with_actual_df = forecast_result_df.merge(
    actual_result_df,
    on='Date',
    how='outer'
).sort_values('Date')

# Parse the date strings once and reuse them for every derived column.
_date_ser = pd.to_datetime(forecast_result_with_actual_df['Date'])
_quarter_start_ser = _date_ser.apply(get_quarter_start_date)

forecast_result_with_actual_df['Date_Timestamp'] = to_iso_timestamp(_date_ser)
forecast_result_with_actual_df['Month_Start_Date'] = to_isodate(
    _date_ser.apply(get_month_start_date)
)
forecast_result_with_actual_df['Quarter_Start_Date'] = to_isodate(_quarter_start_ser)
forecast_result_with_actual_df['Year_Start_Date'] = to_isodate(
    _date_ser.apply(get_year_start_date)
)
# True for rows in the quarter that contains the first predicted day.
forecast_result_with_actual_df['Is_Current_Quarter'] = (
    _quarter_start_ser == prediction_quarter_start_date
)
print(len(forecast_result_with_actual_df))
forecast_result_with_actual_df.sort_values('Date').head()

3210


Unnamed: 0,Forecast_Date,Forecast_Lower_Bound_Value,Forecast_Value,Forecast_Upper_Bound_Value,Forecast_Month_Offset_Value,Forecast_Quarter_Offset_Value,Forecast_Year_Offset_Value,Date,Model,Actual_Value,Date_Timestamp,Month_Start_Date,Quarter_Start_Date,Year_Start_Date,Is_Current_Quarter
3209,,,,,,,,2012-05-04,,1.0,2012-05-04T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False
3208,,,,,,,,2012-05-07,,1.0,2012-05-07T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False
3207,,,,,,,,2012-05-19,,1.0,2012-05-19T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False
3206,,,,,,,,2012-05-22,,1.0,2012-05-22T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False
3205,,,,,,,,2012-05-30,,1.0,2012-05-30T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False


In [22]:
# Running total of actual values within each month, quarter and year.
# The frame was sorted by 'Date' when it was built, so each running total
# accumulates in date order within its group.
for _group_column, _cumsum_column in [
    ('Month_Start_Date', 'Actual_Month_Cumsum_Value'),
    ('Quarter_Start_Date', 'Actual_Quarter_Cumsum_Value'),
    ('Year_Start_Date', 'Actual_Year_Cumsum_Value'),
]:
    forecast_result_with_actual_df[_cumsum_column] = (
        forecast_result_with_actual_df.groupby(_group_column)['Actual_Value'].cumsum()
    )
forecast_result_with_actual_df.head()

Unnamed: 0,Forecast_Date,Forecast_Lower_Bound_Value,Forecast_Value,Forecast_Upper_Bound_Value,Forecast_Month_Offset_Value,Forecast_Quarter_Offset_Value,Forecast_Year_Offset_Value,Date,Model,Actual_Value,Date_Timestamp,Month_Start_Date,Quarter_Start_Date,Year_Start_Date,Is_Current_Quarter,Actual_Month_Cumsum_Value,Actual_Quarter_Cumsum_Value,Actual_Year_Cumsum_Value
3209,,,,,,,,2012-05-04,,1.0,2012-05-04T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False,1.0,1.0,1.0
3208,,,,,,,,2012-05-07,,1.0,2012-05-07T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False,2.0,2.0,2.0
3207,,,,,,,,2012-05-19,,1.0,2012-05-19T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False,3.0,3.0,3.0
3206,,,,,,,,2012-05-22,,1.0,2012-05-22T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False,4.0,4.0,4.0
3205,,,,,,,,2012-05-30,,1.0,2012-05-30T00:00:00Z,2012-05-01,2012-04-01,2012-01-01,False,5.0,5.0,5.0


In [23]:
# Upload the combined forecast + actuals frame to the output table.
# NOTE(review): if_exists='replace' presumably overwrites any existing table
# of that name on every run; confirm against `to_gbq`.
print('writing to:', forecast_output_table_name)
to_gbq(
    forecast_result_with_actual_df,
    forecast_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_forecast_initial_submission


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processed 3210 lines
INFO:data_science_pipeline.utils.bq:loading from /tmp/tmp7wpth0d3/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 3210 rows into de_dev:data_science_forecast_initial_submission.


done
