# Facebook Prophet

In [4]:
 
import plotly.io as pio
pio.renderers.default = "colab+notebook_connected+vscode"

import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import optuna
import warnings

warnings.filterwarnings('ignore')

from IPython.display import display, Markdown

# mape
from sklearn.metrics import mean_absolute_percentage_error

optuna.logging.set_verbosity(optuna.logging.ERROR)

# facebook prophet
from prophet import Prophet

In [10]:
# All processed inputs live in one directory; keep its location in a single
# constant so it only has to be changed in one place.
PROCESSED_DATA_DIR = "../../data/processed"

# Enhanced training data (presumably row-level; not used in the cells shown here).
orig = pd.read_parquet(f"{PROCESSED_DATA_DIR}/train_enhanced.parquet")
# Daily aggregates without a region dimension - input to the global forecast.
train_agg = pd.read_parquet(f"{PROCESSED_DATA_DIR}/train_agg.parquet")
# Daily aggregates per Region_Code - input to the per-region forecasts.
train_region_code_agg = pd.read_parquet(f"{PROCESSED_DATA_DIR}/train_region_code_agg.parquet")
# Calendar with a `Holiday` flag column; filtered to the flagged rows further down.
holiday_df = pd.read_csv(f"{PROCESSED_DATA_DIR}/holidays.csv")

# Individual Region Forecasting

In [7]:
# Inspect the per-region daily aggregates loaded above.
train_region_code_agg

Unnamed: 0,Date,Region_Code,Total_Sales,Avg_Sales,Total_Orders,Avg_Orders,Num_Stores,Holiday,Total_Discounts
0,2018-01-01,R4,2286812,45736,2914,58,50,1,50
1,2018-01-01,R2,4436859,42256,5644,54,105,1,105
2,2018-01-01,R3,3527439,41017,4599,53,86,1,86
3,2018-01-01,R1,5094374,41084,6509,52,124,1,124
4,2018-01-02,R4,2545119,50902,3057,61,50,0,50
...,...,...,...,...,...,...,...,...,...
2059,2019-05-30,R4,1966320,39326,2829,57,50,0,4
2060,2019-05-31,R2,4351299,41441,6411,61,105,1,11
2061,2019-05-31,R4,1909319,38186,2746,55,50,1,1
2062,2019-05-31,R1,5900798,47587,9433,76,124,1,18


In [11]:
# Build the `holidays` frame passed to Prophet: keep only the rows flagged as a
# holiday, rename the columns to `ds` / `holiday`, and give every row the same
# "Holiday" label.
# NOTE: this overwrites `holiday_df`, so re-running the cell without reloading
# the CSV fails (the `Holiday` column has already been renamed).
holiday_df = (
    holiday_df.loc[holiday_df["Holiday"] == 1]
    .rename(columns={"Date": "ds", "Holiday": "holiday"})
    .assign(holiday="Holiday")
)
holiday_df

Unnamed: 0,ds,holiday
2,2019-01-13,Holiday
10,2019-03-21,Holiday
14,2019-04-14,Holiday
15,2019-04-17,Holiday
16,2019-04-19,Holiday
...,...,...
483,2018-03-02,Holiday
505,2018-10-02,Holiday
506,2018-10-18,Holiday
508,2018-11-13,Holiday


In [13]:
# Use the `ds` (timestamp) / `y` (target) column names that Prophet expects;
# Region_Code is kept as the split key, the other two columns as regressors.
region_columns = ["Date", "Region_Code", "Total_Sales", "Num_Stores", "Total_Discounts"]
df = train_region_code_agg.loc[:, region_columns].rename(
    columns={"Date": "ds", "Total_Sales": "y"}
)
df

Unnamed: 0,ds,Region_Code,y,Num_Stores,Total_Discounts
0,2018-01-01,R4,2286812,50,50
1,2018-01-01,R2,4436859,105,105
2,2018-01-01,R3,3527439,86,86
3,2018-01-01,R1,5094374,124,124
4,2018-01-02,R4,2545119,50,50
...,...,...,...,...,...
2059,2019-05-30,R4,1966320,50,4
2060,2019-05-31,R2,4351299,105,11
2061,2019-05-31,R4,1909319,50,1
2062,2019-05-31,R1,5900798,124,18


In [15]:
# Chronological hold-out: the first TRAIN_FRACTION of the covered date range is
# used for training, the remainder for testing.
TRAIN_FRACTION = 0.8

first_date, last_date = df["ds"].min(), df["ds"].max()
print("Min date:", first_date, ", Max ds:", last_date)

total_days = (last_date - first_date).days
train_days = total_days * TRAIN_FRACTION
# Last date (inclusive) that still belongs to the training window.
train_max = first_date + pd.DateOffset(days=train_days)
print("Total days:", total_days, ", Training days: ", train_days)
print("Train max date:", train_max)

Min date: 2018-01-01 00:00:00 , Max ds: 2019-05-31 00:00:00
Total days: 515 , Training days:  412.0
Train max date: 2019-02-17 00:00:00


In [65]:
# Chronological split: rows up to and including `train_max` are used for
# fitting, everything after it is held out.
in_train_window = df["ds"] <= train_max
after_train_window = df["ds"] > train_max
train = df.loc[in_train_window]
test = df.loc[after_train_window]

In [191]:
def split_region(train_df, test_df, region_code):
    """Return one region's (train, test) rows without the Region_Code column.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames that contain a ``Region_Code`` column.
    region_code : str
        Value of ``Region_Code`` to keep, e.g. ``"R1"``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The filtered train and test frames, in that order.
    """
    def _select(frame):
        # `columns=` already identifies the axis, so no `axis=1` is needed.
        return frame[frame["Region_Code"] == region_code].drop(columns=["Region_Code"])

    return _select(train_df), _select(test_df)


# One (train, test) pair per region; the variable names are used by later cells.
region_1_train, region_1_test = split_region(train, test, "R1")
region_2_train, region_2_test = split_region(train, test, "R2")
region_3_train, region_3_test = split_region(train, test, "R3")
region_4_train, region_4_test = split_region(train, test, "R4")

In [172]:
# Hand-picked baseline: multiplicative seasonality with small (0.01) prior
# scales for holidays, seasonality and trend changepoints.
m = Prophet(
    holidays=holiday_df,
    yearly_seasonality=True,
    weekly_seasonality=True,
    seasonality_mode="multiplicative",
    holidays_prior_scale=0.01,
    seasonality_prior_scale=0.01,
    changepoint_prior_scale=0.01,
)
# Custom ~monthly cycle in addition to the yearly and weekly seasonalities.
m.add_seasonality(name="monthly", period=30.5, fourier_order=5)
# Extra regressors, supplied as columns of the frames given to fit / predict.
m.add_regressor("Num_Stores")
m.add_regressor("Total_Discounts")

<prophet.forecaster.Prophet at 0x354ae3aa0>

In [173]:
# Fit on Region 1's training window, then predict the held-out dates.
# The target column is removed so only `ds` and the regressors are passed in.
m.fit(region_1_train)
region_1_future = region_1_test.drop(columns=["y"])
forecast = m.predict(region_1_future)

22:49:59 - cmdstanpy - INFO - Chain [1] start processing
22:50:00 - cmdstanpy - INFO - Chain [1] done processing


In [174]:
# Forecast vs. actual sales for the Region 1 hold-out period.
fig = px.line(forecast, x="ds", y="yhat", title="Sales Forecast")
fig.add_scatter(x=region_1_test["ds"], y=region_1_test["y"], mode="lines", name="Actual Sales")

# Shade the band between `yhat_upper` and `yhat_lower`: the upper bound is drawn
# first and the lower bound fills up to it via `tonexty`.
interval_style = dict(mode="lines", line_color="lightblue")
fig.add_trace(
    go.Scatter(x=forecast["ds"], y=forecast["yhat_upper"], fill=None, name="Upper Bound", **interval_style)
)
fig.add_trace(
    go.Scatter(x=forecast["ds"], y=forecast["yhat_lower"], fill="tonexty", name="Lower Bound", **interval_style)
)
fig.show()

In [175]:
# MAPE of the baseline Region 1 forecast on the held-out period.
mean_absolute_percentage_error(region_1_test["y"], forecast["yhat"])

0.1741072454750751

# Hyperparameter tuning 

In [195]:
# Optuna search over Prophet's prior scales and seasonality mode for Region 4.
#
# NOTE(review): each trial is scored on `data_test`, the same hold-out whose
# score is later read from `study.best_value`, so that number is optimistic.
# Consider tuning against a separate validation window.
# NOTE: the study is persisted in optuna.db and reopened when it already exists
# (`load_if_exists=True`), so every re-run of this cell adds 500 more trials.

data_train = region_4_train
data_test = region_4_test


def objective(trial):
    """Train Prophet with one trial's hyperparameters and score it.

    Reads the notebook-level ``data_train``, ``data_test`` and ``holiday_df``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        MAPE of ``yhat`` against ``data_test["y"]`` (lower is better).
    """
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    params = {
        "changepoint_prior_scale": trial.suggest_float("changepoint_prior_scale", 0.001, 0.5, log=True),
        "seasonality_prior_scale": trial.suggest_float("seasonality_prior_scale", 0.01, 10, log=True),
        "holidays_prior_scale": trial.suggest_float("holidays_prior_scale", 0.01, 10, log=True),
        "seasonality_mode": trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
    }
    model = Prophet(
        holidays=holiday_df,
        yearly_seasonality=True,
        weekly_seasonality=True,
        **params,
    )
    model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
    model.add_regressor("Num_Stores")
    model.add_regressor("Total_Discounts")
    model.fit(data_train)
    # Drop the target so only `ds` and the regressors are passed to predict.
    forecast = model.predict(data_test.drop(columns=["y"]))
    return mean_absolute_percentage_error(data_test["y"], forecast["yhat"])


study = optuna.create_study(
    direction="minimize",
    study_name="prophet_region_4",
    storage="sqlite:///optuna.db",
    load_if_exists=True,
    # Fixed seed so the sampler's proposals are repeatable for a fresh study.
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

23:19:28 - cmdstanpy - INFO - Chain [1] start processing
23:19:28 - cmdstanpy - INFO - Chain [1] done processing
23:19:28 - cmdstanpy - INFO - Chain [1] start processing
23:19:28 - cmdstanpy - INFO - Chain [1] done processing
23:19:28 - cmdstanpy - INFO - Chain [1] start processing
23:19:28 - cmdstanpy - INFO - Chain [1] done processing
23:19:28 - cmdstanpy - INFO - Chain [1] start processing
23:19:28 - cmdstanpy - INFO - Chain [1] done processing
23:19:28 - cmdstanpy - INFO - Chain [1] start processing
23:19:29 - cmdstanpy - INFO - Chain [1] done processing
23:19:29 - cmdstanpy - INFO - Chain [1] start processing
23:19:29 - cmdstanpy - INFO - Chain [1] done processing
23:19:29 - cmdstanpy - INFO - Chain [1] start processing
23:19:29 - cmdstanpy - INFO - Chain [1] done processing
23:19:29 - cmdstanpy - INFO - Chain [1] start processing
23:19:29 - cmdstanpy - INFO - Chain [1] done processing
23:19:29 - cmdstanpy - INFO - Chain [1] start processing
23:19:29 - cmdstanpy - INFO - Chain [1]

In [196]:
# Hyperparameters of the best trial in the `prophet_region_4` study.
study.best_params

{'changepoint_prior_scale': 0.007254441353678806,
 'seasonality_prior_scale': 1.4941005961264453,
 'holidays_prior_scale': 0.9143078357824553,
 'seasonality_mode': 'additive'}

In [197]:
# Objective value (hold-out MAPE) of the best `prophet_region_4` trial.
study.best_value

0.16044228296575438

# Global Forecasting

In [198]:
# Inspect the daily aggregates that have no region dimension.
train_agg

Unnamed: 0,Date,Total_Sales,Avg_Sales,Total_Orders,Avg_Orders,Holiday,Total_Discounts
0,2018-01-01,15345484,42042,19666,54,1,365
1,2018-01-02,19592415,53678,25326,69,0,365
2,2018-01-03,18652527,51103,24047,66,0,365
3,2018-01-04,19956267,54675,25584,70,0,364
4,2018-01-05,22902651,62747,28436,78,0,364
...,...,...,...,...,...,...,...
511,2019-05-27,17197023,47115,25447,70,0,321
512,2019-05-28,18652065,51102,27184,74,0,319
513,2019-05-29,16213497,44421,24047,66,0,193
514,2019-05-30,16082139,44061,24318,67,0,76


In [200]:
# Re-point `df` at the global series, using the same `ds` / `y` naming as the
# regional frame; Total_Discounts is the only extra regressor kept.
global_columns = ["Date", "Total_Sales", "Total_Discounts"]
df = train_agg.loc[:, global_columns].rename(
    columns={"Date": "ds", "Total_Sales": "y"}
)
df

Unnamed: 0,ds,y,Total_Discounts
0,2018-01-01,15345484,365
1,2018-01-02,19592415,365
2,2018-01-03,18652527,365
3,2018-01-04,19956267,364
4,2018-01-05,22902651,364
...,...,...,...
511,2019-05-27,17197023,321
512,2019-05-28,18652065,319
513,2019-05-29,16213497,193
514,2019-05-30,16082139,76


In [201]:
# Same chronological split as for the regions, reusing the `train_max` cut-off
# computed earlier; `train` / `test` now refer to the global series.
in_train_window = df["ds"] <= train_max
after_train_window = df["ds"] > train_max
train = df.loc[in_train_window]
test = df.loc[after_train_window]

In [202]:
# NOTE(review): trials are scored on `test`, the same hold-out used afterwards
# to judge the final model, so the tuned score is optimistic. Consider tuning
# against a separate validation window.
# NOTE: the study is persisted in optuna.db and reopened when it already exists
# (`load_if_exists=True`), so every re-run of this cell adds 500 more trials.
def objective(trial):
    """Train Prophet on the global series with one trial's hyperparameters.

    Reads the notebook-level ``train``, ``test`` and ``holiday_df``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        MAPE of ``yhat`` against ``test["y"]`` (lower is better).
    """
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    params = {
        "changepoint_prior_scale": trial.suggest_float("changepoint_prior_scale", 0.001, 0.5, log=True),
        "seasonality_prior_scale": trial.suggest_float("seasonality_prior_scale", 0.01, 10, log=True),
        "holidays_prior_scale": trial.suggest_float("holidays_prior_scale", 0.01, 10, log=True),
        "seasonality_mode": trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
    }
    model = Prophet(
        holidays=holiday_df,
        yearly_seasonality=True,
        weekly_seasonality=True,
        **params,
    )
    model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
    model.add_regressor("Total_Discounts")
    model.fit(train)
    # Drop the target so only `ds` and the regressor are passed to predict.
    forecast = model.predict(test.drop(columns=["y"]))
    return mean_absolute_percentage_error(test["y"], forecast["yhat"])


study = optuna.create_study(
    direction="minimize",
    study_name="prophet_global",
    storage="sqlite:///optuna.db",
    load_if_exists=True,
    # Fixed seed so the sampler's proposals are repeatable for a fresh study.
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1] done processing
23:27:51 - cmdstanpy - INFO - Chain [1] start processing
23:27:51 - cmdstanpy - INFO - Chain [1]

In [205]:
# Hyperparameters of the best trial in the `prophet_global` study.
study.best_params

{'changepoint_prior_scale': 0.17502184036303478,
 'seasonality_prior_scale': 1.0519288873084944,
 'holidays_prior_scale': 0.3145864202229293,
 'seasonality_mode': 'additive'}

In [206]:
# Refit on the global training window with the best trial's settings. The keys
# of `study.best_params` are the Prophet constructor arguments tuned above, so
# they are unpacked straight into the constructor.
best_params = study.best_params
m = Prophet(
    holidays=holiday_df,
    yearly_seasonality=True,
    weekly_seasonality=True,
    **best_params,
)
m.add_seasonality(name="monthly", period=30.5, fourier_order=5)
m.add_regressor("Total_Discounts")
m.fit(train)
# Predict the held-out dates; the target column is dropped first.
global_future = test.drop(columns=["y"])
forecast = m.predict(global_future)


23:30:42 - cmdstanpy - INFO - Chain [1] start processing
23:30:42 - cmdstanpy - INFO - Chain [1] done processing


In [207]:
# Forecast vs. actual sales for the global hold-out period.
fig = px.line(forecast, x="ds", y="yhat", title="Sales Forecast")
fig.add_scatter(x=test["ds"], y=test["y"], mode="lines", name="Actual Sales")

# Shade the band between `yhat_upper` and `yhat_lower`: the upper bound is drawn
# first and the lower bound fills up to it via `tonexty`.
interval_style = dict(mode="lines", line_color="lightblue")
fig.add_trace(
    go.Scatter(x=forecast["ds"], y=forecast["yhat_upper"], fill=None, name="Upper Bound", **interval_style)
)
fig.add_trace(
    go.Scatter(x=forecast["ds"], y=forecast["yhat_lower"], fill="tonexty", name="Lower Bound", **interval_style)
)
fig.show()