In [2]:
pip install prophet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Connect to server
#import pyodbc
#from dotenv import dotenv_values

# Datetime
from datetime import datetime

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import missingno as msno
from prophet.plot import plot_plotly, plot_components_plotly

# Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Statistical Analysis
import scipy.stats as stats
from statsmodels.stats.weightstats import ttest_ind
import statsmodels.api as sm
#from pmdarima.arima import CHTest, nsdiffs
#from arch.unitroot import ADF, KPSS
from statsmodels.stats.diagnostic import acorr_ljungbox
#import phik
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Machine Learning Modeling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
#import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.pipeline import Pipeline
from prophet import Prophet

import os

import random

import warnings

# ignore warnings
warnings.filterwarnings('ignore')

from pathlib import Path, PureWindowsPath

  from .autonotebook import tqdm as notebook_tqdm


### Import data

In [3]:
path_cwd = Path(PureWindowsPath(os.path.dirname(os.getcwd())))
path_cwd
path = path_cwd / 'Feature Engineering'

In [5]:
df_test_p.columns

Index(['date', 'sales', 'family', 'familycluster', 'store_nbr'], dtype='object')

In [24]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to the 'family' column
df_train_p['family_enc'] = label_encoder.fit_transform(df_train_p['family'])
df_test_p['family_enc'] = label_encoder.fit_transform(df_test_p['family'])

In [33]:
model_df = df_train_p[['date', 'sales', 'family_enc']]
model_df.columns = ['ds', 'y', 'family_enc']
model_df['ds'] = pd.to_datetime(model_df['ds'])

In [35]:
model_df_test = df_test_p[['date', 'sales', 'family_enc']]
model_df_test.columns = ['ds', 'y', 'family_enc']
model_df_test['ds'] = pd.to_datetime(model_df_test['ds'])

In [37]:
m = Prophet()
m.add_regressor('family_enc')
m.fit(model_df)

17:55:37 - cmdstanpy - INFO - Chain [1] start processing
17:58:41 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x16f61e32e00>

In [38]:
forecast = m.predict(model_df_test.drop(columns="y"))

In [42]:
forecast

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,extra_regressors_additive,...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-03-13,439.702570,-829.400597,1754.479862,439.702570,439.702570,-72.687683,-72.687683,-72.687683,-83.502622,...,-11.437016,-11.437016,-11.437016,22.251955,22.251955,22.251955,0.0,0.0,0.0,367.014888
1,2017-03-13,439.702570,-813.308380,1877.814268,439.702570,439.702570,24.733268,24.733268,24.733268,13.918329,...,-11.437016,-11.437016,-11.437016,22.251955,22.251955,22.251955,0.0,0.0,0.0,464.435838
2,2017-03-13,439.702570,-865.667619,1866.640033,439.702570,439.702570,38.650547,38.650547,38.650547,27.835608,...,-11.437016,-11.437016,-11.437016,22.251955,22.251955,22.251955,0.0,0.0,0.0,478.353117
3,2017-03-13,439.702570,-997.539087,1789.619915,439.702570,439.702570,52.567826,52.567826,52.567826,41.752887,...,-11.437016,-11.437016,-11.437016,22.251955,22.251955,22.251955,0.0,0.0,0.0,492.270396
4,2017-03-13,439.702570,-954.850976,1905.409454,439.702570,439.702570,80.402383,80.402383,80.402383,69.587444,...,-11.437016,-11.437016,-11.437016,22.251955,22.251955,22.251955,0.0,0.0,0.0,520.104953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276566,2017-08-15,434.900615,-972.683922,1691.520095,291.261630,561.573803,-54.513967,-54.513967,-54.513967,27.835608,...,-38.014831,-38.014831,-38.014831,-44.334745,-44.334745,-44.334745,0.0,0.0,0.0,380.386648
276567,2017-08-15,434.900615,-720.560841,1896.371969,291.261076,561.574377,42.906983,42.906983,42.906983,125.256559,...,-38.014831,-38.014831,-38.014831,-44.334745,-44.334745,-44.334745,0.0,0.0,0.0,477.807599
276568,2017-08-15,434.900615,-803.825740,1827.492987,291.260521,561.574951,28.989705,28.989705,28.989705,111.339280,...,-38.014831,-38.014831,-38.014831,-44.334745,-44.334745,-44.334745,0.0,0.0,0.0,463.890320
276569,2017-08-15,434.900615,-1001.360413,1712.822883,291.259966,561.575525,-96.265804,-96.265804,-96.265804,-13.916228,...,-38.014831,-38.014831,-38.014831,-44.334745,-44.334745,-44.334745,0.0,0.0,0.0,338.634812


In [39]:
forecast['yhat']

0         367.014888
1         464.435838
2         478.353117
3         492.270396
4         520.104953
             ...    
276566    380.386648
276567    477.807599
276568    463.890320
276569    338.634812
276570    129.875631
Name: yhat, Length: 276571, dtype: float64

In [40]:
# Define a function to compute the evaluations metrics after the forecast
def evaluate_forecast(y_test, forecast):
    """
    Compute MSE, RMSE, and RMSLE for a forecast.

    Parameters:
    y_test (array-like): Actual values.
    forecast (array-like): Predicted values.

    Returns:
    dict: Dictionary containing MSE, RMSE, and RMSLE.
    """
    def rmsle(predicted, actual):
        return np.sqrt(np.mean(np.square(np.log1p(predicted) - np.log1p(actual))))

    # Compute Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, forecast)
    
    # Compute Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    
    # Compute Root Mean Squared Logarithmic Error (RMSLE)
    rmsle_value = rmsle(forecast, y_test)
    
    # Return the evaluation metrics as a dictionary
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'RMSLE': rmsle_value
    }
    
    return metrics

In [41]:
evaluate_forecast(model_df_test['y'], forecast['yhat'])

{'MSE': 1836449.216817036,
 'RMSE': 1355.156528529836,
 'RMSLE': 3.718822699991647}

In [47]:
df_test_all = pd.read_pickle(path / 'df_test.pkl')
df_train_all = pd.read_pickle(path / 'df_train.pkl')

In [48]:
df_test_all.columns

Index(['date', 'sales', 'onpromotion', 'day_of_week', 'month', 'year',
       'oil_price', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4',
       'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9',
       'cluster_10', 'cluster_11', 'cluster_12', 'cluster_13', 'cluster_14',
       'cluster_15', 'cluster_16', 'cluster_17', 'holiday_Carnaval',
       'holiday_Dia de la Madre', 'holiday_Dia del Trabajo',
       'holiday_Fundacion de Quito', 'holiday_Independencia de Cuenca',
       'holiday_Mundial de futbol Brasil: Ecuador-Suiza', 'holiday_Navidad-1',
       'holiday_Navidad-2', 'holiday_Navidad-3', 'holiday_Navidad-4',
       'holiday_Primer dia del ano', 'holiday_Terremoto Manabi+1',
       'holiday_Terremoto Manabi+2', 'holiday_Terremoto Manabi+3',
       'holiday_Terremoto Manabi+4', 'holiday_Terremoto Manabi+5',
       'holiday_Traslado Primer dia del ano', 'familycluster_0',
       'familycluster_1', 'familycluster_2', 'familycluster_3',
       'familycluster_4', '

In [52]:
model_df_all = df_train_all
model_df_test_all = df_test_all
model_df_all.rename(columns = {'date':'ds'}, inplace = True)
model_df_all.rename(columns = {'sales':'y'}, inplace = True)
model_df_all['ds'] = pd.to_datetime(model_df_all['ds'])
model_df_test_all.rename(columns = {'date':'ds'}, inplace = True)
model_df_test_all.rename(columns = {'sales':'y'}, inplace = True)
model_df_test_all['ds'] = pd.to_datetime(model_df_test_all['ds'])

In [58]:
model_df_all

Unnamed: 0,ds,y,onpromotion,day_of_week,month,year,oil_price,cluster_1,cluster_2,cluster_3,...,holiday_Terremoto Manabi+3,holiday_Terremoto Manabi+4,holiday_Terremoto Manabi+5,holiday_Traslado Primer dia del ano,familycluster_0,familycluster_1,familycluster_2,familycluster_3,familycluster_4,familycluster_5
844600,2013-01-01,0.000,0.000000,0.166667,0.000000,0.00,0.792965,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
844576,2013-01-01,186.000,0.000000,0.166667,0.000000,0.00,0.792965,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
844577,2013-01-01,143.000,0.000000,0.166667,0.000000,0.00,0.792965,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
844578,2013-01-01,71.090,0.000000,0.166667,0.000000,0.00,0.792965,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
844579,2013-01-01,46.000,0.000000,0.166667,0.000000,0.00,0.792965,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990714,2016-10-03,214.000,0.000000,0.000000,0.818182,0.75,0.267796,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1990715,2016-10-03,13.000,0.000000,0.000000,0.818182,0.75,0.267796,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1990716,2016-10-03,786.588,0.001350,0.000000,0.818182,0.75,0.267796,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1990717,2016-10-03,610.000,0.012146,0.000000,0.818182,0.75,0.267796,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
c = Prophet()
for i in model_df_all.drop(columns = ['ds', 'y']).columns:
    c.add_regressor(i)
c.fit(model_df_all)

09:56:05 - cmdstanpy - INFO - Chain [1] start processing
10:06:01 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x16f6eed05b0>

In [62]:
forecast = c.predict(model_df_test_all.drop(columns="y"))

In [64]:
evaluate_forecast(model_df_test_all['y'], forecast['yhat'])

{'MSE': 9240871.102214858,
 'RMSE': 3039.8801131319074,
 'RMSLE': 7.29259899659889}