# Vitamix Forecasting Models

# Global
    * Data Load and Preparation

In [1]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np

from datetime import datetime, timedelta

# copy and paste in to a new chunk, enter credentials and run to save in environment. Then delete chunk
%env snowflakeuser=<your_snowflake_username>
%env snowflakepass=<your_snowflake_password>

In [3]:
# Query Snowflake

def snowflake_to_pandas(connection_params, query):
    try:
        # Establish a connection to Snowflake
        conn = snowflake.connector.connect(**connection_params)

        # Execute the SQL query and fetch the results into a DataFrame
        df = pd.read_sql_query(query, conn)

        # Close the connection
        conn.close()

        return df
    except Exception as e:
        print(f"Error: {e}")
        return None

# Snowflake connection parameters
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# SQL command 
query = 'SELECT * FROM "ANALYTICS"."FORECASTING"."sales_fcast_combined_v"'

# Call the function to retrieve the data into a Pandas DataFrame
result_df = snowflake_to_pandas(connection_params, query)

if result_df is not None:
    print(result_df.head())  # Display the first few rows of the DataFrame
else:
    print("Failed to retrieve data from Snowflake.")

  df = pd.read_sql_query(query, conn)


   DEP_ENT       MONTH   NET_SALES  BUDGET_AMOUNT      FORECAST
0  210_155  2023-07-01  2817972.34      5004554.0  3.885478e+06
1  210_155  2023-08-01         NaN      5812184.0  3.431112e+06
2  210_155  2023-09-01         NaN      5786571.0  4.134850e+06
3  210_155  2023-10-01         NaN      5239358.0  3.793092e+06
4  210_155  2023-11-01         NaN      7036356.0  5.865456e+06


In [30]:
### Data prep ###

result_df["MONTH"] = pd.to_datetime(result_df["MONTH"]) # convert month field to date
df_all = result_df[result_df['MONTH'] <= pd.Timestamp(2023,6,1)] # we have data through July '23 when we are training these models. 
#Some random months will have data that we want to remove (* Want to test without July though)
df_all = df_all[["DEP_ENT", "MONTH", "NET_SALES"]] # select fields of interest
df_all = df_all.sort_values(['DEP_ENT', 'MONTH']) # reorder dataframe

In [31]:
### Spot checks ###

# check the unique time_series. 14 different department-entity combinations
print(df_all['DEP_ENT'].nunique())
print("-----")

# check how many months for each dep-ent. 3 dep-ent do not have a full 91 months of data
print(df_all.groupby(['DEP_ENT']).size())
print("-----")

# check for nulls. No nulls
print(df_all.isnull().sum())

14
-----
DEP_ENT
160_155    90
170_155    90
200_155    90
200_310    90
210_155    90
210_165    90
210_310    90
220_155    90
220_310    84
240_155    90
250_155    90
250_165    56
250_310    90
260_155    53
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


In [111]:
# create dataset to test for one department
df_200_155 = df_all[(df_all.DEP_ENT == "200_155")]

### 1
# Time Series Forecasting with PyCaret Regression

Pycaret 3.04 regression documentation: https://pycaret.readthedocs.io/en/stable/api/regression.html

In [112]:
### Regression data preparation ###

# extract month and year from dates
df_200_155['Month'] = [i.month for i in df_200_155['MONTH']]
df_200_155['Year'] = [i.year for i in df_200_155['MONTH']]

# create a sequence of numbers
df_200_155['Series'] = np.arange(1,len(df_200_155)+1)

# trim
df_200_155_trim = df_200_155[['Series', 'Year', 'Month', 'NET_SALES']] 

# ## Testing and Training datsets. This is need to determine best model. We can't include the whole dataset or we will overfit
# train_200_155 = df_200_155[(df_200_155.Series <= round(len(df_200_155.index) * .8))] # ~80% for training
# test_200_155 = df_200_155[(df_200_155.Series > round(len(df_200_155.index) * .8))] # ~20% for testing

# # drop unnecessary columns and re-arrange
# train_200_155 = train_200_155[['Series', 'Year', 'Month', 'NET_SALES']] 
# test_200_155 = test_200_155[['Series', 'Year', 'Month', 'NET_SALES']] 

# # review
# print(test_200_155.head())
# print("-----")
# # check shape
# print(train_200_155.shape, test_200_155.shape)

In [113]:
### Regression Functional API

# import the regression module from pycaret   
#from pycaret.regression import *

import pycaret.regression as pycr

# initialize setup
s = pycr.setup(data = df_200_155_trim, target = 'NET_SALES', session_id = 123)


### Modeling steps ###

# model training and selection
regress_best = pycr.compare_models(sort = 'MAPE')

# evaluate trained model
pycr.evaluate_model(regress_best)

# predict on hold-out/test set
regress_pred_holdout = pycr.predict_model(regress_best)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,NET_SALES
2,Target type,Regression
3,Original data shape,"(53, 4)"
4,Transformed data shape,"(53, 4)"
5,Transformed train set shape,"(37, 4)"
6,Transformed test set shape,"(16, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,1762.0577,7517732.6111,2341.3916,-0.497,0.4229,0.3409,0.027
et,Extra Trees Regressor,1656.7375,4981953.7917,2018.3463,-1.1121,0.3764,0.3573,0.06
gbr,Gradient Boosting Regressor,1660.2737,5068835.4297,2067.3175,-1.8721,0.3965,0.3735,0.039
rf,Random Forest Regressor,1675.214,5083260.1345,2042.9198,-1.7228,0.3897,0.3846,0.061
knn,K Neighbors Regressor,1901.1064,8363720.1625,2510.9009,-0.964,0.4731,0.3949,0.029
ada,AdaBoost Regressor,1726.6644,6357299.197,2145.9246,-3.29,0.4015,0.3972,0.04
en,Elastic Net,1882.9834,6970746.304,2405.85,-1.701,0.4506,0.412,0.028
ridge,Ridge Regression,1902.0814,6994124.8977,2418.2547,-1.8263,0.4557,0.4175,0.027
lasso,Lasso Regression,1903.8192,6996262.9381,2419.1281,-1.8345,0.4561,0.418,0.035
llar,Lasso Least Angle Regression,1903.8192,6996262.7389,2419.1281,-1.8345,0.4561,0.418,0.027


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,1505.7891,3340498.8363,1827.7032,0.1405,0.4763,0.502


In [114]:
### New data to predict ###

# max date from original dataset
max_timestamp = df_200_155['MONTH'].max()

# create dataframe for future dates, in this case the rest of 2023
next_dates = [max_timestamp.replace(day=1) + pd.DateOffset(months=i) for i in range(1, 7)] # Need to change range based on how many periods ahead you want to predict
new_dates_df = pd.DataFrame({'MONTH': next_dates})
new_dates_df["MONTH"] = pd.to_datetime(new_dates_df["MONTH"]) 

# extract month and year from dates
new_dates_df['Month'] = [i.month for i in new_dates_df['MONTH']]
new_dates_df['Year'] = [i.year for i in new_dates_df['MONTH']]

# create a sequence of numbers
new_dates_df['Series'] = np.arange(1,len(new_dates_df)+1)

# select cols
new_dates_df = new_dates_df[['Series', 'Year', 'Month']] 

In [115]:
# predict on new data
regress_predictions = pycr.predict_model(regress_best, data = new_dates_df)

### 2
# Time Series Forecasting with PyCaret Time Series

Pycaret 3.04 regression documentation: https://pycaret.readthedocs.io/en/stable/api/time_series.html

In [103]:
### Time Series Forecasting Functional API

# import the time seris module from pycaret
#from pycaret.time_series import *

import pycaret.time_series as pycts


In [116]:
# create time series dataset
df_200_155_trim2 = df_200_155[["MONTH", "NET_SALES"]]
    
# create series
df_200_155_series = df_200_155_trim2.set_index('MONTH')['NET_SALES']

ts_train = round(len(df_200_155_trim2.index) * .8) # 80% train
ts_test = len(df_200_155_trim2.index) - round(len(df_200_155_trim2.index) * .8) # 20% test

In [117]:
# init setup on exp
s = pycts.setup(df_200_155_series, fh = ts_test, session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,NET_SALES
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(53, 1)"
5,Transformed data shape,"(53, 1)"
6,Transformed train set shape,"(42, 1)"
7,Transformed test set shape,"(11, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [119]:
best = pycts.compare_models(exclude = ["naive", "grand_means"])

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
auto_arima,Auto ARIMA,1.3929,1.5466,2028.6996,2750.1185,0.4794,0.4173,-0.3022,0.0667
arima,ARIMA,1.393,1.5452,2029.576,2748.5019,0.4812,0.4176,-0.3076,0.0267
exp_smooth,Exponential Smoothing,1.4467,1.5711,2099.0318,2785.0159,0.4583,0.4389,-0.3085,0.0167
theta,Theta Forecaster,1.448,1.5847,2112.0139,2818.624,0.4825,0.4358,-0.3289,0.0167
croston,Croston,1.4544,1.5587,2131.1461,2784.165,0.4828,0.441,-0.2806,0.01
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,1.5908,1.6825,2409.8932,3104.5106,0.6196,0.482,-0.7383,0.0567
knn_cds_dt,K Neighbors w/ Cond. Deseasonalize & Detrending,1.6036,1.6307,2449.5356,3028.6899,0.6169,0.489,-0.6282,0.1367
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,1.6348,1.7064,2490.149,3174.2737,0.6397,0.4946,-0.8044,0.0567
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,1.6348,1.7064,2490.1489,3174.2737,0.6397,0.4946,-0.8044,0.0567
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,1.6348,1.7064,2490.1489,3174.2737,0.6397,0.4946,-0.8044,0.07


In [65]:
df_200_155_trim2 = df_200_155_trim2.sort_values(['MONTH']) # reorder
    
#df_200_155_trim2.set_index('MONTH') # create index
    


pycaret.time_series.forecasting.oop.TSForecastingExperiment

In [None]:


### TS prep

# df_200_155_trim2 = df_200_155_trim2.sort_values(['MONTH']) # reorder
    
# df_200_155_trim2.set_index('MONTH', drop = True, inplace=True) # create index
    
# df_200_155_trim2 = df_200_155_trim2.asfreq('MS')
    
# ts_train = round(len(df_200_155_trim2.index) * .8) # 80% train
# ts_test = len(df_200_155_trim2.index) - round(len(df_200_155_trim2.index) * .8) # 20% test


# init setup
s = setup(df_200_155_trim2, target = 'NET_SALES', fh = ts_test, session_id = 123)

# model training and selection
ts_best = compare_models(sort = 'MAPE', exclude = ["naive"])

# plot trained model
#plt.plot(ts_best)

#ts_pred_holdout = predict_model(ts_best)

# predict in unseen future
#ts_predictions = predict_model(ts_best, fh=6)

In [75]:
check_stats()

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Transformed,Length,,90.0
1,Summary,Statistics,Transformed,# Missing Values,,0.0
2,Summary,Statistics,Transformed,Mean,,2580538.262138
3,Summary,Statistics,Transformed,Median,,2224486.9
4,Summary,Statistics,Transformed,Standard Deviation,,1073172.834318
5,Summary,Statistics,Transformed,Variance,,1151699932317.54834
6,Summary,Statistics,Transformed,Kurtosis,,1.529008
7,Summary,Statistics,Transformed,Skewness,,1.406315
8,Summary,Statistics,Transformed,# Distinct Values,,90.0
9,White Noise,Ljung-Box,Transformed,Test Statictic,"{'alpha': 0.05, 'K': 24}",75.73407
