# Vitamix Forecasting Models

# Global
    * Data Load and Preparation

In [1]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np

from datetime import datetime, timedelta

# import the time series module from pycaret
#from pycaret.time_series import *

# Snowflake credentials are read from the environment variables `snowflakeuser`
# and `snowflakepass`. Values that are already set are left untouched, so
# re-running this cell never overwrites them; anything missing is prompted for
# at run time, which keeps the secret out of the saved notebook.
from getpass import getpass

if not os.environ.get("snowflakeuser"):
    os.environ["snowflakeuser"] = input("Snowflake username: ")
if not os.environ.get("snowflakepass"):
    # getpass does not echo what is typed
    os.environ["snowflakepass"] = getpass("Snowflake password: ")

In [3]:
# Query Snowflake

def snowflake_to_pandas(connection_params, query):
    """Run ``query`` against Snowflake and return the result as a DataFrame.

    Parameters
    ----------
    connection_params : dict
        Keyword arguments forwarded unchanged to ``snowflake.connector.connect``.
    query : str
        SQL text handed to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when connecting or querying raised. In
        that case the error is printed instead of being propagated.
    """
    conn = None
    try:
        # Establish a connection to Snowflake
        conn = snowflake.connector.connect(**connection_params)

        # Execute the SQL query and fetch the results into a DataFrame
        return pd.read_sql_query(query, conn)
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Always release the connection, including when the query failed.
        # conn is still None if connect() itself raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                # Keep the "never raises" contract; a failed close does not
                # invalidate a result that was already fetched.
                print(f"Error: {e}")

# Connection settings; user name and password come from the environment
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# Query to run
query = 'SELECT * FROM "ANALYTICS"."FORECASTING"."sales_fcast_combined_v"'

# Fetch the query result as a DataFrame (None signals a failed fetch)
result_df = snowflake_to_pandas(connection_params, query)

# Report a failed fetch, otherwise preview the first few rows
if result_df is None:
    print("Failed to retrieve data from Snowflake.")
else:
    print(result_df.head())

  df = pd.read_sql_query(query, conn)


   DEP_ENT       MONTH   NET_SALES  BUDGET_AMOUNT      FORECAST
0  210_155  2023-07-01  2817972.34      5004554.0  3.885478e+06
1  210_155  2023-08-01         NaN      5812184.0  3.431112e+06
2  210_155  2023-09-01         NaN      5786571.0  4.134850e+06
3  210_155  2023-10-01         NaN      5239358.0  3.793092e+06
4  210_155  2023-11-01         NaN      7036356.0  5.865456e+06


In [30]:
### Data prep ###

# Parse the month field into datetimes (this updates result_df itself)
result_df["MONTH"] = pd.to_datetime(result_df["MONTH"])

# Keep months up to and including June 2023, only the fields of interest, ordered
# by department-entity and then month. Per the original note: data runs through
# July '23 at training time, some later months carry stray data to drop, and
# July itself is left out for now (to be tested with it included).
df_all = (
    result_df
    .loc[result_df["MONTH"] <= pd.Timestamp(2023, 6, 1), ["DEP_ENT", "MONTH", "NET_SALES"]]
    .sort_values(["DEP_ENT", "MONTH"])
)

In [31]:
### Spot checks ###

# Three reports, printed in order with a separator line between them:
#   1. number of distinct department-entity series
#   2. number of rows (months) per department-entity, which shows up series with a shorter history
#   3. null count per column
spot_checks = (
    df_all["DEP_ENT"].nunique(),
    df_all.groupby(["DEP_ENT"]).size(),
    df_all.isna().sum(),
)
for position, report in enumerate(spot_checks):
    if position > 0:
        print("-----")
    print(report)

14
-----
DEP_ENT
160_155    90
170_155    90
200_155    90
200_310    90
210_155    90
210_165    90
210_310    90
220_155    90
220_310    84
240_155    90
250_155    90
250_165    56
250_310    90
260_155    53
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


In [32]:
# create dataset to test for one department
# .copy() makes this an independent frame: columns are added to it further down,
# and doing that on a filtered slice of df_all raises pandas' SettingWithCopyWarning.
df_200_155 = df_all.loc[df_all["DEP_ENT"] == "200_155"].copy()

### 1
# Time Series Forecasting with PyCaret Regression

PyCaret 3.0.4 regression documentation: https://pycaret.readthedocs.io/en/stable/api/regression.html

In [33]:
### Regression data preparation ###

# Add the model features in one step:
#   Month / Year - calendar parts of MONTH
#   Series       - running period number 1..N in row order
# assign() returns a new frame and the name is rebound to it, so nothing is
# written into a slice of df_all (no SettingWithCopyWarning) and re-running the
# cell gives the same result.
df_200_155 = df_200_155.assign(
    Month=df_200_155['MONTH'].dt.month,
    Year=df_200_155['MONTH'].dt.year,
    Series=np.arange(1, len(df_200_155) + 1),
)

# trim to the features plus the target
df_200_155_trim = df_200_155[['Series', 'Year', 'Month', 'NET_SALES']]

# ## Testing and training datasets. These are needed to determine the best model. We can't include the whole dataset or we will overfit
# train_200_155 = df_200_155[(df_200_155.Series <= round(len(df_200_155.index) * .8))] # ~80% for training
# test_200_155 = df_200_155[(df_200_155.Series > round(len(df_200_155.index) * .8))] # ~20% for testing

# # drop unnecessary columns and re-arrange
# train_200_155 = train_200_155[['Series', 'Year', 'Month', 'NET_SALES']] 
# test_200_155 = test_200_155[['Series', 'Year', 'Month', 'NET_SALES']] 

# # review
# print(test_200_155.head())
# print("-----")
# # check shape
# print(train_200_155.shape, test_200_155.shape)

     Series  Year  Month   NET_SALES
976      73  2022      1  1985988.92
977      74  2022      2  2328955.97
978      75  2022      3  2240477.51
979      76  2022      4  2394403.39
980      77  2022      5  2275736.56
-----
(72, 4) (18, 4)


In [34]:
### Regression Functional API

# import the regression module from pycaret
# NOTE(review): star import kept as-is because other cells may rely on the names it exports
from pycaret.regression import *

# initialize setup
# df_200_155_trim is in chronological order, so the split must respect time:
#   data_split_shuffle = False   -> hold-out set is the most recent rows, not a random sample
#   fold_strategy = 'timeseries' -> cross-validation folds always validate on later rows than they train on
# With the defaults (shuffled split, plain k-fold) future months leak into training
# and the scores used to pick the best model are too optimistic.
s = setup(
    data = df_200_155_trim,
    target = 'NET_SALES',
    data_split_shuffle = False,
    fold_strategy = 'timeseries',
    session_id = 123,
)


### Modeling steps ###

# model training and selection
regress_best = compare_models(sort = 'R2')

# evaluate trained model
evaluate_model(regress_best)

# predict on hold-out/test set
regress_pred_holdout = predict_model(regress_best)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,NET_SALES
2,Target type,Regression
3,Original data shape,"(90, 4)"
4,Transformed data shape,"(90, 4)"
5,Transformed train set shape,"(62, 4)"
6,Transformed test set shape,"(28, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


In [38]:
### New data to predict ###

# how many months ahead to predict; 6 is the original horizon (the rest of 2023)
n_future_months = 6

# max date from original dataset
max_timestamp = df_200_155['MONTH'].max()

# create dataframe for future dates: the first day of each of the next n_future_months months
next_dates = [max_timestamp.replace(day=1) + pd.DateOffset(months=i) for i in range(1, n_future_months + 1)]
new_dates_df = pd.DataFrame({'MONTH': next_dates})
new_dates_df["MONTH"] = pd.to_datetime(new_dates_df["MONTH"])

# extract month and year from dates
new_dates_df['Month'] = new_dates_df['MONTH'].dt.month
new_dates_df['Year'] = new_dates_df['MONTH'].dt.year

# continue the sequence of numbers where the historical data stops. The training
# frame numbers its rows 1..len(df_200_155); restarting at 1 here would present
# the future months to the model as the earliest periods.
first_future_period = len(df_200_155) + 1
new_dates_df['Series'] = np.arange(first_future_period, first_future_period + len(new_dates_df))

# select cols
new_dates_df = new_dates_df[['Series', 'Year', 'Month']]

In [39]:
# score the future-dates frame with the selected model
predictions = predict_model(
    regress_best,
    data=new_dates_df,
)