# Vitamix Forecasting Models

# Global
    * Data Load and Preparation

In [25]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np
import xlsxwriter
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from tqdm import tqdm

# import the regression module from pycaret   
import pycaret.regression as pycr

# import the time series module from pycaret

import pycaret.time_series as pycts

# Copy the two %env lines below into a new cell, enter your credentials, and run it once to save them in the environment. Then delete that cell.
%env snowflakeuser=<your_snowflake_username>
%env snowflakepass=<your_snowflake_password>

In [3]:
# Query Snowflake

def snowflake_to_pandas(connection_params, query):
    """Run ``query`` against Snowflake and return the result as a DataFrame.

    Parameters
    ----------
    connection_params : dict
        Keyword arguments forwarded unchanged to ``snowflake.connector.connect``.
    query : str
        SQL text handed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when connecting or querying raised.
        The error is printed instead of re-raised so the calling cell can
        report the failure itself.
    """
    conn = None
    try:
        # Establish a connection to Snowflake
        conn = snowflake.connector.connect(**connection_params)

        # Execute the SQL query and fetch the results into a DataFrame
        return pd.read_sql_query(query, conn)
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Close in `finally` so a failing query does not leave the connection
        # open; `conn` is still None when connect() itself raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                # A close failure must not mask the query outcome.
                print(f"Error: {e}")

# Snowflake connection parameters; user and password are read from the
# environment so no credentials are stored in the notebook itself
connection_params = dict(
    user=os.environ['snowflakeuser'],
    password=os.environ['snowflakepass'],
    account="zib52348.us-east-1",
    role="ACCOUNTADMIN",
    warehouse="REPORTING",
    database="ANALYTICS",
    schema="FORECASTING",
)

# SQL command: select every column of the combined sales forecast view
query = 'SELECT * FROM "ANALYTICS"."FORECASTING"."sales_fcast_combined_v"'

# Retrieve the data into a Pandas DataFrame (None signals a failed query)
result_df = snowflake_to_pandas(connection_params, query)

if result_df is None:
    print("Failed to retrieve data from Snowflake.")
else:
    print(result_df.head())  # Display the first few rows of the DataFrame

   DEP_ENT       MONTH     NET_SALES  BUDGET_AMOUNT  FORECAST
0  210_310  2020-09-01  7.449778e+05      1330384.0       NaN
1  210_310  2020-10-01  5.257454e+05       936992.0       NaN
2  210_310  2020-11-01  1.998459e+06      1681287.0       NaN
3  210_310  2020-12-01  4.443028e+05       734573.0       NaN
4  210_310  2021-01-01  3.309476e+05            NaN       NaN


###############
#### NOTES ####
###############

*Change the date below to forecast all of 2023 (8/15/23)

In [36]:
### Data prep ###

# Last month kept for modelling. CHANGE FOR 2023
# (was pd.Timestamp(2023,6,1): we have data through July '23 when we are training these models)
cutoff_month = pd.Timestamp(2022, 12, 1)

# convert month field to date
result_df["MONTH"] = pd.to_datetime(result_df["MONTH"])

# Some random months will have data that we want to remove (* Want to test without July though)
# Keep rows up to the cutoff, select the fields of interest, then order by dep-ent and month
df_all = (
    result_df
    .loc[result_df["MONTH"] <= cutoff_month, ["DEP_ENT", "MONTH", "NET_SALES"]]
    .sort_values(["DEP_ENT", "MONTH"])
)

In [37]:
### Spot checks ###

separator = "-----"

# number of unique time series (14 department-entity combinations in this run)
series_count = df_all['DEP_ENT'].nunique()
print(series_count)
print(separator)

# months of data per dep-ent (3 dep-ent do not have all months of data)
months_per_series = df_all.groupby('DEP_ENT').size()
print(months_per_series)
print(separator)

# null count per column (no nulls in this run)
null_counts = df_all.isna().sum()
print(null_counts)

14
-----
DEP_ENT
160_155    84
170_155    84
200_155    84
200_310    84
210_155    84
210_165    84
210_310    84
220_155    84
220_310    78
240_155    84
250_155    84
250_165    50
250_310    84
260_155    47
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


# Finished with global data work above
__________________________________________________________________________________________________________________________________________________________________________________________________

### 1
# Time Series Forecasting with PyCaret Regression

Pycaret 3.04 regression documentation: https://pycaret.readthedocs.io/en/stable/api/regression.html

In [6]:
### Data Prep for Regression ###

# Work on an independent copy. A plain `df_all_rg = df_all` only creates a second
# name for the same object, so the feature columns added below would also be
# written into the shared df_all used by the other sections.
df_all_rg = df_all.copy()

# extract month and year from dates
df_all_rg['Month'] = df_all_rg['MONTH'].dt.month
df_all_rg['Year'] = df_all_rg['MONTH'].dt.year

# create a sequence of numbers; it runs across the whole frame, and because
# df_all is sorted by DEP_ENT then MONTH each dep-ent gets a contiguous,
# time-ordered block of values
df_all_rg['Series'] = np.arange(1, len(df_all_rg) + 1)

# Notes on below TS loop
* Manual changes:
  * periods to forecast ahead using the next_dates variable 

In [None]:
### Regression Loop ###

# Number of future months to predict for every dep-ent.
# Need to change based on how many periods ahead you want to predict.
n_future_months_rg = 6

# Per-dep-ent frames are collected in lists and concatenated once after the
# loop; DataFrame.append is deprecated (removed in pandas 2.0) and re-copies
# the accumulated frame on every iteration.
results_rg_parts = []
predictions_rg_parts = []

# list of each dep-ent
all_ts_rg = df_all_rg['DEP_ENT'].unique()

for dep_ent in tqdm(all_ts_rg):

    # temp df for a dep-ent
    df_subset = df_all_rg[df_all_rg['DEP_ENT'] == dep_ent]

    # trim to the model features and the target
    df_subset_trim = df_subset[['Series', 'Year', 'Month', 'NET_SALES']]

    # initialize setup
    s = pycr.setup(data=df_subset_trim, target='NET_SALES', session_id=123)

    # model training and selection
    # NOTE(review): best_model is used for prediction exactly as compare_models
    # returns it; if it should be refit on every available row first, see
    # pycr.finalize_model - confirm the intended behaviour.
    best_model = pycr.compare_models()

    # capture the compare result grid and keep the top row (the best model);
    # .copy() so adding DEP_ENT does not write into a slice of the pulled grid
    p = pycr.pull().iloc[0:1].copy()
    p['DEP_ENT'] = str(dep_ent)
    results_rg_parts.append(p)

    ### New data to predict ###

    # max date from original dataset
    max_timestamp = df_subset['MONTH'].max()

    # first-of-month dates for the months following the last observed month
    next_dates = [
        max_timestamp.replace(day=1) + pd.DateOffset(months=step)
        for step in range(1, n_future_months_rg + 1)
    ]
    new_dates_df = pd.DataFrame({'MONTH': pd.to_datetime(next_dates)})

    # extract month and year from dates
    new_dates_df['Month'] = new_dates_df['MONTH'].dt.month
    new_dates_df['Year'] = new_dates_df['MONTH'].dt.year

    # Continue the sequence after the last value this dep-ent was trained on.
    # Restarting at 1 would hand the model trend values from the start of the
    # history instead of values that follow the training range.
    last_series = df_subset['Series'].max()
    new_dates_df['Series'] = last_series + np.arange(1, len(new_dates_df) + 1)

    # select cols (same feature columns, same order, as the training frame)
    new_dates_df = new_dates_df[['Series', 'Year', 'Month']]

    # predict on new data
    predictions = pycr.predict_model(best_model, data=new_dates_df)

    ### Combine predictions for this dep-ent ###

    pred_comb = pd.DataFrame({
        'MONTH': next_dates,
        'PRED': predictions['prediction_label'].to_numpy(),
        'DEP_ENT': dep_ent,
    })
    predictions_rg_parts.append(pred_comb)

# build the master dataframes (empty frames when there was nothing to loop over)
all_results_rg = pd.concat(results_rg_parts) if results_rg_parts else pd.DataFrame()
all_predictions_rg = pd.concat(predictions_rg_parts) if predictions_rg_parts else pd.DataFrame()

In [12]:
### Write Regression to Excel ###

# Create a Pandas Excel writer using XlsxWriter. The context manager saves and
# closes the workbook on exit, even if writing a sheet raises;
# ExcelWriter.save() is deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('regression.xlsx', engine='xlsxwriter') as rg_writer:
    # sheet 1
    all_results_rg.to_excel(rg_writer, sheet_name='results', index=False)
    # sheet 2
    all_predictions_rg.to_excel(rg_writer, sheet_name='predictions', index=False)

__________________________________________________________________________________________________________________________________________________________________________________________________

### 2
# Time Series Forecasting with PyCaret Time Series

Pycaret 3.04 time series documentation: https://pycaret.readthedocs.io/en/stable/api/time_series.html

In [38]:
### Data Prep for Time Series Analysis ###

# create time series dataset: dep-ent key, month and target only
# (any other columns present on df_all are left out)
df_all_ts = df_all.loc[:, ["DEP_ENT", "MONTH", "NET_SALES"]]

# Notes on below TS loop
* Manual changes:
  * periods to include in the dates dataset created using next_dates
  * forecast horizon (fh) in predict_model function

###############
#### NOTES ####
###############

* When changing the dataset to forecast all of 2023, we also need to change the dates and fh for the two notes above (8/15/23)

In [None]:
### Time Series Loop ###

# Number of future months to forecast. This single value drives both the
# forecast horizon passed to predict_model and the month labels attached to the
# predictions, so the two cannot drift apart. CHANGE FOR 2023 (was 6)
n_future_months_ts = 12

# Per-dep-ent frames are collected in lists and concatenated once after the
# loop; DataFrame.append is deprecated (removed in pandas 2.0) and re-copies
# the accumulated frame on every iteration.
results_ts_parts = []
predictions_ts_parts = []

# list of each dep-ent
all_ts_ts = df_all_ts['DEP_ENT'].unique()

for dep_ent in tqdm(all_ts_ts):

    # temp df for a dep-ent
    df_subset = df_all_ts[df_all_ts['DEP_ENT'] == dep_ent]

    # trim
    df_subset_trim = df_subset[['MONTH', 'NET_SALES']]

    # create series indexed by month
    df_subset_s = df_subset_trim.set_index('MONTH')['NET_SALES']

    # set frequency of series to month start
    df_subset_s = df_subset_s.asfreq('MS')

    # hold out 20% of the observed months as the test window (80% train)
    n_obs = len(df_subset_trim.index)
    ts_test = n_obs - round(n_obs * .8)

    # initialize setup
    s = pycts.setup(df_subset_s, numeric_imputation_target="mean", fh=ts_test, session_id=123)

    # model training and selection
    best_model = pycts.compare_models(sort='MAPE', exclude=["naive", "grand_means", "croston"])
    #best_model = pycts.compare_models(sort = 'MAPE', include = ["arima", "auto_arima", "ets", ])

    # capture the compare result grid right after compare_models, before any
    # other PyCaret call, and keep the top row (the best model);
    # .copy() so adding DEP_ENT does not write into a slice of the pulled grid
    p = pycts.pull().iloc[0:1].copy()
    p['DEP_ENT'] = str(dep_ent)
    results_ts_parts.append(p)

    # plot trained model against the hold-out window
    pycts.plot_model(best_model, plot='forecast')

    ### Predict future periods ###

    # Refit the selected model on the full series (train + hold-out) before
    # forecasting. Without this the forecast starts right after the training
    # split, i.e. it covers the hold-out months, while the rows below are
    # labelled with the months after the last observed month.
    final_model = pycts.finalize_model(best_model)

    # max date from original dataset
    max_timestamp = df_subset_trim['MONTH'].max()

    # Month labels for the forecast rows. This is not for predicting but to
    # join back up with predictions.
    next_dates = [
        max_timestamp.replace(day=1) + pd.DateOffset(months=step)
        for step in range(1, n_future_months_ts + 1)
    ]

    # predict in unseen future
    predictions = pycts.predict_model(final_model, fh=n_future_months_ts)

    pred_comb = pd.DataFrame({
        'MONTH': next_dates,
        'PRED': predictions['y_pred'].to_numpy(),
        'DEP_ENT': dep_ent,
    })
    predictions_ts_parts.append(pred_comb)

# build the master dataframes (empty frames when there was nothing to loop over)
all_results_ts = pd.concat(results_ts_parts) if results_ts_parts else pd.DataFrame()
all_predictions_ts = pd.concat(predictions_ts_parts) if predictions_ts_parts else pd.DataFrame()

In [43]:
### Write Time Series to Excel ###

# Create a Pandas Excel writer using XlsxWriter. The context manager saves and
# closes the workbook on exit, even if writing a sheet raises;
# ExcelWriter.save() is deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('time_series.xlsx', engine='xlsxwriter') as ts_writer:
    # sheet 1
    all_results_ts.to_excel(ts_writer, sheet_name='results', index=False)
    # sheet 2
    all_predictions_ts.to_excel(ts_writer, sheet_name='predictions', index=False)