# Vitamix Department-Entity Forecast Models

1. Import packages (use base environment)
_______________________________________________________________

In [1]:
# Packages
import pandas as pd
import numpy as np
import snowflake.connector
import os
from sklearn.impute import SimpleImputer
import pmdarima as pm
from pmdarima import model_selection
from matplotlib import pyplot as plt

2. Load Data
_______________________________________________________________

### To Do: copy and paste in to a new chunk, enter credentials and run to save in environment. Then delete chunk
%env snowflakeuser=<your_snowflake_username> <br>
%env snowflakepass=<your_snowflake_password>

In [3]:
# Query Snowflake

# Snowflake connection parameters
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# Establish a connection to Snowflake
conn = snowflake.connector.connect(**connection_params)

with open('net_sales_query.sql', 'r') as query:
    # connection == the connection to your database, in your case prob_db
    df_in = pd.read_sql_query(query.read(),conn)

# Close the connection
conn.close()

  df_in = pd.read_sql_query(query.read(),conn)


3. Time Series Forecasting
_______________________________________________________________

### Net Sales Forecasting using auto_arima

[auto_arima inputs](https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html)

In [4]:
### 1. Data prep ###

df = df_in

df["MONTH"] = pd.to_datetime(df["MONTH"]) # convert month field to date
#df_all = result_df[result_df['MONTH'] <= pd.Timestamp(2023,6,1)] # we have data through July '23 when we are training these models. 

df = df[df['MONTH'] <= pd.Timestamp(2023,6,1)] # ***** ADJUSTABLE *****

#Some random months will have data that we want to remove (* Want to test without July though)
df = df[["DEP_ENT", "MONTH", "NET_SALES"]] # select fields of interest
df = df.sort_values(['DEP_ENT', 'MONTH']) # reorder dataframe


### 2. Spot checks ###

# check data types
print(df.dtypes)
print("-----")

# check the unique time_series. 14 different department-entity combinations
print(df['DEP_ENT'].nunique())
print("-----")

# check how many months for each dep-ent. 3 dep-ent do not have all months of data
print(df.groupby(['DEP_ENT']).size())
print("-----")

# check for nulls. No nulls
print(df.isnull().sum())

DEP_ENT              object
MONTH        datetime64[ns]
NET_SALES           float64
dtype: object
-----
14
-----
DEP_ENT
160_155    90
170_155    90
200_155    90
200_310    90
210_155    90
210_165    90
210_310    90
220_155    90
220_310    84
240_155    90
250_155    90
250_165    56
250_310    90
260_155    53
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


In [5]:
### TEMP - just one department ###

# test data
df_1 = df[df["DEP_ENT"] == '200_155']
# trim
df_1_trim = df_1[['MONTH', 'NET_SALES']]
# create series
df_1_s = df_1_trim.set_index('MONTH')['NET_SALES']
# set frequency of series
df_1_s = df_1_s.asfreq('MS') 
# convert back to dataframe
df_1_d = df_1_s.to_frame()
# impute
mean_imputer = SimpleImputer(strategy='mean')
df_1_d['NET_SALES'] = mean_imputer.fit_transform(df_1_d['NET_SALES'].values.reshape(-1,1))
#reset index
df_1_d.reset_index(inplace=True)

In [40]:
# split it into separate pieces
train, test = model_selection.train_test_split(df_1_d, train_size=round(len(df_1_d)*.8))

train_arr = train['NET_SALES'].values
test_arr = test['NET_SALES'].values

In [22]:
auto_arima = pm.auto_arima(train_arr, stepwise=False, m=12)

In [25]:
auto_arima

In [26]:
auto_arima.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,72.0
Model:,"SARIMAX(0, 1, 2)x(1, 0, [], 12)",Log Likelihood,-1082.212
Date:,"Tue, 15 Aug 2023",AIC,2174.424
Time:,19:03:08,BIC,2185.738
Sample:,0,HQIC,2178.923
,- 72,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,5.075e+04,4.18e+04,1.214,0.225,-3.12e+04,1.33e+05
ma.L1,-0.4243,0.112,-3.779,0.000,-0.644,-0.204
ma.L2,-0.2751,0.172,-1.604,0.109,-0.611,0.061
ar.S.L12,0.4410,0.114,3.884,0.000,0.218,0.663
sigma2,1.225e+12,0.004,3.24e+14,0.000,1.22e+12,1.22e+12

0,1,2,3
Ljung-Box (L1) (Q):,0.43,Jarque-Bera (JB):,15.91
Prob(Q):,0.51,Prob(JB):,0.0
Heteroskedasticity (H):,1.05,Skew:,0.56
Prob(H) (two-sided):,0.91,Kurtosis:,5.03


In [28]:
auto_test_pred = auto_arima.predict(n_periods=len(test_arr))

In [33]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

mae = mean_absolute_error(test_arr, auto_test_pred)
mape = mean_absolute_percentage_error(test_arr, auto_test_pred)
rmse = np.sqrt(mean_squared_error(test_arr, auto_test_pred))

print(f'mae: {mae}')
print(f'mape: {mape}')
print(f'rmse: {rmse}')

mae: 1247753.0157727539
mape: 0.6380719403421183
rmse: 1387143.6190703383


In [41]:
# append predictions to df
test_df = test

In [42]:
test.plot()

<Axes: >

Error in callback <function _draw_all_if_interactive at 0x000002527DC6C860> (for post_execute):


ValueError: Date ordinal 4317074.581500001 converts to 13789-09-30T13:57:21.600128 (using epoch 1970-01-01T00:00:00), but Matplotlib dates must be between year 0001 and 9999.

ValueError: Date ordinal 4317074.581500001 converts to 13789-09-30T13:57:21.600128 (using epoch 1970-01-01T00:00:00), but Matplotlib dates must be between year 0001 and 9999.

<Figure size 640x480 with 1 Axes>