Run this notebook with the `newpycaret` environment.

In [5]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np
import xlsxwriter
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.impute import SimpleImputer
from tqdm import tqdm

# import the regression module from pycaret   
#import pycaret.regression as pycr

# import the time series module from pycaret
import pycaret.time_series as pycts

2. Load Data
_______________________________________________________________

### To do: copy the two lines below into a new cell, enter your credentials, and run it to save them in the environment. Then delete that cell so the credentials are not stored in the notebook.
%env snowflakeuser=<your_snowflake_username> <br>
%env snowflakepass=<your_snowflake_password>

In [3]:
# Query Snowflake for the net sales history and load it into df_in

# Snowflake connection parameters.
# Credentials are read from environment variables so they never appear in the
# notebook; a missing variable raises KeyError here, before any connection
# is attempted.
# NOTE(review): ACCOUNTADMIN looks broader than a read-only reporting query
# needs - consider a least-privilege role if one is available.
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# Establish a connection to Snowflake
conn = snowflake.connector.connect(**connection_params)

try:
    # Read the SQL text from disk and run it on the open connection
    with open('net_sales_query.sql', 'r') as query:
        df_in = pd.read_sql_query(query.read(), conn)
finally:
    # Always release the connection, even if reading the file or running
    # the query raises
    conn.close()

In [4]:
### 1. Data prep ###

# NOTE: no copy is taken, so df and df_in are the same object at this point
# and the MONTH conversion below also changes df_in.
df = df_in
df["MONTH"] = pd.to_datetime(df["MONTH"])

# ***** ADJUSTABLE ***** #
# Last month of history to keep. Later months are held out so that forecasts
# for 2023 (Jan - Dec) can be produced.
last_history_month = pd.Timestamp(2022, 12, 1)
in_history = df["MONTH"] <= last_history_month

# Original note: some random months will have data that we want to remove
# (* want to test without July though). No month-level filter is applied here.
# Keep only the fields of interest, ordered by dep-ent and then month.
df = (
    df.loc[in_history, ["DEP_ENT", "MONTH", "NET_SALES"]]
    .sort_values(["DEP_ENT", "MONTH"])
)


### 2. Spot checks ###

spot_checks = [
    # data types
    df.dtypes,
    # number of unique time series: 14 different department-entity combinations
    df["DEP_ENT"].nunique(),
    # months available per dep-ent: 3 dep-ent do not have all months of data
    df.groupby(["DEP_ENT"]).size(),
    # nulls per column: no nulls
    df.isnull().sum(),
]

# print every check, separated by a divider line
print(*spot_checks, sep="\n-----\n")

DEP_ENT              object
MONTH        datetime64[ns]
NET_SALES           float64
dtype: object
-----
14
-----
DEP_ENT
160_155    84
170_155    84
200_155    84
200_310    84
210_155    84
210_165    84
210_310    84
220_155    84
220_310    78
240_155    84
250_155    84
250_165    50
250_310    84
260_155    47
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


# Multiple Series with Pycaret
PyCaret 3.0.4 time series documentation: https://pycaret.readthedocs.io/en/stable/api/time_series.html

* could do with regression as well: https://pycaret.readthedocs.io/en/stable/api/regression.html

In [8]:
### Additional data prep ###

# NET_SALES as a series indexed by (DEP_ENT, MONTH)
df_s = df.set_index(['DEP_ENT', 'MONTH'])['NET_SALES']

# same data as a one-column dataframe
df_d = pd.DataFrame(df_s)

# replace missing NET_SALES values with the mean of the column
# (the mean is taken over all rows, i.e. across every dep-ent)
mean_imputer = SimpleImputer(strategy='mean')
net_sales_2d = df_d[['NET_SALES']].to_numpy()  # shape (n_rows, 1), as the imputer expects
df_d['NET_SALES'] = mean_imputer.fit_transform(net_sales_2d)

# move DEP_ENT and MONTH back from the index into regular columns
df_d = df_d.reset_index()

In [None]:
### Time Series Loop ###
#
# For every dep-ent: build a monthly NET_SALES series, let pycaret compare
# models on it, keep the best one, and forecast FORECAST_HORIZON months past
# the end of the available history.
#
# Outputs:
#   all_results_ts     - one row per dep-ent: top row of the compare_models grid + DEP_ENT
#   all_predictions_ts - one row per dep-ent and forecast month: MONTH, PRED, DEP_ENT
#   failed_series      - (dep-ent, error text) for every series that could not be modelled

# ***** ADJUSTABLE ***** #
# Number of months to forecast. Also used as the hold-out size in setup.
FORECAST_HORIZON = 12

# Per-series pieces are collected in lists and concatenated once after the
# loop. DataFrame.append is deprecated since pandas 1.4 and removed in 2.0,
# and growing a dataframe inside a loop copies it on every iteration.
results_parts = []
predictions_parts = []
failed_series = []

# list of each dep-ent
all_ts_ts = df_d['DEP_ENT'].unique()

for dep_ent in tqdm(all_ts_ts):

    # monthly NET_SALES series for this dep-ent, indexed by MONTH
    df_subset_s = (
        df_d.loc[df_d['DEP_ENT'] == dep_ent, ['MONTH', 'NET_SALES']]
        .set_index('MONTH')['NET_SALES']
        .asfreq('MS')  # month-start frequency; months missing from the data become NaN
    )

    try:
        # initialize setup; NaNs in the target are mean-imputed by pycaret
        pycts.setup(
            df_subset_s,
            fh=FORECAST_HORIZON,
            numeric_imputation_target="mean",
            session_id=123,
        )

        # model training and selection
        best_model = pycts.compare_models(
            sort='MAPE', exclude=["naive", "grand_means", "croston"]
        )

        # Capture the compare grid right away, before any later pycaret call
        # can replace the "last displayed grid" that pull() returns.
        # .copy() so that adding a column does not write into a slice.
        best_row = pycts.pull().iloc[0:1].copy()
        best_row['DEP_ENT'] = str(dep_ent)

        # plot trained model
        pycts.plot_model(best_model, plot='forecast')

        ### Predict future periods ###

        # best_model is fitted on the training split only, so predicting with
        # it would forecast the hold-out window rather than the months after
        # the data ends. Refit on the full history before forecasting.
        final_model = pycts.finalize_model(best_model)
        predictions = pycts.predict_model(final_model, fh=FORECAST_HORIZON)

    except Exception as exc:
        # pycaret can fail in several ways (e.g. a series too short for the
        # requested horizon). Record and report the failure instead of
        # aborting the remaining series; nothing is silently dropped.
        failed_series.append((dep_ent, repr(exc)))
        print(f"Skipping {dep_ent}: {exc!r}")
        continue

    # Take the forecast months from the prediction index itself, so each value
    # is labelled with the period it was actually produced for.
    forecast_months = predictions.index
    if isinstance(forecast_months, pd.PeriodIndex):
        forecast_months = forecast_months.to_timestamp()  # first day of each month

    pred_comb = pd.DataFrame({
        'MONTH': pd.to_datetime(forecast_months),
        'PRED': predictions['y_pred'].to_numpy(),
        'DEP_ENT': dep_ent,  # add dep
    })

    results_parts.append(best_row)
    predictions_parts.append(pred_comb)

# build the master dataframes once; fall back to empty frames that still carry
# the columns later cells select, in case every series failed
all_results_ts = (
    pd.concat(results_parts)
    if results_parts
    else pd.DataFrame(columns=['Model', 'DEP_ENT'])
)
all_predictions_ts = (
    pd.concat(predictions_parts, ignore_index=True)
    if predictions_parts
    else pd.DataFrame(columns=['MONTH', 'PRED', 'DEP_ENT'])
)

if failed_series:
    print(f"{len(failed_series)} series could not be forecast:")
    for failed_dep_ent, error_text in failed_series:
        print(f"  {failed_dep_ent}: {error_text}")

* Error above - 260_155 didn't have enough data
Merge data

In [18]:
# attach the selected model's name to each prediction row of the same dep-ent
# (only the model name is carried over for now)
model_names = all_results_ts.loc[:, ['Model', 'DEP_ENT']]
merged = all_predictions_ts.merge(model_names, on='DEP_ENT')

In [19]:
# Actual sales, to sit next to the predictions.
# Start from the initial sales pull, keep only the columns needed, and copy so
# that nothing below modifies df_in.
sales = df_in[["DEP_ENT", "MONTH", "NET_SALES"]].copy()

# Make sure MONTH is datetime in this cell instead of relying on an earlier
# cell having converted df_in in place. This is a no-op if it is already
# datetime, and it keeps the merge key type consistent with the predictions.
sales["MONTH"] = pd.to_datetime(sales["MONTH"])

# only include sales data for the full months we have:
# drop everything from the first day of the current month onwards
first_of_month = datetime.today().replace(day=1).date()
sales = sales[sales['MONTH'] < pd.to_datetime(first_of_month)]

# combine prediction data and original sales data; a left join keeps every
# prediction row even when no actual sales exist yet for that month
merged2 = pd.merge(merged, sales, how='left', on=['DEP_ENT', 'MONTH'])

In [21]:
### Budget ###

# Establish a connection to Snowflake, reusing the connection parameters
# defined for the net sales query
conn = snowflake.connector.connect(**connection_params)

try:
    # Read the SQL text from disk and run it on the open connection
    with open('budget_query.sql', 'r') as query:
        budg = pd.read_sql_query(query.read(), conn)
finally:
    # Always release the connection, even if reading the file or running
    # the query raises
    conn.close()

# convert month to datetime so it can be joined against the prediction months
budg["MONTH"] = pd.to_datetime(budg["MONTH"])
# select cols
budg = budg[["MONTH", "BUDGET_AMOUNT", "DEP_ENT"]]

In [22]:
# add the budget amount to the prediction/sales rows; the left join keeps
# every prediction row even where no budget row matches
merged3 = merged2.merge(budg, how='left', on=['DEP_ENT', 'MONTH'])

In [23]:
# save the combined predictions, sales and budget to disk
# (the dataframe index is written as the first column)
merged3.to_csv(path_or_buf='pycaret_results.csv', index=True)