In [2]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np

# for forecasting
from pycaret.time_series import *

In [8]:
from pycaret.datasets import get_data

In [9]:
# Load the 'insurance' sample dataset through pycaret.datasets.get_data (preview shown below).
# NOTE(review): `data` is not referenced by any later cell visible here - looks like a
# leftover smoke test of the PyCaret install; confirm before removing.
data = get_data('insurance')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Provide the Snowflake credentials for this kernel session without typing them into
# the notebook. Values already present in the environment are left untouched; otherwise
# you are prompted, and the password prompt (getpass) does not echo what you type.
# Avoid `%env name=value` for secrets: the value ends up in the cell source and IPython
# echoes the assignment in the cell output, so it is easy to save or commit by accident.
import getpass

if 'snowflakeuser' not in os.environ:
    os.environ['snowflakeuser'] = input('Snowflake username: ')
if 'snowflakepass' not in os.environ:
    os.environ['snowflakepass'] = getpass.getpass('Snowflake password: ')

In [4]:
# Query Snowflake

def snowflake_to_pandas(connection_params, query):
    """Run a SQL query against Snowflake and return the result as a DataFrame.

    Parameters
    ----------
    connection_params : dict
        Keyword arguments forwarded unchanged to ``snowflake.connector.connect``.
    query : str
        SQL text passed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` if connecting, querying or closing the
        connection raised. The error is printed rather than re-raised, so
        callers must check for ``None``.
    """
    try:
        # Establish a connection to Snowflake
        conn = snowflake.connector.connect(**connection_params)
        try:
            # Execute the SQL query and fetch the results into a DataFrame
            df = pd.read_sql_query(query, conn)
        finally:
            # Always release the connection, including when the query fails;
            # otherwise a failed query leaves the session open.
            conn.close()
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None

# Settings handed to snowflake.connector.connect; the user name and password are
# read from the snowflakeuser / snowflakepass environment variables (a KeyError
# here means they have not been set for this kernel).
connection_params = dict(
    user=os.environ['snowflakeuser'],
    password=os.environ['snowflakepass'],
    account="zib52348.us-east-1",
    role="ACCOUNTADMIN",
    warehouse="REPORTING",
    database="ANALYTICS",
    schema="FORECASTING",
)

# Select every column and row of the combined sales / forecast view
query = 'SELECT * FROM "ANALYTICS"."FORECASTING"."sales_fcast_combined_v"'

# Fetch the view into a DataFrame; snowflake_to_pandas hands back None on any error
result_df = snowflake_to_pandas(connection_params, query)

if result_df is None:
    print("Failed to retrieve data from Snowflake.")
else:
    print(result_df.head())  # preview the first few rows

   DEP_ENT       MONTH   NET_SALES  BUDGET_AMOUNT      FORECAST
0  210_155  2023-07-01  2817972.34      5004554.0  3.885478e+06
1  210_155  2023-08-01         NaN      5812184.0  3.431112e+06
2  210_155  2023-09-01         NaN      5786571.0  4.134850e+06
3  210_155  2023-10-01         NaN      5239358.0  3.793092e+06
4  210_155  2023-11-01         NaN      7036356.0  5.865456e+06


In [5]:
# data prep
# Parse MONTH into datetimes (written back onto result_df itself).
result_df["MONTH"] = pd.to_datetime(result_df["MONTH"])

# Keep history through June 2023 only. The source has rows beyond July '23, but we
# want to compare against models that only had data through June, and there is no
# way to check model accuracy beyond the data we have.
last_train_month = pd.Timestamp(2023, 6, 1)
in_window = result_df["MONTH"] <= last_train_month

# Keep only the series key, the date and the target, ordered by month.
# MONTH is not set as the index here; that is done per series in the modelling loop.
df_all = (
    result_df.loc[in_window, ["DEP_ENT", "MONTH", "NET_SALES"]]
    .sort_values("MONTH")
)

In [6]:
# Count the rows (months of history) available for each DEP_ENT series.
# The counts are not uniform (see the output below), so anything sized from the
# series length has to be computed per series rather than once for the whole frame.
df_all.groupby(['DEP_ENT']).size()

DEP_ENT
160_155    90
170_155    90
200_155    90
200_310    90
210_155    90
210_165    90
210_310    90
220_155    90
220_310    84
240_155    90
250_155    90
250_165    56
250_310    90
260_155    53
dtype: int64

In [7]:
# Number of distinct DEP_ENT values, i.e. how many separate time series
# (department-entity combinations) the modelling loop iterates over - 14 here.
df_all['DEP_ENT'].nunique()

14

In [10]:
from tqdm import tqdm

In [None]:
# Fit, select and finalize one forecasting model per DEP_ENT series, then forecast
# the months in pred_dates. Outputs of this cell:
#   all_results     - list of one-row frames: the top row of each compare_models grid
#   final_model     - dict mapping DEP_ENT -> finalized model
#   all_predictions - long frame with columns MONTH, y_pred, DEP_ENT
all_ts = df_all['DEP_ENT'].unique()

all_results = []
final_model = {}

# prediction dates (last 6 months of 2023)
pred_dates = pd.date_range(start='2023-07-01', end='2023-12-01', freq='MS') # ----------OPPORTUNITY TO AUTOMATE
# one-column frame of forecast months, reused as the skeleton for every series
score_df = pd.DataFrame({'MONTH': pred_dates})

# Per-series prediction frames are collected here and concatenated once after the
# loop; growing a DataFrame with pd.concat on every iteration is quadratic and
# starts from an empty object-dtype frame.
prediction_frames = []

for i in tqdm(all_ts):

    # Isolate this series, order it by month and index it on a month-start frequency
    # (asfreq inserts rows for any missing months, which setup then imputes).
    df_subset = (
        df_all[df_all['DEP_ENT'] == i]
        .sort_values('MONTH')
        .set_index('MONTH')
        .asfreq('MS')
    )

    # Hold out the last ~20% of the observations; this is also the CV forecast horizon.
    n_obs = len(df_subset.index)
    test = n_obs - round(n_obs * .8)

    # initialize setup from pycaret.time_series
    s = setup(df_subset, target = 'NET_SALES', ignore_features = ['DEP_ENT'],
              numeric_imputation_target = "mean", fold = 3, fh = test, session_id = 123) # session id is for reproducibility

    # compare all models and select the best one based on MAPE
    best_model = compare_models(sort = 'MAPE', verbose=False)

    # Capture the top row of the comparison grid and tag it with the series id.
    # assign() returns a new frame, so nothing is written onto a slice of the grid.
    p = pull().iloc[0:1].assign(DEP_ENT=str(i))
    all_results.append(p)

    # finalize model i.e. fit on entire data including test set
    f = finalize_model(best_model)

    # attach final model to a dictionary
    final_model[i] = f

    # Forecast exactly as many steps as there are prediction dates so the two
    # cannot drift apart.
    pred = predict_model(f, fh=len(pred_dates))

    # Pair the forecasts with pred_dates by position.
    # NOTE(review): this assumes every series ends in the month before pred_dates[0];
    # a series that stops earlier would get mislabelled months - confirm, or derive
    # MONTH from pred.index instead. ----------OPPORTUNITY TO AUTOMATE
    pred_comb = score_df.assign(y_pred=pred['y_pred'].to_numpy(), DEP_ENT=i)
    prediction_frames.append(pred_comb)

    # save transformation pipeline and model as pickle file
    #save_model(f, model_name = 'trained_models/' +  str(i), verbose=False)

# Single concatenation; fall back to an empty, correctly-shaped frame if there were no series.
all_predictions = (
    pd.concat(prediction_frames)
    if prediction_frames
    else pd.DataFrame(columns=['MONTH', 'y_pred', 'DEP_ENT'])
)

In [43]:
# Stack the one-row leaderboard slices collected in all_results into a single frame:
# one row per DEP_ENT holding the top-ranked model and its metric columns.
concat_results = pd.concat(all_results,axis=0)
concat_results.head()

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec),DEP_ENT
br_cds_dt,Bayesian Ridge w/ Cond. Deseasonalize & Detren...,1.4671,1.3131,2315709.4559,2981673.1044,0.4398,0.3786,-0.0794,0.1267,250_155
grand_means,Grand Means Forecaster,0.7977,0.8547,1053265.9363,1403731.8462,0.7008,0.4233,-0.5374,0.02,160_155
snaive,Seasonal Naive Forecaster,1.1985,1.2402,225701.7823,277107.6545,22.33,0.605,0.2107,0.05,250_310
snaive,Seasonal Naive Forecaster,0.8778,0.8897,369532.1249,477226.5829,1.1889,0.681,-0.2643,0.0467,210_165
grand_means,Grand Means Forecaster,1.2368,1.309,920082.2451,1143575.2413,0.2201,0.2007,-0.116,0.03,240_155


In [127]:
# Write the forecasts and the per-series best-model summary to CSV files, using paths
# relative to the current working directory. to_csv is called without index=False, so
# each frame's index is written as the first column.
all_predictions.to_csv('2023_predictions.csv')
concat_results.to_csv('2023_models.csv')