Use the `newpycaret` environment to run this notebook.

In [10]:
# packages
import snowflake.connector
import pandas as pd
import os
import numpy as np
from autots import AutoTS
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime

2. Load Data
_______________________________________________________________

### To Do: Copy and paste the two lines below into a new cell, enter your credentials, and run the cell to save them in the environment. Then delete the cell so the credentials are not stored in the notebook.
%env snowflakeuser=<your_snowflake_username> <br>
%env snowflakepass=<your_snowflake_password>

In [4]:
# Query Snowflake: pull the net sales data into `df_in`.

# Snowflake connection parameters.
# User and password are read from environment variables so they never live
# in the notebook; a missing variable raises KeyError here.
# NOTE(review): ACCOUNTADMIN is a very broad role for a read-only query —
# consider switching to a least-privilege role.
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# Read the SQL text first, so a missing or unreadable file fails before a
# Snowflake session is opened.
with open('net_sales_query.sql', 'r') as query:
    net_sales_sql = query.read()

# Establish a connection to Snowflake
conn = snowflake.connector.connect(**connection_params)
try:
    df_in = pd.read_sql_query(net_sales_sql, conn)
finally:
    # Close the connection even when the query raises, so the session is
    # never left open.
    conn.close()

  df_in = pd.read_sql_query(query.read(),conn)


In [5]:
### 1. Data prep ###

# Parse MONTH into datetimes directly on the raw pull. `df_in` itself is
# updated here, so any later cell reading `df_in` also sees datetime months.
df_in["MONTH"] = pd.to_datetime(df_in["MONTH"])

# ***** ADJUSTABLE ***** #
# Training cutoff: forecasts are wanted for 2023 (Jan - Dec), so only months
# up to and including December 2022 are kept.
train_mask = df_in["MONTH"] <= pd.Timestamp(2022, 12, 1)

# NOTE(review): an earlier note mentioned removing data for some stray months
# (and testing without July); no such filter is applied here.
# Keep the fields of interest and order rows by series, then by month.
df = (
    df_in.loc[train_mask, ["DEP_ENT", "MONTH", "NET_SALES"]]
    .sort_values(["DEP_ENT", "MONTH"])
)


### 2. Spot checks ###

spot_checks = (
    # data types of the retained columns
    df.dtypes,
    # number of distinct time series (14 dep-ent combinations in the recorded run)
    df["DEP_ENT"].nunique(),
    # months available per dep-ent (3 dep-ent were short of the full history in the recorded run)
    df.groupby(["DEP_ENT"]).size(),
    # null count per column (none in the recorded run)
    df.isnull().sum(),
)

# Print each check, separated by a dashed line.
print(*spot_checks, sep="\n-----\n")

DEP_ENT              object
MONTH        datetime64[ns]
NET_SALES           float64
dtype: object
-----
14
-----
DEP_ENT
160_155    84
170_155    84
200_155    84
200_310    84
210_155    84
210_165    84
210_310    84
220_155    84
220_310    78
240_155    84
250_155    84
250_165    50
250_310    84
260_155    47
dtype: int64
-----
DEP_ENT      0
MONTH        0
NET_SALES    0
dtype: int64


# Multiple Series with AutoTS
Tutorial: https://medium.com/data-science-in-your-pocket/automl-for-time-series-forecasting-using-autots-with-example-735fb559443a


https://winedarksea.github.io/AutoTS/build/html/source/tutorial.html

In [6]:
### Additional data prep ###

# NET_SALES as a Series keyed by (DEP_ENT, MONTH), then back to a one-column frame
df_s = df.set_index(['DEP_ENT', 'MONTH'])['NET_SALES']
df_d = df_s.to_frame()

# Fill any missing NET_SALES values with the column mean.
# NOTE(review): the mean is taken over every row, i.e. across all dep-ent
# series, not per series — confirm that is intended if nulls ever appear.
mean_imputer = SimpleImputer(strategy='mean')
sales_matrix = df_d['NET_SALES'].to_numpy().reshape(-1, 1)  # imputer is given a single-column 2-D array
df_d['NET_SALES'] = mean_imputer.fit_transform(sales_matrix)

# Turn DEP_ENT and MONTH back into ordinary columns
df_d = df_d.reset_index()

____________________________________________________________________________________________________________________________________________________________________________

https://github.com/winedarksea/AutoTS#basic-use

From the README linked above, additional outputs that could be extracted inside the loop:

# upper and lower forecasts
forecasts_up, forecasts_low = prediction.upper_forecast, prediction.lower_forecast

# accuracy of all tried model results
model_results = model.results()
# and aggregated from cross validation
validation_results = model.results("validation")

In [None]:
### Time Series Loop ###
# Fits one AutoTS model per dep-ent series and stacks the 12-step forecasts
# of every series into `all_predictions`.

# Start from an empty frame so `all_predictions` exists even if the loop
# is interrupted.
all_predictions = pd.DataFrame()

# Per-series forecast frames are collected here and concatenated once after
# the loop. DataFrame.append was removed in pandas 2.0 and re-copied the
# whole accumulated frame on every iteration.
forecast_frames = []

# list of each dep-ent
all_ts_ts = df_d['DEP_ENT'].unique()

for i in tqdm(all_ts_ts):

    # rows belonging to this dep-ent only
    df_subset = df_d[df_d['DEP_ENT'] == i]

    # keep just the date and value columns passed to AutoTS
    df_subset_trim = df_subset[['MONTH', 'NET_SALES']]

    # a fresh model search is configured for every series
    model = AutoTS(
        forecast_length=12,
        frequency='infer',
        prediction_interval=0.95,
        ensemble='simple',
        models_mode='deep',
        model_list='univariate',  # or could do a list like ['ARIMA','ETS']
        max_generations=3,
        num_validations=3,
        no_negatives=True,
        n_jobs='auto')

    model = model.fit(
        df_subset_trim,
        date_col='MONTH',
        value_col='NET_SALES',
    )

    print(model)

    # create prediction
    prediction = model.predict(forecast_length=12)

    # Build the per-series forecast frame without mutating
    # `prediction.forecast` in place.
    # NOTE(review): the rename only has an effect if the forecast frame has a
    # column literally named 'index'. If the dates are carried in the frame's
    # index instead, a reset_index() would be needed first — confirm against
    # the AutoTS output.
    temp_fcasts = (
        prediction.forecast
        .rename(columns={'index': 'MONTH'})
        .assign(DEP_ENT=i)  # tag rows with the dep-ent they belong to
    )

    forecast_frames.append(temp_fcasts)

# Single concatenation; guard the empty case because pd.concat([]) raises.
if forecast_frames:
    all_predictions = pd.concat(forecast_frames)

In [None]:
# from autots.models.model_list import model_lists
# print(model_lists['univariate'])

# param_df = prediction.model_parameters
# param_df = pd.DataFrame.from_dict(param_df)

# upper_forecasts_df = prediction.upper_forecast
# lower_forecasts_df = prediction.lower_forecast

In [16]:
# Combine the forecasts with the actual sales pulled from Snowflake.

# rename index col and mark the forecast values as predictions
# (assignment instead of inplace=True; renaming is a no-op for missing columns)
all_predictions = all_predictions.rename(columns={'index': 'MONTH', 'NET_SALES': 'PRED'})

# Initial sales pull, restricted to the columns of interest.
# MONTH is converted to datetime explicitly on a copy, so this cell does not
# depend on an earlier cell having converted `df_in["MONTH"]` in place.
# The comparison and the merge below both need datetime values.
sales = df_in[["DEP_ENT", "MONTH", "NET_SALES"]].assign(
    MONTH=lambda frame: pd.to_datetime(frame["MONTH"])
)

# only include sales data for the full months we have
# (everything strictly before the first day of the current month)
first_of_month = datetime.today().replace(day=1).date()
sales = sales[sales['MONTH'] < pd.to_datetime(first_of_month)]

# combine prediction data and original sales data; the left join keeps every
# prediction row, whether or not actual sales exist for that month
merged = pd.merge(all_predictions, sales, how='left', on=['DEP_ENT', 'MONTH'])

In [18]:
### Budget ###
# Pull the budget figures from Snowflake into `budg`.

# Read the SQL text first, so a missing or unreadable file fails before a
# Snowflake session is opened.
with open('budget_query.sql', 'r') as query:
    budget_sql = query.read()

# Establish a connection to Snowflake (reuses `connection_params` from the
# net-sales query cell)
conn = snowflake.connector.connect(**connection_params)
try:
    budg = pd.read_sql_query(budget_sql, conn)
finally:
    # Close the connection even when the query raises, so the session is
    # never left open.
    conn.close()

# convert month to datetime so it can be joined against the forecast months
budg["MONTH"] = pd.to_datetime(budg["MONTH"])
# select cols
budg = budg[["MONTH", "BUDGET_AMOUNT", "DEP_ENT"]]

  budg = pd.read_sql_query(query.read(),conn)



In [19]:
# attach the budget figures to the prediction/sales rows; the left join keeps
# every row of `merged`, with or without a matching budget entry
merged2 = merged.merge(budg, on=['DEP_ENT', 'MONTH'], how='left')

In [20]:
# persist the combined prediction / sales / budget table to disk
merged2.to_csv(path_or_buf='auto_ts_results.csv')