use newpycaret env


In [62]:
# packages
import snowflake.connector
import pandas as pd
import os
import numpy as np
from autots import AutoTS
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
from dateutil import relativedelta


2. Load Data

---

### To Do: copy and paste in to a new chunk, enter credentials and run to save in environment. Then delete chunk

%env snowflakeuser=<your_snowflake_username> <br>
%env snowflakepass=<your_snowflake_password>


In [29]:
# Query Snowflake

# Snowflake connection parameters
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# Establish a connection to Snowflake
conn = snowflake.connector.connect(**connection_params)

with open('net_sales_query.sql', 'r') as query:
    # connection == the connection to your database, in your case prob_db
    df_initial = pd.read_sql_query(query.read(), conn)

# Close the connection
conn.close()

  df_initial = pd.read_sql_query(query.read(), conn)


In [80]:
##### INITIAL VARIABLE DECLARATION #####

# Get the current date
current_date = datetime.today()

# Calculate the first day of the current month
first_day_of_current_month = pd.to_datetime(current_date.replace(day=1))

# Calculate the first day of the next month
first_day_of_next_month = datetime(current_date.year, current_date.month + 1, 1)

# Calculate the first day of the last month
last_day_of_last_month = first_day_of_current_month - timedelta(days=1)
first_day_of_last_month = last_day_of_last_month.replace(day=1)


### MANUAL INPUTS ###

# how much data do you want to keep to train models. 
#end_of_data = first_day_of_last_month # the default which is the max date this should ever be because it would include all months that have full data
end_of_data = pd.to_datetime('2022-12-01')

# forecast horizon = how many months in to the future you want to forecast. So, we will forecast this many months past the above end_of_data
fh = 12


### FORECAST HORIZON (FH) DATAFRAME ###

end_of_data_next_month = end_of_data + relativedelta.relativedelta(months=1, day=1)
end_of_data_df = pd.DataFrame({'end_of_data': [end_of_data_next_month]})
end_of_data_df['end_of_data'] = pd.to_datetime(end_of_data_df['end_of_data'])

# Create a date range for the next 12 months
next_12_months = pd.date_range(start=end_of_data_df['end_of_data'].iloc[0], periods=fh, freq='MS')

fh_dates_df = pd.DataFrame({'MONTH': next_12_months})


### INITIAL DATA PREPARATION ###

# create copy of df_d
df = df_initial.copy(deep=True)

# convert month field to date
df["MONTH"] = pd.to_datetime(df["MONTH"])

# Some random months will have data that we want to remove (* Want to test without July though)
df = df[["DEP_ENT", "MONTH", "NET_SALES"]]  # select fields of interest
df = df.sort_values(['DEP_ENT', 'MONTH'])  # reorder dataframe

# remove data after the 'end of data' setting above
df_subset = df[df['MONTH'] <= end_of_data]

# create series
df_s = df_subset.set_index(['DEP_ENT', 'MONTH'])['NET_SALES']
# convert back to dataframe
df_d = df_s.to_frame()
# impute
mean_imputer = SimpleImputer(strategy='mean')
df_d['NET_SALES'] = mean_imputer.fit_transform(df_d['NET_SALES'].values.reshape(-1, 1))
# reset index
df_d.reset_index(inplace=True)


### LOOP TO MAKE SURE ALL DEP_ENT ARE FULL ###
# some dep_ent might still hjave less months but not on the backend

# Create empty dataframe
df_subset_all = pd.DataFrame()

# list of each dep-ent
all_dep_ent = df_d['DEP_ENT'].unique()

for i in tqdm(all_dep_ent):
    # filter to one dep_ent
    temp_subset = df_subset[df_subset['DEP_ENT'] == i]
    # temp result
    temp_result = pd.merge(temp_subset, fh_dates_df, on='MONTH', how='outer')
    temp_result['DEP_ENT'] = i
    
    # combine for all dep_ents
    df_subset_all = df_subset_all.append(temp_result)
    
    
### REGRESSOR DATA PREPARATION ###

# create copy of df_d
df_d_regress = df_subset_all.copy(deep=True)

# Prepare regressors

# Define the prime COVID period
cov_start_date = pd.Timestamp(2020, 1, 1)
cov_end_date = pd.Timestamp(2021, 12, 1)

# Create the binary dummy variable
df_d_regress['COVID'] = df_d_regress['MONTH'].apply(lambda date: '1' if cov_start_date <= date <= cov_end_date else '0')

# Amazon prime day shipments

# Define the dictionary
amzn_ship_dict = {
    'MONTH': [pd.Timestamp(2016, 7, 1), pd.Timestamp(2017, 5, 1), pd.Timestamp(2018, 6, 1), pd.Timestamp(2019, 6, 1), pd.Timestamp(2020, 6, 1),
              pd.Timestamp(2020, 10, 1), pd.Timestamp(2021, 5, 1), pd.Timestamp(2022, 5, 1), pd.Timestamp(2023, 5, 1), pd.Timestamp(2023, 6, 1), pd.Timestamp(2023, 7, 1)],
    'AMZN': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
}

# Create a DataFrame from the dictionary
amzn_ship_df = pd.DataFrame(amzn_ship_dict)
# add Amazon department
amzn_ship_df['DEP_ENT'] = '250_155'

# combine in to single table
df_d_regress1 = df_d_regress.merge(
    amzn_ship_df, on=['DEP_ENT', 'MONTH'], how='left')

df_d_regress1['AMZN'] = df_d_regress1['AMZN'].fillna(0)

  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
  df_subset_all = df_subset_all.append(temp_result)
100%|██████████| 14/14 [00:00<00:00, 468.21it/s]


---


In [None]:
### Time Series Loop w/ Regressors ###

# Create empty dataframes
all_predictions = pd.DataFrame()

# list of each dep-ent
all_ts_ts = df_d['DEP_ENT'].unique()

for i in tqdm(all_ts_ts):
    
    i = '220_310'

    df_subset = df_d_regress1[df_d_regress1['DEP_ENT'] == i]
    df_subset_trim = df_subset[df_subset['MONTH'] <= pd.Timestamp(2022, 12, 1)]
    df_subset_trim = df_subset_trim[['MONTH', 'NET_SALES']]

    # split the train and original df

    # train
    df_subset_train = df_subset[df_subset['MONTH'] <= pd.Timestamp(2022, 12, 1)]
    df_subset_train = df_subset_train[["MONTH", "COVID", "AMZN"]]

    # for future forecast
    df_subset_fcast = df_subset[df_subset['MONTH'] > pd.Timestamp(2022, 12, 1)]
    df_subset_fcast = df_subset_fcast[["MONTH", "COVID", "AMZN"]]

    # model
    model = AutoTS(
        forecast_length=12,
        frequency='infer',
        # prediction_interval=0.95,
        # ensemble='simple',
        # models_mode='deep',
        # model_list = 'univariate', # or could do a list like ['ARIMA','ETS']
        max_generations=3,
        num_validations=3,
        no_negatives=True,
        n_jobs='auto'
    )

    model = model.fit(
        df_subset_trim,
        future_regressor=df_subset_train,
        date_col='MONTH',
        value_col='NET_SALES',
    )

    # print(model)

    # create prediction
    prediction = model.predict(
        future_regressor=df_subset_fcast, forecast_length=12)

    # temp fcast dataframe
    temp_fcasts = prediction.forecast

    # rename
    temp_fcasts.rename(columns={'index': 'MONTH'}, inplace=True)

    temp_fcasts['DEP_ENT'] = i  # add dep

    # append to master dataframe
    all_predictions = all_predictions.append(temp_fcasts)


In [16]:
# rename index col
all_predictions.rename(
    columns={'index': 'MONTH', 'NET_SALES': 'PRED'}, inplace=True)

# initial sales pull
sales = df_in
# select cols
sales = sales[["DEP_ENT", "MONTH", "NET_SALES"]]
# only include sales data for the full months we have
first_of_month = datetime.today().replace(day=1).date()
sales = sales[sales['MONTH'] < pd.to_datetime(first_of_month)]

# combine prediction data and original sales data
merged = pd.merge(all_predictions, sales, how='left', on=['DEP_ENT', 'MONTH'])


In [18]:
### Budget ###

# Establish a connection to Snowflake
conn = snowflake.connector.connect(**connection_params)

with open('budget_query.sql', 'r') as query:
    # connection == the connection to your database, in your case prob_db
    budg = pd.read_sql_query(query.read(), conn)

# Close the connection
conn.close()

# convert month to datetime
budg["MONTH"] = pd.to_datetime(budg["MONTH"])
# select cols
budg = budg[["MONTH", "BUDGET_AMOUNT", "DEP_ENT"]]


  budg = pd.read_sql_query(query.read(),conn)



In [19]:
# combine prediction/sales data with budget data
merged2 = pd.merge(merged, budg, how='left', on=['DEP_ENT', 'MONTH'])


In [20]:
# write csv
merged2.to_csv('auto_ts_results.csv')
