# Query Snowflake in Python

# Run in terminal if not already installed
pip install snowflake-connector-python
pip install pandas

In [None]:
# copy and paste in to a new chunk, enter credentials and run to save in environment. Then delete chunk
%env snowflakeuser=<your_snowflake_username>
%env snowflakepass=<your_snowflake_password>

In [None]:
# upgrade conda
conda update -n base -c defaults conda

In [64]:
conda list pandas$

# packages in environment at c:\Users\crudek\Anaconda3:
#
# Name                    Version                   Build  Channel
pandas                    1.4.4            py39hd77b12b_0  

Note: you may need to restart the kernel to use updated packages.


In [None]:
!conda install --yes --prefix {sys.prefix} pandas

In [13]:
# Packages
import snowflake.connector
import pandas as pd
import os
import numpy as np

In [3]:
def snowflake_to_pandas(connection_params, query):
    try:
        # Establish a connection to Snowflake
        conn = snowflake.connector.connect(**connection_params)

        # Execute the SQL query and fetch the results into a DataFrame
        df = pd.read_sql_query(query, conn)

        # Close the connection
        conn.close()

        return df
    except Exception as e:
        print(f"Error: {e}")
        return None

# Snowflake connection parameters
connection_params = {
    "user": os.environ['snowflakeuser'],
    "password": os.environ['snowflakepass'],
    "account": "zib52348.us-east-1",
    "role": "ACCOUNTADMIN",
    "warehouse": "REPORTING",
    "database": "ANALYTICS",
    "schema": "FORECASTING",
}

# SQL command 
query = 'SELECT * FROM "ANALYTICS"."FORECASTING"."sales_fcast_combined_v"'

# Call the function to retrieve the data into a Pandas DataFrame
result_df = snowflake_to_pandas(connection_params, query)

if result_df is not None:
    print(result_df.head())  # Display the first few rows of the DataFrame
else:
    print("Failed to retrieve data from Snowflake.")



   DEP_ENT       MONTH   NET_SALES  BUDGET_AMOUNT  FORECAST
0  240_155  2017-02-01  4153796.71      4198000.0       NaN
1  240_155  2017-03-01  5940741.71      4892400.0       NaN
2  240_155  2017-04-01  4789757.73      5580200.0       NaN
3  240_155  2017-05-01  6356822.97      5298000.0       NaN
4  240_155  2017-06-01  5756533.29      4678000.0       NaN


# Phase 1
Lets understand the packages and process being used and forecast for one department-entity combination

Referecen: https://moez-62905.medium.com/time-series-in-python-10502f9fac2a

In [49]:
# head and tail
print(result_df)

      DEP_ENT      MONTH    NET_SALES  BUDGET_AMOUNT  FORECAST
0     240_155 2017-02-01   4153796.71      4198000.0       NaN
1     240_155 2017-03-01   5940741.71      4892400.0       NaN
2     240_155 2017-04-01   4789757.73      5580200.0       NaN
3     240_155 2017-05-01   6356822.97      5298000.0       NaN
4     240_155 2017-06-01   5756533.29      4678000.0       NaN
...       ...        ...          ...            ...       ...
1260  210_155 2016-07-01   6283148.80     11311652.0       NaN
1261  210_155 2016-08-01   5626725.62     11326527.0       NaN
1262  210_155 2016-09-01  10283086.45     11377763.0       NaN
1263  210_155 2016-10-01   6239512.66     11377763.0       NaN
1264  210_155 2016-11-01  16467966.49     11690688.0       NaN

[1265 rows x 5 columns]


In [50]:
result_df["MONTH"] = pd.to_datetime(result_df["MONTH"])
print(result_df.head())
print(result_df.dtypes)

   DEP_ENT      MONTH   NET_SALES  BUDGET_AMOUNT  FORECAST
0  240_155 2017-02-01  4153796.71      4198000.0       NaN
1  240_155 2017-03-01  5940741.71      4892400.0       NaN
2  240_155 2017-04-01  4789757.73      5580200.0       NaN
3  240_155 2017-05-01  6356822.97      5298000.0       NaN
4  240_155 2017-06-01  5756533.29      4678000.0       NaN
DEP_ENT                  object
MONTH            datetime64[ns]
NET_SALES               float64
BUDGET_AMOUNT           float64
FORECAST                float64
dtype: object


In [51]:
df_200_155 = result_df[result_df["DEP_ENT"] == '200_155']
df_200_155 = df_200_155[df_200_155['MONTH'] <= pd.Timestamp(2023,7,1)] # dont have data beyond july '23 so no way to check model accuracy
df_200_155 = df_200_155[["MONTH", "NET_SALES"]]
print(df_200_155)
print(df_200_155.dtypes)

         MONTH   NET_SALES
192 2016-01-01  2592017.04
193 2016-02-01  2348092.91
194 2016-03-01  2894584.98
195 2016-04-01  2163636.39
196 2016-05-01  3210147.13
..         ...         ...
358 2023-03-01  2325654.24
359 2023-04-01  1918474.98
360 2023-05-01  2443940.14
361 2023-06-01  1748262.30
362 2023-07-01  1787253.79

[91 rows x 2 columns]
MONTH        datetime64[ns]
NET_SALES           float64
dtype: object


In [52]:
# ML Can't deal with dates directly so we need to extract some basic properties

# extract month and year from dates
df_200_155['Month'] = [i.month for i in df_200_155['MONTH']]
df_200_155['Year'] = [i.year for i in df_200_155['MONTH']]
# create a sequence of numbers
df_200_155['Series'] = np.arange(1,len(df_200_155)+1)
print(df_200_155.head())

         MONTH   NET_SALES  Month  Year  Series
192 2016-01-01  2592017.04      1  2016       1
193 2016-02-01  2348092.91      2  2016       2
194 2016-03-01  2894584.98      3  2016       3
195 2016-04-01  2163636.39      4  2016       4
196 2016-05-01  3210147.13      5  2016       5


In [54]:
# Create training dataset
train = df_200_155[df_200_155['MONTH'] <= pd.Timestamp(2022,4,1)] # ~80%
# drop unnecessary columns and re-arrange
train.drop(['MONTH'], axis=1, inplace=True)
train = train[['Series', 'Year', 'Month', 'NET_SALES']] 


# Create testing dataset
test = df_200_155[df_200_155['MONTH'] > pd.Timestamp(2022,4,1)] # ~20%
# drop unnecessary columns and re-arrange
test.drop(['MONTH'], axis=1, inplace=True)
test = test[['Series', 'Year', 'Month', 'NET_SALES']] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['MONTH'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['MONTH'], axis=1, inplace=True)


In [55]:
# good
train.shape, test.shape

((76, 4), (15, 4))

In [57]:
# import the regression module from pycaret
from pycaret import regression

ModuleNotFoundError: No module named 'pycaret'